aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorPaul Mundt <lethal@linux-sh.org>2011-01-13 01:06:28 -0500
committerPaul Mundt <lethal@linux-sh.org>2011-01-13 01:06:28 -0500
commitf43dc23d5ea91fca257be02138a255f02d98e806 (patch)
treeb29722f6e965316e90ac97abf79923ced250dc21 /net/ipv4
parentf8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff)
parent4162cf64973df51fc885825bc9ca4d055891c49f (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework
Conflicts: arch/sh/kernel/cpu/sh2/setup-sh7619.c arch/sh/kernel/cpu/sh2a/setup-mxg.c arch/sh/kernel/cpu/sh2a/setup-sh7201.c arch/sh/kernel/cpu/sh2a/setup-sh7203.c arch/sh/kernel/cpu/sh2a/setup-sh7206.c arch/sh/kernel/cpu/sh3/setup-sh7705.c arch/sh/kernel/cpu/sh3/setup-sh770x.c arch/sh/kernel/cpu/sh3/setup-sh7710.c arch/sh/kernel/cpu/sh3/setup-sh7720.c arch/sh/kernel/cpu/sh4/setup-sh4-202.c arch/sh/kernel/cpu/sh4/setup-sh7750.c arch/sh/kernel/cpu/sh4/setup-sh7760.c arch/sh/kernel/cpu/sh4a/setup-sh7343.c arch/sh/kernel/cpu/sh4a/setup-sh7366.c arch/sh/kernel/cpu/sh4a/setup-sh7722.c arch/sh/kernel/cpu/sh4a/setup-sh7723.c arch/sh/kernel/cpu/sh4a/setup-sh7724.c arch/sh/kernel/cpu/sh4a/setup-sh7763.c arch/sh/kernel/cpu/sh4a/setup-sh7770.c arch/sh/kernel/cpu/sh4a/setup-sh7780.c arch/sh/kernel/cpu/sh4a/setup-sh7785.c arch/sh/kernel/cpu/sh4a/setup-sh7786.c arch/sh/kernel/cpu/sh4a/setup-shx3.c arch/sh/kernel/cpu/sh5/setup-sh5.c drivers/serial/sh-sci.c drivers/serial/sh-sci.h include/linux/serial_sci.h
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig53
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c416
-rw-r--r--net/ipv4/ah4.c303
-rw-r--r--net/ipv4/arp.c416
-rw-r--r--net/ipv4/cipso_ipv4.c7
-rw-r--r--net/ipv4/datagram.c27
-rw-r--r--net/ipv4/devinet.c447
-rw-r--r--net/ipv4/esp4.c38
-rw-r--r--net/ipv4/fib_frontend.c258
-rw-r--r--net/ipv4/fib_hash.c353
-rw-r--r--net/ipv4/fib_lookup.h16
-rw-r--r--net/ipv4/fib_rules.c49
-rw-r--r--net/ipv4/fib_semantics.c368
-rw-r--r--net/ipv4/fib_trie.c210
-rw-r--r--net/ipv4/gre.c152
-rw-r--r--net/ipv4/icmp.c105
-rw-r--r--net/ipv4/igmp.c454
-rw-r--r--net/ipv4/inet_connection_sock.c119
-rw-r--r--net/ipv4/inet_diag.c62
-rw-r--r--net/ipv4/inet_fragment.c2
-rw-r--r--net/ipv4/inet_hashtables.c108
-rw-r--r--net/ipv4/inet_lro.c36
-rw-r--r--net/ipv4/inet_timewait_sock.c156
-rw-r--r--net/ipv4/inetpeer.c452
-rw-r--r--net/ipv4/ip_forward.c13
-rw-r--r--net/ipv4/ip_fragment.c124
-rw-r--r--net/ipv4/ip_gre.c414
-rw-r--r--net/ipv4/ip_input.c44
-rw-r--r--net/ipv4/ip_options.c14
-rw-r--r--net/ipv4/ip_output.c213
-rw-r--r--net/ipv4/ip_sockglue.c119
-rw-r--r--net/ipv4/ipcomp.c19
-rw-r--r--net/ipv4/ipconfig.c114
-rw-r--r--net/ipv4/ipip.c351
-rw-r--r--net/ipv4/ipmr.c1333
-rw-r--r--net/ipv4/netfilter.c41
-rw-r--r--net/ipv4/netfilter/Kconfig6
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/arp_tables.c634
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c6
-rw-r--r--net/ipv4/netfilter/arptable_filter.c98
-rw-r--r--net/ipv4/netfilter/ip_queue.c72
-rw-r--r--net/ipv4/netfilter/ip_tables.c987
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c193
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c33
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c222
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c22
-rw-r--r--net/ipv4/netfilter/ipt_NETMAP.c22
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c16
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c39
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c58
-rw-r--r--net/ipv4/netfilter/ipt_addrtype.c28
-rw-r--r--net/ipv4/netfilter/ipt_ah.c28
-rw-r--r--net/ipv4/netfilter/ipt_ecn.c23
-rw-r--r--net/ipv4/netfilter/iptable_filter.c131
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c175
-rw-r--r--net/ipv4/netfilter/iptable_raw.c101
-rw-r--r--net/ipv4/netfilter/iptable_security.c122
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c55
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c39
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c47
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c41
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c164
-rw-r--r--net/ipv4/netfilter/nf_nat_ftp.c114
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c70
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c181
-rw-r--r--net/ipv4/netfilter/nf_nat_irc.c9
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c3
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_common.c12
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_dccp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c12
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c10
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_sctp.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_tcp.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udp.c5
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_udplite.c6
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_unknown.c4
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c94
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c181
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c54
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c36
-rw-r--r--net/ipv4/netfilter/nf_nat_tftp.c1
-rw-r--r--net/ipv4/proc.c56
-rw-r--r--net/ipv4/protocol.c59
-rw-r--r--net/ipv4/raw.c108
-rw-r--r--net/ipv4/route.c1369
-rw-r--r--net/ipv4/syncookies.c134
-rw-r--r--net/ipv4/sysctl_net_ipv4.c241
-rw-r--r--net/ipv4/tcp.c713
-rw-r--r--net/ipv4/tcp_cong.c10
-rw-r--r--net/ipv4/tcp_diag.c2
-rw-r--r--net/ipv4/tcp_htcp.c10
-rw-r--r--net/ipv4/tcp_hybla.c4
-rw-r--r--net/ipv4/tcp_illinois.c2
-rw-r--r--net/ipv4/tcp_input.c301
-rw-r--r--net/ipv4/tcp_ipv4.c584
-rw-r--r--net/ipv4/tcp_lp.c4
-rw-r--r--net/ipv4/tcp_minisocks.c186
-rw-r--r--net/ipv4/tcp_output.c572
-rw-r--r--net/ipv4/tcp_probe.c38
-rw-r--r--net/ipv4/tcp_timer.c124
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_westwood.c2
-rw-r--r--net/ipv4/tcp_yeah.c4
-rw-r--r--net/ipv4/tunnel4.c51
-rw-r--r--net/ipv4/udp.c842
-rw-r--r--net/ipv4/udp_impl.h4
-rw-r--r--net/ipv4/udplite.c15
-rw-r--r--net/ipv4/xfrm4_input.c8
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c3
-rw-r--r--net/ipv4/xfrm4_output.c2
-rw-r--r--net/ipv4/xfrm4_policy.c129
-rw-r--r--net/ipv4/xfrm4_state.c33
-rw-r--r--net/ipv4/xfrm4_tunnel.c4
116 files changed, 9937 insertions, 7277 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 70491d9035eb..9e95d7fb6d5a 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
46 rp_filter on use: 46 rp_filter on use:
47 47
48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter 48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
49 and 49 or
50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter 50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
51 51
52 Note that some distributions enable it in startup scripts. 52 Note that some distributions enable it in startup scripts.
@@ -84,7 +84,7 @@ config IP_FIB_TRIE
84 84
85 An experimental study of compression methods for dynamic tries 85 An experimental study of compression methods for dynamic tries
86 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 86 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
87 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 87 <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
88 88
89endchoice 89endchoice
90 90
@@ -166,7 +166,7 @@ config IP_PNP_DHCP
166 166
167 If unsure, say Y. Note that if you want to use DHCP, a DHCP server 167 If unsure, say Y. Note that if you want to use DHCP, a DHCP server
168 must be operating on your network. Read 168 must be operating on your network. Read
169 <file:Documentation/filesystems/nfsroot.txt> for details. 169 <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
170 170
171config IP_PNP_BOOTP 171config IP_PNP_BOOTP
172 bool "IP: BOOTP support" 172 bool "IP: BOOTP support"
@@ -181,7 +181,7 @@ config IP_PNP_BOOTP
181 does BOOTP itself, providing all necessary information on the kernel 181 does BOOTP itself, providing all necessary information on the kernel
182 command line, you can say N here. If unsure, say Y. Note that if you 182 command line, you can say N here. If unsure, say Y. Note that if you
183 want to use BOOTP, a BOOTP server must be operating on your network. 183 want to use BOOTP, a BOOTP server must be operating on your network.
184 Read <file:Documentation/filesystems/nfsroot.txt> for details. 184 Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
185 185
186config IP_PNP_RARP 186config IP_PNP_RARP
187 bool "IP: RARP support" 187 bool "IP: RARP support"
@@ -194,7 +194,7 @@ config IP_PNP_RARP
194 older protocol which is being obsoleted by BOOTP and DHCP), say Y 194 older protocol which is being obsoleted by BOOTP and DHCP), say Y
195 here. Note that if you want to use RARP, a RARP server must be 195 here. Note that if you want to use RARP, a RARP server must be
196 operating on your network. Read 196 operating on your network. Read
197 <file:Documentation/filesystems/nfsroot.txt> for details. 197 <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
198 198
199# not yet ready.. 199# not yet ready..
200# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 200# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
@@ -215,8 +215,15 @@ config NET_IPIP
215 be inserted in and removed from the running kernel whenever you 215 be inserted in and removed from the running kernel whenever you
216 want). Most people won't need this and can say N. 216 want). Most people won't need this and can say N.
217 217
218config NET_IPGRE_DEMUX
219 tristate "IP: GRE demultiplexer"
220 help
221 This is helper module to demultiplex GRE packets on GRE version field criteria.
222 Required by ip_gre and pptp modules.
223
218config NET_IPGRE 224config NET_IPGRE
219 tristate "IP: GRE tunnels over IP" 225 tristate "IP: GRE tunnels over IP"
226 depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
220 help 227 help
221 Tunneling means encapsulating data of one protocol type within 228 Tunneling means encapsulating data of one protocol type within
222 another protocol and sending it over a channel that understands the 229 another protocol and sending it over a channel that understands the
@@ -250,6 +257,20 @@ config IP_MROUTE
250 <file:Documentation/networking/multicast.txt>. If you haven't heard 257 <file:Documentation/networking/multicast.txt>. If you haven't heard
251 about it, you don't need it. 258 about it, you don't need it.
252 259
260config IP_MROUTE_MULTIPLE_TABLES
261 bool "IP: multicast policy routing"
262 depends on IP_MROUTE && IP_ADVANCED_ROUTER
263 select FIB_RULES
264 help
265 Normally, a multicast router runs a userspace daemon and decides
266 what to do with a multicast packet based on the source and
267 destination addresses. If you say Y here, the multicast router
268 will also be able to take interfaces and packet marks into
269 account and run multiple instances of userspace daemons
270 simultaneously, each one handling a single table.
271
272 If unsure, say N.
273
253config IP_PIMSM_V1 274config IP_PIMSM_V1
254 bool "IP: PIM-SM version 1 support" 275 bool "IP: PIM-SM version 1 support"
255 depends on IP_MROUTE 276 depends on IP_MROUTE
@@ -289,7 +310,7 @@ config ARPD
289 If unsure, say N. 310 If unsure, say N.
290 311
291config SYN_COOKIES 312config SYN_COOKIES
292 bool "IP: TCP syncookie support (disabled per default)" 313 bool "IP: TCP syncookie support"
293 ---help--- 314 ---help---
294 Normal TCP/IP networking is open to an attack known as "SYN 315 Normal TCP/IP networking is open to an attack known as "SYN
295 flooding". This denial-of-service attack prevents legitimate remote 316 flooding". This denial-of-service attack prevents legitimate remote
@@ -314,13 +335,13 @@ config SYN_COOKIES
314 server is really overloaded. If this happens frequently better turn 335 server is really overloaded. If this happens frequently better turn
315 them off. 336 them off.
316 337
317 If you say Y here, note that SYN cookies aren't enabled by default; 338 If you say Y here, you can disable SYN cookies at run time by
318 you can enable them by saying Y to "/proc file system support" and 339 saying Y to "/proc file system support" and
319 "Sysctl support" below and executing the command 340 "Sysctl support" below and executing the command
320 341
321 echo 1 >/proc/sys/net/ipv4/tcp_syncookies 342 echo 0 > /proc/sys/net/ipv4/tcp_syncookies
322 343
323 at boot time after the /proc file system has been mounted. 344 after the /proc file system has been mounted.
324 345
325 If unsure, say N. 346 If unsure, say N.
326 347
@@ -398,7 +419,7 @@ config INET_XFRM_MODE_BEET
398 If unsure, say Y. 419 If unsure, say Y.
399 420
400config INET_LRO 421config INET_LRO
401 bool "Large Receive Offload (ipv4/tcp)" 422 tristate "Large Receive Offload (ipv4/tcp)"
402 default y 423 default y
403 ---help--- 424 ---help---
404 Support for Large Receive Offload (ipv4/tcp). 425 Support for Large Receive Offload (ipv4/tcp).
@@ -541,7 +562,7 @@ config TCP_CONG_VENO
541 distinguishing to circumvent the difficult judgment of the packet loss 562 distinguishing to circumvent the difficult judgment of the packet loss
542 type. TCP Veno cuts down less congestion window in response to random 563 type. TCP Veno cuts down less congestion window in response to random
543 loss packets. 564 loss packets.
544 See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf 565 See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
545 566
546config TCP_CONG_YEAH 567config TCP_CONG_YEAH
547 tristate "YeAH TCP" 568 tristate "YeAH TCP"
@@ -587,9 +608,15 @@ choice
587 config DEFAULT_HTCP 608 config DEFAULT_HTCP
588 bool "Htcp" if TCP_CONG_HTCP=y 609 bool "Htcp" if TCP_CONG_HTCP=y
589 610
611 config DEFAULT_HYBLA
612 bool "Hybla" if TCP_CONG_HYBLA=y
613
590 config DEFAULT_VEGAS 614 config DEFAULT_VEGAS
591 bool "Vegas" if TCP_CONG_VEGAS=y 615 bool "Vegas" if TCP_CONG_VEGAS=y
592 616
617 config DEFAULT_VENO
618 bool "Veno" if TCP_CONG_VENO=y
619
593 config DEFAULT_WESTWOOD 620 config DEFAULT_WESTWOOD
594 bool "Westwood" if TCP_CONG_WESTWOOD=y 621 bool "Westwood" if TCP_CONG_WESTWOOD=y
595 622
@@ -610,8 +637,10 @@ config DEFAULT_TCP_CONG
610 default "bic" if DEFAULT_BIC 637 default "bic" if DEFAULT_BIC
611 default "cubic" if DEFAULT_CUBIC 638 default "cubic" if DEFAULT_CUBIC
612 default "htcp" if DEFAULT_HTCP 639 default "htcp" if DEFAULT_HTCP
640 default "hybla" if DEFAULT_HYBLA
613 default "vegas" if DEFAULT_VEGAS 641 default "vegas" if DEFAULT_VEGAS
614 default "westwood" if DEFAULT_WESTWOOD 642 default "westwood" if DEFAULT_WESTWOOD
643 default "veno" if DEFAULT_VENO
615 default "reno" if DEFAULT_RENO 644 default "reno" if DEFAULT_RENO
616 default "cubic" 645 default "cubic"
617 646
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..4978d22f9a75 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 20obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
21obj-$(CONFIG_IP_MROUTE) += ipmr.o 21obj-$(CONFIG_IP_MROUTE) += ipmr.o
22obj-$(CONFIG_NET_IPIP) += ipip.o 22obj-$(CONFIG_NET_IPIP) += ipip.o
23obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
23obj-$(CONFIG_NET_IPGRE) += ip_gre.o 24obj-$(CONFIG_NET_IPGRE) += ip_gre.o
24obj-$(CONFIG_SYN_COOKIES) += syncookies.o 25obj-$(CONFIG_SYN_COOKIES) += syncookies.o
25obj-$(CONFIG_INET_AH) += ah4.o 26obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 566ea6c4321d..f2b61107df6c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -86,6 +86,7 @@
86#include <linux/poll.h> 86#include <linux/poll.h>
87#include <linux/netfilter_ipv4.h> 87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h> 88#include <linux/random.h>
89#include <linux/slab.h>
89 90
90#include <asm/uaccess.h> 91#include <asm/uaccess.h>
91#include <asm/system.h> 92#include <asm/system.h>
@@ -124,7 +125,6 @@ static struct list_head inetsw[SOCK_MAX];
124static DEFINE_SPINLOCK(inetsw_lock); 125static DEFINE_SPINLOCK(inetsw_lock);
125 126
126struct ipv4_config ipv4_config; 127struct ipv4_config ipv4_config;
127
128EXPORT_SYMBOL(ipv4_config); 128EXPORT_SYMBOL(ipv4_config);
129 129
130/* New destruction routine */ 130/* New destruction routine */
@@ -139,12 +139,12 @@ void inet_sock_destruct(struct sock *sk)
139 sk_mem_reclaim(sk); 139 sk_mem_reclaim(sk);
140 140
141 if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { 141 if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
142 printk("Attempt to release TCP socket in state %d %p\n", 142 pr_err("Attempt to release TCP socket in state %d %p\n",
143 sk->sk_state, sk); 143 sk->sk_state, sk);
144 return; 144 return;
145 } 145 }
146 if (!sock_flag(sk, SOCK_DEAD)) { 146 if (!sock_flag(sk, SOCK_DEAD)) {
147 printk("Attempt to release alive inet socket %p\n", sk); 147 pr_err("Attempt to release alive inet socket %p\n", sk);
148 return; 148 return;
149 } 149 }
150 150
@@ -154,9 +154,10 @@ void inet_sock_destruct(struct sock *sk)
154 WARN_ON(sk->sk_forward_alloc); 154 WARN_ON(sk->sk_forward_alloc);
155 155
156 kfree(inet->opt); 156 kfree(inet->opt);
157 dst_release(sk->sk_dst_cache); 157 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
158 sk_refcnt_debug_dec(sk); 158 sk_refcnt_debug_dec(sk);
159} 159}
160EXPORT_SYMBOL(inet_sock_destruct);
160 161
161/* 162/*
162 * The routines beyond this point handle the behaviour of an AF_INET 163 * The routines beyond this point handle the behaviour of an AF_INET
@@ -174,12 +175,12 @@ static int inet_autobind(struct sock *sk)
174 /* We may need to bind the socket. */ 175 /* We may need to bind the socket. */
175 lock_sock(sk); 176 lock_sock(sk);
176 inet = inet_sk(sk); 177 inet = inet_sk(sk);
177 if (!inet->num) { 178 if (!inet->inet_num) {
178 if (sk->sk_prot->get_port(sk, 0)) { 179 if (sk->sk_prot->get_port(sk, 0)) {
179 release_sock(sk); 180 release_sock(sk);
180 return -EAGAIN; 181 return -EAGAIN;
181 } 182 }
182 inet->sport = htons(inet->num); 183 inet->inet_sport = htons(inet->inet_num);
183 } 184 }
184 release_sock(sk); 185 release_sock(sk);
185 return 0; 186 return 0;
@@ -219,31 +220,30 @@ out:
219 release_sock(sk); 220 release_sock(sk);
220 return err; 221 return err;
221} 222}
223EXPORT_SYMBOL(inet_listen);
222 224
223u32 inet_ehash_secret __read_mostly; 225u32 inet_ehash_secret __read_mostly;
224EXPORT_SYMBOL(inet_ehash_secret); 226EXPORT_SYMBOL(inet_ehash_secret);
225 227
226/* 228/*
227 * inet_ehash_secret must be set exactly once 229 * inet_ehash_secret must be set exactly once
228 * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
229 */ 230 */
230void build_ehash_secret(void) 231void build_ehash_secret(void)
231{ 232{
232 u32 rnd; 233 u32 rnd;
234
233 do { 235 do {
234 get_random_bytes(&rnd, sizeof(rnd)); 236 get_random_bytes(&rnd, sizeof(rnd));
235 } while (rnd == 0); 237 } while (rnd == 0);
236 spin_lock_bh(&inetsw_lock); 238
237 if (!inet_ehash_secret) 239 cmpxchg(&inet_ehash_secret, 0, rnd);
238 inet_ehash_secret = rnd;
239 spin_unlock_bh(&inetsw_lock);
240} 240}
241EXPORT_SYMBOL(build_ehash_secret); 241EXPORT_SYMBOL(build_ehash_secret);
242 242
243static inline int inet_netns_ok(struct net *net, int protocol) 243static inline int inet_netns_ok(struct net *net, int protocol)
244{ 244{
245 int hash; 245 int hash;
246 struct net_protocol *ipprot; 246 const struct net_protocol *ipprot;
247 247
248 if (net_eq(net, &init_net)) 248 if (net_eq(net, &init_net))
249 return 1; 249 return 1;
@@ -261,7 +261,8 @@ static inline int inet_netns_ok(struct net *net, int protocol)
261 * Create an inet socket. 261 * Create an inet socket.
262 */ 262 */
263 263
264static int inet_create(struct net *net, struct socket *sock, int protocol) 264static int inet_create(struct net *net, struct socket *sock, int protocol,
265 int kern)
265{ 266{
266 struct sock *sk; 267 struct sock *sk;
267 struct inet_protosw *answer; 268 struct inet_protosw *answer;
@@ -324,7 +325,7 @@ lookup_protocol:
324 } 325 }
325 326
326 err = -EPERM; 327 err = -EPERM;
327 if (answer->capability > 0 && !capable(answer->capability)) 328 if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
328 goto out_rcu_unlock; 329 goto out_rcu_unlock;
329 330
330 err = -EAFNOSUPPORT; 331 err = -EAFNOSUPPORT;
@@ -352,8 +353,10 @@ lookup_protocol:
352 inet = inet_sk(sk); 353 inet = inet_sk(sk);
353 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; 354 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
354 355
356 inet->nodefrag = 0;
357
355 if (SOCK_RAW == sock->type) { 358 if (SOCK_RAW == sock->type) {
356 inet->num = protocol; 359 inet->inet_num = protocol;
357 if (IPPROTO_RAW == protocol) 360 if (IPPROTO_RAW == protocol)
358 inet->hdrincl = 1; 361 inet->hdrincl = 1;
359 } 362 }
@@ -363,7 +366,7 @@ lookup_protocol:
363 else 366 else
364 inet->pmtudisc = IP_PMTUDISC_WANT; 367 inet->pmtudisc = IP_PMTUDISC_WANT;
365 368
366 inet->id = 0; 369 inet->inet_id = 0;
367 370
368 sock_init_data(sock, sk); 371 sock_init_data(sock, sk);
369 372
@@ -380,13 +383,13 @@ lookup_protocol:
380 383
381 sk_refcnt_debug_inc(sk); 384 sk_refcnt_debug_inc(sk);
382 385
383 if (inet->num) { 386 if (inet->inet_num) {
384 /* It assumes that any protocol which allows 387 /* It assumes that any protocol which allows
385 * the user to assign a number at socket 388 * the user to assign a number at socket
386 * creation time automatically 389 * creation time automatically
387 * shares. 390 * shares.
388 */ 391 */
389 inet->sport = htons(inet->num); 392 inet->inet_sport = htons(inet->inet_num);
390 /* Add to protocol hash chains. */ 393 /* Add to protocol hash chains. */
391 sk->sk_prot->hash(sk); 394 sk->sk_prot->hash(sk);
392 } 395 }
@@ -416,6 +419,8 @@ int inet_release(struct socket *sock)
416 if (sk) { 419 if (sk) {
417 long timeout; 420 long timeout;
418 421
422 sock_rps_reset_flow(sk);
423
419 /* Applications forget to leave groups before exiting */ 424 /* Applications forget to leave groups before exiting */
420 ip_mc_drop_socket(sk); 425 ip_mc_drop_socket(sk);
421 426
@@ -435,9 +440,11 @@ int inet_release(struct socket *sock)
435 } 440 }
436 return 0; 441 return 0;
437} 442}
443EXPORT_SYMBOL(inet_release);
438 444
439/* It is off by default, see below. */ 445/* It is off by default, see below. */
440int sysctl_ip_nonlocal_bind __read_mostly; 446int sysctl_ip_nonlocal_bind __read_mostly;
447EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
441 448
442int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 449int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
443{ 450{
@@ -491,27 +498,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
491 498
492 /* Check these errors (active socket, double bind). */ 499 /* Check these errors (active socket, double bind). */
493 err = -EINVAL; 500 err = -EINVAL;
494 if (sk->sk_state != TCP_CLOSE || inet->num) 501 if (sk->sk_state != TCP_CLOSE || inet->inet_num)
495 goto out_release_sock; 502 goto out_release_sock;
496 503
497 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; 504 inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
498 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) 505 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
499 inet->saddr = 0; /* Use device */ 506 inet->inet_saddr = 0; /* Use device */
500 507
501 /* Make sure we are allowed to bind here. */ 508 /* Make sure we are allowed to bind here. */
502 if (sk->sk_prot->get_port(sk, snum)) { 509 if (sk->sk_prot->get_port(sk, snum)) {
503 inet->saddr = inet->rcv_saddr = 0; 510 inet->inet_saddr = inet->inet_rcv_saddr = 0;
504 err = -EADDRINUSE; 511 err = -EADDRINUSE;
505 goto out_release_sock; 512 goto out_release_sock;
506 } 513 }
507 514
508 if (inet->rcv_saddr) 515 if (inet->inet_rcv_saddr)
509 sk->sk_userlocks |= SOCK_BINDADDR_LOCK; 516 sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
510 if (snum) 517 if (snum)
511 sk->sk_userlocks |= SOCK_BINDPORT_LOCK; 518 sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
512 inet->sport = htons(inet->num); 519 inet->inet_sport = htons(inet->inet_num);
513 inet->daddr = 0; 520 inet->inet_daddr = 0;
514 inet->dport = 0; 521 inet->inet_dport = 0;
515 sk_dst_reset(sk); 522 sk_dst_reset(sk);
516 err = 0; 523 err = 0;
517out_release_sock: 524out_release_sock:
@@ -519,25 +526,29 @@ out_release_sock:
519out: 526out:
520 return err; 527 return err;
521} 528}
529EXPORT_SYMBOL(inet_bind);
522 530
523int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, 531int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
524 int addr_len, int flags) 532 int addr_len, int flags)
525{ 533{
526 struct sock *sk = sock->sk; 534 struct sock *sk = sock->sk;
527 535
536 if (addr_len < sizeof(uaddr->sa_family))
537 return -EINVAL;
528 if (uaddr->sa_family == AF_UNSPEC) 538 if (uaddr->sa_family == AF_UNSPEC)
529 return sk->sk_prot->disconnect(sk, flags); 539 return sk->sk_prot->disconnect(sk, flags);
530 540
531 if (!inet_sk(sk)->num && inet_autobind(sk)) 541 if (!inet_sk(sk)->inet_num && inet_autobind(sk))
532 return -EAGAIN; 542 return -EAGAIN;
533 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); 543 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
534} 544}
545EXPORT_SYMBOL(inet_dgram_connect);
535 546
536static long inet_wait_for_connect(struct sock *sk, long timeo) 547static long inet_wait_for_connect(struct sock *sk, long timeo)
537{ 548{
538 DEFINE_WAIT(wait); 549 DEFINE_WAIT(wait);
539 550
540 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 551 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
541 552
542 /* Basic assumption: if someone sets sk->sk_err, he _must_ 553 /* Basic assumption: if someone sets sk->sk_err, he _must_
543 * change state of the socket from TCP_SYN_*. 554 * change state of the socket from TCP_SYN_*.
@@ -550,9 +561,9 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
550 lock_sock(sk); 561 lock_sock(sk);
551 if (signal_pending(current) || !timeo) 562 if (signal_pending(current) || !timeo)
552 break; 563 break;
553 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 564 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
554 } 565 }
555 finish_wait(sk->sk_sleep, &wait); 566 finish_wait(sk_sleep(sk), &wait);
556 return timeo; 567 return timeo;
557} 568}
558 569
@@ -567,6 +578,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
567 int err; 578 int err;
568 long timeo; 579 long timeo;
569 580
581 if (addr_len < sizeof(uaddr->sa_family))
582 return -EINVAL;
583
570 lock_sock(sk); 584 lock_sock(sk);
571 585
572 if (uaddr->sa_family == AF_UNSPEC) { 586 if (uaddr->sa_family == AF_UNSPEC) {
@@ -641,6 +655,7 @@ sock_error:
641 sock->state = SS_DISCONNECTING; 655 sock->state = SS_DISCONNECTING;
642 goto out; 656 goto out;
643} 657}
658EXPORT_SYMBOL(inet_stream_connect);
644 659
645/* 660/*
646 * Accept a pending connection. The TCP layer now gives BSD semantics. 661 * Accept a pending connection. The TCP layer now gives BSD semantics.
@@ -668,6 +683,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
668do_err: 683do_err:
669 return err; 684 return err;
670} 685}
686EXPORT_SYMBOL(inet_accept);
671 687
672 688
673/* 689/*
@@ -678,54 +694,79 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
678{ 694{
679 struct sock *sk = sock->sk; 695 struct sock *sk = sock->sk;
680 struct inet_sock *inet = inet_sk(sk); 696 struct inet_sock *inet = inet_sk(sk);
681 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 697 DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
682 698
683 sin->sin_family = AF_INET; 699 sin->sin_family = AF_INET;
684 if (peer) { 700 if (peer) {
685 if (!inet->dport || 701 if (!inet->inet_dport ||
686 (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && 702 (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
687 peer == 1)) 703 peer == 1))
688 return -ENOTCONN; 704 return -ENOTCONN;
689 sin->sin_port = inet->dport; 705 sin->sin_port = inet->inet_dport;
690 sin->sin_addr.s_addr = inet->daddr; 706 sin->sin_addr.s_addr = inet->inet_daddr;
691 } else { 707 } else {
692 __be32 addr = inet->rcv_saddr; 708 __be32 addr = inet->inet_rcv_saddr;
693 if (!addr) 709 if (!addr)
694 addr = inet->saddr; 710 addr = inet->inet_saddr;
695 sin->sin_port = inet->sport; 711 sin->sin_port = inet->inet_sport;
696 sin->sin_addr.s_addr = addr; 712 sin->sin_addr.s_addr = addr;
697 } 713 }
698 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 714 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
699 *uaddr_len = sizeof(*sin); 715 *uaddr_len = sizeof(*sin);
700 return 0; 716 return 0;
701} 717}
718EXPORT_SYMBOL(inet_getname);
702 719
703int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 720int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
704 size_t size) 721 size_t size)
705{ 722{
706 struct sock *sk = sock->sk; 723 struct sock *sk = sock->sk;
707 724
725 sock_rps_record_flow(sk);
726
708 /* We may need to bind the socket. */ 727 /* We may need to bind the socket. */
709 if (!inet_sk(sk)->num && inet_autobind(sk)) 728 if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
729 inet_autobind(sk))
710 return -EAGAIN; 730 return -EAGAIN;
711 731
712 return sk->sk_prot->sendmsg(iocb, sk, msg, size); 732 return sk->sk_prot->sendmsg(iocb, sk, msg, size);
713} 733}
734EXPORT_SYMBOL(inet_sendmsg);
714 735
715 736ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
716static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 737 size_t size, int flags)
717{ 738{
718 struct sock *sk = sock->sk; 739 struct sock *sk = sock->sk;
719 740
741 sock_rps_record_flow(sk);
742
720 /* We may need to bind the socket. */ 743 /* We may need to bind the socket. */
721 if (!inet_sk(sk)->num && inet_autobind(sk)) 744 if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
745 inet_autobind(sk))
722 return -EAGAIN; 746 return -EAGAIN;
723 747
724 if (sk->sk_prot->sendpage) 748 if (sk->sk_prot->sendpage)
725 return sk->sk_prot->sendpage(sk, page, offset, size, flags); 749 return sk->sk_prot->sendpage(sk, page, offset, size, flags);
726 return sock_no_sendpage(sock, page, offset, size, flags); 750 return sock_no_sendpage(sock, page, offset, size, flags);
727} 751}
752EXPORT_SYMBOL(inet_sendpage);
753
754int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
755 size_t size, int flags)
756{
757 struct sock *sk = sock->sk;
758 int addr_len = 0;
759 int err;
728 760
761 sock_rps_record_flow(sk);
762
763 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
764 flags & ~MSG_DONTWAIT, &addr_len);
765 if (err >= 0)
766 msg->msg_namelen = addr_len;
767 return err;
768}
769EXPORT_SYMBOL(inet_recvmsg);
729 770
730int inet_shutdown(struct socket *sock, int how) 771int inet_shutdown(struct socket *sock, int how)
731{ 772{
@@ -780,6 +821,7 @@ int inet_shutdown(struct socket *sock, int how)
780 release_sock(sk); 821 release_sock(sk);
781 return err; 822 return err;
782} 823}
824EXPORT_SYMBOL(inet_shutdown);
783 825
784/* 826/*
785 * ioctl() calls you can issue on an INET socket. Most of these are 827 * ioctl() calls you can issue on an INET socket. Most of these are
@@ -798,44 +840,45 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
798 struct net *net = sock_net(sk); 840 struct net *net = sock_net(sk);
799 841
800 switch (cmd) { 842 switch (cmd) {
801 case SIOCGSTAMP: 843 case SIOCGSTAMP:
802 err = sock_get_timestamp(sk, (struct timeval __user *)arg); 844 err = sock_get_timestamp(sk, (struct timeval __user *)arg);
803 break; 845 break;
804 case SIOCGSTAMPNS: 846 case SIOCGSTAMPNS:
805 err = sock_get_timestampns(sk, (struct timespec __user *)arg); 847 err = sock_get_timestampns(sk, (struct timespec __user *)arg);
806 break; 848 break;
807 case SIOCADDRT: 849 case SIOCADDRT:
808 case SIOCDELRT: 850 case SIOCDELRT:
809 case SIOCRTMSG: 851 case SIOCRTMSG:
810 err = ip_rt_ioctl(net, cmd, (void __user *)arg); 852 err = ip_rt_ioctl(net, cmd, (void __user *)arg);
811 break; 853 break;
812 case SIOCDARP: 854 case SIOCDARP:
813 case SIOCGARP: 855 case SIOCGARP:
814 case SIOCSARP: 856 case SIOCSARP:
815 err = arp_ioctl(net, cmd, (void __user *)arg); 857 err = arp_ioctl(net, cmd, (void __user *)arg);
816 break; 858 break;
817 case SIOCGIFADDR: 859 case SIOCGIFADDR:
818 case SIOCSIFADDR: 860 case SIOCSIFADDR:
819 case SIOCGIFBRDADDR: 861 case SIOCGIFBRDADDR:
820 case SIOCSIFBRDADDR: 862 case SIOCSIFBRDADDR:
821 case SIOCGIFNETMASK: 863 case SIOCGIFNETMASK:
822 case SIOCSIFNETMASK: 864 case SIOCSIFNETMASK:
823 case SIOCGIFDSTADDR: 865 case SIOCGIFDSTADDR:
824 case SIOCSIFDSTADDR: 866 case SIOCSIFDSTADDR:
825 case SIOCSIFPFLAGS: 867 case SIOCSIFPFLAGS:
826 case SIOCGIFPFLAGS: 868 case SIOCGIFPFLAGS:
827 case SIOCSIFFLAGS: 869 case SIOCSIFFLAGS:
828 err = devinet_ioctl(net, cmd, (void __user *)arg); 870 err = devinet_ioctl(net, cmd, (void __user *)arg);
829 break; 871 break;
830 default: 872 default:
831 if (sk->sk_prot->ioctl) 873 if (sk->sk_prot->ioctl)
832 err = sk->sk_prot->ioctl(sk, cmd, arg); 874 err = sk->sk_prot->ioctl(sk, cmd, arg);
833 else 875 else
834 err = -ENOIOCTLCMD; 876 err = -ENOIOCTLCMD;
835 break; 877 break;
836 } 878 }
837 return err; 879 return err;
838} 880}
881EXPORT_SYMBOL(inet_ioctl);
839 882
840const struct proto_ops inet_stream_ops = { 883const struct proto_ops inet_stream_ops = {
841 .family = PF_INET, 884 .family = PF_INET,
@@ -852,16 +895,17 @@ const struct proto_ops inet_stream_ops = {
852 .shutdown = inet_shutdown, 895 .shutdown = inet_shutdown,
853 .setsockopt = sock_common_setsockopt, 896 .setsockopt = sock_common_setsockopt,
854 .getsockopt = sock_common_getsockopt, 897 .getsockopt = sock_common_getsockopt,
855 .sendmsg = tcp_sendmsg, 898 .sendmsg = inet_sendmsg,
856 .recvmsg = sock_common_recvmsg, 899 .recvmsg = inet_recvmsg,
857 .mmap = sock_no_mmap, 900 .mmap = sock_no_mmap,
858 .sendpage = tcp_sendpage, 901 .sendpage = inet_sendpage,
859 .splice_read = tcp_splice_read, 902 .splice_read = tcp_splice_read,
860#ifdef CONFIG_COMPAT 903#ifdef CONFIG_COMPAT
861 .compat_setsockopt = compat_sock_common_setsockopt, 904 .compat_setsockopt = compat_sock_common_setsockopt,
862 .compat_getsockopt = compat_sock_common_getsockopt, 905 .compat_getsockopt = compat_sock_common_getsockopt,
863#endif 906#endif
864}; 907};
908EXPORT_SYMBOL(inet_stream_ops);
865 909
866const struct proto_ops inet_dgram_ops = { 910const struct proto_ops inet_dgram_ops = {
867 .family = PF_INET, 911 .family = PF_INET,
@@ -879,7 +923,7 @@ const struct proto_ops inet_dgram_ops = {
879 .setsockopt = sock_common_setsockopt, 923 .setsockopt = sock_common_setsockopt,
880 .getsockopt = sock_common_getsockopt, 924 .getsockopt = sock_common_getsockopt,
881 .sendmsg = inet_sendmsg, 925 .sendmsg = inet_sendmsg,
882 .recvmsg = sock_common_recvmsg, 926 .recvmsg = inet_recvmsg,
883 .mmap = sock_no_mmap, 927 .mmap = sock_no_mmap,
884 .sendpage = inet_sendpage, 928 .sendpage = inet_sendpage,
885#ifdef CONFIG_COMPAT 929#ifdef CONFIG_COMPAT
@@ -887,6 +931,7 @@ const struct proto_ops inet_dgram_ops = {
887 .compat_getsockopt = compat_sock_common_getsockopt, 931 .compat_getsockopt = compat_sock_common_getsockopt,
888#endif 932#endif
889}; 933};
934EXPORT_SYMBOL(inet_dgram_ops);
890 935
891/* 936/*
892 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without 937 * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
@@ -908,7 +953,7 @@ static const struct proto_ops inet_sockraw_ops = {
908 .setsockopt = sock_common_setsockopt, 953 .setsockopt = sock_common_setsockopt,
909 .getsockopt = sock_common_getsockopt, 954 .getsockopt = sock_common_getsockopt,
910 .sendmsg = inet_sendmsg, 955 .sendmsg = inet_sendmsg,
911 .recvmsg = sock_common_recvmsg, 956 .recvmsg = inet_recvmsg,
912 .mmap = sock_no_mmap, 957 .mmap = sock_no_mmap,
913 .sendpage = inet_sendpage, 958 .sendpage = inet_sendpage,
914#ifdef CONFIG_COMPAT 959#ifdef CONFIG_COMPAT
@@ -917,7 +962,7 @@ static const struct proto_ops inet_sockraw_ops = {
917#endif 962#endif
918}; 963};
919 964
920static struct net_proto_family inet_family_ops = { 965static const struct net_proto_family inet_family_ops = {
921 .family = PF_INET, 966 .family = PF_INET,
922 .create = inet_create, 967 .create = inet_create,
923 .owner = THIS_MODULE, 968 .owner = THIS_MODULE,
@@ -933,7 +978,6 @@ static struct inet_protosw inetsw_array[] =
933 .protocol = IPPROTO_TCP, 978 .protocol = IPPROTO_TCP,
934 .prot = &tcp_prot, 979 .prot = &tcp_prot,
935 .ops = &inet_stream_ops, 980 .ops = &inet_stream_ops,
936 .capability = -1,
937 .no_check = 0, 981 .no_check = 0,
938 .flags = INET_PROTOSW_PERMANENT | 982 .flags = INET_PROTOSW_PERMANENT |
939 INET_PROTOSW_ICSK, 983 INET_PROTOSW_ICSK,
@@ -944,7 +988,6 @@ static struct inet_protosw inetsw_array[] =
944 .protocol = IPPROTO_UDP, 988 .protocol = IPPROTO_UDP,
945 .prot = &udp_prot, 989 .prot = &udp_prot,
946 .ops = &inet_dgram_ops, 990 .ops = &inet_dgram_ops,
947 .capability = -1,
948 .no_check = UDP_CSUM_DEFAULT, 991 .no_check = UDP_CSUM_DEFAULT,
949 .flags = INET_PROTOSW_PERMANENT, 992 .flags = INET_PROTOSW_PERMANENT,
950 }, 993 },
@@ -955,7 +998,6 @@ static struct inet_protosw inetsw_array[] =
955 .protocol = IPPROTO_IP, /* wild card */ 998 .protocol = IPPROTO_IP, /* wild card */
956 .prot = &raw_prot, 999 .prot = &raw_prot,
957 .ops = &inet_sockraw_ops, 1000 .ops = &inet_sockraw_ops,
958 .capability = CAP_NET_RAW,
959 .no_check = UDP_CSUM_DEFAULT, 1001 .no_check = UDP_CSUM_DEFAULT,
960 .flags = INET_PROTOSW_REUSE, 1002 .flags = INET_PROTOSW_REUSE,
961 } 1003 }
@@ -1016,6 +1058,7 @@ out_illegal:
1016 p->type); 1058 p->type);
1017 goto out; 1059 goto out;
1018} 1060}
1061EXPORT_SYMBOL(inet_register_protosw);
1019 1062
1020void inet_unregister_protosw(struct inet_protosw *p) 1063void inet_unregister_protosw(struct inet_protosw *p)
1021{ 1064{
@@ -1031,6 +1074,7 @@ void inet_unregister_protosw(struct inet_protosw *p)
1031 synchronize_net(); 1074 synchronize_net();
1032 } 1075 }
1033} 1076}
1077EXPORT_SYMBOL(inet_unregister_protosw);
1034 1078
1035/* 1079/*
1036 * Shall we try to damage output packets if routing dev changes? 1080 * Shall we try to damage output packets if routing dev changes?
@@ -1043,9 +1087,9 @@ static int inet_sk_reselect_saddr(struct sock *sk)
1043 struct inet_sock *inet = inet_sk(sk); 1087 struct inet_sock *inet = inet_sk(sk);
1044 int err; 1088 int err;
1045 struct rtable *rt; 1089 struct rtable *rt;
1046 __be32 old_saddr = inet->saddr; 1090 __be32 old_saddr = inet->inet_saddr;
1047 __be32 new_saddr; 1091 __be32 new_saddr;
1048 __be32 daddr = inet->daddr; 1092 __be32 daddr = inet->inet_daddr;
1049 1093
1050 if (inet->opt && inet->opt->srr) 1094 if (inet->opt && inet->opt->srr)
1051 daddr = inet->opt->faddr; 1095 daddr = inet->opt->faddr;
@@ -1055,11 +1099,11 @@ static int inet_sk_reselect_saddr(struct sock *sk)
1055 RT_CONN_FLAGS(sk), 1099 RT_CONN_FLAGS(sk),
1056 sk->sk_bound_dev_if, 1100 sk->sk_bound_dev_if,
1057 sk->sk_protocol, 1101 sk->sk_protocol,
1058 inet->sport, inet->dport, sk, 0); 1102 inet->inet_sport, inet->inet_dport, sk, 0);
1059 if (err) 1103 if (err)
1060 return err; 1104 return err;
1061 1105
1062 sk_setup_caps(sk, &rt->u.dst); 1106 sk_setup_caps(sk, &rt->dst);
1063 1107
1064 new_saddr = rt->rt_src; 1108 new_saddr = rt->rt_src;
1065 1109
@@ -1071,7 +1115,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
1071 __func__, &old_saddr, &new_saddr); 1115 __func__, &old_saddr, &new_saddr);
1072 } 1116 }
1073 1117
1074 inet->saddr = inet->rcv_saddr = new_saddr; 1118 inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
1075 1119
1076 /* 1120 /*
1077 * XXX The only one ugly spot where we need to 1121 * XXX The only one ugly spot where we need to
@@ -1097,34 +1141,27 @@ int inet_sk_rebuild_header(struct sock *sk)
1097 return 0; 1141 return 0;
1098 1142
1099 /* Reroute. */ 1143 /* Reroute. */
1100 daddr = inet->daddr; 1144 daddr = inet->inet_daddr;
1101 if (inet->opt && inet->opt->srr) 1145 if (inet->opt && inet->opt->srr)
1102 daddr = inet->opt->faddr; 1146 daddr = inet->opt->faddr;
1103{ 1147{
1104 struct flowi fl = { 1148 struct flowi fl = {
1105 .oif = sk->sk_bound_dev_if, 1149 .oif = sk->sk_bound_dev_if,
1106 .nl_u = { 1150 .mark = sk->sk_mark,
1107 .ip4_u = { 1151 .fl4_dst = daddr,
1108 .daddr = daddr, 1152 .fl4_src = inet->inet_saddr,
1109 .saddr = inet->saddr, 1153 .fl4_tos = RT_CONN_FLAGS(sk),
1110 .tos = RT_CONN_FLAGS(sk),
1111 },
1112 },
1113 .proto = sk->sk_protocol, 1154 .proto = sk->sk_protocol,
1114 .flags = inet_sk_flowi_flags(sk), 1155 .flags = inet_sk_flowi_flags(sk),
1115 .uli_u = { 1156 .fl_ip_sport = inet->inet_sport,
1116 .ports = { 1157 .fl_ip_dport = inet->inet_dport,
1117 .sport = inet->sport,
1118 .dport = inet->dport,
1119 },
1120 },
1121 }; 1158 };
1122 1159
1123 security_sk_classify_flow(sk, &fl); 1160 security_sk_classify_flow(sk, &fl);
1124 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); 1161 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
1125} 1162}
1126 if (!err) 1163 if (!err)
1127 sk_setup_caps(sk, &rt->u.dst); 1164 sk_setup_caps(sk, &rt->dst);
1128 else { 1165 else {
1129 /* Routing failed... */ 1166 /* Routing failed... */
1130 sk->sk_route_caps = 0; 1167 sk->sk_route_caps = 0;
@@ -1141,13 +1178,12 @@ int inet_sk_rebuild_header(struct sock *sk)
1141 1178
1142 return err; 1179 return err;
1143} 1180}
1144
1145EXPORT_SYMBOL(inet_sk_rebuild_header); 1181EXPORT_SYMBOL(inet_sk_rebuild_header);
1146 1182
1147static int inet_gso_send_check(struct sk_buff *skb) 1183static int inet_gso_send_check(struct sk_buff *skb)
1148{ 1184{
1149 struct iphdr *iph; 1185 struct iphdr *iph;
1150 struct net_protocol *ops; 1186 const struct net_protocol *ops;
1151 int proto; 1187 int proto;
1152 int ihl; 1188 int ihl;
1153 int err = -EINVAL; 1189 int err = -EINVAL;
@@ -1183,10 +1219,11 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
1183{ 1219{
1184 struct sk_buff *segs = ERR_PTR(-EINVAL); 1220 struct sk_buff *segs = ERR_PTR(-EINVAL);
1185 struct iphdr *iph; 1221 struct iphdr *iph;
1186 struct net_protocol *ops; 1222 const struct net_protocol *ops;
1187 int proto; 1223 int proto;
1188 int ihl; 1224 int ihl;
1189 int id; 1225 int id;
1226 unsigned int offset = 0;
1190 1227
1191 if (!(features & NETIF_F_V4_CSUM)) 1228 if (!(features & NETIF_F_V4_CSUM))
1192 features &= ~NETIF_F_SG; 1229 features &= ~NETIF_F_SG;
@@ -1229,7 +1266,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
1229 skb = segs; 1266 skb = segs;
1230 do { 1267 do {
1231 iph = ip_hdr(skb); 1268 iph = ip_hdr(skb);
1232 iph->id = htons(id++); 1269 if (proto == IPPROTO_UDP) {
1270 iph->id = htons(id);
1271 iph->frag_off = htons(offset >> 3);
1272 if (skb->next != NULL)
1273 iph->frag_off |= htons(IP_MF);
1274 offset += (skb->len - skb->mac_len - iph->ihl * 4);
1275 } else
1276 iph->id = htons(id++);
1233 iph->tot_len = htons(skb->len - skb->mac_len); 1277 iph->tot_len = htons(skb->len - skb->mac_len);
1234 iph->check = 0; 1278 iph->check = 0;
1235 iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); 1279 iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
@@ -1242,7 +1286,7 @@ out:
1242static struct sk_buff **inet_gro_receive(struct sk_buff **head, 1286static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1243 struct sk_buff *skb) 1287 struct sk_buff *skb)
1244{ 1288{
1245 struct net_protocol *ops; 1289 const struct net_protocol *ops;
1246 struct sk_buff **pp = NULL; 1290 struct sk_buff **pp = NULL;
1247 struct sk_buff *p; 1291 struct sk_buff *p;
1248 struct iphdr *iph; 1292 struct iphdr *iph;
@@ -1274,8 +1318,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1274 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) 1318 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
1275 goto out_unlock; 1319 goto out_unlock;
1276 1320
1277 id = ntohl(*(u32 *)&iph->id); 1321 id = ntohl(*(__be32 *)&iph->id);
1278 flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF)); 1322 flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
1279 id >>= 16; 1323 id >>= 16;
1280 1324
1281 for (p = *head; p; p = p->next) { 1325 for (p = *head; p; p = p->next) {
@@ -1288,8 +1332,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1288 1332
1289 if ((iph->protocol ^ iph2->protocol) | 1333 if ((iph->protocol ^ iph2->protocol) |
1290 (iph->tos ^ iph2->tos) | 1334 (iph->tos ^ iph2->tos) |
1291 (iph->saddr ^ iph2->saddr) | 1335 ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
1292 (iph->daddr ^ iph2->daddr)) { 1336 ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
1293 NAPI_GRO_CB(p)->same_flow = 0; 1337 NAPI_GRO_CB(p)->same_flow = 0;
1294 continue; 1338 continue;
1295 } 1339 }
@@ -1319,7 +1363,7 @@ out:
1319 1363
1320static int inet_gro_complete(struct sk_buff *skb) 1364static int inet_gro_complete(struct sk_buff *skb)
1321{ 1365{
1322 struct net_protocol *ops; 1366 const struct net_protocol *ops;
1323 struct iphdr *iph = ip_hdr(skb); 1367 struct iphdr *iph = ip_hdr(skb);
1324 int proto = iph->protocol & (MAX_INET_PROTOS - 1); 1368 int proto = iph->protocol & (MAX_INET_PROTOS - 1);
1325 int err = -ENOSYS; 1369 int err = -ENOSYS;
@@ -1361,10 +1405,9 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
1361 } 1405 }
1362 return rc; 1406 return rc;
1363} 1407}
1364
1365EXPORT_SYMBOL_GPL(inet_ctl_sock_create); 1408EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
1366 1409
1367unsigned long snmp_fold_field(void *mib[], int offt) 1410unsigned long snmp_fold_field(void __percpu *mib[], int offt)
1368{ 1411{
1369 unsigned long res = 0; 1412 unsigned long res = 0;
1370 int i; 1413 int i;
@@ -1377,13 +1420,49 @@ unsigned long snmp_fold_field(void *mib[], int offt)
1377} 1420}
1378EXPORT_SYMBOL_GPL(snmp_fold_field); 1421EXPORT_SYMBOL_GPL(snmp_fold_field);
1379 1422
1380int snmp_mib_init(void *ptr[2], size_t mibsize) 1423#if BITS_PER_LONG==32
1424
1425u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
1426{
1427 u64 res = 0;
1428 int cpu;
1429
1430 for_each_possible_cpu(cpu) {
1431 void *bhptr, *userptr;
1432 struct u64_stats_sync *syncp;
1433 u64 v_bh, v_user;
1434 unsigned int start;
1435
1436 /* first mib used by softirq context, we must use _bh() accessors */
1437 bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
1438 syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
1439 do {
1440 start = u64_stats_fetch_begin_bh(syncp);
1441 v_bh = *(((u64 *) bhptr) + offt);
1442 } while (u64_stats_fetch_retry_bh(syncp, start));
1443
1444 /* second mib used in USER context */
1445 userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
1446 syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
1447 do {
1448 start = u64_stats_fetch_begin(syncp);
1449 v_user = *(((u64 *) userptr) + offt);
1450 } while (u64_stats_fetch_retry(syncp, start));
1451
1452 res += v_bh + v_user;
1453 }
1454 return res;
1455}
1456EXPORT_SYMBOL_GPL(snmp_fold_field64);
1457#endif
1458
1459int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
1381{ 1460{
1382 BUG_ON(ptr == NULL); 1461 BUG_ON(ptr == NULL);
1383 ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); 1462 ptr[0] = __alloc_percpu(mibsize, align);
1384 if (!ptr[0]) 1463 if (!ptr[0])
1385 goto err0; 1464 goto err0;
1386 ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); 1465 ptr[1] = __alloc_percpu(mibsize, align);
1387 if (!ptr[1]) 1466 if (!ptr[1])
1388 goto err1; 1467 goto err1;
1389 return 0; 1468 return 0;
@@ -1395,7 +1474,7 @@ err0:
1395} 1474}
1396EXPORT_SYMBOL_GPL(snmp_mib_init); 1475EXPORT_SYMBOL_GPL(snmp_mib_init);
1397 1476
1398void snmp_mib_free(void *ptr[2]) 1477void snmp_mib_free(void __percpu *ptr[2])
1399{ 1478{
1400 BUG_ON(ptr == NULL); 1479 BUG_ON(ptr == NULL);
1401 free_percpu(ptr[0]); 1480 free_percpu(ptr[0]);
@@ -1405,13 +1484,13 @@ void snmp_mib_free(void *ptr[2])
1405EXPORT_SYMBOL_GPL(snmp_mib_free); 1484EXPORT_SYMBOL_GPL(snmp_mib_free);
1406 1485
1407#ifdef CONFIG_IP_MULTICAST 1486#ifdef CONFIG_IP_MULTICAST
1408static struct net_protocol igmp_protocol = { 1487static const struct net_protocol igmp_protocol = {
1409 .handler = igmp_rcv, 1488 .handler = igmp_rcv,
1410 .netns_ok = 1, 1489 .netns_ok = 1,
1411}; 1490};
1412#endif 1491#endif
1413 1492
1414static struct net_protocol tcp_protocol = { 1493static const struct net_protocol tcp_protocol = {
1415 .handler = tcp_v4_rcv, 1494 .handler = tcp_v4_rcv,
1416 .err_handler = tcp_v4_err, 1495 .err_handler = tcp_v4_err,
1417 .gso_send_check = tcp_v4_gso_send_check, 1496 .gso_send_check = tcp_v4_gso_send_check,
@@ -1422,14 +1501,16 @@ static struct net_protocol tcp_protocol = {
1422 .netns_ok = 1, 1501 .netns_ok = 1,
1423}; 1502};
1424 1503
1425static struct net_protocol udp_protocol = { 1504static const struct net_protocol udp_protocol = {
1426 .handler = udp_rcv, 1505 .handler = udp_rcv,
1427 .err_handler = udp_err, 1506 .err_handler = udp_err,
1507 .gso_send_check = udp4_ufo_send_check,
1508 .gso_segment = udp4_ufo_fragment,
1428 .no_policy = 1, 1509 .no_policy = 1,
1429 .netns_ok = 1, 1510 .netns_ok = 1,
1430}; 1511};
1431 1512
1432static struct net_protocol icmp_protocol = { 1513static const struct net_protocol icmp_protocol = {
1433 .handler = icmp_rcv, 1514 .handler = icmp_rcv,
1434 .no_policy = 1, 1515 .no_policy = 1,
1435 .netns_ok = 1, 1516 .netns_ok = 1,
@@ -1437,56 +1518,63 @@ static struct net_protocol icmp_protocol = {
1437 1518
1438static __net_init int ipv4_mib_init_net(struct net *net) 1519static __net_init int ipv4_mib_init_net(struct net *net)
1439{ 1520{
1440 if (snmp_mib_init((void **)net->mib.tcp_statistics, 1521 if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
1441 sizeof(struct tcp_mib)) < 0) 1522 sizeof(struct tcp_mib),
1523 __alignof__(struct tcp_mib)) < 0)
1442 goto err_tcp_mib; 1524 goto err_tcp_mib;
1443 if (snmp_mib_init((void **)net->mib.ip_statistics, 1525 if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
1444 sizeof(struct ipstats_mib)) < 0) 1526 sizeof(struct ipstats_mib),
1527 __alignof__(struct ipstats_mib)) < 0)
1445 goto err_ip_mib; 1528 goto err_ip_mib;
1446 if (snmp_mib_init((void **)net->mib.net_statistics, 1529 if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
1447 sizeof(struct linux_mib)) < 0) 1530 sizeof(struct linux_mib),
1531 __alignof__(struct linux_mib)) < 0)
1448 goto err_net_mib; 1532 goto err_net_mib;
1449 if (snmp_mib_init((void **)net->mib.udp_statistics, 1533 if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
1450 sizeof(struct udp_mib)) < 0) 1534 sizeof(struct udp_mib),
1535 __alignof__(struct udp_mib)) < 0)
1451 goto err_udp_mib; 1536 goto err_udp_mib;
1452 if (snmp_mib_init((void **)net->mib.udplite_statistics, 1537 if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
1453 sizeof(struct udp_mib)) < 0) 1538 sizeof(struct udp_mib),
1539 __alignof__(struct udp_mib)) < 0)
1454 goto err_udplite_mib; 1540 goto err_udplite_mib;
1455 if (snmp_mib_init((void **)net->mib.icmp_statistics, 1541 if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
1456 sizeof(struct icmp_mib)) < 0) 1542 sizeof(struct icmp_mib),
1543 __alignof__(struct icmp_mib)) < 0)
1457 goto err_icmp_mib; 1544 goto err_icmp_mib;
1458 if (snmp_mib_init((void **)net->mib.icmpmsg_statistics, 1545 if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
1459 sizeof(struct icmpmsg_mib)) < 0) 1546 sizeof(struct icmpmsg_mib),
1547 __alignof__(struct icmpmsg_mib)) < 0)
1460 goto err_icmpmsg_mib; 1548 goto err_icmpmsg_mib;
1461 1549
1462 tcp_mib_init(net); 1550 tcp_mib_init(net);
1463 return 0; 1551 return 0;
1464 1552
1465err_icmpmsg_mib: 1553err_icmpmsg_mib:
1466 snmp_mib_free((void **)net->mib.icmp_statistics); 1554 snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
1467err_icmp_mib: 1555err_icmp_mib:
1468 snmp_mib_free((void **)net->mib.udplite_statistics); 1556 snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
1469err_udplite_mib: 1557err_udplite_mib:
1470 snmp_mib_free((void **)net->mib.udp_statistics); 1558 snmp_mib_free((void __percpu **)net->mib.udp_statistics);
1471err_udp_mib: 1559err_udp_mib:
1472 snmp_mib_free((void **)net->mib.net_statistics); 1560 snmp_mib_free((void __percpu **)net->mib.net_statistics);
1473err_net_mib: 1561err_net_mib:
1474 snmp_mib_free((void **)net->mib.ip_statistics); 1562 snmp_mib_free((void __percpu **)net->mib.ip_statistics);
1475err_ip_mib: 1563err_ip_mib:
1476 snmp_mib_free((void **)net->mib.tcp_statistics); 1564 snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
1477err_tcp_mib: 1565err_tcp_mib:
1478 return -ENOMEM; 1566 return -ENOMEM;
1479} 1567}
1480 1568
1481static __net_exit void ipv4_mib_exit_net(struct net *net) 1569static __net_exit void ipv4_mib_exit_net(struct net *net)
1482{ 1570{
1483 snmp_mib_free((void **)net->mib.icmpmsg_statistics); 1571 snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
1484 snmp_mib_free((void **)net->mib.icmp_statistics); 1572 snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
1485 snmp_mib_free((void **)net->mib.udplite_statistics); 1573 snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
1486 snmp_mib_free((void **)net->mib.udp_statistics); 1574 snmp_mib_free((void __percpu **)net->mib.udp_statistics);
1487 snmp_mib_free((void **)net->mib.net_statistics); 1575 snmp_mib_free((void __percpu **)net->mib.net_statistics);
1488 snmp_mib_free((void **)net->mib.ip_statistics); 1576 snmp_mib_free((void __percpu **)net->mib.ip_statistics);
1489 snmp_mib_free((void **)net->mib.tcp_statistics); 1577 snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
1490} 1578}
1491 1579
1492static __net_initdata struct pernet_operations ipv4_mib_ops = { 1580static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1523,9 +1611,13 @@ static int __init inet_init(void)
1523 1611
1524 BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)); 1612 BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
1525 1613
1614 sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
1615 if (!sysctl_local_reserved_ports)
1616 goto out;
1617
1526 rc = proto_register(&tcp_prot, 1); 1618 rc = proto_register(&tcp_prot, 1);
1527 if (rc) 1619 if (rc)
1528 goto out; 1620 goto out_free_reserved_ports;
1529 1621
1530 rc = proto_register(&udp_prot, 1); 1622 rc = proto_register(&udp_prot, 1);
1531 if (rc) 1623 if (rc)
@@ -1624,6 +1716,8 @@ out_unregister_udp_proto:
1624 proto_unregister(&udp_prot); 1716 proto_unregister(&udp_prot);
1625out_unregister_tcp_proto: 1717out_unregister_tcp_proto:
1626 proto_unregister(&tcp_prot); 1718 proto_unregister(&tcp_prot);
1719out_free_reserved_ports:
1720 kfree(sysctl_local_reserved_ports);
1627 goto out; 1721 goto out;
1628} 1722}
1629 1723
@@ -1666,19 +1760,3 @@ static int __init ipv4_proc_init(void)
1666 1760
1667MODULE_ALIAS_NETPROTO(PF_INET); 1761MODULE_ALIAS_NETPROTO(PF_INET);
1668 1762
1669EXPORT_SYMBOL(inet_accept);
1670EXPORT_SYMBOL(inet_bind);
1671EXPORT_SYMBOL(inet_dgram_connect);
1672EXPORT_SYMBOL(inet_dgram_ops);
1673EXPORT_SYMBOL(inet_getname);
1674EXPORT_SYMBOL(inet_ioctl);
1675EXPORT_SYMBOL(inet_listen);
1676EXPORT_SYMBOL(inet_register_protosw);
1677EXPORT_SYMBOL(inet_release);
1678EXPORT_SYMBOL(inet_sendmsg);
1679EXPORT_SYMBOL(inet_shutdown);
1680EXPORT_SYMBOL(inet_sock_destruct);
1681EXPORT_SYMBOL(inet_stream_connect);
1682EXPORT_SYMBOL(inet_stream_ops);
1683EXPORT_SYMBOL(inet_unregister_protosw);
1684EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index e878e494296e..86961bec70ab 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,14 +1,73 @@
1#include <crypto/hash.h>
1#include <linux/err.h> 2#include <linux/err.h>
2#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h>
3#include <net/ip.h> 5#include <net/ip.h>
4#include <net/xfrm.h> 6#include <net/xfrm.h>
5#include <net/ah.h> 7#include <net/ah.h>
6#include <linux/crypto.h> 8#include <linux/crypto.h>
7#include <linux/pfkeyv2.h> 9#include <linux/pfkeyv2.h>
8#include <linux/spinlock.h> 10#include <linux/scatterlist.h>
9#include <net/icmp.h> 11#include <net/icmp.h>
10#include <net/protocol.h> 12#include <net/protocol.h>
11 13
14struct ah_skb_cb {
15 struct xfrm_skb_cb xfrm;
16 void *tmp;
17};
18
19#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
20
21static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
22 unsigned int size)
23{
24 unsigned int len;
25
26 len = size + crypto_ahash_digestsize(ahash) +
27 (crypto_ahash_alignmask(ahash) &
28 ~(crypto_tfm_ctx_alignment() - 1));
29
30 len = ALIGN(len, crypto_tfm_ctx_alignment());
31
32 len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
33 len = ALIGN(len, __alignof__(struct scatterlist));
34
35 len += sizeof(struct scatterlist) * nfrags;
36
37 return kmalloc(len, GFP_ATOMIC);
38}
39
40static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
41{
42 return tmp + offset;
43}
44
45static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
46 unsigned int offset)
47{
48 return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
49}
50
51static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
52 u8 *icv)
53{
54 struct ahash_request *req;
55
56 req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
57 crypto_tfm_ctx_alignment());
58
59 ahash_request_set_tfm(req, ahash);
60
61 return req;
62}
63
64static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
65 struct ahash_request *req)
66{
67 return (void *)ALIGN((unsigned long)(req + 1) +
68 crypto_ahash_reqsize(ahash),
69 __alignof__(struct scatterlist));
70}
12 71
13/* Clear mutable options and find final destination to substitute 72/* Clear mutable options and find final destination to substitute
14 * into IP header for icv calculation. Options are already checked 73 * into IP header for icv calculation. Options are already checked
@@ -54,20 +113,72 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
54 return 0; 113 return 0;
55} 114}
56 115
116static void ah_output_done(struct crypto_async_request *base, int err)
117{
118 u8 *icv;
119 struct iphdr *iph;
120 struct sk_buff *skb = base->data;
121 struct xfrm_state *x = skb_dst(skb)->xfrm;
122 struct ah_data *ahp = x->data;
123 struct iphdr *top_iph = ip_hdr(skb);
124 struct ip_auth_hdr *ah = ip_auth_hdr(skb);
125 int ihl = ip_hdrlen(skb);
126
127 iph = AH_SKB_CB(skb)->tmp;
128 icv = ah_tmp_icv(ahp->ahash, iph, ihl);
129 memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
130
131 top_iph->tos = iph->tos;
132 top_iph->ttl = iph->ttl;
133 top_iph->frag_off = iph->frag_off;
134 if (top_iph->ihl != 5) {
135 top_iph->daddr = iph->daddr;
136 memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
137 }
138
139 err = ah->nexthdr;
140
141 kfree(AH_SKB_CB(skb)->tmp);
142 xfrm_output_resume(skb, err);
143}
144
57static int ah_output(struct xfrm_state *x, struct sk_buff *skb) 145static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
58{ 146{
59 int err; 147 int err;
148 int nfrags;
149 int ihl;
150 u8 *icv;
151 struct sk_buff *trailer;
152 struct crypto_ahash *ahash;
153 struct ahash_request *req;
154 struct scatterlist *sg;
60 struct iphdr *iph, *top_iph; 155 struct iphdr *iph, *top_iph;
61 struct ip_auth_hdr *ah; 156 struct ip_auth_hdr *ah;
62 struct ah_data *ahp; 157 struct ah_data *ahp;
63 union { 158
64 struct iphdr iph; 159 ahp = x->data;
65 char buf[60]; 160 ahash = ahp->ahash;
66 } tmp_iph; 161
162 if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
163 goto out;
164 nfrags = err;
67 165
68 skb_push(skb, -skb_network_offset(skb)); 166 skb_push(skb, -skb_network_offset(skb));
167 ah = ip_auth_hdr(skb);
168 ihl = ip_hdrlen(skb);
169
170 err = -ENOMEM;
171 iph = ah_alloc_tmp(ahash, nfrags, ihl);
172 if (!iph)
173 goto out;
174
175 icv = ah_tmp_icv(ahash, iph, ihl);
176 req = ah_tmp_req(ahash, icv);
177 sg = ah_req_sg(ahash, req);
178
179 memset(ah->auth_data, 0, ahp->icv_trunc_len);
180
69 top_iph = ip_hdr(skb); 181 top_iph = ip_hdr(skb);
70 iph = &tmp_iph.iph;
71 182
72 iph->tos = top_iph->tos; 183 iph->tos = top_iph->tos;
73 iph->ttl = top_iph->ttl; 184 iph->ttl = top_iph->ttl;
@@ -78,10 +189,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
78 memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); 189 memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
79 err = ip_clear_mutable_options(top_iph, &top_iph->daddr); 190 err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
80 if (err) 191 if (err)
81 goto error; 192 goto out_free;
82 } 193 }
83 194
84 ah = ip_auth_hdr(skb);
85 ah->nexthdr = *skb_mac_header(skb); 195 ah->nexthdr = *skb_mac_header(skb);
86 *skb_mac_header(skb) = IPPROTO_AH; 196 *skb_mac_header(skb) = IPPROTO_AH;
87 197
@@ -91,20 +201,31 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
91 top_iph->ttl = 0; 201 top_iph->ttl = 0;
92 top_iph->check = 0; 202 top_iph->check = 0;
93 203
94 ahp = x->data;
95 ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; 204 ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
96 205
97 ah->reserved = 0; 206 ah->reserved = 0;
98 ah->spi = x->id.spi; 207 ah->spi = x->id.spi;
99 ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); 208 ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
100 209
101 spin_lock_bh(&x->lock); 210 sg_init_table(sg, nfrags);
102 err = ah_mac_digest(ahp, skb, ah->auth_data); 211 skb_to_sgvec(skb, sg, 0, skb->len);
103 memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
104 spin_unlock_bh(&x->lock);
105 212
106 if (err) 213 ahash_request_set_crypt(req, sg, icv, skb->len);
107 goto error; 214 ahash_request_set_callback(req, 0, ah_output_done, skb);
215
216 AH_SKB_CB(skb)->tmp = iph;
217
218 err = crypto_ahash_digest(req);
219 if (err) {
220 if (err == -EINPROGRESS)
221 goto out;
222
223 if (err == -EBUSY)
224 err = NET_XMIT_DROP;
225 goto out_free;
226 }
227
228 memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
108 229
109 top_iph->tos = iph->tos; 230 top_iph->tos = iph->tos;
110 top_iph->ttl = iph->ttl; 231 top_iph->ttl = iph->ttl;
@@ -114,28 +235,67 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
114 memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); 235 memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
115 } 236 }
116 237
117 err = 0; 238out_free:
118 239 kfree(iph);
119error: 240out:
120 return err; 241 return err;
121} 242}
122 243
244static void ah_input_done(struct crypto_async_request *base, int err)
245{
246 u8 *auth_data;
247 u8 *icv;
248 struct iphdr *work_iph;
249 struct sk_buff *skb = base->data;
250 struct xfrm_state *x = xfrm_input_state(skb);
251 struct ah_data *ahp = x->data;
252 struct ip_auth_hdr *ah = ip_auth_hdr(skb);
253 int ihl = ip_hdrlen(skb);
254 int ah_hlen = (ah->hdrlen + 2) << 2;
255
256 work_iph = AH_SKB_CB(skb)->tmp;
257 auth_data = ah_tmp_auth(work_iph, ihl);
258 icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
259
260 err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
261 if (err)
262 goto out;
263
264 skb->network_header += ah_hlen;
265 memcpy(skb_network_header(skb), work_iph, ihl);
266 __skb_pull(skb, ah_hlen + ihl);
267 skb_set_transport_header(skb, -ihl);
268
269 err = ah->nexthdr;
270out:
271 kfree(AH_SKB_CB(skb)->tmp);
272 xfrm_input_resume(skb, err);
273}
274
123static int ah_input(struct xfrm_state *x, struct sk_buff *skb) 275static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
124{ 276{
125 int ah_hlen; 277 int ah_hlen;
126 int ihl; 278 int ihl;
127 int nexthdr; 279 int nexthdr;
128 int err = -EINVAL; 280 int nfrags;
129 struct iphdr *iph; 281 u8 *auth_data;
282 u8 *icv;
283 struct sk_buff *trailer;
284 struct crypto_ahash *ahash;
285 struct ahash_request *req;
286 struct scatterlist *sg;
287 struct iphdr *iph, *work_iph;
130 struct ip_auth_hdr *ah; 288 struct ip_auth_hdr *ah;
131 struct ah_data *ahp; 289 struct ah_data *ahp;
132 char work_buf[60]; 290 int err = -ENOMEM;
133 291
134 if (!pskb_may_pull(skb, sizeof(*ah))) 292 if (!pskb_may_pull(skb, sizeof(*ah)))
135 goto out; 293 goto out;
136 294
137 ah = (struct ip_auth_hdr *)skb->data; 295 ah = (struct ip_auth_hdr *)skb->data;
138 ahp = x->data; 296 ahp = x->data;
297 ahash = ahp->ahash;
298
139 nexthdr = ah->nexthdr; 299 nexthdr = ah->nexthdr;
140 ah_hlen = (ah->hdrlen + 2) << 2; 300 ah_hlen = (ah->hdrlen + 2) << 2;
141 301
@@ -154,11 +314,27 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
154 314
155 skb->ip_summed = CHECKSUM_NONE; 315 skb->ip_summed = CHECKSUM_NONE;
156 316
317
318 if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
319 goto out;
320 nfrags = err;
321
157 ah = (struct ip_auth_hdr *)skb->data; 322 ah = (struct ip_auth_hdr *)skb->data;
158 iph = ip_hdr(skb); 323 iph = ip_hdr(skb);
324 ihl = ip_hdrlen(skb);
325
326 work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
327 if (!work_iph)
328 goto out;
329
330 auth_data = ah_tmp_auth(work_iph, ihl);
331 icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
332 req = ah_tmp_req(ahash, icv);
333 sg = ah_req_sg(ahash, req);
159 334
160 ihl = skb->data - skb_network_header(skb); 335 memcpy(work_iph, iph, ihl);
161 memcpy(work_buf, iph, ihl); 336 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
337 memset(ah->auth_data, 0, ahp->icv_trunc_len);
162 338
163 iph->ttl = 0; 339 iph->ttl = 0;
164 iph->tos = 0; 340 iph->tos = 0;
@@ -166,35 +342,44 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
166 iph->check = 0; 342 iph->check = 0;
167 if (ihl > sizeof(*iph)) { 343 if (ihl > sizeof(*iph)) {
168 __be32 dummy; 344 __be32 dummy;
169 if (ip_clear_mutable_options(iph, &dummy)) 345 err = ip_clear_mutable_options(iph, &dummy);
170 goto out; 346 if (err)
347 goto out_free;
171 } 348 }
172 349
173 spin_lock(&x->lock); 350 skb_push(skb, ihl);
174 {
175 u8 auth_data[MAX_AH_AUTH_LEN];
176 351
177 memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); 352 sg_init_table(sg, nfrags);
178 skb_push(skb, ihl); 353 skb_to_sgvec(skb, sg, 0, skb->len);
179 err = ah_mac_digest(ahp, skb, ah->auth_data); 354
180 if (err) 355 ahash_request_set_crypt(req, sg, icv, skb->len);
181 goto unlock; 356 ahash_request_set_callback(req, 0, ah_input_done, skb);
182 if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) 357
183 err = -EBADMSG; 358 AH_SKB_CB(skb)->tmp = work_iph;
359
360 err = crypto_ahash_digest(req);
361 if (err) {
362 if (err == -EINPROGRESS)
363 goto out;
364
365 if (err == -EBUSY)
366 err = NET_XMIT_DROP;
367 goto out_free;
184 } 368 }
185unlock:
186 spin_unlock(&x->lock);
187 369
370 err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
188 if (err) 371 if (err)
189 goto out; 372 goto out_free;
190 373
191 skb->network_header += ah_hlen; 374 skb->network_header += ah_hlen;
192 memcpy(skb_network_header(skb), work_buf, ihl); 375 memcpy(skb_network_header(skb), work_iph, ihl);
193 skb->transport_header = skb->network_header;
194 __skb_pull(skb, ah_hlen + ihl); 376 __skb_pull(skb, ah_hlen + ihl);
377 skb_set_transport_header(skb, -ihl);
195 378
196 return nexthdr; 379 err = nexthdr;
197 380
381out_free:
382 kfree (work_iph);
198out: 383out:
199 return err; 384 return err;
200} 385}
@@ -210,7 +395,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
210 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 395 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
211 return; 396 return;
212 397
213 x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); 398 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
214 if (!x) 399 if (!x)
215 return; 400 return;
216 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", 401 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -222,7 +407,7 @@ static int ah_init_state(struct xfrm_state *x)
222{ 407{
223 struct ah_data *ahp = NULL; 408 struct ah_data *ahp = NULL;
224 struct xfrm_algo_desc *aalg_desc; 409 struct xfrm_algo_desc *aalg_desc;
225 struct crypto_hash *tfm; 410 struct crypto_ahash *ahash;
226 411
227 if (!x->aalg) 412 if (!x->aalg)
228 goto error; 413 goto error;
@@ -231,44 +416,40 @@ static int ah_init_state(struct xfrm_state *x)
231 goto error; 416 goto error;
232 417
233 ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); 418 ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
234 if (ahp == NULL) 419 if (!ahp)
235 return -ENOMEM; 420 return -ENOMEM;
236 421
237 tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); 422 ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
238 if (IS_ERR(tfm)) 423 if (IS_ERR(ahash))
239 goto error; 424 goto error;
240 425
241 ahp->tfm = tfm; 426 ahp->ahash = ahash;
242 if (crypto_hash_setkey(tfm, x->aalg->alg_key, 427 if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
243 (x->aalg->alg_key_len + 7) / 8)) 428 (x->aalg->alg_key_len + 7) / 8))
244 goto error; 429 goto error;
245 430
246 /* 431 /*
247 * Lookup the algorithm description maintained by xfrm_algo, 432 * Lookup the algorithm description maintained by xfrm_algo,
248 * verify crypto transform properties, and store information 433 * verify crypto transform properties, and store information
249 * we need for AH processing. This lookup cannot fail here 434 * we need for AH processing. This lookup cannot fail here
250 * after a successful crypto_alloc_hash(). 435 * after a successful crypto_alloc_ahash().
251 */ 436 */
252 aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); 437 aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
253 BUG_ON(!aalg_desc); 438 BUG_ON(!aalg_desc);
254 439
255 if (aalg_desc->uinfo.auth.icv_fullbits/8 != 440 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
256 crypto_hash_digestsize(tfm)) { 441 crypto_ahash_digestsize(ahash)) {
257 printk(KERN_INFO "AH: %s digestsize %u != %hu\n", 442 printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
258 x->aalg->alg_name, crypto_hash_digestsize(tfm), 443 x->aalg->alg_name, crypto_ahash_digestsize(ahash),
259 aalg_desc->uinfo.auth.icv_fullbits/8); 444 aalg_desc->uinfo.auth.icv_fullbits/8);
260 goto error; 445 goto error;
261 } 446 }
262 447
263 ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; 448 ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
264 ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; 449 ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
265 450
266 BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); 451 BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
267 452
268 ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
269 if (!ahp->work_icv)
270 goto error;
271
272 x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + 453 x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
273 ahp->icv_trunc_len); 454 ahp->icv_trunc_len);
274 if (x->props.mode == XFRM_MODE_TUNNEL) 455 if (x->props.mode == XFRM_MODE_TUNNEL)
@@ -279,8 +460,7 @@ static int ah_init_state(struct xfrm_state *x)
279 460
280error: 461error:
281 if (ahp) { 462 if (ahp) {
282 kfree(ahp->work_icv); 463 crypto_free_ahash(ahp->ahash);
283 crypto_free_hash(ahp->tfm);
284 kfree(ahp); 464 kfree(ahp);
285 } 465 }
286 return -EINVAL; 466 return -EINVAL;
@@ -293,8 +473,7 @@ static void ah_destroy(struct xfrm_state *x)
293 if (!ahp) 473 if (!ahp)
294 return; 474 return;
295 475
296 kfree(ahp->work_icv); 476 crypto_free_ahash(ahp->ahash);
297 crypto_free_hash(ahp->tfm);
298 kfree(ahp); 477 kfree(ahp);
299} 478}
300 479
@@ -311,7 +490,7 @@ static const struct xfrm_type ah_type =
311 .output = ah_output 490 .output = ah_output
312}; 491};
313 492
314static struct net_protocol ah4_protocol = { 493static const struct net_protocol ah4_protocol = {
315 .handler = xfrm4_rcv, 494 .handler = xfrm4_rcv,
316 .err_handler = ah4_err, 495 .err_handler = ah4_err,
317 .no_policy = 1, 496 .no_policy = 1,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 8a3881e28aca..04c8b69fd426 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
55 * Stuart Cheshire : Metricom and grat arp fixes 55 * Stuart Cheshire : Metricom and grat arp fixes
56 * *** FOR 2.1 clean this up *** 56 * *** FOR 2.1 clean this up ***
57 * Lawrence V. Stefani: (08/12/96) Added FDDI support. 57 * Lawrence V. Stefani: (08/12/96) Added FDDI support.
58 * Alan Cox : Took the AP1000 nasty FDDI hack and 58 * Alan Cox : Took the AP1000 nasty FDDI hack and
59 * folded into the mainstream FDDI code. 59 * folded into the mainstream FDDI code.
60 * Ack spit, Linus how did you allow that 60 * Ack spit, Linus how did you allow that
61 * one in... 61 * one in...
@@ -70,6 +70,7 @@
70 * bonding can change the skb before 70 * bonding can change the skb before
71 * sending (e.g. insert 8021q tag). 71 * sending (e.g. insert 8021q tag).
72 * Harald Welte : convert to make use of jenkins hash 72 * Harald Welte : convert to make use of jenkins hash
73 * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
73 */ 74 */
74 75
75#include <linux/module.h> 76#include <linux/module.h>
@@ -97,6 +98,7 @@
97#include <linux/net.h> 98#include <linux/net.h>
98#include <linux/rcupdate.h> 99#include <linux/rcupdate.h>
99#include <linux/jhash.h> 100#include <linux/jhash.h>
101#include <linux/slab.h>
100#ifdef CONFIG_SYSCTL 102#ifdef CONFIG_SYSCTL
101#include <linux/sysctl.h> 103#include <linux/sysctl.h>
102#endif 104#endif
@@ -114,23 +116,24 @@
114#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 116#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
115#include <net/atmclip.h> 117#include <net/atmclip.h>
116struct neigh_table *clip_tbl_hook; 118struct neigh_table *clip_tbl_hook;
119EXPORT_SYMBOL(clip_tbl_hook);
117#endif 120#endif
118 121
119#include <asm/system.h> 122#include <asm/system.h>
120#include <asm/uaccess.h> 123#include <linux/uaccess.h>
121 124
122#include <linux/netfilter_arp.h> 125#include <linux/netfilter_arp.h>
123 126
124/* 127/*
125 * Interface to generic neighbour cache. 128 * Interface to generic neighbour cache.
126 */ 129 */
127static u32 arp_hash(const void *pkey, const struct net_device *dev); 130static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
128static int arp_constructor(struct neighbour *neigh); 131static int arp_constructor(struct neighbour *neigh);
129static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); 132static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
130static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); 133static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
131static void parp_redo(struct sk_buff *skb); 134static void parp_redo(struct sk_buff *skb);
132 135
133static struct neigh_ops arp_generic_ops = { 136static const struct neigh_ops arp_generic_ops = {
134 .family = AF_INET, 137 .family = AF_INET,
135 .solicit = arp_solicit, 138 .solicit = arp_solicit,
136 .error_report = arp_error_report, 139 .error_report = arp_error_report,
@@ -140,7 +143,7 @@ static struct neigh_ops arp_generic_ops = {
140 .queue_xmit = dev_queue_xmit, 143 .queue_xmit = dev_queue_xmit,
141}; 144};
142 145
143static struct neigh_ops arp_hh_ops = { 146static const struct neigh_ops arp_hh_ops = {
144 .family = AF_INET, 147 .family = AF_INET,
145 .solicit = arp_solicit, 148 .solicit = arp_solicit,
146 .error_report = arp_error_report, 149 .error_report = arp_error_report,
@@ -150,7 +153,7 @@ static struct neigh_ops arp_hh_ops = {
150 .queue_xmit = dev_queue_xmit, 153 .queue_xmit = dev_queue_xmit,
151}; 154};
152 155
153static struct neigh_ops arp_direct_ops = { 156static const struct neigh_ops arp_direct_ops = {
154 .family = AF_INET, 157 .family = AF_INET,
155 .output = dev_queue_xmit, 158 .output = dev_queue_xmit,
156 .connected_output = dev_queue_xmit, 159 .connected_output = dev_queue_xmit,
@@ -158,7 +161,7 @@ static struct neigh_ops arp_direct_ops = {
158 .queue_xmit = dev_queue_xmit, 161 .queue_xmit = dev_queue_xmit,
159}; 162};
160 163
161struct neigh_ops arp_broken_ops = { 164static const struct neigh_ops arp_broken_ops = {
162 .family = AF_INET, 165 .family = AF_INET,
163 .solicit = arp_solicit, 166 .solicit = arp_solicit,
164 .error_report = arp_error_report, 167 .error_report = arp_error_report,
@@ -169,33 +172,34 @@ struct neigh_ops arp_broken_ops = {
169}; 172};
170 173
171struct neigh_table arp_tbl = { 174struct neigh_table arp_tbl = {
172 .family = AF_INET, 175 .family = AF_INET,
173 .entry_size = sizeof(struct neighbour) + 4, 176 .entry_size = sizeof(struct neighbour) + 4,
174 .key_len = 4, 177 .key_len = 4,
175 .hash = arp_hash, 178 .hash = arp_hash,
176 .constructor = arp_constructor, 179 .constructor = arp_constructor,
177 .proxy_redo = parp_redo, 180 .proxy_redo = parp_redo,
178 .id = "arp_cache", 181 .id = "arp_cache",
179 .parms = { 182 .parms = {
180 .tbl = &arp_tbl, 183 .tbl = &arp_tbl,
181 .base_reachable_time = 30 * HZ, 184 .base_reachable_time = 30 * HZ,
182 .retrans_time = 1 * HZ, 185 .retrans_time = 1 * HZ,
183 .gc_staletime = 60 * HZ, 186 .gc_staletime = 60 * HZ,
184 .reachable_time = 30 * HZ, 187 .reachable_time = 30 * HZ,
185 .delay_probe_time = 5 * HZ, 188 .delay_probe_time = 5 * HZ,
186 .queue_len = 3, 189 .queue_len = 3,
187 .ucast_probes = 3, 190 .ucast_probes = 3,
188 .mcast_probes = 3, 191 .mcast_probes = 3,
189 .anycast_delay = 1 * HZ, 192 .anycast_delay = 1 * HZ,
190 .proxy_delay = (8 * HZ) / 10, 193 .proxy_delay = (8 * HZ) / 10,
191 .proxy_qlen = 64, 194 .proxy_qlen = 64,
192 .locktime = 1 * HZ, 195 .locktime = 1 * HZ,
193 }, 196 },
194 .gc_interval = 30 * HZ, 197 .gc_interval = 30 * HZ,
195 .gc_thresh1 = 128, 198 .gc_thresh1 = 128,
196 .gc_thresh2 = 512, 199 .gc_thresh2 = 512,
197 .gc_thresh3 = 1024, 200 .gc_thresh3 = 1024,
198}; 201};
202EXPORT_SYMBOL(arp_tbl);
199 203
200int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) 204int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
201{ 205{
@@ -221,14 +225,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
221} 225}
222 226
223 227
224static u32 arp_hash(const void *pkey, const struct net_device *dev) 228static u32 arp_hash(const void *pkey,
229 const struct net_device *dev,
230 __u32 hash_rnd)
225{ 231{
226 return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); 232 return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
227} 233}
228 234
229static int arp_constructor(struct neighbour *neigh) 235static int arp_constructor(struct neighbour *neigh)
230{ 236{
231 __be32 addr = *(__be32*)neigh->primary_key; 237 __be32 addr = *(__be32 *)neigh->primary_key;
232 struct net_device *dev = neigh->dev; 238 struct net_device *dev = neigh->dev;
233 struct in_device *in_dev; 239 struct in_device *in_dev;
234 struct neigh_parms *parms; 240 struct neigh_parms *parms;
@@ -291,16 +297,19 @@ static int arp_constructor(struct neighbour *neigh)
291 neigh->ops = &arp_broken_ops; 297 neigh->ops = &arp_broken_ops;
292 neigh->output = neigh->ops->output; 298 neigh->output = neigh->ops->output;
293 return 0; 299 return 0;
300#else
301 break;
294#endif 302#endif
295 ;} 303 }
296#endif 304#endif
297 if (neigh->type == RTN_MULTICAST) { 305 if (neigh->type == RTN_MULTICAST) {
298 neigh->nud_state = NUD_NOARP; 306 neigh->nud_state = NUD_NOARP;
299 arp_mc_map(addr, neigh->ha, dev, 1); 307 arp_mc_map(addr, neigh->ha, dev, 1);
300 } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { 308 } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
301 neigh->nud_state = NUD_NOARP; 309 neigh->nud_state = NUD_NOARP;
302 memcpy(neigh->ha, dev->dev_addr, dev->addr_len); 310 memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
303 } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { 311 } else if (neigh->type == RTN_BROADCAST ||
312 (dev->flags & IFF_POINTOPOINT)) {
304 neigh->nud_state = NUD_NOARP; 313 neigh->nud_state = NUD_NOARP;
305 memcpy(neigh->ha, dev->broadcast, dev->addr_len); 314 memcpy(neigh->ha, dev->broadcast, dev->addr_len);
306 } 315 }
@@ -310,7 +319,7 @@ static int arp_constructor(struct neighbour *neigh)
310 else 319 else
311 neigh->ops = &arp_generic_ops; 320 neigh->ops = &arp_generic_ops;
312 321
313 if (neigh->nud_state&NUD_VALID) 322 if (neigh->nud_state & NUD_VALID)
314 neigh->output = neigh->ops->connected_output; 323 neigh->output = neigh->ops->connected_output;
315 else 324 else
316 neigh->output = neigh->ops->output; 325 neigh->output = neigh->ops->output;
@@ -329,17 +338,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
329 __be32 saddr = 0; 338 __be32 saddr = 0;
330 u8 *dst_ha = NULL; 339 u8 *dst_ha = NULL;
331 struct net_device *dev = neigh->dev; 340 struct net_device *dev = neigh->dev;
332 __be32 target = *(__be32*)neigh->primary_key; 341 __be32 target = *(__be32 *)neigh->primary_key;
333 int probes = atomic_read(&neigh->probes); 342 int probes = atomic_read(&neigh->probes);
334 struct in_device *in_dev = in_dev_get(dev); 343 struct in_device *in_dev;
335 344
336 if (!in_dev) 345 rcu_read_lock();
346 in_dev = __in_dev_get_rcu(dev);
347 if (!in_dev) {
348 rcu_read_unlock();
337 return; 349 return;
338 350 }
339 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { 351 switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
340 default: 352 default:
341 case 0: /* By default announce any local IP */ 353 case 0: /* By default announce any local IP */
342 if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) 354 if (skb && inet_addr_type(dev_net(dev),
355 ip_hdr(skb)->saddr) == RTN_LOCAL)
343 saddr = ip_hdr(skb)->saddr; 356 saddr = ip_hdr(skb)->saddr;
344 break; 357 break;
345 case 1: /* Restrict announcements of saddr in same subnet */ 358 case 1: /* Restrict announcements of saddr in same subnet */
@@ -356,22 +369,26 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
356 case 2: /* Avoid secondary IPs, get a primary/preferred one */ 369 case 2: /* Avoid secondary IPs, get a primary/preferred one */
357 break; 370 break;
358 } 371 }
372 rcu_read_unlock();
359 373
360 if (in_dev)
361 in_dev_put(in_dev);
362 if (!saddr) 374 if (!saddr)
363 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); 375 saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
364 376
365 if ((probes -= neigh->parms->ucast_probes) < 0) { 377 probes -= neigh->parms->ucast_probes;
366 if (!(neigh->nud_state&NUD_VALID)) 378 if (probes < 0) {
367 printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); 379 if (!(neigh->nud_state & NUD_VALID))
380 printk(KERN_DEBUG
381 "trying to ucast probe in NUD_INVALID\n");
368 dst_ha = neigh->ha; 382 dst_ha = neigh->ha;
369 read_lock_bh(&neigh->lock); 383 read_lock_bh(&neigh->lock);
370 } else if ((probes -= neigh->parms->app_probes) < 0) { 384 } else {
385 probes -= neigh->parms->app_probes;
386 if (probes < 0) {
371#ifdef CONFIG_ARPD 387#ifdef CONFIG_ARPD
372 neigh_app_ns(neigh); 388 neigh_app_ns(neigh);
373#endif 389#endif
374 return; 390 return;
391 }
375 } 392 }
376 393
377 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, 394 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -416,8 +433,8 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
416 433
417static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) 434static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
418{ 435{
419 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, 436 struct flowi fl = { .fl4_dst = sip,
420 .saddr = tip } } }; 437 .fl4_src = tip };
421 struct rtable *rt; 438 struct rtable *rt;
422 int flag = 0; 439 int flag = 0;
423 /*unsigned long now; */ 440 /*unsigned long now; */
@@ -425,7 +442,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
425 442
426 if (ip_route_output_key(net, &rt, &fl) < 0) 443 if (ip_route_output_key(net, &rt, &fl) < 0)
427 return 1; 444 return 1;
428 if (rt->u.dst.dev != dev) { 445 if (rt->dst.dev != dev) {
429 NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); 446 NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
430 flag = 1; 447 flag = 1;
431 } 448 }
@@ -444,7 +461,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
444 * is allowed to use this function, it is scheduled to be removed. --ANK 461 * is allowed to use this function, it is scheduled to be removed. --ANK
445 */ 462 */
446 463
447static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) 464static int arp_set_predefined(int addr_hint, unsigned char *haddr,
465 __be32 paddr, struct net_device *dev)
448{ 466{
449 switch (addr_hint) { 467 switch (addr_hint) {
450 case RTN_LOCAL: 468 case RTN_LOCAL:
@@ -476,17 +494,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
476 494
477 paddr = skb_rtable(skb)->rt_gateway; 495 paddr = skb_rtable(skb)->rt_gateway;
478 496
479 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) 497 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
498 paddr, dev))
480 return 0; 499 return 0;
481 500
482 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); 501 n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
483 502
484 if (n) { 503 if (n) {
485 n->used = jiffies; 504 n->used = jiffies;
486 if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { 505 if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
487 read_lock_bh(&n->lock); 506 neigh_ha_snapshot(haddr, n, dev);
488 memcpy(haddr, n->ha, dev->addr_len);
489 read_unlock_bh(&n->lock);
490 neigh_release(n); 507 neigh_release(n);
491 return 0; 508 return 0;
492 } 509 }
@@ -495,6 +512,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
495 kfree_skb(skb); 512 kfree_skb(skb);
496 return 1; 513 return 1;
497} 514}
515EXPORT_SYMBOL(arp_find);
498 516
499/* END OF OBSOLETE FUNCTIONS */ 517/* END OF OBSOLETE FUNCTIONS */
500 518
@@ -507,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
507 return -EINVAL; 525 return -EINVAL;
508 if (n == NULL) { 526 if (n == NULL) {
509 __be32 nexthop = ((struct rtable *)dst)->rt_gateway; 527 __be32 nexthop = ((struct rtable *)dst)->rt_gateway;
510 if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) 528 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
511 nexthop = 0; 529 nexthop = 0;
512 n = __neigh_lookup_errno( 530 n = __neigh_lookup_errno(
513#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 531#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
514 dev->type == ARPHRD_ATM ? clip_tbl_hook : 532 dev->type == ARPHRD_ATM ?
533 clip_tbl_hook :
515#endif 534#endif
516 &arp_tbl, &nexthop, dev); 535 &arp_tbl, &nexthop, dev);
517 if (IS_ERR(n)) 536 if (IS_ERR(n))
518 return PTR_ERR(n); 537 return PTR_ERR(n);
519 dst->neighbour = n; 538 dst->neighbour = n;
@@ -524,27 +543,67 @@ int arp_bind_neighbour(struct dst_entry *dst)
524/* 543/*
525 * Check if we can use proxy ARP for this path 544 * Check if we can use proxy ARP for this path
526 */ 545 */
527 546static inline int arp_fwd_proxy(struct in_device *in_dev,
528static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) 547 struct net_device *dev, struct rtable *rt)
529{ 548{
530 struct in_device *out_dev; 549 struct in_device *out_dev;
531 int imi, omi = -1; 550 int imi, omi = -1;
532 551
533 if (!IN_DEV_PROXY_ARP(in_dev)) 552 if (rt->dst.dev == dev)
534 return 0; 553 return 0;
535 554
536 if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) 555 if (!IN_DEV_PROXY_ARP(in_dev))
556 return 0;
557 imi = IN_DEV_MEDIUM_ID(in_dev);
558 if (imi == 0)
537 return 1; 559 return 1;
538 if (imi == -1) 560 if (imi == -1)
539 return 0; 561 return 0;
540 562
541 /* place to check for proxy_arp for routes */ 563 /* place to check for proxy_arp for routes */
542 564
543 if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) { 565 out_dev = __in_dev_get_rcu(rt->dst.dev);
566 if (out_dev)
544 omi = IN_DEV_MEDIUM_ID(out_dev); 567 omi = IN_DEV_MEDIUM_ID(out_dev);
545 in_dev_put(out_dev); 568
546 } 569 return omi != imi && omi != -1;
547 return (omi != imi && omi != -1); 570}
571
572/*
573 * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
574 *
575 * RFC3069 supports proxy arp replies back to the same interface. This
576 * is done to support (ethernet) switch features, like RFC 3069, where
577 * the individual ports are not allowed to communicate with each
578 * other, BUT they are allowed to talk to the upstream router. As
579 * described in RFC 3069, it is possible to allow these hosts to
580 * communicate through the upstream router, by proxy_arp'ing.
581 *
582 * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
583 *
584 * This technology is known by different names:
585 * In RFC 3069 it is called VLAN Aggregation.
586 * Cisco and Allied Telesyn call it Private VLAN.
587 * Hewlett-Packard call it Source-Port filtering or port-isolation.
588 * Ericsson call it MAC-Forced Forwarding (RFC Draft).
589 *
590 */
591static inline int arp_fwd_pvlan(struct in_device *in_dev,
592 struct net_device *dev, struct rtable *rt,
593 __be32 sip, __be32 tip)
594{
595 /* Private VLAN is only concerned about the same ethernet segment */
596 if (rt->dst.dev != dev)
597 return 0;
598
599 /* Don't reply on self probes (often done by windowz boxes)*/
600 if (sip == tip)
601 return 0;
602
603 if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
604 return 1;
605 else
606 return 0;
548} 607}
549 608
550/* 609/*
@@ -619,13 +678,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
619#endif 678#endif
620#endif 679#endif
621 680
622#ifdef CONFIG_FDDI 681#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
623 case ARPHRD_FDDI: 682 case ARPHRD_FDDI:
624 arp->ar_hrd = htons(ARPHRD_ETHER); 683 arp->ar_hrd = htons(ARPHRD_ETHER);
625 arp->ar_pro = htons(ETH_P_IP); 684 arp->ar_pro = htons(ETH_P_IP);
626 break; 685 break;
627#endif 686#endif
628#ifdef CONFIG_TR 687#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
629 case ARPHRD_IEEE802_TR: 688 case ARPHRD_IEEE802_TR:
630 arp->ar_hrd = htons(ARPHRD_IEEE802); 689 arp->ar_hrd = htons(ARPHRD_IEEE802);
631 arp->ar_pro = htons(ETH_P_IP); 690 arp->ar_pro = htons(ETH_P_IP);
@@ -637,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
637 arp->ar_pln = 4; 696 arp->ar_pln = 4;
638 arp->ar_op = htons(type); 697 arp->ar_op = htons(type);
639 698
640 arp_ptr=(unsigned char *)(arp+1); 699 arp_ptr = (unsigned char *)(arp + 1);
641 700
642 memcpy(arp_ptr, src_hw, dev->addr_len); 701 memcpy(arp_ptr, src_hw, dev->addr_len);
643 arp_ptr += dev->addr_len; 702 arp_ptr += dev->addr_len;
@@ -656,6 +715,7 @@ out:
656 kfree_skb(skb); 715 kfree_skb(skb);
657 return NULL; 716 return NULL;
658} 717}
718EXPORT_SYMBOL(arp_create);
659 719
660/* 720/*
661 * Send an arp packet. 721 * Send an arp packet.
@@ -665,6 +725,7 @@ void arp_xmit(struct sk_buff *skb)
665 /* Send it off, maybe filter it using firewalling first. */ 725 /* Send it off, maybe filter it using firewalling first. */
666 NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); 726 NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
667} 727}
728EXPORT_SYMBOL(arp_xmit);
668 729
669/* 730/*
670 * Create and send an arp packet. 731 * Create and send an arp packet.
@@ -685,12 +746,12 @@ void arp_send(int type, int ptype, __be32 dest_ip,
685 746
686 skb = arp_create(type, ptype, dest_ip, dev, src_ip, 747 skb = arp_create(type, ptype, dest_ip, dev, src_ip,
687 dest_hw, src_hw, target_hw); 748 dest_hw, src_hw, target_hw);
688 if (skb == NULL) { 749 if (skb == NULL)
689 return; 750 return;
690 }
691 751
692 arp_xmit(skb); 752 arp_xmit(skb);
693} 753}
754EXPORT_SYMBOL(arp_send);
694 755
695/* 756/*
696 * Process an arp request. 757 * Process an arp request.
@@ -699,7 +760,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
699static int arp_process(struct sk_buff *skb) 760static int arp_process(struct sk_buff *skb)
700{ 761{
701 struct net_device *dev = skb->dev; 762 struct net_device *dev = skb->dev;
702 struct in_device *in_dev = in_dev_get(dev); 763 struct in_device *in_dev = __in_dev_get_rcu(dev);
703 struct arphdr *arp; 764 struct arphdr *arp;
704 unsigned char *arp_ptr; 765 unsigned char *arp_ptr;
705 struct rtable *rt; 766 struct rtable *rt;
@@ -764,7 +825,7 @@ static int arp_process(struct sk_buff *skb)
764/* 825/*
765 * Extract fields 826 * Extract fields
766 */ 827 */
767 arp_ptr= (unsigned char *)(arp+1); 828 arp_ptr = (unsigned char *)(arp + 1);
768 sha = arp_ptr; 829 sha = arp_ptr;
769 arp_ptr += dev->addr_len; 830 arp_ptr += dev->addr_len;
770 memcpy(&sip, arp_ptr, 4); 831 memcpy(&sip, arp_ptr, 4);
@@ -801,11 +862,8 @@ static int arp_process(struct sk_buff *skb)
801 * cache. 862 * cache.
802 */ 863 */
803 864
804 /* 865 /* Special case: IPv4 duplicate address detection packet (RFC2131) */
805 * Special case: IPv4 duplicate address detection packet (RFC2131) 866 if (sip == 0) {
806 * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4)
807 */
808 if (sip == 0 || tip == sip) {
809 if (arp->ar_op == htons(ARPOP_REQUEST) && 867 if (arp->ar_op == htons(ARPOP_REQUEST) &&
810 inet_addr_type(net, tip) == RTN_LOCAL && 868 inet_addr_type(net, tip) == RTN_LOCAL &&
811 !arp_ignore(in_dev, sip, tip)) 869 !arp_ignore(in_dev, sip, tip))
@@ -815,29 +873,32 @@ static int arp_process(struct sk_buff *skb)
815 } 873 }
816 874
817 if (arp->ar_op == htons(ARPOP_REQUEST) && 875 if (arp->ar_op == htons(ARPOP_REQUEST) &&
818 ip_route_input(skb, tip, sip, 0, dev) == 0) { 876 ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
819 877
820 rt = skb_rtable(skb); 878 rt = skb_rtable(skb);
821 addr_type = rt->rt_type; 879 addr_type = rt->rt_type;
822 880
823 if (addr_type == RTN_LOCAL) { 881 if (addr_type == RTN_LOCAL) {
824 int dont_send = 0; 882 int dont_send;
825 883
826 if (!dont_send) 884 dont_send = arp_ignore(in_dev, sip, tip);
827 dont_send |= arp_ignore(in_dev,sip,tip);
828 if (!dont_send && IN_DEV_ARPFILTER(in_dev)) 885 if (!dont_send && IN_DEV_ARPFILTER(in_dev))
829 dont_send |= arp_filter(sip,tip,dev); 886 dont_send = arp_filter(sip, tip, dev);
830 if (!dont_send) { 887 if (!dont_send) {
831 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 888 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
832 if (n) { 889 if (n) {
833 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 890 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
891 dev, tip, sha, dev->dev_addr,
892 sha);
834 neigh_release(n); 893 neigh_release(n);
835 } 894 }
836 } 895 }
837 goto out; 896 goto out;
838 } else if (IN_DEV_FORWARD(in_dev)) { 897 } else if (IN_DEV_FORWARD(in_dev)) {
839 if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && 898 if (addr_type == RTN_UNICAST &&
840 (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { 899 (arp_fwd_proxy(in_dev, dev, rt) ||
900 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
901 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
841 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 902 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
842 if (n) 903 if (n)
843 neigh_release(n); 904 neigh_release(n);
@@ -845,10 +906,12 @@ static int arp_process(struct sk_buff *skb)
845 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || 906 if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
846 skb->pkt_type == PACKET_HOST || 907 skb->pkt_type == PACKET_HOST ||
847 in_dev->arp_parms->proxy_delay == 0) { 908 in_dev->arp_parms->proxy_delay == 0) {
848 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); 909 arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
910 dev, tip, sha, dev->dev_addr,
911 sha);
849 } else { 912 } else {
850 pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); 913 pneigh_enqueue(&arp_tbl,
851 in_dev_put(in_dev); 914 in_dev->arp_parms, skb);
852 return 0; 915 return 0;
853 } 916 }
854 goto out; 917 goto out;
@@ -866,7 +929,8 @@ static int arp_process(struct sk_buff *skb)
866 devices (strip is candidate) 929 devices (strip is candidate)
867 */ 930 */
868 if (n == NULL && 931 if (n == NULL &&
869 arp->ar_op == htons(ARPOP_REPLY) && 932 (arp->ar_op == htons(ARPOP_REPLY) ||
933 (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
870 inet_addr_type(net, sip) == RTN_UNICAST) 934 inet_addr_type(net, sip) == RTN_UNICAST)
871 n = __neigh_lookup(&arp_tbl, &sip, dev, 1); 935 n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
872 } 936 }
@@ -888,13 +952,12 @@ static int arp_process(struct sk_buff *skb)
888 if (arp->ar_op != htons(ARPOP_REPLY) || 952 if (arp->ar_op != htons(ARPOP_REPLY) ||
889 skb->pkt_type != PACKET_HOST) 953 skb->pkt_type != PACKET_HOST)
890 state = NUD_STALE; 954 state = NUD_STALE;
891 neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); 955 neigh_update(n, sha, state,
956 override ? NEIGH_UPDATE_F_OVERRIDE : 0);
892 neigh_release(n); 957 neigh_release(n);
893 } 958 }
894 959
895out: 960out:
896 if (in_dev)
897 in_dev_put(in_dev);
898 consume_skb(skb); 961 consume_skb(skb);
899 return 0; 962 return 0;
900} 963}
@@ -926,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
926 arp->ar_pln != 4) 989 arp->ar_pln != 4)
927 goto freeskb; 990 goto freeskb;
928 991
929 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) 992 skb = skb_share_check(skb, GFP_ATOMIC);
993 if (skb == NULL)
930 goto out_of_mem; 994 goto out_of_mem;
931 995
932 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); 996 memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -953,13 +1017,14 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
953 IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; 1017 IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
954 return 0; 1018 return 0;
955 } 1019 }
956 if (__in_dev_get_rtnl(dev)) { 1020 if (__in_dev_get_rcu(dev)) {
957 IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); 1021 IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on);
958 return 0; 1022 return 0;
959 } 1023 }
960 return -ENXIO; 1024 return -ENXIO;
961} 1025}
962 1026
1027/* must be called with rcu_read_lock() */
963static int arp_req_set_public(struct net *net, struct arpreq *r, 1028static int arp_req_set_public(struct net *net, struct arpreq *r,
964 struct net_device *dev) 1029 struct net_device *dev)
965{ 1030{
@@ -969,8 +1034,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
969 if (mask && mask != htonl(0xFFFFFFFF)) 1034 if (mask && mask != htonl(0xFFFFFFFF))
970 return -EINVAL; 1035 return -EINVAL;
971 if (!dev && (r->arp_flags & ATF_COM)) { 1036 if (!dev && (r->arp_flags & ATF_COM)) {
972 dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, 1037 dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
973 r->arp_ha.sa_data); 1038 r->arp_ha.sa_data);
974 if (!dev) 1039 if (!dev)
975 return -ENODEV; 1040 return -ENODEV;
976 } 1041 }
@@ -984,7 +1049,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
984} 1049}
985 1050
986static int arp_req_set(struct net *net, struct arpreq *r, 1051static int arp_req_set(struct net *net, struct arpreq *r,
987 struct net_device * dev) 1052 struct net_device *dev)
988{ 1053{
989 __be32 ip; 1054 __be32 ip;
990 struct neighbour *neigh; 1055 struct neighbour *neigh;
@@ -997,18 +1062,19 @@ static int arp_req_set(struct net *net, struct arpreq *r,
997 if (r->arp_flags & ATF_PERM) 1062 if (r->arp_flags & ATF_PERM)
998 r->arp_flags |= ATF_COM; 1063 r->arp_flags |= ATF_COM;
999 if (dev == NULL) { 1064 if (dev == NULL) {
1000 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1065 struct flowi fl = { .fl4_dst = ip,
1001 .tos = RTO_ONLINK } } }; 1066 .fl4_tos = RTO_ONLINK };
1002 struct rtable * rt; 1067 struct rtable *rt;
1003 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1068 err = ip_route_output_key(net, &rt, &fl);
1069 if (err != 0)
1004 return err; 1070 return err;
1005 dev = rt->u.dst.dev; 1071 dev = rt->dst.dev;
1006 ip_rt_put(rt); 1072 ip_rt_put(rt);
1007 if (!dev) 1073 if (!dev)
1008 return -EINVAL; 1074 return -EINVAL;
1009 } 1075 }
1010 switch (dev->type) { 1076 switch (dev->type) {
1011#ifdef CONFIG_FDDI 1077#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
1012 case ARPHRD_FDDI: 1078 case ARPHRD_FDDI:
1013 /* 1079 /*
1014 * According to RFC 1390, FDDI devices should accept ARP 1080 * According to RFC 1390, FDDI devices should accept ARP
@@ -1034,9 +1100,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1034 unsigned state = NUD_STALE; 1100 unsigned state = NUD_STALE;
1035 if (r->arp_flags & ATF_PERM) 1101 if (r->arp_flags & ATF_PERM)
1036 state = NUD_PERMANENT; 1102 state = NUD_PERMANENT;
1037 err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? 1103 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
1038 r->arp_ha.sa_data : NULL, state, 1104 r->arp_ha.sa_data : NULL, state,
1039 NEIGH_UPDATE_F_OVERRIDE| 1105 NEIGH_UPDATE_F_OVERRIDE |
1040 NEIGH_UPDATE_F_ADMIN); 1106 NEIGH_UPDATE_F_ADMIN);
1041 neigh_release(neigh); 1107 neigh_release(neigh);
1042 } 1108 }
@@ -1045,12 +1111,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1045 1111
1046static unsigned arp_state_to_flags(struct neighbour *neigh) 1112static unsigned arp_state_to_flags(struct neighbour *neigh)
1047{ 1113{
1048 unsigned flags = 0;
1049 if (neigh->nud_state&NUD_PERMANENT) 1114 if (neigh->nud_state&NUD_PERMANENT)
1050 flags = ATF_PERM|ATF_COM; 1115 return ATF_PERM | ATF_COM;
1051 else if (neigh->nud_state&NUD_VALID) 1116 else if (neigh->nud_state&NUD_VALID)
1052 flags = ATF_COM; 1117 return ATF_COM;
1053 return flags; 1118 else
1119 return 0;
1054} 1120}
1055 1121
1056/* 1122/*
@@ -1077,6 +1143,23 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
1077 return err; 1143 return err;
1078} 1144}
1079 1145
1146int arp_invalidate(struct net_device *dev, __be32 ip)
1147{
1148 struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
1149 int err = -ENXIO;
1150
1151 if (neigh) {
1152 if (neigh->nud_state & ~NUD_NOARP)
1153 err = neigh_update(neigh, NULL, NUD_FAILED,
1154 NEIGH_UPDATE_F_OVERRIDE|
1155 NEIGH_UPDATE_F_ADMIN);
1156 neigh_release(neigh);
1157 }
1158
1159 return err;
1160}
1161EXPORT_SYMBOL(arp_invalidate);
1162
1080static int arp_req_delete_public(struct net *net, struct arpreq *r, 1163static int arp_req_delete_public(struct net *net, struct arpreq *r,
1081 struct net_device *dev) 1164 struct net_device *dev)
1082{ 1165{
@@ -1093,37 +1176,28 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
1093} 1176}
1094 1177
1095static int arp_req_delete(struct net *net, struct arpreq *r, 1178static int arp_req_delete(struct net *net, struct arpreq *r,
1096 struct net_device * dev) 1179 struct net_device *dev)
1097{ 1180{
1098 int err; 1181 int err;
1099 __be32 ip; 1182 __be32 ip;
1100 struct neighbour *neigh;
1101 1183
1102 if (r->arp_flags & ATF_PUBL) 1184 if (r->arp_flags & ATF_PUBL)
1103 return arp_req_delete_public(net, r, dev); 1185 return arp_req_delete_public(net, r, dev);
1104 1186
1105 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; 1187 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
1106 if (dev == NULL) { 1188 if (dev == NULL) {
1107 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, 1189 struct flowi fl = { .fl4_dst = ip,
1108 .tos = RTO_ONLINK } } }; 1190 .fl4_tos = RTO_ONLINK };
1109 struct rtable * rt; 1191 struct rtable *rt;
1110 if ((err = ip_route_output_key(net, &rt, &fl)) != 0) 1192 err = ip_route_output_key(net, &rt, &fl);
1193 if (err != 0)
1111 return err; 1194 return err;
1112 dev = rt->u.dst.dev; 1195 dev = rt->dst.dev;
1113 ip_rt_put(rt); 1196 ip_rt_put(rt);
1114 if (!dev) 1197 if (!dev)
1115 return -EINVAL; 1198 return -EINVAL;
1116 } 1199 }
1117 err = -ENXIO; 1200 return arp_invalidate(dev, ip);
1118 neigh = neigh_lookup(&arp_tbl, &ip, dev);
1119 if (neigh) {
1120 if (neigh->nud_state&~NUD_NOARP)
1121 err = neigh_update(neigh, NULL, NUD_FAILED,
1122 NEIGH_UPDATE_F_OVERRIDE|
1123 NEIGH_UPDATE_F_ADMIN);
1124 neigh_release(neigh);
1125 }
1126 return err;
1127} 1201}
1128 1202
1129/* 1203/*
@@ -1137,32 +1211,33 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1137 struct net_device *dev = NULL; 1211 struct net_device *dev = NULL;
1138 1212
1139 switch (cmd) { 1213 switch (cmd) {
1140 case SIOCDARP: 1214 case SIOCDARP:
1141 case SIOCSARP: 1215 case SIOCSARP:
1142 if (!capable(CAP_NET_ADMIN)) 1216 if (!capable(CAP_NET_ADMIN))
1143 return -EPERM; 1217 return -EPERM;
1144 case SIOCGARP: 1218 case SIOCGARP:
1145 err = copy_from_user(&r, arg, sizeof(struct arpreq)); 1219 err = copy_from_user(&r, arg, sizeof(struct arpreq));
1146 if (err) 1220 if (err)
1147 return -EFAULT; 1221 return -EFAULT;
1148 break; 1222 break;
1149 default: 1223 default:
1150 return -EINVAL; 1224 return -EINVAL;
1151 } 1225 }
1152 1226
1153 if (r.arp_pa.sa_family != AF_INET) 1227 if (r.arp_pa.sa_family != AF_INET)
1154 return -EPFNOSUPPORT; 1228 return -EPFNOSUPPORT;
1155 1229
1156 if (!(r.arp_flags & ATF_PUBL) && 1230 if (!(r.arp_flags & ATF_PUBL) &&
1157 (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) 1231 (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
1158 return -EINVAL; 1232 return -EINVAL;
1159 if (!(r.arp_flags & ATF_NETMASK)) 1233 if (!(r.arp_flags & ATF_NETMASK))
1160 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = 1234 ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
1161 htonl(0xFFFFFFFFUL); 1235 htonl(0xFFFFFFFFUL);
1162 rtnl_lock(); 1236 rcu_read_lock();
1163 if (r.arp_dev[0]) { 1237 if (r.arp_dev[0]) {
1164 err = -ENODEV; 1238 err = -ENODEV;
1165 if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) 1239 dev = dev_get_by_name_rcu(net, r.arp_dev);
1240 if (dev == NULL)
1166 goto out; 1241 goto out;
1167 1242
1168 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ 1243 /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1185,16 +1260,17 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1185 break; 1260 break;
1186 case SIOCGARP: 1261 case SIOCGARP:
1187 err = arp_req_get(&r, dev); 1262 err = arp_req_get(&r, dev);
1188 if (!err && copy_to_user(arg, &r, sizeof(r)))
1189 err = -EFAULT;
1190 break; 1263 break;
1191 } 1264 }
1192out: 1265out:
1193 rtnl_unlock(); 1266 rcu_read_unlock();
1267 if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
1268 err = -EFAULT;
1194 return err; 1269 return err;
1195} 1270}
1196 1271
1197static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) 1272static int arp_netdev_event(struct notifier_block *this, unsigned long event,
1273 void *ptr)
1198{ 1274{
1199 struct net_device *dev = ptr; 1275 struct net_device *dev = ptr;
1200 1276
@@ -1242,8 +1318,7 @@ void __init arp_init(void)
1242 dev_add_pack(&arp_packet_type); 1318 dev_add_pack(&arp_packet_type);
1243 arp_proc_init(); 1319 arp_proc_init();
1244#ifdef CONFIG_SYSCTL 1320#ifdef CONFIG_SYSCTL
1245 neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, 1321 neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
1246 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
1247#endif 1322#endif
1248 register_netdevice_notifier(&arp_netdev_notifier); 1323 register_netdevice_notifier(&arp_netdev_notifier);
1249} 1324}
@@ -1263,12 +1338,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
1263 for (n = 0, s = buf; n < 6; n++) { 1338 for (n = 0, s = buf; n < 6; n++) {
1264 c = (a->ax25_call[n] >> 1) & 0x7F; 1339 c = (a->ax25_call[n] >> 1) & 0x7F;
1265 1340
1266 if (c != ' ') *s++ = c; 1341 if (c != ' ')
1342 *s++ = c;
1267 } 1343 }
1268 1344
1269 *s++ = '-'; 1345 *s++ = '-';
1270 1346 n = (a->ax25_call[6] >> 1) & 0x0F;
1271 if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { 1347 if (n > 9) {
1272 *s++ = '1'; 1348 *s++ = '1';
1273 n -= 10; 1349 n -= 10;
1274 } 1350 }
@@ -1277,10 +1353,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
1277 *s++ = '\0'; 1353 *s++ = '\0';
1278 1354
1279 if (*buf == '\0' || *buf == '-') 1355 if (*buf == '\0' || *buf == '-')
1280 return "*"; 1356 return "*";
1281 1357
1282 return buf; 1358 return buf;
1283
1284} 1359}
1285#endif /* CONFIG_AX25 */ 1360#endif /* CONFIG_AX25 */
1286 1361
@@ -1307,7 +1382,9 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1307 hbuffer[k++] = hex_asc_lo(n->ha[j]); 1382 hbuffer[k++] = hex_asc_lo(n->ha[j]);
1308 hbuffer[k++] = ':'; 1383 hbuffer[k++] = ':';
1309 } 1384 }
1310 hbuffer[--k] = 0; 1385 if (k != 0)
1386 --k;
1387 hbuffer[k] = 0;
1311#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) 1388#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1312 } 1389 }
1313#endif 1390#endif
@@ -1358,10 +1435,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
1358/* ------------------------------------------------------------------------ */ 1435/* ------------------------------------------------------------------------ */
1359 1436
1360static const struct seq_operations arp_seq_ops = { 1437static const struct seq_operations arp_seq_ops = {
1361 .start = arp_seq_start, 1438 .start = arp_seq_start,
1362 .next = neigh_seq_next, 1439 .next = neigh_seq_next,
1363 .stop = neigh_seq_stop, 1440 .stop = neigh_seq_stop,
1364 .show = arp_seq_show, 1441 .show = arp_seq_show,
1365}; 1442};
1366 1443
1367static int arp_seq_open(struct inode *inode, struct file *file) 1444static int arp_seq_open(struct inode *inode, struct file *file)
@@ -1409,14 +1486,3 @@ static int __init arp_proc_init(void)
1409} 1486}
1410 1487
1411#endif /* CONFIG_PROC_FS */ 1488#endif /* CONFIG_PROC_FS */
1412
1413EXPORT_SYMBOL(arp_broken_ops);
1414EXPORT_SYMBOL(arp_find);
1415EXPORT_SYMBOL(arp_create);
1416EXPORT_SYMBOL(arp_xmit);
1417EXPORT_SYMBOL(arp_send);
1418EXPORT_SYMBOL(arp_tbl);
1419
1420#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1421EXPORT_SYMBOL(clip_tbl_hook);
1422#endif
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 039cc1ffe977..094e150c6260 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * The CIPSO draft specification can be found in the kernel's Documentation 10 * The CIPSO draft specification can be found in the kernel's Documentation
11 * directory as well as the following URL: 11 * directory as well as the following URL:
12 * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt 12 * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
13 * The FIPS-188 specification can be found at the following URL: 13 * The FIPS-188 specification can be found at the following URL:
14 * http://www.itl.nist.gov/fipspubs/fip188.htm 14 * http://www.itl.nist.gov/fipspubs/fip188.htm
15 * 15 *
@@ -44,6 +44,7 @@
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/jhash.h> 45#include <linux/jhash.h>
46#include <linux/audit.h> 46#include <linux/audit.h>
47#include <linux/slab.h>
47#include <net/ip.h> 48#include <net/ip.h>
48#include <net/icmp.h> 49#include <net/icmp.h>
49#include <net/tcp.h> 50#include <net/tcp.h>
@@ -289,8 +290,6 @@ void cipso_v4_cache_invalidate(void)
289 cipso_v4_cache[iter].size = 0; 290 cipso_v4_cache[iter].size = 0;
290 spin_unlock_bh(&cipso_v4_cache[iter].lock); 291 spin_unlock_bh(&cipso_v4_cache[iter].lock);
291 } 292 }
292
293 return;
294} 293}
295 294
296/** 295/**
@@ -2017,7 +2016,7 @@ req_setattr_failure:
2017 * values on failure. 2016 * values on failure.
2018 * 2017 *
2019 */ 2018 */
2020int cipso_v4_delopt(struct ip_options **opt_ptr) 2019static int cipso_v4_delopt(struct ip_options **opt_ptr)
2021{ 2020{
2022 int hdr_delta = 0; 2021 int hdr_delta = 0;
2023 struct ip_options *opt = *opt_ptr; 2022 struct ip_options *opt = *opt_ptr;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 5e6c5a0f3fde..174be6caa5c8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,7 +39,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
39 sk_dst_reset(sk); 39 sk_dst_reset(sk);
40 40
41 oif = sk->sk_bound_dev_if; 41 oif = sk->sk_bound_dev_if;
42 saddr = inet->saddr; 42 saddr = inet->inet_saddr;
43 if (ipv4_is_multicast(usin->sin_addr.s_addr)) { 43 if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
44 if (!oif) 44 if (!oif)
45 oif = inet->mc_index; 45 oif = inet->mc_index;
@@ -49,7 +49,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
49 err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, 49 err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
50 RT_CONN_FLAGS(sk), oif, 50 RT_CONN_FLAGS(sk), oif,
51 sk->sk_protocol, 51 sk->sk_protocol,
52 inet->sport, usin->sin_port, sk, 1); 52 inet->inet_sport, usin->sin_port, sk, 1);
53 if (err) { 53 if (err) {
54 if (err == -ENETUNREACH) 54 if (err == -ENETUNREACH)
55 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 55 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -60,18 +60,19 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
60 ip_rt_put(rt); 60 ip_rt_put(rt);
61 return -EACCES; 61 return -EACCES;
62 } 62 }
63 if (!inet->saddr) 63 if (!inet->inet_saddr)
64 inet->saddr = rt->rt_src; /* Update source address */ 64 inet->inet_saddr = rt->rt_src; /* Update source address */
65 if (!inet->rcv_saddr) 65 if (!inet->inet_rcv_saddr) {
66 inet->rcv_saddr = rt->rt_src; 66 inet->inet_rcv_saddr = rt->rt_src;
67 inet->daddr = rt->rt_dst; 67 if (sk->sk_prot->rehash)
68 inet->dport = usin->sin_port; 68 sk->sk_prot->rehash(sk);
69 }
70 inet->inet_daddr = rt->rt_dst;
71 inet->inet_dport = usin->sin_port;
69 sk->sk_state = TCP_ESTABLISHED; 72 sk->sk_state = TCP_ESTABLISHED;
70 inet->id = jiffies; 73 inet->inet_id = jiffies;
71 74
72 sk_dst_set(sk, &rt->u.dst); 75 sk_dst_set(sk, &rt->dst);
73 return(0); 76 return 0;
74} 77}
75
76EXPORT_SYMBOL(ip4_datagram_connect); 78EXPORT_SYMBOL(ip4_datagram_connect);
77
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3863c3a4223f..748cb5b337bd 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -50,6 +50,7 @@
50#include <linux/notifier.h> 50#include <linux/notifier.h>
51#include <linux/inetdevice.h> 51#include <linux/inetdevice.h>
52#include <linux/igmp.h> 52#include <linux/igmp.h>
53#include <linux/slab.h>
53#ifdef CONFIG_SYSCTL 54#ifdef CONFIG_SYSCTL
54#include <linux/sysctl.h> 55#include <linux/sysctl.h>
55#endif 56#endif
@@ -64,20 +65,20 @@
64 65
65static struct ipv4_devconf ipv4_devconf = { 66static struct ipv4_devconf ipv4_devconf = {
66 .data = { 67 .data = {
67 [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, 68 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
68 [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, 69 [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
69 [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, 70 [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
70 [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, 71 [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
71 }, 72 },
72}; 73};
73 74
74static struct ipv4_devconf ipv4_devconf_dflt = { 75static struct ipv4_devconf ipv4_devconf_dflt = {
75 .data = { 76 .data = {
76 [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, 77 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
77 [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, 78 [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
78 [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, 79 [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
79 [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, 80 [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
80 [NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1, 81 [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
81 }, 82 },
82}; 83};
83 84
@@ -140,11 +141,11 @@ void in_dev_finish_destroy(struct in_device *idev)
140#endif 141#endif
141 dev_put(dev); 142 dev_put(dev);
142 if (!idev->dead) 143 if (!idev->dead)
143 printk("Freeing alive in_device %p\n", idev); 144 pr_err("Freeing alive in_device %p\n", idev);
144 else { 145 else
145 kfree(idev); 146 kfree(idev);
146 }
147} 147}
148EXPORT_SYMBOL(in_dev_finish_destroy);
148 149
149static struct in_device *inetdev_init(struct net_device *dev) 150static struct in_device *inetdev_init(struct net_device *dev)
150{ 151{
@@ -159,7 +160,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
159 sizeof(in_dev->cnf)); 160 sizeof(in_dev->cnf));
160 in_dev->cnf.sysctl = NULL; 161 in_dev->cnf.sysctl = NULL;
161 in_dev->dev = dev; 162 in_dev->dev = dev;
162 if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) 163 in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
164 if (!in_dev->arp_parms)
163 goto out_kfree; 165 goto out_kfree;
164 if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) 166 if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
165 dev_disable_lro(dev); 167 dev_disable_lro(dev);
@@ -207,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
207 inet_free_ifa(ifa); 209 inet_free_ifa(ifa);
208 } 210 }
209 211
210 dev->ip_ptr = NULL; 212 rcu_assign_pointer(dev->ip_ptr, NULL);
211 213
212 devinet_sysctl_unregister(in_dev); 214 devinet_sysctl_unregister(in_dev);
213 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 215 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -401,17 +403,22 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
401 return inet_insert_ifa(ifa); 403 return inet_insert_ifa(ifa);
402} 404}
403 405
406/* Caller must hold RCU or RTNL :
407 * We dont take a reference on found in_device
408 */
404struct in_device *inetdev_by_index(struct net *net, int ifindex) 409struct in_device *inetdev_by_index(struct net *net, int ifindex)
405{ 410{
406 struct net_device *dev; 411 struct net_device *dev;
407 struct in_device *in_dev = NULL; 412 struct in_device *in_dev = NULL;
408 read_lock(&dev_base_lock); 413
409 dev = __dev_get_by_index(net, ifindex); 414 rcu_read_lock();
415 dev = dev_get_by_index_rcu(net, ifindex);
410 if (dev) 416 if (dev)
411 in_dev = in_dev_get(dev); 417 in_dev = rcu_dereference_rtnl(dev->ip_ptr);
412 read_unlock(&dev_base_lock); 418 rcu_read_unlock();
413 return in_dev; 419 return in_dev;
414} 420}
421EXPORT_SYMBOL(inetdev_by_index);
415 422
416/* Called only from RTNL semaphored context. No locks. */ 423/* Called only from RTNL semaphored context. No locks. */
417 424
@@ -449,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
449 goto errout; 456 goto errout;
450 } 457 }
451 458
452 __in_dev_put(in_dev);
453
454 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; 459 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
455 ifap = &ifa->ifa_next) { 460 ifap = &ifa->ifa_next) {
456 if (tb[IFA_LOCAL] && 461 if (tb[IFA_LOCAL] &&
@@ -557,7 +562,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
557 * Determine a default network mask, based on the IP address. 562 * Determine a default network mask, based on the IP address.
558 */ 563 */
559 564
560static __inline__ int inet_abc_len(__be32 addr) 565static inline int inet_abc_len(__be32 addr)
561{ 566{
562 int rc = -1; /* Something else, probably a multicast. */ 567 int rc = -1; /* Something else, probably a multicast. */
563 568
@@ -646,13 +651,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
646 rtnl_lock(); 651 rtnl_lock();
647 652
648 ret = -ENODEV; 653 ret = -ENODEV;
649 if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL) 654 dev = __dev_get_by_name(net, ifr.ifr_name);
655 if (!dev)
650 goto done; 656 goto done;
651 657
652 if (colon) 658 if (colon)
653 *colon = ':'; 659 *colon = ':';
654 660
655 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 661 in_dev = __in_dev_get_rtnl(dev);
662 if (in_dev) {
656 if (tryaddrmatch) { 663 if (tryaddrmatch) {
657 /* Matthias Andree */ 664 /* Matthias Andree */
658 /* compare label and address (4.4BSD style) */ 665 /* compare label and address (4.4BSD style) */
@@ -720,7 +727,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
720 727
721 if (!ifa) { 728 if (!ifa) {
722 ret = -ENOBUFS; 729 ret = -ENOBUFS;
723 if ((ifa = inet_alloc_ifa()) == NULL) 730 ifa = inet_alloc_ifa();
731 if (!ifa)
724 break; 732 break;
725 if (colon) 733 if (colon)
726 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); 734 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
@@ -822,10 +830,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
822 struct ifreq ifr; 830 struct ifreq ifr;
823 int done = 0; 831 int done = 0;
824 832
825 if (!in_dev || (ifa = in_dev->ifa_list) == NULL) 833 if (!in_dev)
826 goto out; 834 goto out;
827 835
828 for (; ifa; ifa = ifa->ifa_next) { 836 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
829 if (!buf) { 837 if (!buf) {
830 done += sizeof(ifr); 838 done += sizeof(ifr);
831 continue; 839 continue;
@@ -875,36 +883,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
875 if (!addr) 883 if (!addr)
876 addr = ifa->ifa_local; 884 addr = ifa->ifa_local;
877 } endfor_ifa(in_dev); 885 } endfor_ifa(in_dev);
878no_in_dev:
879 rcu_read_unlock();
880 886
881 if (addr) 887 if (addr)
882 goto out; 888 goto out_unlock;
889no_in_dev:
883 890
884 /* Not loopback addresses on loopback should be preferred 891 /* Not loopback addresses on loopback should be preferred
885 in this case. It is importnat that lo is the first interface 892 in this case. It is importnat that lo is the first interface
886 in dev_base list. 893 in dev_base list.
887 */ 894 */
888 read_lock(&dev_base_lock); 895 for_each_netdev_rcu(net, dev) {
889 rcu_read_lock(); 896 in_dev = __in_dev_get_rcu(dev);
890 for_each_netdev(net, dev) { 897 if (!in_dev)
891 if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
892 continue; 898 continue;
893 899
894 for_primary_ifa(in_dev) { 900 for_primary_ifa(in_dev) {
895 if (ifa->ifa_scope != RT_SCOPE_LINK && 901 if (ifa->ifa_scope != RT_SCOPE_LINK &&
896 ifa->ifa_scope <= scope) { 902 ifa->ifa_scope <= scope) {
897 addr = ifa->ifa_local; 903 addr = ifa->ifa_local;
898 goto out_unlock_both; 904 goto out_unlock;
899 } 905 }
900 } endfor_ifa(in_dev); 906 } endfor_ifa(in_dev);
901 } 907 }
902out_unlock_both: 908out_unlock:
903 read_unlock(&dev_base_lock);
904 rcu_read_unlock(); 909 rcu_read_unlock();
905out:
906 return addr; 910 return addr;
907} 911}
912EXPORT_SYMBOL(inet_select_addr);
908 913
909static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, 914static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
910 __be32 local, int scope) 915 __be32 local, int scope)
@@ -940,7 +945,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
940 } 945 }
941 } endfor_ifa(in_dev); 946 } endfor_ifa(in_dev);
942 947
943 return same? addr : 0; 948 return same ? addr : 0;
944} 949}
945 950
946/* 951/*
@@ -961,17 +966,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
961 return confirm_addr_indev(in_dev, dst, local, scope); 966 return confirm_addr_indev(in_dev, dst, local, scope);
962 967
963 net = dev_net(in_dev->dev); 968 net = dev_net(in_dev->dev);
964 read_lock(&dev_base_lock);
965 rcu_read_lock(); 969 rcu_read_lock();
966 for_each_netdev(net, dev) { 970 for_each_netdev_rcu(net, dev) {
967 if ((in_dev = __in_dev_get_rcu(dev))) { 971 in_dev = __in_dev_get_rcu(dev);
972 if (in_dev) {
968 addr = confirm_addr_indev(in_dev, dst, local, scope); 973 addr = confirm_addr_indev(in_dev, dst, local, scope);
969 if (addr) 974 if (addr)
970 break; 975 break;
971 } 976 }
972 } 977 }
973 rcu_read_unlock(); 978 rcu_read_unlock();
974 read_unlock(&dev_base_lock);
975 979
976 return addr; 980 return addr;
977} 981}
@@ -984,14 +988,16 @@ int register_inetaddr_notifier(struct notifier_block *nb)
984{ 988{
985 return blocking_notifier_chain_register(&inetaddr_chain, nb); 989 return blocking_notifier_chain_register(&inetaddr_chain, nb);
986} 990}
991EXPORT_SYMBOL(register_inetaddr_notifier);
987 992
988int unregister_inetaddr_notifier(struct notifier_block *nb) 993int unregister_inetaddr_notifier(struct notifier_block *nb)
989{ 994{
990 return blocking_notifier_chain_unregister(&inetaddr_chain, nb); 995 return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
991} 996}
997EXPORT_SYMBOL(unregister_inetaddr_notifier);
992 998
993/* Rename ifa_labels for a device name change. Make some effort to preserve existing 999/* Rename ifa_labels for a device name change. Make some effort to preserve
994 * alias numbering and to create unique labels if possible. 1000 * existing alias numbering and to create unique labels if possible.
995*/ 1001*/
996static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) 1002static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
997{ 1003{
@@ -1010,11 +1016,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
1010 sprintf(old, ":%d", named); 1016 sprintf(old, ":%d", named);
1011 dot = old; 1017 dot = old;
1012 } 1018 }
1013 if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { 1019 if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
1014 strcat(ifa->ifa_label, dot); 1020 strcat(ifa->ifa_label, dot);
1015 } else { 1021 else
1016 strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); 1022 strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
1017 }
1018skip: 1023skip:
1019 rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); 1024 rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
1020 } 1025 }
@@ -1055,14 +1060,15 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1055 switch (event) { 1060 switch (event) {
1056 case NETDEV_REGISTER: 1061 case NETDEV_REGISTER:
1057 printk(KERN_DEBUG "inetdev_event: bug\n"); 1062 printk(KERN_DEBUG "inetdev_event: bug\n");
1058 dev->ip_ptr = NULL; 1063 rcu_assign_pointer(dev->ip_ptr, NULL);
1059 break; 1064 break;
1060 case NETDEV_UP: 1065 case NETDEV_UP:
1061 if (!inetdev_valid_mtu(dev->mtu)) 1066 if (!inetdev_valid_mtu(dev->mtu))
1062 break; 1067 break;
1063 if (dev->flags & IFF_LOOPBACK) { 1068 if (dev->flags & IFF_LOOPBACK) {
1064 struct in_ifaddr *ifa; 1069 struct in_ifaddr *ifa = inet_alloc_ifa();
1065 if ((ifa = inet_alloc_ifa()) != NULL) { 1070
1071 if (ifa) {
1066 ifa->ifa_local = 1072 ifa->ifa_local =
1067 ifa->ifa_address = htonl(INADDR_LOOPBACK); 1073 ifa->ifa_address = htonl(INADDR_LOOPBACK);
1068 ifa->ifa_prefixlen = 8; 1074 ifa->ifa_prefixlen = 8;
@@ -1076,17 +1082,28 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1076 } 1082 }
1077 ip_mc_up(in_dev); 1083 ip_mc_up(in_dev);
1078 /* fall through */ 1084 /* fall through */
1085 case NETDEV_NOTIFY_PEERS:
1079 case NETDEV_CHANGEADDR: 1086 case NETDEV_CHANGEADDR:
1080 if (IN_DEV_ARP_NOTIFY(in_dev)) 1087 /* Send gratuitous ARP to notify of link change */
1081 arp_send(ARPOP_REQUEST, ETH_P_ARP, 1088 if (IN_DEV_ARP_NOTIFY(in_dev)) {
1082 in_dev->ifa_list->ifa_address, 1089 struct in_ifaddr *ifa = in_dev->ifa_list;
1083 dev, 1090
1084 in_dev->ifa_list->ifa_address, 1091 if (ifa)
1085 NULL, dev->dev_addr, NULL); 1092 arp_send(ARPOP_REQUEST, ETH_P_ARP,
1093 ifa->ifa_address, dev,
1094 ifa->ifa_address, NULL,
1095 dev->dev_addr, NULL);
1096 }
1086 break; 1097 break;
1087 case NETDEV_DOWN: 1098 case NETDEV_DOWN:
1088 ip_mc_down(in_dev); 1099 ip_mc_down(in_dev);
1089 break; 1100 break;
1101 case NETDEV_PRE_TYPE_CHANGE:
1102 ip_mc_unmap(in_dev);
1103 break;
1104 case NETDEV_POST_TYPE_CHANGE:
1105 ip_mc_remap(in_dev);
1106 break;
1090 case NETDEV_CHANGEMTU: 1107 case NETDEV_CHANGEMTU:
1091 if (inetdev_valid_mtu(dev->mtu)) 1108 if (inetdev_valid_mtu(dev->mtu))
1092 break; 1109 break;
@@ -1160,38 +1177,54 @@ nla_put_failure:
1160static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) 1177static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1161{ 1178{
1162 struct net *net = sock_net(skb->sk); 1179 struct net *net = sock_net(skb->sk);
1163 int idx, ip_idx; 1180 int h, s_h;
1181 int idx, s_idx;
1182 int ip_idx, s_ip_idx;
1164 struct net_device *dev; 1183 struct net_device *dev;
1165 struct in_device *in_dev; 1184 struct in_device *in_dev;
1166 struct in_ifaddr *ifa; 1185 struct in_ifaddr *ifa;
1167 int s_ip_idx, s_idx = cb->args[0]; 1186 struct hlist_head *head;
1187 struct hlist_node *node;
1168 1188
1169 s_ip_idx = ip_idx = cb->args[1]; 1189 s_h = cb->args[0];
1170 idx = 0; 1190 s_idx = idx = cb->args[1];
1171 for_each_netdev(net, dev) { 1191 s_ip_idx = ip_idx = cb->args[2];
1172 if (idx < s_idx) 1192
1173 goto cont; 1193 for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
1174 if (idx > s_idx) 1194 idx = 0;
1175 s_ip_idx = 0; 1195 head = &net->dev_index_head[h];
1176 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) 1196 rcu_read_lock();
1177 goto cont; 1197 hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
1178 1198 if (idx < s_idx)
1179 for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; 1199 goto cont;
1180 ifa = ifa->ifa_next, ip_idx++) { 1200 if (h > s_h || idx > s_idx)
1181 if (ip_idx < s_ip_idx) 1201 s_ip_idx = 0;
1182 continue; 1202 in_dev = __in_dev_get_rcu(dev);
1183 if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, 1203 if (!in_dev)
1204 goto cont;
1205
1206 for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
1207 ifa = ifa->ifa_next, ip_idx++) {
1208 if (ip_idx < s_ip_idx)
1209 continue;
1210 if (inet_fill_ifaddr(skb, ifa,
1211 NETLINK_CB(cb->skb).pid,
1184 cb->nlh->nlmsg_seq, 1212 cb->nlh->nlmsg_seq,
1185 RTM_NEWADDR, NLM_F_MULTI) <= 0) 1213 RTM_NEWADDR, NLM_F_MULTI) <= 0) {
1186 goto done; 1214 rcu_read_unlock();
1187 } 1215 goto done;
1216 }
1217 }
1188cont: 1218cont:
1189 idx++; 1219 idx++;
1220 }
1221 rcu_read_unlock();
1190 } 1222 }
1191 1223
1192done: 1224done:
1193 cb->args[0] = idx; 1225 cb->args[0] = h;
1194 cb->args[1] = ip_idx; 1226 cb->args[1] = idx;
1227 cb->args[2] = ip_idx;
1195 1228
1196 return skb->len; 1229 return skb->len;
1197} 1230}
@@ -1223,24 +1256,105 @@ errout:
1223 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); 1256 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
1224} 1257}
1225 1258
1259static size_t inet_get_link_af_size(const struct net_device *dev)
1260{
1261 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1262
1263 if (!in_dev)
1264 return 0;
1265
1266 return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
1267}
1268
1269static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
1270{
1271 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1272 struct nlattr *nla;
1273 int i;
1274
1275 if (!in_dev)
1276 return -ENODATA;
1277
1278 nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
1279 if (nla == NULL)
1280 return -EMSGSIZE;
1281
1282 for (i = 0; i < IPV4_DEVCONF_MAX; i++)
1283 ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
1284
1285 return 0;
1286}
1287
1288static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
1289 [IFLA_INET_CONF] = { .type = NLA_NESTED },
1290};
1291
1292static int inet_validate_link_af(const struct net_device *dev,
1293 const struct nlattr *nla)
1294{
1295 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1296 int err, rem;
1297
1298 if (dev && !__in_dev_get_rtnl(dev))
1299 return -EAFNOSUPPORT;
1300
1301 err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
1302 if (err < 0)
1303 return err;
1304
1305 if (tb[IFLA_INET_CONF]) {
1306 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
1307 int cfgid = nla_type(a);
1308
1309 if (nla_len(a) < 4)
1310 return -EINVAL;
1311
1312 if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
1313 return -EINVAL;
1314 }
1315 }
1316
1317 return 0;
1318}
1319
1320static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
1321{
1322 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1323 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1324 int rem;
1325
1326 if (!in_dev)
1327 return -EAFNOSUPPORT;
1328
1329 if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
1330 BUG();
1331
1332 if (tb[IFLA_INET_CONF]) {
1333 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
1334 ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
1335 }
1336
1337 return 0;
1338}
1339
1226#ifdef CONFIG_SYSCTL 1340#ifdef CONFIG_SYSCTL
1227 1341
1228static void devinet_copy_dflt_conf(struct net *net, int i) 1342static void devinet_copy_dflt_conf(struct net *net, int i)
1229{ 1343{
1230 struct net_device *dev; 1344 struct net_device *dev;
1231 1345
1232 read_lock(&dev_base_lock); 1346 rcu_read_lock();
1233 for_each_netdev(net, dev) { 1347 for_each_netdev_rcu(net, dev) {
1234 struct in_device *in_dev; 1348 struct in_device *in_dev;
1235 rcu_read_lock(); 1349
1236 in_dev = __in_dev_get_rcu(dev); 1350 in_dev = __in_dev_get_rcu(dev);
1237 if (in_dev && !test_bit(i, in_dev->cnf.state)) 1351 if (in_dev && !test_bit(i, in_dev->cnf.state))
1238 in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; 1352 in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
1239 rcu_read_unlock();
1240 } 1353 }
1241 read_unlock(&dev_base_lock); 1354 rcu_read_unlock();
1242} 1355}
1243 1356
1357/* called with RTNL locked */
1244static void inet_forward_change(struct net *net) 1358static void inet_forward_change(struct net *net)
1245{ 1359{
1246 struct net_device *dev; 1360 struct net_device *dev;
@@ -1249,7 +1363,6 @@ static void inet_forward_change(struct net *net)
1249 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; 1363 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
1250 IPV4_DEVCONF_DFLT(net, FORWARDING) = on; 1364 IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
1251 1365
1252 read_lock(&dev_base_lock);
1253 for_each_netdev(net, dev) { 1366 for_each_netdev(net, dev) {
1254 struct in_device *in_dev; 1367 struct in_device *in_dev;
1255 if (on) 1368 if (on)
@@ -1260,14 +1373,13 @@ static void inet_forward_change(struct net *net)
1260 IN_DEV_CONF_SET(in_dev, FORWARDING, on); 1373 IN_DEV_CONF_SET(in_dev, FORWARDING, on);
1261 rcu_read_unlock(); 1374 rcu_read_unlock();
1262 } 1375 }
1263 read_unlock(&dev_base_lock);
1264} 1376}
1265 1377
1266static int devinet_conf_proc(ctl_table *ctl, int write, 1378static int devinet_conf_proc(ctl_table *ctl, int write,
1267 struct file *filp, void __user *buffer, 1379 void __user *buffer,
1268 size_t *lenp, loff_t *ppos) 1380 size_t *lenp, loff_t *ppos)
1269{ 1381{
1270 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 1382 int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
1271 1383
1272 if (write) { 1384 if (write) {
1273 struct ipv4_devconf *cnf = ctl->extra1; 1385 struct ipv4_devconf *cnf = ctl->extra1;
@@ -1283,72 +1395,25 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
1283 return ret; 1395 return ret;
1284} 1396}
1285 1397
1286static int devinet_conf_sysctl(ctl_table *table,
1287 void __user *oldval, size_t __user *oldlenp,
1288 void __user *newval, size_t newlen)
1289{
1290 struct ipv4_devconf *cnf;
1291 struct net *net;
1292 int *valp = table->data;
1293 int new;
1294 int i;
1295
1296 if (!newval || !newlen)
1297 return 0;
1298
1299 if (newlen != sizeof(int))
1300 return -EINVAL;
1301
1302 if (get_user(new, (int __user *)newval))
1303 return -EFAULT;
1304
1305 if (new == *valp)
1306 return 0;
1307
1308 if (oldval && oldlenp) {
1309 size_t len;
1310
1311 if (get_user(len, oldlenp))
1312 return -EFAULT;
1313
1314 if (len) {
1315 if (len > table->maxlen)
1316 len = table->maxlen;
1317 if (copy_to_user(oldval, valp, len))
1318 return -EFAULT;
1319 if (put_user(len, oldlenp))
1320 return -EFAULT;
1321 }
1322 }
1323
1324 *valp = new;
1325
1326 cnf = table->extra1;
1327 net = table->extra2;
1328 i = (int *)table->data - cnf->data;
1329
1330 set_bit(i, cnf->state);
1331
1332 if (cnf == net->ipv4.devconf_dflt)
1333 devinet_copy_dflt_conf(net, i);
1334
1335 return 1;
1336}
1337
1338static int devinet_sysctl_forward(ctl_table *ctl, int write, 1398static int devinet_sysctl_forward(ctl_table *ctl, int write,
1339 struct file *filp, void __user *buffer, 1399 void __user *buffer,
1340 size_t *lenp, loff_t *ppos) 1400 size_t *lenp, loff_t *ppos)
1341{ 1401{
1342 int *valp = ctl->data; 1402 int *valp = ctl->data;
1343 int val = *valp; 1403 int val = *valp;
1344 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 1404 loff_t pos = *ppos;
1405 int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
1345 1406
1346 if (write && *valp != val) { 1407 if (write && *valp != val) {
1347 struct net *net = ctl->extra2; 1408 struct net *net = ctl->extra2;
1348 1409
1349 if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { 1410 if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
1350 if (!rtnl_trylock()) 1411 if (!rtnl_trylock()) {
1412 /* Restore the original values before restarting */
1413 *valp = val;
1414 *ppos = pos;
1351 return restart_syscall(); 1415 return restart_syscall();
1416 }
1352 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { 1417 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
1353 inet_forward_change(net); 1418 inet_forward_change(net);
1354 } else if (*valp) { 1419 } else if (*valp) {
@@ -1365,13 +1430,13 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
1365 return ret; 1430 return ret;
1366} 1431}
1367 1432
1368int ipv4_doint_and_flush(ctl_table *ctl, int write, 1433static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1369 struct file *filp, void __user *buffer, 1434 void __user *buffer,
1370 size_t *lenp, loff_t *ppos) 1435 size_t *lenp, loff_t *ppos)
1371{ 1436{
1372 int *valp = ctl->data; 1437 int *valp = ctl->data;
1373 int val = *valp; 1438 int val = *valp;
1374 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 1439 int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
1375 struct net *net = ctl->extra2; 1440 struct net *net = ctl->extra2;
1376 1441
1377 if (write && *valp != val) 1442 if (write && *valp != val)
@@ -1380,57 +1445,37 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
1380 return ret; 1445 return ret;
1381} 1446}
1382 1447
1383int ipv4_doint_and_flush_strategy(ctl_table *table, 1448#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
1384 void __user *oldval, size_t __user *oldlenp,
1385 void __user *newval, size_t newlen)
1386{
1387 int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen);
1388 struct net *net = table->extra2;
1389
1390 if (ret == 1)
1391 rt_cache_flush(net, 0);
1392
1393 return ret;
1394}
1395
1396
1397#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc, sysctl) \
1398 { \ 1449 { \
1399 .ctl_name = NET_IPV4_CONF_ ## attr, \
1400 .procname = name, \ 1450 .procname = name, \
1401 .data = ipv4_devconf.data + \ 1451 .data = ipv4_devconf.data + \
1402 NET_IPV4_CONF_ ## attr - 1, \ 1452 IPV4_DEVCONF_ ## attr - 1, \
1403 .maxlen = sizeof(int), \ 1453 .maxlen = sizeof(int), \
1404 .mode = mval, \ 1454 .mode = mval, \
1405 .proc_handler = proc, \ 1455 .proc_handler = proc, \
1406 .strategy = sysctl, \
1407 .extra1 = &ipv4_devconf, \ 1456 .extra1 = &ipv4_devconf, \
1408 } 1457 }
1409 1458
1410#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ 1459#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
1411 DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc, \ 1460 DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
1412 devinet_conf_sysctl)
1413 1461
1414#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ 1462#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
1415 DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc, \ 1463 DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
1416 devinet_conf_sysctl)
1417 1464
1418#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc, sysctl) \ 1465#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
1419 DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc, sysctl) 1466 DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
1420 1467
1421#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ 1468#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
1422 DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush, \ 1469 DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
1423 ipv4_doint_and_flush_strategy)
1424 1470
1425static struct devinet_sysctl_table { 1471static struct devinet_sysctl_table {
1426 struct ctl_table_header *sysctl_header; 1472 struct ctl_table_header *sysctl_header;
1427 struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; 1473 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
1428 char *dev_name; 1474 char *dev_name;
1429} devinet_sysctl = { 1475} devinet_sysctl = {
1430 .devinet_vars = { 1476 .devinet_vars = {
1431 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", 1477 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
1432 devinet_sysctl_forward, 1478 devinet_sysctl_forward),
1433 devinet_conf_sysctl),
1434 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), 1479 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
1435 1480
1436 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), 1481 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
@@ -1440,6 +1485,8 @@ static struct devinet_sysctl_table {
1440 DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), 1485 DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
1441 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, 1486 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
1442 "accept_source_route"), 1487 "accept_source_route"),
1488 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
1489 DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
1443 DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), 1490 DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
1444 DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), 1491 DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
1445 DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), 1492 DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
@@ -1450,6 +1497,7 @@ static struct devinet_sysctl_table {
1450 DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), 1497 DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
1451 DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), 1498 DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
1452 DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), 1499 DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
1500 DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
1453 1501
1454 DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), 1502 DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
1455 DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), 1503 DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -1461,7 +1509,7 @@ static struct devinet_sysctl_table {
1461}; 1509};
1462 1510
1463static int __devinet_sysctl_register(struct net *net, char *dev_name, 1511static int __devinet_sysctl_register(struct net *net, char *dev_name,
1464 int ctl_name, struct ipv4_devconf *p) 1512 struct ipv4_devconf *p)
1465{ 1513{
1466 int i; 1514 int i;
1467 struct devinet_sysctl_table *t; 1515 struct devinet_sysctl_table *t;
@@ -1469,9 +1517,9 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1469#define DEVINET_CTL_PATH_DEV 3 1517#define DEVINET_CTL_PATH_DEV 3
1470 1518
1471 struct ctl_path devinet_ctl_path[] = { 1519 struct ctl_path devinet_ctl_path[] = {
1472 { .procname = "net", .ctl_name = CTL_NET, }, 1520 { .procname = "net", },
1473 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 1521 { .procname = "ipv4", },
1474 { .procname = "conf", .ctl_name = NET_IPV4_CONF, }, 1522 { .procname = "conf", },
1475 { /* to be set */ }, 1523 { /* to be set */ },
1476 { }, 1524 { },
1477 }; 1525 };
@@ -1496,7 +1544,6 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1496 goto free; 1544 goto free;
1497 1545
1498 devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; 1546 devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
1499 devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name;
1500 1547
1501 t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, 1548 t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
1502 t->devinet_vars); 1549 t->devinet_vars);
@@ -1529,10 +1576,9 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
1529 1576
1530static void devinet_sysctl_register(struct in_device *idev) 1577static void devinet_sysctl_register(struct in_device *idev)
1531{ 1578{
1532 neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4, 1579 neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
1533 NET_IPV4_NEIGH, "ipv4", NULL, NULL);
1534 __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, 1580 __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
1535 idev->dev->ifindex, &idev->cnf); 1581 &idev->cnf);
1536} 1582}
1537 1583
1538static void devinet_sysctl_unregister(struct in_device *idev) 1584static void devinet_sysctl_unregister(struct in_device *idev)
@@ -1543,14 +1589,12 @@ static void devinet_sysctl_unregister(struct in_device *idev)
1543 1589
1544static struct ctl_table ctl_forward_entry[] = { 1590static struct ctl_table ctl_forward_entry[] = {
1545 { 1591 {
1546 .ctl_name = NET_IPV4_FORWARD,
1547 .procname = "ip_forward", 1592 .procname = "ip_forward",
1548 .data = &ipv4_devconf.data[ 1593 .data = &ipv4_devconf.data[
1549 NET_IPV4_CONF_FORWARDING - 1], 1594 IPV4_DEVCONF_FORWARDING - 1],
1550 .maxlen = sizeof(int), 1595 .maxlen = sizeof(int),
1551 .mode = 0644, 1596 .mode = 0644,
1552 .proc_handler = devinet_sysctl_forward, 1597 .proc_handler = devinet_sysctl_forward,
1553 .strategy = devinet_conf_sysctl,
1554 .extra1 = &ipv4_devconf, 1598 .extra1 = &ipv4_devconf,
1555 .extra2 = &init_net, 1599 .extra2 = &init_net,
1556 }, 1600 },
@@ -1558,8 +1602,8 @@ static struct ctl_table ctl_forward_entry[] = {
1558}; 1602};
1559 1603
1560static __net_initdata struct ctl_path net_ipv4_path[] = { 1604static __net_initdata struct ctl_path net_ipv4_path[] = {
1561 { .procname = "net", .ctl_name = CTL_NET, }, 1605 { .procname = "net", },
1562 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 1606 { .procname = "ipv4", },
1563 { }, 1607 { },
1564}; 1608};
1565#endif 1609#endif
@@ -1577,7 +1621,7 @@ static __net_init int devinet_init_net(struct net *net)
1577 all = &ipv4_devconf; 1621 all = &ipv4_devconf;
1578 dflt = &ipv4_devconf_dflt; 1622 dflt = &ipv4_devconf_dflt;
1579 1623
1580 if (net != &init_net) { 1624 if (!net_eq(net, &init_net)) {
1581 all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); 1625 all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
1582 if (all == NULL) 1626 if (all == NULL)
1583 goto err_alloc_all; 1627 goto err_alloc_all;
@@ -1591,20 +1635,18 @@ static __net_init int devinet_init_net(struct net *net)
1591 if (tbl == NULL) 1635 if (tbl == NULL)
1592 goto err_alloc_ctl; 1636 goto err_alloc_ctl;
1593 1637
1594 tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1]; 1638 tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
1595 tbl[0].extra1 = all; 1639 tbl[0].extra1 = all;
1596 tbl[0].extra2 = net; 1640 tbl[0].extra2 = net;
1597#endif 1641#endif
1598 } 1642 }
1599 1643
1600#ifdef CONFIG_SYSCTL 1644#ifdef CONFIG_SYSCTL
1601 err = __devinet_sysctl_register(net, "all", 1645 err = __devinet_sysctl_register(net, "all", all);
1602 NET_PROTO_CONF_ALL, all);
1603 if (err < 0) 1646 if (err < 0)
1604 goto err_reg_all; 1647 goto err_reg_all;
1605 1648
1606 err = __devinet_sysctl_register(net, "default", 1649 err = __devinet_sysctl_register(net, "default", dflt);
1607 NET_PROTO_CONF_DEFAULT, dflt);
1608 if (err < 0) 1650 if (err < 0)
1609 goto err_reg_dflt; 1651 goto err_reg_dflt;
1610 1652
@@ -1658,6 +1700,14 @@ static __net_initdata struct pernet_operations devinet_ops = {
1658 .exit = devinet_exit_net, 1700 .exit = devinet_exit_net,
1659}; 1701};
1660 1702
1703static struct rtnl_af_ops inet_af_ops = {
1704 .family = AF_INET,
1705 .fill_link_af = inet_fill_link_af,
1706 .get_link_af_size = inet_get_link_af_size,
1707 .validate_link_af = inet_validate_link_af,
1708 .set_link_af = inet_set_link_af,
1709};
1710
1661void __init devinet_init(void) 1711void __init devinet_init(void)
1662{ 1712{
1663 register_pernet_subsys(&devinet_ops); 1713 register_pernet_subsys(&devinet_ops);
@@ -1665,13 +1715,10 @@ void __init devinet_init(void)
1665 register_gifconf(PF_INET, inet_gifconf); 1715 register_gifconf(PF_INET, inet_gifconf);
1666 register_netdevice_notifier(&ip_netdev_notifier); 1716 register_netdevice_notifier(&ip_netdev_notifier);
1667 1717
1718 rtnl_af_register(&inet_af_ops);
1719
1668 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); 1720 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
1669 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); 1721 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
1670 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); 1722 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
1671} 1723}
1672 1724
1673EXPORT_SYMBOL(in_dev_finish_destroy);
1674EXPORT_SYMBOL(inet_select_addr);
1675EXPORT_SYMBOL(inetdev_by_index);
1676EXPORT_SYMBOL(register_inetaddr_notifier);
1677EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 18bb383ea393..e42a905180f0 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,8 @@ struct esp_skb_cb {
23 23
24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) 24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
25 25
26static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
27
26/* 28/*
27 * Allocate an AEAD request structure with extra space for SG and IV. 29 * Allocate an AEAD request structure with extra space for SG and IV.
28 * 30 *
@@ -117,25 +119,35 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
117 int blksize; 119 int blksize;
118 int clen; 120 int clen;
119 int alen; 121 int alen;
122 int plen;
123 int tfclen;
120 int nfrags; 124 int nfrags;
121 125
122 /* skb is pure payload to encrypt */ 126 /* skb is pure payload to encrypt */
123 127
124 err = -ENOMEM; 128 err = -ENOMEM;
125 129
126 /* Round to block size */
127 clen = skb->len;
128
129 esp = x->data; 130 esp = x->data;
130 aead = esp->aead; 131 aead = esp->aead;
131 alen = crypto_aead_authsize(aead); 132 alen = crypto_aead_authsize(aead);
132 133
134 tfclen = 0;
135 if (x->tfcpad) {
136 struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
137 u32 padto;
138
139 padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
140 if (skb->len < padto)
141 tfclen = padto - skb->len;
142 }
133 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 143 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
134 clen = ALIGN(clen + 2, blksize); 144 clen = ALIGN(skb->len + 2 + tfclen, blksize);
135 if (esp->padlen) 145 if (esp->padlen)
136 clen = ALIGN(clen, esp->padlen); 146 clen = ALIGN(clen, esp->padlen);
147 plen = clen - skb->len - tfclen;
137 148
138 if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) 149 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
150 if (err < 0)
139 goto error; 151 goto error;
140 nfrags = err; 152 nfrags = err;
141 153
@@ -150,13 +162,17 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
150 162
151 /* Fill padding... */ 163 /* Fill padding... */
152 tail = skb_tail_pointer(trailer); 164 tail = skb_tail_pointer(trailer);
165 if (tfclen) {
166 memset(tail, 0, tfclen);
167 tail += tfclen;
168 }
153 do { 169 do {
154 int i; 170 int i;
155 for (i=0; i<clen-skb->len - 2; i++) 171 for (i = 0; i < plen - 2; i++)
156 tail[i] = i + 1; 172 tail[i] = i + 1;
157 } while (0); 173 } while (0);
158 tail[clen - skb->len - 2] = (clen - skb->len) - 2; 174 tail[plen - 2] = plen - 2;
159 tail[clen - skb->len - 1] = *skb_mac_header(skb); 175 tail[plen - 1] = *skb_mac_header(skb);
160 pskb_put(skb, trailer, clen - skb->len + alen); 176 pskb_put(skb, trailer, clen - skb->len + alen);
161 177
162 skb_push(skb, -skb_network_offset(skb)); 178 skb_push(skb, -skb_network_offset(skb));
@@ -422,7 +438,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
422 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 438 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
423 return; 439 return;
424 440
425 x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); 441 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
426 if (!x) 442 if (!x)
427 return; 443 return;
428 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 444 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -530,7 +546,7 @@ static int esp_init_authenc(struct xfrm_state *x)
530 } 546 }
531 547
532 err = crypto_aead_setauthsize( 548 err = crypto_aead_setauthsize(
533 aead, aalg_desc->uinfo.auth.icv_truncbits / 8); 549 aead, x->aalg->alg_trunc_len / 8);
534 if (err) 550 if (err)
535 goto free_key; 551 goto free_key;
536 } 552 }
@@ -615,7 +631,7 @@ static const struct xfrm_type esp_type =
615 .output = esp_output 631 .output = esp_output
616}; 632};
617 633
618static struct net_protocol esp4_protocol = { 634static const struct net_protocol esp4_protocol = {
619 .handler = xfrm4_rcv, 635 .handler = xfrm4_rcv,
620 .err_handler = esp4_err, 636 .err_handler = esp4_err,
621 .no_policy = 1, 637 .no_policy = 1,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e2f950592566..1d2cdd43a878 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -34,6 +34,7 @@
34#include <linux/skbuff.h> 34#include <linux/skbuff.h>
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/slab.h>
37 38
38#include <net/ip.h> 39#include <net/ip.h>
39#include <net/protocol.h> 40#include <net/protocol.h>
@@ -125,7 +126,7 @@ void fib_select_default(struct net *net,
125#endif 126#endif
126 tb = fib_get_table(net, table); 127 tb = fib_get_table(net, table);
127 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 128 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
128 tb->tb_select_default(tb, flp, res); 129 fib_table_select_default(tb, flp, res);
129} 130}
130 131
131static void fib_flush(struct net *net) 132static void fib_flush(struct net *net)
@@ -139,21 +140,27 @@ static void fib_flush(struct net *net)
139 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 140 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
140 head = &net->ipv4.fib_table_hash[h]; 141 head = &net->ipv4.fib_table_hash[h];
141 hlist_for_each_entry(tb, node, head, tb_hlist) 142 hlist_for_each_entry(tb, node, head, tb_hlist)
142 flushed += tb->tb_flush(tb); 143 flushed += fib_table_flush(tb);
143 } 144 }
144 145
145 if (flushed) 146 if (flushed)
146 rt_cache_flush(net, -1); 147 rt_cache_flush(net, -1);
147} 148}
148 149
149/* 150/**
150 * Find the first device with a given source address. 151 * __ip_dev_find - find the first device with a given source address.
152 * @net: the net namespace
153 * @addr: the source address
154 * @devref: if true, take a reference on the found device
155 *
156 * If a caller uses devref=false, it should be protected by RCU, or RTNL
151 */ 157 */
152 158struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
153struct net_device * ip_dev_find(struct net *net, __be32 addr)
154{ 159{
155 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 160 struct flowi fl = {
156 struct fib_result res; 161 .fl4_dst = addr,
162 };
163 struct fib_result res = { 0 };
157 struct net_device *dev = NULL; 164 struct net_device *dev = NULL;
158 struct fib_table *local_table; 165 struct fib_table *local_table;
159 166
@@ -161,19 +168,24 @@ struct net_device * ip_dev_find(struct net *net, __be32 addr)
161 res.r = NULL; 168 res.r = NULL;
162#endif 169#endif
163 170
171 rcu_read_lock();
164 local_table = fib_get_table(net, RT_TABLE_LOCAL); 172 local_table = fib_get_table(net, RT_TABLE_LOCAL);
165 if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) 173 if (!local_table ||
174 fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
175 rcu_read_unlock();
166 return NULL; 176 return NULL;
177 }
167 if (res.type != RTN_LOCAL) 178 if (res.type != RTN_LOCAL)
168 goto out; 179 goto out;
169 dev = FIB_RES_DEV(res); 180 dev = FIB_RES_DEV(res);
170 181
171 if (dev) 182 if (dev && devref)
172 dev_hold(dev); 183 dev_hold(dev);
173out: 184out:
174 fib_res_put(&res); 185 rcu_read_unlock();
175 return dev; 186 return dev;
176} 187}
188EXPORT_SYMBOL(__ip_dev_find);
177 189
178/* 190/*
179 * Find address type as if only "dev" was present in the system. If 191 * Find address type as if only "dev" was present in the system. If
@@ -183,7 +195,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
183 const struct net_device *dev, 195 const struct net_device *dev,
184 __be32 addr) 196 __be32 addr)
185{ 197{
186 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 198 struct flowi fl = { .fl4_dst = addr };
187 struct fib_result res; 199 struct fib_result res;
188 unsigned ret = RTN_BROADCAST; 200 unsigned ret = RTN_BROADCAST;
189 struct fib_table *local_table; 201 struct fib_table *local_table;
@@ -200,11 +212,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
200 local_table = fib_get_table(net, RT_TABLE_LOCAL); 212 local_table = fib_get_table(net, RT_TABLE_LOCAL);
201 if (local_table) { 213 if (local_table) {
202 ret = RTN_UNICAST; 214 ret = RTN_UNICAST;
203 if (!local_table->tb_lookup(local_table, &fl, &res)) { 215 rcu_read_lock();
216 if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
204 if (!dev || dev == res.fi->fib_dev) 217 if (!dev || dev == res.fi->fib_dev)
205 ret = res.type; 218 ret = res.type;
206 fib_res_put(&res);
207 } 219 }
220 rcu_read_unlock();
208 } 221 }
209 return ret; 222 return ret;
210} 223}
@@ -213,43 +226,50 @@ unsigned int inet_addr_type(struct net *net, __be32 addr)
213{ 226{
214 return __inet_dev_addr_type(net, NULL, addr); 227 return __inet_dev_addr_type(net, NULL, addr);
215} 228}
229EXPORT_SYMBOL(inet_addr_type);
216 230
217unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, 231unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
218 __be32 addr) 232 __be32 addr)
219{ 233{
220 return __inet_dev_addr_type(net, dev, addr); 234 return __inet_dev_addr_type(net, dev, addr);
221} 235}
236EXPORT_SYMBOL(inet_dev_addr_type);
222 237
223/* Given (packet source, input interface) and optional (dst, oif, tos): 238/* Given (packet source, input interface) and optional (dst, oif, tos):
224 - (main) check, that source is valid i.e. not broadcast or our local 239 * - (main) check, that source is valid i.e. not broadcast or our local
225 address. 240 * address.
226 - figure out what "logical" interface this packet arrived 241 * - figure out what "logical" interface this packet arrived
227 and calculate "specific destination" address. 242 * and calculate "specific destination" address.
228 - check, that packet arrived from expected physical interface. 243 * - check, that packet arrived from expected physical interface.
244 * called with rcu_read_lock()
229 */ 245 */
230
231int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, 246int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
232 struct net_device *dev, __be32 *spec_dst, u32 *itag) 247 struct net_device *dev, __be32 *spec_dst,
248 u32 *itag, u32 mark)
233{ 249{
234 struct in_device *in_dev; 250 struct in_device *in_dev;
235 struct flowi fl = { .nl_u = { .ip4_u = 251 struct flowi fl = {
236 { .daddr = src, 252 .fl4_dst = src,
237 .saddr = dst, 253 .fl4_src = dst,
238 .tos = tos } }, 254 .fl4_tos = tos,
239 .iif = oif }; 255 .mark = mark,
256 .iif = oif
257 };
240 struct fib_result res; 258 struct fib_result res;
241 int no_addr, rpf; 259 int no_addr, rpf, accept_local;
260 bool dev_match;
242 int ret; 261 int ret;
243 struct net *net; 262 struct net *net;
244 263
245 no_addr = rpf = 0; 264 no_addr = rpf = accept_local = 0;
246 rcu_read_lock();
247 in_dev = __in_dev_get_rcu(dev); 265 in_dev = __in_dev_get_rcu(dev);
248 if (in_dev) { 266 if (in_dev) {
249 no_addr = in_dev->ifa_list == NULL; 267 no_addr = in_dev->ifa_list == NULL;
250 rpf = IN_DEV_RPFILTER(in_dev); 268 rpf = IN_DEV_RPFILTER(in_dev);
269 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
270 if (mark && !IN_DEV_SRC_VMARK(in_dev))
271 fl.mark = 0;
251 } 272 }
252 rcu_read_unlock();
253 273
254 if (in_dev == NULL) 274 if (in_dev == NULL)
255 goto e_inval; 275 goto e_inval;
@@ -257,25 +277,35 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
257 net = dev_net(dev); 277 net = dev_net(dev);
258 if (fib_lookup(net, &fl, &res)) 278 if (fib_lookup(net, &fl, &res))
259 goto last_resort; 279 goto last_resort;
260 if (res.type != RTN_UNICAST) 280 if (res.type != RTN_UNICAST) {
261 goto e_inval_res; 281 if (res.type != RTN_LOCAL || !accept_local)
282 goto e_inval;
283 }
262 *spec_dst = FIB_RES_PREFSRC(res); 284 *spec_dst = FIB_RES_PREFSRC(res);
263 fib_combine_itag(itag, &res); 285 fib_combine_itag(itag, &res);
286 dev_match = false;
287
264#ifdef CONFIG_IP_ROUTE_MULTIPATH 288#ifdef CONFIG_IP_ROUTE_MULTIPATH
265 if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) 289 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
290 struct fib_nh *nh = &res.fi->fib_nh[ret];
291
292 if (nh->nh_dev == dev) {
293 dev_match = true;
294 break;
295 }
296 }
266#else 297#else
267 if (FIB_RES_DEV(res) == dev) 298 if (FIB_RES_DEV(res) == dev)
299 dev_match = true;
268#endif 300#endif
269 { 301 if (dev_match) {
270 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 302 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
271 fib_res_put(&res);
272 return ret; 303 return ret;
273 } 304 }
274 fib_res_put(&res);
275 if (no_addr) 305 if (no_addr)
276 goto last_resort; 306 goto last_resort;
277 if (rpf == 1) 307 if (rpf == 1)
278 goto e_inval; 308 goto e_rpf;
279 fl.oif = dev->ifindex; 309 fl.oif = dev->ifindex;
280 310
281 ret = 0; 311 ret = 0;
@@ -284,21 +314,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
284 *spec_dst = FIB_RES_PREFSRC(res); 314 *spec_dst = FIB_RES_PREFSRC(res);
285 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 315 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
286 } 316 }
287 fib_res_put(&res);
288 } 317 }
289 return ret; 318 return ret;
290 319
291last_resort: 320last_resort:
292 if (rpf) 321 if (rpf)
293 goto e_inval; 322 goto e_rpf;
294 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 323 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
295 *itag = 0; 324 *itag = 0;
296 return 0; 325 return 0;
297 326
298e_inval_res:
299 fib_res_put(&res);
300e_inval: 327e_inval:
301 return -EINVAL; 328 return -EINVAL;
329e_rpf:
330 return -EXDEV;
302} 331}
303 332
304static inline __be32 sk_extract_addr(struct sockaddr *addr) 333static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -447,9 +476,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
447} 476}
448 477
449/* 478/*
450 * Handle IP routing ioctl calls. These are used to manipulate the routing tables 479 * Handle IP routing ioctl calls.
480 * These are used to manipulate the routing tables
451 */ 481 */
452
453int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) 482int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
454{ 483{
455 struct fib_config cfg; 484 struct fib_config cfg;
@@ -473,13 +502,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
473 if (cmd == SIOCDELRT) { 502 if (cmd == SIOCDELRT) {
474 tb = fib_get_table(net, cfg.fc_table); 503 tb = fib_get_table(net, cfg.fc_table);
475 if (tb) 504 if (tb)
476 err = tb->tb_delete(tb, &cfg); 505 err = fib_table_delete(tb, &cfg);
477 else 506 else
478 err = -ESRCH; 507 err = -ESRCH;
479 } else { 508 } else {
480 tb = fib_new_table(net, cfg.fc_table); 509 tb = fib_new_table(net, cfg.fc_table);
481 if (tb) 510 if (tb)
482 err = tb->tb_insert(tb, &cfg); 511 err = fib_table_insert(tb, &cfg);
483 else 512 else
484 err = -ENOBUFS; 513 err = -ENOBUFS;
485 } 514 }
@@ -493,7 +522,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
493 return -EINVAL; 522 return -EINVAL;
494} 523}
495 524
496const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { 525const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
497 [RTA_DST] = { .type = NLA_U32 }, 526 [RTA_DST] = { .type = NLA_U32 },
498 [RTA_SRC] = { .type = NLA_U32 }, 527 [RTA_SRC] = { .type = NLA_U32 },
499 [RTA_IIF] = { .type = NLA_U32 }, 528 [RTA_IIF] = { .type = NLA_U32 },
@@ -507,7 +536,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
507}; 536};
508 537
509static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 538static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
510 struct nlmsghdr *nlh, struct fib_config *cfg) 539 struct nlmsghdr *nlh, struct fib_config *cfg)
511{ 540{
512 struct nlattr *attr; 541 struct nlattr *attr;
513 int err, remaining; 542 int err, remaining;
@@ -594,7 +623,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
594 goto errout; 623 goto errout;
595 } 624 }
596 625
597 err = tb->tb_delete(tb, &cfg); 626 err = fib_table_delete(tb, &cfg);
598errout: 627errout:
599 return err; 628 return err;
600} 629}
@@ -616,7 +645,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
616 goto errout; 645 goto errout;
617 } 646 }
618 647
619 err = tb->tb_insert(tb, &cfg); 648 err = fib_table_insert(tb, &cfg);
620errout: 649errout:
621 return err; 650 return err;
622} 651}
@@ -647,7 +676,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
647 if (dumped) 676 if (dumped)
648 memset(&cb->args[2], 0, sizeof(cb->args) - 677 memset(&cb->args[2], 0, sizeof(cb->args) -
649 2 * sizeof(cb->args[0])); 678 2 * sizeof(cb->args[0]));
650 if (tb->tb_dump(tb, skb, cb) < 0) 679 if (fib_table_dump(tb, skb, cb) < 0)
651 goto out; 680 goto out;
652 dumped = 1; 681 dumped = 1;
653next: 682next:
@@ -662,12 +691,11 @@ out:
662} 691}
663 692
664/* Prepare and feed intra-kernel routing request. 693/* Prepare and feed intra-kernel routing request.
665 Really, it should be netlink message, but :-( netlink 694 * Really, it should be netlink message, but :-( netlink
666 can be not configured, so that we feed it directly 695 * can be not configured, so that we feed it directly
667 to fib engine. It is legal, because all events occur 696 * to fib engine. It is legal, because all events occur
668 only when netlink is already locked. 697 * only when netlink is already locked.
669 */ 698 */
670
671static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) 699static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
672{ 700{
673 struct net *net = dev_net(ifa->ifa_dev->dev); 701 struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -701,9 +729,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
701 cfg.fc_scope = RT_SCOPE_HOST; 729 cfg.fc_scope = RT_SCOPE_HOST;
702 730
703 if (cmd == RTM_NEWROUTE) 731 if (cmd == RTM_NEWROUTE)
704 tb->tb_insert(tb, &cfg); 732 fib_table_insert(tb, &cfg);
705 else 733 else
706 tb->tb_delete(tb, &cfg); 734 fib_table_delete(tb, &cfg);
707} 735}
708 736
709void fib_add_ifaddr(struct in_ifaddr *ifa) 737void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -713,9 +741,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
713 struct in_ifaddr *prim = ifa; 741 struct in_ifaddr *prim = ifa;
714 __be32 mask = ifa->ifa_mask; 742 __be32 mask = ifa->ifa_mask;
715 __be32 addr = ifa->ifa_local; 743 __be32 addr = ifa->ifa_local;
716 __be32 prefix = ifa->ifa_address&mask; 744 __be32 prefix = ifa->ifa_address & mask;
717 745
718 if (ifa->ifa_flags&IFA_F_SECONDARY) { 746 if (ifa->ifa_flags & IFA_F_SECONDARY) {
719 prim = inet_ifa_byprefix(in_dev, prefix, mask); 747 prim = inet_ifa_byprefix(in_dev, prefix, mask);
720 if (prim == NULL) { 748 if (prim == NULL) {
721 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); 749 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -725,22 +753,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
725 753
726 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); 754 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
727 755
728 if (!(dev->flags&IFF_UP)) 756 if (!(dev->flags & IFF_UP))
729 return; 757 return;
730 758
731 /* Add broadcast address, if it is explicitly assigned. */ 759 /* Add broadcast address, if it is explicitly assigned. */
732 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) 760 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
733 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 761 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
734 762
735 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && 763 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
736 (prefix != addr || ifa->ifa_prefixlen < 32)) { 764 (prefix != addr || ifa->ifa_prefixlen < 32)) {
737 fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 765 fib_magic(RTM_NEWROUTE,
738 RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); 766 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
767 prefix, ifa->ifa_prefixlen, prim);
739 768
740 /* Add network specific broadcasts, when it takes a sense */ 769 /* Add network specific broadcasts, when it takes a sense */
741 if (ifa->ifa_prefixlen < 31) { 770 if (ifa->ifa_prefixlen < 31) {
742 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); 771 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
743 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); 772 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
773 32, prim);
744 } 774 }
745 } 775 }
746} 776}
@@ -751,17 +781,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
751 struct net_device *dev = in_dev->dev; 781 struct net_device *dev = in_dev->dev;
752 struct in_ifaddr *ifa1; 782 struct in_ifaddr *ifa1;
753 struct in_ifaddr *prim = ifa; 783 struct in_ifaddr *prim = ifa;
754 __be32 brd = ifa->ifa_address|~ifa->ifa_mask; 784 __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
755 __be32 any = ifa->ifa_address&ifa->ifa_mask; 785 __be32 any = ifa->ifa_address & ifa->ifa_mask;
756#define LOCAL_OK 1 786#define LOCAL_OK 1
757#define BRD_OK 2 787#define BRD_OK 2
758#define BRD0_OK 4 788#define BRD0_OK 4
759#define BRD1_OK 8 789#define BRD1_OK 8
760 unsigned ok = 0; 790 unsigned ok = 0;
761 791
762 if (!(ifa->ifa_flags&IFA_F_SECONDARY)) 792 if (!(ifa->ifa_flags & IFA_F_SECONDARY))
763 fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 793 fib_magic(RTM_DELROUTE,
764 RTN_UNICAST, any, ifa->ifa_prefixlen, prim); 794 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
795 any, ifa->ifa_prefixlen, prim);
765 else { 796 else {
766 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 797 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
767 if (prim == NULL) { 798 if (prim == NULL) {
@@ -771,9 +802,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
771 } 802 }
772 803
773 /* Deletion is more complicated than add. 804 /* Deletion is more complicated than add.
774 We should take care of not to delete too much :-) 805 * We should take care of not to delete too much :-)
775 806 *
776 Scan address list to be sure that addresses are really gone. 807 * Scan address list to be sure that addresses are really gone.
777 */ 808 */
778 809
779 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { 810 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -787,23 +818,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
787 ok |= BRD0_OK; 818 ok |= BRD0_OK;
788 } 819 }
789 820
790 if (!(ok&BRD_OK)) 821 if (!(ok & BRD_OK))
791 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 822 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
792 if (!(ok&BRD1_OK)) 823 if (!(ok & BRD1_OK))
793 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); 824 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
794 if (!(ok&BRD0_OK)) 825 if (!(ok & BRD0_OK))
795 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); 826 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
796 if (!(ok&LOCAL_OK)) { 827 if (!(ok & LOCAL_OK)) {
797 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); 828 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
798 829
799 /* Check, that this local address finally disappeared. */ 830 /* Check, that this local address finally disappeared. */
800 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { 831 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
801 /* And the last, but not the least thing. 832 /* And the last, but not the least thing.
802 We must flush stray FIB entries. 833 * We must flush stray FIB entries.
803 834 *
804 First of all, we scan fib_info list searching 835 * First of all, we scan fib_info list searching
805 for stray nexthop entries, then ignite fib_flush. 836 * for stray nexthop entries, then ignite fib_flush.
806 */ 837 */
807 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) 838 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
808 fib_flush(dev_net(dev)); 839 fib_flush(dev_net(dev));
809 } 840 }
@@ -814,14 +845,16 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
814#undef BRD1_OK 845#undef BRD1_OK
815} 846}
816 847
817static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) 848static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
818{ 849{
819 850
820 struct fib_result res; 851 struct fib_result res;
821 struct flowi fl = { .mark = frn->fl_mark, 852 struct flowi fl = {
822 .nl_u = { .ip4_u = { .daddr = frn->fl_addr, 853 .mark = frn->fl_mark,
823 .tos = frn->fl_tos, 854 .fl4_dst = frn->fl_addr,
824 .scope = frn->fl_scope } } }; 855 .fl4_tos = frn->fl_tos,
856 .fl4_scope = frn->fl_scope,
857 };
825 858
826#ifdef CONFIG_IP_MULTIPLE_TABLES 859#ifdef CONFIG_IP_MULTIPLE_TABLES
827 res.r = NULL; 860 res.r = NULL;
@@ -832,15 +865,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
832 local_bh_disable(); 865 local_bh_disable();
833 866
834 frn->tb_id = tb->tb_id; 867 frn->tb_id = tb->tb_id;
835 frn->err = tb->tb_lookup(tb, &fl, &res); 868 rcu_read_lock();
869 frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
836 870
837 if (!frn->err) { 871 if (!frn->err) {
838 frn->prefixlen = res.prefixlen; 872 frn->prefixlen = res.prefixlen;
839 frn->nh_sel = res.nh_sel; 873 frn->nh_sel = res.nh_sel;
840 frn->type = res.type; 874 frn->type = res.type;
841 frn->scope = res.scope; 875 frn->scope = res.scope;
842 fib_res_put(&res);
843 } 876 }
877 rcu_read_unlock();
844 local_bh_enable(); 878 local_bh_enable();
845 } 879 }
846} 880}
@@ -869,13 +903,13 @@ static void nl_fib_input(struct sk_buff *skb)
869 903
870 nl_fib_lookup(frn, tb); 904 nl_fib_lookup(frn, tb);
871 905
872 pid = NETLINK_CB(skb).pid; /* pid of sending process */ 906 pid = NETLINK_CB(skb).pid; /* pid of sending process */
873 NETLINK_CB(skb).pid = 0; /* from kernel */ 907 NETLINK_CB(skb).pid = 0; /* from kernel */
874 NETLINK_CB(skb).dst_group = 0; /* unicast */ 908 NETLINK_CB(skb).dst_group = 0; /* unicast */
875 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); 909 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
876} 910}
877 911
878static int nl_fib_lookup_init(struct net *net) 912static int __net_init nl_fib_lookup_init(struct net *net)
879{ 913{
880 struct sock *sk; 914 struct sock *sk;
881 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, 915 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
@@ -892,11 +926,11 @@ static void nl_fib_lookup_exit(struct net *net)
892 net->ipv4.fibnl = NULL; 926 net->ipv4.fibnl = NULL;
893} 927}
894 928
895static void fib_disable_ip(struct net_device *dev, int force) 929static void fib_disable_ip(struct net_device *dev, int force, int delay)
896{ 930{
897 if (fib_sync_down_dev(dev, force)) 931 if (fib_sync_down_dev(dev, force))
898 fib_flush(dev_net(dev)); 932 fib_flush(dev_net(dev));
899 rt_cache_flush(dev_net(dev), 0); 933 rt_cache_flush(dev_net(dev), delay);
900 arp_ifdown(dev); 934 arp_ifdown(dev);
901} 935}
902 936
@@ -917,9 +951,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
917 fib_del_ifaddr(ifa); 951 fib_del_ifaddr(ifa);
918 if (ifa->ifa_dev->ifa_list == NULL) { 952 if (ifa->ifa_dev->ifa_list == NULL) {
919 /* Last address was deleted from this interface. 953 /* Last address was deleted from this interface.
920 Disable IP. 954 * Disable IP.
921 */ 955 */
922 fib_disable_ip(dev, 1); 956 fib_disable_ip(dev, 1, 0);
923 } else { 957 } else {
924 rt_cache_flush(dev_net(dev), -1); 958 rt_cache_flush(dev_net(dev), -1);
925 } 959 }
@@ -934,7 +968,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
934 struct in_device *in_dev = __in_dev_get_rtnl(dev); 968 struct in_device *in_dev = __in_dev_get_rtnl(dev);
935 969
936 if (event == NETDEV_UNREGISTER) { 970 if (event == NETDEV_UNREGISTER) {
937 fib_disable_ip(dev, 2); 971 fib_disable_ip(dev, 2, -1);
938 return NOTIFY_DONE; 972 return NOTIFY_DONE;
939 } 973 }
940 974
@@ -952,12 +986,19 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
952 rt_cache_flush(dev_net(dev), -1); 986 rt_cache_flush(dev_net(dev), -1);
953 break; 987 break;
954 case NETDEV_DOWN: 988 case NETDEV_DOWN:
955 fib_disable_ip(dev, 0); 989 fib_disable_ip(dev, 0, 0);
956 break; 990 break;
957 case NETDEV_CHANGEMTU: 991 case NETDEV_CHANGEMTU:
958 case NETDEV_CHANGE: 992 case NETDEV_CHANGE:
959 rt_cache_flush(dev_net(dev), 0); 993 rt_cache_flush(dev_net(dev), 0);
960 break; 994 break;
995 case NETDEV_UNREGISTER_BATCH:
996 /* The batch unregister is only called on the first
997 * device in the list of devices being unregistered.
998 * Therefore we should not pass dev_net(dev) in here.
999 */
1000 rt_cache_flush_batch(NULL);
1001 break;
961 } 1002 }
962 return NOTIFY_DONE; 1003 return NOTIFY_DONE;
963} 1004}
@@ -973,16 +1014,15 @@ static struct notifier_block fib_netdev_notifier = {
973static int __net_init ip_fib_net_init(struct net *net) 1014static int __net_init ip_fib_net_init(struct net *net)
974{ 1015{
975 int err; 1016 int err;
976 unsigned int i; 1017 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1018
1019 /* Avoid false sharing : Use at least a full cache line */
1020 size = max_t(size_t, size, L1_CACHE_BYTES);
977 1021
978 net->ipv4.fib_table_hash = kzalloc( 1022 net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
979 sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
980 if (net->ipv4.fib_table_hash == NULL) 1023 if (net->ipv4.fib_table_hash == NULL)
981 return -ENOMEM; 1024 return -ENOMEM;
982 1025
983 for (i = 0; i < FIB_TABLE_HASHSZ; i++)
984 INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
985
986 err = fib4_rules_init(net); 1026 err = fib4_rules_init(net);
987 if (err < 0) 1027 if (err < 0)
988 goto fail; 1028 goto fail;
@@ -993,7 +1033,7 @@ fail:
993 return err; 1033 return err;
994} 1034}
995 1035
996static void __net_exit ip_fib_net_exit(struct net *net) 1036static void ip_fib_net_exit(struct net *net)
997{ 1037{
998 unsigned int i; 1038 unsigned int i;
999 1039
@@ -1009,8 +1049,8 @@ static void __net_exit ip_fib_net_exit(struct net *net)
1009 head = &net->ipv4.fib_table_hash[i]; 1049 head = &net->ipv4.fib_table_hash[i];
1010 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { 1050 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1011 hlist_del(node); 1051 hlist_del(node);
1012 tb->tb_flush(tb); 1052 fib_table_flush(tb);
1013 kfree(tb); 1053 fib_free_table(tb);
1014 } 1054 }
1015 } 1055 }
1016 kfree(net->ipv4.fib_table_hash); 1056 kfree(net->ipv4.fib_table_hash);
@@ -1063,7 +1103,3 @@ void __init ip_fib_init(void)
1063 1103
1064 fib_hash_init(); 1104 fib_hash_init();
1065} 1105}
1066
1067EXPORT_SYMBOL(inet_addr_type);
1068EXPORT_SYMBOL(inet_dev_addr_type);
1069EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index ecd39454235c..b3acb0417b21 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -32,6 +32,7 @@
32#include <linux/skbuff.h> 32#include <linux/skbuff.h>
33#include <linux/netlink.h> 33#include <linux/netlink.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/slab.h>
35 36
36#include <net/net_namespace.h> 37#include <net/net_namespace.h>
37#include <net/ip.h> 38#include <net/ip.h>
@@ -53,36 +54,37 @@ struct fib_node {
53 struct fib_alias fn_embedded_alias; 54 struct fib_alias fn_embedded_alias;
54}; 55};
55 56
56struct fn_zone { 57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
57 struct fn_zone *fz_next; /* Next not empty zone */
58 struct hlist_head *fz_hash; /* Hash table pointer */
59 int fz_nent; /* Number of entries */
60 58
61 int fz_divisor; /* Hash divisor */ 59struct fn_zone {
60 struct fn_zone __rcu *fz_next; /* Next not empty zone */
61 struct hlist_head __rcu *fz_hash; /* Hash table pointer */
62 seqlock_t fz_lock;
62 u32 fz_hashmask; /* (fz_divisor - 1) */ 63 u32 fz_hashmask; /* (fz_divisor - 1) */
63#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
64 64
65 int fz_order; /* Zone order */ 65 u8 fz_order; /* Zone order (0..32) */
66 __be32 fz_mask; 66 u8 fz_revorder; /* 32 - fz_order */
67 __be32 fz_mask; /* inet_make_mask(order) */
67#define FZ_MASK(fz) ((fz)->fz_mask) 68#define FZ_MASK(fz) ((fz)->fz_mask)
68};
69 69
70/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask 70 struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
71 * can be cheaper than memory lookup, so that FZ_* macros are used. 71
72 */ 72 int fz_nent; /* Number of entries */
73 int fz_divisor; /* Hash size (mask+1) */
74};
73 75
74struct fn_hash { 76struct fn_hash {
75 struct fn_zone *fn_zones[33]; 77 struct fn_zone *fn_zones[33];
76 struct fn_zone *fn_zone_list; 78 struct fn_zone __rcu *fn_zone_list;
77}; 79};
78 80
79static inline u32 fn_hash(__be32 key, struct fn_zone *fz) 81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
80{ 82{
81 u32 h = ntohl(key)>>(32 - fz->fz_order); 83 u32 h = ntohl(key) >> fz->fz_revorder;
82 h ^= (h>>20); 84 h ^= (h>>20);
83 h ^= (h>>10); 85 h ^= (h>>10);
84 h ^= (h>>5); 86 h ^= (h>>5);
85 h &= FZ_HASHMASK(fz); 87 h &= fz->fz_hashmask;
86 return h; 88 return h;
87} 89}
88 90
@@ -91,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
91 return dst & FZ_MASK(fz); 93 return dst & FZ_MASK(fz);
92} 94}
93 95
94static DEFINE_RWLOCK(fib_hash_lock);
95static unsigned int fib_hash_genid; 96static unsigned int fib_hash_genid;
96 97
97#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) 98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -100,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
100{ 101{
101 unsigned long size = divisor * sizeof(struct hlist_head); 102 unsigned long size = divisor * sizeof(struct hlist_head);
102 103
103 if (size <= PAGE_SIZE) { 104 if (size <= PAGE_SIZE)
104 return kzalloc(size, GFP_KERNEL); 105 return kzalloc(size, GFP_KERNEL);
105 } else { 106
106 return (struct hlist_head *) 107 return (struct hlist_head *)
107 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); 108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
108 }
109} 109}
110 110
111/* The fib hash lock must be held when this is called. */ 111/* The fib hash lock must be held when this is called. */
@@ -122,10 +122,11 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { 122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
123 struct hlist_head *new_head; 123 struct hlist_head *new_head;
124 124
125 hlist_del(&f->fn_hash); 125 hlist_del_rcu(&f->fn_hash);
126 126
127 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; 127 new_head = rcu_dereference_protected(fz->fz_hash, 1) +
128 hlist_add_head(&f->fn_hash, new_head); 128 fn_hash(f->fn_key, fz);
129 hlist_add_head_rcu(&f->fn_hash, new_head);
129 } 130 }
130 } 131 }
131} 132}
@@ -146,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
146 int old_divisor, new_divisor; 147 int old_divisor, new_divisor;
147 u32 new_hashmask; 148 u32 new_hashmask;
148 149
149 old_divisor = fz->fz_divisor; 150 new_divisor = old_divisor = fz->fz_divisor;
150 151
151 switch (old_divisor) { 152 switch (old_divisor) {
152 case 16: 153 case EMBEDDED_HASH_SIZE:
153 new_divisor = 256; 154 new_divisor *= EMBEDDED_HASH_SIZE;
154 break; 155 break;
155 case 256: 156 case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
156 new_divisor = 1024; 157 new_divisor *= (EMBEDDED_HASH_SIZE/2);
157 break; 158 break;
158 default: 159 default:
159 if ((old_divisor << 1) > FZ_MAX_DIVISOR) { 160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -174,31 +175,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
174 ht = fz_hash_alloc(new_divisor); 175 ht = fz_hash_alloc(new_divisor);
175 176
176 if (ht) { 177 if (ht) {
177 write_lock_bh(&fib_hash_lock); 178 struct fn_zone nfz;
178 old_ht = fz->fz_hash; 179
179 fz->fz_hash = ht; 180 memcpy(&nfz, fz, sizeof(nfz));
181
182 write_seqlock_bh(&fz->fz_lock);
183 old_ht = rcu_dereference_protected(fz->fz_hash, 1);
184 RCU_INIT_POINTER(nfz.fz_hash, ht);
185 nfz.fz_hashmask = new_hashmask;
186 nfz.fz_divisor = new_divisor;
187 fn_rebuild_zone(&nfz, old_ht, old_divisor);
188 fib_hash_genid++;
189 rcu_assign_pointer(fz->fz_hash, ht);
180 fz->fz_hashmask = new_hashmask; 190 fz->fz_hashmask = new_hashmask;
181 fz->fz_divisor = new_divisor; 191 fz->fz_divisor = new_divisor;
182 fn_rebuild_zone(fz, old_ht, old_divisor); 192 write_sequnlock_bh(&fz->fz_lock);
183 fib_hash_genid++;
184 write_unlock_bh(&fib_hash_lock);
185 193
186 fz_hash_free(old_ht, old_divisor); 194 if (old_ht != fz->fz_embedded_hash) {
195 synchronize_rcu();
196 fz_hash_free(old_ht, old_divisor);
197 }
187 } 198 }
188} 199}
189 200
190static inline void fn_free_node(struct fib_node * f) 201static void fn_free_node_rcu(struct rcu_head *head)
191{ 202{
203 struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
204
192 kmem_cache_free(fn_hash_kmem, f); 205 kmem_cache_free(fn_hash_kmem, f);
193} 206}
194 207
208static inline void fn_free_node(struct fib_node *f)
209{
210 call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
211}
212
213static void fn_free_alias_rcu(struct rcu_head *head)
214{
215 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
216
217 kmem_cache_free(fn_alias_kmem, fa);
218}
219
195static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) 220static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
196{ 221{
197 fib_release_info(fa->fa_info); 222 fib_release_info(fa->fa_info);
198 if (fa == &f->fn_embedded_alias) 223 if (fa == &f->fn_embedded_alias)
199 fa->fa_info = NULL; 224 fa->fa_info = NULL;
200 else 225 else
201 kmem_cache_free(fn_alias_kmem, fa); 226 call_rcu(&fa->rcu, fn_free_alias_rcu);
202} 227}
203 228
204static struct fn_zone * 229static struct fn_zone *
@@ -209,73 +234,76 @@ fn_new_zone(struct fn_hash *table, int z)
209 if (!fz) 234 if (!fz)
210 return NULL; 235 return NULL;
211 236
212 if (z) { 237 seqlock_init(&fz->fz_lock);
213 fz->fz_divisor = 16; 238 fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
214 } else { 239 fz->fz_hashmask = fz->fz_divisor - 1;
215 fz->fz_divisor = 1; 240 RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
216 }
217 fz->fz_hashmask = (fz->fz_divisor - 1);
218 fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
219 if (!fz->fz_hash) {
220 kfree(fz);
221 return NULL;
222 }
223 fz->fz_order = z; 241 fz->fz_order = z;
242 fz->fz_revorder = 32 - z;
224 fz->fz_mask = inet_make_mask(z); 243 fz->fz_mask = inet_make_mask(z);
225 244
226 /* Find the first not empty zone with more specific mask */ 245 /* Find the first not empty zone with more specific mask */
227 for (i=z+1; i<=32; i++) 246 for (i = z + 1; i <= 32; i++)
228 if (table->fn_zones[i]) 247 if (table->fn_zones[i])
229 break; 248 break;
230 write_lock_bh(&fib_hash_lock); 249 if (i > 32) {
231 if (i>32) {
232 /* No more specific masks, we are the first. */ 250 /* No more specific masks, we are the first. */
233 fz->fz_next = table->fn_zone_list; 251 rcu_assign_pointer(fz->fz_next,
234 table->fn_zone_list = fz; 252 rtnl_dereference(table->fn_zone_list));
253 rcu_assign_pointer(table->fn_zone_list, fz);
235 } else { 254 } else {
236 fz->fz_next = table->fn_zones[i]->fz_next; 255 rcu_assign_pointer(fz->fz_next,
237 table->fn_zones[i]->fz_next = fz; 256 rtnl_dereference(table->fn_zones[i]->fz_next));
257 rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
238 } 258 }
239 table->fn_zones[z] = fz; 259 table->fn_zones[z] = fz;
240 fib_hash_genid++; 260 fib_hash_genid++;
241 write_unlock_bh(&fib_hash_lock);
242 return fz; 261 return fz;
243} 262}
244 263
245static int 264int fib_table_lookup(struct fib_table *tb,
246fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 265 const struct flowi *flp, struct fib_result *res,
266 int fib_flags)
247{ 267{
248 int err; 268 int err;
249 struct fn_zone *fz; 269 struct fn_zone *fz;
250 struct fn_hash *t = (struct fn_hash *)tb->tb_data; 270 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
251 271
252 read_lock(&fib_hash_lock); 272 rcu_read_lock();
253 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { 273 for (fz = rcu_dereference(t->fn_zone_list);
274 fz != NULL;
275 fz = rcu_dereference(fz->fz_next)) {
254 struct hlist_head *head; 276 struct hlist_head *head;
255 struct hlist_node *node; 277 struct hlist_node *node;
256 struct fib_node *f; 278 struct fib_node *f;
257 __be32 k = fz_key(flp->fl4_dst, fz); 279 __be32 k;
280 unsigned int seq;
258 281
259 head = &fz->fz_hash[fn_hash(k, fz)]; 282 do {
260 hlist_for_each_entry(f, node, head, fn_hash) { 283 seq = read_seqbegin(&fz->fz_lock);
261 if (f->fn_key != k) 284 k = fz_key(flp->fl4_dst, fz);
262 continue;
263 285
264 err = fib_semantic_match(&f->fn_alias, 286 head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
287 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
288 if (f->fn_key != k)
289 continue;
290
291 err = fib_semantic_match(&f->fn_alias,
265 flp, res, 292 flp, res,
266 fz->fz_order); 293 fz->fz_order, fib_flags);
267 if (err <= 0) 294 if (err <= 0)
268 goto out; 295 goto out;
269 } 296 }
297 } while (read_seqretry(&fz->fz_lock, seq));
270 } 298 }
271 err = 1; 299 err = 1;
272out: 300out:
273 read_unlock(&fib_hash_lock); 301 rcu_read_unlock();
274 return err; 302 return err;
275} 303}
276 304
277static void 305void fib_table_select_default(struct fib_table *tb,
278fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 306 const struct flowi *flp, struct fib_result *res)
279{ 307{
280 int order, last_idx; 308 int order, last_idx;
281 struct hlist_node *node; 309 struct hlist_node *node;
@@ -284,6 +312,7 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
284 struct fib_info *last_resort; 312 struct fib_info *last_resort;
285 struct fn_hash *t = (struct fn_hash *)tb->tb_data; 313 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
286 struct fn_zone *fz = t->fn_zones[0]; 314 struct fn_zone *fz = t->fn_zones[0];
315 struct hlist_head *head;
287 316
288 if (fz == NULL) 317 if (fz == NULL)
289 return; 318 return;
@@ -292,11 +321,12 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
292 last_resort = NULL; 321 last_resort = NULL;
293 order = -1; 322 order = -1;
294 323
295 read_lock(&fib_hash_lock); 324 rcu_read_lock();
296 hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { 325 head = rcu_dereference(fz->fz_hash);
326 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
297 struct fib_alias *fa; 327 struct fib_alias *fa;
298 328
299 list_for_each_entry(fa, &f->fn_alias, fa_list) { 329 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
300 struct fib_info *next_fi = fa->fa_info; 330 struct fib_info *next_fi = fa->fa_info;
301 331
302 if (fa->fa_scope != res->scope || 332 if (fa->fa_scope != res->scope ||
@@ -308,7 +338,8 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
308 if (!next_fi->fib_nh[0].nh_gw || 338 if (!next_fi->fib_nh[0].nh_gw ||
309 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 339 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
310 continue; 340 continue;
311 fa->fa_state |= FA_S_ACCESSED; 341
342 fib_alias_accessed(fa);
312 343
313 if (fi == NULL) { 344 if (fi == NULL) {
314 if (next_fi != res->fi) 345 if (next_fi != res->fi)
@@ -340,25 +371,25 @@ fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib
340 fib_result_assign(res, last_resort); 371 fib_result_assign(res, last_resort);
341 tb->tb_default = last_idx; 372 tb->tb_default = last_idx;
342out: 373out:
343 read_unlock(&fib_hash_lock); 374 rcu_read_unlock();
344} 375}
345 376
346/* Insert node F to FZ. */ 377/* Insert node F to FZ. */
347static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) 378static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
348{ 379{
349 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; 380 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
350 381
351 hlist_add_head(&f->fn_hash, head); 382 hlist_add_head_rcu(&f->fn_hash, head);
352} 383}
353 384
354/* Return the node in FZ matching KEY. */ 385/* Return the node in FZ matching KEY. */
355static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) 386static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
356{ 387{
357 struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; 388 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
358 struct hlist_node *node; 389 struct hlist_node *node;
359 struct fib_node *f; 390 struct fib_node *f;
360 391
361 hlist_for_each_entry(f, node, head, fn_hash) { 392 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
362 if (f->fn_key == key) 393 if (f->fn_key == key)
363 return f; 394 return f;
364 } 395 }
@@ -366,7 +397,18 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
366 return NULL; 397 return NULL;
367} 398}
368 399
369static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) 400
401static struct fib_alias *fib_fast_alloc(struct fib_node *f)
402{
403 struct fib_alias *fa = &f->fn_embedded_alias;
404
405 if (fa->fa_info != NULL)
406 fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
407 return fa;
408}
409
410/* Caller must hold RTNL. */
411int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
370{ 412{
371 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 413 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
372 struct fib_node *new_f = NULL; 414 struct fib_node *new_f = NULL;
@@ -450,7 +492,6 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
450 } 492 }
451 493
452 if (cfg->fc_nlflags & NLM_F_REPLACE) { 494 if (cfg->fc_nlflags & NLM_F_REPLACE) {
453 struct fib_info *fi_drop;
454 u8 state; 495 u8 state;
455 496
456 fa = fa_first; 497 fa = fa_first;
@@ -459,21 +500,25 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
459 err = 0; 500 err = 0;
460 goto out; 501 goto out;
461 } 502 }
462 write_lock_bh(&fib_hash_lock); 503 err = -ENOBUFS;
463 fi_drop = fa->fa_info; 504 new_fa = fib_fast_alloc(f);
464 fa->fa_info = fi; 505 if (new_fa == NULL)
465 fa->fa_type = cfg->fc_type; 506 goto out;
466 fa->fa_scope = cfg->fc_scope; 507
508 new_fa->fa_tos = fa->fa_tos;
509 new_fa->fa_info = fi;
510 new_fa->fa_type = cfg->fc_type;
511 new_fa->fa_scope = cfg->fc_scope;
467 state = fa->fa_state; 512 state = fa->fa_state;
468 fa->fa_state &= ~FA_S_ACCESSED; 513 new_fa->fa_state = state & ~FA_S_ACCESSED;
469 fib_hash_genid++; 514 fib_hash_genid++;
470 write_unlock_bh(&fib_hash_lock); 515 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
471 516
472 fib_release_info(fi_drop); 517 fn_free_alias(fa, f);
473 if (state & FA_S_ACCESSED) 518 if (state & FA_S_ACCESSED)
474 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); 519 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
475 rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, 520 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
476 &cfg->fc_nlinfo, NLM_F_REPLACE); 521 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
477 return 0; 522 return 0;
478 } 523 }
479 524
@@ -505,12 +550,10 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
505 f = new_f; 550 f = new_f;
506 } 551 }
507 552
508 new_fa = &f->fn_embedded_alias; 553 new_fa = fib_fast_alloc(f);
509 if (new_fa->fa_info != NULL) { 554 if (new_fa == NULL)
510 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); 555 goto out;
511 if (new_fa == NULL) 556
512 goto out;
513 }
514 new_fa->fa_info = fi; 557 new_fa->fa_info = fi;
515 new_fa->fa_tos = tos; 558 new_fa->fa_tos = tos;
516 new_fa->fa_type = cfg->fc_type; 559 new_fa->fa_type = cfg->fc_type;
@@ -521,13 +564,11 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
521 * Insert new entry to the list. 564 * Insert new entry to the list.
522 */ 565 */
523 566
524 write_lock_bh(&fib_hash_lock);
525 if (new_f) 567 if (new_f)
526 fib_insert_node(fz, new_f); 568 fib_insert_node(fz, new_f);
527 list_add_tail(&new_fa->fa_list, 569 list_add_tail_rcu(&new_fa->fa_list,
528 (fa ? &fa->fa_list : &f->fn_alias)); 570 (fa ? &fa->fa_list : &f->fn_alias));
529 fib_hash_genid++; 571 fib_hash_genid++;
530 write_unlock_bh(&fib_hash_lock);
531 572
532 if (new_f) 573 if (new_f)
533 fz->fz_nent++; 574 fz->fz_nent++;
@@ -544,8 +585,7 @@ out:
544 return err; 585 return err;
545} 586}
546 587
547 588int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
548static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
549{ 589{
550 struct fn_hash *table = (struct fn_hash *)tb->tb_data; 590 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
551 struct fib_node *f; 591 struct fib_node *f;
@@ -603,14 +643,12 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
603 tb->tb_id, &cfg->fc_nlinfo, 0); 643 tb->tb_id, &cfg->fc_nlinfo, 0);
604 644
605 kill_fn = 0; 645 kill_fn = 0;
606 write_lock_bh(&fib_hash_lock); 646 list_del_rcu(&fa->fa_list);
607 list_del(&fa->fa_list);
608 if (list_empty(&f->fn_alias)) { 647 if (list_empty(&f->fn_alias)) {
609 hlist_del(&f->fn_hash); 648 hlist_del_rcu(&f->fn_hash);
610 kill_fn = 1; 649 kill_fn = 1;
611 } 650 }
612 fib_hash_genid++; 651 fib_hash_genid++;
613 write_unlock_bh(&fib_hash_lock);
614 652
615 if (fa->fa_state & FA_S_ACCESSED) 653 if (fa->fa_state & FA_S_ACCESSED)
616 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); 654 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -627,7 +665,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
627 665
628static int fn_flush_list(struct fn_zone *fz, int idx) 666static int fn_flush_list(struct fn_zone *fz, int idx)
629{ 667{
630 struct hlist_head *head = &fz->fz_hash[idx]; 668 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
631 struct hlist_node *node, *n; 669 struct hlist_node *node, *n;
632 struct fib_node *f; 670 struct fib_node *f;
633 int found = 0; 671 int found = 0;
@@ -641,14 +679,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
641 struct fib_info *fi = fa->fa_info; 679 struct fib_info *fi = fa->fa_info;
642 680
643 if (fi && (fi->fib_flags&RTNH_F_DEAD)) { 681 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
644 write_lock_bh(&fib_hash_lock); 682 list_del_rcu(&fa->fa_list);
645 list_del(&fa->fa_list);
646 if (list_empty(&f->fn_alias)) { 683 if (list_empty(&f->fn_alias)) {
647 hlist_del(&f->fn_hash); 684 hlist_del_rcu(&f->fn_hash);
648 kill_f = 1; 685 kill_f = 1;
649 } 686 }
650 fib_hash_genid++; 687 fib_hash_genid++;
651 write_unlock_bh(&fib_hash_lock);
652 688
653 fn_free_alias(fa, f); 689 fn_free_alias(fa, f);
654 found++; 690 found++;
@@ -662,13 +698,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
662 return found; 698 return found;
663} 699}
664 700
665static int fn_hash_flush(struct fib_table *tb) 701/* caller must hold RTNL. */
702int fib_table_flush(struct fib_table *tb)
666{ 703{
667 struct fn_hash *table = (struct fn_hash *) tb->tb_data; 704 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
668 struct fn_zone *fz; 705 struct fn_zone *fz;
669 int found = 0; 706 int found = 0;
670 707
671 for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { 708 for (fz = rtnl_dereference(table->fn_zone_list);
709 fz != NULL;
710 fz = rtnl_dereference(fz->fz_next)) {
672 int i; 711 int i;
673 712
674 for (i = fz->fz_divisor - 1; i >= 0; i--) 713 for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -677,6 +716,24 @@ static int fn_hash_flush(struct fib_table *tb)
677 return found; 716 return found;
678} 717}
679 718
719void fib_free_table(struct fib_table *tb)
720{
721 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
722 struct fn_zone *fz, *next;
723
724 next = table->fn_zone_list;
725 while (next != NULL) {
726 fz = next;
727 next = fz->fz_next;
728
729 if (fz->fz_hash != fz->fz_embedded_hash)
730 fz_hash_free(fz->fz_hash, fz->fz_divisor);
731
732 kfree(fz);
733 }
734
735 kfree(tb);
736}
680 737
681static inline int 738static inline int
682fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, 739fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
@@ -690,10 +747,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
690 747
691 s_i = cb->args[4]; 748 s_i = cb->args[4];
692 i = 0; 749 i = 0;
693 hlist_for_each_entry(f, node, head, fn_hash) { 750 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
694 struct fib_alias *fa; 751 struct fib_alias *fa;
695 752
696 list_for_each_entry(fa, &f->fn_alias, fa_list) { 753 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
697 if (i < s_i) 754 if (i < s_i)
698 goto next; 755 goto next;
699 756
@@ -711,7 +768,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
711 cb->args[4] = i; 768 cb->args[4] = i;
712 return -1; 769 return -1;
713 } 770 }
714 next: 771next:
715 i++; 772 i++;
716 } 773 }
717 } 774 }
@@ -725,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
725 struct fn_zone *fz) 782 struct fn_zone *fz)
726{ 783{
727 int h, s_h; 784 int h, s_h;
785 struct hlist_head *head = rcu_dereference(fz->fz_hash);
728 786
729 if (fz->fz_hash == NULL) 787 if (head == NULL)
730 return skb->len; 788 return skb->len;
731 s_h = cb->args[3]; 789 s_h = cb->args[3];
732 for (h = s_h; h < fz->fz_divisor; h++) { 790 for (h = s_h; h < fz->fz_divisor; h++) {
733 if (hlist_empty(&fz->fz_hash[h])) 791 if (hlist_empty(head + h))
734 continue; 792 continue;
735 if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) { 793 if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
736 cb->args[3] = h; 794 cb->args[3] = h;
737 return -1; 795 return -1;
738 } 796 }
@@ -743,25 +801,29 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
743 return skb->len; 801 return skb->len;
744} 802}
745 803
746static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) 804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
805 struct netlink_callback *cb)
747{ 806{
748 int m, s_m; 807 int m = 0, s_m;
749 struct fn_zone *fz; 808 struct fn_zone *fz;
750 struct fn_hash *table = (struct fn_hash *)tb->tb_data; 809 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
751 810
752 s_m = cb->args[2]; 811 s_m = cb->args[2];
753 read_lock(&fib_hash_lock); 812 rcu_read_lock();
754 for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { 813 for (fz = rcu_dereference(table->fn_zone_list);
755 if (m < s_m) continue; 814 fz != NULL;
815 fz = rcu_dereference(fz->fz_next), m++) {
816 if (m < s_m)
817 continue;
756 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { 818 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
757 cb->args[2] = m; 819 cb->args[2] = m;
758 read_unlock(&fib_hash_lock); 820 rcu_read_unlock();
759 return -1; 821 return -1;
760 } 822 }
761 memset(&cb->args[3], 0, 823 memset(&cb->args[3], 0,
762 sizeof(cb->args) - 3*sizeof(cb->args[0])); 824 sizeof(cb->args) - 3*sizeof(cb->args[0]));
763 } 825 }
764 read_unlock(&fib_hash_lock); 826 rcu_read_unlock();
765 cb->args[2] = m; 827 cb->args[2] = m;
766 return skb->len; 828 return skb->len;
767} 829}
@@ -787,12 +849,7 @@ struct fib_table *fib_hash_table(u32 id)
787 849
788 tb->tb_id = id; 850 tb->tb_id = id;
789 tb->tb_default = -1; 851 tb->tb_default = -1;
790 tb->tb_lookup = fn_hash_lookup; 852
791 tb->tb_insert = fn_hash_insert;
792 tb->tb_delete = fn_hash_delete;
793 tb->tb_flush = fn_hash_flush;
794 tb->tb_select_default = fn_hash_select_default;
795 tb->tb_dump = fn_hash_dump;
796 memset(tb->tb_data, 0, sizeof(struct fn_hash)); 853 memset(tb->tb_data, 0, sizeof(struct fn_hash));
797 return tb; 854 return tb;
798} 855}
@@ -829,14 +886,15 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
829 iter->genid = fib_hash_genid; 886 iter->genid = fib_hash_genid;
830 iter->valid = 1; 887 iter->valid = 1;
831 888
832 for (iter->zone = table->fn_zone_list; iter->zone; 889 for (iter->zone = rcu_dereference(table->fn_zone_list);
833 iter->zone = iter->zone->fz_next) { 890 iter->zone != NULL;
891 iter->zone = rcu_dereference(iter->zone->fz_next)) {
834 int maxslot; 892 int maxslot;
835 893
836 if (!iter->zone->fz_nent) 894 if (!iter->zone->fz_nent)
837 continue; 895 continue;
838 896
839 iter->hash_head = iter->zone->fz_hash; 897 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
840 maxslot = iter->zone->fz_divisor; 898 maxslot = iter->zone->fz_divisor;
841 899
842 for (iter->bucket = 0; iter->bucket < maxslot; 900 for (iter->bucket = 0; iter->bucket < maxslot;
@@ -915,13 +973,13 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
915 } 973 }
916 } 974 }
917 975
918 iter->zone = iter->zone->fz_next; 976 iter->zone = rcu_dereference(iter->zone->fz_next);
919 977
920 if (!iter->zone) 978 if (!iter->zone)
921 goto out; 979 goto out;
922 980
923 iter->bucket = 0; 981 iter->bucket = 0;
924 iter->hash_head = iter->zone->fz_hash; 982 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
925 983
926 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { 984 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
927 list_for_each_entry(fa, &fn->fn_alias, fa_list) { 985 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
@@ -954,11 +1012,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
954} 1012}
955 1013
956static void *fib_seq_start(struct seq_file *seq, loff_t *pos) 1014static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
957 __acquires(fib_hash_lock) 1015 __acquires(RCU)
958{ 1016{
959 void *v = NULL; 1017 void *v = NULL;
960 1018
961 read_lock(&fib_hash_lock); 1019 rcu_read_lock();
962 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) 1020 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
963 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 1021 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
964 return v; 1022 return v;
@@ -971,15 +1029,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
971} 1029}
972 1030
973static void fib_seq_stop(struct seq_file *seq, void *v) 1031static void fib_seq_stop(struct seq_file *seq, void *v)
974 __releases(fib_hash_lock) 1032 __releases(RCU)
975{ 1033{
976 read_unlock(&fib_hash_lock); 1034 rcu_read_unlock();
977} 1035}
978 1036
979static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) 1037static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
980{ 1038{
981 static const unsigned type2flags[RTN_MAX + 1] = { 1039 static const unsigned type2flags[RTN_MAX + 1] = {
982 [7] = RTF_REJECT, [8] = RTF_REJECT, 1040 [7] = RTF_REJECT,
1041 [8] = RTF_REJECT,
983 }; 1042 };
984 unsigned flags = type2flags[type]; 1043 unsigned flags = type2flags[type];
985 1044
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..c079cc0ec651 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,17 +12,22 @@ struct fib_alias {
12 u8 fa_type; 12 u8 fa_type;
13 u8 fa_scope; 13 u8 fa_scope;
14 u8 fa_state; 14 u8 fa_state;
15#ifdef CONFIG_IP_FIB_TRIE
16 struct rcu_head rcu; 15 struct rcu_head rcu;
17#endif
18}; 16};
19 17
20#define FA_S_ACCESSED 0x01 18#define FA_S_ACCESSED 0x01
21 19
20/* Dont write on fa_state unless needed, to keep it shared on all cpus */
21static inline void fib_alias_accessed(struct fib_alias *fa)
22{
23 if (!(fa->fa_state & FA_S_ACCESSED))
24 fa->fa_state |= FA_S_ACCESSED;
25}
26
22/* Exported by fib_semantics.c */ 27/* Exported by fib_semantics.c */
23extern int fib_semantic_match(struct list_head *head, 28extern int fib_semantic_match(struct list_head *head,
24 const struct flowi *flp, 29 const struct flowi *flp,
25 struct fib_result *res, int prefixlen); 30 struct fib_result *res, int prefixlen, int fib_flags);
26extern void fib_release_info(struct fib_info *); 31extern void fib_release_info(struct fib_info *);
27extern struct fib_info *fib_create_info(struct fib_config *cfg); 32extern struct fib_info *fib_create_info(struct fib_config *cfg);
28extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 33extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
@@ -42,11 +47,8 @@ extern int fib_detect_death(struct fib_info *fi, int order,
42static inline void fib_result_assign(struct fib_result *res, 47static inline void fib_result_assign(struct fib_result *res,
43 struct fib_info *fi) 48 struct fib_info *fi)
44{ 49{
45 if (res->fi != NULL) 50 /* we used to play games with refcounts, but we now use RCU */
46 fib_info_put(res->fi);
47 res->fi = fi; 51 res->fi = fi;
48 if (fi != NULL)
49 atomic_inc(&fi->fib_clntref);
50} 52}
51 53
52#endif /* _FIB_LOOKUP_H */ 54#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 92d9d97ec5e3..7981a24f5c7b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
6 * IPv4 Forwarding Information Base: policy rules. 6 * IPv4 Forwarding Information Base: policy rules.
7 * 7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Thomas Graf <tgraf@suug.ch> 9 * Thomas Graf <tgraf@suug.ch>
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 * 15 *
16 * Fixes: 16 * Fixes:
17 * Rani Assaf : local_rule cannot be deleted 17 * Rani Assaf : local_rule cannot be deleted
18 * Marc Boucher : routing by fwmark 18 * Marc Boucher : routing by fwmark
19 */ 19 */
20 20
@@ -32,8 +32,7 @@
32#include <net/ip_fib.h> 32#include <net/ip_fib.h>
33#include <net/fib_rules.h> 33#include <net/fib_rules.h>
34 34
35struct fib4_rule 35struct fib4_rule {
36{
37 struct fib_rule common; 36 struct fib_rule common;
38 u8 dst_len; 37 u8 dst_len;
39 u8 src_len; 38 u8 src_len;
@@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
58{ 57{
59 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
60 .result = res, 59 .result = res,
60 .flags = FIB_LOOKUP_NOREF,
61 }; 61 };
62 int err; 62 int err;
63 63
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
91 goto errout; 91 goto errout;
92 } 92 }
93 93
94 if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) 94 tbl = fib_get_table(rule->fr_net, rule->table);
95 if (!tbl)
95 goto errout; 96 goto errout;
96 97
97 err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); 98 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
98 if (err > 0) 99 if (err > 0)
99 err = -EAGAIN; 100 err = -EAGAIN;
100errout: 101errout:
@@ -213,7 +214,6 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
213{ 214{
214 struct fib4_rule *rule4 = (struct fib4_rule *) rule; 215 struct fib4_rule *rule4 = (struct fib4_rule *) rule;
215 216
216 frh->family = AF_INET;
217 frh->dst_len = rule4->dst_len; 217 frh->dst_len = rule4->dst_len;
218 frh->src_len = rule4->src_len; 218 frh->src_len = rule4->src_len;
219 frh->tos = rule4->tos; 219 frh->tos = rule4->tos;
@@ -234,23 +234,6 @@ nla_put_failure:
234 return -ENOBUFS; 234 return -ENOBUFS;
235} 235}
236 236
237static u32 fib4_rule_default_pref(struct fib_rules_ops *ops)
238{
239 struct list_head *pos;
240 struct fib_rule *rule;
241
242 if (!list_empty(&ops->rules_list)) {
243 pos = ops->rules_list.next;
244 if (pos->next != &ops->rules_list) {
245 rule = list_entry(pos->next, struct fib_rule, list);
246 if (rule->pref)
247 return rule->pref - 1;
248 }
249 }
250
251 return 0;
252}
253
254static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) 237static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
255{ 238{
256 return nla_total_size(4) /* dst */ 239 return nla_total_size(4) /* dst */
@@ -263,7 +246,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
263 rt_cache_flush(ops->fro_net, -1); 246 rt_cache_flush(ops->fro_net, -1);
264} 247}
265 248
266static struct fib_rules_ops fib4_rules_ops_template = { 249static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
267 .family = AF_INET, 250 .family = AF_INET,
268 .rule_size = sizeof(struct fib4_rule), 251 .rule_size = sizeof(struct fib4_rule),
269 .addr_size = sizeof(u32), 252 .addr_size = sizeof(u32),
@@ -272,7 +255,7 @@ static struct fib_rules_ops fib4_rules_ops_template = {
272 .configure = fib4_rule_configure, 255 .configure = fib4_rule_configure,
273 .compare = fib4_rule_compare, 256 .compare = fib4_rule_compare,
274 .fill = fib4_rule_fill, 257 .fill = fib4_rule_fill,
275 .default_pref = fib4_rule_default_pref, 258 .default_pref = fib_default_rule_pref,
276 .nlmsg_payload = fib4_rule_nlmsg_payload, 259 .nlmsg_payload = fib4_rule_nlmsg_payload,
277 .flush_cache = fib4_rule_flush_cache, 260 .flush_cache = fib4_rule_flush_cache,
278 .nlgroup = RTNLGRP_IPV4_RULE, 261 .nlgroup = RTNLGRP_IPV4_RULE,
@@ -284,7 +267,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops)
284{ 267{
285 int err; 268 int err;
286 269
287 err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); 270 err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
288 if (err < 0) 271 if (err < 0)
289 return err; 272 return err;
290 err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); 273 err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
@@ -301,13 +284,9 @@ int __net_init fib4_rules_init(struct net *net)
301 int err; 284 int err;
302 struct fib_rules_ops *ops; 285 struct fib_rules_ops *ops;
303 286
304 ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL); 287 ops = fib_rules_register(&fib4_rules_ops_template, net);
305 if (ops == NULL) 288 if (IS_ERR(ops))
306 return -ENOMEM; 289 return PTR_ERR(ops);
307 INIT_LIST_HEAD(&ops->rules_list);
308 ops->fro_net = net;
309
310 fib_rules_register(ops);
311 290
312 err = fib_default_rules_init(ops); 291 err = fib_default_rules_init(ops);
313 if (err < 0) 292 if (err < 0)
@@ -318,12 +297,10 @@ int __net_init fib4_rules_init(struct net *net)
318fail: 297fail:
319 /* also cleans all rules already added */ 298 /* also cleans all rules already added */
320 fib_rules_unregister(ops); 299 fib_rules_unregister(ops);
321 kfree(ops);
322 return err; 300 return err;
323} 301}
324 302
325void __net_exit fib4_rules_exit(struct net *net) 303void __net_exit fib4_rules_exit(struct net *net)
326{ 304{
327 fib_rules_unregister(net->ipv4.rules_ops); 305 fib_rules_unregister(net->ipv4.rules_ops);
328 kfree(net->ipv4.rules_ops);
329} 306}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9b096d6ff3f2..12d3dc3df1b7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -32,6 +32,7 @@
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/slab.h>
35 36
36#include <net/arp.h> 37#include <net/arp.h>
37#include <net/ip.h> 38#include <net/ip.h>
@@ -59,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
59 60
60static DEFINE_SPINLOCK(fib_multipath_lock); 61static DEFINE_SPINLOCK(fib_multipath_lock);
61 62
62#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 63#define for_nexthops(fi) { \
63for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64 int nhsel; const struct fib_nh *nh; \
65 for (nhsel = 0, nh = (fi)->fib_nh; \
66 nhsel < (fi)->fib_nhs; \
67 nh++, nhsel++)
64 68
65#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ 69#define change_nexthops(fi) { \
66for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) 70 int nhsel; struct fib_nh *nexthop_nh; \
71 for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
72 nhsel < (fi)->fib_nhs; \
73 nexthop_nh++, nhsel++)
67 74
68#else /* CONFIG_IP_ROUTE_MULTIPATH */ 75#else /* CONFIG_IP_ROUTE_MULTIPATH */
69 76
70/* Hope, that gcc will optimize it to get rid of dummy loop */ 77/* Hope, that gcc will optimize it to get rid of dummy loop */
71 78
72#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ 79#define for_nexthops(fi) { \
73for (nhsel=0; nhsel < 1; nhsel++) 80 int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
81 for (nhsel = 0; nhsel < 1; nhsel++)
74 82
75#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \ 83#define change_nexthops(fi) { \
76for (nhsel=0; nhsel < 1; nhsel++) 84 int nhsel; \
85 struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
86 for (nhsel = 0; nhsel < 1; nhsel++)
77 87
78#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 88#endif /* CONFIG_IP_ROUTE_MULTIPATH */
79 89
@@ -85,73 +95,80 @@ static const struct
85 int error; 95 int error;
86 u8 scope; 96 u8 scope;
87} fib_props[RTN_MAX + 1] = { 97} fib_props[RTN_MAX + 1] = {
88 { 98 [RTN_UNSPEC] = {
89 .error = 0, 99 .error = 0,
90 .scope = RT_SCOPE_NOWHERE, 100 .scope = RT_SCOPE_NOWHERE,
91 }, /* RTN_UNSPEC */ 101 },
92 { 102 [RTN_UNICAST] = {
93 .error = 0, 103 .error = 0,
94 .scope = RT_SCOPE_UNIVERSE, 104 .scope = RT_SCOPE_UNIVERSE,
95 }, /* RTN_UNICAST */ 105 },
96 { 106 [RTN_LOCAL] = {
97 .error = 0, 107 .error = 0,
98 .scope = RT_SCOPE_HOST, 108 .scope = RT_SCOPE_HOST,
99 }, /* RTN_LOCAL */ 109 },
100 { 110 [RTN_BROADCAST] = {
101 .error = 0, 111 .error = 0,
102 .scope = RT_SCOPE_LINK, 112 .scope = RT_SCOPE_LINK,
103 }, /* RTN_BROADCAST */ 113 },
104 { 114 [RTN_ANYCAST] = {
105 .error = 0, 115 .error = 0,
106 .scope = RT_SCOPE_LINK, 116 .scope = RT_SCOPE_LINK,
107 }, /* RTN_ANYCAST */ 117 },
108 { 118 [RTN_MULTICAST] = {
109 .error = 0, 119 .error = 0,
110 .scope = RT_SCOPE_UNIVERSE, 120 .scope = RT_SCOPE_UNIVERSE,
111 }, /* RTN_MULTICAST */ 121 },
112 { 122 [RTN_BLACKHOLE] = {
113 .error = -EINVAL, 123 .error = -EINVAL,
114 .scope = RT_SCOPE_UNIVERSE, 124 .scope = RT_SCOPE_UNIVERSE,
115 }, /* RTN_BLACKHOLE */ 125 },
116 { 126 [RTN_UNREACHABLE] = {
117 .error = -EHOSTUNREACH, 127 .error = -EHOSTUNREACH,
118 .scope = RT_SCOPE_UNIVERSE, 128 .scope = RT_SCOPE_UNIVERSE,
119 }, /* RTN_UNREACHABLE */ 129 },
120 { 130 [RTN_PROHIBIT] = {
121 .error = -EACCES, 131 .error = -EACCES,
122 .scope = RT_SCOPE_UNIVERSE, 132 .scope = RT_SCOPE_UNIVERSE,
123 }, /* RTN_PROHIBIT */ 133 },
124 { 134 [RTN_THROW] = {
125 .error = -EAGAIN, 135 .error = -EAGAIN,
126 .scope = RT_SCOPE_UNIVERSE, 136 .scope = RT_SCOPE_UNIVERSE,
127 }, /* RTN_THROW */ 137 },
128 { 138 [RTN_NAT] = {
129 .error = -EINVAL, 139 .error = -EINVAL,
130 .scope = RT_SCOPE_NOWHERE, 140 .scope = RT_SCOPE_NOWHERE,
131 }, /* RTN_NAT */ 141 },
132 { 142 [RTN_XRESOLVE] = {
133 .error = -EINVAL, 143 .error = -EINVAL,
134 .scope = RT_SCOPE_NOWHERE, 144 .scope = RT_SCOPE_NOWHERE,
135 }, /* RTN_XRESOLVE */ 145 },
136}; 146};
137 147
138 148
139/* Release a nexthop info record */ 149/* Release a nexthop info record */
140 150
151static void free_fib_info_rcu(struct rcu_head *head)
152{
153 struct fib_info *fi = container_of(head, struct fib_info, rcu);
154
155 kfree(fi);
156}
157
141void free_fib_info(struct fib_info *fi) 158void free_fib_info(struct fib_info *fi)
142{ 159{
143 if (fi->fib_dead == 0) { 160 if (fi->fib_dead == 0) {
144 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); 161 pr_warning("Freeing alive fib_info %p\n", fi);
145 return; 162 return;
146 } 163 }
147 change_nexthops(fi) { 164 change_nexthops(fi) {
148 if (nh->nh_dev) 165 if (nexthop_nh->nh_dev)
149 dev_put(nh->nh_dev); 166 dev_put(nexthop_nh->nh_dev);
150 nh->nh_dev = NULL; 167 nexthop_nh->nh_dev = NULL;
151 } endfor_nexthops(fi); 168 } endfor_nexthops(fi);
152 fib_info_cnt--; 169 fib_info_cnt--;
153 release_net(fi->fib_net); 170 release_net(fi->fib_net);
154 kfree(fi); 171 call_rcu(&fi->rcu, free_fib_info_rcu);
155} 172}
156 173
157void fib_release_info(struct fib_info *fi) 174void fib_release_info(struct fib_info *fi)
@@ -162,9 +179,9 @@ void fib_release_info(struct fib_info *fi)
162 if (fi->fib_prefsrc) 179 if (fi->fib_prefsrc)
163 hlist_del(&fi->fib_lhash); 180 hlist_del(&fi->fib_lhash);
164 change_nexthops(fi) { 181 change_nexthops(fi) {
165 if (!nh->nh_dev) 182 if (!nexthop_nh->nh_dev)
166 continue; 183 continue;
167 hlist_del(&nh->nh_hash); 184 hlist_del(&nexthop_nh->nh_hash);
168 } endfor_nexthops(fi) 185 } endfor_nexthops(fi)
169 fi->fib_dead = 1; 186 fi->fib_dead = 1;
170 fib_info_put(fi); 187 fib_info_put(fi);
@@ -172,7 +189,7 @@ void fib_release_info(struct fib_info *fi)
172 spin_unlock_bh(&fib_info_lock); 189 spin_unlock_bh(&fib_info_lock);
173} 190}
174 191
175static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) 192static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
176{ 193{
177 const struct fib_nh *onh = ofi->fib_nh; 194 const struct fib_nh *onh = ofi->fib_nh;
178 195
@@ -186,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
186#ifdef CONFIG_NET_CLS_ROUTE 203#ifdef CONFIG_NET_CLS_ROUTE
187 nh->nh_tclassid != onh->nh_tclassid || 204 nh->nh_tclassid != onh->nh_tclassid ||
188#endif 205#endif
189 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) 206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
190 return -1; 207 return -1;
191 onh++; 208 onh++;
192 } endfor_nexthops(fi); 209 } endfor_nexthops(fi);
@@ -228,7 +245,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
228 head = &fib_info_hash[hash]; 245 head = &fib_info_hash[hash];
229 246
230 hlist_for_each_entry(fi, node, head, fib_hash) { 247 hlist_for_each_entry(fi, node, head, fib_hash) {
231 if (fi->fib_net != nfi->fib_net) 248 if (!net_eq(fi->fib_net, nfi->fib_net))
232 continue; 249 continue;
233 if (fi->fib_nhs != nfi->fib_nhs) 250 if (fi->fib_nhs != nfi->fib_nhs)
234 continue; 251 continue;
@@ -237,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
237 nfi->fib_priority == fi->fib_priority && 254 nfi->fib_priority == fi->fib_priority &&
238 memcmp(nfi->fib_metrics, fi->fib_metrics, 255 memcmp(nfi->fib_metrics, fi->fib_metrics,
239 sizeof(fi->fib_metrics)) == 0 && 256 sizeof(fi->fib_metrics)) == 0 &&
240 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && 257 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
241 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 258 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
242 return fi; 259 return fi;
243 } 260 }
@@ -246,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
246} 263}
247 264
248/* Check, that the gateway is already configured. 265/* Check, that the gateway is already configured.
249 Used only by redirect accept routine. 266 * Used only by redirect accept routine.
250 */ 267 */
251
252int ip_fib_check_default(__be32 gw, struct net_device *dev) 268int ip_fib_check_default(__be32 gw, struct net_device *dev)
253{ 269{
254 struct hlist_head *head; 270 struct hlist_head *head;
@@ -263,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
263 hlist_for_each_entry(nh, node, head, nh_hash) { 279 hlist_for_each_entry(nh, node, head, nh_hash) {
264 if (nh->nh_dev == dev && 280 if (nh->nh_dev == dev &&
265 nh->nh_gw == gw && 281 nh->nh_gw == gw &&
266 !(nh->nh_flags&RTNH_F_DEAD)) { 282 !(nh->nh_flags & RTNH_F_DEAD)) {
267 spin_unlock(&fib_info_lock); 283 spin_unlock(&fib_info_lock);
268 return 0; 284 return 0;
269 } 285 }
@@ -361,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order,
361 } 377 }
362 if (state == NUD_REACHABLE) 378 if (state == NUD_REACHABLE)
363 return 0; 379 return 0;
364 if ((state&NUD_VALID) && order != dflt) 380 if ((state & NUD_VALID) && order != dflt)
365 return 0; 381 return 0;
366 if ((state&NUD_VALID) || 382 if ((state & NUD_VALID) ||
367 (*last_idx<0 && order > dflt)) { 383 (*last_idx < 0 && order > dflt)) {
368 *last_resort = fi; 384 *last_resort = fi;
369 *last_idx = order; 385 *last_idx = order;
370 } 386 }
@@ -395,19 +411,20 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
395 if (!rtnh_ok(rtnh, remaining)) 411 if (!rtnh_ok(rtnh, remaining))
396 return -EINVAL; 412 return -EINVAL;
397 413
398 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; 414 nexthop_nh->nh_flags =
399 nh->nh_oif = rtnh->rtnh_ifindex; 415 (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
400 nh->nh_weight = rtnh->rtnh_hops + 1; 416 nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
417 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
401 418
402 attrlen = rtnh_attrlen(rtnh); 419 attrlen = rtnh_attrlen(rtnh);
403 if (attrlen > 0) { 420 if (attrlen > 0) {
404 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 421 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
405 422
406 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 423 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
407 nh->nh_gw = nla ? nla_get_be32(nla) : 0; 424 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
408#ifdef CONFIG_NET_CLS_ROUTE 425#ifdef CONFIG_NET_CLS_ROUTE
409 nla = nla_find(attrs, attrlen, RTA_FLOW); 426 nla = nla_find(attrs, attrlen, RTA_FLOW);
410 nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 427 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
411#endif 428#endif
412 } 429 }
413 430
@@ -474,137 +491,133 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
474 491
475 492
476/* 493/*
477 Picture 494 * Picture
478 ------- 495 * -------
479 496 *
480 Semantics of nexthop is very messy by historical reasons. 497 * Semantics of nexthop is very messy by historical reasons.
481 We have to take into account, that: 498 * We have to take into account, that:
482 a) gateway can be actually local interface address, 499 * a) gateway can be actually local interface address,
483 so that gatewayed route is direct. 500 * so that gatewayed route is direct.
484 b) gateway must be on-link address, possibly 501 * b) gateway must be on-link address, possibly
485 described not by an ifaddr, but also by a direct route. 502 * described not by an ifaddr, but also by a direct route.
486 c) If both gateway and interface are specified, they should not 503 * c) If both gateway and interface are specified, they should not
487 contradict. 504 * contradict.
488 d) If we use tunnel routes, gateway could be not on-link. 505 * d) If we use tunnel routes, gateway could be not on-link.
489 506 *
490 Attempt to reconcile all of these (alas, self-contradictory) conditions 507 * Attempt to reconcile all of these (alas, self-contradictory) conditions
491 results in pretty ugly and hairy code with obscure logic. 508 * results in pretty ugly and hairy code with obscure logic.
492 509 *
493 I chose to generalized it instead, so that the size 510 * I chose to generalized it instead, so that the size
494 of code does not increase practically, but it becomes 511 * of code does not increase practically, but it becomes
495 much more general. 512 * much more general.
496 Every prefix is assigned a "scope" value: "host" is local address, 513 * Every prefix is assigned a "scope" value: "host" is local address,
497 "link" is direct route, 514 * "link" is direct route,
498 [ ... "site" ... "interior" ... ] 515 * [ ... "site" ... "interior" ... ]
499 and "universe" is true gateway route with global meaning. 516 * and "universe" is true gateway route with global meaning.
500 517 *
501 Every prefix refers to a set of "nexthop"s (gw, oif), 518 * Every prefix refers to a set of "nexthop"s (gw, oif),
502 where gw must have narrower scope. This recursion stops 519 * where gw must have narrower scope. This recursion stops
503 when gw has LOCAL scope or if "nexthop" is declared ONLINK, 520 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
504 which means that gw is forced to be on link. 521 * which means that gw is forced to be on link.
505 522 *
506 Code is still hairy, but now it is apparently logically 523 * Code is still hairy, but now it is apparently logically
507 consistent and very flexible. F.e. as by-product it allows 524 * consistent and very flexible. F.e. as by-product it allows
508 to co-exists in peace independent exterior and interior 525 * to co-exists in peace independent exterior and interior
509 routing processes. 526 * routing processes.
510 527 *
511 Normally it looks as following. 528 * Normally it looks as following.
512 529 *
513 {universe prefix} -> (gw, oif) [scope link] 530 * {universe prefix} -> (gw, oif) [scope link]
514 | 531 * |
515 |-> {link prefix} -> (gw, oif) [scope local] 532 * |-> {link prefix} -> (gw, oif) [scope local]
516 | 533 * |
517 |-> {local prefix} (terminal node) 534 * |-> {local prefix} (terminal node)
518 */ 535 */
519
520static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, 536static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
521 struct fib_nh *nh) 537 struct fib_nh *nh)
522{ 538{
523 int err; 539 int err;
524 struct net *net; 540 struct net *net;
541 struct net_device *dev;
525 542
526 net = cfg->fc_nlinfo.nl_net; 543 net = cfg->fc_nlinfo.nl_net;
527 if (nh->nh_gw) { 544 if (nh->nh_gw) {
528 struct fib_result res; 545 struct fib_result res;
529 546
530#ifdef CONFIG_IP_ROUTE_PERVASIVE 547 if (nh->nh_flags & RTNH_F_ONLINK) {
531 if (nh->nh_flags&RTNH_F_PERVASIVE)
532 return 0;
533#endif
534 if (nh->nh_flags&RTNH_F_ONLINK) {
535 struct net_device *dev;
536 548
537 if (cfg->fc_scope >= RT_SCOPE_LINK) 549 if (cfg->fc_scope >= RT_SCOPE_LINK)
538 return -EINVAL; 550 return -EINVAL;
539 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) 551 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
540 return -EINVAL; 552 return -EINVAL;
541 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) 553 dev = __dev_get_by_index(net, nh->nh_oif);
554 if (!dev)
542 return -ENODEV; 555 return -ENODEV;
543 if (!(dev->flags&IFF_UP)) 556 if (!(dev->flags & IFF_UP))
544 return -ENETDOWN; 557 return -ENETDOWN;
545 nh->nh_dev = dev; 558 nh->nh_dev = dev;
546 dev_hold(dev); 559 dev_hold(dev);
547 nh->nh_scope = RT_SCOPE_LINK; 560 nh->nh_scope = RT_SCOPE_LINK;
548 return 0; 561 return 0;
549 } 562 }
563 rcu_read_lock();
550 { 564 {
551 struct flowi fl = { 565 struct flowi fl = {
552 .nl_u = { 566 .fl4_dst = nh->nh_gw,
553 .ip4_u = { 567 .fl4_scope = cfg->fc_scope + 1,
554 .daddr = nh->nh_gw,
555 .scope = cfg->fc_scope + 1,
556 },
557 },
558 .oif = nh->nh_oif, 568 .oif = nh->nh_oif,
559 }; 569 };
560 570
561 /* It is not necessary, but requires a bit of thinking */ 571 /* It is not necessary, but requires a bit of thinking */
562 if (fl.fl4_scope < RT_SCOPE_LINK) 572 if (fl.fl4_scope < RT_SCOPE_LINK)
563 fl.fl4_scope = RT_SCOPE_LINK; 573 fl.fl4_scope = RT_SCOPE_LINK;
564 if ((err = fib_lookup(net, &fl, &res)) != 0) 574 err = fib_lookup(net, &fl, &res);
575 if (err) {
576 rcu_read_unlock();
565 return err; 577 return err;
578 }
566 } 579 }
567 err = -EINVAL; 580 err = -EINVAL;
568 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) 581 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
569 goto out; 582 goto out;
570 nh->nh_scope = res.scope; 583 nh->nh_scope = res.scope;
571 nh->nh_oif = FIB_RES_OIF(res); 584 nh->nh_oif = FIB_RES_OIF(res);
572 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) 585 nh->nh_dev = dev = FIB_RES_DEV(res);
586 if (!dev)
573 goto out; 587 goto out;
574 dev_hold(nh->nh_dev); 588 dev_hold(dev);
575 err = -ENETDOWN; 589 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
576 if (!(nh->nh_dev->flags & IFF_UP))
577 goto out;
578 err = 0;
579out:
580 fib_res_put(&res);
581 return err;
582 } else { 590 } else {
583 struct in_device *in_dev; 591 struct in_device *in_dev;
584 592
585 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) 593 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
586 return -EINVAL; 594 return -EINVAL;
587 595
596 rcu_read_lock();
597 err = -ENODEV;
588 in_dev = inetdev_by_index(net, nh->nh_oif); 598 in_dev = inetdev_by_index(net, nh->nh_oif);
589 if (in_dev == NULL) 599 if (in_dev == NULL)
590 return -ENODEV; 600 goto out;
591 if (!(in_dev->dev->flags&IFF_UP)) { 601 err = -ENETDOWN;
592 in_dev_put(in_dev); 602 if (!(in_dev->dev->flags & IFF_UP))
593 return -ENETDOWN; 603 goto out;
594 }
595 nh->nh_dev = in_dev->dev; 604 nh->nh_dev = in_dev->dev;
596 dev_hold(nh->nh_dev); 605 dev_hold(nh->nh_dev);
597 nh->nh_scope = RT_SCOPE_HOST; 606 nh->nh_scope = RT_SCOPE_HOST;
598 in_dev_put(in_dev); 607 err = 0;
599 } 608 }
600 return 0; 609out:
610 rcu_read_unlock();
611 return err;
601} 612}
602 613
603static inline unsigned int fib_laddr_hashfn(__be32 val) 614static inline unsigned int fib_laddr_hashfn(__be32 val)
604{ 615{
605 unsigned int mask = (fib_hash_size - 1); 616 unsigned int mask = (fib_hash_size - 1);
606 617
607 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; 618 return ((__force u32)val ^
619 ((__force u32)val >> 7) ^
620 ((__force u32)val >> 14)) & mask;
608} 621}
609 622
610static struct hlist_head *fib_hash_alloc(int bytes) 623static struct hlist_head *fib_hash_alloc(int bytes)
@@ -613,7 +626,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
613 return kzalloc(bytes, GFP_KERNEL); 626 return kzalloc(bytes, GFP_KERNEL);
614 else 627 else
615 return (struct hlist_head *) 628 return (struct hlist_head *)
616 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); 629 __get_free_pages(GFP_KERNEL | __GFP_ZERO,
630 get_order(bytes));
617} 631}
618 632
619static void fib_hash_free(struct hlist_head *hash, int bytes) 633static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -738,7 +752,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
738 752
739 fi->fib_nhs = nhs; 753 fi->fib_nhs = nhs;
740 change_nexthops(fi) { 754 change_nexthops(fi) {
741 nh->nh_parent = fi; 755 nexthop_nh->nh_parent = fi;
742 } endfor_nexthops(fi) 756 } endfor_nexthops(fi)
743 757
744 if (cfg->fc_mx) { 758 if (cfg->fc_mx) {
@@ -808,7 +822,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
808 goto failure; 822 goto failure;
809 } else { 823 } else {
810 change_nexthops(fi) { 824 change_nexthops(fi) {
811 if ((err = fib_check_nh(cfg, fi, nh)) != 0) 825 err = fib_check_nh(cfg, fi, nexthop_nh);
826 if (err != 0)
812 goto failure; 827 goto failure;
813 } endfor_nexthops(fi) 828 } endfor_nexthops(fi)
814 } 829 }
@@ -821,7 +836,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
821 } 836 }
822 837
823link_it: 838link_it:
824 if ((ofi = fib_find_info(fi)) != NULL) { 839 ofi = fib_find_info(fi);
840 if (ofi) {
825 fi->fib_dead = 1; 841 fi->fib_dead = 1;
826 free_fib_info(fi); 842 free_fib_info(fi);
827 ofi->fib_treeref++; 843 ofi->fib_treeref++;
@@ -843,11 +859,11 @@ link_it:
843 struct hlist_head *head; 859 struct hlist_head *head;
844 unsigned int hash; 860 unsigned int hash;
845 861
846 if (!nh->nh_dev) 862 if (!nexthop_nh->nh_dev)
847 continue; 863 continue;
848 hash = fib_devindex_hashfn(nh->nh_dev->ifindex); 864 hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
849 head = &fib_info_devhash[hash]; 865 head = &fib_info_devhash[hash];
850 hlist_add_head(&nh->nh_hash, head); 866 hlist_add_head(&nexthop_nh->nh_hash, head);
851 } endfor_nexthops(fi) 867 } endfor_nexthops(fi)
852 spin_unlock_bh(&fib_info_lock); 868 spin_unlock_bh(&fib_info_lock);
853 return fi; 869 return fi;
@@ -866,7 +882,7 @@ failure:
866 882
867/* Note! fib_semantic_match intentionally uses RCU list functions. */ 883/* Note! fib_semantic_match intentionally uses RCU list functions. */
868int fib_semantic_match(struct list_head *head, const struct flowi *flp, 884int fib_semantic_match(struct list_head *head, const struct flowi *flp,
869 struct fib_result *res, int prefixlen) 885 struct fib_result *res, int prefixlen, int fib_flags)
870{ 886{
871 struct fib_alias *fa; 887 struct fib_alias *fa;
872 int nh_sel = 0; 888 int nh_sel = 0;
@@ -881,7 +897,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
881 if (fa->fa_scope < flp->fl4_scope) 897 if (fa->fa_scope < flp->fl4_scope)
882 continue; 898 continue;
883 899
884 fa->fa_state |= FA_S_ACCESSED; 900 fib_alias_accessed(fa);
885 901
886 err = fib_props[fa->fa_type].error; 902 err = fib_props[fa->fa_type].error;
887 if (err == 0) { 903 if (err == 0) {
@@ -897,7 +913,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
897 case RTN_ANYCAST: 913 case RTN_ANYCAST:
898 case RTN_MULTICAST: 914 case RTN_MULTICAST:
899 for_nexthops(fi) { 915 for_nexthops(fi) {
900 if (nh->nh_flags&RTNH_F_DEAD) 916 if (nh->nh_flags & RTNH_F_DEAD)
901 continue; 917 continue;
902 if (!flp->oif || flp->oif == nh->nh_oif) 918 if (!flp->oif || flp->oif == nh->nh_oif)
903 break; 919 break;
@@ -908,16 +924,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
908 goto out_fill_res; 924 goto out_fill_res;
909 } 925 }
910#else 926#else
911 if (nhsel < 1) { 927 if (nhsel < 1)
912 goto out_fill_res; 928 goto out_fill_res;
913 }
914#endif 929#endif
915 endfor_nexthops(fi); 930 endfor_nexthops(fi);
916 continue; 931 continue;
917 932
918 default: 933 default:
919 printk(KERN_WARNING "fib_semantic_match bad type %#x\n", 934 pr_warning("fib_semantic_match bad type %#x\n",
920 fa->fa_type); 935 fa->fa_type);
921 return -EINVAL; 936 return -EINVAL;
922 } 937 }
923 } 938 }
@@ -931,7 +946,8 @@ out_fill_res:
931 res->type = fa->fa_type; 946 res->type = fa->fa_type;
932 res->scope = fa->fa_scope; 947 res->scope = fa->fa_scope;
933 res->fi = fa->fa_info; 948 res->fi = fa->fa_info;
934 atomic_inc(&res->fi->fib_clntref); 949 if (!(fib_flags & FIB_LOOKUP_NOREF))
950 atomic_inc(&res->fi->fib_clntref);
935 return 0; 951 return 0;
936} 952}
937 953
@@ -1030,10 +1046,10 @@ nla_put_failure:
1030} 1046}
1031 1047
1032/* 1048/*
1033 Update FIB if: 1049 * Update FIB if:
1034 - local address disappeared -> we must delete all the entries 1050 * - local address disappeared -> we must delete all the entries
1035 referring to it. 1051 * referring to it.
1036 - device went down -> we must shutdown all nexthops going via it. 1052 * - device went down -> we must shutdown all nexthops going via it.
1037 */ 1053 */
1038int fib_sync_down_addr(struct net *net, __be32 local) 1054int fib_sync_down_addr(struct net *net, __be32 local)
1039{ 1055{
@@ -1047,7 +1063,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
1047 return 0; 1063 return 0;
1048 1064
1049 hlist_for_each_entry(fi, node, head, fib_lhash) { 1065 hlist_for_each_entry(fi, node, head, fib_lhash) {
1050 if (fi->fib_net != net) 1066 if (!net_eq(fi->fib_net, net))
1051 continue; 1067 continue;
1052 if (fi->fib_prefsrc == local) { 1068 if (fi->fib_prefsrc == local) {
1053 fi->fib_flags |= RTNH_F_DEAD; 1069 fi->fib_flags |= RTNH_F_DEAD;
@@ -1080,21 +1096,21 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1080 prev_fi = fi; 1096 prev_fi = fi;
1081 dead = 0; 1097 dead = 0;
1082 change_nexthops(fi) { 1098 change_nexthops(fi) {
1083 if (nh->nh_flags&RTNH_F_DEAD) 1099 if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1084 dead++; 1100 dead++;
1085 else if (nh->nh_dev == dev && 1101 else if (nexthop_nh->nh_dev == dev &&
1086 nh->nh_scope != scope) { 1102 nexthop_nh->nh_scope != scope) {
1087 nh->nh_flags |= RTNH_F_DEAD; 1103 nexthop_nh->nh_flags |= RTNH_F_DEAD;
1088#ifdef CONFIG_IP_ROUTE_MULTIPATH 1104#ifdef CONFIG_IP_ROUTE_MULTIPATH
1089 spin_lock_bh(&fib_multipath_lock); 1105 spin_lock_bh(&fib_multipath_lock);
1090 fi->fib_power -= nh->nh_power; 1106 fi->fib_power -= nexthop_nh->nh_power;
1091 nh->nh_power = 0; 1107 nexthop_nh->nh_power = 0;
1092 spin_unlock_bh(&fib_multipath_lock); 1108 spin_unlock_bh(&fib_multipath_lock);
1093#endif 1109#endif
1094 dead++; 1110 dead++;
1095 } 1111 }
1096#ifdef CONFIG_IP_ROUTE_MULTIPATH 1112#ifdef CONFIG_IP_ROUTE_MULTIPATH
1097 if (force > 1 && nh->nh_dev == dev) { 1113 if (force > 1 && nexthop_nh->nh_dev == dev) {
1098 dead = fi->fib_nhs; 1114 dead = fi->fib_nhs;
1099 break; 1115 break;
1100 } 1116 }
@@ -1112,10 +1128,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1112#ifdef CONFIG_IP_ROUTE_MULTIPATH 1128#ifdef CONFIG_IP_ROUTE_MULTIPATH
1113 1129
1114/* 1130/*
1115 Dead device goes up. We wake up dead nexthops. 1131 * Dead device goes up. We wake up dead nexthops.
1116 It takes sense only on multipath routes. 1132 * It takes sense only on multipath routes.
1117 */ 1133 */
1118
1119int fib_sync_up(struct net_device *dev) 1134int fib_sync_up(struct net_device *dev)
1120{ 1135{
1121 struct fib_info *prev_fi; 1136 struct fib_info *prev_fi;
@@ -1125,7 +1140,7 @@ int fib_sync_up(struct net_device *dev)
1125 struct fib_nh *nh; 1140 struct fib_nh *nh;
1126 int ret; 1141 int ret;
1127 1142
1128 if (!(dev->flags&IFF_UP)) 1143 if (!(dev->flags & IFF_UP))
1129 return 0; 1144 return 0;
1130 1145
1131 prev_fi = NULL; 1146 prev_fi = NULL;
@@ -1144,18 +1159,20 @@ int fib_sync_up(struct net_device *dev)
1144 prev_fi = fi; 1159 prev_fi = fi;
1145 alive = 0; 1160 alive = 0;
1146 change_nexthops(fi) { 1161 change_nexthops(fi) {
1147 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1162 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1148 alive++; 1163 alive++;
1149 continue; 1164 continue;
1150 } 1165 }
1151 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) 1166 if (nexthop_nh->nh_dev == NULL ||
1167 !(nexthop_nh->nh_dev->flags & IFF_UP))
1152 continue; 1168 continue;
1153 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) 1169 if (nexthop_nh->nh_dev != dev ||
1170 !__in_dev_get_rtnl(dev))
1154 continue; 1171 continue;
1155 alive++; 1172 alive++;
1156 spin_lock_bh(&fib_multipath_lock); 1173 spin_lock_bh(&fib_multipath_lock);
1157 nh->nh_power = 0; 1174 nexthop_nh->nh_power = 0;
1158 nh->nh_flags &= ~RTNH_F_DEAD; 1175 nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
1159 spin_unlock_bh(&fib_multipath_lock); 1176 spin_unlock_bh(&fib_multipath_lock);
1160 } endfor_nexthops(fi) 1177 } endfor_nexthops(fi)
1161 1178
@@ -1169,10 +1186,9 @@ int fib_sync_up(struct net_device *dev)
1169} 1186}
1170 1187
1171/* 1188/*
1172 The algorithm is suboptimal, but it provides really 1189 * The algorithm is suboptimal, but it provides really
1173 fair weighted route distribution. 1190 * fair weighted route distribution.
1174 */ 1191 */
1175
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res) 1192void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{ 1193{
1178 struct fib_info *fi = res->fi; 1194 struct fib_info *fi = res->fi;
@@ -1182,9 +1198,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 if (fi->fib_power <= 0) { 1198 if (fi->fib_power <= 0) {
1183 int power = 0; 1199 int power = 0;
1184 change_nexthops(fi) { 1200 change_nexthops(fi) {
1185 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1201 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1186 power += nh->nh_weight; 1202 power += nexthop_nh->nh_weight;
1187 nh->nh_power = nh->nh_weight; 1203 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188 } 1204 }
1189 } endfor_nexthops(fi); 1205 } endfor_nexthops(fi);
1190 fi->fib_power = power; 1206 fi->fib_power = power;
@@ -1198,15 +1214,17 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1198 1214
1199 1215
1200 /* w should be random number [0..fi->fib_power-1], 1216 /* w should be random number [0..fi->fib_power-1],
1201 it is pretty bad approximation. 1217 * it is pretty bad approximation.
1202 */ 1218 */
1203 1219
1204 w = jiffies % fi->fib_power; 1220 w = jiffies % fi->fib_power;
1205 1221
1206 change_nexthops(fi) { 1222 change_nexthops(fi) {
1207 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { 1223 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
1208 if ((w -= nh->nh_power) <= 0) { 1224 nexthop_nh->nh_power) {
1209 nh->nh_power--; 1225 w -= nexthop_nh->nh_power;
1226 if (w <= 0) {
1227 nexthop_nh->nh_power--;
1210 fi->fib_power--; 1228 fi->fib_power--;
1211 res->nh_sel = nhsel; 1229 res->nh_sel = nhsel;
1212 spin_unlock_bh(&fib_multipath_lock); 1230 spin_unlock_bh(&fib_multipath_lock);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 012cf5a68581..0f280348e0fd 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -16,7 +16,7 @@
16 * 16 *
17 * An experimental study of compression methods for dynamic tries 17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 * 20 *
21 * 21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
@@ -48,7 +48,7 @@
48 * Patrick McHardy <kaber@trash.net> 48 * Patrick McHardy <kaber@trash.net>
49 */ 49 */
50 50
51#define VERSION "0.408" 51#define VERSION "0.409"
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
54#include <asm/system.h> 54#include <asm/system.h>
@@ -71,6 +71,7 @@
71#include <linux/netlink.h> 71#include <linux/netlink.h>
72#include <linux/init.h> 72#include <linux/init.h>
73#include <linux/list.h> 73#include <linux/list.h>
74#include <linux/slab.h>
74#include <net/net_namespace.h> 75#include <net/net_namespace.h>
75#include <net/ip.h> 76#include <net/ip.h>
76#include <net/protocol.h> 77#include <net/protocol.h>
@@ -164,6 +165,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn); 165static struct tnode *halve(struct trie *t, struct tnode *tn);
165/* tnodes to free after resize(); protected by RTNL */ 166/* tnodes to free after resize(); protected by RTNL */
166static struct tnode *tnode_free_head; 167static struct tnode *tnode_free_head;
168static size_t tnode_free_size;
169
170/*
171 * synchronize_rcu after call_rcu for that many pages; it should be especially
172 * useful before resizing the root node with PREEMPT_NONE configs; the value was
173 * obtained experimentally, aiming to avoid visible slowdown.
174 */
175static const int sync_pages = 128;
167 176
168static struct kmem_cache *fn_alias_kmem __read_mostly; 177static struct kmem_cache *fn_alias_kmem __read_mostly;
169static struct kmem_cache *trie_leaf_kmem __read_mostly; 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
@@ -177,7 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
177{ 186{
178 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
179 188
180 return rcu_dereference(ret); 189 return rcu_dereference_rtnl(ret);
181} 190}
182 191
183/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
@@ -200,7 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
200{ 209{
201 struct node *ret = tnode_get_child(tn, i); 210 struct node *ret = tnode_get_child(tn, i);
202 211
203 return rcu_dereference(ret); 212 return rcu_dereference_rtnl(ret);
204} 213}
205 214
206static inline int tnode_child_length(const struct tnode *tn) 215static inline int tnode_child_length(const struct tnode *tn)
@@ -316,9 +325,8 @@ static inline void check_tnode(const struct tnode *tn)
316 325
317static const int halve_threshold = 25; 326static const int halve_threshold = 25;
318static const int inflate_threshold = 50; 327static const int inflate_threshold = 50;
319static const int halve_threshold_root = 8; 328static const int halve_threshold_root = 15;
320static const int inflate_threshold_root = 15; 329static const int inflate_threshold_root = 30;
321
322 330
323static void __alias_free_mem(struct rcu_head *head) 331static void __alias_free_mem(struct rcu_head *head)
324{ 332{
@@ -357,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size)
357 if (size <= PAGE_SIZE) 365 if (size <= PAGE_SIZE)
358 return kzalloc(size, GFP_KERNEL); 366 return kzalloc(size, GFP_KERNEL);
359 else 367 else
360 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 368 return vzalloc(size);
361} 369}
362 370
363static void __tnode_vfree(struct work_struct *arg) 371static void __tnode_vfree(struct work_struct *arg)
@@ -393,6 +401,8 @@ static void tnode_free_safe(struct tnode *tn)
393 BUG_ON(IS_LEAF(tn)); 401 BUG_ON(IS_LEAF(tn));
394 tn->tnode_free = tnode_free_head; 402 tn->tnode_free = tnode_free_head;
395 tnode_free_head = tn; 403 tnode_free_head = tn;
404 tnode_free_size += sizeof(struct tnode) +
405 (sizeof(struct node *) << tn->bits);
396} 406}
397 407
398static void tnode_free_flush(void) 408static void tnode_free_flush(void)
@@ -404,6 +414,11 @@ static void tnode_free_flush(void)
404 tn->tnode_free = NULL; 414 tn->tnode_free = NULL;
405 tnode_free(tn); 415 tnode_free(tn);
406 } 416 }
417
418 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
419 tnode_free_size = 0;
420 synchronize_rcu();
421 }
407} 422}
408 423
409static struct leaf *leaf_new(void) 424static struct leaf *leaf_new(void)
@@ -440,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
440 tn->empty_children = 1<<bits; 455 tn->empty_children = 1<<bits;
441 } 456 }
442 457
443 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
444 (unsigned long) (sizeof(struct node) << bits)); 459 sizeof(struct node) << bits);
445 return tn; 460 return tn;
446} 461}
447 462
@@ -499,14 +514,14 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
499 rcu_assign_pointer(tn->child[i], n); 514 rcu_assign_pointer(tn->child[i], n);
500} 515}
501 516
517#define MAX_WORK 10
502static struct node *resize(struct trie *t, struct tnode *tn) 518static struct node *resize(struct trie *t, struct tnode *tn)
503{ 519{
504 int i; 520 int i;
505 int err = 0;
506 struct tnode *old_tn; 521 struct tnode *old_tn;
507 int inflate_threshold_use; 522 int inflate_threshold_use;
508 int halve_threshold_use; 523 int halve_threshold_use;
509 int max_resize; 524 int max_work;
510 525
511 if (!tn) 526 if (!tn)
512 return NULL; 527 return NULL;
@@ -521,18 +536,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
521 } 536 }
522 /* One child */ 537 /* One child */
523 if (tn->empty_children == tnode_child_length(tn) - 1) 538 if (tn->empty_children == tnode_child_length(tn) - 1)
524 for (i = 0; i < tnode_child_length(tn); i++) { 539 goto one_child;
525 struct node *n;
526
527 n = tn->child[i];
528 if (!n)
529 continue;
530
531 /* compress one level */
532 node_set_parent(n, NULL);
533 tnode_free_safe(tn);
534 return n;
535 }
536 /* 540 /*
537 * Double as long as the resulting node has a number of 541 * Double as long as the resulting node has a number of
538 * nonempty nodes that are above the threshold. 542 * nonempty nodes that are above the threshold.
@@ -601,14 +605,16 @@ static struct node *resize(struct trie *t, struct tnode *tn)
601 605
602 /* Keep root node larger */ 606 /* Keep root node larger */
603 607
604 if (!tn->parent) 608 if (!node_parent((struct node *)tn)) {
605 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
606 else 610 halve_threshold_use = halve_threshold_root;
611 } else {
607 inflate_threshold_use = inflate_threshold; 612 inflate_threshold_use = inflate_threshold;
613 halve_threshold_use = halve_threshold;
614 }
608 615
609 err = 0; 616 max_work = MAX_WORK;
610 max_resize = 10; 617 while ((tn->full_children > 0 && max_work-- &&
611 while ((tn->full_children > 0 && max_resize-- &&
612 50 * (tn->full_children + tnode_child_length(tn) 618 50 * (tn->full_children + tnode_child_length(tn)
613 - tn->empty_children) 619 - tn->empty_children)
614 >= inflate_threshold_use * tnode_child_length(tn))) { 620 >= inflate_threshold_use * tnode_child_length(tn))) {
@@ -625,35 +631,19 @@ static struct node *resize(struct trie *t, struct tnode *tn)
625 } 631 }
626 } 632 }
627 633
628 if (max_resize < 0) {
629 if (!tn->parent)
630 pr_warning("Fix inflate_threshold_root."
631 " Now=%d size=%d bits\n",
632 inflate_threshold_root, tn->bits);
633 else
634 pr_warning("Fix inflate_threshold."
635 " Now=%d size=%d bits\n",
636 inflate_threshold, tn->bits);
637 }
638
639 check_tnode(tn); 634 check_tnode(tn);
640 635
636 /* Return if at least one inflate is run */
637 if (max_work != MAX_WORK)
638 return (struct node *) tn;
639
641 /* 640 /*
642 * Halve as long as the number of empty children in this 641 * Halve as long as the number of empty children in this
643 * node is above threshold. 642 * node is above threshold.
644 */ 643 */
645 644
646 645 max_work = MAX_WORK;
647 /* Keep root node larger */ 646 while (tn->bits > 1 && max_work-- &&
648
649 if (!tn->parent)
650 halve_threshold_use = halve_threshold_root;
651 else
652 halve_threshold_use = halve_threshold;
653
654 err = 0;
655 max_resize = 10;
656 while (tn->bits > 1 && max_resize-- &&
657 100 * (tnode_child_length(tn) - tn->empty_children) < 647 100 * (tnode_child_length(tn) - tn->empty_children) <
658 halve_threshold_use * tnode_child_length(tn)) { 648 halve_threshold_use * tnode_child_length(tn)) {
659 649
@@ -668,19 +658,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
668 } 658 }
669 } 659 }
670 660
671 if (max_resize < 0) {
672 if (!tn->parent)
673 pr_warning("Fix halve_threshold_root."
674 " Now=%d size=%d bits\n",
675 halve_threshold_root, tn->bits);
676 else
677 pr_warning("Fix halve_threshold."
678 " Now=%d size=%d bits\n",
679 halve_threshold, tn->bits);
680 }
681 661
682 /* Only one child remains */ 662 /* Only one child remains */
683 if (tn->empty_children == tnode_child_length(tn) - 1) 663 if (tn->empty_children == tnode_child_length(tn) - 1) {
664one_child:
684 for (i = 0; i < tnode_child_length(tn); i++) { 665 for (i = 0; i < tnode_child_length(tn); i++) {
685 struct node *n; 666 struct node *n;
686 667
@@ -694,7 +675,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
694 tnode_free_safe(tn); 675 tnode_free_safe(tn);
695 return n; 676 return n;
696 } 677 }
697 678 }
698 return (struct node *) tn; 679 return (struct node *) tn;
699} 680}
700 681
@@ -980,7 +961,7 @@ fib_find_node(struct trie *t, u32 key)
980 struct node *n; 961 struct node *n;
981 962
982 pos = 0; 963 pos = 0;
983 n = rcu_dereference(t->trie); 964 n = rcu_dereference_rtnl(t->trie);
984 965
985 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 966 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
986 tn = (struct tnode *) n; 967 tn = (struct tnode *) n;
@@ -1021,6 +1002,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1021 (struct node *)tn, wasfull); 1002 (struct node *)tn, wasfull);
1022 1003
1023 tp = node_parent((struct node *) tn); 1004 tp = node_parent((struct node *) tn);
1005 if (!tp)
1006 rcu_assign_pointer(t->trie, (struct node *)tn);
1007
1024 tnode_free_flush(); 1008 tnode_free_flush();
1025 if (!tp) 1009 if (!tp)
1026 break; 1010 break;
@@ -1033,8 +1017,6 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1033 1017
1034 rcu_assign_pointer(t->trie, (struct node *)tn); 1018 rcu_assign_pointer(t->trie, (struct node *)tn);
1035 tnode_free_flush(); 1019 tnode_free_flush();
1036
1037 return;
1038} 1020}
1039 1021
1040/* only used from updater-side */ 1022/* only used from updater-side */
@@ -1190,7 +1172,7 @@ done:
1190/* 1172/*
1191 * Caller must hold RTNL. 1173 * Caller must hold RTNL.
1192 */ 1174 */
1193static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) 1175int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1194{ 1176{
1195 struct trie *t = (struct trie *) tb->tb_data; 1177 struct trie *t = (struct trie *) tb->tb_data;
1196 struct fib_alias *fa, *new_fa; 1178 struct fib_alias *fa, *new_fa;
@@ -1360,7 +1342,7 @@ err:
1360/* should be called with rcu_read_lock */ 1342/* should be called with rcu_read_lock */
1361static int check_leaf(struct trie *t, struct leaf *l, 1343static int check_leaf(struct trie *t, struct leaf *l,
1362 t_key key, const struct flowi *flp, 1344 t_key key, const struct flowi *flp,
1363 struct fib_result *res) 1345 struct fib_result *res, int fib_flags)
1364{ 1346{
1365 struct leaf_info *li; 1347 struct leaf_info *li;
1366 struct hlist_head *hhead = &l->list; 1348 struct hlist_head *hhead = &l->list;
@@ -1374,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1374 if (l->key != (key & ntohl(mask))) 1356 if (l->key != (key & ntohl(mask)))
1375 continue; 1357 continue;
1376 1358
1377 err = fib_semantic_match(&li->falh, flp, res, plen); 1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
1378 1360
1379#ifdef CONFIG_IP_FIB_TRIE_STATS 1361#ifdef CONFIG_IP_FIB_TRIE_STATS
1380 if (err <= 0) 1362 if (err <= 0)
@@ -1389,8 +1371,8 @@ static int check_leaf(struct trie *t, struct leaf *l,
1389 return 1; 1371 return 1;
1390} 1372}
1391 1373
1392static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, 1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1393 struct fib_result *res) 1375 struct fib_result *res, int fib_flags)
1394{ 1376{
1395 struct trie *t = (struct trie *) tb->tb_data; 1377 struct trie *t = (struct trie *) tb->tb_data;
1396 int ret; 1378 int ret;
@@ -1402,8 +1384,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1402 t_key cindex = 0; 1384 t_key cindex = 0;
1403 int current_prefix_length = KEYLENGTH; 1385 int current_prefix_length = KEYLENGTH;
1404 struct tnode *cn; 1386 struct tnode *cn;
1405 t_key node_prefix, key_prefix, pref_mismatch; 1387 t_key pref_mismatch;
1406 int mp;
1407 1388
1408 rcu_read_lock(); 1389 rcu_read_lock();
1409 1390
@@ -1417,7 +1398,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1417 1398
1418 /* Just a leaf? */ 1399 /* Just a leaf? */
1419 if (IS_LEAF(n)) { 1400 if (IS_LEAF(n)) {
1420 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1401 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1421 goto found; 1402 goto found;
1422 } 1403 }
1423 1404
@@ -1432,7 +1413,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1432 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), 1413 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1433 pos, bits); 1414 pos, bits);
1434 1415
1435 n = tnode_get_child(pn, cindex); 1416 n = tnode_get_child_rcu(pn, cindex);
1436 1417
1437 if (n == NULL) { 1418 if (n == NULL) {
1438#ifdef CONFIG_IP_FIB_TRIE_STATS 1419#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -1442,7 +1423,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1442 } 1423 }
1443 1424
1444 if (IS_LEAF(n)) { 1425 if (IS_LEAF(n)) {
1445 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1426 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
1446 if (ret > 0) 1427 if (ret > 0)
1447 goto backtrace; 1428 goto backtrace;
1448 goto found; 1429 goto found;
@@ -1518,10 +1499,7 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1518 * matching prefix. 1499 * matching prefix.
1519 */ 1500 */
1520 1501
1521 node_prefix = mask_pfx(cn->key, cn->pos); 1502 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1522 key_prefix = mask_pfx(key, cn->pos);
1523 pref_mismatch = key_prefix^node_prefix;
1524 mp = 0;
1525 1503
1526 /* 1504 /*
1527 * In short: If skipped bits in this node do not match 1505 * In short: If skipped bits in this node do not match
@@ -1529,13 +1507,9 @@ static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1529 * state.directly. 1507 * state.directly.
1530 */ 1508 */
1531 if (pref_mismatch) { 1509 if (pref_mismatch) {
1532 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1510 int mp = KEYLENGTH - fls(pref_mismatch);
1533 mp++;
1534 pref_mismatch = pref_mismatch << 1;
1535 }
1536 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1537 1511
1538 if (key_prefix != 0) 1512 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1539 goto backtrace; 1513 goto backtrace;
1540 1514
1541 if (current_prefix_length >= cn->pos) 1515 if (current_prefix_length >= cn->pos)
@@ -1567,7 +1541,7 @@ backtrace:
1567 if (chopped_off <= pn->bits) { 1541 if (chopped_off <= pn->bits) {
1568 cindex &= ~(1 << (chopped_off-1)); 1542 cindex &= ~(1 << (chopped_off-1));
1569 } else { 1543 } else {
1570 struct tnode *parent = node_parent((struct node *) pn); 1544 struct tnode *parent = node_parent_rcu((struct node *) pn);
1571 if (!parent) 1545 if (!parent)
1572 goto failed; 1546 goto failed;
1573 1547
@@ -1611,7 +1585,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1611/* 1585/*
1612 * Caller must hold RTNL. 1586 * Caller must hold RTNL.
1613 */ 1587 */
1614static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) 1588int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1615{ 1589{
1616 struct trie *t = (struct trie *) tb->tb_data; 1590 struct trie *t = (struct trie *) tb->tb_data;
1617 u32 key, mask; 1591 u32 key, mask;
@@ -1759,14 +1733,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1759 1733
1760 /* Node empty, walk back up to parent */ 1734 /* Node empty, walk back up to parent */
1761 c = (struct node *) p; 1735 c = (struct node *) p;
1762 } while ( (p = node_parent_rcu(c)) != NULL); 1736 } while ((p = node_parent_rcu(c)) != NULL);
1763 1737
1764 return NULL; /* Root of trie */ 1738 return NULL; /* Root of trie */
1765} 1739}
1766 1740
1767static struct leaf *trie_firstleaf(struct trie *t) 1741static struct leaf *trie_firstleaf(struct trie *t)
1768{ 1742{
1769 struct tnode *n = (struct tnode *) rcu_dereference(t->trie); 1743 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1770 1744
1771 if (!n) 1745 if (!n)
1772 return NULL; 1746 return NULL;
@@ -1780,7 +1754,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
1780static struct leaf *trie_nextleaf(struct leaf *l) 1754static struct leaf *trie_nextleaf(struct leaf *l)
1781{ 1755{
1782 struct node *c = (struct node *) l; 1756 struct node *c = (struct node *) l;
1783 struct tnode *p = node_parent(c); 1757 struct tnode *p = node_parent_rcu(c);
1784 1758
1785 if (!p) 1759 if (!p)
1786 return NULL; /* trie with just one leaf */ 1760 return NULL; /* trie with just one leaf */
@@ -1802,7 +1776,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
1802/* 1776/*
1803 * Caller must hold RTNL. 1777 * Caller must hold RTNL.
1804 */ 1778 */
1805static int fn_trie_flush(struct fib_table *tb) 1779int fib_table_flush(struct fib_table *tb)
1806{ 1780{
1807 struct trie *t = (struct trie *) tb->tb_data; 1781 struct trie *t = (struct trie *) tb->tb_data;
1808 struct leaf *l, *ll = NULL; 1782 struct leaf *l, *ll = NULL;
@@ -1823,9 +1797,14 @@ static int fn_trie_flush(struct fib_table *tb)
1823 return found; 1797 return found;
1824} 1798}
1825 1799
1826static void fn_trie_select_default(struct fib_table *tb, 1800void fib_free_table(struct fib_table *tb)
1827 const struct flowi *flp, 1801{
1828 struct fib_result *res) 1802 kfree(tb);
1803}
1804
1805void fib_table_select_default(struct fib_table *tb,
1806 const struct flowi *flp,
1807 struct fib_result *res)
1829{ 1808{
1830 struct trie *t = (struct trie *) tb->tb_data; 1809 struct trie *t = (struct trie *) tb->tb_data;
1831 int order, last_idx; 1810 int order, last_idx;
@@ -1864,7 +1843,8 @@ static void fn_trie_select_default(struct fib_table *tb,
1864 if (!next_fi->fib_nh[0].nh_gw || 1843 if (!next_fi->fib_nh[0].nh_gw ||
1865 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1844 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1866 continue; 1845 continue;
1867 fa->fa_state |= FA_S_ACCESSED; 1846
1847 fib_alias_accessed(fa);
1868 1848
1869 if (fi == NULL) { 1849 if (fi == NULL) {
1870 if (next_fi != res->fi) 1850 if (next_fi != res->fi)
@@ -1968,8 +1948,8 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1968 return skb->len; 1948 return skb->len;
1969} 1949}
1970 1950
1971static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, 1951int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1972 struct netlink_callback *cb) 1952 struct netlink_callback *cb)
1973{ 1953{
1974 struct leaf *l; 1954 struct leaf *l;
1975 struct trie *t = (struct trie *) tb->tb_data; 1955 struct trie *t = (struct trie *) tb->tb_data;
@@ -2036,12 +2016,6 @@ struct fib_table *fib_hash_table(u32 id)
2036 2016
2037 tb->tb_id = id; 2017 tb->tb_id = id;
2038 tb->tb_default = -1; 2018 tb->tb_default = -1;
2039 tb->tb_lookup = fn_trie_lookup;
2040 tb->tb_insert = fn_trie_insert;
2041 tb->tb_delete = fn_trie_delete;
2042 tb->tb_flush = fn_trie_flush;
2043 tb->tb_select_default = fn_trie_select_default;
2044 tb->tb_dump = fn_trie_dump;
2045 2019
2046 t = (struct trie *) tb->tb_data; 2020 t = (struct trie *) tb->tb_data;
2047 memset(t, 0, sizeof(*t)); 2021 memset(t, 0, sizeof(*t));
@@ -2058,14 +2032,14 @@ struct fib_trie_iter {
2058 struct seq_net_private p; 2032 struct seq_net_private p;
2059 struct fib_table *tb; 2033 struct fib_table *tb;
2060 struct tnode *tnode; 2034 struct tnode *tnode;
2061 unsigned index; 2035 unsigned int index;
2062 unsigned depth; 2036 unsigned int depth;
2063}; 2037};
2064 2038
2065static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2066{ 2040{
2067 struct tnode *tn = iter->tnode; 2041 struct tnode *tn = iter->tnode;
2068 unsigned cindex = iter->index; 2042 unsigned int cindex = iter->index;
2069 struct tnode *p; 2043 struct tnode *p;
2070 2044
2071 /* A single entry routing table */ 2045 /* A single entry routing table */
@@ -2174,7 +2148,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2174 */ 2148 */
2175static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2149static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2176{ 2150{
2177 unsigned i, max, pointers, bytes, avdepth; 2151 unsigned int i, max, pointers, bytes, avdepth;
2178 2152
2179 if (stat->leaves) 2153 if (stat->leaves)
2180 avdepth = stat->totdepth*100 / stat->leaves; 2154 avdepth = stat->totdepth*100 / stat->leaves;
@@ -2371,7 +2345,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2371 2345
2372static void seq_indent(struct seq_file *seq, int n) 2346static void seq_indent(struct seq_file *seq, int n)
2373{ 2347{
2374 while (n-- > 0) seq_puts(seq, " "); 2348 while (n-- > 0)
2349 seq_puts(seq, " ");
2375} 2350}
2376 2351
2377static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) 2352static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2388,7 +2363,7 @@ static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2388 } 2363 }
2389} 2364}
2390 2365
2391static const char *rtn_type_names[__RTN_MAX] = { 2366static const char *const rtn_type_names[__RTN_MAX] = {
2392 [RTN_UNSPEC] = "UNSPEC", 2367 [RTN_UNSPEC] = "UNSPEC",
2393 [RTN_UNICAST] = "UNICAST", 2368 [RTN_UNICAST] = "UNICAST",
2394 [RTN_LOCAL] = "LOCAL", 2369 [RTN_LOCAL] = "LOCAL",
@@ -2403,7 +2378,7 @@ static const char *rtn_type_names[__RTN_MAX] = {
2403 [RTN_XRESOLVE] = "XRESOLVE", 2378 [RTN_XRESOLVE] = "XRESOLVE",
2404}; 2379};
2405 2380
2406static inline const char *rtn_type(char *buf, size_t len, unsigned t) 2381static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2407{ 2382{
2408 if (t < __RTN_MAX && rtn_type_names[t]) 2383 if (t < __RTN_MAX && rtn_type_names[t])
2409 return rtn_type_names[t]; 2384 return rtn_type_names[t];
@@ -2559,13 +2534,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
2559 rcu_read_unlock(); 2534 rcu_read_unlock();
2560} 2535}
2561 2536
2562static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) 2537static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2563{ 2538{
2564 static unsigned type2flags[RTN_MAX + 1] = { 2539 unsigned int flags = 0;
2565 [7] = RTF_REJECT, [8] = RTF_REJECT,
2566 };
2567 unsigned flags = type2flags[type];
2568 2540
2541 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2542 flags = RTF_REJECT;
2569 if (fi && fi->fib_nh->nh_gw) 2543 if (fi && fi->fib_nh->nh_gw)
2570 flags |= RTF_GATEWAY; 2544 flags |= RTF_GATEWAY;
2571 if (mask == htonl(0xFFFFFFFF)) 2545 if (mask == htonl(0xFFFFFFFF))
@@ -2577,7 +2551,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2577/* 2551/*
2578 * This outputs /proc/net/route. 2552 * This outputs /proc/net/route.
2579 * The format of the file is not supposed to be changed 2553 * The format of the file is not supposed to be changed
2580 * and needs to be same as fib_hash output to avoid breaking 2554 * and needs to be same as fib_hash output to avoid breaking
2581 * legacy utilities 2555 * legacy utilities
2582 */ 2556 */
2583static int fib_route_seq_show(struct seq_file *seq, void *v) 2557static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2602,7 +2576,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
2602 2576
2603 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2577 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2604 const struct fib_info *fi = fa->fa_info; 2578 const struct fib_info *fi = fa->fa_info;
2605 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); 2579 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2606 int len; 2580 int len;
2607 2581
2608 if (fa->fa_type == RTN_BROADCAST 2582 if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..c6933f2ea310
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,152 @@
1/*
2 * GRE over IPv4 demultiplexer driver
3 *
4 * Authors: Dmitry Kozlov (xeb@mail.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/in.h>
18#include <linux/netdevice.h>
19#include <linux/version.h>
20#include <linux/spinlock.h>
21#include <net/protocol.h>
22#include <net/gre.h>
23
24
25static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
26static DEFINE_SPINLOCK(gre_proto_lock);
27
28int gre_add_protocol(const struct gre_protocol *proto, u8 version)
29{
30 if (version >= GREPROTO_MAX)
31 goto err_out;
32
33 spin_lock(&gre_proto_lock);
34 if (gre_proto[version])
35 goto err_out_unlock;
36
37 rcu_assign_pointer(gre_proto[version], proto);
38 spin_unlock(&gre_proto_lock);
39 return 0;
40
41err_out_unlock:
42 spin_unlock(&gre_proto_lock);
43err_out:
44 return -1;
45}
46EXPORT_SYMBOL_GPL(gre_add_protocol);
47
48int gre_del_protocol(const struct gre_protocol *proto, u8 version)
49{
50 if (version >= GREPROTO_MAX)
51 goto err_out;
52
53 spin_lock(&gre_proto_lock);
54 if (rcu_dereference_protected(gre_proto[version],
55 lockdep_is_held(&gre_proto_lock)) != proto)
56 goto err_out_unlock;
57 rcu_assign_pointer(gre_proto[version], NULL);
58 spin_unlock(&gre_proto_lock);
59 synchronize_rcu();
60 return 0;
61
62err_out_unlock:
63 spin_unlock(&gre_proto_lock);
64err_out:
65 return -1;
66}
67EXPORT_SYMBOL_GPL(gre_del_protocol);
68
69static int gre_rcv(struct sk_buff *skb)
70{
71 const struct gre_protocol *proto;
72 u8 ver;
73 int ret;
74
75 if (!pskb_may_pull(skb, 12))
76 goto drop;
77
78 ver = skb->data[1]&0x7f;
79 if (ver >= GREPROTO_MAX)
80 goto drop;
81
82 rcu_read_lock();
83 proto = rcu_dereference(gre_proto[ver]);
84 if (!proto || !proto->handler)
85 goto drop_unlock;
86 ret = proto->handler(skb);
87 rcu_read_unlock();
88 return ret;
89
90drop_unlock:
91 rcu_read_unlock();
92drop:
93 kfree_skb(skb);
94 return NET_RX_DROP;
95}
96
97static void gre_err(struct sk_buff *skb, u32 info)
98{
99 const struct gre_protocol *proto;
100 u8 ver;
101
102 if (!pskb_may_pull(skb, 12))
103 goto drop;
104
105 ver = skb->data[1]&0x7f;
106 if (ver >= GREPROTO_MAX)
107 goto drop;
108
109 rcu_read_lock();
110 proto = rcu_dereference(gre_proto[ver]);
111 if (!proto || !proto->err_handler)
112 goto drop_unlock;
113 proto->err_handler(skb, info);
114 rcu_read_unlock();
115 return;
116
117drop_unlock:
118 rcu_read_unlock();
119drop:
120 kfree_skb(skb);
121}
122
123static const struct net_protocol net_gre_protocol = {
124 .handler = gre_rcv,
125 .err_handler = gre_err,
126 .netns_ok = 1,
127};
128
129static int __init gre_init(void)
130{
131 pr_info("GRE over IPv4 demultiplexor driver");
132
133 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
134 pr_err("gre: can't add protocol\n");
135 return -EAGAIN;
136 }
137
138 return 0;
139}
140
141static void __exit gre_exit(void)
142{
143 inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
144}
145
146module_init(gre_init);
147module_exit(gre_exit);
148
149MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
150MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
151MODULE_LICENSE("GPL");
152
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 97c410e84388..4aa1b7f01ea0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -74,6 +74,7 @@
74#include <linux/netdevice.h> 74#include <linux/netdevice.h>
75#include <linux/string.h> 75#include <linux/string.h>
76#include <linux/netfilter_ipv4.h> 76#include <linux/netfilter_ipv4.h>
77#include <linux/slab.h>
77#include <net/snmp.h> 78#include <net/snmp.h>
78#include <net/ip.h> 79#include <net/ip.h>
79#include <net/route.h> 80#include <net/route.h>
@@ -114,7 +115,7 @@ struct icmp_bxm {
114/* An array of errno for error messages from dest unreach. */ 115/* An array of errno for error messages from dest unreach. */
115/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ 116/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
116 117
117struct icmp_err icmp_err_convert[] = { 118const struct icmp_err icmp_err_convert[] = {
118 { 119 {
119 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ 120 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
120 .fatal = 0, 121 .fatal = 0,
@@ -180,6 +181,7 @@ struct icmp_err icmp_err_convert[] = {
180 .fatal = 1, 181 .fatal = 1,
181 }, 182 },
182}; 183};
184EXPORT_SYMBOL(icmp_err_convert);
183 185
184/* 186/*
185 * ICMP control array. This specifies what to do with each ICMP. 187 * ICMP control array. This specifies what to do with each ICMP.
@@ -266,11 +268,12 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
266 dst->rate_tokens = token; 268 dst->rate_tokens = token;
267 return rc; 269 return rc;
268} 270}
271EXPORT_SYMBOL(xrlim_allow);
269 272
270static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, 273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
271 int type, int code) 274 int type, int code)
272{ 275{
273 struct dst_entry *dst = &rt->u.dst; 276 struct dst_entry *dst = &rt->dst;
274 int rc = 1; 277 int rc = 1;
275 278
276 if (type > NR_ICMP_TYPES) 279 if (type > NR_ICMP_TYPES)
@@ -326,13 +329,14 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
326 struct sock *sk; 329 struct sock *sk;
327 struct sk_buff *skb; 330 struct sk_buff *skb;
328 331
329 sk = icmp_sk(dev_net((*rt)->u.dst.dev)); 332 sk = icmp_sk(dev_net((*rt)->dst.dev));
330 if (ip_append_data(sk, icmp_glue_bits, icmp_param, 333 if (ip_append_data(sk, icmp_glue_bits, icmp_param,
331 icmp_param->data_len+icmp_param->head_len, 334 icmp_param->data_len+icmp_param->head_len,
332 icmp_param->head_len, 335 icmp_param->head_len,
333 ipc, rt, MSG_DONTWAIT) < 0) 336 ipc, rt, MSG_DONTWAIT) < 0) {
337 ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
334 ip_flush_pending_frames(sk); 338 ip_flush_pending_frames(sk);
335 else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 339 } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
336 struct icmphdr *icmph = icmp_hdr(skb); 340 struct icmphdr *icmph = icmp_hdr(skb);
337 __wsum csum = 0; 341 __wsum csum = 0;
338 struct sk_buff *skb1; 342 struct sk_buff *skb1;
@@ -357,7 +361,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
357{ 361{
358 struct ipcm_cookie ipc; 362 struct ipcm_cookie ipc;
359 struct rtable *rt = skb_rtable(skb); 363 struct rtable *rt = skb_rtable(skb);
360 struct net *net = dev_net(rt->u.dst.dev); 364 struct net *net = dev_net(rt->dst.dev);
361 struct sock *sk; 365 struct sock *sk;
362 struct inet_sock *inet; 366 struct inet_sock *inet;
363 __be32 daddr; 367 __be32 daddr;
@@ -375,17 +379,16 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
375 inet->tos = ip_hdr(skb)->tos; 379 inet->tos = ip_hdr(skb)->tos;
376 daddr = ipc.addr = rt->rt_src; 380 daddr = ipc.addr = rt->rt_src;
377 ipc.opt = NULL; 381 ipc.opt = NULL;
378 ipc.shtx.flags = 0; 382 ipc.tx_flags = 0;
379 if (icmp_param->replyopts.optlen) { 383 if (icmp_param->replyopts.optlen) {
380 ipc.opt = &icmp_param->replyopts; 384 ipc.opt = &icmp_param->replyopts;
381 if (ipc.opt->srr) 385 if (ipc.opt->srr)
382 daddr = icmp_param->replyopts.faddr; 386 daddr = icmp_param->replyopts.faddr;
383 } 387 }
384 { 388 {
385 struct flowi fl = { .nl_u = { .ip4_u = 389 struct flowi fl = { .fl4_dst= daddr,
386 { .daddr = daddr, 390 .fl4_src = rt->rt_spec_dst,
387 .saddr = rt->rt_spec_dst, 391 .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
388 .tos = RT_TOS(ip_hdr(skb)->tos) } },
389 .proto = IPPROTO_ICMP }; 392 .proto = IPPROTO_ICMP };
390 security_skb_classify_flow(skb, &fl); 393 security_skb_classify_flow(skb, &fl);
391 if (ip_route_output_key(net, &rt, &fl)) 394 if (ip_route_output_key(net, &rt, &fl))
@@ -425,7 +428,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
425 428
426 if (!rt) 429 if (!rt)
427 goto out; 430 goto out;
428 net = dev_net(rt->u.dst.dev); 431 net = dev_net(rt->dst.dev);
429 432
430 /* 433 /*
431 * Find the original header. It is expected to be valid, of course. 434 * Find the original header. It is expected to be valid, of course.
@@ -501,15 +504,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
501 if (!(rt->rt_flags & RTCF_LOCAL)) { 504 if (!(rt->rt_flags & RTCF_LOCAL)) {
502 struct net_device *dev = NULL; 505 struct net_device *dev = NULL;
503 506
504 if (rt->fl.iif && 507 rcu_read_lock();
505 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 508 if (rt_is_input_route(rt) &&
506 dev = dev_get_by_index(net, rt->fl.iif); 509 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
510 dev = dev_get_by_index_rcu(net, rt->fl.iif);
507 511
508 if (dev) { 512 if (dev)
509 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 513 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
510 dev_put(dev); 514 else
511 } else
512 saddr = 0; 515 saddr = 0;
516 rcu_read_unlock();
513 } 517 }
514 518
515 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | 519 tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
@@ -533,26 +537,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
533 inet_sk(sk)->tos = tos; 537 inet_sk(sk)->tos = tos;
534 ipc.addr = iph->saddr; 538 ipc.addr = iph->saddr;
535 ipc.opt = &icmp_param.replyopts; 539 ipc.opt = &icmp_param.replyopts;
536 ipc.shtx.flags = 0; 540 ipc.tx_flags = 0;
537 541
538 { 542 {
539 struct flowi fl = { 543 struct flowi fl = {
540 .nl_u = { 544 .fl4_dst = icmp_param.replyopts.srr ?
541 .ip4_u = { 545 icmp_param.replyopts.faddr : iph->saddr,
542 .daddr = icmp_param.replyopts.srr ? 546 .fl4_src = saddr,
543 icmp_param.replyopts.faddr : 547 .fl4_tos = RT_TOS(tos),
544 iph->saddr,
545 .saddr = saddr,
546 .tos = RT_TOS(tos)
547 }
548 },
549 .proto = IPPROTO_ICMP, 548 .proto = IPPROTO_ICMP,
550 .uli_u = { 549 .fl_icmp_type = type,
551 .icmpt = { 550 .fl_icmp_code = code,
552 .type = type,
553 .code = code
554 }
555 }
556 }; 551 };
557 int err; 552 int err;
558 struct rtable *rt2; 553 struct rtable *rt2;
@@ -564,6 +559,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
564 /* No need to clone since we're just using its address. */ 559 /* No need to clone since we're just using its address. */
565 rt2 = rt; 560 rt2 = rt;
566 561
562 if (!fl.nl_u.ip4_u.saddr)
563 fl.nl_u.ip4_u.saddr = rt->rt_src;
564
567 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); 565 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
568 switch (err) { 566 switch (err) {
569 case 0: 567 case 0:
@@ -584,20 +582,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
584 err = __ip_route_output_key(net, &rt2, &fl); 582 err = __ip_route_output_key(net, &rt2, &fl);
585 else { 583 else {
586 struct flowi fl2 = {}; 584 struct flowi fl2 = {};
587 struct dst_entry *odst; 585 unsigned long orefdst;
588 586
589 fl2.fl4_dst = fl.fl4_src; 587 fl2.fl4_dst = fl.fl4_src;
590 if (ip_route_output_key(net, &rt2, &fl2)) 588 if (ip_route_output_key(net, &rt2, &fl2))
591 goto relookup_failed; 589 goto relookup_failed;
592 590
593 /* Ugh! */ 591 /* Ugh! */
594 odst = skb_dst(skb_in); 592 orefdst = skb_in->_skb_refdst; /* save old refdst */
595 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, 593 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
596 RT_TOS(tos), rt2->u.dst.dev); 594 RT_TOS(tos), rt2->dst.dev);
597 595
598 dst_release(&rt2->u.dst); 596 dst_release(&rt2->dst);
599 rt2 = skb_rtable(skb_in); 597 rt2 = skb_rtable(skb_in);
600 skb_dst_set(skb_in, odst); 598 skb_in->_skb_refdst = orefdst; /* restore old refdst */
601 } 599 }
602 600
603 if (err) 601 if (err)
@@ -607,7 +605,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
607 XFRM_LOOKUP_ICMP); 605 XFRM_LOOKUP_ICMP);
608 switch (err) { 606 switch (err) {
609 case 0: 607 case 0:
610 dst_release(&rt->u.dst); 608 dst_release(&rt->dst);
611 rt = rt2; 609 rt = rt2;
612 break; 610 break;
613 case -EPERM: 611 case -EPERM:
@@ -626,7 +624,7 @@ route_done:
626 624
627 /* RFC says return as much as we can without exceeding 576 bytes. */ 625 /* RFC says return as much as we can without exceeding 576 bytes. */
628 626
629 room = dst_mtu(&rt->u.dst); 627 room = dst_mtu(&rt->dst);
630 if (room > 576) 628 if (room > 576)
631 room = 576; 629 room = 576;
632 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; 630 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
@@ -644,6 +642,7 @@ out_unlock:
644 icmp_xmit_unlock(sk); 642 icmp_xmit_unlock(sk);
645out:; 643out:;
646} 644}
645EXPORT_SYMBOL(icmp_send);
647 646
648 647
649/* 648/*
@@ -655,7 +654,7 @@ static void icmp_unreach(struct sk_buff *skb)
655 struct iphdr *iph; 654 struct iphdr *iph;
656 struct icmphdr *icmph; 655 struct icmphdr *icmph;
657 int hash, protocol; 656 int hash, protocol;
658 struct net_protocol *ipprot; 657 const struct net_protocol *ipprot;
659 u32 info = 0; 658 u32 info = 0;
660 struct net *net; 659 struct net *net;
661 660
@@ -922,6 +921,7 @@ static void icmp_address(struct sk_buff *skb)
922/* 921/*
923 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain 922 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
924 * loudly if an inconsistency is found. 923 * loudly if an inconsistency is found.
924 * called with rcu_read_lock()
925 */ 925 */
926 926
927static void icmp_address_reply(struct sk_buff *skb) 927static void icmp_address_reply(struct sk_buff *skb)
@@ -932,12 +932,12 @@ static void icmp_address_reply(struct sk_buff *skb)
932 struct in_ifaddr *ifa; 932 struct in_ifaddr *ifa;
933 933
934 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) 934 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
935 goto out; 935 return;
936 936
937 in_dev = in_dev_get(dev); 937 in_dev = __in_dev_get_rcu(dev);
938 if (!in_dev) 938 if (!in_dev)
939 goto out; 939 return;
940 rcu_read_lock(); 940
941 if (in_dev->ifa_list && 941 if (in_dev->ifa_list &&
942 IN_DEV_LOG_MARTIANS(in_dev) && 942 IN_DEV_LOG_MARTIANS(in_dev) &&
943 IN_DEV_FORWARD(in_dev)) { 943 IN_DEV_FORWARD(in_dev)) {
@@ -955,9 +955,6 @@ static void icmp_address_reply(struct sk_buff *skb)
955 mp, dev->name, &rt->rt_src); 955 mp, dev->name, &rt->rt_src);
956 } 956 }
957 } 957 }
958 rcu_read_unlock();
959 in_dev_put(in_dev);
960out:;
961} 958}
962 959
963static void icmp_discard(struct sk_buff *skb) 960static void icmp_discard(struct sk_buff *skb)
@@ -971,7 +968,7 @@ int icmp_rcv(struct sk_buff *skb)
971{ 968{
972 struct icmphdr *icmph; 969 struct icmphdr *icmph;
973 struct rtable *rt = skb_rtable(skb); 970 struct rtable *rt = skb_rtable(skb);
974 struct net *net = dev_net(rt->u.dst.dev); 971 struct net *net = dev_net(rt->dst.dev);
975 972
976 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 973 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
977 struct sec_path *sp = skb_sec_path(skb); 974 struct sec_path *sp = skb_sec_path(skb);
@@ -1165,6 +1162,10 @@ static int __net_init icmp_sk_init(struct net *net)
1165 sk->sk_sndbuf = 1162 sk->sk_sndbuf =
1166 (2 * ((64 * 1024) + sizeof(struct sk_buff))); 1163 (2 * ((64 * 1024) + sizeof(struct sk_buff)));
1167 1164
1165 /*
1166 * Speedup sock_wfree()
1167 */
1168 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1168 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; 1169 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
1169 } 1170 }
1170 1171
@@ -1209,7 +1210,3 @@ int __init icmp_init(void)
1209{ 1210{
1210 return register_pernet_subsys(&icmp_sk_ops); 1211 return register_pernet_subsys(&icmp_sk_ops);
1211} 1212}
1212
1213EXPORT_SYMBOL(icmp_err_convert);
1214EXPORT_SYMBOL(icmp_send);
1215EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 01b4284ed694..e0e77e297de3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -71,6 +71,7 @@
71 */ 71 */
72 72
73#include <linux/module.h> 73#include <linux/module.h>
74#include <linux/slab.h>
74#include <asm/uaccess.h> 75#include <asm/uaccess.h>
75#include <asm/system.h> 76#include <asm/system.h>
76#include <linux/types.h> 77#include <linux/types.h>
@@ -148,21 +149,37 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc);
148static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, 149static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
149 int sfcount, __be32 *psfsrc, int delta); 150 int sfcount, __be32 *psfsrc, int delta);
150 151
152
153static void ip_mc_list_reclaim(struct rcu_head *head)
154{
155 kfree(container_of(head, struct ip_mc_list, rcu));
156}
157
151static void ip_ma_put(struct ip_mc_list *im) 158static void ip_ma_put(struct ip_mc_list *im)
152{ 159{
153 if (atomic_dec_and_test(&im->refcnt)) { 160 if (atomic_dec_and_test(&im->refcnt)) {
154 in_dev_put(im->interface); 161 in_dev_put(im->interface);
155 kfree(im); 162 call_rcu(&im->rcu, ip_mc_list_reclaim);
156 } 163 }
157} 164}
158 165
166#define for_each_pmc_rcu(in_dev, pmc) \
167 for (pmc = rcu_dereference(in_dev->mc_list); \
168 pmc != NULL; \
169 pmc = rcu_dereference(pmc->next_rcu))
170
171#define for_each_pmc_rtnl(in_dev, pmc) \
172 for (pmc = rtnl_dereference(in_dev->mc_list); \
173 pmc != NULL; \
174 pmc = rtnl_dereference(pmc->next_rcu))
175
159#ifdef CONFIG_IP_MULTICAST 176#ifdef CONFIG_IP_MULTICAST
160 177
161/* 178/*
162 * Timer management 179 * Timer management
163 */ 180 */
164 181
165static __inline__ void igmp_stop_timer(struct ip_mc_list *im) 182static void igmp_stop_timer(struct ip_mc_list *im)
166{ 183{
167 spin_lock_bh(&im->lock); 184 spin_lock_bh(&im->lock);
168 if (del_timer(&im->timer)) 185 if (del_timer(&im->timer))
@@ -283,6 +300,8 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
283 return scount; 300 return scount;
284} 301}
285 302
303#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
304
286static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) 305static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
287{ 306{
288 struct sk_buff *skb; 307 struct sk_buff *skb;
@@ -291,14 +310,20 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
291 struct igmpv3_report *pig; 310 struct igmpv3_report *pig;
292 struct net *net = dev_net(dev); 311 struct net *net = dev_net(dev);
293 312
294 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); 313 while (1) {
295 if (skb == NULL) 314 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
296 return NULL; 315 GFP_ATOMIC | __GFP_NOWARN);
316 if (skb)
317 break;
318 size >>= 1;
319 if (size < 256)
320 return NULL;
321 }
322 igmp_skb_size(skb) = size;
297 323
298 { 324 {
299 struct flowi fl = { .oif = dev->ifindex, 325 struct flowi fl = { .oif = dev->ifindex,
300 .nl_u = { .ip4_u = { 326 .fl4_dst = IGMPV3_ALL_MCR,
301 .daddr = IGMPV3_ALL_MCR } },
302 .proto = IPPROTO_IGMP }; 327 .proto = IPPROTO_IGMP };
303 if (ip_route_output_key(net, &rt, &fl)) { 328 if (ip_route_output_key(net, &rt, &fl)) {
304 kfree_skb(skb); 329 kfree_skb(skb);
@@ -311,7 +336,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
311 return NULL; 336 return NULL;
312 } 337 }
313 338
314 skb_dst_set(skb, &rt->u.dst); 339 skb_dst_set(skb, &rt->dst);
315 skb->dev = dev; 340 skb->dev = dev;
316 341
317 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 342 skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -329,7 +354,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
329 pip->saddr = rt->rt_src; 354 pip->saddr = rt->rt_src;
330 pip->protocol = IPPROTO_IGMP; 355 pip->protocol = IPPROTO_IGMP;
331 pip->tot_len = 0; /* filled in later */ 356 pip->tot_len = 0; /* filled in later */
332 ip_select_ident(pip, &rt->u.dst, NULL); 357 ip_select_ident(pip, &rt->dst, NULL);
333 ((u8*)&pip[1])[0] = IPOPT_RA; 358 ((u8*)&pip[1])[0] = IPOPT_RA;
334 ((u8*)&pip[1])[1] = 4; 359 ((u8*)&pip[1])[1] = 4;
335 ((u8*)&pip[1])[2] = 0; 360 ((u8*)&pip[1])[2] = 0;
@@ -383,7 +408,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
383 return skb; 408 return skb;
384} 409}
385 410
386#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ 411#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
387 skb_tailroom(skb)) : 0) 412 skb_tailroom(skb)) : 0)
388 413
389static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, 414static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -501,8 +526,8 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
501 int type; 526 int type;
502 527
503 if (!pmc) { 528 if (!pmc) {
504 read_lock(&in_dev->mc_list_lock); 529 rcu_read_lock();
505 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 530 for_each_pmc_rcu(in_dev, pmc) {
506 if (pmc->multiaddr == IGMP_ALL_HOSTS) 531 if (pmc->multiaddr == IGMP_ALL_HOSTS)
507 continue; 532 continue;
508 spin_lock_bh(&pmc->lock); 533 spin_lock_bh(&pmc->lock);
@@ -513,7 +538,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
513 skb = add_grec(skb, pmc, type, 0, 0); 538 skb = add_grec(skb, pmc, type, 0, 0);
514 spin_unlock_bh(&pmc->lock); 539 spin_unlock_bh(&pmc->lock);
515 } 540 }
516 read_unlock(&in_dev->mc_list_lock); 541 rcu_read_unlock();
517 } else { 542 } else {
518 spin_lock_bh(&pmc->lock); 543 spin_lock_bh(&pmc->lock);
519 if (pmc->sfcount[MCAST_EXCLUDE]) 544 if (pmc->sfcount[MCAST_EXCLUDE])
@@ -555,7 +580,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
555 struct sk_buff *skb = NULL; 580 struct sk_buff *skb = NULL;
556 int type, dtype; 581 int type, dtype;
557 582
558 read_lock(&in_dev->mc_list_lock); 583 rcu_read_lock();
559 spin_lock_bh(&in_dev->mc_tomb_lock); 584 spin_lock_bh(&in_dev->mc_tomb_lock);
560 585
561 /* deleted MCA's */ 586 /* deleted MCA's */
@@ -592,7 +617,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
592 spin_unlock_bh(&in_dev->mc_tomb_lock); 617 spin_unlock_bh(&in_dev->mc_tomb_lock);
593 618
594 /* change recs */ 619 /* change recs */
595 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 620 for_each_pmc_rcu(in_dev, pmc) {
596 spin_lock_bh(&pmc->lock); 621 spin_lock_bh(&pmc->lock);
597 if (pmc->sfcount[MCAST_EXCLUDE]) { 622 if (pmc->sfcount[MCAST_EXCLUDE]) {
598 type = IGMPV3_BLOCK_OLD_SOURCES; 623 type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -615,7 +640,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
615 } 640 }
616 spin_unlock_bh(&pmc->lock); 641 spin_unlock_bh(&pmc->lock);
617 } 642 }
618 read_unlock(&in_dev->mc_list_lock); 643 rcu_read_unlock();
619 644
620 if (!skb) 645 if (!skb)
621 return; 646 return;
@@ -643,7 +668,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
643 668
644 { 669 {
645 struct flowi fl = { .oif = dev->ifindex, 670 struct flowi fl = { .oif = dev->ifindex,
646 .nl_u = { .ip4_u = { .daddr = dst } }, 671 .fl4_dst = dst,
647 .proto = IPPROTO_IGMP }; 672 .proto = IPPROTO_IGMP };
648 if (ip_route_output_key(net, &rt, &fl)) 673 if (ip_route_output_key(net, &rt, &fl))
649 return -1; 674 return -1;
@@ -659,7 +684,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
659 return -1; 684 return -1;
660 } 685 }
661 686
662 skb_dst_set(skb, &rt->u.dst); 687 skb_dst_set(skb, &rt->dst);
663 688
664 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 689 skb_reserve(skb, LL_RESERVED_SPACE(dev));
665 690
@@ -675,7 +700,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
675 iph->daddr = dst; 700 iph->daddr = dst;
676 iph->saddr = rt->rt_src; 701 iph->saddr = rt->rt_src;
677 iph->protocol = IPPROTO_IGMP; 702 iph->protocol = IPPROTO_IGMP;
678 ip_select_ident(iph, &rt->u.dst, NULL); 703 ip_select_ident(iph, &rt->dst, NULL);
679 ((u8*)&iph[1])[0] = IPOPT_RA; 704 ((u8*)&iph[1])[0] = IPOPT_RA;
680 ((u8*)&iph[1])[1] = 4; 705 ((u8*)&iph[1])[1] = 4;
681 ((u8*)&iph[1])[2] = 0; 706 ((u8*)&iph[1])[2] = 0;
@@ -812,14 +837,14 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
812 if (group == IGMP_ALL_HOSTS) 837 if (group == IGMP_ALL_HOSTS)
813 return; 838 return;
814 839
815 read_lock(&in_dev->mc_list_lock); 840 rcu_read_lock();
816 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 841 for_each_pmc_rcu(in_dev, im) {
817 if (im->multiaddr == group) { 842 if (im->multiaddr == group) {
818 igmp_stop_timer(im); 843 igmp_stop_timer(im);
819 break; 844 break;
820 } 845 }
821 } 846 }
822 read_unlock(&in_dev->mc_list_lock); 847 rcu_read_unlock();
823} 848}
824 849
825static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, 850static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
@@ -855,6 +880,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
855 igmpv3_clear_delrec(in_dev); 880 igmpv3_clear_delrec(in_dev);
856 } else if (len < 12) { 881 } else if (len < 12) {
857 return; /* ignore bogus packet; freed by caller */ 882 return; /* ignore bogus packet; freed by caller */
883 } else if (IGMP_V1_SEEN(in_dev)) {
884 /* This is a v3 query with v1 queriers present */
885 max_delay = IGMP_Query_Response_Interval;
886 group = 0;
887 } else if (IGMP_V2_SEEN(in_dev)) {
888 /* this is a v3 query with v2 queriers present;
889 * Interpretation of the max_delay code is problematic here.
890 * A real v2 host would use ih_code directly, while v3 has a
891 * different encoding. We use the v3 encoding as more likely
892 * to be intended in a v3 query.
893 */
894 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
858 } else { /* v3 */ 895 } else { /* v3 */
859 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 896 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
860 return; 897 return;
@@ -893,8 +930,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
893 * - Use the igmp->igmp_code field as the maximum 930 * - Use the igmp->igmp_code field as the maximum
894 * delay possible 931 * delay possible
895 */ 932 */
896 read_lock(&in_dev->mc_list_lock); 933 rcu_read_lock();
897 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 934 for_each_pmc_rcu(in_dev, im) {
898 int changed; 935 int changed;
899 936
900 if (group && group != im->multiaddr) 937 if (group && group != im->multiaddr)
@@ -912,21 +949,22 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
912 if (changed) 949 if (changed)
913 igmp_mod_timer(im, max_delay); 950 igmp_mod_timer(im, max_delay);
914 } 951 }
915 read_unlock(&in_dev->mc_list_lock); 952 rcu_read_unlock();
916} 953}
917 954
955/* called in rcu_read_lock() section */
918int igmp_rcv(struct sk_buff *skb) 956int igmp_rcv(struct sk_buff *skb)
919{ 957{
920 /* This basically follows the spec line by line -- see RFC1112 */ 958 /* This basically follows the spec line by line -- see RFC1112 */
921 struct igmphdr *ih; 959 struct igmphdr *ih;
922 struct in_device *in_dev = in_dev_get(skb->dev); 960 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
923 int len = skb->len; 961 int len = skb->len;
924 962
925 if (in_dev == NULL) 963 if (in_dev == NULL)
926 goto drop; 964 goto drop;
927 965
928 if (!pskb_may_pull(skb, sizeof(struct igmphdr))) 966 if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
929 goto drop_ref; 967 goto drop;
930 968
931 switch (skb->ip_summed) { 969 switch (skb->ip_summed) {
932 case CHECKSUM_COMPLETE: 970 case CHECKSUM_COMPLETE:
@@ -936,7 +974,7 @@ int igmp_rcv(struct sk_buff *skb)
936 case CHECKSUM_NONE: 974 case CHECKSUM_NONE:
937 skb->csum = 0; 975 skb->csum = 0;
938 if (__skb_checksum_complete(skb)) 976 if (__skb_checksum_complete(skb))
939 goto drop_ref; 977 goto drop;
940 } 978 }
941 979
942 ih = igmp_hdr(skb); 980 ih = igmp_hdr(skb);
@@ -946,9 +984,8 @@ int igmp_rcv(struct sk_buff *skb)
946 break; 984 break;
947 case IGMP_HOST_MEMBERSHIP_REPORT: 985 case IGMP_HOST_MEMBERSHIP_REPORT:
948 case IGMPV2_HOST_MEMBERSHIP_REPORT: 986 case IGMPV2_HOST_MEMBERSHIP_REPORT:
949 case IGMPV3_HOST_MEMBERSHIP_REPORT:
950 /* Is it our report looped back? */ 987 /* Is it our report looped back? */
951 if (skb_rtable(skb)->fl.iif == 0) 988 if (rt_is_output_route(skb_rtable(skb)))
952 break; 989 break;
953 /* don't rely on MC router hearing unicast reports */ 990 /* don't rely on MC router hearing unicast reports */
954 if (skb->pkt_type == PACKET_MULTICAST || 991 if (skb->pkt_type == PACKET_MULTICAST ||
@@ -957,9 +994,9 @@ int igmp_rcv(struct sk_buff *skb)
957 break; 994 break;
958 case IGMP_PIM: 995 case IGMP_PIM:
959#ifdef CONFIG_IP_PIMSM_V1 996#ifdef CONFIG_IP_PIMSM_V1
960 in_dev_put(in_dev);
961 return pim_rcv_v1(skb); 997 return pim_rcv_v1(skb);
962#endif 998#endif
999 case IGMPV3_HOST_MEMBERSHIP_REPORT:
963 case IGMP_DVMRP: 1000 case IGMP_DVMRP:
964 case IGMP_TRACE: 1001 case IGMP_TRACE:
965 case IGMP_HOST_LEAVE_MESSAGE: 1002 case IGMP_HOST_LEAVE_MESSAGE:
@@ -970,8 +1007,6 @@ int igmp_rcv(struct sk_buff *skb)
970 break; 1007 break;
971 } 1008 }
972 1009
973drop_ref:
974 in_dev_put(in_dev);
975drop: 1010drop:
976 kfree_skb(skb); 1011 kfree_skb(skb);
977 return 0; 1012 return 0;
@@ -997,7 +1032,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
997 --ANK 1032 --ANK
998 */ 1033 */
999 if (arp_mc_map(addr, buf, dev, 0) == 0) 1034 if (arp_mc_map(addr, buf, dev, 0) == 0)
1000 dev_mc_add(dev, buf, dev->addr_len, 0); 1035 dev_mc_add(dev, buf);
1001} 1036}
1002 1037
1003/* 1038/*
@@ -1010,7 +1045,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
1010 struct net_device *dev = in_dev->dev; 1045 struct net_device *dev = in_dev->dev;
1011 1046
1012 if (arp_mc_map(addr, buf, dev, 0) == 0) 1047 if (arp_mc_map(addr, buf, dev, 0) == 0)
1013 dev_mc_delete(dev, buf, dev->addr_len, 0); 1048 dev_mc_del(dev, buf);
1014} 1049}
1015 1050
1016#ifdef CONFIG_IP_MULTICAST 1051#ifdef CONFIG_IP_MULTICAST
@@ -1099,8 +1134,8 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1099 kfree(pmc); 1134 kfree(pmc);
1100 } 1135 }
1101 /* clear dead sources, too */ 1136 /* clear dead sources, too */
1102 read_lock(&in_dev->mc_list_lock); 1137 rcu_read_lock();
1103 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1138 for_each_pmc_rcu(in_dev, pmc) {
1104 struct ip_sf_list *psf, *psf_next; 1139 struct ip_sf_list *psf, *psf_next;
1105 1140
1106 spin_lock_bh(&pmc->lock); 1141 spin_lock_bh(&pmc->lock);
@@ -1112,7 +1147,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1112 kfree(psf); 1147 kfree(psf);
1113 } 1148 }
1114 } 1149 }
1115 read_unlock(&in_dev->mc_list_lock); 1150 rcu_read_unlock();
1116} 1151}
1117#endif 1152#endif
1118 1153
@@ -1198,7 +1233,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1198 1233
1199 ASSERT_RTNL(); 1234 ASSERT_RTNL();
1200 1235
1201 for (im=in_dev->mc_list; im; im=im->next) { 1236 for_each_pmc_rtnl(in_dev, im) {
1202 if (im->multiaddr == addr) { 1237 if (im->multiaddr == addr) {
1203 im->users++; 1238 im->users++;
1204 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); 1239 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
@@ -1206,7 +1241,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1206 } 1241 }
1207 } 1242 }
1208 1243
1209 im = kmalloc(sizeof(*im), GFP_KERNEL); 1244 im = kzalloc(sizeof(*im), GFP_KERNEL);
1210 if (!im) 1245 if (!im)
1211 goto out; 1246 goto out;
1212 1247
@@ -1216,26 +1251,18 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1216 im->multiaddr = addr; 1251 im->multiaddr = addr;
1217 /* initial mode is (EX, empty) */ 1252 /* initial mode is (EX, empty) */
1218 im->sfmode = MCAST_EXCLUDE; 1253 im->sfmode = MCAST_EXCLUDE;
1219 im->sfcount[MCAST_INCLUDE] = 0;
1220 im->sfcount[MCAST_EXCLUDE] = 1; 1254 im->sfcount[MCAST_EXCLUDE] = 1;
1221 im->sources = NULL;
1222 im->tomb = NULL;
1223 im->crcount = 0;
1224 atomic_set(&im->refcnt, 1); 1255 atomic_set(&im->refcnt, 1);
1225 spin_lock_init(&im->lock); 1256 spin_lock_init(&im->lock);
1226#ifdef CONFIG_IP_MULTICAST 1257#ifdef CONFIG_IP_MULTICAST
1227 im->tm_running = 0;
1228 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); 1258 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
1229 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1259 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1230 im->reporter = 0;
1231 im->gsquery = 0;
1232#endif 1260#endif
1233 im->loaded = 0; 1261
1234 write_lock_bh(&in_dev->mc_list_lock); 1262 im->next_rcu = in_dev->mc_list;
1235 im->next = in_dev->mc_list;
1236 in_dev->mc_list = im;
1237 in_dev->mc_count++; 1263 in_dev->mc_count++;
1238 write_unlock_bh(&in_dev->mc_list_lock); 1264 rcu_assign_pointer(in_dev->mc_list, im);
1265
1239#ifdef CONFIG_IP_MULTICAST 1266#ifdef CONFIG_IP_MULTICAST
1240 igmpv3_del_delrec(in_dev, im->multiaddr); 1267 igmpv3_del_delrec(in_dev, im->multiaddr);
1241#endif 1268#endif
@@ -1245,28 +1272,36 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1245out: 1272out:
1246 return; 1273 return;
1247} 1274}
1275EXPORT_SYMBOL(ip_mc_inc_group);
1248 1276
1249/* 1277/*
1250 * Resend IGMP JOIN report; used for bonding. 1278 * Resend IGMP JOIN report; used for bonding.
1279 * Called with rcu_read_lock()
1251 */ 1280 */
1252void ip_mc_rejoin_group(struct ip_mc_list *im) 1281void ip_mc_rejoin_groups(struct in_device *in_dev)
1253{ 1282{
1254#ifdef CONFIG_IP_MULTICAST 1283#ifdef CONFIG_IP_MULTICAST
1255 struct in_device *in_dev = im->interface; 1284 struct ip_mc_list *im;
1285 int type;
1256 1286
1257 if (im->multiaddr == IGMP_ALL_HOSTS) 1287 for_each_pmc_rcu(in_dev, im) {
1258 return; 1288 if (im->multiaddr == IGMP_ALL_HOSTS)
1289 continue;
1259 1290
1260 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { 1291 /* a failover is happening and switches
1261 igmp_mod_timer(im, IGMP_Initial_Report_Delay); 1292 * must be notified immediately
1262 return; 1293 */
1294 if (IGMP_V1_SEEN(in_dev))
1295 type = IGMP_HOST_MEMBERSHIP_REPORT;
1296 else if (IGMP_V2_SEEN(in_dev))
1297 type = IGMPV2_HOST_MEMBERSHIP_REPORT;
1298 else
1299 type = IGMPV3_HOST_MEMBERSHIP_REPORT;
1300 igmp_send_report(in_dev, im, type);
1263 } 1301 }
1264 /* else, v3 */
1265 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1266 IGMP_Unsolicited_Report_Count;
1267 igmp_ifc_event(in_dev);
1268#endif 1302#endif
1269} 1303}
1304EXPORT_SYMBOL(ip_mc_rejoin_groups);
1270 1305
1271/* 1306/*
1272 * A socket has left a multicast group on device dev 1307 * A socket has left a multicast group on device dev
@@ -1274,17 +1309,18 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
1274 1309
1275void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) 1310void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
1276{ 1311{
1277 struct ip_mc_list *i, **ip; 1312 struct ip_mc_list *i;
1313 struct ip_mc_list __rcu **ip;
1278 1314
1279 ASSERT_RTNL(); 1315 ASSERT_RTNL();
1280 1316
1281 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { 1317 for (ip = &in_dev->mc_list;
1318 (i = rtnl_dereference(*ip)) != NULL;
1319 ip = &i->next_rcu) {
1282 if (i->multiaddr == addr) { 1320 if (i->multiaddr == addr) {
1283 if (--i->users == 0) { 1321 if (--i->users == 0) {
1284 write_lock_bh(&in_dev->mc_list_lock); 1322 *ip = i->next_rcu;
1285 *ip = i->next;
1286 in_dev->mc_count--; 1323 in_dev->mc_count--;
1287 write_unlock_bh(&in_dev->mc_list_lock);
1288 igmp_group_dropped(i); 1324 igmp_group_dropped(i);
1289 1325
1290 if (!in_dev->dead) 1326 if (!in_dev->dead)
@@ -1297,17 +1333,40 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
1297 } 1333 }
1298 } 1334 }
1299} 1335}
1336EXPORT_SYMBOL(ip_mc_dec_group);
1337
1338/* Device changing type */
1339
1340void ip_mc_unmap(struct in_device *in_dev)
1341{
1342 struct ip_mc_list *pmc;
1343
1344 ASSERT_RTNL();
1345
1346 for_each_pmc_rtnl(in_dev, pmc)
1347 igmp_group_dropped(pmc);
1348}
1349
1350void ip_mc_remap(struct in_device *in_dev)
1351{
1352 struct ip_mc_list *pmc;
1353
1354 ASSERT_RTNL();
1355
1356 for_each_pmc_rtnl(in_dev, pmc)
1357 igmp_group_added(pmc);
1358}
1300 1359
1301/* Device going down */ 1360/* Device going down */
1302 1361
1303void ip_mc_down(struct in_device *in_dev) 1362void ip_mc_down(struct in_device *in_dev)
1304{ 1363{
1305 struct ip_mc_list *i; 1364 struct ip_mc_list *pmc;
1306 1365
1307 ASSERT_RTNL(); 1366 ASSERT_RTNL();
1308 1367
1309 for (i=in_dev->mc_list; i; i=i->next) 1368 for_each_pmc_rtnl(in_dev, pmc)
1310 igmp_group_dropped(i); 1369 igmp_group_dropped(pmc);
1311 1370
1312#ifdef CONFIG_IP_MULTICAST 1371#ifdef CONFIG_IP_MULTICAST
1313 in_dev->mr_ifc_count = 0; 1372 in_dev->mr_ifc_count = 0;
@@ -1338,7 +1397,6 @@ void ip_mc_init_dev(struct in_device *in_dev)
1338 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; 1397 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
1339#endif 1398#endif
1340 1399
1341 rwlock_init(&in_dev->mc_list_lock);
1342 spin_lock_init(&in_dev->mc_tomb_lock); 1400 spin_lock_init(&in_dev->mc_tomb_lock);
1343} 1401}
1344 1402
@@ -1346,14 +1404,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
1346 1404
1347void ip_mc_up(struct in_device *in_dev) 1405void ip_mc_up(struct in_device *in_dev)
1348{ 1406{
1349 struct ip_mc_list *i; 1407 struct ip_mc_list *pmc;
1350 1408
1351 ASSERT_RTNL(); 1409 ASSERT_RTNL();
1352 1410
1353 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); 1411 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
1354 1412
1355 for (i=in_dev->mc_list; i; i=i->next) 1413 for_each_pmc_rtnl(in_dev, pmc)
1356 igmp_group_added(i); 1414 igmp_group_added(pmc);
1357} 1415}
1358 1416
1359/* 1417/*
@@ -1369,42 +1427,35 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1369 /* Deactivate timers */ 1427 /* Deactivate timers */
1370 ip_mc_down(in_dev); 1428 ip_mc_down(in_dev);
1371 1429
1372 write_lock_bh(&in_dev->mc_list_lock); 1430 while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
1373 while ((i = in_dev->mc_list) != NULL) { 1431 in_dev->mc_list = i->next_rcu;
1374 in_dev->mc_list = i->next;
1375 in_dev->mc_count--; 1432 in_dev->mc_count--;
1376 write_unlock_bh(&in_dev->mc_list_lock); 1433
1377 igmp_group_dropped(i); 1434 igmp_group_dropped(i);
1378 ip_ma_put(i); 1435 ip_ma_put(i);
1379
1380 write_lock_bh(&in_dev->mc_list_lock);
1381 } 1436 }
1382 write_unlock_bh(&in_dev->mc_list_lock);
1383} 1437}
1384 1438
1439/* RTNL is locked */
1385static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1440static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1386{ 1441{
1387 struct flowi fl = { .nl_u = { .ip4_u = 1442 struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
1388 { .daddr = imr->imr_multiaddr.s_addr } } };
1389 struct rtable *rt; 1443 struct rtable *rt;
1390 struct net_device *dev = NULL; 1444 struct net_device *dev = NULL;
1391 struct in_device *idev = NULL; 1445 struct in_device *idev = NULL;
1392 1446
1393 if (imr->imr_ifindex) { 1447 if (imr->imr_ifindex) {
1394 idev = inetdev_by_index(net, imr->imr_ifindex); 1448 idev = inetdev_by_index(net, imr->imr_ifindex);
1395 if (idev)
1396 __in_dev_put(idev);
1397 return idev; 1449 return idev;
1398 } 1450 }
1399 if (imr->imr_address.s_addr) { 1451 if (imr->imr_address.s_addr) {
1400 dev = ip_dev_find(net, imr->imr_address.s_addr); 1452 dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
1401 if (!dev) 1453 if (!dev)
1402 return NULL; 1454 return NULL;
1403 dev_put(dev);
1404 } 1455 }
1405 1456
1406 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1457 if (!dev && !ip_route_output_key(net, &rt, &fl)) {
1407 dev = rt->u.dst.dev; 1458 dev = rt->dst.dev;
1408 ip_rt_put(rt); 1459 ip_rt_put(rt);
1409 } 1460 }
1410 if (dev) { 1461 if (dev) {
@@ -1479,18 +1530,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1479 1530
1480 if (!in_dev) 1531 if (!in_dev)
1481 return -ENODEV; 1532 return -ENODEV;
1482 read_lock(&in_dev->mc_list_lock); 1533 rcu_read_lock();
1483 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1534 for_each_pmc_rcu(in_dev, pmc) {
1484 if (*pmca == pmc->multiaddr) 1535 if (*pmca == pmc->multiaddr)
1485 break; 1536 break;
1486 } 1537 }
1487 if (!pmc) { 1538 if (!pmc) {
1488 /* MCA not found?? bug */ 1539 /* MCA not found?? bug */
1489 read_unlock(&in_dev->mc_list_lock); 1540 rcu_read_unlock();
1490 return -ESRCH; 1541 return -ESRCH;
1491 } 1542 }
1492 spin_lock_bh(&pmc->lock); 1543 spin_lock_bh(&pmc->lock);
1493 read_unlock(&in_dev->mc_list_lock); 1544 rcu_read_unlock();
1494#ifdef CONFIG_IP_MULTICAST 1545#ifdef CONFIG_IP_MULTICAST
1495 sf_markstate(pmc); 1546 sf_markstate(pmc);
1496#endif 1547#endif
@@ -1623,8 +1674,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
1623 if (dpsf->sf_inaddr == psf->sf_inaddr) 1674 if (dpsf->sf_inaddr == psf->sf_inaddr)
1624 break; 1675 break;
1625 if (!dpsf) { 1676 if (!dpsf) {
1626 dpsf = (struct ip_sf_list *) 1677 dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
1627 kmalloc(sizeof(*dpsf), GFP_ATOMIC);
1628 if (!dpsf) 1678 if (!dpsf)
1629 continue; 1679 continue;
1630 *dpsf = *psf; 1680 *dpsf = *psf;
@@ -1652,18 +1702,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1652 1702
1653 if (!in_dev) 1703 if (!in_dev)
1654 return -ENODEV; 1704 return -ENODEV;
1655 read_lock(&in_dev->mc_list_lock); 1705 rcu_read_lock();
1656 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1706 for_each_pmc_rcu(in_dev, pmc) {
1657 if (*pmca == pmc->multiaddr) 1707 if (*pmca == pmc->multiaddr)
1658 break; 1708 break;
1659 } 1709 }
1660 if (!pmc) { 1710 if (!pmc) {
1661 /* MCA not found?? bug */ 1711 /* MCA not found?? bug */
1662 read_unlock(&in_dev->mc_list_lock); 1712 rcu_read_unlock();
1663 return -ESRCH; 1713 return -ESRCH;
1664 } 1714 }
1665 spin_lock_bh(&pmc->lock); 1715 spin_lock_bh(&pmc->lock);
1666 read_unlock(&in_dev->mc_list_lock); 1716 rcu_read_unlock();
1667 1717
1668#ifdef CONFIG_IP_MULTICAST 1718#ifdef CONFIG_IP_MULTICAST
1669 sf_markstate(pmc); 1719 sf_markstate(pmc);
@@ -1760,7 +1810,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1760 1810
1761 err = -EADDRINUSE; 1811 err = -EADDRINUSE;
1762 ifindex = imr->imr_ifindex; 1812 ifindex = imr->imr_ifindex;
1763 for (i = inet->mc_list; i; i = i->next) { 1813 for_each_pmc_rtnl(inet, i) {
1764 if (i->multi.imr_multiaddr.s_addr == addr && 1814 if (i->multi.imr_multiaddr.s_addr == addr &&
1765 i->multi.imr_ifindex == ifindex) 1815 i->multi.imr_ifindex == ifindex)
1766 goto done; 1816 goto done;
@@ -1774,35 +1824,52 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1774 goto done; 1824 goto done;
1775 1825
1776 memcpy(&iml->multi, imr, sizeof(*imr)); 1826 memcpy(&iml->multi, imr, sizeof(*imr));
1777 iml->next = inet->mc_list; 1827 iml->next_rcu = inet->mc_list;
1778 iml->sflist = NULL; 1828 iml->sflist = NULL;
1779 iml->sfmode = MCAST_EXCLUDE; 1829 iml->sfmode = MCAST_EXCLUDE;
1780 inet->mc_list = iml; 1830 rcu_assign_pointer(inet->mc_list, iml);
1781 ip_mc_inc_group(in_dev, addr); 1831 ip_mc_inc_group(in_dev, addr);
1782 err = 0; 1832 err = 0;
1783done: 1833done:
1784 rtnl_unlock(); 1834 rtnl_unlock();
1785 return err; 1835 return err;
1786} 1836}
1837EXPORT_SYMBOL(ip_mc_join_group);
1838
1839static void ip_sf_socklist_reclaim(struct rcu_head *rp)
1840{
1841 kfree(container_of(rp, struct ip_sf_socklist, rcu));
1842 /* sk_omem_alloc should have been decreased by the caller*/
1843}
1787 1844
1788static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, 1845static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1789 struct in_device *in_dev) 1846 struct in_device *in_dev)
1790{ 1847{
1848 struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
1791 int err; 1849 int err;
1792 1850
1793 if (iml->sflist == NULL) { 1851 if (psf == NULL) {
1794 /* any-source empty exclude case */ 1852 /* any-source empty exclude case */
1795 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1853 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1796 iml->sfmode, 0, NULL, 0); 1854 iml->sfmode, 0, NULL, 0);
1797 } 1855 }
1798 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1856 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1799 iml->sfmode, iml->sflist->sl_count, 1857 iml->sfmode, psf->sl_count, psf->sl_addr, 0);
1800 iml->sflist->sl_addr, 0); 1858 rcu_assign_pointer(iml->sflist, NULL);
1801 sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); 1859 /* decrease mem now to avoid the memleak warning */
1802 iml->sflist = NULL; 1860 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
1861 call_rcu(&psf->rcu, ip_sf_socklist_reclaim);
1803 return err; 1862 return err;
1804} 1863}
1805 1864
1865
1866static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1867{
1868 kfree(container_of(rp, struct ip_mc_socklist, rcu));
1869 /* sk_omem_alloc should have been decreased by the caller*/
1870}
1871
1872
1806/* 1873/*
1807 * Ask a socket to leave a group. 1874 * Ask a socket to leave a group.
1808 */ 1875 */
@@ -1810,7 +1877,8 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1810int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) 1877int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1811{ 1878{
1812 struct inet_sock *inet = inet_sk(sk); 1879 struct inet_sock *inet = inet_sk(sk);
1813 struct ip_mc_socklist *iml, **imlp; 1880 struct ip_mc_socklist *iml;
1881 struct ip_mc_socklist __rcu **imlp;
1814 struct in_device *in_dev; 1882 struct in_device *in_dev;
1815 struct net *net = sock_net(sk); 1883 struct net *net = sock_net(sk);
1816 __be32 group = imr->imr_multiaddr.s_addr; 1884 __be32 group = imr->imr_multiaddr.s_addr;
@@ -1820,7 +1888,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1820 rtnl_lock(); 1888 rtnl_lock();
1821 in_dev = ip_mc_find_dev(net, imr); 1889 in_dev = ip_mc_find_dev(net, imr);
1822 ifindex = imr->imr_ifindex; 1890 ifindex = imr->imr_ifindex;
1823 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1891 for (imlp = &inet->mc_list;
1892 (iml = rtnl_dereference(*imlp)) != NULL;
1893 imlp = &iml->next_rcu) {
1824 if (iml->multi.imr_multiaddr.s_addr != group) 1894 if (iml->multi.imr_multiaddr.s_addr != group)
1825 continue; 1895 continue;
1826 if (ifindex) { 1896 if (ifindex) {
@@ -1832,12 +1902,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1832 1902
1833 (void) ip_mc_leave_src(sk, iml, in_dev); 1903 (void) ip_mc_leave_src(sk, iml, in_dev);
1834 1904
1835 *imlp = iml->next; 1905 *imlp = iml->next_rcu;
1836 1906
1837 if (in_dev) 1907 if (in_dev)
1838 ip_mc_dec_group(in_dev, group); 1908 ip_mc_dec_group(in_dev, group);
1839 rtnl_unlock(); 1909 rtnl_unlock();
1840 sock_kfree_s(sk, iml, sizeof(*iml)); 1910 /* decrease mem now to avoid the memleak warning */
1911 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
1912 call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
1841 return 0; 1913 return 0;
1842 } 1914 }
1843 if (!in_dev) 1915 if (!in_dev)
@@ -1876,9 +1948,10 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1876 } 1948 }
1877 err = -EADDRNOTAVAIL; 1949 err = -EADDRNOTAVAIL;
1878 1950
1879 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1951 for_each_pmc_rtnl(inet, pmc) {
1880 if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr 1952 if ((pmc->multi.imr_multiaddr.s_addr ==
1881 && pmc->multi.imr_ifindex == imr.imr_ifindex) 1953 imr.imr_multiaddr.s_addr) &&
1954 (pmc->multi.imr_ifindex == imr.imr_ifindex))
1882 break; 1955 break;
1883 } 1956 }
1884 if (!pmc) { /* must have a prior join */ 1957 if (!pmc) { /* must have a prior join */
@@ -1899,7 +1972,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1899 pmc->sfmode = omode; 1972 pmc->sfmode = omode;
1900 } 1973 }
1901 1974
1902 psl = pmc->sflist; 1975 psl = rtnl_dereference(pmc->sflist);
1903 if (!add) { 1976 if (!add) {
1904 if (!psl) 1977 if (!psl)
1905 goto done; /* err = -EADDRNOTAVAIL */ 1978 goto done; /* err = -EADDRNOTAVAIL */
@@ -1951,9 +2024,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1951 if (psl) { 2024 if (psl) {
1952 for (i=0; i<psl->sl_count; i++) 2025 for (i=0; i<psl->sl_count; i++)
1953 newpsl->sl_addr[i] = psl->sl_addr[i]; 2026 newpsl->sl_addr[i] = psl->sl_addr[i];
1954 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); 2027 /* decrease mem now to avoid the memleak warning */
2028 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2029 call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
1955 } 2030 }
1956 pmc->sflist = psl = newpsl; 2031 rcu_assign_pointer(pmc->sflist, newpsl);
2032 psl = newpsl;
1957 } 2033 }
1958 rv = 1; /* > 0 for insert logic below if sl_count is 0 */ 2034 rv = 1; /* > 0 for insert logic below if sl_count is 0 */
1959 for (i=0; i<psl->sl_count; i++) { 2035 for (i=0; i<psl->sl_count; i++) {
@@ -2015,7 +2091,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2015 goto done; 2091 goto done;
2016 } 2092 }
2017 2093
2018 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2094 for_each_pmc_rtnl(inet, pmc) {
2019 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2095 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2020 pmc->multi.imr_ifindex == imr.imr_ifindex) 2096 pmc->multi.imr_ifindex == imr.imr_ifindex)
2021 break; 2097 break;
@@ -2045,15 +2121,17 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2045 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, 2121 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
2046 msf->imsf_fmode, 0, NULL, 0); 2122 msf->imsf_fmode, 0, NULL, 0);
2047 } 2123 }
2048 psl = pmc->sflist; 2124 psl = rtnl_dereference(pmc->sflist);
2049 if (psl) { 2125 if (psl) {
2050 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2126 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2051 psl->sl_count, psl->sl_addr, 0); 2127 psl->sl_count, psl->sl_addr, 0);
2052 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); 2128 /* decrease mem now to avoid the memleak warning */
2129 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2130 call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
2053 } else 2131 } else
2054 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2132 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2055 0, NULL, 0); 2133 0, NULL, 0);
2056 pmc->sflist = newpsl; 2134 rcu_assign_pointer(pmc->sflist, newpsl);
2057 pmc->sfmode = msf->imsf_fmode; 2135 pmc->sfmode = msf->imsf_fmode;
2058 err = 0; 2136 err = 0;
2059done: 2137done:
@@ -2091,7 +2169,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2091 } 2169 }
2092 err = -EADDRNOTAVAIL; 2170 err = -EADDRNOTAVAIL;
2093 2171
2094 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2172 for_each_pmc_rtnl(inet, pmc) {
2095 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2173 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2096 pmc->multi.imr_ifindex == imr.imr_ifindex) 2174 pmc->multi.imr_ifindex == imr.imr_ifindex)
2097 break; 2175 break;
@@ -2099,7 +2177,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2099 if (!pmc) /* must have a prior join */ 2177 if (!pmc) /* must have a prior join */
2100 goto done; 2178 goto done;
2101 msf->imsf_fmode = pmc->sfmode; 2179 msf->imsf_fmode = pmc->sfmode;
2102 psl = pmc->sflist; 2180 psl = rtnl_dereference(pmc->sflist);
2103 rtnl_unlock(); 2181 rtnl_unlock();
2104 if (!psl) { 2182 if (!psl) {
2105 len = 0; 2183 len = 0;
@@ -2144,7 +2222,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2144 2222
2145 err = -EADDRNOTAVAIL; 2223 err = -EADDRNOTAVAIL;
2146 2224
2147 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2225 for_each_pmc_rtnl(inet, pmc) {
2148 if (pmc->multi.imr_multiaddr.s_addr == addr && 2226 if (pmc->multi.imr_multiaddr.s_addr == addr &&
2149 pmc->multi.imr_ifindex == gsf->gf_interface) 2227 pmc->multi.imr_ifindex == gsf->gf_interface)
2150 break; 2228 break;
@@ -2152,7 +2230,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2152 if (!pmc) /* must have a prior join */ 2230 if (!pmc) /* must have a prior join */
2153 goto done; 2231 goto done;
2154 gsf->gf_fmode = pmc->sfmode; 2232 gsf->gf_fmode = pmc->sfmode;
2155 psl = pmc->sflist; 2233 psl = rtnl_dereference(pmc->sflist);
2156 rtnl_unlock(); 2234 rtnl_unlock();
2157 count = psl ? psl->sl_count : 0; 2235 count = psl ? psl->sl_count : 0;
2158 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; 2236 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
@@ -2186,30 +2264,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2186 struct ip_mc_socklist *pmc; 2264 struct ip_mc_socklist *pmc;
2187 struct ip_sf_socklist *psl; 2265 struct ip_sf_socklist *psl;
2188 int i; 2266 int i;
2267 int ret;
2189 2268
2269 ret = 1;
2190 if (!ipv4_is_multicast(loc_addr)) 2270 if (!ipv4_is_multicast(loc_addr))
2191 return 1; 2271 goto out;
2192 2272
2193 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2273 rcu_read_lock();
2274 for_each_pmc_rcu(inet, pmc) {
2194 if (pmc->multi.imr_multiaddr.s_addr == loc_addr && 2275 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2195 pmc->multi.imr_ifindex == dif) 2276 pmc->multi.imr_ifindex == dif)
2196 break; 2277 break;
2197 } 2278 }
2279 ret = inet->mc_all;
2198 if (!pmc) 2280 if (!pmc)
2199 return inet->mc_all; 2281 goto unlock;
2200 psl = pmc->sflist; 2282 psl = rcu_dereference(pmc->sflist);
2283 ret = (pmc->sfmode == MCAST_EXCLUDE);
2201 if (!psl) 2284 if (!psl)
2202 return pmc->sfmode == MCAST_EXCLUDE; 2285 goto unlock;
2203 2286
2204 for (i=0; i<psl->sl_count; i++) { 2287 for (i=0; i<psl->sl_count; i++) {
2205 if (psl->sl_addr[i] == rmt_addr) 2288 if (psl->sl_addr[i] == rmt_addr)
2206 break; 2289 break;
2207 } 2290 }
2291 ret = 0;
2208 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) 2292 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
2209 return 0; 2293 goto unlock;
2210 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) 2294 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
2211 return 0; 2295 goto unlock;
2212 return 1; 2296 ret = 1;
2297unlock:
2298 rcu_read_unlock();
2299out:
2300 return ret;
2213} 2301}
2214 2302
2215/* 2303/*
@@ -2226,17 +2314,17 @@ void ip_mc_drop_socket(struct sock *sk)
2226 return; 2314 return;
2227 2315
2228 rtnl_lock(); 2316 rtnl_lock();
2229 while ((iml = inet->mc_list) != NULL) { 2317 while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
2230 struct in_device *in_dev; 2318 struct in_device *in_dev;
2231 inet->mc_list = iml->next;
2232 2319
2320 inet->mc_list = iml->next_rcu;
2233 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); 2321 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
2234 (void) ip_mc_leave_src(sk, iml, in_dev); 2322 (void) ip_mc_leave_src(sk, iml, in_dev);
2235 if (in_dev != NULL) { 2323 if (in_dev != NULL)
2236 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); 2324 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2237 in_dev_put(in_dev); 2325 /* decrease mem now to avoid the memleak warning */
2238 } 2326 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
2239 sock_kfree_s(sk, iml, sizeof(*iml)); 2327 call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
2240 } 2328 }
2241 rtnl_unlock(); 2329 rtnl_unlock();
2242} 2330}
@@ -2247,8 +2335,8 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2247 struct ip_sf_list *psf; 2335 struct ip_sf_list *psf;
2248 int rv = 0; 2336 int rv = 0;
2249 2337
2250 read_lock(&in_dev->mc_list_lock); 2338 rcu_read_lock();
2251 for (im=in_dev->mc_list; im; im=im->next) { 2339 for_each_pmc_rcu(in_dev, im) {
2252 if (im->multiaddr == mc_addr) 2340 if (im->multiaddr == mc_addr)
2253 break; 2341 break;
2254 } 2342 }
@@ -2269,7 +2357,7 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2269 } else 2357 } else
2270 rv = 1; /* unspecified source; tentatively allow */ 2358 rv = 1; /* unspecified source; tentatively allow */
2271 } 2359 }
2272 read_unlock(&in_dev->mc_list_lock); 2360 rcu_read_unlock();
2273 return rv; 2361 return rv;
2274} 2362}
2275 2363
@@ -2289,19 +2377,17 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2289 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2377 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2290 2378
2291 state->in_dev = NULL; 2379 state->in_dev = NULL;
2292 for_each_netdev(net, state->dev) { 2380 for_each_netdev_rcu(net, state->dev) {
2293 struct in_device *in_dev; 2381 struct in_device *in_dev;
2294 in_dev = in_dev_get(state->dev); 2382
2383 in_dev = __in_dev_get_rcu(state->dev);
2295 if (!in_dev) 2384 if (!in_dev)
2296 continue; 2385 continue;
2297 read_lock(&in_dev->mc_list_lock); 2386 im = rcu_dereference(in_dev->mc_list);
2298 im = in_dev->mc_list;
2299 if (im) { 2387 if (im) {
2300 state->in_dev = in_dev; 2388 state->in_dev = in_dev;
2301 break; 2389 break;
2302 } 2390 }
2303 read_unlock(&in_dev->mc_list_lock);
2304 in_dev_put(in_dev);
2305 } 2391 }
2306 return im; 2392 return im;
2307} 2393}
@@ -2309,22 +2395,18 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2309static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) 2395static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
2310{ 2396{
2311 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2397 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2312 im = im->next; 2398
2399 im = rcu_dereference(im->next_rcu);
2313 while (!im) { 2400 while (!im) {
2314 if (likely(state->in_dev != NULL)) { 2401 state->dev = next_net_device_rcu(state->dev);
2315 read_unlock(&state->in_dev->mc_list_lock);
2316 in_dev_put(state->in_dev);
2317 }
2318 state->dev = next_net_device(state->dev);
2319 if (!state->dev) { 2402 if (!state->dev) {
2320 state->in_dev = NULL; 2403 state->in_dev = NULL;
2321 break; 2404 break;
2322 } 2405 }
2323 state->in_dev = in_dev_get(state->dev); 2406 state->in_dev = __in_dev_get_rcu(state->dev);
2324 if (!state->in_dev) 2407 if (!state->in_dev)
2325 continue; 2408 continue;
2326 read_lock(&state->in_dev->mc_list_lock); 2409 im = rcu_dereference(state->in_dev->mc_list);
2327 im = state->in_dev->mc_list;
2328 } 2410 }
2329 return im; 2411 return im;
2330} 2412}
@@ -2339,9 +2421,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
2339} 2421}
2340 2422
2341static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) 2423static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
2342 __acquires(dev_base_lock) 2424 __acquires(rcu)
2343{ 2425{
2344 read_lock(&dev_base_lock); 2426 rcu_read_lock();
2345 return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2427 return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2346} 2428}
2347 2429
@@ -2357,16 +2439,13 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2357} 2439}
2358 2440
2359static void igmp_mc_seq_stop(struct seq_file *seq, void *v) 2441static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
2360 __releases(dev_base_lock) 2442 __releases(rcu)
2361{ 2443{
2362 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2444 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2363 if (likely(state->in_dev != NULL)) { 2445
2364 read_unlock(&state->in_dev->mc_list_lock); 2446 state->in_dev = NULL;
2365 in_dev_put(state->in_dev);
2366 state->in_dev = NULL;
2367 }
2368 state->dev = NULL; 2447 state->dev = NULL;
2369 read_unlock(&dev_base_lock); 2448 rcu_read_unlock();
2370} 2449}
2371 2450
2372static int igmp_mc_seq_show(struct seq_file *seq, void *v) 2451static int igmp_mc_seq_show(struct seq_file *seq, void *v)
@@ -2386,7 +2465,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2386 querier = "NONE"; 2465 querier = "NONE";
2387#endif 2466#endif
2388 2467
2389 if (state->in_dev->mc_list == im) { 2468 if (rcu_dereference(state->in_dev->mc_list) == im) {
2390 seq_printf(seq, "%d\t%-10s: %5d %7s\n", 2469 seq_printf(seq, "%d\t%-10s: %5d %7s\n",
2391 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); 2470 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
2392 } 2471 }
@@ -2440,13 +2519,12 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2440 2519
2441 state->idev = NULL; 2520 state->idev = NULL;
2442 state->im = NULL; 2521 state->im = NULL;
2443 for_each_netdev(net, state->dev) { 2522 for_each_netdev_rcu(net, state->dev) {
2444 struct in_device *idev; 2523 struct in_device *idev;
2445 idev = in_dev_get(state->dev); 2524 idev = __in_dev_get_rcu(state->dev);
2446 if (unlikely(idev == NULL)) 2525 if (unlikely(idev == NULL))
2447 continue; 2526 continue;
2448 read_lock(&idev->mc_list_lock); 2527 im = rcu_dereference(idev->mc_list);
2449 im = idev->mc_list;
2450 if (likely(im != NULL)) { 2528 if (likely(im != NULL)) {
2451 spin_lock_bh(&im->lock); 2529 spin_lock_bh(&im->lock);
2452 psf = im->sources; 2530 psf = im->sources;
@@ -2457,8 +2535,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2457 } 2535 }
2458 spin_unlock_bh(&im->lock); 2536 spin_unlock_bh(&im->lock);
2459 } 2537 }
2460 read_unlock(&idev->mc_list_lock);
2461 in_dev_put(idev);
2462 } 2538 }
2463 return psf; 2539 return psf;
2464} 2540}
@@ -2472,20 +2548,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2472 spin_unlock_bh(&state->im->lock); 2548 spin_unlock_bh(&state->im->lock);
2473 state->im = state->im->next; 2549 state->im = state->im->next;
2474 while (!state->im) { 2550 while (!state->im) {
2475 if (likely(state->idev != NULL)) { 2551 state->dev = next_net_device_rcu(state->dev);
2476 read_unlock(&state->idev->mc_list_lock);
2477 in_dev_put(state->idev);
2478 }
2479 state->dev = next_net_device(state->dev);
2480 if (!state->dev) { 2552 if (!state->dev) {
2481 state->idev = NULL; 2553 state->idev = NULL;
2482 goto out; 2554 goto out;
2483 } 2555 }
2484 state->idev = in_dev_get(state->dev); 2556 state->idev = __in_dev_get_rcu(state->dev);
2485 if (!state->idev) 2557 if (!state->idev)
2486 continue; 2558 continue;
2487 read_lock(&state->idev->mc_list_lock); 2559 state->im = rcu_dereference(state->idev->mc_list);
2488 state->im = state->idev->mc_list;
2489 } 2560 }
2490 if (!state->im) 2561 if (!state->im)
2491 break; 2562 break;
@@ -2506,8 +2577,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
2506} 2577}
2507 2578
2508static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) 2579static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
2580 __acquires(rcu)
2509{ 2581{
2510 read_lock(&dev_base_lock); 2582 rcu_read_lock();
2511 return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2583 return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2512} 2584}
2513 2585
@@ -2523,19 +2595,16 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2523} 2595}
2524 2596
2525static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) 2597static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2598 __releases(rcu)
2526{ 2599{
2527 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); 2600 struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
2528 if (likely(state->im != NULL)) { 2601 if (likely(state->im != NULL)) {
2529 spin_unlock_bh(&state->im->lock); 2602 spin_unlock_bh(&state->im->lock);
2530 state->im = NULL; 2603 state->im = NULL;
2531 } 2604 }
2532 if (likely(state->idev != NULL)) { 2605 state->idev = NULL;
2533 read_unlock(&state->idev->mc_list_lock);
2534 in_dev_put(state->idev);
2535 state->idev = NULL;
2536 }
2537 state->dev = NULL; 2606 state->dev = NULL;
2538 read_unlock(&dev_base_lock); 2607 rcu_read_unlock();
2539} 2608}
2540 2609
2541static int igmp_mcf_seq_show(struct seq_file *seq, void *v) 2610static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
@@ -2583,7 +2652,7 @@ static const struct file_operations igmp_mcf_seq_fops = {
2583 .release = seq_release_net, 2652 .release = seq_release_net,
2584}; 2653};
2585 2654
2586static int igmp_net_init(struct net *net) 2655static int __net_init igmp_net_init(struct net *net)
2587{ 2656{
2588 struct proc_dir_entry *pde; 2657 struct proc_dir_entry *pde;
2589 2658
@@ -2601,7 +2670,7 @@ out_igmp:
2601 return -ENOMEM; 2670 return -ENOMEM;
2602} 2671}
2603 2672
2604static void igmp_net_exit(struct net *net) 2673static void __net_exit igmp_net_exit(struct net *net)
2605{ 2674{
2606 proc_net_remove(net, "mcfilter"); 2675 proc_net_remove(net, "mcfilter");
2607 proc_net_remove(net, "igmp"); 2676 proc_net_remove(net, "igmp");
@@ -2617,8 +2686,3 @@ int __init igmp_mc_proc_init(void)
2617 return register_pernet_subsys(&igmp_net_ops); 2686 return register_pernet_subsys(&igmp_net_ops);
2618} 2687}
2619#endif 2688#endif
2620
2621EXPORT_SYMBOL(ip_mc_dec_group);
2622EXPORT_SYMBOL(ip_mc_inc_group);
2623EXPORT_SYMBOL(ip_mc_join_group);
2624EXPORT_SYMBOL(ip_mc_rejoin_group);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 22cd19ee44e5..97e5fb765265 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = {
37 .range = { 32768, 61000 }, 37 .range = { 32768, 61000 },
38}; 38};
39 39
40unsigned long *sysctl_local_reserved_ports;
41EXPORT_SYMBOL(sysctl_local_reserved_ports);
42
40void inet_get_local_port_range(int *low, int *high) 43void inet_get_local_port_range(int *low, int *high)
41{ 44{
42 unsigned seq; 45 unsigned seq;
@@ -52,7 +55,6 @@ EXPORT_SYMBOL(inet_get_local_port_range);
52int inet_csk_bind_conflict(const struct sock *sk, 55int inet_csk_bind_conflict(const struct sock *sk,
53 const struct inet_bind_bucket *tb) 56 const struct inet_bind_bucket *tb)
54{ 57{
55 const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
56 struct sock *sk2; 58 struct sock *sk2;
57 struct hlist_node *node; 59 struct hlist_node *node;
58 int reuse = sk->sk_reuse; 60 int reuse = sk->sk_reuse;
@@ -71,17 +73,16 @@ int inet_csk_bind_conflict(const struct sock *sk,
71 !sk2->sk_bound_dev_if || 73 !sk2->sk_bound_dev_if ||
72 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 74 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
73 if (!reuse || !sk2->sk_reuse || 75 if (!reuse || !sk2->sk_reuse ||
74 sk2->sk_state == TCP_LISTEN) { 76 ((1 << sk2->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) {
75 const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 77 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
76 if (!sk2_rcv_saddr || !sk_rcv_saddr || 78 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
77 sk2_rcv_saddr == sk_rcv_saddr) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
78 break; 80 break;
79 } 81 }
80 } 82 }
81 } 83 }
82 return node != NULL; 84 return node != NULL;
83} 85}
84
85EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); 86EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
86 87
87/* Obtain a reference to a local port for the given sock, 88/* Obtain a reference to a local port for the given sock,
@@ -108,18 +109,21 @@ again:
108 109
109 smallest_size = -1; 110 smallest_size = -1;
110 do { 111 do {
112 if (inet_is_reserved_local_port(rover))
113 goto next_nolock;
111 head = &hashinfo->bhash[inet_bhashfn(net, rover, 114 head = &hashinfo->bhash[inet_bhashfn(net, rover,
112 hashinfo->bhash_size)]; 115 hashinfo->bhash_size)];
113 spin_lock(&head->lock); 116 spin_lock(&head->lock);
114 inet_bind_bucket_for_each(tb, node, &head->chain) 117 inet_bind_bucket_for_each(tb, node, &head->chain)
115 if (ib_net(tb) == net && tb->port == rover) { 118 if (net_eq(ib_net(tb), net) && tb->port == rover) {
116 if (tb->fastreuse > 0 && 119 if (tb->fastreuse > 0 &&
117 sk->sk_reuse && 120 sk->sk_reuse &&
118 sk->sk_state != TCP_LISTEN && 121 sk->sk_state != TCP_LISTEN &&
119 (tb->num_owners < smallest_size || smallest_size == -1)) { 122 (tb->num_owners < smallest_size || smallest_size == -1)) {
120 smallest_size = tb->num_owners; 123 smallest_size = tb->num_owners;
121 smallest_rover = rover; 124 smallest_rover = rover;
122 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { 125 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
126 !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
123 spin_unlock(&head->lock); 127 spin_unlock(&head->lock);
124 snum = smallest_rover; 128 snum = smallest_rover;
125 goto have_snum; 129 goto have_snum;
@@ -130,6 +134,7 @@ again:
130 break; 134 break;
131 next: 135 next:
132 spin_unlock(&head->lock); 136 spin_unlock(&head->lock);
137 next_nolock:
133 if (++rover > high) 138 if (++rover > high)
134 rover = low; 139 rover = low;
135 } while (--remaining > 0); 140 } while (--remaining > 0);
@@ -158,7 +163,7 @@ have_snum:
158 hashinfo->bhash_size)]; 163 hashinfo->bhash_size)];
159 spin_lock(&head->lock); 164 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain) 165 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (ib_net(tb) == net && tb->port == snum) 166 if (net_eq(ib_net(tb), net) && tb->port == snum)
162 goto tb_found; 167 goto tb_found;
163 } 168 }
164 tb = NULL; 169 tb = NULL;
@@ -206,7 +211,6 @@ fail:
206 local_bh_enable(); 211 local_bh_enable();
207 return ret; 212 return ret;
208} 213}
209
210EXPORT_SYMBOL_GPL(inet_csk_get_port); 214EXPORT_SYMBOL_GPL(inet_csk_get_port);
211 215
212/* 216/*
@@ -234,7 +238,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
234 * having to remove and re-insert us on the wait queue. 238 * having to remove and re-insert us on the wait queue.
235 */ 239 */
236 for (;;) { 240 for (;;) {
237 prepare_to_wait_exclusive(sk->sk_sleep, &wait, 241 prepare_to_wait_exclusive(sk_sleep(sk), &wait,
238 TASK_INTERRUPTIBLE); 242 TASK_INTERRUPTIBLE);
239 release_sock(sk); 243 release_sock(sk);
240 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) 244 if (reqsk_queue_empty(&icsk->icsk_accept_queue))
@@ -253,7 +257,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
253 if (!timeo) 257 if (!timeo)
254 break; 258 break;
255 } 259 }
256 finish_wait(sk->sk_sleep, &wait); 260 finish_wait(sk_sleep(sk), &wait);
257 return err; 261 return err;
258} 262}
259 263
@@ -299,7 +303,6 @@ out_err:
299 *err = error; 303 *err = error;
300 goto out; 304 goto out;
301} 305}
302
303EXPORT_SYMBOL(inet_csk_accept); 306EXPORT_SYMBOL(inet_csk_accept);
304 307
305/* 308/*
@@ -321,7 +324,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
321 setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); 324 setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
322 icsk->icsk_pending = icsk->icsk_ack.pending = 0; 325 icsk->icsk_pending = icsk->icsk_ack.pending = 0;
323} 326}
324
325EXPORT_SYMBOL(inet_csk_init_xmit_timers); 327EXPORT_SYMBOL(inet_csk_init_xmit_timers);
326 328
327void inet_csk_clear_xmit_timers(struct sock *sk) 329void inet_csk_clear_xmit_timers(struct sock *sk)
@@ -334,21 +336,18 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
334 sk_stop_timer(sk, &icsk->icsk_delack_timer); 336 sk_stop_timer(sk, &icsk->icsk_delack_timer);
335 sk_stop_timer(sk, &sk->sk_timer); 337 sk_stop_timer(sk, &sk->sk_timer);
336} 338}
337
338EXPORT_SYMBOL(inet_csk_clear_xmit_timers); 339EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
339 340
340void inet_csk_delete_keepalive_timer(struct sock *sk) 341void inet_csk_delete_keepalive_timer(struct sock *sk)
341{ 342{
342 sk_stop_timer(sk, &sk->sk_timer); 343 sk_stop_timer(sk, &sk->sk_timer);
343} 344}
344
345EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); 345EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
346 346
347void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) 347void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
348{ 348{
349 sk_reset_timer(sk, &sk->sk_timer, jiffies + len); 349 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
350} 350}
351
352EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); 351EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
353 352
354struct dst_entry *inet_csk_route_req(struct sock *sk, 353struct dst_entry *inet_csk_route_req(struct sock *sk,
@@ -358,17 +357,15 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
358 const struct inet_request_sock *ireq = inet_rsk(req); 357 const struct inet_request_sock *ireq = inet_rsk(req);
359 struct ip_options *opt = inet_rsk(req)->opt; 358 struct ip_options *opt = inet_rsk(req)->opt;
360 struct flowi fl = { .oif = sk->sk_bound_dev_if, 359 struct flowi fl = { .oif = sk->sk_bound_dev_if,
361 .nl_u = { .ip4_u = 360 .mark = sk->sk_mark,
362 { .daddr = ((opt && opt->srr) ? 361 .fl4_dst = ((opt && opt->srr) ?
363 opt->faddr : 362 opt->faddr : ireq->rmt_addr),
364 ireq->rmt_addr), 363 .fl4_src = ireq->loc_addr,
365 .saddr = ireq->loc_addr, 364 .fl4_tos = RT_CONN_FLAGS(sk),
366 .tos = RT_CONN_FLAGS(sk) } },
367 .proto = sk->sk_protocol, 365 .proto = sk->sk_protocol,
368 .flags = inet_sk_flowi_flags(sk), 366 .flags = inet_sk_flowi_flags(sk),
369 .uli_u = { .ports = 367 .fl_ip_sport = inet_sk(sk)->inet_sport,
370 { .sport = inet_sk(sk)->sport, 368 .fl_ip_dport = ireq->rmt_port };
371 .dport = ireq->rmt_port } } };
372 struct net *net = sock_net(sk); 369 struct net *net = sock_net(sk);
373 370
374 security_req_classify_flow(req, &fl); 371 security_req_classify_flow(req, &fl);
@@ -376,7 +373,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
376 goto no_route; 373 goto no_route;
377 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 374 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
378 goto route_err; 375 goto route_err;
379 return &rt->u.dst; 376 return &rt->dst;
380 377
381route_err: 378route_err:
382 ip_rt_put(rt); 379 ip_rt_put(rt);
@@ -384,7 +381,6 @@ no_route:
384 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 381 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
385 return NULL; 382 return NULL;
386} 383}
387
388EXPORT_SYMBOL_GPL(inet_csk_route_req); 384EXPORT_SYMBOL_GPL(inet_csk_route_req);
389 385
390static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, 386static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
@@ -426,7 +422,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
426 422
427 return req; 423 return req;
428} 424}
429
430EXPORT_SYMBOL_GPL(inet_csk_search_req); 425EXPORT_SYMBOL_GPL(inet_csk_search_req);
431 426
432void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, 427void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
@@ -440,11 +435,33 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
440 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); 435 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
441 inet_csk_reqsk_queue_added(sk, timeout); 436 inet_csk_reqsk_queue_added(sk, timeout);
442} 437}
438EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
443 439
444/* Only thing we need from tcp.h */ 440/* Only thing we need from tcp.h */
445extern int sysctl_tcp_synack_retries; 441extern int sysctl_tcp_synack_retries;
446 442
447EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); 443
444/* Decide when to expire the request and when to resend SYN-ACK */
445static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
446 const int max_retries,
447 const u8 rskq_defer_accept,
448 int *expire, int *resend)
449{
450 if (!rskq_defer_accept) {
451 *expire = req->retrans >= thresh;
452 *resend = 1;
453 return;
454 }
455 *expire = req->retrans >= thresh &&
456 (!inet_rsk(req)->acked || req->retrans >= max_retries);
457 /*
458 * Do not resend while waiting for data after ACK,
459 * start to resend on end of deferring period to give
460 * last chance for data or ACK to create established socket.
461 */
462 *resend = !inet_rsk(req)->acked ||
463 req->retrans >= rskq_defer_accept - 1;
464}
448 465
449void inet_csk_reqsk_queue_prune(struct sock *parent, 466void inet_csk_reqsk_queue_prune(struct sock *parent,
450 const unsigned long interval, 467 const unsigned long interval,
@@ -501,9 +518,17 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
501 reqp=&lopt->syn_table[i]; 518 reqp=&lopt->syn_table[i];
502 while ((req = *reqp) != NULL) { 519 while ((req = *reqp) != NULL) {
503 if (time_after_eq(now, req->expires)) { 520 if (time_after_eq(now, req->expires)) {
504 if ((req->retrans < thresh || 521 int expire = 0, resend = 0;
505 (inet_rsk(req)->acked && req->retrans < max_retries)) 522
506 && !req->rsk_ops->rtx_syn_ack(parent, req)) { 523 syn_ack_recalc(req, thresh, max_retries,
524 queue->rskq_defer_accept,
525 &expire, &resend);
526 if (req->rsk_ops->syn_ack_timeout)
527 req->rsk_ops->syn_ack_timeout(parent, req);
528 if (!expire &&
529 (!resend ||
530 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
531 inet_rsk(req)->acked)) {
507 unsigned long timeo; 532 unsigned long timeo;
508 533
509 if (req->retrans++ == 0) 534 if (req->retrans++ == 0)
@@ -532,7 +557,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
532 if (lopt->qlen) 557 if (lopt->qlen)
533 inet_csk_reset_keepalive_timer(parent, interval); 558 inet_csk_reset_keepalive_timer(parent, interval);
534} 559}
535
536EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); 560EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
537 561
538struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, 562struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
@@ -546,9 +570,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
546 newsk->sk_state = TCP_SYN_RECV; 570 newsk->sk_state = TCP_SYN_RECV;
547 newicsk->icsk_bind_hash = NULL; 571 newicsk->icsk_bind_hash = NULL;
548 572
549 inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; 573 inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
550 inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); 574 inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
551 inet_sk(newsk)->sport = inet_rsk(req)->loc_port; 575 inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
552 newsk->sk_write_space = sk_stream_write_space; 576 newsk->sk_write_space = sk_stream_write_space;
553 577
554 newicsk->icsk_retransmits = 0; 578 newicsk->icsk_retransmits = 0;
@@ -562,7 +586,6 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
562 } 586 }
563 return newsk; 587 return newsk;
564} 588}
565
566EXPORT_SYMBOL_GPL(inet_csk_clone); 589EXPORT_SYMBOL_GPL(inet_csk_clone);
567 590
568/* 591/*
@@ -579,8 +602,8 @@ void inet_csk_destroy_sock(struct sock *sk)
579 /* It cannot be in hash table! */ 602 /* It cannot be in hash table! */
580 WARN_ON(!sk_unhashed(sk)); 603 WARN_ON(!sk_unhashed(sk));
581 604
582 /* If it has not 0 inet_sk(sk)->num, it must be bound */ 605 /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
583 WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash); 606 WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
584 607
585 sk->sk_prot->destroy(sk); 608 sk->sk_prot->destroy(sk);
586 609
@@ -593,7 +616,6 @@ void inet_csk_destroy_sock(struct sock *sk)
593 percpu_counter_dec(sk->sk_prot->orphan_count); 616 percpu_counter_dec(sk->sk_prot->orphan_count);
594 sock_put(sk); 617 sock_put(sk);
595} 618}
596
597EXPORT_SYMBOL(inet_csk_destroy_sock); 619EXPORT_SYMBOL(inet_csk_destroy_sock);
598 620
599int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) 621int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
@@ -615,8 +637,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
615 * after validation is complete. 637 * after validation is complete.
616 */ 638 */
617 sk->sk_state = TCP_LISTEN; 639 sk->sk_state = TCP_LISTEN;
618 if (!sk->sk_prot->get_port(sk, inet->num)) { 640 if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
619 inet->sport = htons(inet->num); 641 inet->inet_sport = htons(inet->inet_num);
620 642
621 sk_dst_reset(sk); 643 sk_dst_reset(sk);
622 sk->sk_prot->hash(sk); 644 sk->sk_prot->hash(sk);
@@ -628,7 +650,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
628 __reqsk_queue_destroy(&icsk->icsk_accept_queue); 650 __reqsk_queue_destroy(&icsk->icsk_accept_queue);
629 return -EADDRINUSE; 651 return -EADDRINUSE;
630} 652}
631
632EXPORT_SYMBOL_GPL(inet_csk_listen_start); 653EXPORT_SYMBOL_GPL(inet_csk_listen_start);
633 654
634/* 655/*
@@ -683,7 +704,6 @@ void inet_csk_listen_stop(struct sock *sk)
683 } 704 }
684 WARN_ON(sk->sk_ack_backlog); 705 WARN_ON(sk->sk_ack_backlog);
685} 706}
686
687EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 707EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
688 708
689void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) 709void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
@@ -692,10 +712,9 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
692 const struct inet_sock *inet = inet_sk(sk); 712 const struct inet_sock *inet = inet_sk(sk);
693 713
694 sin->sin_family = AF_INET; 714 sin->sin_family = AF_INET;
695 sin->sin_addr.s_addr = inet->daddr; 715 sin->sin_addr.s_addr = inet->inet_daddr;
696 sin->sin_port = inet->dport; 716 sin->sin_port = inet->inet_dport;
697} 717}
698
699EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); 718EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
700 719
701#ifdef CONFIG_COMPAT 720#ifdef CONFIG_COMPAT
@@ -710,11 +729,10 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
710 return icsk->icsk_af_ops->getsockopt(sk, level, optname, 729 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
711 optval, optlen); 730 optval, optlen);
712} 731}
713
714EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); 732EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
715 733
716int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, 734int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
717 char __user *optval, int optlen) 735 char __user *optval, unsigned int optlen)
718{ 736{
719 const struct inet_connection_sock *icsk = inet_csk(sk); 737 const struct inet_connection_sock *icsk = inet_csk(sk);
720 738
@@ -724,6 +742,5 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
724 return icsk->icsk_af_ops->setsockopt(sk, level, optname, 742 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
725 optval, optlen); 743 optval, optlen);
726} 744}
727
728EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); 745EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
729#endif 746#endif
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a706a47f4dbb..2746c1fa6417 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -14,6 +14,7 @@
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/random.h> 16#include <linux/random.h>
17#include <linux/slab.h>
17#include <linux/cache.h> 18#include <linux/cache.h>
18#include <linux/init.h> 19#include <linux/init.h>
19#include <linux/time.h> 20#include <linux/time.h>
@@ -116,10 +117,10 @@ static int inet_csk_diag_fill(struct sock *sk,
116 r->id.idiag_cookie[0] = (u32)(unsigned long)sk; 117 r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
117 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); 118 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
118 119
119 r->id.idiag_sport = inet->sport; 120 r->id.idiag_sport = inet->inet_sport;
120 r->id.idiag_dport = inet->dport; 121 r->id.idiag_dport = inet->inet_dport;
121 r->id.idiag_src[0] = inet->rcv_saddr; 122 r->id.idiag_src[0] = inet->inet_rcv_saddr;
122 r->id.idiag_dst[0] = inet->daddr; 123 r->id.idiag_dst[0] = inet->inet_daddr;
123 124
124#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
125 if (r->idiag_family == AF_INET6) { 126 if (r->idiag_family == AF_INET6) {
@@ -368,7 +369,7 @@ static int inet_diag_bc_run(const void *bc, int len,
368 yes = entry->sport >= op[1].no; 369 yes = entry->sport >= op[1].no;
369 break; 370 break;
370 case INET_DIAG_BC_S_LE: 371 case INET_DIAG_BC_S_LE:
371 yes = entry->dport <= op[1].no; 372 yes = entry->sport <= op[1].no;
372 break; 373 break;
373 case INET_DIAG_BC_D_GE: 374 case INET_DIAG_BC_D_GE:
374 yes = entry->dport >= op[1].no; 375 yes = entry->dport >= op[1].no;
@@ -424,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
424 bc += op->no; 425 bc += op->no;
425 } 426 }
426 } 427 }
427 return (len == 0); 428 return len == 0;
428} 429}
429 430
430static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
@@ -489,9 +490,11 @@ static int inet_csk_diag_dump(struct sock *sk,
489{ 490{
490 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 491 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
491 492
492 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 493 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
493 struct inet_diag_entry entry; 494 struct inet_diag_entry entry;
494 struct rtattr *bc = (struct rtattr *)(r + 1); 495 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
496 sizeof(*r),
497 INET_DIAG_REQ_BYTECODE);
495 struct inet_sock *inet = inet_sk(sk); 498 struct inet_sock *inet = inet_sk(sk);
496 499
497 entry.family = sk->sk_family; 500 entry.family = sk->sk_family;
@@ -504,14 +507,14 @@ static int inet_csk_diag_dump(struct sock *sk,
504 } else 507 } else
505#endif 508#endif
506 { 509 {
507 entry.saddr = &inet->rcv_saddr; 510 entry.saddr = &inet->inet_rcv_saddr;
508 entry.daddr = &inet->daddr; 511 entry.daddr = &inet->inet_daddr;
509 } 512 }
510 entry.sport = inet->num; 513 entry.sport = inet->inet_num;
511 entry.dport = ntohs(inet->dport); 514 entry.dport = ntohs(inet->inet_dport);
512 entry.userlocks = sk->sk_userlocks; 515 entry.userlocks = sk->sk_userlocks;
513 516
514 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) 517 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
515 return 0; 518 return 0;
516 } 519 }
517 520
@@ -526,9 +529,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
526{ 529{
527 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 530 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
528 531
529 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 532 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
530 struct inet_diag_entry entry; 533 struct inet_diag_entry entry;
531 struct rtattr *bc = (struct rtattr *)(r + 1); 534 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
535 sizeof(*r),
536 INET_DIAG_REQ_BYTECODE);
532 537
533 entry.family = tw->tw_family; 538 entry.family = tw->tw_family;
534#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 539#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -547,7 +552,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
547 entry.dport = ntohs(tw->tw_dport); 552 entry.dport = ntohs(tw->tw_dport);
548 entry.userlocks = 0; 553 entry.userlocks = 0;
549 554
550 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) 555 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
551 return 0; 556 return 0;
552 } 557 }
553 558
@@ -584,7 +589,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
584 if (tmo < 0) 589 if (tmo < 0)
585 tmo = 0; 590 tmo = 0;
586 591
587 r->id.idiag_sport = inet->sport; 592 r->id.idiag_sport = inet->inet_sport;
588 r->id.idiag_dport = ireq->rmt_port; 593 r->id.idiag_dport = ireq->rmt_port;
589 r->id.idiag_src[0] = ireq->loc_addr; 594 r->id.idiag_src[0] = ireq->loc_addr;
590 r->id.idiag_dst[0] = ireq->rmt_addr; 595 r->id.idiag_dst[0] = ireq->rmt_addr;
@@ -617,7 +622,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
617 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 622 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
618 struct inet_connection_sock *icsk = inet_csk(sk); 623 struct inet_connection_sock *icsk = inet_csk(sk);
619 struct listen_sock *lopt; 624 struct listen_sock *lopt;
620 struct rtattr *bc = NULL; 625 const struct nlattr *bc = NULL;
621 struct inet_sock *inet = inet_sk(sk); 626 struct inet_sock *inet = inet_sk(sk);
622 int j, s_j; 627 int j, s_j;
623 int reqnum, s_reqnum; 628 int reqnum, s_reqnum;
@@ -637,9 +642,10 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
637 if (!lopt || !lopt->qlen) 642 if (!lopt || !lopt->qlen)
638 goto out; 643 goto out;
639 644
640 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 645 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
641 bc = (struct rtattr *)(r + 1); 646 bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
642 entry.sport = inet->num; 647 INET_DIAG_REQ_BYTECODE);
648 entry.sport = inet->inet_num;
643 entry.userlocks = sk->sk_userlocks; 649 entry.userlocks = sk->sk_userlocks;
644 } 650 }
645 651
@@ -671,8 +677,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
671 &ireq->rmt_addr; 677 &ireq->rmt_addr;
672 entry.dport = ntohs(ireq->rmt_port); 678 entry.dport = ntohs(ireq->rmt_port);
673 679
674 if (!inet_diag_bc_run(RTA_DATA(bc), 680 if (!inet_diag_bc_run(nla_data(bc),
675 RTA_PAYLOAD(bc), &entry)) 681 nla_len(bc), &entry))
676 continue; 682 continue;
677 } 683 }
678 684
@@ -732,7 +738,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
732 continue; 738 continue;
733 } 739 }
734 740
735 if (r->id.idiag_sport != inet->sport && 741 if (r->id.idiag_sport != inet->inet_sport &&
736 r->id.idiag_sport) 742 r->id.idiag_sport)
737 goto next_listen; 743 goto next_listen;
738 744
@@ -774,7 +780,7 @@ skip_listen_ht:
774 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) 780 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
775 goto unlock; 781 goto unlock;
776 782
777 for (i = s_i; i < hashinfo->ehash_size; i++) { 783 for (i = s_i; i <= hashinfo->ehash_mask; i++) {
778 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 784 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
779 spinlock_t *lock = inet_ehash_lockp(hashinfo, i); 785 spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
780 struct sock *sk; 786 struct sock *sk;
@@ -797,10 +803,10 @@ skip_listen_ht:
797 goto next_normal; 803 goto next_normal;
798 if (!(r->idiag_states & (1 << sk->sk_state))) 804 if (!(r->idiag_states & (1 << sk->sk_state)))
799 goto next_normal; 805 goto next_normal;
800 if (r->id.idiag_sport != inet->sport && 806 if (r->id.idiag_sport != inet->inet_sport &&
801 r->id.idiag_sport) 807 r->id.idiag_sport)
802 goto next_normal; 808 goto next_normal;
803 if (r->id.idiag_dport != inet->dport && 809 if (r->id.idiag_dport != inet->inet_dport &&
804 r->id.idiag_dport) 810 r->id.idiag_dport)
805 goto next_normal; 811 goto next_normal;
806 if (inet_csk_diag_dump(sk, skb, cb) < 0) { 812 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
@@ -852,7 +858,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
852 nlmsg_len(nlh) < hdrlen) 858 nlmsg_len(nlh) < hdrlen)
853 return -EINVAL; 859 return -EINVAL;
854 860
855 if (nlh->nlmsg_flags & NLM_F_DUMP) { 861 if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
856 if (nlmsg_attrlen(nlh, hdrlen)) { 862 if (nlmsg_attrlen(nlh, hdrlen)) {
857 struct nlattr *attr; 863 struct nlattr *attr;
858 864
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index eaf3e2c8646a..5ff2a51b6d0c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -19,6 +19,7 @@
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/skbuff.h> 20#include <linux/skbuff.h>
21#include <linux/rtnetlink.h> 21#include <linux/rtnetlink.h>
22#include <linux/slab.h>
22 23
23#include <net/inet_frag.h> 24#include <net/inet_frag.h>
24 25
@@ -113,7 +114,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
113 fq->last_in |= INET_FRAG_COMPLETE; 114 fq->last_in |= INET_FRAG_COMPLETE;
114 } 115 }
115} 116}
116
117EXPORT_SYMBOL(inet_frag_kill); 117EXPORT_SYMBOL(inet_frag_kill);
118 118
119static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, 119static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 625cc5f64c94..3c0369a3a663 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -64,7 +64,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
64 64
65 atomic_inc(&hashinfo->bsockets); 65 atomic_inc(&hashinfo->bsockets);
66 66
67 inet_sk(sk)->num = snum; 67 inet_sk(sk)->inet_num = snum;
68 sk_add_bind_node(sk, &tb->owners); 68 sk_add_bind_node(sk, &tb->owners);
69 tb->num_owners++; 69 tb->num_owners++;
70 inet_csk(sk)->icsk_bind_hash = tb; 70 inet_csk(sk)->icsk_bind_hash = tb;
@@ -76,7 +76,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
76static void __inet_put_port(struct sock *sk) 76static void __inet_put_port(struct sock *sk)
77{ 77{
78 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 78 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
79 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num, 79 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
80 hashinfo->bhash_size); 80 hashinfo->bhash_size);
81 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; 81 struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
82 struct inet_bind_bucket *tb; 82 struct inet_bind_bucket *tb;
@@ -88,7 +88,7 @@ static void __inet_put_port(struct sock *sk)
88 __sk_del_bind_node(sk); 88 __sk_del_bind_node(sk);
89 tb->num_owners--; 89 tb->num_owners--;
90 inet_csk(sk)->icsk_bind_hash = NULL; 90 inet_csk(sk)->icsk_bind_hash = NULL;
91 inet_sk(sk)->num = 0; 91 inet_sk(sk)->inet_num = 0;
92 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); 92 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
93 spin_unlock(&head->lock); 93 spin_unlock(&head->lock);
94} 94}
@@ -99,24 +99,45 @@ void inet_put_port(struct sock *sk)
99 __inet_put_port(sk); 99 __inet_put_port(sk);
100 local_bh_enable(); 100 local_bh_enable();
101} 101}
102
103EXPORT_SYMBOL(inet_put_port); 102EXPORT_SYMBOL(inet_put_port);
104 103
105void __inet_inherit_port(struct sock *sk, struct sock *child) 104int __inet_inherit_port(struct sock *sk, struct sock *child)
106{ 105{
107 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
108 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num, 107 unsigned short port = inet_sk(child)->inet_num;
108 const int bhash = inet_bhashfn(sock_net(sk), port,
109 table->bhash_size); 109 table->bhash_size);
110 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 110 struct inet_bind_hashbucket *head = &table->bhash[bhash];
111 struct inet_bind_bucket *tb; 111 struct inet_bind_bucket *tb;
112 112
113 spin_lock(&head->lock); 113 spin_lock(&head->lock);
114 tb = inet_csk(sk)->icsk_bind_hash; 114 tb = inet_csk(sk)->icsk_bind_hash;
115 sk_add_bind_node(child, &tb->owners); 115 if (tb->port != port) {
116 inet_csk(child)->icsk_bind_hash = tb; 116 /* NOTE: using tproxy and redirecting skbs to a proxy
117 * on a different listener port breaks the assumption
118 * that the listener socket's icsk_bind_hash is the same
119 * as that of the child socket. We have to look up or
120 * create a new bind bucket for the child here. */
121 struct hlist_node *node;
122 inet_bind_bucket_for_each(tb, node, &head->chain) {
123 if (net_eq(ib_net(tb), sock_net(sk)) &&
124 tb->port == port)
125 break;
126 }
127 if (!node) {
128 tb = inet_bind_bucket_create(table->bind_bucket_cachep,
129 sock_net(sk), head, port);
130 if (!tb) {
131 spin_unlock(&head->lock);
132 return -ENOMEM;
133 }
134 }
135 }
136 inet_bind_hash(child, tb, port);
117 spin_unlock(&head->lock); 137 spin_unlock(&head->lock);
118}
119 138
139 return 0;
140}
120EXPORT_SYMBOL_GPL(__inet_inherit_port); 141EXPORT_SYMBOL_GPL(__inet_inherit_port);
121 142
122static inline int compute_score(struct sock *sk, struct net *net, 143static inline int compute_score(struct sock *sk, struct net *net,
@@ -126,9 +147,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
126 int score = -1; 147 int score = -1;
127 struct inet_sock *inet = inet_sk(sk); 148 struct inet_sock *inet = inet_sk(sk);
128 149
129 if (net_eq(sock_net(sk), net) && inet->num == hnum && 150 if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
130 !ipv6_only_sock(sk)) { 151 !ipv6_only_sock(sk)) {
131 __be32 rcv_saddr = inet->rcv_saddr; 152 __be32 rcv_saddr = inet->inet_rcv_saddr;
132 score = sk->sk_family == PF_INET ? 1 : 0; 153 score = sk->sk_family == PF_INET ? 1 : 0;
133 if (rcv_saddr) { 154 if (rcv_saddr) {
134 if (rcv_saddr != daddr) 155 if (rcv_saddr != daddr)
@@ -209,7 +230,7 @@ struct sock * __inet_lookup_established(struct net *net,
209 * have wildcards anyways. 230 * have wildcards anyways.
210 */ 231 */
211 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 232 unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
212 unsigned int slot = hash & (hashinfo->ehash_size - 1); 233 unsigned int slot = hash & hashinfo->ehash_mask;
213 struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; 234 struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
214 235
215 rcu_read_lock(); 236 rcu_read_lock();
@@ -273,18 +294,20 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
273{ 294{
274 struct inet_hashinfo *hinfo = death_row->hashinfo; 295 struct inet_hashinfo *hinfo = death_row->hashinfo;
275 struct inet_sock *inet = inet_sk(sk); 296 struct inet_sock *inet = inet_sk(sk);
276 __be32 daddr = inet->rcv_saddr; 297 __be32 daddr = inet->inet_rcv_saddr;
277 __be32 saddr = inet->daddr; 298 __be32 saddr = inet->inet_daddr;
278 int dif = sk->sk_bound_dev_if; 299 int dif = sk->sk_bound_dev_if;
279 INET_ADDR_COOKIE(acookie, saddr, daddr) 300 INET_ADDR_COOKIE(acookie, saddr, daddr)
280 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); 301 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
281 struct net *net = sock_net(sk); 302 struct net *net = sock_net(sk);
282 unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); 303 unsigned int hash = inet_ehashfn(net, daddr, lport,
304 saddr, inet->inet_dport);
283 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 305 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
284 spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 306 spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
285 struct sock *sk2; 307 struct sock *sk2;
286 const struct hlist_nulls_node *node; 308 const struct hlist_nulls_node *node;
287 struct inet_timewait_sock *tw; 309 struct inet_timewait_sock *tw;
310 int twrefcnt = 0;
288 311
289 spin_lock(lock); 312 spin_lock(lock);
290 313
@@ -312,25 +335,28 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
312unique: 335unique:
313 /* Must record num and sport now. Otherwise we will see 336 /* Must record num and sport now. Otherwise we will see
314 * in hash table socket with a funny identity. */ 337 * in hash table socket with a funny identity. */
315 inet->num = lport; 338 inet->inet_num = lport;
316 inet->sport = htons(lport); 339 inet->inet_sport = htons(lport);
317 sk->sk_hash = hash; 340 sk->sk_hash = hash;
318 WARN_ON(!sk_unhashed(sk)); 341 WARN_ON(!sk_unhashed(sk));
319 __sk_nulls_add_node_rcu(sk, &head->chain); 342 __sk_nulls_add_node_rcu(sk, &head->chain);
343 if (tw) {
344 twrefcnt = inet_twsk_unhash(tw);
345 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
346 }
320 spin_unlock(lock); 347 spin_unlock(lock);
348 if (twrefcnt)
349 inet_twsk_put(tw);
321 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 350 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
322 351
323 if (twp) { 352 if (twp) {
324 *twp = tw; 353 *twp = tw;
325 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
326 } else if (tw) { 354 } else if (tw) {
327 /* Silly. Should hash-dance instead... */ 355 /* Silly. Should hash-dance instead... */
328 inet_twsk_deschedule(tw, death_row); 356 inet_twsk_deschedule(tw, death_row);
329 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
330 357
331 inet_twsk_put(tw); 358 inet_twsk_put(tw);
332 } 359 }
333
334 return 0; 360 return 0;
335 361
336not_unique: 362not_unique:
@@ -341,16 +367,18 @@ not_unique:
341static inline u32 inet_sk_port_offset(const struct sock *sk) 367static inline u32 inet_sk_port_offset(const struct sock *sk)
342{ 368{
343 const struct inet_sock *inet = inet_sk(sk); 369 const struct inet_sock *inet = inet_sk(sk);
344 return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, 370 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
345 inet->dport); 371 inet->inet_daddr,
372 inet->inet_dport);
346} 373}
347 374
348void __inet_hash_nolisten(struct sock *sk) 375int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
349{ 376{
350 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 377 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
351 struct hlist_nulls_head *list; 378 struct hlist_nulls_head *list;
352 spinlock_t *lock; 379 spinlock_t *lock;
353 struct inet_ehash_bucket *head; 380 struct inet_ehash_bucket *head;
381 int twrefcnt = 0;
354 382
355 WARN_ON(!sk_unhashed(sk)); 383 WARN_ON(!sk_unhashed(sk));
356 384
@@ -361,8 +389,13 @@ void __inet_hash_nolisten(struct sock *sk)
361 389
362 spin_lock(lock); 390 spin_lock(lock);
363 __sk_nulls_add_node_rcu(sk, list); 391 __sk_nulls_add_node_rcu(sk, list);
392 if (tw) {
393 WARN_ON(sk->sk_hash != tw->tw_hash);
394 twrefcnt = inet_twsk_unhash(tw);
395 }
364 spin_unlock(lock); 396 spin_unlock(lock);
365 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 397 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
398 return twrefcnt;
366} 399}
367EXPORT_SYMBOL_GPL(__inet_hash_nolisten); 400EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
368 401
@@ -372,7 +405,7 @@ static void __inet_hash(struct sock *sk)
372 struct inet_listen_hashbucket *ilb; 405 struct inet_listen_hashbucket *ilb;
373 406
374 if (sk->sk_state != TCP_LISTEN) { 407 if (sk->sk_state != TCP_LISTEN) {
375 __inet_hash_nolisten(sk); 408 __inet_hash_nolisten(sk, NULL);
376 return; 409 return;
377 } 410 }
378 411
@@ -421,14 +454,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
421 struct sock *sk, u32 port_offset, 454 struct sock *sk, u32 port_offset,
422 int (*check_established)(struct inet_timewait_death_row *, 455 int (*check_established)(struct inet_timewait_death_row *,
423 struct sock *, __u16, struct inet_timewait_sock **), 456 struct sock *, __u16, struct inet_timewait_sock **),
424 void (*hash)(struct sock *sk)) 457 int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
425{ 458{
426 struct inet_hashinfo *hinfo = death_row->hashinfo; 459 struct inet_hashinfo *hinfo = death_row->hashinfo;
427 const unsigned short snum = inet_sk(sk)->num; 460 const unsigned short snum = inet_sk(sk)->inet_num;
428 struct inet_bind_hashbucket *head; 461 struct inet_bind_hashbucket *head;
429 struct inet_bind_bucket *tb; 462 struct inet_bind_bucket *tb;
430 int ret; 463 int ret;
431 struct net *net = sock_net(sk); 464 struct net *net = sock_net(sk);
465 int twrefcnt = 1;
432 466
433 if (!snum) { 467 if (!snum) {
434 int i, remaining, low, high, port; 468 int i, remaining, low, high, port;
@@ -443,6 +477,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
443 local_bh_disable(); 477 local_bh_disable();
444 for (i = 1; i <= remaining; i++) { 478 for (i = 1; i <= remaining; i++) {
445 port = low + (i + offset) % remaining; 479 port = low + (i + offset) % remaining;
480 if (inet_is_reserved_local_port(port))
481 continue;
446 head = &hinfo->bhash[inet_bhashfn(net, port, 482 head = &hinfo->bhash[inet_bhashfn(net, port,
447 hinfo->bhash_size)]; 483 hinfo->bhash_size)];
448 spin_lock(&head->lock); 484 spin_lock(&head->lock);
@@ -452,7 +488,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
452 * unique enough. 488 * unique enough.
453 */ 489 */
454 inet_bind_bucket_for_each(tb, node, &head->chain) { 490 inet_bind_bucket_for_each(tb, node, &head->chain) {
455 if (ib_net(tb) == net && tb->port == port) { 491 if (net_eq(ib_net(tb), net) &&
492 tb->port == port) {
456 if (tb->fastreuse >= 0) 493 if (tb->fastreuse >= 0)
457 goto next_port; 494 goto next_port;
458 WARN_ON(hlist_empty(&tb->owners)); 495 WARN_ON(hlist_empty(&tb->owners));
@@ -485,14 +522,19 @@ ok:
485 /* Head lock still held and bh's disabled */ 522 /* Head lock still held and bh's disabled */
486 inet_bind_hash(sk, tb, port); 523 inet_bind_hash(sk, tb, port);
487 if (sk_unhashed(sk)) { 524 if (sk_unhashed(sk)) {
488 inet_sk(sk)->sport = htons(port); 525 inet_sk(sk)->inet_sport = htons(port);
489 hash(sk); 526 twrefcnt += hash(sk, tw);
490 } 527 }
528 if (tw)
529 twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
491 spin_unlock(&head->lock); 530 spin_unlock(&head->lock);
492 531
493 if (tw) { 532 if (tw) {
494 inet_twsk_deschedule(tw, death_row); 533 inet_twsk_deschedule(tw, death_row);
495 inet_twsk_put(tw); 534 while (twrefcnt) {
535 twrefcnt--;
536 inet_twsk_put(tw);
537 }
496 } 538 }
497 539
498 ret = 0; 540 ret = 0;
@@ -503,7 +545,7 @@ ok:
503 tb = inet_csk(sk)->icsk_bind_hash; 545 tb = inet_csk(sk)->icsk_bind_hash;
504 spin_lock_bh(&head->lock); 546 spin_lock_bh(&head->lock);
505 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 547 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
506 hash(sk); 548 hash(sk, NULL);
507 spin_unlock_bh(&head->lock); 549 spin_unlock_bh(&head->lock);
508 return 0; 550 return 0;
509 } else { 551 } else {
@@ -525,7 +567,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
525 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), 567 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
526 __inet_check_established, __inet_hash_nolisten); 568 __inet_check_established, __inet_hash_nolisten);
527} 569}
528
529EXPORT_SYMBOL_GPL(inet_hash_connect); 570EXPORT_SYMBOL_GPL(inet_hash_connect);
530 571
531void inet_hashinfo_init(struct inet_hashinfo *h) 572void inet_hashinfo_init(struct inet_hashinfo *h)
@@ -539,5 +580,4 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
539 i + LISTENING_NULLS_BASE); 580 i + LISTENING_NULLS_BASE);
540 } 581 }
541} 582}
542
543EXPORT_SYMBOL_GPL(inet_hashinfo_init); 583EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 6a667dae315e..47038cb6c138 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
64 if (iph->ihl != IPH_LEN_WO_OPTIONS) 64 if (iph->ihl != IPH_LEN_WO_OPTIONS)
65 return -1; 65 return -1;
66 66
67 if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack 67 if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
68 || tcph->rst || tcph->syn || tcph->fin) 68 tcph->rst || tcph->syn || tcph->fin)
69 return -1; 69 return -1;
70 70
71 if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) 71 if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
72 return -1; 72 return -1;
73 73
74 if (tcph->doff != TCPH_LEN_WO_OPTIONS 74 if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
75 && tcph->doff != TCPH_LEN_W_TIMESTAMP) 75 tcph->doff != TCPH_LEN_W_TIMESTAMP)
76 return -1; 76 return -1;
77 77
78 /* check tcp options (only timestamp allowed) */ 78 /* check tcp options (only timestamp allowed) */
@@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
262 struct iphdr *iph, 262 struct iphdr *iph,
263 struct tcphdr *tcph) 263 struct tcphdr *tcph)
264{ 264{
265 if ((lro_desc->iph->saddr != iph->saddr) 265 if ((lro_desc->iph->saddr != iph->saddr) ||
266 || (lro_desc->iph->daddr != iph->daddr) 266 (lro_desc->iph->daddr != iph->daddr) ||
267 || (lro_desc->tcph->source != tcph->source) 267 (lro_desc->tcph->source != tcph->source) ||
268 || (lro_desc->tcph->dest != tcph->dest)) 268 (lro_desc->tcph->dest != tcph->dest))
269 return -1; 269 return -1;
270 return 0; 270 return 0;
271} 271}
@@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
339 u64 flags; 339 u64 flags;
340 int vlan_hdr_len = 0; 340 int vlan_hdr_len = 0;
341 341
342 if (!lro_mgr->get_skb_header 342 if (!lro_mgr->get_skb_header ||
343 || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, 343 lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
344 &flags, priv)) 344 &flags, priv))
345 goto out; 345 goto out;
346 346
347 if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) 347 if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
@@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
351 if (!lro_desc) 351 if (!lro_desc)
352 goto out; 352 goto out;
353 353
354 if ((skb->protocol == htons(ETH_P_8021Q)) 354 if ((skb->protocol == htons(ETH_P_8021Q)) &&
355 && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) 355 !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
356 vlan_hdr_len = VLAN_HLEN; 356 vlan_hdr_len = VLAN_HLEN;
357 357
358 if (!lro_desc->active) { /* start new lro session */ 358 if (!lro_desc->active) { /* start new lro session */
@@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
446 int hdr_len = LRO_MAX_PG_HLEN; 446 int hdr_len = LRO_MAX_PG_HLEN;
447 int vlan_hdr_len = 0; 447 int vlan_hdr_len = 0;
448 448
449 if (!lro_mgr->get_frag_header 449 if (!lro_mgr->get_frag_header ||
450 || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, 450 lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
451 (void *)&tcph, &flags, priv)) { 451 (void *)&tcph, &flags, priv)) {
452 mac_hdr = page_address(frags->page) + frags->page_offset; 452 mac_hdr = page_address(frags->page) + frags->page_offset;
453 goto out1; 453 goto out1;
454 } 454 }
@@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
472 if (!skb) 472 if (!skb)
473 goto out; 473 goto out;
474 474
475 if ((skb->protocol == htons(ETH_P_8021Q)) 475 if ((skb->protocol == htons(ETH_P_8021Q)) &&
476 && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) 476 !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
477 vlan_hdr_len = VLAN_HLEN; 477 vlan_hdr_len = VLAN_HLEN;
478 478
479 iph = (void *)(skb->data + vlan_hdr_len); 479 iph = (void *)(skb->data + vlan_hdr_len);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 61283f928825..c5af909cf701 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -10,44 +10,92 @@
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/kmemcheck.h> 12#include <linux/kmemcheck.h>
13#include <linux/slab.h>
13#include <net/inet_hashtables.h> 14#include <net/inet_hashtables.h>
14#include <net/inet_timewait_sock.h> 15#include <net/inet_timewait_sock.h>
15#include <net/ip.h> 16#include <net/ip.h>
16 17
18
19/**
20 * inet_twsk_unhash - unhash a timewait socket from established hash
21 * @tw: timewait socket
22 *
23 * unhash a timewait socket from established hash, if hashed.
24 * ehash lock must be held by caller.
25 * Returns 1 if caller should call inet_twsk_put() after lock release.
26 */
27int inet_twsk_unhash(struct inet_timewait_sock *tw)
28{
29 if (hlist_nulls_unhashed(&tw->tw_node))
30 return 0;
31
32 hlist_nulls_del_rcu(&tw->tw_node);
33 sk_nulls_node_init(&tw->tw_node);
34 /*
35 * We cannot call inet_twsk_put() ourself under lock,
36 * caller must call it for us.
37 */
38 return 1;
39}
40
41/**
42 * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
43 * @tw: timewait socket
44 * @hashinfo: hashinfo pointer
45 *
46 * unhash a timewait socket from bind hash, if hashed.
47 * bind hash lock must be held by caller.
48 * Returns 1 if caller should call inet_twsk_put() after lock release.
49 */
50int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
51 struct inet_hashinfo *hashinfo)
52{
53 struct inet_bind_bucket *tb = tw->tw_tb;
54
55 if (!tb)
56 return 0;
57
58 __hlist_del(&tw->tw_bind_node);
59 tw->tw_tb = NULL;
60 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
61 /*
62 * We cannot call inet_twsk_put() ourself under lock,
63 * caller must call it for us.
64 */
65 return 1;
66}
67
17/* Must be called with locally disabled BHs. */ 68/* Must be called with locally disabled BHs. */
18static void __inet_twsk_kill(struct inet_timewait_sock *tw, 69static void __inet_twsk_kill(struct inet_timewait_sock *tw,
19 struct inet_hashinfo *hashinfo) 70 struct inet_hashinfo *hashinfo)
20{ 71{
21 struct inet_bind_hashbucket *bhead; 72 struct inet_bind_hashbucket *bhead;
22 struct inet_bind_bucket *tb; 73 int refcnt;
23 /* Unlink from established hashes. */ 74 /* Unlink from established hashes. */
24 spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); 75 spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
25 76
26 spin_lock(lock); 77 spin_lock(lock);
27 if (hlist_nulls_unhashed(&tw->tw_node)) { 78 refcnt = inet_twsk_unhash(tw);
28 spin_unlock(lock);
29 return;
30 }
31 hlist_nulls_del_rcu(&tw->tw_node);
32 sk_nulls_node_init(&tw->tw_node);
33 spin_unlock(lock); 79 spin_unlock(lock);
34 80
35 /* Disassociate with bind bucket. */ 81 /* Disassociate with bind bucket. */
36 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, 82 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
37 hashinfo->bhash_size)]; 83 hashinfo->bhash_size)];
84
38 spin_lock(&bhead->lock); 85 spin_lock(&bhead->lock);
39 tb = tw->tw_tb; 86 refcnt += inet_twsk_bind_unhash(tw, hashinfo);
40 __hlist_del(&tw->tw_bind_node);
41 tw->tw_tb = NULL;
42 inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
43 spin_unlock(&bhead->lock); 87 spin_unlock(&bhead->lock);
88
44#ifdef SOCK_REFCNT_DEBUG 89#ifdef SOCK_REFCNT_DEBUG
45 if (atomic_read(&tw->tw_refcnt) != 1) { 90 if (atomic_read(&tw->tw_refcnt) != 1) {
46 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", 91 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
47 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); 92 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
48 } 93 }
49#endif 94#endif
50 inet_twsk_put(tw); 95 while (refcnt) {
96 inet_twsk_put(tw);
97 refcnt--;
98 }
51} 99}
52 100
53static noinline void inet_twsk_free(struct inet_timewait_sock *tw) 101static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
@@ -86,7 +134,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
86 Note, that any socket with inet->num != 0 MUST be bound in 134 Note, that any socket with inet->num != 0 MUST be bound in
87 binding cache, even if it is closed. 135 binding cache, even if it is closed.
88 */ 136 */
89 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, 137 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
90 hashinfo->bhash_size)]; 138 hashinfo->bhash_size)];
91 spin_lock(&bhead->lock); 139 spin_lock(&bhead->lock);
92 tw->tw_tb = icsk->icsk_bind_hash; 140 tw->tw_tb = icsk->icsk_bind_hash;
@@ -101,16 +149,24 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
101 * Should be done before removing sk from established chain 149 * Should be done before removing sk from established chain
102 * because readers are lockless and search established first. 150 * because readers are lockless and search established first.
103 */ 151 */
104 atomic_inc(&tw->tw_refcnt);
105 inet_twsk_add_node_rcu(tw, &ehead->twchain); 152 inet_twsk_add_node_rcu(tw, &ehead->twchain);
106 153
107 /* Step 3: Remove SK from established hash. */ 154 /* Step 3: Remove SK from established hash. */
108 if (__sk_nulls_del_node_init_rcu(sk)) 155 if (__sk_nulls_del_node_init_rcu(sk))
109 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 156 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
110 157
158 /*
159 * Notes :
160 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
161 * - We add one reference for the bhash link
162 * - We add one reference for the ehash link
163 * - We want this refcnt update done before allowing other
164 * threads to find this tw in ehash chain.
165 */
166 atomic_add(1 + 1 + 1, &tw->tw_refcnt);
167
111 spin_unlock(lock); 168 spin_unlock(lock);
112} 169}
113
114EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 170EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
115 171
116struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) 172struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
@@ -124,14 +180,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
124 kmemcheck_annotate_bitfield(tw, flags); 180 kmemcheck_annotate_bitfield(tw, flags);
125 181
126 /* Give us an identity. */ 182 /* Give us an identity. */
127 tw->tw_daddr = inet->daddr; 183 tw->tw_daddr = inet->inet_daddr;
128 tw->tw_rcv_saddr = inet->rcv_saddr; 184 tw->tw_rcv_saddr = inet->inet_rcv_saddr;
129 tw->tw_bound_dev_if = sk->sk_bound_dev_if; 185 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
130 tw->tw_num = inet->num; 186 tw->tw_num = inet->inet_num;
131 tw->tw_state = TCP_TIME_WAIT; 187 tw->tw_state = TCP_TIME_WAIT;
132 tw->tw_substate = state; 188 tw->tw_substate = state;
133 tw->tw_sport = inet->sport; 189 tw->tw_sport = inet->inet_sport;
134 tw->tw_dport = inet->dport; 190 tw->tw_dport = inet->inet_dport;
135 tw->tw_family = sk->sk_family; 191 tw->tw_family = sk->sk_family;
136 tw->tw_reuse = sk->sk_reuse; 192 tw->tw_reuse = sk->sk_reuse;
137 tw->tw_hash = sk->sk_hash; 193 tw->tw_hash = sk->sk_hash;
@@ -139,14 +195,18 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
139 tw->tw_transparent = inet->transparent; 195 tw->tw_transparent = inet->transparent;
140 tw->tw_prot = sk->sk_prot_creator; 196 tw->tw_prot = sk->sk_prot_creator;
141 twsk_net_set(tw, hold_net(sock_net(sk))); 197 twsk_net_set(tw, hold_net(sock_net(sk)));
142 atomic_set(&tw->tw_refcnt, 1); 198 /*
199 * Because we use RCU lookups, we should not set tw_refcnt
200 * to a non null value before everything is setup for this
201 * timewait socket.
202 */
203 atomic_set(&tw->tw_refcnt, 0);
143 inet_twsk_dead_node_init(tw); 204 inet_twsk_dead_node_init(tw);
144 __module_get(tw->tw_prot->owner); 205 __module_get(tw->tw_prot->owner);
145 } 206 }
146 207
147 return tw; 208 return tw;
148} 209}
149
150EXPORT_SYMBOL_GPL(inet_twsk_alloc); 210EXPORT_SYMBOL_GPL(inet_twsk_alloc);
151 211
152/* Returns non-zero if quota exceeded. */ 212/* Returns non-zero if quota exceeded. */
@@ -218,14 +278,13 @@ void inet_twdr_hangman(unsigned long data)
218 /* We purged the entire slot, anything left? */ 278 /* We purged the entire slot, anything left? */
219 if (twdr->tw_count) 279 if (twdr->tw_count)
220 need_timer = 1; 280 need_timer = 1;
281 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
221 } 282 }
222 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
223 if (need_timer) 283 if (need_timer)
224 mod_timer(&twdr->tw_timer, jiffies + twdr->period); 284 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
225out: 285out:
226 spin_unlock(&twdr->death_lock); 286 spin_unlock(&twdr->death_lock);
227} 287}
228
229EXPORT_SYMBOL_GPL(inet_twdr_hangman); 288EXPORT_SYMBOL_GPL(inet_twdr_hangman);
230 289
231void inet_twdr_twkill_work(struct work_struct *work) 290void inet_twdr_twkill_work(struct work_struct *work)
@@ -256,7 +315,6 @@ void inet_twdr_twkill_work(struct work_struct *work)
256 spin_unlock_bh(&twdr->death_lock); 315 spin_unlock_bh(&twdr->death_lock);
257 } 316 }
258} 317}
259
260EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); 318EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
261 319
262/* These are always called from BH context. See callers in 320/* These are always called from BH context. See callers in
@@ -276,7 +334,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
276 spin_unlock(&twdr->death_lock); 334 spin_unlock(&twdr->death_lock);
277 __inet_twsk_kill(tw, twdr->hashinfo); 335 __inet_twsk_kill(tw, twdr->hashinfo);
278} 336}
279
280EXPORT_SYMBOL(inet_twsk_deschedule); 337EXPORT_SYMBOL(inet_twsk_deschedule);
281 338
282void inet_twsk_schedule(struct inet_timewait_sock *tw, 339void inet_twsk_schedule(struct inet_timewait_sock *tw,
@@ -357,7 +414,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
357 mod_timer(&twdr->tw_timer, jiffies + twdr->period); 414 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
358 spin_unlock(&twdr->death_lock); 415 spin_unlock(&twdr->death_lock);
359} 416}
360
361EXPORT_SYMBOL_GPL(inet_twsk_schedule); 417EXPORT_SYMBOL_GPL(inet_twsk_schedule);
362 418
363void inet_twdr_twcal_tick(unsigned long data) 419void inet_twdr_twcal_tick(unsigned long data)
@@ -418,40 +474,48 @@ out:
418#endif 474#endif
419 spin_unlock(&twdr->death_lock); 475 spin_unlock(&twdr->death_lock);
420} 476}
421
422EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); 477EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
423 478
424void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, 479void inet_twsk_purge(struct inet_hashinfo *hashinfo,
425 struct inet_timewait_death_row *twdr, int family) 480 struct inet_timewait_death_row *twdr, int family)
426{ 481{
427 struct inet_timewait_sock *tw; 482 struct inet_timewait_sock *tw;
428 struct sock *sk; 483 struct sock *sk;
429 struct hlist_nulls_node *node; 484 struct hlist_nulls_node *node;
430 int h; 485 unsigned int slot;
431 486
432 local_bh_disable(); 487 for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
433 for (h = 0; h < (hashinfo->ehash_size); h++) { 488 struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
434 struct inet_ehash_bucket *head = 489restart_rcu:
435 inet_ehash_bucket(hashinfo, h); 490 rcu_read_lock();
436 spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
437restart: 491restart:
438 spin_lock(lock); 492 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
439 sk_nulls_for_each(sk, node, &head->twchain) {
440
441 tw = inet_twsk(sk); 493 tw = inet_twsk(sk);
442 if (!net_eq(twsk_net(tw), net) || 494 if ((tw->tw_family != family) ||
443 tw->tw_family != family) 495 atomic_read(&twsk_net(tw)->count))
444 continue; 496 continue;
445 497
446 atomic_inc(&tw->tw_refcnt); 498 if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
447 spin_unlock(lock); 499 continue;
500
501 if (unlikely((tw->tw_family != family) ||
502 atomic_read(&twsk_net(tw)->count))) {
503 inet_twsk_put(tw);
504 goto restart;
505 }
506
507 rcu_read_unlock();
448 inet_twsk_deschedule(tw, twdr); 508 inet_twsk_deschedule(tw, twdr);
449 inet_twsk_put(tw); 509 inet_twsk_put(tw);
450 510 goto restart_rcu;
451 goto restart;
452 } 511 }
453 spin_unlock(lock); 512 /* If the nulls value we got at the end of this lookup is
513 * not the expected one, we must restart lookup.
514 * We probably met an item that was moved to another chain.
515 */
516 if (get_nulls_value(node) != slot)
517 goto restart;
518 rcu_read_unlock();
454 } 519 }
455 local_bh_enable();
456} 520}
457EXPORT_SYMBOL_GPL(inet_twsk_purge); 521EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index b1fbe18feb5a..d9bc85751c74 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -51,8 +51,8 @@
51 * lookups performed with disabled BHs. 51 * lookups performed with disabled BHs.
52 * 52 *
53 * Serialisation issues. 53 * Serialisation issues.
54 * 1. Nodes may appear in the tree only with the pool write lock held. 54 * 1. Nodes may appear in the tree only with the pool lock held.
55 * 2. Nodes may disappear from the tree only with the pool write lock held 55 * 2. Nodes may disappear from the tree only with the pool lock held
56 * AND reference count being 0. 56 * AND reference count being 0.
57 * 3. Nodes appears and disappears from unused node list only under 57 * 3. Nodes appears and disappears from unused node list only under
58 * "inet_peer_unused_lock". 58 * "inet_peer_unused_lock".
@@ -63,27 +63,42 @@
63 * refcnt: atomically against modifications on other CPU; 63 * refcnt: atomically against modifications on other CPU;
64 * usually under some other lock to prevent node disappearing 64 * usually under some other lock to prevent node disappearing
65 * dtime: unused node list lock 65 * dtime: unused node list lock
66 * v4daddr: unchangeable 66 * daddr: unchangeable
67 * ip_id_count: idlock 67 * ip_id_count: atomic value (no lock needed)
68 */ 68 */
69 69
70/* Exported for inet_getid inline function. */
71DEFINE_SPINLOCK(inet_peer_idlock);
72
73static struct kmem_cache *peer_cachep __read_mostly; 70static struct kmem_cache *peer_cachep __read_mostly;
74 71
75#define node_height(x) x->avl_height 72#define node_height(x) x->avl_height
76static struct inet_peer peer_fake_node = { 73
77 .avl_left = &peer_fake_node, 74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
78 .avl_right = &peer_fake_node, 75#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
76static const struct inet_peer peer_fake_node = {
77 .avl_left = peer_avl_empty_rcu,
78 .avl_right = peer_avl_empty_rcu,
79 .avl_height = 0 79 .avl_height = 0
80}; 80};
81#define peer_avl_empty (&peer_fake_node) 81
82static struct inet_peer *peer_root = peer_avl_empty; 82struct inet_peer_base {
83static DEFINE_RWLOCK(peer_pool_lock); 83 struct inet_peer __rcu *root;
84 spinlock_t lock;
85 int total;
86};
87
88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu,
90 .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock),
91 .total = 0,
92};
93
94static struct inet_peer_base v6_peers = {
95 .root = peer_avl_empty_rcu,
96 .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock),
97 .total = 0,
98};
99
84#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 100#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
85 101
86static int peer_total;
87/* Exported for sysctl_net_ipv4. */ 102/* Exported for sysctl_net_ipv4. */
88int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more 103int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
89 * aggressively at this stage */ 104 * aggressively at this stage */
@@ -92,8 +107,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
92int inet_peer_gc_mintime __read_mostly = 10 * HZ; 107int inet_peer_gc_mintime __read_mostly = 10 * HZ;
93int inet_peer_gc_maxtime __read_mostly = 120 * HZ; 108int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
94 109
95static LIST_HEAD(unused_peers); 110static struct {
96static DEFINE_SPINLOCK(inet_peer_unused_lock); 111 struct list_head list;
112 spinlock_t lock;
113} unused_peers = {
114 .list = LIST_HEAD_INIT(unused_peers.list),
115 .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock),
116};
97 117
98static void peer_check_expire(unsigned long dummy); 118static void peer_check_expire(unsigned long dummy);
99static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); 119static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
@@ -119,7 +139,7 @@ void __init inet_initpeers(void)
119 139
120 peer_cachep = kmem_cache_create("inet_peer_cache", 140 peer_cachep = kmem_cache_create("inet_peer_cache",
121 sizeof(struct inet_peer), 141 sizeof(struct inet_peer),
122 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 142 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
123 NULL); 143 NULL);
124 144
125 /* All the timers, started at system startup tend 145 /* All the timers, started at system startup tend
@@ -134,122 +154,194 @@ void __init inet_initpeers(void)
134/* Called with or without local BH being disabled. */ 154/* Called with or without local BH being disabled. */
135static void unlink_from_unused(struct inet_peer *p) 155static void unlink_from_unused(struct inet_peer *p)
136{ 156{
137 spin_lock_bh(&inet_peer_unused_lock); 157 if (!list_empty(&p->unused)) {
138 list_del_init(&p->unused); 158 spin_lock_bh(&unused_peers.lock);
139 spin_unlock_bh(&inet_peer_unused_lock); 159 list_del_init(&p->unused);
160 spin_unlock_bh(&unused_peers.lock);
161 }
162}
163
164static int addr_compare(const struct inetpeer_addr *a,
165 const struct inetpeer_addr *b)
166{
167 int i, n = (a->family == AF_INET ? 1 : 4);
168
169 for (i = 0; i < n; i++) {
170 if (a->a6[i] == b->a6[i])
171 continue;
172 if (a->a6[i] < b->a6[i])
173 return -1;
174 return 1;
175 }
176
177 return 0;
140} 178}
141 179
142/* 180/*
143 * Called with local BH disabled and the pool lock held. 181 * Called with local BH disabled and the pool lock held.
144 * _stack is known to be NULL or not at compile time,
145 * so compiler will optimize the if (_stack) tests.
146 */ 182 */
147#define lookup(_daddr, _stack) \ 183#define lookup(_daddr, _stack, _base) \
148({ \ 184({ \
149 struct inet_peer *u, **v; \ 185 struct inet_peer *u; \
150 if (_stack != NULL) { \ 186 struct inet_peer __rcu **v; \
151 stackptr = _stack; \ 187 \
152 *stackptr++ = &peer_root; \ 188 stackptr = _stack; \
153 } \ 189 *stackptr++ = &_base->root; \
154 for (u = peer_root; u != peer_avl_empty; ) { \ 190 for (u = rcu_dereference_protected(_base->root, \
155 if (_daddr == u->v4daddr) \ 191 lockdep_is_held(&_base->lock)); \
192 u != peer_avl_empty; ) { \
193 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \
156 break; \ 195 break; \
157 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 196 if (cmp == -1) \
158 v = &u->avl_left; \ 197 v = &u->avl_left; \
159 else \ 198 else \
160 v = &u->avl_right; \ 199 v = &u->avl_right; \
161 if (_stack != NULL) \ 200 *stackptr++ = v; \
162 *stackptr++ = v; \ 201 u = rcu_dereference_protected(*v, \
163 u = *v; \ 202 lockdep_is_held(&_base->lock)); \
164 } \ 203 } \
165 u; \ 204 u; \
166}) 205})
167 206
168/* Called with local BH disabled and the pool write lock held. */ 207/*
169#define lookup_rightempty(start) \ 208 * Called with rcu_read_lock_bh()
209 * Because we hold no lock against a writer, its quite possible we fall
210 * in an endless loop.
211 * But every pointer we follow is guaranteed to be valid thanks to RCU.
212 * We exit from this function if number of links exceeds PEER_MAXDEPTH
213 */
214static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
215 struct inet_peer_base *base)
216{
217 struct inet_peer *u = rcu_dereference_bh(base->root);
218 int count = 0;
219
220 while (u != peer_avl_empty) {
221 int cmp = addr_compare(daddr, &u->daddr);
222 if (cmp == 0) {
223 /* Before taking a reference, check if this entry was
224 * deleted, unlink_from_pool() sets refcnt=-1 to make
225 * distinction between an unused entry (refcnt=0) and
226 * a freed one.
227 */
228 if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
229 u = NULL;
230 return u;
231 }
232 if (cmp == -1)
233 u = rcu_dereference_bh(u->avl_left);
234 else
235 u = rcu_dereference_bh(u->avl_right);
236 if (unlikely(++count == PEER_MAXDEPTH))
237 break;
238 }
239 return NULL;
240}
241
242/* Called with local BH disabled and the pool lock held. */
243#define lookup_rightempty(start, base) \
170({ \ 244({ \
171 struct inet_peer *u, **v; \ 245 struct inet_peer *u; \
246 struct inet_peer __rcu **v; \
172 *stackptr++ = &start->avl_left; \ 247 *stackptr++ = &start->avl_left; \
173 v = &start->avl_left; \ 248 v = &start->avl_left; \
174 for (u = *v; u->avl_right != peer_avl_empty; ) { \ 249 for (u = rcu_dereference_protected(*v, \
250 lockdep_is_held(&base->lock)); \
251 u->avl_right != peer_avl_empty_rcu; ) { \
175 v = &u->avl_right; \ 252 v = &u->avl_right; \
176 *stackptr++ = v; \ 253 *stackptr++ = v; \
177 u = *v; \ 254 u = rcu_dereference_protected(*v, \
255 lockdep_is_held(&base->lock)); \
178 } \ 256 } \
179 u; \ 257 u; \
180}) 258})
181 259
182/* Called with local BH disabled and the pool write lock held. 260/* Called with local BH disabled and the pool lock held.
183 * Variable names are the proof of operation correctness. 261 * Variable names are the proof of operation correctness.
184 * Look into mm/map_avl.c for more detail description of the ideas. */ 262 * Look into mm/map_avl.c for more detail description of the ideas.
185static void peer_avl_rebalance(struct inet_peer **stack[], 263 */
186 struct inet_peer ***stackend) 264static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
265 struct inet_peer __rcu ***stackend,
266 struct inet_peer_base *base)
187{ 267{
188 struct inet_peer **nodep, *node, *l, *r; 268 struct inet_peer __rcu **nodep;
269 struct inet_peer *node, *l, *r;
189 int lh, rh; 270 int lh, rh;
190 271
191 while (stackend > stack) { 272 while (stackend > stack) {
192 nodep = *--stackend; 273 nodep = *--stackend;
193 node = *nodep; 274 node = rcu_dereference_protected(*nodep,
194 l = node->avl_left; 275 lockdep_is_held(&base->lock));
195 r = node->avl_right; 276 l = rcu_dereference_protected(node->avl_left,
277 lockdep_is_held(&base->lock));
278 r = rcu_dereference_protected(node->avl_right,
279 lockdep_is_held(&base->lock));
196 lh = node_height(l); 280 lh = node_height(l);
197 rh = node_height(r); 281 rh = node_height(r);
198 if (lh > rh + 1) { /* l: RH+2 */ 282 if (lh > rh + 1) { /* l: RH+2 */
199 struct inet_peer *ll, *lr, *lrl, *lrr; 283 struct inet_peer *ll, *lr, *lrl, *lrr;
200 int lrh; 284 int lrh;
201 ll = l->avl_left; 285 ll = rcu_dereference_protected(l->avl_left,
202 lr = l->avl_right; 286 lockdep_is_held(&base->lock));
287 lr = rcu_dereference_protected(l->avl_right,
288 lockdep_is_held(&base->lock));
203 lrh = node_height(lr); 289 lrh = node_height(lr);
204 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 290 if (lrh <= node_height(ll)) { /* ll: RH+1 */
205 node->avl_left = lr; /* lr: RH or RH+1 */ 291 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
206 node->avl_right = r; /* r: RH */ 292 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
207 node->avl_height = lrh + 1; /* RH+1 or RH+2 */ 293 node->avl_height = lrh + 1; /* RH+1 or RH+2 */
208 l->avl_left = ll; /* ll: RH+1 */ 294 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
209 l->avl_right = node; /* node: RH+1 or RH+2 */ 295 RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
210 l->avl_height = node->avl_height + 1; 296 l->avl_height = node->avl_height + 1;
211 *nodep = l; 297 RCU_INIT_POINTER(*nodep, l);
212 } else { /* ll: RH, lr: RH+1 */ 298 } else { /* ll: RH, lr: RH+1 */
213 lrl = lr->avl_left; /* lrl: RH or RH-1 */ 299 lrl = rcu_dereference_protected(lr->avl_left,
214 lrr = lr->avl_right; /* lrr: RH or RH-1 */ 300 lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */
215 node->avl_left = lrr; /* lrr: RH or RH-1 */ 301 lrr = rcu_dereference_protected(lr->avl_right,
216 node->avl_right = r; /* r: RH */ 302 lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
303 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
304 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
217 node->avl_height = rh + 1; /* node: RH+1 */ 305 node->avl_height = rh + 1; /* node: RH+1 */
218 l->avl_left = ll; /* ll: RH */ 306 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
219 l->avl_right = lrl; /* lrl: RH or RH-1 */ 307 RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
220 l->avl_height = rh + 1; /* l: RH+1 */ 308 l->avl_height = rh + 1; /* l: RH+1 */
221 lr->avl_left = l; /* l: RH+1 */ 309 RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
222 lr->avl_right = node; /* node: RH+1 */ 310 RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
223 lr->avl_height = rh + 2; 311 lr->avl_height = rh + 2;
224 *nodep = lr; 312 RCU_INIT_POINTER(*nodep, lr);
225 } 313 }
226 } else if (rh > lh + 1) { /* r: LH+2 */ 314 } else if (rh > lh + 1) { /* r: LH+2 */
227 struct inet_peer *rr, *rl, *rlr, *rll; 315 struct inet_peer *rr, *rl, *rlr, *rll;
228 int rlh; 316 int rlh;
229 rr = r->avl_right; 317 rr = rcu_dereference_protected(r->avl_right,
230 rl = r->avl_left; 318 lockdep_is_held(&base->lock));
319 rl = rcu_dereference_protected(r->avl_left,
320 lockdep_is_held(&base->lock));
231 rlh = node_height(rl); 321 rlh = node_height(rl);
232 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 322 if (rlh <= node_height(rr)) { /* rr: LH+1 */
233 node->avl_right = rl; /* rl: LH or LH+1 */ 323 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
234 node->avl_left = l; /* l: LH */ 324 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
235 node->avl_height = rlh + 1; /* LH+1 or LH+2 */ 325 node->avl_height = rlh + 1; /* LH+1 or LH+2 */
236 r->avl_right = rr; /* rr: LH+1 */ 326 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
237 r->avl_left = node; /* node: LH+1 or LH+2 */ 327 RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
238 r->avl_height = node->avl_height + 1; 328 r->avl_height = node->avl_height + 1;
239 *nodep = r; 329 RCU_INIT_POINTER(*nodep, r);
240 } else { /* rr: RH, rl: RH+1 */ 330 } else { /* rr: RH, rl: RH+1 */
241 rlr = rl->avl_right; /* rlr: LH or LH-1 */ 331 rlr = rcu_dereference_protected(rl->avl_right,
242 rll = rl->avl_left; /* rll: LH or LH-1 */ 332 lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */
243 node->avl_right = rll; /* rll: LH or LH-1 */ 333 rll = rcu_dereference_protected(rl->avl_left,
244 node->avl_left = l; /* l: LH */ 334 lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
335 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
336 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
245 node->avl_height = lh + 1; /* node: LH+1 */ 337 node->avl_height = lh + 1; /* node: LH+1 */
246 r->avl_right = rr; /* rr: LH */ 338 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
247 r->avl_left = rlr; /* rlr: LH or LH-1 */ 339 RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
248 r->avl_height = lh + 1; /* r: LH+1 */ 340 r->avl_height = lh + 1; /* r: LH+1 */
249 rl->avl_right = r; /* r: LH+1 */ 341 RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
250 rl->avl_left = node; /* node: LH+1 */ 342 RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
251 rl->avl_height = lh + 2; 343 rl->avl_height = lh + 2;
252 *nodep = rl; 344 RCU_INIT_POINTER(*nodep, rl);
253 } 345 }
254 } else { 346 } else {
255 node->avl_height = (lh > rh ? lh : rh) + 1; 347 node->avl_height = (lh > rh ? lh : rh) + 1;
@@ -257,88 +349,107 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
257 } 349 }
258} 350}
259 351
260/* Called with local BH disabled and the pool write lock held. */ 352/* Called with local BH disabled and the pool lock held. */
261#define link_to_pool(n) \ 353#define link_to_pool(n, base) \
262do { \ 354do { \
263 n->avl_height = 1; \ 355 n->avl_height = 1; \
264 n->avl_left = peer_avl_empty; \ 356 n->avl_left = peer_avl_empty_rcu; \
265 n->avl_right = peer_avl_empty; \ 357 n->avl_right = peer_avl_empty_rcu; \
266 **--stackptr = n; \ 358 /* lockless readers can catch us now */ \
267 peer_avl_rebalance(stack, stackptr); \ 359 rcu_assign_pointer(**--stackptr, n); \
268} while(0) 360 peer_avl_rebalance(stack, stackptr, base); \
361} while (0)
362
363static void inetpeer_free_rcu(struct rcu_head *head)
364{
365 kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
366}
269 367
270/* May be called with local BH enabled. */ 368/* May be called with local BH enabled. */
271static void unlink_from_pool(struct inet_peer *p) 369static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
272{ 370{
273 int do_free; 371 int do_free;
274 372
275 do_free = 0; 373 do_free = 0;
276 374
277 write_lock_bh(&peer_pool_lock); 375 spin_lock_bh(&base->lock);
278 /* Check the reference counter. It was artificially incremented by 1 376 /* Check the reference counter. It was artificially incremented by 1
279 * in cleanup() function to prevent sudden disappearing. If the 377 * in cleanup() function to prevent sudden disappearing. If we can
280 * reference count is still 1 then the node is referenced only as `p' 378 * atomically (because of lockless readers) take this last reference,
281 * here and from the pool. So under the exclusive pool lock it's safe 379 * it's safe to remove the node and free it later.
282 * to remove the node and free it later. */ 380 * We use refcnt=-1 to alert lockless readers this entry is deleted.
283 if (atomic_read(&p->refcnt) == 1) { 381 */
284 struct inet_peer **stack[PEER_MAXDEPTH]; 382 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
285 struct inet_peer ***stackptr, ***delp; 383 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
286 if (lookup(p->v4daddr, stack) != p) 384 struct inet_peer __rcu ***stackptr, ***delp;
385 if (lookup(&p->daddr, stack, base) != p)
287 BUG(); 386 BUG();
288 delp = stackptr - 1; /* *delp[0] == p */ 387 delp = stackptr - 1; /* *delp[0] == p */
289 if (p->avl_left == peer_avl_empty) { 388 if (p->avl_left == peer_avl_empty_rcu) {
290 *delp[0] = p->avl_right; 389 *delp[0] = p->avl_right;
291 --stackptr; 390 --stackptr;
292 } else { 391 } else {
293 /* look for a node to insert instead of p */ 392 /* look for a node to insert instead of p */
294 struct inet_peer *t; 393 struct inet_peer *t;
295 t = lookup_rightempty(p); 394 t = lookup_rightempty(p, base);
296 BUG_ON(*stackptr[-1] != t); 395 BUG_ON(rcu_dereference_protected(*stackptr[-1],
396 lockdep_is_held(&base->lock)) != t);
297 **--stackptr = t->avl_left; 397 **--stackptr = t->avl_left;
298 /* t is removed, t->v4daddr > x->v4daddr for any 398 /* t is removed, t->daddr > x->daddr for any
299 * x in p->avl_left subtree. 399 * x in p->avl_left subtree.
300 * Put t in the old place of p. */ 400 * Put t in the old place of p. */
301 *delp[0] = t; 401 RCU_INIT_POINTER(*delp[0], t);
302 t->avl_left = p->avl_left; 402 t->avl_left = p->avl_left;
303 t->avl_right = p->avl_right; 403 t->avl_right = p->avl_right;
304 t->avl_height = p->avl_height; 404 t->avl_height = p->avl_height;
305 BUG_ON(delp[1] != &p->avl_left); 405 BUG_ON(delp[1] != &p->avl_left);
306 delp[1] = &t->avl_left; /* was &p->avl_left */ 406 delp[1] = &t->avl_left; /* was &p->avl_left */
307 } 407 }
308 peer_avl_rebalance(stack, stackptr); 408 peer_avl_rebalance(stack, stackptr, base);
309 peer_total--; 409 base->total--;
310 do_free = 1; 410 do_free = 1;
311 } 411 }
312 write_unlock_bh(&peer_pool_lock); 412 spin_unlock_bh(&base->lock);
313 413
314 if (do_free) 414 if (do_free)
315 kmem_cache_free(peer_cachep, p); 415 call_rcu_bh(&p->rcu, inetpeer_free_rcu);
316 else 416 else
317 /* The node is used again. Decrease the reference counter 417 /* The node is used again. Decrease the reference counter
318 * back. The loop "cleanup -> unlink_from_unused 418 * back. The loop "cleanup -> unlink_from_unused
319 * -> unlink_from_pool -> putpeer -> link_to_unused 419 * -> unlink_from_pool -> putpeer -> link_to_unused
320 * -> cleanup (for the same node)" 420 * -> cleanup (for the same node)"
321 * doesn't really exist because the entry will have a 421 * doesn't really exist because the entry will have a
322 * recent deletion time and will not be cleaned again soon. */ 422 * recent deletion time and will not be cleaned again soon.
423 */
323 inet_putpeer(p); 424 inet_putpeer(p);
324} 425}
325 426
427static struct inet_peer_base *family_to_base(int family)
428{
429 return (family == AF_INET ? &v4_peers : &v6_peers);
430}
431
432static struct inet_peer_base *peer_to_base(struct inet_peer *p)
433{
434 return family_to_base(p->daddr.family);
435}
436
326/* May be called with local BH enabled. */ 437/* May be called with local BH enabled. */
327static int cleanup_once(unsigned long ttl) 438static int cleanup_once(unsigned long ttl)
328{ 439{
329 struct inet_peer *p = NULL; 440 struct inet_peer *p = NULL;
330 441
331 /* Remove the first entry from the list of unused nodes. */ 442 /* Remove the first entry from the list of unused nodes. */
332 spin_lock_bh(&inet_peer_unused_lock); 443 spin_lock_bh(&unused_peers.lock);
333 if (!list_empty(&unused_peers)) { 444 if (!list_empty(&unused_peers.list)) {
334 __u32 delta; 445 __u32 delta;
335 446
336 p = list_first_entry(&unused_peers, struct inet_peer, unused); 447 p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
337 delta = (__u32)jiffies - p->dtime; 448 delta = (__u32)jiffies - p->dtime;
338 449
339 if (delta < ttl) { 450 if (delta < ttl) {
340 /* Do not prune fresh entries. */ 451 /* Do not prune fresh entries. */
341 spin_unlock_bh(&inet_peer_unused_lock); 452 spin_unlock_bh(&unused_peers.lock);
342 return -1; 453 return -1;
343 } 454 }
344 455
@@ -348,7 +459,7 @@ static int cleanup_once(unsigned long ttl)
348 * before unlink_from_pool() call. */ 459 * before unlink_from_pool() call. */
349 atomic_inc(&p->refcnt); 460 atomic_inc(&p->refcnt);
350 } 461 }
351 spin_unlock_bh(&inet_peer_unused_lock); 462 spin_unlock_bh(&unused_peers.lock);
352 463
353 if (p == NULL) 464 if (p == NULL)
354 /* It means that the total number of USED entries has 465 /* It means that the total number of USED entries has
@@ -356,84 +467,86 @@ static int cleanup_once(unsigned long ttl)
356 * happen because of entry limits in route cache. */ 467 * happen because of entry limits in route cache. */
357 return -1; 468 return -1;
358 469
359 unlink_from_pool(p); 470 unlink_from_pool(p, peer_to_base(p));
360 return 0; 471 return 0;
361} 472}
362 473
363/* Called with or without local BH being disabled. */ 474/* Called with or without local BH being disabled. */
364struct inet_peer *inet_getpeer(__be32 daddr, int create) 475struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
365{ 476{
366 struct inet_peer *p, *n; 477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
367 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 478 struct inet_peer_base *base = family_to_base(AF_INET);
479 struct inet_peer *p;
368 480
369 /* Look up for the address quickly. */ 481 /* Look up for the address quickly, lockless.
370 read_lock_bh(&peer_pool_lock); 482 * Because of a concurrent writer, we might not find an existing entry.
371 p = lookup(daddr, NULL); 483 */
372 if (p != peer_avl_empty) 484 rcu_read_lock_bh();
373 atomic_inc(&p->refcnt); 485 p = lookup_rcu_bh(daddr, base);
374 read_unlock_bh(&peer_pool_lock); 486 rcu_read_unlock_bh();
487
488 if (p) {
489 /* The existing node has been found.
490 * Remove the entry from unused list if it was there.
491 */
492 unlink_from_unused(p);
493 return p;
494 }
375 495
496 /* retry an exact lookup, taking the lock before.
497 * At least, nodes should be hot in our cache.
498 */
499 spin_lock_bh(&base->lock);
500 p = lookup(daddr, stack, base);
376 if (p != peer_avl_empty) { 501 if (p != peer_avl_empty) {
377 /* The existing node has been found. */ 502 atomic_inc(&p->refcnt);
503 spin_unlock_bh(&base->lock);
378 /* Remove the entry from unused list if it was there. */ 504 /* Remove the entry from unused list if it was there. */
379 unlink_from_unused(p); 505 unlink_from_unused(p);
380 return p; 506 return p;
381 } 507 }
508 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
509 if (p) {
510 p->daddr = *daddr;
511 atomic_set(&p->refcnt, 1);
512 atomic_set(&p->rid, 0);
513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
514 p->tcp_ts_stamp = 0;
515 INIT_LIST_HEAD(&p->unused);
516
517
518 /* Link the node. */
519 link_to_pool(p, base);
520 base->total++;
521 }
522 spin_unlock_bh(&base->lock);
382 523
383 if (!create) 524 if (base->total >= inet_peer_threshold)
384 return NULL;
385
386 /* Allocate the space outside the locked region. */
387 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
388 if (n == NULL)
389 return NULL;
390 n->v4daddr = daddr;
391 atomic_set(&n->refcnt, 1);
392 atomic_set(&n->rid, 0);
393 n->ip_id_count = secure_ip_id(daddr);
394 n->tcp_ts_stamp = 0;
395
396 write_lock_bh(&peer_pool_lock);
397 /* Check if an entry has suddenly appeared. */
398 p = lookup(daddr, stack);
399 if (p != peer_avl_empty)
400 goto out_free;
401
402 /* Link the node. */
403 link_to_pool(n);
404 INIT_LIST_HEAD(&n->unused);
405 peer_total++;
406 write_unlock_bh(&peer_pool_lock);
407
408 if (peer_total >= inet_peer_threshold)
409 /* Remove one less-recently-used entry. */ 525 /* Remove one less-recently-used entry. */
410 cleanup_once(0); 526 cleanup_once(0);
411 527
412 return n;
413
414out_free:
415 /* The appropriate node is already in the pool. */
416 atomic_inc(&p->refcnt);
417 write_unlock_bh(&peer_pool_lock);
418 /* Remove the entry from unused list if it was there. */
419 unlink_from_unused(p);
420 /* Free preallocated the preallocated node. */
421 kmem_cache_free(peer_cachep, n);
422 return p; 528 return p;
423} 529}
424 530
531static int compute_total(void)
532{
533 return v4_peers.total + v6_peers.total;
534}
535EXPORT_SYMBOL_GPL(inet_getpeer);
536
425/* Called with local BH disabled. */ 537/* Called with local BH disabled. */
426static void peer_check_expire(unsigned long dummy) 538static void peer_check_expire(unsigned long dummy)
427{ 539{
428 unsigned long now = jiffies; 540 unsigned long now = jiffies;
429 int ttl; 541 int ttl, total;
430 542
431 if (peer_total >= inet_peer_threshold) 543 total = compute_total();
544 if (total >= inet_peer_threshold)
432 ttl = inet_peer_minttl; 545 ttl = inet_peer_minttl;
433 else 546 else
434 ttl = inet_peer_maxttl 547 ttl = inet_peer_maxttl
435 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 548 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
436 peer_total / inet_peer_threshold * HZ; 549 total / inet_peer_threshold * HZ;
437 while (!cleanup_once(ttl)) { 550 while (!cleanup_once(ttl)) {
438 if (jiffies != now) 551 if (jiffies != now)
439 break; 552 break;
@@ -442,22 +555,27 @@ static void peer_check_expire(unsigned long dummy)
442 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 555 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
443 * interval depending on the total number of entries (more entries, 556 * interval depending on the total number of entries (more entries,
444 * less interval). */ 557 * less interval). */
445 if (peer_total >= inet_peer_threshold) 558 total = compute_total();
559 if (total >= inet_peer_threshold)
446 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; 560 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
447 else 561 else
448 peer_periodic_timer.expires = jiffies 562 peer_periodic_timer.expires = jiffies
449 + inet_peer_gc_maxtime 563 + inet_peer_gc_maxtime
450 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 564 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
451 peer_total / inet_peer_threshold * HZ; 565 total / inet_peer_threshold * HZ;
452 add_timer(&peer_periodic_timer); 566 add_timer(&peer_periodic_timer);
453} 567}
454 568
455void inet_putpeer(struct inet_peer *p) 569void inet_putpeer(struct inet_peer *p)
456{ 570{
457 spin_lock_bh(&inet_peer_unused_lock); 571 local_bh_disable();
458 if (atomic_dec_and_test(&p->refcnt)) { 572
459 list_add_tail(&p->unused, &unused_peers); 573 if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
574 list_add_tail(&p->unused, &unused_peers.list);
460 p->dtime = (__u32)jiffies; 575 p->dtime = (__u32)jiffies;
576 spin_unlock(&unused_peers.lock);
461 } 577 }
462 spin_unlock_bh(&inet_peer_unused_lock); 578
579 local_bh_enable();
463} 580}
581EXPORT_SYMBOL_GPL(inet_putpeer);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index a2991bc8e32e..99461f09320f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -25,6 +25,7 @@
25#include <linux/ip.h> 25#include <linux/ip.h>
26#include <linux/icmp.h> 26#include <linux/icmp.h>
27#include <linux/netdevice.h> 27#include <linux/netdevice.h>
28#include <linux/slab.h>
28#include <net/sock.h> 29#include <net/sock.h>
29#include <net/ip.h> 30#include <net/ip.h>
30#include <net/tcp.h> 31#include <net/tcp.h>
@@ -86,16 +87,16 @@ int ip_forward(struct sk_buff *skb)
86 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
87 goto sr_failed; 88 goto sr_failed;
88 89
89 if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && 90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
90 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { 91 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
91 IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); 92 IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
92 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 93 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
93 htonl(dst_mtu(&rt->u.dst))); 94 htonl(dst_mtu(&rt->dst)));
94 goto drop; 95 goto drop;
95 } 96 }
96 97
97 /* We are about to mangle packet. Copy it! */ 98 /* We are about to mangle packet. Copy it! */
98 if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) 99 if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
99 goto drop; 100 goto drop;
100 iph = ip_hdr(skb); 101 iph = ip_hdr(skb);
101 102
@@ -111,8 +112,8 @@ int ip_forward(struct sk_buff *skb)
111 112
112 skb->priority = rt_tos2priority(iph->tos); 113 skb->priority = rt_tos2priority(iph->tos);
113 114
114 return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, 115 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
115 ip_forward_finish); 116 rt->dst.dev, ip_forward_finish);
116 117
117sr_failed: 118sr_failed:
118 /* 119 /*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 575f9bd51ccd..a1151b8adf3c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -32,6 +32,9 @@
32#include <linux/netdevice.h> 32#include <linux/netdevice.h>
33#include <linux/jhash.h> 33#include <linux/jhash.h>
34#include <linux/random.h> 34#include <linux/random.h>
35#include <linux/slab.h>
36#include <net/route.h>
37#include <net/dst.h>
35#include <net/sock.h> 38#include <net/sock.h>
36#include <net/ip.h> 39#include <net/ip.h>
37#include <net/icmp.h> 40#include <net/icmp.h>
@@ -42,6 +45,7 @@
42#include <linux/udp.h> 45#include <linux/udp.h>
43#include <linux/inet.h> 46#include <linux/inet.h>
44#include <linux/netfilter_ipv4.h> 47#include <linux/netfilter_ipv4.h>
48#include <net/inet_ecn.h>
45 49
46/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 50/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
47 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c 51 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -67,11 +71,28 @@ struct ipq {
67 __be32 daddr; 71 __be32 daddr;
68 __be16 id; 72 __be16 id;
69 u8 protocol; 73 u8 protocol;
74 u8 ecn; /* RFC3168 support */
70 int iif; 75 int iif;
71 unsigned int rid; 76 unsigned int rid;
72 struct inet_peer *peer; 77 struct inet_peer *peer;
73}; 78};
74 79
80#define IPFRAG_ECN_CLEAR 0x01 /* one frag had INET_ECN_NOT_ECT */
81#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */
82
83static inline u8 ip4_frag_ecn(u8 tos)
84{
85 tos = (tos & INET_ECN_MASK) + 1;
86 /*
87 * After the last operation we have (in binary):
88 * INET_ECN_NOT_ECT => 001
89 * INET_ECN_ECT_1 => 010
90 * INET_ECN_ECT_0 => 011
91 * INET_ECN_CE => 100
92 */
93 return (tos & 2) ? 0 : tos;
94}
95
75static struct inet_frags ip4_frags; 96static struct inet_frags ip4_frags;
76 97
77int ip_frag_nqueues(struct net *net) 98int ip_frag_nqueues(struct net *net)
@@ -113,19 +134,16 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
113 struct ip4_create_arg *arg = a; 134 struct ip4_create_arg *arg = a;
114 135
115 qp = container_of(q, struct ipq, q); 136 qp = container_of(q, struct ipq, q);
116 return (qp->id == arg->iph->id && 137 return qp->id == arg->iph->id &&
117 qp->saddr == arg->iph->saddr && 138 qp->saddr == arg->iph->saddr &&
118 qp->daddr == arg->iph->daddr && 139 qp->daddr == arg->iph->daddr &&
119 qp->protocol == arg->iph->protocol && 140 qp->protocol == arg->iph->protocol &&
120 qp->user == arg->user); 141 qp->user == arg->user;
121} 142}
122 143
123/* Memory Tracking Functions. */ 144/* Memory Tracking Functions. */
124static __inline__ void frag_kfree_skb(struct netns_frags *nf, 145static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
125 struct sk_buff *skb, int *work)
126{ 146{
127 if (work)
128 *work -= skb->truesize;
129 atomic_sub(skb->truesize, &nf->mem); 147 atomic_sub(skb->truesize, &nf->mem);
130 kfree_skb(skb); 148 kfree_skb(skb);
131} 149}
@@ -137,11 +155,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
137 155
138 qp->protocol = arg->iph->protocol; 156 qp->protocol = arg->iph->protocol;
139 qp->id = arg->iph->id; 157 qp->id = arg->iph->id;
158 qp->ecn = ip4_frag_ecn(arg->iph->tos);
140 qp->saddr = arg->iph->saddr; 159 qp->saddr = arg->iph->saddr;
141 qp->daddr = arg->iph->daddr; 160 qp->daddr = arg->iph->daddr;
142 qp->user = arg->user; 161 qp->user = arg->user;
143 qp->peer = sysctl_ipfrag_max_dist ? 162 qp->peer = sysctl_ipfrag_max_dist ?
144 inet_getpeer(arg->iph->saddr, 1) : NULL; 163 inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
145} 164}
146 165
147static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 166static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -205,11 +224,35 @@ static void ip_expire(unsigned long arg)
205 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 224 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
206 struct sk_buff *head = qp->q.fragments; 225 struct sk_buff *head = qp->q.fragments;
207 226
208 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 227 rcu_read_lock();
209 if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { 228 head->dev = dev_get_by_index_rcu(net, qp->iif);
210 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 229 if (!head->dev)
211 dev_put(head->dev); 230 goto out_rcu_unlock;
231
232 /*
233 * Only search router table for the head fragment,
234 * when defraging timeout at PRE_ROUTING HOOK.
235 */
236 if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
237 const struct iphdr *iph = ip_hdr(head);
238 int err = ip_route_input(head, iph->daddr, iph->saddr,
239 iph->tos, head->dev);
240 if (unlikely(err))
241 goto out_rcu_unlock;
242
243 /*
244 * Only an end host needs to send an ICMP
245 * "Fragment Reassembly Timeout" message, per RFC792.
246 */
247 if (skb_rtable(head)->rt_type != RTN_LOCAL)
248 goto out_rcu_unlock;
249
212 } 250 }
251
252 /* Send an ICMP "Fragment Reassembly Timeout" message. */
253 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
254out_rcu_unlock:
255 rcu_read_unlock();
213 } 256 }
214out: 257out:
215 spin_unlock(&qp->q.lock); 258 spin_unlock(&qp->q.lock);
@@ -282,7 +325,7 @@ static int ip_frag_reinit(struct ipq *qp)
282 fp = qp->q.fragments; 325 fp = qp->q.fragments;
283 do { 326 do {
284 struct sk_buff *xp = fp->next; 327 struct sk_buff *xp = fp->next;
285 frag_kfree_skb(qp->q.net, fp, NULL); 328 frag_kfree_skb(qp->q.net, fp);
286 fp = xp; 329 fp = xp;
287 } while (fp); 330 } while (fp);
288 331
@@ -290,7 +333,9 @@ static int ip_frag_reinit(struct ipq *qp)
290 qp->q.len = 0; 333 qp->q.len = 0;
291 qp->q.meat = 0; 334 qp->q.meat = 0;
292 qp->q.fragments = NULL; 335 qp->q.fragments = NULL;
336 qp->q.fragments_tail = NULL;
293 qp->iif = 0; 337 qp->iif = 0;
338 qp->ecn = 0;
294 339
295 return 0; 340 return 0;
296} 341}
@@ -303,6 +348,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
303 int flags, offset; 348 int flags, offset;
304 int ihl, end; 349 int ihl, end;
305 int err = -ENOENT; 350 int err = -ENOENT;
351 u8 ecn;
306 352
307 if (qp->q.last_in & INET_FRAG_COMPLETE) 353 if (qp->q.last_in & INET_FRAG_COMPLETE)
308 goto err; 354 goto err;
@@ -314,6 +360,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
314 goto err; 360 goto err;
315 } 361 }
316 362
363 ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
317 offset = ntohs(ip_hdr(skb)->frag_off); 364 offset = ntohs(ip_hdr(skb)->frag_off);
318 flags = offset & ~IP_OFFSET; 365 flags = offset & ~IP_OFFSET;
319 offset &= IP_OFFSET; 366 offset &= IP_OFFSET;
@@ -362,6 +409,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
362 * in the chain of fragments so far. We must know where to put 409 * in the chain of fragments so far. We must know where to put
363 * this fragment, right? 410 * this fragment, right?
364 */ 411 */
412 prev = qp->q.fragments_tail;
413 if (!prev || FRAG_CB(prev)->offset < offset) {
414 next = NULL;
415 goto found;
416 }
365 prev = NULL; 417 prev = NULL;
366 for (next = qp->q.fragments; next != NULL; next = next->next) { 418 for (next = qp->q.fragments; next != NULL; next = next->next) {
367 if (FRAG_CB(next)->offset >= offset) 419 if (FRAG_CB(next)->offset >= offset)
@@ -369,6 +421,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
369 prev = next; 421 prev = next;
370 } 422 }
371 423
424found:
372 /* We found where to put this one. Check for overlap with 425 /* We found where to put this one. Check for overlap with
373 * preceding fragment, and, if needed, align things so that 426 * preceding fragment, and, if needed, align things so that
374 * any overlaps are eliminated. 427 * any overlaps are eliminated.
@@ -419,7 +472,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
419 qp->q.fragments = next; 472 qp->q.fragments = next;
420 473
421 qp->q.meat -= free_it->len; 474 qp->q.meat -= free_it->len;
422 frag_kfree_skb(qp->q.net, free_it, NULL); 475 frag_kfree_skb(qp->q.net, free_it);
423 } 476 }
424 } 477 }
425 478
@@ -427,6 +480,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
427 480
428 /* Insert this fragment in the chain of fragments. */ 481 /* Insert this fragment in the chain of fragments. */
429 skb->next = next; 482 skb->next = next;
483 if (!next)
484 qp->q.fragments_tail = skb;
430 if (prev) 485 if (prev)
431 prev->next = skb; 486 prev->next = skb;
432 else 487 else
@@ -439,6 +494,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
439 } 494 }
440 qp->q.stamp = skb->tstamp; 495 qp->q.stamp = skb->tstamp;
441 qp->q.meat += skb->len; 496 qp->q.meat += skb->len;
497 qp->ecn |= ecn;
442 atomic_add(skb->truesize, &qp->q.net->mem); 498 atomic_add(skb->truesize, &qp->q.net->mem);
443 if (offset == 0) 499 if (offset == 0)
444 qp->q.last_in |= INET_FRAG_FIRST_IN; 500 qp->q.last_in |= INET_FRAG_FIRST_IN;
@@ -480,6 +536,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
480 goto out_nomem; 536 goto out_nomem;
481 537
482 fp->next = head->next; 538 fp->next = head->next;
539 if (!fp->next)
540 qp->q.fragments_tail = fp;
483 prev->next = fp; 541 prev->next = fp;
484 542
485 skb_morph(head, qp->q.fragments); 543 skb_morph(head, qp->q.fragments);
@@ -507,7 +565,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
507 /* If the first fragment is fragmented itself, we split 565 /* If the first fragment is fragmented itself, we split
508 * it to two chunks: the first with data and paged part 566 * it to two chunks: the first with data and paged part
509 * and the second, holding only fragments. */ 567 * and the second, holding only fragments. */
510 if (skb_has_frags(head)) { 568 if (skb_has_frag_list(head)) {
511 struct sk_buff *clone; 569 struct sk_buff *clone;
512 int i, plen = 0; 570 int i, plen = 0;
513 571
@@ -529,7 +587,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
529 587
530 skb_shinfo(head)->frag_list = head->next; 588 skb_shinfo(head)->frag_list = head->next;
531 skb_push(head, head->data - skb_network_header(head)); 589 skb_push(head, head->data - skb_network_header(head));
532 atomic_sub(head->truesize, &qp->q.net->mem);
533 590
534 for (fp=head->next; fp; fp = fp->next) { 591 for (fp=head->next; fp; fp = fp->next) {
535 head->data_len += fp->len; 592 head->data_len += fp->len;
@@ -539,8 +596,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
539 else if (head->ip_summed == CHECKSUM_COMPLETE) 596 else if (head->ip_summed == CHECKSUM_COMPLETE)
540 head->csum = csum_add(head->csum, fp->csum); 597 head->csum = csum_add(head->csum, fp->csum);
541 head->truesize += fp->truesize; 598 head->truesize += fp->truesize;
542 atomic_sub(fp->truesize, &qp->q.net->mem);
543 } 599 }
600 atomic_sub(head->truesize, &qp->q.net->mem);
544 601
545 head->next = NULL; 602 head->next = NULL;
546 head->dev = dev; 603 head->dev = dev;
@@ -549,8 +606,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
549 iph = ip_hdr(head); 606 iph = ip_hdr(head);
550 iph->frag_off = 0; 607 iph->frag_off = 0;
551 iph->tot_len = htons(len); 608 iph->tot_len = htons(len);
609 /* RFC3168 5.3 Fragmentation support
610 * If one fragment had INET_ECN_NOT_ECT,
611 * reassembled frame also has INET_ECN_NOT_ECT
612 * Elif one fragment had INET_ECN_CE
613 * reassembled frame also has INET_ECN_CE
614 */
615 if (qp->ecn & IPFRAG_ECN_CLEAR)
616 iph->tos &= ~INET_ECN_MASK;
617 else if (qp->ecn & IPFRAG_ECN_SET_CE)
618 iph->tos |= INET_ECN_CE;
619
552 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 620 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
553 qp->q.fragments = NULL; 621 qp->q.fragments = NULL;
622 qp->q.fragments_tail = NULL;
554 return 0; 623 return 0;
555 624
556out_nomem: 625out_nomem:
@@ -563,7 +632,7 @@ out_oversize:
563 printk(KERN_INFO "Oversized IP packet from %pI4.\n", 632 printk(KERN_INFO "Oversized IP packet from %pI4.\n",
564 &qp->saddr); 633 &qp->saddr);
565out_fail: 634out_fail:
566 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); 635 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
567 return err; 636 return err;
568} 637}
569 638
@@ -597,13 +666,13 @@ int ip_defrag(struct sk_buff *skb, u32 user)
597 kfree_skb(skb); 666 kfree_skb(skb);
598 return -ENOMEM; 667 return -ENOMEM;
599} 668}
669EXPORT_SYMBOL(ip_defrag);
600 670
601#ifdef CONFIG_SYSCTL 671#ifdef CONFIG_SYSCTL
602static int zero; 672static int zero;
603 673
604static struct ctl_table ip4_frags_ns_ctl_table[] = { 674static struct ctl_table ip4_frags_ns_ctl_table[] = {
605 { 675 {
606 .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
607 .procname = "ipfrag_high_thresh", 676 .procname = "ipfrag_high_thresh",
608 .data = &init_net.ipv4.frags.high_thresh, 677 .data = &init_net.ipv4.frags.high_thresh,
609 .maxlen = sizeof(int), 678 .maxlen = sizeof(int),
@@ -611,7 +680,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
611 .proc_handler = proc_dointvec 680 .proc_handler = proc_dointvec
612 }, 681 },
613 { 682 {
614 .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
615 .procname = "ipfrag_low_thresh", 683 .procname = "ipfrag_low_thresh",
616 .data = &init_net.ipv4.frags.low_thresh, 684 .data = &init_net.ipv4.frags.low_thresh,
617 .maxlen = sizeof(int), 685 .maxlen = sizeof(int),
@@ -619,26 +687,22 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
619 .proc_handler = proc_dointvec 687 .proc_handler = proc_dointvec
620 }, 688 },
621 { 689 {
622 .ctl_name = NET_IPV4_IPFRAG_TIME,
623 .procname = "ipfrag_time", 690 .procname = "ipfrag_time",
624 .data = &init_net.ipv4.frags.timeout, 691 .data = &init_net.ipv4.frags.timeout,
625 .maxlen = sizeof(int), 692 .maxlen = sizeof(int),
626 .mode = 0644, 693 .mode = 0644,
627 .proc_handler = proc_dointvec_jiffies, 694 .proc_handler = proc_dointvec_jiffies,
628 .strategy = sysctl_jiffies
629 }, 695 },
630 { } 696 { }
631}; 697};
632 698
633static struct ctl_table ip4_frags_ctl_table[] = { 699static struct ctl_table ip4_frags_ctl_table[] = {
634 { 700 {
635 .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
636 .procname = "ipfrag_secret_interval", 701 .procname = "ipfrag_secret_interval",
637 .data = &ip4_frags.secret_interval, 702 .data = &ip4_frags.secret_interval,
638 .maxlen = sizeof(int), 703 .maxlen = sizeof(int),
639 .mode = 0644, 704 .mode = 0644,
640 .proc_handler = proc_dointvec_jiffies, 705 .proc_handler = proc_dointvec_jiffies,
641 .strategy = sysctl_jiffies
642 }, 706 },
643 { 707 {
644 .procname = "ipfrag_max_dist", 708 .procname = "ipfrag_max_dist",
@@ -651,13 +715,13 @@ static struct ctl_table ip4_frags_ctl_table[] = {
651 { } 715 { }
652}; 716};
653 717
654static int ip4_frags_ns_ctl_register(struct net *net) 718static int __net_init ip4_frags_ns_ctl_register(struct net *net)
655{ 719{
656 struct ctl_table *table; 720 struct ctl_table *table;
657 struct ctl_table_header *hdr; 721 struct ctl_table_header *hdr;
658 722
659 table = ip4_frags_ns_ctl_table; 723 table = ip4_frags_ns_ctl_table;
660 if (net != &init_net) { 724 if (!net_eq(net, &init_net)) {
661 table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); 725 table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
662 if (table == NULL) 726 if (table == NULL)
663 goto err_alloc; 727 goto err_alloc;
@@ -675,13 +739,13 @@ static int ip4_frags_ns_ctl_register(struct net *net)
675 return 0; 739 return 0;
676 740
677err_reg: 741err_reg:
678 if (net != &init_net) 742 if (!net_eq(net, &init_net))
679 kfree(table); 743 kfree(table);
680err_alloc: 744err_alloc:
681 return -ENOMEM; 745 return -ENOMEM;
682} 746}
683 747
684static void ip4_frags_ns_ctl_unregister(struct net *net) 748static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
685{ 749{
686 struct ctl_table *table; 750 struct ctl_table *table;
687 751
@@ -709,7 +773,7 @@ static inline void ip4_frags_ctl_register(void)
709} 773}
710#endif 774#endif
711 775
712static int ipv4_frags_init_net(struct net *net) 776static int __net_init ipv4_frags_init_net(struct net *net)
713{ 777{
714 /* 778 /*
715 * Fragment cache limits. We will commit 256K at one time. Should we 779 * Fragment cache limits. We will commit 256K at one time. Should we
@@ -731,7 +795,7 @@ static int ipv4_frags_init_net(struct net *net)
731 return ip4_frags_ns_ctl_register(net); 795 return ip4_frags_ns_ctl_register(net);
732} 796}
733 797
734static void ipv4_frags_exit_net(struct net *net) 798static void __net_exit ipv4_frags_exit_net(struct net *net)
735{ 799{
736 ip4_frags_ns_ctl_unregister(net); 800 ip4_frags_ns_ctl_unregister(net);
737 inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); 801 inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
@@ -756,5 +820,3 @@ void __init ipfrag_init(void)
756 ip4_frags.secret_interval = 10 * 60 * HZ; 820 ip4_frags.secret_interval = 10 * 60 * HZ;
757 inet_frags_init(&ip4_frags); 821 inet_frags_init(&ip4_frags);
758} 822}
759
760EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 44e2a3d2359a..eb68a0e34e49 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18#include <linux/skbuff.h> 19#include <linux/skbuff.h>
19#include <linux/netdevice.h> 20#include <linux/netdevice.h>
@@ -43,8 +44,9 @@
43#include <net/net_namespace.h> 44#include <net/net_namespace.h>
44#include <net/netns/generic.h> 45#include <net/netns/generic.h>
45#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/gre.h>
46 48
47#ifdef CONFIG_IPV6 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
48#include <net/ipv6.h> 50#include <net/ipv6.h>
49#include <net/ip6_fib.h> 51#include <net/ip6_fib.h>
50#include <net/ip6_route.h> 52#include <net/ip6_route.h>
@@ -62,16 +64,13 @@
62 We cannot track such dead loops during route installation, 64 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best 67 and silently drop packet when it expires. It is a good
66 solution, but it supposes maintaing new variable in ALL 68 solution, but it supposes maintaing new variable in ALL
67 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
68 70
69 Current solution: t->recursion lock breaks dead loops. It looks 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
70 like dev->tbusy flag, but I preferred new variable, because 72 counter, since when we enter the first ndo_xmit(), cpu migration is
71 the semantics is different. One day, when hard_start_xmit 73 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
72 will be multithreaded we will have to use skb->encapsulation.
73
74
75 74
76 2. Networking dead loops would not kill routers, but would really 75 2. Networking dead loops would not kill routers, but would really
77 kill network. IP hop limit plays role of "t->recursion" in this case, 76 kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -128,9 +127,9 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
128 127
129#define HASH_SIZE 16 128#define HASH_SIZE 16
130 129
131static int ipgre_net_id; 130static int ipgre_net_id __read_mostly;
132struct ipgre_net { 131struct ipgre_net {
133 struct ip_tunnel *tunnels[4][HASH_SIZE]; 132 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
134 133
135 struct net_device *fb_tunnel_dev; 134 struct net_device *fb_tunnel_dev;
136}; 135};
@@ -159,8 +158,40 @@ struct ipgre_net {
159#define tunnels_r tunnels[2] 158#define tunnels_r tunnels[2]
160#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
161#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
161/*
162 * Locking : hash tables are protected by RCU and RTNL
163 */
162 164
163static DEFINE_RWLOCK(ipgre_lock); 165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
168/* often modified stats are per cpu, other are shared (netdev->stats) */
169struct pcpu_tstats {
170 unsigned long rx_packets;
171 unsigned long rx_bytes;
172 unsigned long tx_packets;
173 unsigned long tx_bytes;
174};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177{
178 struct pcpu_tstats sum = { 0 };
179 int i;
180
181 for_each_possible_cpu(i) {
182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184 sum.rx_packets += tstats->rx_packets;
185 sum.rx_bytes += tstats->rx_bytes;
186 sum.tx_packets += tstats->tx_packets;
187 sum.tx_bytes += tstats->tx_bytes;
188 }
189 dev->stats.rx_packets = sum.rx_packets;
190 dev->stats.rx_bytes = sum.rx_bytes;
191 dev->stats.tx_packets = sum.tx_packets;
192 dev->stats.tx_bytes = sum.tx_bytes;
193 return &dev->stats;
194}
164 195
165/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
166 197
@@ -170,15 +201,15 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
170{ 201{
171 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
172 int link = dev->ifindex; 203 int link = dev->ifindex;
173 unsigned h0 = HASH(remote); 204 unsigned int h0 = HASH(remote);
174 unsigned h1 = HASH(key); 205 unsigned int h1 = HASH(key);
175 struct ip_tunnel *t, *cand = NULL; 206 struct ip_tunnel *t, *cand = NULL;
176 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 207 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
177 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 208 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
178 ARPHRD_ETHER : ARPHRD_IPGRE; 209 ARPHRD_ETHER : ARPHRD_IPGRE;
179 int score, cand_score = 4; 210 int score, cand_score = 4;
180 211
181 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { 212 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
182 if (local != t->parms.iph.saddr || 213 if (local != t->parms.iph.saddr ||
183 remote != t->parms.iph.daddr || 214 remote != t->parms.iph.daddr ||
184 key != t->parms.i_key || 215 key != t->parms.i_key ||
@@ -203,7 +234,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
203 } 234 }
204 } 235 }
205 236
206 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { 237 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
207 if (remote != t->parms.iph.daddr || 238 if (remote != t->parms.iph.daddr ||
208 key != t->parms.i_key || 239 key != t->parms.i_key ||
209 !(t->dev->flags & IFF_UP)) 240 !(t->dev->flags & IFF_UP))
@@ -227,7 +258,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
227 } 258 }
228 } 259 }
229 260
230 for (t = ign->tunnels_l[h1]; t; t = t->next) { 261 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
231 if ((local != t->parms.iph.saddr && 262 if ((local != t->parms.iph.saddr &&
232 (local != t->parms.iph.daddr || 263 (local != t->parms.iph.daddr ||
233 !ipv4_is_multicast(local))) || 264 !ipv4_is_multicast(local))) ||
@@ -253,7 +284,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
253 } 284 }
254 } 285 }
255 286
256 for (t = ign->tunnels_wc[h1]; t; t = t->next) { 287 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
257 if (t->parms.i_key != key || 288 if (t->parms.i_key != key ||
258 !(t->dev->flags & IFF_UP)) 289 !(t->dev->flags & IFF_UP))
259 continue; 290 continue;
@@ -279,19 +310,20 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
279 if (cand != NULL) 310 if (cand != NULL)
280 return cand; 311 return cand;
281 312
282 if (ign->fb_tunnel_dev->flags & IFF_UP) 313 dev = ign->fb_tunnel_dev;
283 return netdev_priv(ign->fb_tunnel_dev); 314 if (dev->flags & IFF_UP)
315 return netdev_priv(dev);
284 316
285 return NULL; 317 return NULL;
286} 318}
287 319
288static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
289 struct ip_tunnel_parm *parms) 321 struct ip_tunnel_parm *parms)
290{ 322{
291 __be32 remote = parms->iph.daddr; 323 __be32 remote = parms->iph.daddr;
292 __be32 local = parms->iph.saddr; 324 __be32 local = parms->iph.saddr;
293 __be32 key = parms->i_key; 325 __be32 key = parms->i_key;
294 unsigned h = HASH(key); 326 unsigned int h = HASH(key);
295 int prio = 0; 327 int prio = 0;
296 328
297 if (local) 329 if (local)
@@ -304,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
304 return &ign->tunnels[prio][h]; 336 return &ign->tunnels[prio][h];
305} 337}
306 338
307static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, 339static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
308 struct ip_tunnel *t) 340 struct ip_tunnel *t)
309{ 341{
310 return __ipgre_bucket(ign, &t->parms); 342 return __ipgre_bucket(ign, &t->parms);
@@ -312,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
312 344
313static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) 345static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
314{ 346{
315 struct ip_tunnel **tp = ipgre_bucket(ign, t); 347 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
316 348
317 t->next = *tp; 349 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
318 write_lock_bh(&ipgre_lock); 350 rcu_assign_pointer(*tp, t);
319 *tp = t;
320 write_unlock_bh(&ipgre_lock);
321} 351}
322 352
323static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 353static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
324{ 354{
325 struct ip_tunnel **tp; 355 struct ip_tunnel __rcu **tp;
326 356 struct ip_tunnel *iter;
327 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 357
328 if (t == *tp) { 358 for (tp = ipgre_bucket(ign, t);
329 write_lock_bh(&ipgre_lock); 359 (iter = rtnl_dereference(*tp)) != NULL;
330 *tp = t->next; 360 tp = &iter->next) {
331 write_unlock_bh(&ipgre_lock); 361 if (t == iter) {
362 rcu_assign_pointer(*tp, t->next);
332 break; 363 break;
333 } 364 }
334 } 365 }
@@ -342,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
342 __be32 local = parms->iph.saddr; 373 __be32 local = parms->iph.saddr;
343 __be32 key = parms->i_key; 374 __be32 key = parms->i_key;
344 int link = parms->link; 375 int link = parms->link;
345 struct ip_tunnel *t, **tp; 376 struct ip_tunnel *t;
377 struct ip_tunnel __rcu **tp;
346 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 378 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
347 379
348 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) 380 for (tp = __ipgre_bucket(ign, parms);
381 (t = rtnl_dereference(*tp)) != NULL;
382 tp = &t->next)
349 if (local == t->parms.iph.saddr && 383 if (local == t->parms.iph.saddr &&
350 remote == t->parms.iph.daddr && 384 remote == t->parms.iph.daddr &&
351 key == t->parms.i_key && 385 key == t->parms.i_key &&
@@ -356,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
356 return t; 390 return t;
357} 391}
358 392
359static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 393static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
360 struct ip_tunnel_parm *parms, int create) 394 struct ip_tunnel_parm *parms, int create)
361{ 395{
362 struct ip_tunnel *t, *nt; 396 struct ip_tunnel *t, *nt;
@@ -371,11 +405,11 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
371 if (parms->name[0]) 405 if (parms->name[0])
372 strlcpy(name, parms->name, IFNAMSIZ); 406 strlcpy(name, parms->name, IFNAMSIZ);
373 else 407 else
374 sprintf(name, "gre%%d"); 408 strcpy(name, "gre%d");
375 409
376 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); 410 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
377 if (!dev) 411 if (!dev)
378 return NULL; 412 return NULL;
379 413
380 dev_net_set(dev, net); 414 dev_net_set(dev, net);
381 415
@@ -479,7 +513,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
479 break; 513 break;
480 } 514 }
481 515
482 read_lock(&ipgre_lock); 516 rcu_read_lock();
483 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 517 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
484 flags & GRE_KEY ? 518 flags & GRE_KEY ?
485 *(((__be32 *)p) + (grehlen / 4) - 1) : 0, 519 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
@@ -497,8 +531,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
497 t->err_count = 1; 531 t->err_count = 1;
498 t->err_time = jiffies; 532 t->err_time = jiffies;
499out: 533out:
500 read_unlock(&ipgre_lock); 534 rcu_read_unlock();
501 return;
502} 535}
503 536
504static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) 537static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
@@ -534,7 +567,6 @@ static int ipgre_rcv(struct sk_buff *skb)
534 struct ip_tunnel *tunnel; 567 struct ip_tunnel *tunnel;
535 int offset = 4; 568 int offset = 4;
536 __be16 gre_proto; 569 __be16 gre_proto;
537 unsigned int len;
538 570
539 if (!pskb_may_pull(skb, 16)) 571 if (!pskb_may_pull(skb, 16))
540 goto drop_nolock; 572 goto drop_nolock;
@@ -576,11 +608,11 @@ static int ipgre_rcv(struct sk_buff *skb)
576 608
577 gre_proto = *(__be16 *)(h + 2); 609 gre_proto = *(__be16 *)(h + 2);
578 610
579 read_lock(&ipgre_lock); 611 rcu_read_lock();
580 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 612 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
581 iph->saddr, iph->daddr, key, 613 iph->saddr, iph->daddr, key,
582 gre_proto))) { 614 gre_proto))) {
583 struct net_device_stats *stats = &tunnel->dev->stats; 615 struct pcpu_tstats *tstats;
584 616
585 secpath_reset(skb); 617 secpath_reset(skb);
586 618
@@ -602,36 +634,34 @@ static int ipgre_rcv(struct sk_buff *skb)
602#ifdef CONFIG_NET_IPGRE_BROADCAST 634#ifdef CONFIG_NET_IPGRE_BROADCAST
603 if (ipv4_is_multicast(iph->daddr)) { 635 if (ipv4_is_multicast(iph->daddr)) {
604 /* Looped back packet, drop it! */ 636 /* Looped back packet, drop it! */
605 if (skb_rtable(skb)->fl.iif == 0) 637 if (rt_is_output_route(skb_rtable(skb)))
606 goto drop; 638 goto drop;
607 stats->multicast++; 639 tunnel->dev->stats.multicast++;
608 skb->pkt_type = PACKET_BROADCAST; 640 skb->pkt_type = PACKET_BROADCAST;
609 } 641 }
610#endif 642#endif
611 643
612 if (((flags&GRE_CSUM) && csum) || 644 if (((flags&GRE_CSUM) && csum) ||
613 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { 645 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
614 stats->rx_crc_errors++; 646 tunnel->dev->stats.rx_crc_errors++;
615 stats->rx_errors++; 647 tunnel->dev->stats.rx_errors++;
616 goto drop; 648 goto drop;
617 } 649 }
618 if (tunnel->parms.i_flags&GRE_SEQ) { 650 if (tunnel->parms.i_flags&GRE_SEQ) {
619 if (!(flags&GRE_SEQ) || 651 if (!(flags&GRE_SEQ) ||
620 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { 652 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
621 stats->rx_fifo_errors++; 653 tunnel->dev->stats.rx_fifo_errors++;
622 stats->rx_errors++; 654 tunnel->dev->stats.rx_errors++;
623 goto drop; 655 goto drop;
624 } 656 }
625 tunnel->i_seqno = seqno + 1; 657 tunnel->i_seqno = seqno + 1;
626 } 658 }
627 659
628 len = skb->len;
629
630 /* Warning: All skb pointers will be invalidated! */ 660 /* Warning: All skb pointers will be invalidated! */
631 if (tunnel->dev->type == ARPHRD_ETHER) { 661 if (tunnel->dev->type == ARPHRD_ETHER) {
632 if (!pskb_may_pull(skb, ETH_HLEN)) { 662 if (!pskb_may_pull(skb, ETH_HLEN)) {
633 stats->rx_length_errors++; 663 tunnel->dev->stats.rx_length_errors++;
634 stats->rx_errors++; 664 tunnel->dev->stats.rx_errors++;
635 goto drop; 665 goto drop;
636 } 666 }
637 667
@@ -640,49 +670,45 @@ static int ipgre_rcv(struct sk_buff *skb)
640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 670 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641 } 671 }
642 672
643 stats->rx_packets++; 673 tstats = this_cpu_ptr(tunnel->dev->tstats);
644 stats->rx_bytes += len; 674 tstats->rx_packets++;
645 skb->dev = tunnel->dev; 675 tstats->rx_bytes += skb->len;
646 skb_dst_drop(skb); 676
647 nf_reset(skb); 677 __skb_tunnel_rx(skb, tunnel->dev);
648 678
649 skb_reset_network_header(skb); 679 skb_reset_network_header(skb);
650 ipgre_ecn_decapsulate(iph, skb); 680 ipgre_ecn_decapsulate(iph, skb);
651 681
652 netif_rx(skb); 682 netif_rx(skb);
653 read_unlock(&ipgre_lock); 683
654 return(0); 684 rcu_read_unlock();
685 return 0;
655 } 686 }
656 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 687 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
657 688
658drop: 689drop:
659 read_unlock(&ipgre_lock); 690 rcu_read_unlock();
660drop_nolock: 691drop_nolock:
661 kfree_skb(skb); 692 kfree_skb(skb);
662 return(0); 693 return 0;
663} 694}
664 695
665static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 696static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
666{ 697{
667 struct ip_tunnel *tunnel = netdev_priv(dev); 698 struct ip_tunnel *tunnel = netdev_priv(dev);
668 struct net_device_stats *stats = &tunnel->dev->stats; 699 struct pcpu_tstats *tstats;
669 struct iphdr *old_iph = ip_hdr(skb); 700 struct iphdr *old_iph = ip_hdr(skb);
670 struct iphdr *tiph; 701 struct iphdr *tiph;
671 u8 tos; 702 u8 tos;
672 __be16 df; 703 __be16 df;
673 struct rtable *rt; /* Route to the other host */ 704 struct rtable *rt; /* Route to the other host */
674 struct net_device *tdev; /* Device to other host */ 705 struct net_device *tdev; /* Device to other host */
675 struct iphdr *iph; /* Our new IP header */ 706 struct iphdr *iph; /* Our new IP header */
676 unsigned int max_headroom; /* The extra header space needed */ 707 unsigned int max_headroom; /* The extra header space needed */
677 int gre_hlen; 708 int gre_hlen;
678 __be32 dst; 709 __be32 dst;
679 int mtu; 710 int mtu;
680 711
681 if (tunnel->recursion++) {
682 stats->collisions++;
683 goto tx_error;
684 }
685
686 if (dev->type == ARPHRD_ETHER) 712 if (dev->type == ARPHRD_ETHER)
687 IPCB(skb)->flags = 0; 713 IPCB(skb)->flags = 0;
688 714
@@ -698,7 +724,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
698 /* NBMA tunnel */ 724 /* NBMA tunnel */
699 725
700 if (skb_dst(skb) == NULL) { 726 if (skb_dst(skb) == NULL) {
701 stats->tx_fifo_errors++; 727 dev->stats.tx_fifo_errors++;
702 goto tx_error; 728 goto tx_error;
703 } 729 }
704 730
@@ -707,7 +733,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
707 if ((dst = rt->rt_gateway) == 0) 733 if ((dst = rt->rt_gateway) == 0)
708 goto tx_error_icmp; 734 goto tx_error_icmp;
709 } 735 }
710#ifdef CONFIG_IPV6 736#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
711 else if (skb->protocol == htons(ETH_P_IPV6)) { 737 else if (skb->protocol == htons(ETH_P_IPV6)) {
712 struct in6_addr *addr6; 738 struct in6_addr *addr6;
713 int addr_type; 739 int addr_type;
@@ -735,35 +761,38 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
735 } 761 }
736 762
737 tos = tiph->tos; 763 tos = tiph->tos;
738 if (tos&1) { 764 if (tos == 1) {
765 tos = 0;
739 if (skb->protocol == htons(ETH_P_IP)) 766 if (skb->protocol == htons(ETH_P_IP))
740 tos = old_iph->tos; 767 tos = old_iph->tos;
741 tos &= ~1; 768 else if (skb->protocol == htons(ETH_P_IPV6))
769 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
742 } 770 }
743 771
744 { 772 {
745 struct flowi fl = { .oif = tunnel->parms.link, 773 struct flowi fl = {
746 .nl_u = { .ip4_u = 774 .oif = tunnel->parms.link,
747 { .daddr = dst, 775 .fl4_dst = dst,
748 .saddr = tiph->saddr, 776 .fl4_src = tiph->saddr,
749 .tos = RT_TOS(tos) } }, 777 .fl4_tos = RT_TOS(tos),
750 .proto = IPPROTO_GRE }; 778 .fl_gre_key = tunnel->parms.o_key
779 };
751 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 780 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
752 stats->tx_carrier_errors++; 781 dev->stats.tx_carrier_errors++;
753 goto tx_error; 782 goto tx_error;
754 } 783 }
755 } 784 }
756 tdev = rt->u.dst.dev; 785 tdev = rt->dst.dev;
757 786
758 if (tdev == dev) { 787 if (tdev == dev) {
759 ip_rt_put(rt); 788 ip_rt_put(rt);
760 stats->collisions++; 789 dev->stats.collisions++;
761 goto tx_error; 790 goto tx_error;
762 } 791 }
763 792
764 df = tiph->frag_off; 793 df = tiph->frag_off;
765 if (df) 794 if (df)
766 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; 795 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
767 else 796 else
768 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 797 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
769 798
@@ -780,7 +809,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
780 goto tx_error; 809 goto tx_error;
781 } 810 }
782 } 811 }
783#ifdef CONFIG_IPV6 812#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
784 else if (skb->protocol == htons(ETH_P_IPV6)) { 813 else if (skb->protocol == htons(ETH_P_IPV6)) {
785 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 814 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
786 815
@@ -789,12 +818,12 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
789 !ipv4_is_multicast(tunnel->parms.iph.daddr)) || 818 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
790 rt6->rt6i_dst.plen == 128) { 819 rt6->rt6i_dst.plen == 128) {
791 rt6->rt6i_flags |= RTF_MODIFIED; 820 rt6->rt6i_flags |= RTF_MODIFIED;
792 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; 821 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
793 } 822 }
794 } 823 }
795 824
796 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { 825 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
797 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); 826 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
798 ip_rt_put(rt); 827 ip_rt_put(rt);
799 goto tx_error; 828 goto tx_error;
800 } 829 }
@@ -811,17 +840,18 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
811 tunnel->err_count = 0; 840 tunnel->err_count = 0;
812 } 841 }
813 842
814 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; 843 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
815 844
816 if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| 845 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
817 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { 846 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
818 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 847 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
848 if (max_headroom > dev->needed_headroom)
849 dev->needed_headroom = max_headroom;
819 if (!new_skb) { 850 if (!new_skb) {
820 ip_rt_put(rt); 851 ip_rt_put(rt);
821 stats->tx_dropped++; 852 dev->stats.tx_dropped++;
822 dev_kfree_skb(skb); 853 dev_kfree_skb(skb);
823 tunnel->recursion--; 854 return NETDEV_TX_OK;
824 return 0;
825 } 855 }
826 if (skb->sk) 856 if (skb->sk)
827 skb_set_owner_w(new_skb, skb->sk); 857 skb_set_owner_w(new_skb, skb->sk);
@@ -837,7 +867,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
837 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 867 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
838 IPSKB_REROUTED); 868 IPSKB_REROUTED);
839 skb_dst_drop(skb); 869 skb_dst_drop(skb);
840 skb_dst_set(skb, &rt->u.dst); 870 skb_dst_set(skb, &rt->dst);
841 871
842 /* 872 /*
843 * Push down and install the IPIP header. 873 * Push down and install the IPIP header.
@@ -855,12 +885,12 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
855 if ((iph->ttl = tiph->ttl) == 0) { 885 if ((iph->ttl = tiph->ttl) == 0) {
856 if (skb->protocol == htons(ETH_P_IP)) 886 if (skb->protocol == htons(ETH_P_IP))
857 iph->ttl = old_iph->ttl; 887 iph->ttl = old_iph->ttl;
858#ifdef CONFIG_IPV6 888#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
859 else if (skb->protocol == htons(ETH_P_IPV6)) 889 else if (skb->protocol == htons(ETH_P_IPV6))
860 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 890 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
861#endif 891#endif
862 else 892 else
863 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); 893 iph->ttl = ip4_dst_hoplimit(&rt->dst);
864 } 894 }
865 895
866 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 896 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -886,19 +916,17 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
886 } 916 }
887 917
888 nf_reset(skb); 918 nf_reset(skb);
889 919 tstats = this_cpu_ptr(dev->tstats);
890 IPTUNNEL_XMIT(); 920 __IPTUNNEL_XMIT(tstats, &dev->stats);
891 tunnel->recursion--; 921 return NETDEV_TX_OK;
892 return 0;
893 922
894tx_error_icmp: 923tx_error_icmp:
895 dst_link_failure(skb); 924 dst_link_failure(skb);
896 925
897tx_error: 926tx_error:
898 stats->tx_errors++; 927 dev->stats.tx_errors++;
899 dev_kfree_skb(skb); 928 dev_kfree_skb(skb);
900 tunnel->recursion--; 929 return NETDEV_TX_OK;
901 return 0;
902} 930}
903 931
904static int ipgre_tunnel_bind_dev(struct net_device *dev) 932static int ipgre_tunnel_bind_dev(struct net_device *dev)
@@ -916,15 +944,18 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
916 /* Guess output device to choose reasonable mtu and needed_headroom */ 944 /* Guess output device to choose reasonable mtu and needed_headroom */
917 945
918 if (iph->daddr) { 946 if (iph->daddr) {
919 struct flowi fl = { .oif = tunnel->parms.link, 947 struct flowi fl = {
920 .nl_u = { .ip4_u = 948 .oif = tunnel->parms.link,
921 { .daddr = iph->daddr, 949 .fl4_dst = iph->daddr,
922 .saddr = iph->saddr, 950 .fl4_src = iph->saddr,
923 .tos = RT_TOS(iph->tos) } }, 951 .fl4_tos = RT_TOS(iph->tos),
924 .proto = IPPROTO_GRE }; 952 .proto = IPPROTO_GRE,
953 .fl_gre_key = tunnel->parms.o_key
954 };
925 struct rtable *rt; 955 struct rtable *rt;
956
926 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 957 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
927 tdev = rt->u.dst.dev; 958 tdev = rt->dst.dev;
928 ip_rt_put(rt); 959 ip_rt_put(rt);
929 } 960 }
930 961
@@ -951,7 +982,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
951 addend += 4; 982 addend += 4;
952 } 983 }
953 dev->needed_headroom = addend + hlen; 984 dev->needed_headroom = addend + hlen;
954 mtu -= dev->hard_header_len - addend; 985 mtu -= dev->hard_header_len + addend;
955 986
956 if (mtu < 68) 987 if (mtu < 68)
957 mtu = 68; 988 mtu = 68;
@@ -1019,7 +1050,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1019 break; 1050 break;
1020 } 1051 }
1021 } else { 1052 } else {
1022 unsigned nflags = 0; 1053 unsigned int nflags = 0;
1023 1054
1024 t = netdev_priv(dev); 1055 t = netdev_priv(dev);
1025 1056
@@ -1033,6 +1064,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1033 break; 1064 break;
1034 } 1065 }
1035 ipgre_tunnel_unlink(ign, t); 1066 ipgre_tunnel_unlink(ign, t);
1067 synchronize_net();
1036 t->parms.iph.saddr = p.iph.saddr; 1068 t->parms.iph.saddr = p.iph.saddr;
1037 t->parms.iph.daddr = p.iph.daddr; 1069 t->parms.iph.daddr = p.iph.daddr;
1038 t->parms.i_key = p.i_key; 1070 t->parms.i_key = p.i_key;
@@ -1132,7 +1164,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1132 1164
1133static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 1165static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1134 unsigned short type, 1166 unsigned short type,
1135 const void *daddr, const void *saddr, unsigned len) 1167 const void *daddr, const void *saddr, unsigned int len)
1136{ 1168{
1137 struct ip_tunnel *t = netdev_priv(dev); 1169 struct ip_tunnel *t = netdev_priv(dev);
1138 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1170 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1148,12 +1180,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1148 1180
1149 if (saddr) 1181 if (saddr)
1150 memcpy(&iph->saddr, saddr, 4); 1182 memcpy(&iph->saddr, saddr, 4);
1151 1183 if (daddr)
1152 if (daddr) {
1153 memcpy(&iph->daddr, daddr, 4); 1184 memcpy(&iph->daddr, daddr, 4);
1154 return t->hlen; 1185 if (iph->daddr)
1155 }
1156 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1157 return t->hlen; 1186 return t->hlen;
1158 1187
1159 return -t->hlen; 1188 return -t->hlen;
@@ -1177,16 +1206,19 @@ static int ipgre_open(struct net_device *dev)
1177 struct ip_tunnel *t = netdev_priv(dev); 1206 struct ip_tunnel *t = netdev_priv(dev);
1178 1207
1179 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1208 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1180 struct flowi fl = { .oif = t->parms.link, 1209 struct flowi fl = {
1181 .nl_u = { .ip4_u = 1210 .oif = t->parms.link,
1182 { .daddr = t->parms.iph.daddr, 1211 .fl4_dst = t->parms.iph.daddr,
1183 .saddr = t->parms.iph.saddr, 1212 .fl4_src = t->parms.iph.saddr,
1184 .tos = RT_TOS(t->parms.iph.tos) } }, 1213 .fl4_tos = RT_TOS(t->parms.iph.tos),
1185 .proto = IPPROTO_GRE }; 1214 .proto = IPPROTO_GRE,
1215 .fl_gre_key = t->parms.o_key
1216 };
1186 struct rtable *rt; 1217 struct rtable *rt;
1218
1187 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1219 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1188 return -EADDRNOTAVAIL; 1220 return -EADDRNOTAVAIL;
1189 dev = rt->u.dst.dev; 1221 dev = rt->dst.dev;
1190 ip_rt_put(rt); 1222 ip_rt_put(rt);
1191 if (__in_dev_get_rtnl(dev) == NULL) 1223 if (__in_dev_get_rtnl(dev) == NULL)
1192 return -EADDRNOTAVAIL; 1224 return -EADDRNOTAVAIL;
@@ -1203,10 +1235,8 @@ static int ipgre_close(struct net_device *dev)
1203 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 1235 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1204 struct in_device *in_dev; 1236 struct in_device *in_dev;
1205 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 1237 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1206 if (in_dev) { 1238 if (in_dev)
1207 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 1239 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1208 in_dev_put(in_dev);
1209 }
1210 } 1240 }
1211 return 0; 1241 return 0;
1212} 1242}
@@ -1223,12 +1253,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
1223 .ndo_start_xmit = ipgre_tunnel_xmit, 1253 .ndo_start_xmit = ipgre_tunnel_xmit,
1224 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1254 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1225 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1255 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1256 .ndo_get_stats = ipgre_get_stats,
1226}; 1257};
1227 1258
1259static void ipgre_dev_free(struct net_device *dev)
1260{
1261 free_percpu(dev->tstats);
1262 free_netdev(dev);
1263}
1264
1228static void ipgre_tunnel_setup(struct net_device *dev) 1265static void ipgre_tunnel_setup(struct net_device *dev)
1229{ 1266{
1230 dev->netdev_ops = &ipgre_netdev_ops; 1267 dev->netdev_ops = &ipgre_netdev_ops;
1231 dev->destructor = free_netdev; 1268 dev->destructor = ipgre_dev_free;
1232 1269
1233 dev->type = ARPHRD_IPGRE; 1270 dev->type = ARPHRD_IPGRE;
1234 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1271 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1266,6 +1303,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
1266 } else 1303 } else
1267 dev->header_ops = &ipgre_header_ops; 1304 dev->header_ops = &ipgre_header_ops;
1268 1305
1306 dev->tstats = alloc_percpu(struct pcpu_tstats);
1307 if (!dev->tstats)
1308 return -ENOMEM;
1309
1269 return 0; 1310 return 0;
1270} 1311}
1271 1312
@@ -1273,7 +1314,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1273{ 1314{
1274 struct ip_tunnel *tunnel = netdev_priv(dev); 1315 struct ip_tunnel *tunnel = netdev_priv(dev);
1275 struct iphdr *iph = &tunnel->parms.iph; 1316 struct iphdr *iph = &tunnel->parms.iph;
1276 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1277 1317
1278 tunnel->dev = dev; 1318 tunnel->dev = dev;
1279 strcpy(tunnel->parms.name, dev->name); 1319 strcpy(tunnel->parms.name, dev->name);
@@ -1284,17 +1324,15 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1284 tunnel->hlen = sizeof(struct iphdr) + 4; 1324 tunnel->hlen = sizeof(struct iphdr) + 4;
1285 1325
1286 dev_hold(dev); 1326 dev_hold(dev);
1287 ign->tunnels_wc[0] = tunnel;
1288} 1327}
1289 1328
1290 1329
1291static struct net_protocol ipgre_protocol = { 1330static const struct gre_protocol ipgre_protocol = {
1292 .handler = ipgre_rcv, 1331 .handler = ipgre_rcv,
1293 .err_handler = ipgre_err, 1332 .err_handler = ipgre_err,
1294 .netns_ok = 1,
1295}; 1333};
1296 1334
1297static void ipgre_destroy_tunnels(struct ipgre_net *ign) 1335static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1298{ 1336{
1299 int prio; 1337 int prio;
1300 1338
@@ -1302,25 +1340,21 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1302 int h; 1340 int h;
1303 for (h = 0; h < HASH_SIZE; h++) { 1341 for (h = 0; h < HASH_SIZE; h++) {
1304 struct ip_tunnel *t; 1342 struct ip_tunnel *t;
1305 while ((t = ign->tunnels[prio][h]) != NULL) 1343
1306 unregister_netdevice(t->dev); 1344 t = rtnl_dereference(ign->tunnels[prio][h]);
1345
1346 while (t != NULL) {
1347 unregister_netdevice_queue(t->dev, head);
1348 t = rtnl_dereference(t->next);
1349 }
1307 } 1350 }
1308 } 1351 }
1309} 1352}
1310 1353
1311static int ipgre_init_net(struct net *net) 1354static int __net_init ipgre_init_net(struct net *net)
1312{ 1355{
1356 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1313 int err; 1357 int err;
1314 struct ipgre_net *ign;
1315
1316 err = -ENOMEM;
1317 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1318 if (ign == NULL)
1319 goto err_alloc;
1320
1321 err = net_assign_generic(net, ipgre_net_id, ign);
1322 if (err < 0)
1323 goto err_assign;
1324 1358
1325 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", 1359 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1326 ipgre_tunnel_setup); 1360 ipgre_tunnel_setup);
@@ -1336,32 +1370,33 @@ static int ipgre_init_net(struct net *net)
1336 if ((err = register_netdev(ign->fb_tunnel_dev))) 1370 if ((err = register_netdev(ign->fb_tunnel_dev)))
1337 goto err_reg_dev; 1371 goto err_reg_dev;
1338 1372
1373 rcu_assign_pointer(ign->tunnels_wc[0],
1374 netdev_priv(ign->fb_tunnel_dev));
1339 return 0; 1375 return 0;
1340 1376
1341err_reg_dev: 1377err_reg_dev:
1342 free_netdev(ign->fb_tunnel_dev); 1378 ipgre_dev_free(ign->fb_tunnel_dev);
1343err_alloc_dev: 1379err_alloc_dev:
1344 /* nothing */
1345err_assign:
1346 kfree(ign);
1347err_alloc:
1348 return err; 1380 return err;
1349} 1381}
1350 1382
1351static void ipgre_exit_net(struct net *net) 1383static void __net_exit ipgre_exit_net(struct net *net)
1352{ 1384{
1353 struct ipgre_net *ign; 1385 struct ipgre_net *ign;
1386 LIST_HEAD(list);
1354 1387
1355 ign = net_generic(net, ipgre_net_id); 1388 ign = net_generic(net, ipgre_net_id);
1356 rtnl_lock(); 1389 rtnl_lock();
1357 ipgre_destroy_tunnels(ign); 1390 ipgre_destroy_tunnels(ign, &list);
1391 unregister_netdevice_many(&list);
1358 rtnl_unlock(); 1392 rtnl_unlock();
1359 kfree(ign);
1360} 1393}
1361 1394
1362static struct pernet_operations ipgre_net_ops = { 1395static struct pernet_operations ipgre_net_ops = {
1363 .init = ipgre_init_net, 1396 .init = ipgre_init_net,
1364 .exit = ipgre_exit_net, 1397 .exit = ipgre_exit_net,
1398 .id = &ipgre_net_id,
1399 .size = sizeof(struct ipgre_net),
1365}; 1400};
1366 1401
1367static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) 1402static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1458,6 +1493,10 @@ static int ipgre_tap_init(struct net_device *dev)
1458 1493
1459 ipgre_tunnel_bind_dev(dev); 1494 ipgre_tunnel_bind_dev(dev);
1460 1495
1496 dev->tstats = alloc_percpu(struct pcpu_tstats);
1497 if (!dev->tstats)
1498 return -ENOMEM;
1499
1461 return 0; 1500 return 0;
1462} 1501}
1463 1502
@@ -1468,6 +1507,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1468 .ndo_set_mac_address = eth_mac_addr, 1507 .ndo_set_mac_address = eth_mac_addr,
1469 .ndo_validate_addr = eth_validate_addr, 1508 .ndo_validate_addr = eth_validate_addr,
1470 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1509 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1510 .ndo_get_stats = ipgre_get_stats,
1471}; 1511};
1472 1512
1473static void ipgre_tap_setup(struct net_device *dev) 1513static void ipgre_tap_setup(struct net_device *dev)
@@ -1475,14 +1515,14 @@ static void ipgre_tap_setup(struct net_device *dev)
1475 1515
1476 ether_setup(dev); 1516 ether_setup(dev);
1477 1517
1478 dev->netdev_ops = &ipgre_netdev_ops; 1518 dev->netdev_ops = &ipgre_tap_netdev_ops;
1479 dev->destructor = free_netdev; 1519 dev->destructor = ipgre_dev_free;
1480 1520
1481 dev->iflink = 0; 1521 dev->iflink = 0;
1482 dev->features |= NETIF_F_NETNS_LOCAL; 1522 dev->features |= NETIF_F_NETNS_LOCAL;
1483} 1523}
1484 1524
1485static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], 1525static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1486 struct nlattr *data[]) 1526 struct nlattr *data[])
1487{ 1527{
1488 struct ip_tunnel *nt; 1528 struct ip_tunnel *nt;
@@ -1504,6 +1544,10 @@ static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1504 if (!tb[IFLA_MTU]) 1544 if (!tb[IFLA_MTU])
1505 dev->mtu = mtu; 1545 dev->mtu = mtu;
1506 1546
1547 /* Can use a lockless transmit, unless we generate output sequences */
1548 if (!(nt->parms.o_flags & GRE_SEQ))
1549 dev->features |= NETIF_F_LLTX;
1550
1507 err = register_netdevice(dev); 1551 err = register_netdevice(dev);
1508 if (err) 1552 if (err)
1509 goto out; 1553 goto out;
@@ -1536,25 +1580,29 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1536 if (t->dev != dev) 1580 if (t->dev != dev)
1537 return -EEXIST; 1581 return -EEXIST;
1538 } else { 1582 } else {
1539 unsigned nflags = 0;
1540
1541 t = nt; 1583 t = nt;
1542 1584
1543 if (ipv4_is_multicast(p.iph.daddr)) 1585 if (dev->type != ARPHRD_ETHER) {
1544 nflags = IFF_BROADCAST; 1586 unsigned int nflags = 0;
1545 else if (p.iph.daddr)
1546 nflags = IFF_POINTOPOINT;
1547 1587
1548 if ((dev->flags ^ nflags) & 1588 if (ipv4_is_multicast(p.iph.daddr))
1549 (IFF_POINTOPOINT | IFF_BROADCAST)) 1589 nflags = IFF_BROADCAST;
1550 return -EINVAL; 1590 else if (p.iph.daddr)
1591 nflags = IFF_POINTOPOINT;
1592
1593 if ((dev->flags ^ nflags) &
1594 (IFF_POINTOPOINT | IFF_BROADCAST))
1595 return -EINVAL;
1596 }
1551 1597
1552 ipgre_tunnel_unlink(ign, t); 1598 ipgre_tunnel_unlink(ign, t);
1553 t->parms.iph.saddr = p.iph.saddr; 1599 t->parms.iph.saddr = p.iph.saddr;
1554 t->parms.iph.daddr = p.iph.daddr; 1600 t->parms.iph.daddr = p.iph.daddr;
1555 t->parms.i_key = p.i_key; 1601 t->parms.i_key = p.i_key;
1556 memcpy(dev->dev_addr, &p.iph.saddr, 4); 1602 if (dev->type != ARPHRD_ETHER) {
1557 memcpy(dev->broadcast, &p.iph.daddr, 4); 1603 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1604 memcpy(dev->broadcast, &p.iph.daddr, 4);
1605 }
1558 ipgre_tunnel_link(ign, t); 1606 ipgre_tunnel_link(ign, t);
1559 netdev_state_change(dev); 1607 netdev_state_change(dev);
1560 } 1608 }
@@ -1672,15 +1720,16 @@ static int __init ipgre_init(void)
1672 1720
1673 printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); 1721 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1674 1722
1675 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { 1723 err = register_pernet_device(&ipgre_net_ops);
1724 if (err < 0)
1725 return err;
1726
1727 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1728 if (err < 0) {
1676 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1729 printk(KERN_INFO "ipgre init: can't add protocol\n");
1677 return -EAGAIN; 1730 goto add_proto_failed;
1678 } 1731 }
1679 1732
1680 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1681 if (err < 0)
1682 goto gen_device_failed;
1683
1684 err = rtnl_link_register(&ipgre_link_ops); 1733 err = rtnl_link_register(&ipgre_link_ops);
1685 if (err < 0) 1734 if (err < 0)
1686 goto rtnl_link_failed; 1735 goto rtnl_link_failed;
@@ -1695,9 +1744,9 @@ out:
1695tap_ops_failed: 1744tap_ops_failed:
1696 rtnl_link_unregister(&ipgre_link_ops); 1745 rtnl_link_unregister(&ipgre_link_ops);
1697rtnl_link_failed: 1746rtnl_link_failed:
1698 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); 1747 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1699gen_device_failed: 1748add_proto_failed:
1700 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1749 unregister_pernet_device(&ipgre_net_ops);
1701 goto out; 1750 goto out;
1702} 1751}
1703 1752
@@ -1705,9 +1754,9 @@ static void __exit ipgre_fini(void)
1705{ 1754{
1706 rtnl_link_unregister(&ipgre_tap_ops); 1755 rtnl_link_unregister(&ipgre_tap_ops);
1707 rtnl_link_unregister(&ipgre_link_ops); 1756 rtnl_link_unregister(&ipgre_link_ops);
1708 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); 1757 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1709 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1710 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1758 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1759 unregister_pernet_device(&ipgre_net_ops);
1711} 1760}
1712 1761
1713module_init(ipgre_init); 1762module_init(ipgre_init);
@@ -1715,3 +1764,4 @@ module_exit(ipgre_fini);
1715MODULE_LICENSE("GPL"); 1764MODULE_LICENSE("GPL");
1716MODULE_ALIAS_RTNL_LINK("gre"); 1765MODULE_ALIAS_RTNL_LINK("gre");
1717MODULE_ALIAS_RTNL_LINK("gretap"); 1766MODULE_ALIAS_RTNL_LINK("gretap");
1767MODULE_ALIAS("gre0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 490ce20faf38..d859bcc26cb7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -119,6 +119,7 @@
119#include <linux/kernel.h> 119#include <linux/kernel.h>
120#include <linux/string.h> 120#include <linux/string.h>
121#include <linux/errno.h> 121#include <linux/errno.h>
122#include <linux/slab.h>
122 123
123#include <linux/net.h> 124#include <linux/net.h>
124#include <linux/socket.h> 125#include <linux/socket.h>
@@ -145,7 +146,7 @@
145#include <linux/netlink.h> 146#include <linux/netlink.h>
146 147
147/* 148/*
148 * Process Router Attention IP option 149 * Process Router Attention IP option (RFC 2113)
149 */ 150 */
150int ip_call_ra_chain(struct sk_buff *skb) 151int ip_call_ra_chain(struct sk_buff *skb)
151{ 152{
@@ -154,22 +155,19 @@ int ip_call_ra_chain(struct sk_buff *skb)
154 struct sock *last = NULL; 155 struct sock *last = NULL;
155 struct net_device *dev = skb->dev; 156 struct net_device *dev = skb->dev;
156 157
157 read_lock(&ip_ra_lock); 158 for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
158 for (ra = ip_ra_chain; ra; ra = ra->next) {
159 struct sock *sk = ra->sk; 159 struct sock *sk = ra->sk;
160 160
161 /* If socket is bound to an interface, only report 161 /* If socket is bound to an interface, only report
162 * the packet if it came from that interface. 162 * the packet if it came from that interface.
163 */ 163 */
164 if (sk && inet_sk(sk)->num == protocol && 164 if (sk && inet_sk(sk)->inet_num == protocol &&
165 (!sk->sk_bound_dev_if || 165 (!sk->sk_bound_dev_if ||
166 sk->sk_bound_dev_if == dev->ifindex) && 166 sk->sk_bound_dev_if == dev->ifindex) &&
167 sock_net(sk) == dev_net(dev)) { 167 net_eq(sock_net(sk), dev_net(dev))) {
168 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 168 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { 169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
170 read_unlock(&ip_ra_lock);
171 return 1; 170 return 1;
172 }
173 } 171 }
174 if (last) { 172 if (last) {
175 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 173 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -182,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
182 180
183 if (last) { 181 if (last) {
184 raw_rcv(last, skb); 182 raw_rcv(last, skb);
185 read_unlock(&ip_ra_lock);
186 return 1; 183 return 1;
187 } 184 }
188 read_unlock(&ip_ra_lock);
189 return 0; 185 return 0;
190} 186}
191 187
@@ -202,7 +198,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
202 { 198 {
203 int protocol = ip_hdr(skb)->protocol; 199 int protocol = ip_hdr(skb)->protocol;
204 int hash, raw; 200 int hash, raw;
205 struct net_protocol *ipprot; 201 const struct net_protocol *ipprot;
206 202
207 resubmit: 203 resubmit:
208 raw = raw_local_deliver(skb, protocol); 204 raw = raw_local_deliver(skb, protocol);
@@ -265,7 +261,7 @@ int ip_local_deliver(struct sk_buff *skb)
265 return 0; 261 return 0;
266 } 262 }
267 263
268 return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, 264 return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
269 ip_local_deliver_finish); 265 ip_local_deliver_finish);
270} 266}
271 267
@@ -297,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb)
297 } 293 }
298 294
299 if (unlikely(opt->srr)) { 295 if (unlikely(opt->srr)) {
300 struct in_device *in_dev = in_dev_get(dev); 296 struct in_device *in_dev = __in_dev_get_rcu(dev);
297
301 if (in_dev) { 298 if (in_dev) {
302 if (!IN_DEV_SOURCE_ROUTE(in_dev)) { 299 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
303 if (IN_DEV_LOG_MARTIANS(in_dev) && 300 if (IN_DEV_LOG_MARTIANS(in_dev) &&
304 net_ratelimit()) 301 net_ratelimit())
305 printk(KERN_INFO "source route option %pI4 -> %pI4\n", 302 printk(KERN_INFO "source route option %pI4 -> %pI4\n",
306 &iph->saddr, &iph->daddr); 303 &iph->saddr, &iph->daddr);
307 in_dev_put(in_dev);
308 goto drop; 304 goto drop;
309 } 305 }
310
311 in_dev_put(in_dev);
312 } 306 }
313 307
314 if (ip_options_rcv_srr(skb)) 308 if (ip_options_rcv_srr(skb))
@@ -330,8 +324,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
330 * how the packet travels inside Linux networking. 324 * how the packet travels inside Linux networking.
331 */ 325 */
332 if (skb_dst(skb) == NULL) { 326 if (skb_dst(skb) == NULL) {
333 int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, 327 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
334 skb->dev); 328 iph->tos, skb->dev);
335 if (unlikely(err)) { 329 if (unlikely(err)) {
336 if (err == -EHOSTUNREACH) 330 if (err == -EHOSTUNREACH)
337 IP_INC_STATS_BH(dev_net(skb->dev), 331 IP_INC_STATS_BH(dev_net(skb->dev),
@@ -339,13 +333,16 @@ static int ip_rcv_finish(struct sk_buff *skb)
339 else if (err == -ENETUNREACH) 333 else if (err == -ENETUNREACH)
340 IP_INC_STATS_BH(dev_net(skb->dev), 334 IP_INC_STATS_BH(dev_net(skb->dev),
341 IPSTATS_MIB_INNOROUTES); 335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
337 NET_INC_STATS_BH(dev_net(skb->dev),
338 LINUX_MIB_IPRPFILTER);
342 goto drop; 339 goto drop;
343 } 340 }
344 } 341 }
345 342
346#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_NET_CLS_ROUTE
347 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
348 struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
349 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
350 st[idx&0xFF].o_packets++; 347 st[idx&0xFF].o_packets++;
351 st[idx&0xFF].o_bytes += skb->len; 348 st[idx&0xFF].o_bytes += skb->len;
@@ -359,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb)
359 356
360 rt = skb_rtable(skb); 357 rt = skb_rtable(skb);
361 if (rt->rt_type == RTN_MULTICAST) { 358 if (rt->rt_type == RTN_MULTICAST) {
362 IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, 359 IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
363 skb->len); 360 skb->len);
364 } else if (rt->rt_type == RTN_BROADCAST) 361 } else if (rt->rt_type == RTN_BROADCAST)
365 IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, 362 IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
366 skb->len); 363 skb->len);
367 364
368 return dst_input(skb); 365 return dst_input(skb);
@@ -440,7 +437,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
440 /* Remove any debris in the socket control block */ 437 /* Remove any debris in the socket control block */
441 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); 438 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
442 439
443 return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, 440 /* Must drop socket now because of tproxy. */
441 skb_orphan(skb);
442
443 return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
444 ip_rcv_finish); 444 ip_rcv_finish);
445 445
446inhdr_error: 446inhdr_error:
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 94bf105ef3c9..1906fa35860c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/types.h> 15#include <linux/types.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
@@ -237,7 +238,6 @@ void ip_options_fragment(struct sk_buff * skb)
237 opt->rr_needaddr = 0; 238 opt->rr_needaddr = 0;
238 opt->ts_needaddr = 0; 239 opt->ts_needaddr = 0;
239 opt->ts_needtime = 0; 240 opt->ts_needtime = 0;
240 return;
241} 241}
242 242
243/* 243/*
@@ -466,7 +466,7 @@ error:
466 } 466 }
467 return -EINVAL; 467 return -EINVAL;
468} 468}
469 469EXPORT_SYMBOL(ip_options_compile);
470 470
471/* 471/*
472 * Undo all the changes done by ip_options_compile(). 472 * Undo all the changes done by ip_options_compile().
@@ -600,6 +600,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
600 unsigned char *optptr = skb_network_header(skb) + opt->srr; 600 unsigned char *optptr = skb_network_header(skb) + opt->srr;
601 struct rtable *rt = skb_rtable(skb); 601 struct rtable *rt = skb_rtable(skb);
602 struct rtable *rt2; 602 struct rtable *rt2;
603 unsigned long orefdst;
603 int err; 604 int err;
604 605
605 if (!opt->srr) 606 if (!opt->srr)
@@ -623,16 +624,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
623 } 624 }
624 memcpy(&nexthop, &optptr[srrptr-1], 4); 625 memcpy(&nexthop, &optptr[srrptr-1], 4);
625 626
626 rt = skb_rtable(skb); 627 orefdst = skb->_skb_refdst;
627 skb_dst_set(skb, NULL); 628 skb_dst_set(skb, NULL);
628 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); 629 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
629 rt2 = skb_rtable(skb); 630 rt2 = skb_rtable(skb);
630 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { 631 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
631 ip_rt_put(rt2); 632 skb_dst_drop(skb);
632 skb_dst_set(skb, &rt->u.dst); 633 skb->_skb_refdst = orefdst;
633 return -EINVAL; 634 return -EINVAL;
634 } 635 }
635 ip_rt_put(rt); 636 refdst_drop(orefdst);
636 if (rt2->rt_type != RTN_LOCAL) 637 if (rt2->rt_type != RTN_LOCAL)
637 break; 638 break;
638 /* Superfast 8) loopback forward */ 639 /* Superfast 8) loopback forward */
@@ -645,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
645 } 646 }
646 return 0; 647 return 0;
647} 648}
649EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 247026282669..04c7b3ba6b39 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -51,6 +51,7 @@
51#include <linux/string.h> 51#include <linux/string.h>
52#include <linux/errno.h> 52#include <linux/errno.h>
53#include <linux/highmem.h> 53#include <linux/highmem.h>
54#include <linux/slab.h>
54 55
55#include <linux/socket.h> 56#include <linux/socket.h>
56#include <linux/sockios.h> 57#include <linux/sockios.h>
@@ -81,6 +82,7 @@
81#include <linux/tcp.h> 82#include <linux/tcp.h>
82 83
83int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
84 86
85/* Generate a checksum for an outgoing IP datagram. */ 87/* Generate a checksum for an outgoing IP datagram. */
86__inline__ void ip_send_check(struct iphdr *iph) 88__inline__ void ip_send_check(struct iphdr *iph)
@@ -88,6 +90,7 @@ __inline__ void ip_send_check(struct iphdr *iph)
88 iph->check = 0; 90 iph->check = 0;
89 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
90} 92}
93EXPORT_SYMBOL(ip_send_check);
91 94
92int __ip_local_out(struct sk_buff *skb) 95int __ip_local_out(struct sk_buff *skb)
93{ 96{
@@ -95,8 +98,8 @@ int __ip_local_out(struct sk_buff *skb)
95 98
96 iph->tot_len = htons(skb->len); 99 iph->tot_len = htons(skb->len);
97 ip_send_check(iph); 100 ip_send_check(iph);
98 return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, 101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
99 dst_output); 102 skb_dst(skb)->dev, dst_output);
100} 103}
101 104
102int ip_local_out(struct sk_buff *skb) 105int ip_local_out(struct sk_buff *skb)
@@ -119,7 +122,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 newskb->pkt_type = PACKET_LOOPBACK; 122 newskb->pkt_type = PACKET_LOOPBACK;
120 newskb->ip_summed = CHECKSUM_UNNECESSARY; 123 newskb->ip_summed = CHECKSUM_UNNECESSARY;
121 WARN_ON(!skb_dst(newskb)); 124 WARN_ON(!skb_dst(newskb));
122 netif_rx(newskb); 125 netif_rx_ni(newskb);
123 return 0; 126 return 0;
124} 127}
125 128
@@ -128,7 +131,7 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
128 int ttl = inet->uc_ttl; 131 int ttl = inet->uc_ttl;
129 132
130 if (ttl < 0) 133 if (ttl < 0)
131 ttl = dst_metric(dst, RTAX_HOPLIMIT); 134 ttl = ip4_dst_hoplimit(dst);
132 return ttl; 135 return ttl;
133} 136}
134 137
@@ -150,15 +153,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
150 iph->version = 4; 153 iph->version = 4;
151 iph->ihl = 5; 154 iph->ihl = 5;
152 iph->tos = inet->tos; 155 iph->tos = inet->tos;
153 if (ip_dont_fragment(sk, &rt->u.dst)) 156 if (ip_dont_fragment(sk, &rt->dst))
154 iph->frag_off = htons(IP_DF); 157 iph->frag_off = htons(IP_DF);
155 else 158 else
156 iph->frag_off = 0; 159 iph->frag_off = 0;
157 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 160 iph->ttl = ip_select_ttl(inet, &rt->dst);
158 iph->daddr = rt->rt_dst; 161 iph->daddr = rt->rt_dst;
159 iph->saddr = rt->rt_src; 162 iph->saddr = rt->rt_src;
160 iph->protocol = sk->sk_protocol; 163 iph->protocol = sk->sk_protocol;
161 ip_select_ident(iph, &rt->u.dst, sk); 164 ip_select_ident(iph, &rt->dst, sk);
162 165
163 if (opt && opt->optlen) { 166 if (opt && opt->optlen) {
164 iph->ihl += opt->optlen>>2; 167 iph->ihl += opt->optlen>>2;
@@ -171,7 +174,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
171 /* Send it out. */ 174 /* Send it out. */
172 return ip_local_out(skb); 175 return ip_local_out(skb);
173} 176}
174
175EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); 177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
176 178
177static inline int ip_finish_output2(struct sk_buff *skb) 179static inline int ip_finish_output2(struct sk_buff *skb)
@@ -239,7 +241,7 @@ int ip_mc_output(struct sk_buff *skb)
239{ 241{
240 struct sock *sk = skb->sk; 242 struct sock *sk = skb->sk;
241 struct rtable *rt = skb_rtable(skb); 243 struct rtable *rt = skb_rtable(skb);
242 struct net_device *dev = rt->u.dst.dev; 244 struct net_device *dev = rt->dst.dev;
243 245
244 /* 246 /*
245 * If the indicated interface is up and running, send the packet. 247 * If the indicated interface is up and running, send the packet.
@@ -254,7 +256,7 @@ int ip_mc_output(struct sk_buff *skb)
254 */ 256 */
255 257
256 if (rt->rt_flags&RTCF_MULTICAST) { 258 if (rt->rt_flags&RTCF_MULTICAST) {
257 if ((!sk || inet_sk(sk)->mc_loop) 259 if (sk_mc_loop(sk)
258#ifdef CONFIG_IP_MROUTE 260#ifdef CONFIG_IP_MROUTE
259 /* Small optimization: do not loopback not local frames, 261 /* Small optimization: do not loopback not local frames,
260 which returned after forwarding; they will be dropped 262 which returned after forwarding; they will be dropped
@@ -264,13 +266,15 @@ int ip_mc_output(struct sk_buff *skb)
264 266
265 This check is duplicated in ip_mr_input at the moment. 267 This check is duplicated in ip_mr_input at the moment.
266 */ 268 */
267 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) 269 &&
270 ((rt->rt_flags & RTCF_LOCAL) ||
271 !(IPCB(skb)->flags & IPSKB_FORWARDED))
268#endif 272#endif
269 ) { 273 ) {
270 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 274 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
271 if (newskb) 275 if (newskb)
272 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, 276 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
273 NULL, newskb->dev, 277 newskb, NULL, newskb->dev,
274 ip_dev_loopback_xmit); 278 ip_dev_loopback_xmit);
275 } 279 }
276 280
@@ -285,12 +289,12 @@ int ip_mc_output(struct sk_buff *skb)
285 if (rt->rt_flags&RTCF_BROADCAST) { 289 if (rt->rt_flags&RTCF_BROADCAST) {
286 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 290 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
287 if (newskb) 291 if (newskb)
288 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, 292 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
289 newskb->dev, ip_dev_loopback_xmit); 293 NULL, newskb->dev, ip_dev_loopback_xmit);
290 } 294 }
291 295
292 return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, 296 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
293 ip_finish_output, 297 skb->dev, ip_finish_output,
294 !(IPCB(skb)->flags & IPSKB_REROUTED)); 298 !(IPCB(skb)->flags & IPSKB_REROUTED));
295} 299}
296 300
@@ -303,22 +307,24 @@ int ip_output(struct sk_buff *skb)
303 skb->dev = dev; 307 skb->dev = dev;
304 skb->protocol = htons(ETH_P_IP); 308 skb->protocol = htons(ETH_P_IP);
305 309
306 return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, 310 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
307 ip_finish_output, 311 ip_finish_output,
308 !(IPCB(skb)->flags & IPSKB_REROUTED)); 312 !(IPCB(skb)->flags & IPSKB_REROUTED));
309} 313}
310 314
311int ip_queue_xmit(struct sk_buff *skb, int ipfragok) 315int ip_queue_xmit(struct sk_buff *skb)
312{ 316{
313 struct sock *sk = skb->sk; 317 struct sock *sk = skb->sk;
314 struct inet_sock *inet = inet_sk(sk); 318 struct inet_sock *inet = inet_sk(sk);
315 struct ip_options *opt = inet->opt; 319 struct ip_options *opt = inet->opt;
316 struct rtable *rt; 320 struct rtable *rt;
317 struct iphdr *iph; 321 struct iphdr *iph;
322 int res;
318 323
319 /* Skip all of this if the packet is already routed, 324 /* Skip all of this if the packet is already routed,
320 * f.e. by something like SCTP. 325 * f.e. by something like SCTP.
321 */ 326 */
327 rcu_read_lock();
322 rt = skb_rtable(skb); 328 rt = skb_rtable(skb);
323 if (rt != NULL) 329 if (rt != NULL)
324 goto packet_routed; 330 goto packet_routed;
@@ -329,21 +335,20 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
329 __be32 daddr; 335 __be32 daddr;
330 336
331 /* Use correct destination address if we have options. */ 337 /* Use correct destination address if we have options. */
332 daddr = inet->daddr; 338 daddr = inet->inet_daddr;
333 if(opt && opt->srr) 339 if(opt && opt->srr)
334 daddr = opt->faddr; 340 daddr = opt->faddr;
335 341
336 { 342 {
337 struct flowi fl = { .oif = sk->sk_bound_dev_if, 343 struct flowi fl = { .oif = sk->sk_bound_dev_if,
338 .nl_u = { .ip4_u = 344 .mark = sk->sk_mark,
339 { .daddr = daddr, 345 .fl4_dst = daddr,
340 .saddr = inet->saddr, 346 .fl4_src = inet->inet_saddr,
341 .tos = RT_CONN_FLAGS(sk) } }, 347 .fl4_tos = RT_CONN_FLAGS(sk),
342 .proto = sk->sk_protocol, 348 .proto = sk->sk_protocol,
343 .flags = inet_sk_flowi_flags(sk), 349 .flags = inet_sk_flowi_flags(sk),
344 .uli_u = { .ports = 350 .fl_ip_sport = inet->inet_sport,
345 { .sport = inet->sport, 351 .fl_ip_dport = inet->inet_dport };
346 .dport = inet->dport } } };
347 352
348 /* If this fails, retransmit mechanism of transport layer will 353 /* If this fails, retransmit mechanism of transport layer will
349 * keep trying until route appears or the connection times 354 * keep trying until route appears or the connection times
@@ -353,9 +358,9 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
353 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) 358 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
354 goto no_route; 359 goto no_route;
355 } 360 }
356 sk_setup_caps(sk, &rt->u.dst); 361 sk_setup_caps(sk, &rt->dst);
357 } 362 }
358 skb_dst_set(skb, dst_clone(&rt->u.dst)); 363 skb_dst_set_noref(skb, &rt->dst);
359 364
360packet_routed: 365packet_routed:
361 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 366 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -366,11 +371,11 @@ packet_routed:
366 skb_reset_network_header(skb); 371 skb_reset_network_header(skb);
367 iph = ip_hdr(skb); 372 iph = ip_hdr(skb);
368 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 373 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
369 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) 374 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
370 iph->frag_off = htons(IP_DF); 375 iph->frag_off = htons(IP_DF);
371 else 376 else
372 iph->frag_off = 0; 377 iph->frag_off = 0;
373 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 378 iph->ttl = ip_select_ttl(inet, &rt->dst);
374 iph->protocol = sk->sk_protocol; 379 iph->protocol = sk->sk_protocol;
375 iph->saddr = rt->rt_src; 380 iph->saddr = rt->rt_src;
376 iph->daddr = rt->rt_dst; 381 iph->daddr = rt->rt_dst;
@@ -378,22 +383,26 @@ packet_routed:
378 383
379 if (opt && opt->optlen) { 384 if (opt && opt->optlen) {
380 iph->ihl += opt->optlen >> 2; 385 iph->ihl += opt->optlen >> 2;
381 ip_options_build(skb, opt, inet->daddr, rt, 0); 386 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
382 } 387 }
383 388
384 ip_select_ident_more(iph, &rt->u.dst, sk, 389 ip_select_ident_more(iph, &rt->dst, sk,
385 (skb_shinfo(skb)->gso_segs ?: 1) - 1); 390 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
386 391
387 skb->priority = sk->sk_priority; 392 skb->priority = sk->sk_priority;
388 skb->mark = sk->sk_mark; 393 skb->mark = sk->sk_mark;
389 394
390 return ip_local_out(skb); 395 res = ip_local_out(skb);
396 rcu_read_unlock();
397 return res;
391 398
392no_route: 399no_route:
400 rcu_read_unlock();
393 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 401 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
394 kfree_skb(skb); 402 kfree_skb(skb);
395 return -EHOSTUNREACH; 403 return -EHOSTUNREACH;
396} 404}
405EXPORT_SYMBOL(ip_queue_xmit);
397 406
398 407
399static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 408static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
@@ -402,7 +411,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
402 to->priority = from->priority; 411 to->priority = from->priority;
403 to->protocol = from->protocol; 412 to->protocol = from->protocol;
404 skb_dst_drop(to); 413 skb_dst_drop(to);
405 skb_dst_set(to, dst_clone(skb_dst(from))); 414 skb_dst_copy(to, from);
406 to->dev = from->dev; 415 to->dev = from->dev;
407 to->mark = from->mark; 416 to->mark = from->mark;
408 417
@@ -433,17 +442,16 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
433int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 442int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
434{ 443{
435 struct iphdr *iph; 444 struct iphdr *iph;
436 int raw = 0;
437 int ptr; 445 int ptr;
438 struct net_device *dev; 446 struct net_device *dev;
439 struct sk_buff *skb2; 447 struct sk_buff *skb2;
440 unsigned int mtu, hlen, left, len, ll_rs, pad; 448 unsigned int mtu, hlen, left, len, ll_rs;
441 int offset; 449 int offset;
442 __be16 not_last_frag; 450 __be16 not_last_frag;
443 struct rtable *rt = skb_rtable(skb); 451 struct rtable *rt = skb_rtable(skb);
444 int err = 0; 452 int err = 0;
445 453
446 dev = rt->u.dst.dev; 454 dev = rt->dst.dev;
447 455
448 /* 456 /*
449 * Point into the IP datagram header. 457 * Point into the IP datagram header.
@@ -464,7 +472,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
464 */ 472 */
465 473
466 hlen = iph->ihl * 4; 474 hlen = iph->ihl * 4;
467 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ 475 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
476#ifdef CONFIG_BRIDGE_NETFILTER
477 if (skb->nf_bridge)
478 mtu -= nf_bridge_mtu_reduction(skb);
479#endif
468 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; 480 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
469 481
470 /* When frag_list is given, use it. First, check its validity: 482 /* When frag_list is given, use it. First, check its validity:
@@ -474,10 +486,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
474 * LATER: this step can be merged to real generation of fragments, 486 * LATER: this step can be merged to real generation of fragments,
475 * we can switch to copy when see the first bad fragment. 487 * we can switch to copy when see the first bad fragment.
476 */ 488 */
477 if (skb_has_frags(skb)) { 489 if (skb_has_frag_list(skb)) {
478 struct sk_buff *frag; 490 struct sk_buff *frag, *frag2;
479 int first_len = skb_pagelen(skb); 491 int first_len = skb_pagelen(skb);
480 int truesizes = 0;
481 492
482 if (first_len - hlen > mtu || 493 if (first_len - hlen > mtu ||
483 ((first_len - hlen) & 7) || 494 ((first_len - hlen) & 7) ||
@@ -490,18 +501,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
490 if (frag->len > mtu || 501 if (frag->len > mtu ||
491 ((frag->len & 7) && frag->next) || 502 ((frag->len & 7) && frag->next) ||
492 skb_headroom(frag) < hlen) 503 skb_headroom(frag) < hlen)
493 goto slow_path; 504 goto slow_path_clean;
494 505
495 /* Partially cloned skb? */ 506 /* Partially cloned skb? */
496 if (skb_shared(frag)) 507 if (skb_shared(frag))
497 goto slow_path; 508 goto slow_path_clean;
498 509
499 BUG_ON(frag->sk); 510 BUG_ON(frag->sk);
500 if (skb->sk) { 511 if (skb->sk) {
501 frag->sk = skb->sk; 512 frag->sk = skb->sk;
502 frag->destructor = sock_wfree; 513 frag->destructor = sock_wfree;
503 truesizes += frag->truesize;
504 } 514 }
515 skb->truesize -= frag->truesize;
505 } 516 }
506 517
507 /* Everything is OK. Generate! */ 518 /* Everything is OK. Generate! */
@@ -511,7 +522,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
511 frag = skb_shinfo(skb)->frag_list; 522 frag = skb_shinfo(skb)->frag_list;
512 skb_frag_list_init(skb); 523 skb_frag_list_init(skb);
513 skb->data_len = first_len - skb_headlen(skb); 524 skb->data_len = first_len - skb_headlen(skb);
514 skb->truesize -= truesizes;
515 skb->len = first_len; 525 skb->len = first_len;
516 iph->tot_len = htons(first_len); 526 iph->tot_len = htons(first_len);
517 iph->frag_off = htons(IP_MF); 527 iph->frag_off = htons(IP_MF);
@@ -563,18 +573,25 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
563 } 573 }
564 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 574 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
565 return err; 575 return err;
576
577slow_path_clean:
578 skb_walk_frags(skb, frag2) {
579 if (frag2 == frag)
580 break;
581 frag2->sk = NULL;
582 frag2->destructor = NULL;
583 skb->truesize += frag2->truesize;
584 }
566 } 585 }
567 586
568slow_path: 587slow_path:
569 left = skb->len - hlen; /* Space per frame */ 588 left = skb->len - hlen; /* Space per frame */
570 ptr = raw + hlen; /* Where to start from */ 589 ptr = hlen; /* Where to start from */
571 590
572 /* for bridged IP traffic encapsulated inside f.e. a vlan header, 591 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
573 * we need to make room for the encapsulating header 592 * we need to make room for the encapsulating header
574 */ 593 */
575 pad = nf_bridge_pad(skb); 594 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
576 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
577 mtu -= pad;
578 595
579 /* 596 /*
580 * Fragment the datagram. 597 * Fragment the datagram.
@@ -684,7 +701,6 @@ fail:
684 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 701 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
685 return err; 702 return err;
686} 703}
687
688EXPORT_SYMBOL(ip_fragment); 704EXPORT_SYMBOL(ip_fragment);
689 705
690int 706int
@@ -703,6 +719,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
703 } 719 }
704 return 0; 720 return 0;
705} 721}
722EXPORT_SYMBOL(ip_generic_getfrag);
706 723
707static inline __wsum 724static inline __wsum
708csum_page(struct page *page, int offset, int copy) 725csum_page(struct page *page, int offset, int copy)
@@ -813,21 +830,22 @@ int ip_append_data(struct sock *sk,
813 inet->cork.addr = ipc->addr; 830 inet->cork.addr = ipc->addr;
814 } 831 }
815 rt = *rtp; 832 rt = *rtp;
833 if (unlikely(!rt))
834 return -EFAULT;
816 /* 835 /*
817 * We steal reference to this route, caller should not release it 836 * We steal reference to this route, caller should not release it
818 */ 837 */
819 *rtp = NULL; 838 *rtp = NULL;
820 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? 839 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
821 rt->u.dst.dev->mtu : 840 rt->dst.dev->mtu :
822 dst_mtu(rt->u.dst.path); 841 dst_mtu(rt->dst.path);
823 inet->cork.dst = &rt->u.dst; 842 inet->cork.dst = &rt->dst;
824 inet->cork.length = 0; 843 inet->cork.length = 0;
825 sk->sk_sndmsg_page = NULL; 844 sk->sk_sndmsg_page = NULL;
826 sk->sk_sndmsg_off = 0; 845 sk->sk_sndmsg_off = 0;
827 if ((exthdrlen = rt->u.dst.header_len) != 0) { 846 exthdrlen = rt->dst.header_len;
828 length += exthdrlen; 847 length += exthdrlen;
829 transhdrlen += exthdrlen; 848 transhdrlen += exthdrlen;
830 }
831 } else { 849 } else {
832 rt = (struct rtable *)inet->cork.dst; 850 rt = (struct rtable *)inet->cork.dst;
833 if (inet->cork.flags & IPCORK_OPT) 851 if (inet->cork.flags & IPCORK_OPT)
@@ -837,13 +855,14 @@ int ip_append_data(struct sock *sk,
837 exthdrlen = 0; 855 exthdrlen = 0;
838 mtu = inet->cork.fragsize; 856 mtu = inet->cork.fragsize;
839 } 857 }
840 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 858 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
841 859
842 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 860 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
843 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 861 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
844 862
845 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 863 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
846 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); 864 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
865 mtu-exthdrlen);
847 return -EMSGSIZE; 866 return -EMSGSIZE;
848 } 867 }
849 868
@@ -853,14 +872,16 @@ int ip_append_data(struct sock *sk,
853 */ 872 */
854 if (transhdrlen && 873 if (transhdrlen &&
855 length + fragheaderlen <= mtu && 874 length + fragheaderlen <= mtu &&
856 rt->u.dst.dev->features & NETIF_F_V4_CSUM && 875 rt->dst.dev->features & NETIF_F_V4_CSUM &&
857 !exthdrlen) 876 !exthdrlen)
858 csummode = CHECKSUM_PARTIAL; 877 csummode = CHECKSUM_PARTIAL;
859 878
879 skb = skb_peek_tail(&sk->sk_write_queue);
880
860 inet->cork.length += length; 881 inet->cork.length += length;
861 if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && 882 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
862 (sk->sk_protocol == IPPROTO_UDP) && 883 (sk->sk_protocol == IPPROTO_UDP) &&
863 (rt->u.dst.dev->features & NETIF_F_UFO)) { 884 (rt->dst.dev->features & NETIF_F_UFO)) {
864 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 885 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
865 fragheaderlen, transhdrlen, mtu, 886 fragheaderlen, transhdrlen, mtu,
866 flags); 887 flags);
@@ -876,7 +897,7 @@ int ip_append_data(struct sock *sk,
876 * adding appropriate IP header. 897 * adding appropriate IP header.
877 */ 898 */
878 899
879 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 900 if (!skb)
880 goto alloc_new_skb; 901 goto alloc_new_skb;
881 902
882 while (length > 0) { 903 while (length > 0) {
@@ -908,19 +929,22 @@ alloc_new_skb:
908 fraglen = datalen + fragheaderlen; 929 fraglen = datalen + fragheaderlen;
909 930
910 if ((flags & MSG_MORE) && 931 if ((flags & MSG_MORE) &&
911 !(rt->u.dst.dev->features&NETIF_F_SG)) 932 !(rt->dst.dev->features&NETIF_F_SG))
912 alloclen = mtu; 933 alloclen = mtu;
913 else 934 else
914 alloclen = datalen + fragheaderlen; 935 alloclen = fraglen;
915 936
916 /* The last fragment gets additional space at tail. 937 /* The last fragment gets additional space at tail.
917 * Note, with MSG_MORE we overallocate on fragments, 938 * Note, with MSG_MORE we overallocate on fragments,
918 * because we have no idea what fragment will be 939 * because we have no idea what fragment will be
919 * the last. 940 * the last.
920 */ 941 */
921 if (datalen == length + fraggap) 942 if (datalen == length + fraggap) {
922 alloclen += rt->u.dst.trailer_len; 943 alloclen += rt->dst.trailer_len;
923 944 /* make sure mtu is not reached */
945 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
946 datalen -= ALIGN(rt->dst.trailer_len, 8);
947 }
924 if (transhdrlen) { 948 if (transhdrlen) {
925 skb = sock_alloc_send_skb(sk, 949 skb = sock_alloc_send_skb(sk,
926 alloclen + hh_len + 15, 950 alloclen + hh_len + 15,
@@ -937,7 +961,7 @@ alloc_new_skb:
937 else 961 else
938 /* only the initial fragment is 962 /* only the initial fragment is
939 time stamped */ 963 time stamped */
940 ipc->shtx.flags = 0; 964 ipc->tx_flags = 0;
941 } 965 }
942 if (skb == NULL) 966 if (skb == NULL)
943 goto error; 967 goto error;
@@ -948,7 +972,7 @@ alloc_new_skb:
948 skb->ip_summed = csummode; 972 skb->ip_summed = csummode;
949 skb->csum = 0; 973 skb->csum = 0;
950 skb_reserve(skb, hh_len); 974 skb_reserve(skb, hh_len);
951 *skb_tx(skb) = ipc->shtx; 975 skb_shinfo(skb)->tx_flags = ipc->tx_flags;
952 976
953 /* 977 /*
954 * Find where to start putting bytes. 978 * Find where to start putting bytes.
@@ -992,7 +1016,7 @@ alloc_new_skb:
992 if (copy > length) 1016 if (copy > length)
993 copy = length; 1017 copy = length;
994 1018
995 if (!(rt->u.dst.dev->features&NETIF_F_SG)) { 1019 if (!(rt->dst.dev->features&NETIF_F_SG)) {
996 unsigned int off; 1020 unsigned int off;
997 1021
998 off = skb->len; 1022 off = skb->len;
@@ -1087,17 +1111,17 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1087 if (inet->cork.flags & IPCORK_OPT) 1111 if (inet->cork.flags & IPCORK_OPT)
1088 opt = inet->cork.opt; 1112 opt = inet->cork.opt;
1089 1113
1090 if (!(rt->u.dst.dev->features&NETIF_F_SG)) 1114 if (!(rt->dst.dev->features&NETIF_F_SG))
1091 return -EOPNOTSUPP; 1115 return -EOPNOTSUPP;
1092 1116
1093 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 1117 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1094 mtu = inet->cork.fragsize; 1118 mtu = inet->cork.fragsize;
1095 1119
1096 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1120 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1097 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1121 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1098 1122
1099 if (inet->cork.length + size > 0xFFFF - fragheaderlen) { 1123 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1100 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); 1124 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1101 return -EMSGSIZE; 1125 return -EMSGSIZE;
1102 } 1126 }
1103 1127
@@ -1105,8 +1129,9 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1105 return -EINVAL; 1129 return -EINVAL;
1106 1130
1107 inet->cork.length += size; 1131 inet->cork.length += size;
1108 if ((sk->sk_protocol == IPPROTO_UDP) && 1132 if ((size + skb->len > mtu) &&
1109 (rt->u.dst.dev->features & NETIF_F_UFO)) { 1133 (sk->sk_protocol == IPPROTO_UDP) &&
1134 (rt->dst.dev->features & NETIF_F_UFO)) {
1110 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 1135 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1111 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 1136 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1112 } 1137 }
@@ -1243,7 +1268,6 @@ int ip_push_pending_frames(struct sock *sk)
1243 skb->len += tmp_skb->len; 1268 skb->len += tmp_skb->len;
1244 skb->data_len += tmp_skb->len; 1269 skb->data_len += tmp_skb->len;
1245 skb->truesize += tmp_skb->truesize; 1270 skb->truesize += tmp_skb->truesize;
1246 __sock_put(tmp_skb->sk);
1247 tmp_skb->destructor = NULL; 1271 tmp_skb->destructor = NULL;
1248 tmp_skb->sk = NULL; 1272 tmp_skb->sk = NULL;
1249 } 1273 }
@@ -1259,8 +1283,8 @@ int ip_push_pending_frames(struct sock *sk)
1259 * If local_df is set too, we still allow to fragment this frame 1283 * If local_df is set too, we still allow to fragment this frame
1260 * locally. */ 1284 * locally. */
1261 if (inet->pmtudisc >= IP_PMTUDISC_DO || 1285 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1262 (skb->len <= dst_mtu(&rt->u.dst) && 1286 (skb->len <= dst_mtu(&rt->dst) &&
1263 ip_dont_fragment(sk, &rt->u.dst))) 1287 ip_dont_fragment(sk, &rt->dst)))
1264 df = htons(IP_DF); 1288 df = htons(IP_DF);
1265 1289
1266 if (inet->cork.flags & IPCORK_OPT) 1290 if (inet->cork.flags & IPCORK_OPT)
@@ -1269,7 +1293,7 @@ int ip_push_pending_frames(struct sock *sk)
1269 if (rt->rt_type == RTN_MULTICAST) 1293 if (rt->rt_type == RTN_MULTICAST)
1270 ttl = inet->mc_ttl; 1294 ttl = inet->mc_ttl;
1271 else 1295 else
1272 ttl = ip_select_ttl(inet, &rt->u.dst); 1296 ttl = ip_select_ttl(inet, &rt->dst);
1273 1297
1274 iph = (struct iphdr *)skb->data; 1298 iph = (struct iphdr *)skb->data;
1275 iph->version = 4; 1299 iph->version = 4;
@@ -1280,7 +1304,7 @@ int ip_push_pending_frames(struct sock *sk)
1280 } 1304 }
1281 iph->tos = inet->tos; 1305 iph->tos = inet->tos;
1282 iph->frag_off = df; 1306 iph->frag_off = df;
1283 ip_select_ident(iph, &rt->u.dst, sk); 1307 ip_select_ident(iph, &rt->dst, sk);
1284 iph->ttl = ttl; 1308 iph->ttl = ttl;
1285 iph->protocol = sk->sk_protocol; 1309 iph->protocol = sk->sk_protocol;
1286 iph->saddr = rt->rt_src; 1310 iph->saddr = rt->rt_src;
@@ -1293,7 +1317,7 @@ int ip_push_pending_frames(struct sock *sk)
1293 * on dst refcount 1317 * on dst refcount
1294 */ 1318 */
1295 inet->cork.dst = NULL; 1319 inet->cork.dst = NULL;
1296 skb_dst_set(skb, &rt->u.dst); 1320 skb_dst_set(skb, &rt->dst);
1297 1321
1298 if (iph->protocol == IPPROTO_ICMP) 1322 if (iph->protocol == IPPROTO_ICMP)
1299 icmp_out_count(net, ((struct icmphdr *) 1323 icmp_out_count(net, ((struct icmphdr *)
@@ -1303,7 +1327,7 @@ int ip_push_pending_frames(struct sock *sk)
1303 err = ip_local_out(skb); 1327 err = ip_local_out(skb);
1304 if (err) { 1328 if (err) {
1305 if (err > 0) 1329 if (err > 0)
1306 err = inet->recverr ? net_xmit_errno(err) : 0; 1330 err = net_xmit_errno(err);
1307 if (err) 1331 if (err)
1308 goto error; 1332 goto error;
1309 } 1333 }
@@ -1368,7 +1392,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1368 1392
1369 daddr = ipc.addr = rt->rt_src; 1393 daddr = ipc.addr = rt->rt_src;
1370 ipc.opt = NULL; 1394 ipc.opt = NULL;
1371 ipc.shtx.flags = 0; 1395 ipc.tx_flags = 0;
1372 1396
1373 if (replyopts.opt.optlen) { 1397 if (replyopts.opt.optlen) {
1374 ipc.opt = &replyopts.opt; 1398 ipc.opt = &replyopts.opt;
@@ -1379,14 +1403,11 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1379 1403
1380 { 1404 {
1381 struct flowi fl = { .oif = arg->bound_dev_if, 1405 struct flowi fl = { .oif = arg->bound_dev_if,
1382 .nl_u = { .ip4_u = 1406 .fl4_dst = daddr,
1383 { .daddr = daddr, 1407 .fl4_src = rt->rt_spec_dst,
1384 .saddr = rt->rt_spec_dst, 1408 .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
1385 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 1409 .fl_ip_sport = tcp_hdr(skb)->dest,
1386 /* Not quite clean, but right. */ 1410 .fl_ip_dport = tcp_hdr(skb)->source,
1387 .uli_u = { .ports =
1388 { .sport = tcp_hdr(skb)->dest,
1389 .dport = tcp_hdr(skb)->source } },
1390 .proto = sk->sk_protocol, 1411 .proto = sk->sk_protocol,
1391 .flags = ip_reply_arg_flowi_flags(arg) }; 1412 .flags = ip_reply_arg_flowi_flags(arg) };
1392 security_skb_classify_flow(skb, &fl); 1413 security_skb_classify_flow(skb, &fl);
@@ -1430,7 +1451,3 @@ void __init ip_init(void)
1430 igmp_mc_proc_init(); 1451 igmp_mc_proc_init();
1431#endif 1452#endif
1432} 1453}
1433
1434EXPORT_SYMBOL(ip_generic_getfrag);
1435EXPORT_SYMBOL(ip_queue_xmit);
1436EXPORT_SYMBOL(ip_send_check);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc7993e9061f..3948c86e59ca 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -23,6 +23,7 @@
23#include <linux/icmp.h> 23#include <linux/icmp.h>
24#include <linux/inetdevice.h> 24#include <linux/inetdevice.h>
25#include <linux/netdevice.h> 25#include <linux/netdevice.h>
26#include <linux/slab.h>
26#include <net/sock.h> 27#include <net/sock.h>
27#include <net/ip.h> 28#include <net/ip.h>
28#include <net/icmp.h> 29#include <net/icmp.h>
@@ -237,48 +238,68 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
237 but receiver should be enough clever f.e. to forward mtrace requests, 238 but receiver should be enough clever f.e. to forward mtrace requests,
238 sent to multicast group to reach destination designated router. 239 sent to multicast group to reach destination designated router.
239 */ 240 */
240struct ip_ra_chain *ip_ra_chain; 241struct ip_ra_chain __rcu *ip_ra_chain;
241DEFINE_RWLOCK(ip_ra_lock); 242static DEFINE_SPINLOCK(ip_ra_lock);
243
244
245static void ip_ra_destroy_rcu(struct rcu_head *head)
246{
247 struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
248
249 sock_put(ra->saved_sk);
250 kfree(ra);
251}
242 252
243int ip_ra_control(struct sock *sk, unsigned char on, 253int ip_ra_control(struct sock *sk, unsigned char on,
244 void (*destructor)(struct sock *)) 254 void (*destructor)(struct sock *))
245{ 255{
246 struct ip_ra_chain *ra, *new_ra, **rap; 256 struct ip_ra_chain *ra, *new_ra;
257 struct ip_ra_chain __rcu **rap;
247 258
248 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW) 259 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
249 return -EINVAL; 260 return -EINVAL;
250 261
251 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 262 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
252 263
253 write_lock_bh(&ip_ra_lock); 264 spin_lock_bh(&ip_ra_lock);
254 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { 265 for (rap = &ip_ra_chain;
266 (ra = rcu_dereference_protected(*rap,
267 lockdep_is_held(&ip_ra_lock))) != NULL;
268 rap = &ra->next) {
255 if (ra->sk == sk) { 269 if (ra->sk == sk) {
256 if (on) { 270 if (on) {
257 write_unlock_bh(&ip_ra_lock); 271 spin_unlock_bh(&ip_ra_lock);
258 kfree(new_ra); 272 kfree(new_ra);
259 return -EADDRINUSE; 273 return -EADDRINUSE;
260 } 274 }
261 *rap = ra->next; 275 /* dont let ip_call_ra_chain() use sk again */
262 write_unlock_bh(&ip_ra_lock); 276 ra->sk = NULL;
277 rcu_assign_pointer(*rap, ra->next);
278 spin_unlock_bh(&ip_ra_lock);
263 279
264 if (ra->destructor) 280 if (ra->destructor)
265 ra->destructor(sk); 281 ra->destructor(sk);
266 sock_put(sk); 282 /*
267 kfree(ra); 283 * Delay sock_put(sk) and kfree(ra) after one rcu grace
284 * period. This guarantee ip_call_ra_chain() dont need
285 * to mess with socket refcounts.
286 */
287 ra->saved_sk = sk;
288 call_rcu(&ra->rcu, ip_ra_destroy_rcu);
268 return 0; 289 return 0;
269 } 290 }
270 } 291 }
271 if (new_ra == NULL) { 292 if (new_ra == NULL) {
272 write_unlock_bh(&ip_ra_lock); 293 spin_unlock_bh(&ip_ra_lock);
273 return -ENOBUFS; 294 return -ENOBUFS;
274 } 295 }
275 new_ra->sk = sk; 296 new_ra->sk = sk;
276 new_ra->destructor = destructor; 297 new_ra->destructor = destructor;
277 298
278 new_ra->next = ra; 299 new_ra->next = ra;
279 *rap = new_ra; 300 rcu_assign_pointer(*rap, new_ra);
280 sock_hold(sk); 301 sock_hold(sk);
281 write_unlock_bh(&ip_ra_lock); 302 spin_unlock_bh(&ip_ra_lock);
282 303
283 return 0; 304 return 0;
284} 305}
@@ -286,12 +307,8 @@ int ip_ra_control(struct sock *sk, unsigned char on,
286void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, 307void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
287 __be16 port, u32 info, u8 *payload) 308 __be16 port, u32 info, u8 *payload)
288{ 309{
289 struct inet_sock *inet = inet_sk(sk);
290 struct sock_exterr_skb *serr; 310 struct sock_exterr_skb *serr;
291 311
292 if (!inet->recverr)
293 return;
294
295 skb = skb_clone(skb, GFP_ATOMIC); 312 skb = skb_clone(skb, GFP_ATOMIC);
296 if (!skb) 313 if (!skb)
297 return; 314 return;
@@ -440,7 +457,7 @@ out:
440 */ 457 */
441 458
442static int do_ip_setsockopt(struct sock *sk, int level, 459static int do_ip_setsockopt(struct sock *sk, int level,
443 int optname, char __user *optval, int optlen) 460 int optname, char __user *optval, unsigned int optlen)
444{ 461{
445 struct inet_sock *inet = inet_sk(sk); 462 struct inet_sock *inet = inet_sk(sk);
446 int val = 0, err; 463 int val = 0, err;
@@ -451,7 +468,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
451 (1<<IP_TTL) | (1<<IP_HDRINCL) | 468 (1<<IP_TTL) | (1<<IP_HDRINCL) |
452 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | 469 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
453 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | 470 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
454 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || 471 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
472 (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
455 optname == IP_MULTICAST_TTL || 473 optname == IP_MULTICAST_TTL ||
456 optname == IP_MULTICAST_ALL || 474 optname == IP_MULTICAST_ALL ||
457 optname == IP_MULTICAST_LOOP || 475 optname == IP_MULTICAST_LOOP ||
@@ -480,7 +498,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
480 case IP_OPTIONS: 498 case IP_OPTIONS:
481 { 499 {
482 struct ip_options *opt = NULL; 500 struct ip_options *opt = NULL;
483 if (optlen > 40 || optlen < 0) 501 if (optlen > 40)
484 goto e_inval; 502 goto e_inval;
485 err = ip_options_get_from_user(sock_net(sk), &opt, 503 err = ip_options_get_from_user(sock_net(sk), &opt,
486 optval, optlen); 504 optval, optlen);
@@ -492,7 +510,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
492 if (sk->sk_family == PF_INET || 510 if (sk->sk_family == PF_INET ||
493 (!((1 << sk->sk_state) & 511 (!((1 << sk->sk_state) &
494 (TCPF_LISTEN | TCPF_CLOSE)) && 512 (TCPF_LISTEN | TCPF_CLOSE)) &&
495 inet->daddr != LOOPBACK4_IPV6)) { 513 inet->inet_daddr != LOOPBACK4_IPV6)) {
496#endif 514#endif
497 if (inet->opt) 515 if (inet->opt)
498 icsk->icsk_ext_hdr_len -= inet->opt->optlen; 516 icsk->icsk_ext_hdr_len -= inet->opt->optlen;
@@ -574,8 +592,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
574 } 592 }
575 inet->hdrincl = val ? 1 : 0; 593 inet->hdrincl = val ? 1 : 0;
576 break; 594 break;
595 case IP_NODEFRAG:
596 if (sk->sk_type != SOCK_RAW) {
597 err = -ENOPROTOOPT;
598 break;
599 }
600 inet->nodefrag = val ? 1 : 0;
601 break;
577 case IP_MTU_DISCOVER: 602 case IP_MTU_DISCOVER:
578 if (val < 0 || val > 3) 603 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
579 goto e_inval; 604 goto e_inval;
580 inet->pmtudisc = val; 605 inet->pmtudisc = val;
581 break; 606 break;
@@ -611,6 +636,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
611 * Check the arguments are allowable 636 * Check the arguments are allowable
612 */ 637 */
613 638
639 if (optlen < sizeof(struct in_addr))
640 goto e_inval;
641
614 err = -EFAULT; 642 err = -EFAULT;
615 if (optlen >= sizeof(struct ip_mreqn)) { 643 if (optlen >= sizeof(struct ip_mreqn)) {
616 if (copy_from_user(&mreq, optval, sizeof(mreq))) 644 if (copy_from_user(&mreq, optval, sizeof(mreq)))
@@ -631,17 +659,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
631 break; 659 break;
632 } 660 }
633 dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); 661 dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr);
634 if (dev) { 662 if (dev)
635 mreq.imr_ifindex = dev->ifindex; 663 mreq.imr_ifindex = dev->ifindex;
636 dev_put(dev);
637 }
638 } else 664 } else
639 dev = __dev_get_by_index(sock_net(sk), mreq.imr_ifindex); 665 dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex);
640 666
641 667
642 err = -EADDRNOTAVAIL; 668 err = -EADDRNOTAVAIL;
643 if (!dev) 669 if (!dev)
644 break; 670 break;
671 dev_put(dev);
645 672
646 err = -EINVAL; 673 err = -EINVAL;
647 if (sk->sk_bound_dev_if && 674 if (sk->sk_bound_dev_if &&
@@ -934,6 +961,14 @@ mc_msf_out:
934 inet->transparent = !!val; 961 inet->transparent = !!val;
935 break; 962 break;
936 963
964 case IP_MINTTL:
965 if (optlen < 1)
966 goto e_inval;
967 if (val < 0 || val > 255)
968 goto e_inval;
969 inet->min_ttl = val;
970 break;
971
937 default: 972 default:
938 err = -ENOPROTOOPT; 973 err = -ENOPROTOOPT;
939 break; 974 break;
@@ -946,8 +981,24 @@ e_inval:
946 return -EINVAL; 981 return -EINVAL;
947} 982}
948 983
984/**
985 * ip_queue_rcv_skb - Queue an skb into sock receive queue
986 * @sk: socket
987 * @skb: buffer
988 *
989 * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
990 * is not set, we drop skb dst entry now, while dst cache line is hot.
991 */
992int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
993{
994 if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
995 skb_dst_drop(skb);
996 return sock_queue_rcv_skb(sk, skb);
997}
998EXPORT_SYMBOL(ip_queue_rcv_skb);
999
949int ip_setsockopt(struct sock *sk, int level, 1000int ip_setsockopt(struct sock *sk, int level,
950 int optname, char __user *optval, int optlen) 1001 int optname, char __user *optval, unsigned int optlen)
951{ 1002{
952 int err; 1003 int err;
953 1004
@@ -972,7 +1023,7 @@ EXPORT_SYMBOL(ip_setsockopt);
972 1023
973#ifdef CONFIG_COMPAT 1024#ifdef CONFIG_COMPAT
974int compat_ip_setsockopt(struct sock *sk, int level, int optname, 1025int compat_ip_setsockopt(struct sock *sk, int level, int optname,
975 char __user *optval, int optlen) 1026 char __user *optval, unsigned int optlen)
976{ 1027{
977 int err; 1028 int err;
978 1029
@@ -1082,6 +1133,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1082 case IP_HDRINCL: 1133 case IP_HDRINCL:
1083 val = inet->hdrincl; 1134 val = inet->hdrincl;
1084 break; 1135 break;
1136 case IP_NODEFRAG:
1137 val = inet->nodefrag;
1138 break;
1085 case IP_MTU_DISCOVER: 1139 case IP_MTU_DISCOVER:
1086 val = inet->pmtudisc; 1140 val = inet->pmtudisc;
1087 break; 1141 break;
@@ -1178,8 +1232,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1178 if (inet->cmsg_flags & IP_CMSG_PKTINFO) { 1232 if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
1179 struct in_pktinfo info; 1233 struct in_pktinfo info;
1180 1234
1181 info.ipi_addr.s_addr = inet->rcv_saddr; 1235 info.ipi_addr.s_addr = inet->inet_rcv_saddr;
1182 info.ipi_spec_dst.s_addr = inet->rcv_saddr; 1236 info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
1183 info.ipi_ifindex = inet->mc_index; 1237 info.ipi_ifindex = inet->mc_index;
1184 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); 1238 put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
1185 } 1239 }
@@ -1196,6 +1250,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1196 case IP_TRANSPARENT: 1250 case IP_TRANSPARENT:
1197 val = inet->transparent; 1251 val = inet->transparent;
1198 break; 1252 break;
1253 case IP_MINTTL:
1254 val = inet->min_ttl;
1255 break;
1199 default: 1256 default:
1200 release_sock(sk); 1257 release_sock(sk);
1201 return -ENOPROTOOPT; 1258 return -ENOPROTOOPT;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 3262ce06294c..629067571f02 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -25,6 +25,7 @@
25 25
26static void ipcomp4_err(struct sk_buff *skb, u32 info) 26static void ipcomp4_err(struct sk_buff *skb, u32 info)
27{ 27{
28 struct net *net = dev_net(skb->dev);
28 __be32 spi; 29 __be32 spi;
29 struct iphdr *iph = (struct iphdr *)skb->data; 30 struct iphdr *iph = (struct iphdr *)skb->data;
30 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
@@ -35,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
35 return; 36 return;
36 37
37 spi = htonl(ntohs(ipch->cpi)); 38 spi = htonl(ntohs(ipch->cpi));
38 x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, 39 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr,
39 spi, IPPROTO_COMP, AF_INET); 40 spi, IPPROTO_COMP, AF_INET);
40 if (!x) 41 if (!x)
41 return; 42 return;
@@ -47,9 +48,10 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
47/* We always hold one tunnel user reference to indicate a tunnel */ 48/* We always hold one tunnel user reference to indicate a tunnel */
48static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) 49static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
49{ 50{
51 struct net *net = xs_net(x);
50 struct xfrm_state *t; 52 struct xfrm_state *t;
51 53
52 t = xfrm_state_alloc(&init_net); 54 t = xfrm_state_alloc(net);
53 if (t == NULL) 55 if (t == NULL)
54 goto out; 56 goto out;
55 57
@@ -61,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
61 t->props.mode = x->props.mode; 63 t->props.mode = x->props.mode;
62 t->props.saddr.a4 = x->props.saddr.a4; 64 t->props.saddr.a4 = x->props.saddr.a4;
63 t->props.flags = x->props.flags; 65 t->props.flags = x->props.flags;
66 memcpy(&t->mark, &x->mark, sizeof(t->mark));
64 67
65 if (xfrm_init_state(t)) 68 if (xfrm_init_state(t))
66 goto error; 69 goto error;
@@ -82,10 +85,12 @@ error:
82 */ 85 */
83static int ipcomp_tunnel_attach(struct xfrm_state *x) 86static int ipcomp_tunnel_attach(struct xfrm_state *x)
84{ 87{
88 struct net *net = xs_net(x);
85 int err = 0; 89 int err = 0;
86 struct xfrm_state *t; 90 struct xfrm_state *t;
91 u32 mark = x->mark.v & x->mark.m;
87 92
88 t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4, 93 t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
89 x->props.saddr.a4, IPPROTO_IPIP, AF_INET); 94 x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
90 if (!t) { 95 if (!t) {
91 t = ipcomp_tunnel_create(x); 96 t = ipcomp_tunnel_create(x);
@@ -124,16 +129,12 @@ static int ipcomp4_init_state(struct xfrm_state *x)
124 if (x->props.mode == XFRM_MODE_TUNNEL) { 129 if (x->props.mode == XFRM_MODE_TUNNEL) {
125 err = ipcomp_tunnel_attach(x); 130 err = ipcomp_tunnel_attach(x);
126 if (err) 131 if (err)
127 goto error_tunnel; 132 goto out;
128 } 133 }
129 134
130 err = 0; 135 err = 0;
131out: 136out:
132 return err; 137 return err;
133
134error_tunnel:
135 ipcomp_destroy(x);
136 goto out;
137} 138}
138 139
139static const struct xfrm_type ipcomp_type = { 140static const struct xfrm_type ipcomp_type = {
@@ -146,7 +147,7 @@ static const struct xfrm_type ipcomp_type = {
146 .output = ipcomp_output 147 .output = ipcomp_output
147}; 148};
148 149
149static struct net_protocol ipcomp4_protocol = { 150static const struct net_protocol ipcomp4_protocol = {
150 .handler = xfrm4_rcv, 151 .handler = xfrm4_rcv,
151 .err_handler = ipcomp4_err, 152 .err_handler = ipcomp4_err,
152 .no_policy = 1, 153 .no_policy = 1,
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f8d04c256454..2b097752426b 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -53,6 +53,7 @@
53#include <linux/root_dev.h> 53#include <linux/root_dev.h>
54#include <linux/delay.h> 54#include <linux/delay.h>
55#include <linux/nfs_fs.h> 55#include <linux/nfs_fs.h>
56#include <linux/slab.h>
56#include <net/net_namespace.h> 57#include <net/net_namespace.h>
57#include <net/arp.h> 58#include <net/arp.h>
58#include <net/ip.h> 59#include <net/ip.h>
@@ -187,6 +188,16 @@ struct ic_device {
187static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ 188static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
188static struct net_device *ic_dev __initdata = NULL; /* Selected device */ 189static struct net_device *ic_dev __initdata = NULL; /* Selected device */
189 190
191static bool __init ic_device_match(struct net_device *dev)
192{
193 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
194 (!(dev->flags & IFF_LOOPBACK) &&
195 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
196 strncmp(dev->name, "dummy", 5)))
197 return true;
198 return false;
199}
200
190static int __init ic_open_devs(void) 201static int __init ic_open_devs(void)
191{ 202{
192 struct ic_device *d, **last; 203 struct ic_device *d, **last;
@@ -207,10 +218,7 @@ static int __init ic_open_devs(void)
207 for_each_netdev(&init_net, dev) { 218 for_each_netdev(&init_net, dev) {
208 if (dev->flags & IFF_LOOPBACK) 219 if (dev->flags & IFF_LOOPBACK)
209 continue; 220 continue;
210 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : 221 if (ic_device_match(dev)) {
211 (!(dev->flags & IFF_LOOPBACK) &&
212 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
213 strncmp(dev->name, "dummy", 5))) {
214 int able = 0; 222 int able = 0;
215 if (dev->mtu >= 364) 223 if (dev->mtu >= 364)
216 able |= IC_BOOTP; 224 able |= IC_BOOTP;
@@ -228,7 +236,7 @@ static int __init ic_open_devs(void)
228 } 236 }
229 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { 237 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
230 rtnl_unlock(); 238 rtnl_unlock();
231 return -1; 239 return -ENOMEM;
232 } 240 }
233 d->dev = dev; 241 d->dev = dev;
234 *last = d; 242 *last = d;
@@ -253,7 +261,7 @@ static int __init ic_open_devs(void)
253 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); 261 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
254 else 262 else
255 printk(KERN_ERR "IP-Config: No network devices available.\n"); 263 printk(KERN_ERR "IP-Config: No network devices available.\n");
256 return -1; 264 return -ENODEV;
257 } 265 }
258 return 0; 266 return 0;
259} 267}
@@ -657,6 +665,13 @@ ic_dhcp_init_options(u8 *options)
657 memcpy(e, ic_req_params, sizeof(ic_req_params)); 665 memcpy(e, ic_req_params, sizeof(ic_req_params));
658 e += sizeof(ic_req_params); 666 e += sizeof(ic_req_params);
659 667
668 if (ic_host_name_set) {
669 *e++ = 12; /* host-name */
670 len = strlen(utsname()->nodename);
671 *e++ = len;
672 memcpy(e, utsname()->nodename, len);
673 e += len;
674 }
660 if (*vendor_class_identifier) { 675 if (*vendor_class_identifier) {
661 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", 676 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
662 vendor_class_identifier); 677 vendor_class_identifier);
@@ -968,7 +983,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
968 /* Is it a reply for the device we are configuring? */ 983 /* Is it a reply for the device we are configuring? */
969 if (b->xid != ic_dev_xid) { 984 if (b->xid != ic_dev_xid) {
970 if (net_ratelimit()) 985 if (net_ratelimit())
971 printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n"); 986 printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
972 goto drop_unlock; 987 goto drop_unlock;
973 } 988 }
974 989
@@ -1172,18 +1187,17 @@ static int __init ic_dynamic(void)
1172 schedule_timeout_uninterruptible(1); 1187 schedule_timeout_uninterruptible(1);
1173#ifdef IPCONFIG_DHCP 1188#ifdef IPCONFIG_DHCP
1174 /* DHCP isn't done until we get a DHCPACK. */ 1189 /* DHCP isn't done until we get a DHCPACK. */
1175 if ((ic_got_reply & IC_BOOTP) 1190 if ((ic_got_reply & IC_BOOTP) &&
1176 && (ic_proto_enabled & IC_USE_DHCP) 1191 (ic_proto_enabled & IC_USE_DHCP) &&
1177 && ic_dhcp_msgtype != DHCPACK) 1192 ic_dhcp_msgtype != DHCPACK) {
1178 {
1179 ic_got_reply = 0; 1193 ic_got_reply = 0;
1180 printk(","); 1194 printk(KERN_CONT ",");
1181 continue; 1195 continue;
1182 } 1196 }
1183#endif /* IPCONFIG_DHCP */ 1197#endif /* IPCONFIG_DHCP */
1184 1198
1185 if (ic_got_reply) { 1199 if (ic_got_reply) {
1186 printk(" OK\n"); 1200 printk(KERN_CONT " OK\n");
1187 break; 1201 break;
1188 } 1202 }
1189 1203
@@ -1191,7 +1205,7 @@ static int __init ic_dynamic(void)
1191 continue; 1205 continue;
1192 1206
1193 if (! --retries) { 1207 if (! --retries) {
1194 printk(" timed out!\n"); 1208 printk(KERN_CONT " timed out!\n");
1195 break; 1209 break;
1196 } 1210 }
1197 1211
@@ -1201,7 +1215,7 @@ static int __init ic_dynamic(void)
1201 if (timeout > CONF_TIMEOUT_MAX) 1215 if (timeout > CONF_TIMEOUT_MAX)
1202 timeout = CONF_TIMEOUT_MAX; 1216 timeout = CONF_TIMEOUT_MAX;
1203 1217
1204 printk("."); 1218 printk(KERN_CONT ".");
1205 } 1219 }
1206 1220
1207#ifdef IPCONFIG_BOOTP 1221#ifdef IPCONFIG_BOOTP
@@ -1222,7 +1236,7 @@ static int __init ic_dynamic(void)
1222 ((ic_got_reply & IC_RARP) ? "RARP" 1236 ((ic_got_reply & IC_RARP) ? "RARP"
1223 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1237 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1224 &ic_servaddr); 1238 &ic_servaddr);
1225 printk("my address is %pI4\n", &ic_myaddr); 1239 printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
1226 1240
1227 return 0; 1241 return 0;
1228} 1242}
@@ -1304,6 +1318,32 @@ __be32 __init root_nfs_parse_addr(char *name)
1304 return addr; 1318 return addr;
1305} 1319}
1306 1320
1321#define DEVICE_WAIT_MAX 12 /* 12 seconds */
1322
1323static int __init wait_for_devices(void)
1324{
1325 int i;
1326
1327 msleep(CONF_PRE_OPEN);
1328 for (i = 0; i < DEVICE_WAIT_MAX; i++) {
1329 struct net_device *dev;
1330 int found = 0;
1331
1332 rtnl_lock();
1333 for_each_netdev(&init_net, dev) {
1334 if (ic_device_match(dev)) {
1335 found = 1;
1336 break;
1337 }
1338 }
1339 rtnl_unlock();
1340 if (found)
1341 return 0;
1342 ssleep(1);
1343 }
1344 return -ENODEV;
1345}
1346
1307/* 1347/*
1308 * IP Autoconfig dispatcher. 1348 * IP Autoconfig dispatcher.
1309 */ 1349 */
@@ -1314,6 +1354,7 @@ static int __init ip_auto_config(void)
1314#ifdef IPCONFIG_DYNAMIC 1354#ifdef IPCONFIG_DYNAMIC
1315 int retries = CONF_OPEN_RETRIES; 1355 int retries = CONF_OPEN_RETRIES;
1316#endif 1356#endif
1357 int err;
1317 1358
1318#ifdef CONFIG_PROC_FS 1359#ifdef CONFIG_PROC_FS
1319 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1360 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1326,12 +1367,15 @@ static int __init ip_auto_config(void)
1326#ifdef IPCONFIG_DYNAMIC 1367#ifdef IPCONFIG_DYNAMIC
1327 try_try_again: 1368 try_try_again:
1328#endif 1369#endif
1329 /* Give hardware a chance to settle */ 1370 /* Wait for devices to appear */
1330 msleep(CONF_PRE_OPEN); 1371 err = wait_for_devices();
1372 if (err)
1373 return err;
1331 1374
1332 /* Setup all network devices */ 1375 /* Setup all network devices */
1333 if (ic_open_devs() < 0) 1376 err = ic_open_devs();
1334 return -1; 1377 if (err)
1378 return err;
1335 1379
1336 /* Give drivers a chance to settle */ 1380 /* Give drivers a chance to settle */
1337 ssleep(CONF_POST_OPEN); 1381 ssleep(CONF_POST_OPEN);
@@ -1344,9 +1388,9 @@ static int __init ip_auto_config(void)
1344 */ 1388 */
1345 if (ic_myaddr == NONE || 1389 if (ic_myaddr == NONE ||
1346#ifdef CONFIG_ROOT_NFS 1390#ifdef CONFIG_ROOT_NFS
1347 (root_server_addr == NONE 1391 (root_server_addr == NONE &&
1348 && ic_servaddr == NONE 1392 ic_servaddr == NONE &&
1349 && ROOT_DEV == Root_NFS) || 1393 ROOT_DEV == Root_NFS) ||
1350#endif 1394#endif
1351 ic_first_dev->next) { 1395 ic_first_dev->next) {
1352#ifdef IPCONFIG_DYNAMIC 1396#ifdef IPCONFIG_DYNAMIC
@@ -1424,19 +1468,19 @@ static int __init ip_auto_config(void)
1424 /* 1468 /*
1425 * Clue in the operator. 1469 * Clue in the operator.
1426 */ 1470 */
1427 printk("IP-Config: Complete:"); 1471 printk("IP-Config: Complete:\n");
1428 printk("\n device=%s", ic_dev->name); 1472 printk(" device=%s", ic_dev->name);
1429 printk(", addr=%pI4", &ic_myaddr); 1473 printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
1430 printk(", mask=%pI4", &ic_netmask); 1474 printk(KERN_CONT ", mask=%pI4", &ic_netmask);
1431 printk(", gw=%pI4", &ic_gateway); 1475 printk(KERN_CONT ", gw=%pI4", &ic_gateway);
1432 printk(",\n host=%s, domain=%s, nis-domain=%s", 1476 printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
1433 utsname()->nodename, ic_domain, utsname()->domainname); 1477 utsname()->nodename, ic_domain, utsname()->domainname);
1434 printk(",\n bootserver=%pI4", &ic_servaddr); 1478 printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
1435 printk(", rootserver=%pI4", &root_server_addr); 1479 printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
1436 printk(", rootpath=%s", root_server_path); 1480 printk(KERN_CONT ", rootpath=%s", root_server_path);
1437 if (ic_dev_mtu) 1481 if (ic_dev_mtu)
1438 printk(", mtu=%d", ic_dev_mtu); 1482 printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
1439 printk("\n"); 1483 printk(KERN_CONT "\n");
1440#endif /* !SILENT */ 1484#endif /* !SILENT */
1441 1485
1442 return 0; 1486 return 0;
@@ -1447,7 +1491,7 @@ late_initcall(ip_auto_config);
1447 1491
1448/* 1492/*
1449 * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel 1493 * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
1450 * command line parameter. See Documentation/filesystems/nfsroot.txt. 1494 * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
1451 */ 1495 */
1452static int __init ic_proto_name(char *name) 1496static int __init ic_proto_name(char *name)
1453{ 1497{
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 93e2b787da20..988f52fba54a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -95,6 +95,7 @@
95#include <linux/module.h> 95#include <linux/module.h>
96#include <linux/types.h> 96#include <linux/types.h>
97#include <linux/kernel.h> 97#include <linux/kernel.h>
98#include <linux/slab.h>
98#include <asm/uaccess.h> 99#include <asm/uaccess.h>
99#include <linux/skbuff.h> 100#include <linux/skbuff.h>
100#include <linux/netdevice.h> 101#include <linux/netdevice.h>
@@ -119,55 +120,89 @@
119#define HASH_SIZE 16 120#define HASH_SIZE 16
120#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
121 122
122static int ipip_net_id; 123static int ipip_net_id __read_mostly;
123struct ipip_net { 124struct ipip_net {
124 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
125 struct ip_tunnel *tunnels_r[HASH_SIZE]; 126 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
126 struct ip_tunnel *tunnels_l[HASH_SIZE]; 127 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
127 struct ip_tunnel *tunnels_wc[1]; 128 struct ip_tunnel __rcu *tunnels_wc[1];
128 struct ip_tunnel **tunnels[4]; 129 struct ip_tunnel __rcu **tunnels[4];
129 130
130 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
131}; 132};
132 133
133static void ipip_fb_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
134static void ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136static void ipip_dev_free(struct net_device *dev);
136 137
137static DEFINE_RWLOCK(ipip_lock); 138/*
139 * Locking : hash tables are protected by RCU and RTNL
140 */
141
142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154{
155 struct pcpu_tstats sum = { 0 };
156 int i;
157
158 for_each_possible_cpu(i) {
159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160
161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
171}
138 172
139static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
140 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
141{ 175{
142 unsigned h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
143 unsigned h1 = HASH(local); 177 unsigned int h1 = HASH(local);
144 struct ip_tunnel *t; 178 struct ip_tunnel *t;
145 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
146 180
147 for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { 181 for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
148 if (local == t->parms.iph.saddr && 182 if (local == t->parms.iph.saddr &&
149 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 183 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
150 return t; 184 return t;
151 } 185
152 for (t = ipn->tunnels_r[h0]; t; t = t->next) { 186 for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
153 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 187 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
154 return t; 188 return t;
155 } 189
156 for (t = ipn->tunnels_l[h1]; t; t = t->next) { 190 for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
157 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) 191 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
158 return t; 192 return t;
159 } 193
160 if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) 194 t = rcu_dereference(ipn->tunnels_wc[0]);
195 if (t && (t->dev->flags&IFF_UP))
161 return t; 196 return t;
162 return NULL; 197 return NULL;
163} 198}
164 199
165static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, 200static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
166 struct ip_tunnel_parm *parms) 201 struct ip_tunnel_parm *parms)
167{ 202{
168 __be32 remote = parms->iph.daddr; 203 __be32 remote = parms->iph.daddr;
169 __be32 local = parms->iph.saddr; 204 __be32 local = parms->iph.saddr;
170 unsigned h = 0; 205 unsigned int h = 0;
171 int prio = 0; 206 int prio = 0;
172 207
173 if (remote) { 208 if (remote) {
@@ -181,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
181 return &ipn->tunnels[prio][h]; 216 return &ipn->tunnels[prio][h];
182} 217}
183 218
184static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, 219static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
185 struct ip_tunnel *t) 220 struct ip_tunnel *t)
186{ 221{
187 return __ipip_bucket(ipn, &t->parms); 222 return __ipip_bucket(ipn, &t->parms);
@@ -189,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
189 224
190static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) 225static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
191{ 226{
192 struct ip_tunnel **tp; 227 struct ip_tunnel __rcu **tp;
193 228 struct ip_tunnel *iter;
194 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { 229
195 if (t == *tp) { 230 for (tp = ipip_bucket(ipn, t);
196 write_lock_bh(&ipip_lock); 231 (iter = rtnl_dereference(*tp)) != NULL;
197 *tp = t->next; 232 tp = &iter->next) {
198 write_unlock_bh(&ipip_lock); 233 if (t == iter) {
234 rcu_assign_pointer(*tp, t->next);
199 break; 235 break;
200 } 236 }
201 } 237 }
@@ -203,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
203 239
204static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) 240static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
205{ 241{
206 struct ip_tunnel **tp = ipip_bucket(ipn, t); 242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
207 243
208 t->next = *tp; 244 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
209 write_lock_bh(&ipip_lock); 245 rcu_assign_pointer(*tp, t);
210 *tp = t;
211 write_unlock_bh(&ipip_lock);
212} 246}
213 247
214static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -216,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
216{ 250{
217 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
218 __be32 local = parms->iph.saddr; 252 __be32 local = parms->iph.saddr;
219 struct ip_tunnel *t, **tp, *nt; 253 struct ip_tunnel *t, *nt;
254 struct ip_tunnel __rcu **tp;
220 struct net_device *dev; 255 struct net_device *dev;
221 char name[IFNAMSIZ]; 256 char name[IFNAMSIZ];
222 struct ipip_net *ipn = net_generic(net, ipip_net_id); 257 struct ipip_net *ipn = net_generic(net, ipip_net_id);
223 258
224 for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { 259 for (tp = __ipip_bucket(ipn, parms);
260 (t = rtnl_dereference(*tp)) != NULL;
261 tp = &t->next) {
225 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) 262 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
226 return t; 263 return t;
227 } 264 }
@@ -231,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
231 if (parms->name[0]) 268 if (parms->name[0])
232 strlcpy(name, parms->name, IFNAMSIZ); 269 strlcpy(name, parms->name, IFNAMSIZ);
233 else 270 else
234 sprintf(name, "tunl%%d"); 271 strcpy(name, "tunl%d");
235 272
236 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); 273 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
237 if (dev == NULL) 274 if (dev == NULL)
@@ -247,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
247 nt = netdev_priv(dev); 284 nt = netdev_priv(dev);
248 nt->parms = *parms; 285 nt->parms = *parms;
249 286
250 ipip_tunnel_init(dev); 287 if (ipip_tunnel_init(dev) < 0)
288 goto failed_free;
251 289
252 if (register_netdevice(dev) < 0) 290 if (register_netdevice(dev) < 0)
253 goto failed_free; 291 goto failed_free;
@@ -257,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
257 return nt; 295 return nt;
258 296
259failed_free: 297failed_free:
260 free_netdev(dev); 298 ipip_dev_free(dev);
261 return NULL; 299 return NULL;
262} 300}
263 301
302/* called with RTNL */
264static void ipip_tunnel_uninit(struct net_device *dev) 303static void ipip_tunnel_uninit(struct net_device *dev)
265{ 304{
266 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
267 struct ipip_net *ipn = net_generic(net, ipip_net_id); 306 struct ipip_net *ipn = net_generic(net, ipip_net_id);
268 307
269 if (dev == ipn->fb_tunnel_dev) { 308 if (dev == ipn->fb_tunnel_dev)
270 write_lock_bh(&ipip_lock); 309 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
271 ipn->tunnels_wc[0] = NULL; 310 else
272 write_unlock_bh(&ipip_lock);
273 } else
274 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 311 ipip_tunnel_unlink(ipn, netdev_priv(dev));
275 dev_put(dev); 312 dev_put(dev);
276} 313}
@@ -318,7 +355,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
318 355
319 err = -ENOENT; 356 err = -ENOENT;
320 357
321 read_lock(&ipip_lock); 358 rcu_read_lock();
322 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); 359 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
323 if (t == NULL || t->parms.iph.daddr == 0) 360 if (t == NULL || t->parms.iph.daddr == 0)
324 goto out; 361 goto out;
@@ -333,7 +370,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
333 t->err_count = 1; 370 t->err_count = 1;
334 t->err_time = jiffies; 371 t->err_time = jiffies;
335out: 372out:
336 read_unlock(&ipip_lock); 373 rcu_read_unlock();
337 return err; 374 return err;
338} 375}
339 376
@@ -351,11 +388,13 @@ static int ipip_rcv(struct sk_buff *skb)
351 struct ip_tunnel *tunnel; 388 struct ip_tunnel *tunnel;
352 const struct iphdr *iph = ip_hdr(skb); 389 const struct iphdr *iph = ip_hdr(skb);
353 390
354 read_lock(&ipip_lock); 391 rcu_read_lock();
355 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), 392 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
356 iph->saddr, iph->daddr)) != NULL) { 393 if (tunnel != NULL) {
394 struct pcpu_tstats *tstats;
395
357 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 396 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
358 read_unlock(&ipip_lock); 397 rcu_read_unlock();
359 kfree_skb(skb); 398 kfree_skb(skb);
360 return 0; 399 return 0;
361 } 400 }
@@ -367,17 +406,20 @@ static int ipip_rcv(struct sk_buff *skb)
367 skb->protocol = htons(ETH_P_IP); 406 skb->protocol = htons(ETH_P_IP);
368 skb->pkt_type = PACKET_HOST; 407 skb->pkt_type = PACKET_HOST;
369 408
370 tunnel->dev->stats.rx_packets++; 409 tstats = this_cpu_ptr(tunnel->dev->tstats);
371 tunnel->dev->stats.rx_bytes += skb->len; 410 tstats->rx_packets++;
372 skb->dev = tunnel->dev; 411 tstats->rx_bytes += skb->len;
373 skb_dst_drop(skb); 412
374 nf_reset(skb); 413 __skb_tunnel_rx(skb, tunnel->dev);
414
375 ipip_ecn_decapsulate(iph, skb); 415 ipip_ecn_decapsulate(iph, skb);
416
376 netif_rx(skb); 417 netif_rx(skb);
377 read_unlock(&ipip_lock); 418
419 rcu_read_unlock();
378 return 0; 420 return 0;
379 } 421 }
380 read_unlock(&ipip_lock); 422 rcu_read_unlock();
381 423
382 return -1; 424 return -1;
383} 425}
@@ -387,36 +429,31 @@ static int ipip_rcv(struct sk_buff *skb)
387 * and that skb is filled properly by that function. 429 * and that skb is filled properly by that function.
388 */ 430 */
389 431
390static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 432static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
391{ 433{
392 struct ip_tunnel *tunnel = netdev_priv(dev); 434 struct ip_tunnel *tunnel = netdev_priv(dev);
393 struct net_device_stats *stats = &tunnel->dev->stats; 435 struct pcpu_tstats *tstats;
394 struct iphdr *tiph = &tunnel->parms.iph; 436 struct iphdr *tiph = &tunnel->parms.iph;
395 u8 tos = tunnel->parms.iph.tos; 437 u8 tos = tunnel->parms.iph.tos;
396 __be16 df = tiph->frag_off; 438 __be16 df = tiph->frag_off;
397 struct rtable *rt; /* Route to the other host */ 439 struct rtable *rt; /* Route to the other host */
398 struct net_device *tdev; /* Device to other host */ 440 struct net_device *tdev; /* Device to other host */
399 struct iphdr *old_iph = ip_hdr(skb); 441 struct iphdr *old_iph = ip_hdr(skb);
400 struct iphdr *iph; /* Our new IP header */ 442 struct iphdr *iph; /* Our new IP header */
401 unsigned int max_headroom; /* The extra header space needed */ 443 unsigned int max_headroom; /* The extra header space needed */
402 __be32 dst = tiph->daddr; 444 __be32 dst = tiph->daddr;
403 int mtu; 445 int mtu;
404 446
405 if (tunnel->recursion++) {
406 stats->collisions++;
407 goto tx_error;
408 }
409
410 if (skb->protocol != htons(ETH_P_IP)) 447 if (skb->protocol != htons(ETH_P_IP))
411 goto tx_error; 448 goto tx_error;
412 449
413 if (tos&1) 450 if (tos & 1)
414 tos = old_iph->tos; 451 tos = old_iph->tos;
415 452
416 if (!dst) { 453 if (!dst) {
417 /* NBMA tunnel */ 454 /* NBMA tunnel */
418 if ((rt = skb_rtable(skb)) == NULL) { 455 if ((rt = skb_rtable(skb)) == NULL) {
419 stats->tx_fifo_errors++; 456 dev->stats.tx_fifo_errors++;
420 goto tx_error; 457 goto tx_error;
421 } 458 }
422 if ((dst = rt->rt_gateway) == 0) 459 if ((dst = rt->rt_gateway) == 0)
@@ -424,44 +461,48 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
424 } 461 }
425 462
426 { 463 {
427 struct flowi fl = { .oif = tunnel->parms.link, 464 struct flowi fl = {
428 .nl_u = { .ip4_u = 465 .oif = tunnel->parms.link,
429 { .daddr = dst, 466 .fl4_dst = dst,
430 .saddr = tiph->saddr, 467 .fl4_src= tiph->saddr,
431 .tos = RT_TOS(tos) } }, 468 .fl4_tos = RT_TOS(tos),
432 .proto = IPPROTO_IPIP }; 469 .proto = IPPROTO_IPIP
470 };
471
433 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 472 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
434 stats->tx_carrier_errors++; 473 dev->stats.tx_carrier_errors++;
435 goto tx_error_icmp; 474 goto tx_error_icmp;
436 } 475 }
437 } 476 }
438 tdev = rt->u.dst.dev; 477 tdev = rt->dst.dev;
439 478
440 if (tdev == dev) { 479 if (tdev == dev) {
441 ip_rt_put(rt); 480 ip_rt_put(rt);
442 stats->collisions++; 481 dev->stats.collisions++;
443 goto tx_error; 482 goto tx_error;
444 } 483 }
445 484
446 if (tiph->frag_off) 485 df |= old_iph->frag_off & htons(IP_DF);
447 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
448 else
449 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
450 486
451 if (mtu < 68) { 487 if (df) {
452 stats->collisions++; 488 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
453 ip_rt_put(rt);
454 goto tx_error;
455 }
456 if (skb_dst(skb))
457 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
458 489
459 df |= (old_iph->frag_off&htons(IP_DF)); 490 if (mtu < 68) {
491 dev->stats.collisions++;
492 ip_rt_put(rt);
493 goto tx_error;
494 }
460 495
461 if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { 496 if (skb_dst(skb))
462 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 497 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
463 ip_rt_put(rt); 498
464 goto tx_error; 499 if ((old_iph->frag_off & htons(IP_DF)) &&
500 mtu < ntohs(old_iph->tot_len)) {
501 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
502 htonl(mtu));
503 ip_rt_put(rt);
504 goto tx_error;
505 }
465 } 506 }
466 507
467 if (tunnel->err_count > 0) { 508 if (tunnel->err_count > 0) {
@@ -483,10 +524,9 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
483 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 524 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
484 if (!new_skb) { 525 if (!new_skb) {
485 ip_rt_put(rt); 526 ip_rt_put(rt);
486 stats->tx_dropped++; 527 dev->stats.tx_dropped++;
487 dev_kfree_skb(skb); 528 dev_kfree_skb(skb);
488 tunnel->recursion--; 529 return NETDEV_TX_OK;
489 return 0;
490 } 530 }
491 if (skb->sk) 531 if (skb->sk)
492 skb_set_owner_w(new_skb, skb->sk); 532 skb_set_owner_w(new_skb, skb->sk);
@@ -502,7 +542,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
502 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 542 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
503 IPSKB_REROUTED); 543 IPSKB_REROUTED);
504 skb_dst_drop(skb); 544 skb_dst_drop(skb);
505 skb_dst_set(skb, &rt->u.dst); 545 skb_dst_set(skb, &rt->dst);
506 546
507 /* 547 /*
508 * Push down and install the IPIP header. 548 * Push down and install the IPIP header.
@@ -521,18 +561,16 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
521 iph->ttl = old_iph->ttl; 561 iph->ttl = old_iph->ttl;
522 562
523 nf_reset(skb); 563 nf_reset(skb);
524 564 tstats = this_cpu_ptr(dev->tstats);
525 IPTUNNEL_XMIT(); 565 __IPTUNNEL_XMIT(tstats, &dev->stats);
526 tunnel->recursion--; 566 return NETDEV_TX_OK;
527 return 0;
528 567
529tx_error_icmp: 568tx_error_icmp:
530 dst_link_failure(skb); 569 dst_link_failure(skb);
531tx_error: 570tx_error:
532 stats->tx_errors++; 571 dev->stats.tx_errors++;
533 dev_kfree_skb(skb); 572 dev_kfree_skb(skb);
534 tunnel->recursion--; 573 return NETDEV_TX_OK;
535 return 0;
536} 574}
537 575
538static void ipip_tunnel_bind_dev(struct net_device *dev) 576static void ipip_tunnel_bind_dev(struct net_device *dev)
@@ -545,15 +583,17 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
545 iph = &tunnel->parms.iph; 583 iph = &tunnel->parms.iph;
546 584
547 if (iph->daddr) { 585 if (iph->daddr) {
548 struct flowi fl = { .oif = tunnel->parms.link, 586 struct flowi fl = {
549 .nl_u = { .ip4_u = 587 .oif = tunnel->parms.link,
550 { .daddr = iph->daddr, 588 .fl4_dst = iph->daddr,
551 .saddr = iph->saddr, 589 .fl4_src = iph->saddr,
552 .tos = RT_TOS(iph->tos) } }, 590 .fl4_tos = RT_TOS(iph->tos),
553 .proto = IPPROTO_IPIP }; 591 .proto = IPPROTO_IPIP
592 };
554 struct rtable *rt; 593 struct rtable *rt;
594
555 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 595 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
556 tdev = rt->u.dst.dev; 596 tdev = rt->dst.dev;
557 ip_rt_put(rt); 597 ip_rt_put(rt);
558 } 598 }
559 dev->flags |= IFF_POINTOPOINT; 599 dev->flags |= IFF_POINTOPOINT;
@@ -628,6 +668,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
628 } 668 }
629 t = netdev_priv(dev); 669 t = netdev_priv(dev);
630 ipip_tunnel_unlink(ipn, t); 670 ipip_tunnel_unlink(ipn, t);
671 synchronize_net();
631 t->parms.iph.saddr = p.iph.saddr; 672 t->parms.iph.saddr = p.iph.saddr;
632 t->parms.iph.daddr = p.iph.daddr; 673 t->parms.iph.daddr = p.iph.daddr;
633 memcpy(dev->dev_addr, &p.iph.saddr, 4); 674 memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -697,13 +738,19 @@ static const struct net_device_ops ipip_netdev_ops = {
697 .ndo_start_xmit = ipip_tunnel_xmit, 738 .ndo_start_xmit = ipip_tunnel_xmit,
698 .ndo_do_ioctl = ipip_tunnel_ioctl, 739 .ndo_do_ioctl = ipip_tunnel_ioctl,
699 .ndo_change_mtu = ipip_tunnel_change_mtu, 740 .ndo_change_mtu = ipip_tunnel_change_mtu,
700 741 .ndo_get_stats = ipip_get_stats,
701}; 742};
702 743
744static void ipip_dev_free(struct net_device *dev)
745{
746 free_percpu(dev->tstats);
747 free_netdev(dev);
748}
749
703static void ipip_tunnel_setup(struct net_device *dev) 750static void ipip_tunnel_setup(struct net_device *dev)
704{ 751{
705 dev->netdev_ops = &ipip_netdev_ops; 752 dev->netdev_ops = &ipip_netdev_ops;
706 dev->destructor = free_netdev; 753 dev->destructor = ipip_dev_free;
707 754
708 dev->type = ARPHRD_TUNNEL; 755 dev->type = ARPHRD_TUNNEL;
709 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 756 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -712,10 +759,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
712 dev->iflink = 0; 759 dev->iflink = 0;
713 dev->addr_len = 4; 760 dev->addr_len = 4;
714 dev->features |= NETIF_F_NETNS_LOCAL; 761 dev->features |= NETIF_F_NETNS_LOCAL;
762 dev->features |= NETIF_F_LLTX;
715 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 763 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
716} 764}
717 765
718static void ipip_tunnel_init(struct net_device *dev) 766static int ipip_tunnel_init(struct net_device *dev)
719{ 767{
720 struct ip_tunnel *tunnel = netdev_priv(dev); 768 struct ip_tunnel *tunnel = netdev_priv(dev);
721 769
@@ -726,9 +774,15 @@ static void ipip_tunnel_init(struct net_device *dev)
726 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 774 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
727 775
728 ipip_tunnel_bind_dev(dev); 776 ipip_tunnel_bind_dev(dev);
777
778 dev->tstats = alloc_percpu(struct pcpu_tstats);
779 if (!dev->tstats)
780 return -ENOMEM;
781
782 return 0;
729} 783}
730 784
731static void ipip_fb_tunnel_init(struct net_device *dev) 785static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
732{ 786{
733 struct ip_tunnel *tunnel = netdev_priv(dev); 787 struct ip_tunnel *tunnel = netdev_priv(dev);
734 struct iphdr *iph = &tunnel->parms.iph; 788 struct iphdr *iph = &tunnel->parms.iph;
@@ -741,11 +795,16 @@ static void ipip_fb_tunnel_init(struct net_device *dev)
741 iph->protocol = IPPROTO_IPIP; 795 iph->protocol = IPPROTO_IPIP;
742 iph->ihl = 5; 796 iph->ihl = 5;
743 797
798 dev->tstats = alloc_percpu(struct pcpu_tstats);
799 if (!dev->tstats)
800 return -ENOMEM;
801
744 dev_hold(dev); 802 dev_hold(dev);
745 ipn->tunnels_wc[0] = tunnel; 803 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
804 return 0;
746} 805}
747 806
748static struct xfrm_tunnel ipip_handler = { 807static struct xfrm_tunnel ipip_handler __read_mostly = {
749 .handler = ipip_rcv, 808 .handler = ipip_rcv,
750 .err_handler = ipip_err, 809 .err_handler = ipip_err,
751 .priority = 1, 810 .priority = 1,
@@ -754,7 +813,7 @@ static struct xfrm_tunnel ipip_handler = {
754static const char banner[] __initconst = 813static const char banner[] __initconst =
755 KERN_INFO "IPv4 over IPv4 tunneling driver\n"; 814 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
756 815
757static void ipip_destroy_tunnels(struct ipip_net *ipn) 816static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
758{ 817{
759 int prio; 818 int prio;
760 819
@@ -762,25 +821,20 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn)
762 int h; 821 int h;
763 for (h = 0; h < HASH_SIZE; h++) { 822 for (h = 0; h < HASH_SIZE; h++) {
764 struct ip_tunnel *t; 823 struct ip_tunnel *t;
765 while ((t = ipn->tunnels[prio][h]) != NULL) 824
766 unregister_netdevice(t->dev); 825 t = rtnl_dereference(ipn->tunnels[prio][h]);
826 while (t != NULL) {
827 unregister_netdevice_queue(t->dev, head);
828 t = rtnl_dereference(t->next);
829 }
767 } 830 }
768 } 831 }
769} 832}
770 833
771static int ipip_init_net(struct net *net) 834static int __net_init ipip_init_net(struct net *net)
772{ 835{
836 struct ipip_net *ipn = net_generic(net, ipip_net_id);
773 int err; 837 int err;
774 struct ipip_net *ipn;
775
776 err = -ENOMEM;
777 ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
778 if (ipn == NULL)
779 goto err_alloc;
780
781 err = net_assign_generic(net, ipip_net_id, ipn);
782 if (err < 0)
783 goto err_assign;
784 838
785 ipn->tunnels[0] = ipn->tunnels_wc; 839 ipn->tunnels[0] = ipn->tunnels_wc;
786 ipn->tunnels[1] = ipn->tunnels_l; 840 ipn->tunnels[1] = ipn->tunnels_l;
@@ -796,7 +850,9 @@ static int ipip_init_net(struct net *net)
796 } 850 }
797 dev_net_set(ipn->fb_tunnel_dev, net); 851 dev_net_set(ipn->fb_tunnel_dev, net);
798 852
799 ipip_fb_tunnel_init(ipn->fb_tunnel_dev); 853 err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
854 if (err)
855 goto err_reg_dev;
800 856
801 if ((err = register_netdev(ipn->fb_tunnel_dev))) 857 if ((err = register_netdev(ipn->fb_tunnel_dev)))
802 goto err_reg_dev; 858 goto err_reg_dev;
@@ -804,30 +860,29 @@ static int ipip_init_net(struct net *net)
804 return 0; 860 return 0;
805 861
806err_reg_dev: 862err_reg_dev:
807 free_netdev(ipn->fb_tunnel_dev); 863 ipip_dev_free(ipn->fb_tunnel_dev);
808err_alloc_dev: 864err_alloc_dev:
809 /* nothing */ 865 /* nothing */
810err_assign:
811 kfree(ipn);
812err_alloc:
813 return err; 866 return err;
814} 867}
815 868
816static void ipip_exit_net(struct net *net) 869static void __net_exit ipip_exit_net(struct net *net)
817{ 870{
818 struct ipip_net *ipn; 871 struct ipip_net *ipn = net_generic(net, ipip_net_id);
872 LIST_HEAD(list);
819 873
820 ipn = net_generic(net, ipip_net_id);
821 rtnl_lock(); 874 rtnl_lock();
822 ipip_destroy_tunnels(ipn); 875 ipip_destroy_tunnels(ipn, &list);
823 unregister_netdevice(ipn->fb_tunnel_dev); 876 unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
877 unregister_netdevice_many(&list);
824 rtnl_unlock(); 878 rtnl_unlock();
825 kfree(ipn);
826} 879}
827 880
828static struct pernet_operations ipip_net_ops = { 881static struct pernet_operations ipip_net_ops = {
829 .init = ipip_init_net, 882 .init = ipip_init_net,
830 .exit = ipip_exit_net, 883 .exit = ipip_exit_net,
884 .id = &ipip_net_id,
885 .size = sizeof(struct ipip_net),
831}; 886};
832 887
833static int __init ipip_init(void) 888static int __init ipip_init(void)
@@ -836,15 +891,14 @@ static int __init ipip_init(void)
836 891
837 printk(banner); 892 printk(banner);
838 893
839 if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { 894 err = register_pernet_device(&ipip_net_ops);
895 if (err < 0)
896 return err;
897 err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
898 if (err < 0) {
899 unregister_pernet_device(&ipip_net_ops);
840 printk(KERN_INFO "ipip init: can't register tunnel\n"); 900 printk(KERN_INFO "ipip init: can't register tunnel\n");
841 return -EAGAIN;
842 } 901 }
843
844 err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
845 if (err)
846 xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
847
848 return err; 902 return err;
849} 903}
850 904
@@ -853,9 +907,10 @@ static void __exit ipip_fini(void)
853 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) 907 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
854 printk(KERN_INFO "ipip close: can't deregister tunnel\n"); 908 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
855 909
856 unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); 910 unregister_pernet_device(&ipip_net_ops);
857} 911}
858 912
859module_init(ipip_init); 913module_init(ipip_init);
860module_exit(ipip_fini); 914module_exit(ipip_fini);
861MODULE_LICENSE("GPL"); 915MODULE_LICENSE("GPL");
916MODULE_ALIAS("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9a8da5ed92b7..3f3a9afd73e0 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -22,7 +22,7 @@
22 * overflow. 22 * overflow.
23 * Carlos Picoto : PIMv1 Support 23 * Carlos Picoto : PIMv1 Support
24 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header 24 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
25 * Relax this requrement to work with older peers. 25 * Relax this requirement to work with older peers.
26 * 26 *
27 */ 27 */
28 28
@@ -47,6 +47,7 @@
47#include <linux/mroute.h> 47#include <linux/mroute.h>
48#include <linux/init.h> 48#include <linux/init.h>
49#include <linux/if_ether.h> 49#include <linux/if_ether.h>
50#include <linux/slab.h>
50#include <net/net_namespace.h> 51#include <net/net_namespace.h>
51#include <net/ip.h> 52#include <net/ip.h>
52#include <net/protocol.h> 53#include <net/protocol.h>
@@ -62,13 +63,42 @@
62#include <net/ipip.h> 63#include <net/ipip.h>
63#include <net/checksum.h> 64#include <net/checksum.h>
64#include <net/netlink.h> 65#include <net/netlink.h>
66#include <net/fib_rules.h>
65 67
66#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) 68#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67#define CONFIG_IP_PIMSM 1 69#define CONFIG_IP_PIMSM 1
68#endif 70#endif
69 71
72struct mr_table {
73 struct list_head list;
74#ifdef CONFIG_NET_NS
75 struct net *net;
76#endif
77 u32 id;
78 struct sock __rcu *mroute_sk;
79 struct timer_list ipmr_expire_timer;
80 struct list_head mfc_unres_queue;
81 struct list_head mfc_cache_array[MFC_LINES];
82 struct vif_device vif_table[MAXVIFS];
83 int maxvif;
84 atomic_t cache_resolve_queue_len;
85 int mroute_do_assert;
86 int mroute_do_pim;
87#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
88 int mroute_reg_vif_num;
89#endif
90};
91
92struct ipmr_rule {
93 struct fib_rule common;
94};
95
96struct ipmr_result {
97 struct mr_table *mrt;
98};
99
70/* Big lock, protecting vif table, mrt cache and mroute socket state. 100/* Big lock, protecting vif table, mrt cache and mroute socket state.
71 Note that the changes are semaphored via rtnl_lock. 101 * Note that the changes are semaphored via rtnl_lock.
72 */ 102 */
73 103
74static DEFINE_RWLOCK(mrt_lock); 104static DEFINE_RWLOCK(mrt_lock);
@@ -77,33 +107,232 @@ static DEFINE_RWLOCK(mrt_lock);
77 * Multicast router control variables 107 * Multicast router control variables
78 */ 108 */
79 109
80#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL) 110#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
81
82static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
83 111
84/* Special spinlock for queue of unresolved entries */ 112/* Special spinlock for queue of unresolved entries */
85static DEFINE_SPINLOCK(mfc_unres_lock); 113static DEFINE_SPINLOCK(mfc_unres_lock);
86 114
87/* We return to original Alan's scheme. Hash table of resolved 115/* We return to original Alan's scheme. Hash table of resolved
88 entries is changed only in process context and protected 116 * entries is changed only in process context and protected
89 with weak lock mrt_lock. Queue of unresolved entries is protected 117 * with weak lock mrt_lock. Queue of unresolved entries is protected
90 with strong spinlock mfc_unres_lock. 118 * with strong spinlock mfc_unres_lock.
91 119 *
92 In this case data path is free of exclusive locks at all. 120 * In this case data path is free of exclusive locks at all.
93 */ 121 */
94 122
95static struct kmem_cache *mrt_cachep __read_mostly; 123static struct kmem_cache *mrt_cachep __read_mostly;
96 124
97static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); 125static struct mr_table *ipmr_new_table(struct net *net, u32 id);
98static int ipmr_cache_report(struct net *net, 126static int ip_mr_forward(struct net *net, struct mr_table *mrt,
127 struct sk_buff *skb, struct mfc_cache *cache,
128 int local);
129static int ipmr_cache_report(struct mr_table *mrt,
99 struct sk_buff *pkt, vifi_t vifi, int assert); 130 struct sk_buff *pkt, vifi_t vifi, int assert);
100static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); 131static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
132 struct mfc_cache *c, struct rtmsg *rtm);
133static void ipmr_expire_process(unsigned long arg);
101 134
102#ifdef CONFIG_IP_PIMSM_V2 135#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
103static struct net_protocol pim_protocol; 136#define ipmr_for_each_table(mrt, net) \
137 list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
138
139static struct mr_table *ipmr_get_table(struct net *net, u32 id)
140{
141 struct mr_table *mrt;
142
143 ipmr_for_each_table(mrt, net) {
144 if (mrt->id == id)
145 return mrt;
146 }
147 return NULL;
148}
149
150static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
151 struct mr_table **mrt)
152{
153 struct ipmr_result res;
154 struct fib_lookup_arg arg = { .result = &res, };
155 int err;
156
157 err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
158 if (err < 0)
159 return err;
160 *mrt = res.mrt;
161 return 0;
162}
163
164static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
165 int flags, struct fib_lookup_arg *arg)
166{
167 struct ipmr_result *res = arg->result;
168 struct mr_table *mrt;
169
170 switch (rule->action) {
171 case FR_ACT_TO_TBL:
172 break;
173 case FR_ACT_UNREACHABLE:
174 return -ENETUNREACH;
175 case FR_ACT_PROHIBIT:
176 return -EACCES;
177 case FR_ACT_BLACKHOLE:
178 default:
179 return -EINVAL;
180 }
181
182 mrt = ipmr_get_table(rule->fr_net, rule->table);
183 if (mrt == NULL)
184 return -EAGAIN;
185 res->mrt = mrt;
186 return 0;
187}
188
189static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
190{
191 return 1;
192}
193
194static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
195 FRA_GENERIC_POLICY,
196};
197
198static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
199 struct fib_rule_hdr *frh, struct nlattr **tb)
200{
201 return 0;
202}
203
204static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
205 struct nlattr **tb)
206{
207 return 1;
208}
209
210static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
211 struct fib_rule_hdr *frh)
212{
213 frh->dst_len = 0;
214 frh->src_len = 0;
215 frh->tos = 0;
216 return 0;
217}
218
219static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
220 .family = RTNL_FAMILY_IPMR,
221 .rule_size = sizeof(struct ipmr_rule),
222 .addr_size = sizeof(u32),
223 .action = ipmr_rule_action,
224 .match = ipmr_rule_match,
225 .configure = ipmr_rule_configure,
226 .compare = ipmr_rule_compare,
227 .default_pref = fib_default_rule_pref,
228 .fill = ipmr_rule_fill,
229 .nlgroup = RTNLGRP_IPV4_RULE,
230 .policy = ipmr_rule_policy,
231 .owner = THIS_MODULE,
232};
233
234static int __net_init ipmr_rules_init(struct net *net)
235{
236 struct fib_rules_ops *ops;
237 struct mr_table *mrt;
238 int err;
239
240 ops = fib_rules_register(&ipmr_rules_ops_template, net);
241 if (IS_ERR(ops))
242 return PTR_ERR(ops);
243
244 INIT_LIST_HEAD(&net->ipv4.mr_tables);
245
246 mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
247 if (mrt == NULL) {
248 err = -ENOMEM;
249 goto err1;
250 }
251
252 err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
253 if (err < 0)
254 goto err2;
255
256 net->ipv4.mr_rules_ops = ops;
257 return 0;
258
259err2:
260 kfree(mrt);
261err1:
262 fib_rules_unregister(ops);
263 return err;
264}
265
266static void __net_exit ipmr_rules_exit(struct net *net)
267{
268 struct mr_table *mrt, *next;
269
270 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
271 list_del(&mrt->list);
272 kfree(mrt);
273 }
274 fib_rules_unregister(net->ipv4.mr_rules_ops);
275}
276#else
277#define ipmr_for_each_table(mrt, net) \
278 for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
279
280static struct mr_table *ipmr_get_table(struct net *net, u32 id)
281{
282 return net->ipv4.mrt;
283}
284
285static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
286 struct mr_table **mrt)
287{
288 *mrt = net->ipv4.mrt;
289 return 0;
290}
291
292static int __net_init ipmr_rules_init(struct net *net)
293{
294 net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
295 return net->ipv4.mrt ? 0 : -ENOMEM;
296}
297
298static void __net_exit ipmr_rules_exit(struct net *net)
299{
300 kfree(net->ipv4.mrt);
301}
104#endif 302#endif
105 303
106static struct timer_list ipmr_expire_timer; 304static struct mr_table *ipmr_new_table(struct net *net, u32 id)
305{
306 struct mr_table *mrt;
307 unsigned int i;
308
309 mrt = ipmr_get_table(net, id);
310 if (mrt != NULL)
311 return mrt;
312
313 mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
314 if (mrt == NULL)
315 return NULL;
316 write_pnet(&mrt->net, net);
317 mrt->id = id;
318
319 /* Forwarding cache */
320 for (i = 0; i < MFC_LINES; i++)
321 INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
322
323 INIT_LIST_HEAD(&mrt->mfc_unres_queue);
324
325 setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
326 (unsigned long)mrt);
327
328#ifdef CONFIG_IP_PIMSM
329 mrt->mroute_reg_vif_num = -1;
330#endif
331#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
332 list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
333#endif
334 return mrt;
335}
107 336
108/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ 337/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
109 338
@@ -167,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
167 set_fs(KERNEL_DS); 396 set_fs(KERNEL_DS);
168 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
169 set_fs(oldfs); 398 set_fs(oldfs);
170 } else 399 } else {
171 err = -EOPNOTSUPP; 400 err = -EOPNOTSUPP;
172 401 }
173 dev = NULL; 402 dev = NULL;
174 403
175 if (err == 0 && 404 if (err == 0 &&
@@ -201,18 +430,30 @@ failure:
201 430
202#ifdef CONFIG_IP_PIMSM 431#ifdef CONFIG_IP_PIMSM
203 432
204static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) 433static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
205{ 434{
206 struct net *net = dev_net(dev); 435 struct net *net = dev_net(dev);
436 struct mr_table *mrt;
437 struct flowi fl = {
438 .oif = dev->ifindex,
439 .iif = skb->skb_iif,
440 .mark = skb->mark,
441 };
442 int err;
443
444 err = ipmr_fib_lookup(net, &fl, &mrt);
445 if (err < 0) {
446 kfree_skb(skb);
447 return err;
448 }
207 449
208 read_lock(&mrt_lock); 450 read_lock(&mrt_lock);
209 dev->stats.tx_bytes += skb->len; 451 dev->stats.tx_bytes += skb->len;
210 dev->stats.tx_packets++; 452 dev->stats.tx_packets++;
211 ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num, 453 ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
212 IGMPMSG_WHOLEPKT);
213 read_unlock(&mrt_lock); 454 read_unlock(&mrt_lock);
214 kfree_skb(skb); 455 kfree_skb(skb);
215 return 0; 456 return NETDEV_TX_OK;
216} 457}
217 458
218static const struct net_device_ops reg_vif_netdev_ops = { 459static const struct net_device_ops reg_vif_netdev_ops = {
@@ -229,12 +470,18 @@ static void reg_vif_setup(struct net_device *dev)
229 dev->features |= NETIF_F_NETNS_LOCAL; 470 dev->features |= NETIF_F_NETNS_LOCAL;
230} 471}
231 472
232static struct net_device *ipmr_reg_vif(struct net *net) 473static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
233{ 474{
234 struct net_device *dev; 475 struct net_device *dev;
235 struct in_device *in_dev; 476 struct in_device *in_dev;
477 char name[IFNAMSIZ];
478
479 if (mrt->id == RT_TABLE_DEFAULT)
480 sprintf(name, "pimreg");
481 else
482 sprintf(name, "pimreg%u", mrt->id);
236 483
237 dev = alloc_netdev(0, "pimreg", reg_vif_setup); 484 dev = alloc_netdev(0, name, reg_vif_setup);
238 485
239 if (dev == NULL) 486 if (dev == NULL)
240 return NULL; 487 return NULL;
@@ -248,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net)
248 dev->iflink = 0; 495 dev->iflink = 0;
249 496
250 rcu_read_lock(); 497 rcu_read_lock();
251 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { 498 in_dev = __in_dev_get_rcu(dev);
499 if (!in_dev) {
252 rcu_read_unlock(); 500 rcu_read_unlock();
253 goto failure; 501 goto failure;
254 } 502 }
@@ -279,16 +527,17 @@ failure:
279 * @notify: Set to 1, if the caller is a notifier_call 527 * @notify: Set to 1, if the caller is a notifier_call
280 */ 528 */
281 529
282static int vif_delete(struct net *net, int vifi, int notify) 530static int vif_delete(struct mr_table *mrt, int vifi, int notify,
531 struct list_head *head)
283{ 532{
284 struct vif_device *v; 533 struct vif_device *v;
285 struct net_device *dev; 534 struct net_device *dev;
286 struct in_device *in_dev; 535 struct in_device *in_dev;
287 536
288 if (vifi < 0 || vifi >= net->ipv4.maxvif) 537 if (vifi < 0 || vifi >= mrt->maxvif)
289 return -EADDRNOTAVAIL; 538 return -EADDRNOTAVAIL;
290 539
291 v = &net->ipv4.vif_table[vifi]; 540 v = &mrt->vif_table[vifi];
292 541
293 write_lock_bh(&mrt_lock); 542 write_lock_bh(&mrt_lock);
294 dev = v->dev; 543 dev = v->dev;
@@ -300,52 +549,60 @@ static int vif_delete(struct net *net, int vifi, int notify)
300 } 549 }
301 550
302#ifdef CONFIG_IP_PIMSM 551#ifdef CONFIG_IP_PIMSM
303 if (vifi == net->ipv4.mroute_reg_vif_num) 552 if (vifi == mrt->mroute_reg_vif_num)
304 net->ipv4.mroute_reg_vif_num = -1; 553 mrt->mroute_reg_vif_num = -1;
305#endif 554#endif
306 555
307 if (vifi+1 == net->ipv4.maxvif) { 556 if (vifi + 1 == mrt->maxvif) {
308 int tmp; 557 int tmp;
309 for (tmp=vifi-1; tmp>=0; tmp--) { 558
310 if (VIF_EXISTS(net, tmp)) 559 for (tmp = vifi - 1; tmp >= 0; tmp--) {
560 if (VIF_EXISTS(mrt, tmp))
311 break; 561 break;
312 } 562 }
313 net->ipv4.maxvif = tmp+1; 563 mrt->maxvif = tmp+1;
314 } 564 }
315 565
316 write_unlock_bh(&mrt_lock); 566 write_unlock_bh(&mrt_lock);
317 567
318 dev_set_allmulti(dev, -1); 568 dev_set_allmulti(dev, -1);
319 569
320 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 570 in_dev = __in_dev_get_rtnl(dev);
571 if (in_dev) {
321 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 572 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
322 ip_rt_multicast_event(in_dev); 573 ip_rt_multicast_event(in_dev);
323 } 574 }
324 575
325 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 576 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
326 unregister_netdevice(dev); 577 unregister_netdevice_queue(dev, head);
327 578
328 dev_put(dev); 579 dev_put(dev);
329 return 0; 580 return 0;
330} 581}
331 582
332static inline void ipmr_cache_free(struct mfc_cache *c) 583static void ipmr_cache_free_rcu(struct rcu_head *head)
333{ 584{
334 release_net(mfc_net(c)); 585 struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
586
335 kmem_cache_free(mrt_cachep, c); 587 kmem_cache_free(mrt_cachep, c);
336} 588}
337 589
590static inline void ipmr_cache_free(struct mfc_cache *c)
591{
592 call_rcu(&c->rcu, ipmr_cache_free_rcu);
593}
594
338/* Destroy an unresolved cache entry, killing queued skbs 595/* Destroy an unresolved cache entry, killing queued skbs
339 and reporting error to netlink readers. 596 * and reporting error to netlink readers.
340 */ 597 */
341 598
342static void ipmr_destroy_unres(struct mfc_cache *c) 599static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
343{ 600{
601 struct net *net = read_pnet(&mrt->net);
344 struct sk_buff *skb; 602 struct sk_buff *skb;
345 struct nlmsgerr *e; 603 struct nlmsgerr *e;
346 struct net *net = mfc_net(c);
347 604
348 atomic_dec(&net->ipv4.cache_resolve_queue_len); 605 atomic_dec(&mrt->cache_resolve_queue_len);
349 606
350 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { 607 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
351 if (ip_hdr(skb)->version == 0) { 608 if (ip_hdr(skb)->version == 0) {
@@ -358,50 +615,49 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
358 memset(&e->msg, 0, sizeof(e->msg)); 615 memset(&e->msg, 0, sizeof(e->msg));
359 616
360 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 617 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
361 } else 618 } else {
362 kfree_skb(skb); 619 kfree_skb(skb);
620 }
363 } 621 }
364 622
365 ipmr_cache_free(c); 623 ipmr_cache_free(c);
366} 624}
367 625
368 626
369/* Single timer process for all the unresolved queue. */ 627/* Timer process for the unresolved queue. */
370 628
371static void ipmr_expire_process(unsigned long dummy) 629static void ipmr_expire_process(unsigned long arg)
372{ 630{
631 struct mr_table *mrt = (struct mr_table *)arg;
373 unsigned long now; 632 unsigned long now;
374 unsigned long expires; 633 unsigned long expires;
375 struct mfc_cache *c, **cp; 634 struct mfc_cache *c, *next;
376 635
377 if (!spin_trylock(&mfc_unres_lock)) { 636 if (!spin_trylock(&mfc_unres_lock)) {
378 mod_timer(&ipmr_expire_timer, jiffies+HZ/10); 637 mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
379 return; 638 return;
380 } 639 }
381 640
382 if (mfc_unres_queue == NULL) 641 if (list_empty(&mrt->mfc_unres_queue))
383 goto out; 642 goto out;
384 643
385 now = jiffies; 644 now = jiffies;
386 expires = 10*HZ; 645 expires = 10*HZ;
387 cp = &mfc_unres_queue;
388 646
389 while ((c=*cp) != NULL) { 647 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
390 if (time_after(c->mfc_un.unres.expires, now)) { 648 if (time_after(c->mfc_un.unres.expires, now)) {
391 unsigned long interval = c->mfc_un.unres.expires - now; 649 unsigned long interval = c->mfc_un.unres.expires - now;
392 if (interval < expires) 650 if (interval < expires)
393 expires = interval; 651 expires = interval;
394 cp = &c->next;
395 continue; 652 continue;
396 } 653 }
397 654
398 *cp = c->next; 655 list_del(&c->list);
399 656 ipmr_destroy_unres(mrt, c);
400 ipmr_destroy_unres(c);
401 } 657 }
402 658
403 if (mfc_unres_queue != NULL) 659 if (!list_empty(&mrt->mfc_unres_queue))
404 mod_timer(&ipmr_expire_timer, jiffies + expires); 660 mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
405 661
406out: 662out:
407 spin_unlock(&mfc_unres_lock); 663 spin_unlock(&mfc_unres_lock);
@@ -409,17 +665,17 @@ out:
409 665
410/* Fill oifs list. It is called under write locked mrt_lock. */ 666/* Fill oifs list. It is called under write locked mrt_lock. */
411 667
412static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) 668static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
669 unsigned char *ttls)
413{ 670{
414 int vifi; 671 int vifi;
415 struct net *net = mfc_net(cache);
416 672
417 cache->mfc_un.res.minvif = MAXVIFS; 673 cache->mfc_un.res.minvif = MAXVIFS;
418 cache->mfc_un.res.maxvif = 0; 674 cache->mfc_un.res.maxvif = 0;
419 memset(cache->mfc_un.res.ttls, 255, MAXVIFS); 675 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
420 676
421 for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) { 677 for (vifi = 0; vifi < mrt->maxvif; vifi++) {
422 if (VIF_EXISTS(net, vifi) && 678 if (VIF_EXISTS(mrt, vifi) &&
423 ttls[vifi] && ttls[vifi] < 255) { 679 ttls[vifi] && ttls[vifi] < 255) {
424 cache->mfc_un.res.ttls[vifi] = ttls[vifi]; 680 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
425 if (cache->mfc_un.res.minvif > vifi) 681 if (cache->mfc_un.res.minvif > vifi)
@@ -430,16 +686,17 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
430 } 686 }
431} 687}
432 688
433static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) 689static int vif_add(struct net *net, struct mr_table *mrt,
690 struct vifctl *vifc, int mrtsock)
434{ 691{
435 int vifi = vifc->vifc_vifi; 692 int vifi = vifc->vifc_vifi;
436 struct vif_device *v = &net->ipv4.vif_table[vifi]; 693 struct vif_device *v = &mrt->vif_table[vifi];
437 struct net_device *dev; 694 struct net_device *dev;
438 struct in_device *in_dev; 695 struct in_device *in_dev;
439 int err; 696 int err;
440 697
441 /* Is vif busy ? */ 698 /* Is vif busy ? */
442 if (VIF_EXISTS(net, vifi)) 699 if (VIF_EXISTS(mrt, vifi))
443 return -EADDRINUSE; 700 return -EADDRINUSE;
444 701
445 switch (vifc->vifc_flags) { 702 switch (vifc->vifc_flags) {
@@ -449,9 +706,9 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
449 * Special Purpose VIF in PIM 706 * Special Purpose VIF in PIM
450 * All the packets will be sent to the daemon 707 * All the packets will be sent to the daemon
451 */ 708 */
452 if (net->ipv4.mroute_reg_vif_num >= 0) 709 if (mrt->mroute_reg_vif_num >= 0)
453 return -EADDRINUSE; 710 return -EADDRINUSE;
454 dev = ipmr_reg_vif(net); 711 dev = ipmr_reg_vif(net, mrt);
455 if (!dev) 712 if (!dev)
456 return -ENOBUFS; 713 return -ENOBUFS;
457 err = dev_set_allmulti(dev, 1); 714 err = dev_set_allmulti(dev, 1);
@@ -473,8 +730,18 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
473 return err; 730 return err;
474 } 731 }
475 break; 732 break;
733
734 case VIFF_USE_IFINDEX:
476 case 0: 735 case 0:
477 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); 736 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
737 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
738 if (dev && __in_dev_get_rtnl(dev) == NULL) {
739 dev_put(dev);
740 return -EADDRNOTAVAIL;
741 }
742 } else {
743 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
744 }
478 if (!dev) 745 if (!dev)
479 return -EADDRNOTAVAIL; 746 return -EADDRNOTAVAIL;
480 err = dev_set_allmulti(dev, 1); 747 err = dev_set_allmulti(dev, 1);
@@ -487,14 +754,16 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
487 return -EINVAL; 754 return -EINVAL;
488 } 755 }
489 756
490 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) 757 in_dev = __in_dev_get_rtnl(dev);
758 if (!in_dev) {
759 dev_put(dev);
491 return -EADDRNOTAVAIL; 760 return -EADDRNOTAVAIL;
761 }
492 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 762 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
493 ip_rt_multicast_event(in_dev); 763 ip_rt_multicast_event(in_dev);
494 764
495 /* 765 /* Fill in the VIF structures */
496 * Fill in the VIF structures 766
497 */
498 v->rate_limit = vifc->vifc_rate_limit; 767 v->rate_limit = vifc->vifc_rate_limit;
499 v->local = vifc->vifc_lcl_addr.s_addr; 768 v->local = vifc->vifc_lcl_addr.s_addr;
500 v->remote = vifc->vifc_rmt_addr.s_addr; 769 v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -507,57 +776,57 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
507 v->pkt_in = 0; 776 v->pkt_in = 0;
508 v->pkt_out = 0; 777 v->pkt_out = 0;
509 v->link = dev->ifindex; 778 v->link = dev->ifindex;
510 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) 779 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
511 v->link = dev->iflink; 780 v->link = dev->iflink;
512 781
513 /* And finish update writing critical data */ 782 /* And finish update writing critical data */
514 write_lock_bh(&mrt_lock); 783 write_lock_bh(&mrt_lock);
515 v->dev = dev; 784 v->dev = dev;
516#ifdef CONFIG_IP_PIMSM 785#ifdef CONFIG_IP_PIMSM
517 if (v->flags&VIFF_REGISTER) 786 if (v->flags & VIFF_REGISTER)
518 net->ipv4.mroute_reg_vif_num = vifi; 787 mrt->mroute_reg_vif_num = vifi;
519#endif 788#endif
520 if (vifi+1 > net->ipv4.maxvif) 789 if (vifi+1 > mrt->maxvif)
521 net->ipv4.maxvif = vifi+1; 790 mrt->maxvif = vifi+1;
522 write_unlock_bh(&mrt_lock); 791 write_unlock_bh(&mrt_lock);
523 return 0; 792 return 0;
524} 793}
525 794
526static struct mfc_cache *ipmr_cache_find(struct net *net, 795/* called with rcu_read_lock() */
796static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
527 __be32 origin, 797 __be32 origin,
528 __be32 mcastgrp) 798 __be32 mcastgrp)
529{ 799{
530 int line = MFC_HASH(mcastgrp, origin); 800 int line = MFC_HASH(mcastgrp, origin);
531 struct mfc_cache *c; 801 struct mfc_cache *c;
532 802
533 for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) { 803 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
534 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) 804 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
535 break; 805 return c;
536 } 806 }
537 return c; 807 return NULL;
538} 808}
539 809
540/* 810/*
541 * Allocate a multicast cache entry 811 * Allocate a multicast cache entry
542 */ 812 */
543static struct mfc_cache *ipmr_cache_alloc(struct net *net) 813static struct mfc_cache *ipmr_cache_alloc(void)
544{ 814{
545 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 815 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
546 if (c == NULL) 816
547 return NULL; 817 if (c)
548 c->mfc_un.res.minvif = MAXVIFS; 818 c->mfc_un.res.minvif = MAXVIFS;
549 mfc_net_set(c, net);
550 return c; 819 return c;
551} 820}
552 821
553static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net) 822static struct mfc_cache *ipmr_cache_alloc_unres(void)
554{ 823{
555 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 824 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
556 if (c == NULL) 825
557 return NULL; 826 if (c) {
558 skb_queue_head_init(&c->mfc_un.unres.unresolved); 827 skb_queue_head_init(&c->mfc_un.unres.unresolved);
559 c->mfc_un.unres.expires = jiffies + 10*HZ; 828 c->mfc_un.unres.expires = jiffies + 10*HZ;
560 mfc_net_set(c, net); 829 }
561 return c; 830 return c;
562} 831}
563 832
@@ -565,22 +834,21 @@ static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
565 * A cache entry has gone into a resolved state from queued 834 * A cache entry has gone into a resolved state from queued
566 */ 835 */
567 836
568static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 837static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
838 struct mfc_cache *uc, struct mfc_cache *c)
569{ 839{
570 struct sk_buff *skb; 840 struct sk_buff *skb;
571 struct nlmsgerr *e; 841 struct nlmsgerr *e;
572 842
573 /* 843 /* Play the pending entries through our router */
574 * Play the pending entries through our router
575 */
576 844
577 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { 845 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
578 if (ip_hdr(skb)->version == 0) { 846 if (ip_hdr(skb)->version == 0) {
579 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 847 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
580 848
581 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { 849 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
582 nlh->nlmsg_len = (skb_tail_pointer(skb) - 850 nlh->nlmsg_len = skb_tail_pointer(skb) -
583 (u8 *)nlh); 851 (u8 *)nlh;
584 } else { 852 } else {
585 nlh->nlmsg_type = NLMSG_ERROR; 853 nlh->nlmsg_type = NLMSG_ERROR;
586 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 854 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -590,9 +858,10 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
590 memset(&e->msg, 0, sizeof(e->msg)); 858 memset(&e->msg, 0, sizeof(e->msg));
591 } 859 }
592 860
593 rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid); 861 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
594 } else 862 } else {
595 ip_mr_forward(skb, c, 0); 863 ip_mr_forward(net, mrt, skb, c, 0);
864 }
596 } 865 }
597} 866}
598 867
@@ -603,13 +872,14 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
603 * Called under mrt_lock. 872 * Called under mrt_lock.
604 */ 873 */
605 874
606static int ipmr_cache_report(struct net *net, 875static int ipmr_cache_report(struct mr_table *mrt,
607 struct sk_buff *pkt, vifi_t vifi, int assert) 876 struct sk_buff *pkt, vifi_t vifi, int assert)
608{ 877{
609 struct sk_buff *skb; 878 struct sk_buff *skb;
610 const int ihl = ip_hdrlen(pkt); 879 const int ihl = ip_hdrlen(pkt);
611 struct igmphdr *igmp; 880 struct igmphdr *igmp;
612 struct igmpmsg *msg; 881 struct igmpmsg *msg;
882 struct sock *mroute_sk;
613 int ret; 883 int ret;
614 884
615#ifdef CONFIG_IP_PIMSM 885#ifdef CONFIG_IP_PIMSM
@@ -625,9 +895,9 @@ static int ipmr_cache_report(struct net *net,
625#ifdef CONFIG_IP_PIMSM 895#ifdef CONFIG_IP_PIMSM
626 if (assert == IGMPMSG_WHOLEPKT) { 896 if (assert == IGMPMSG_WHOLEPKT) {
627 /* Ugly, but we have no choice with this interface. 897 /* Ugly, but we have no choice with this interface.
628 Duplicate old header, fix ihl, length etc. 898 * Duplicate old header, fix ihl, length etc.
629 And all this only to mangle msg->im_msgtype and 899 * And all this only to mangle msg->im_msgtype and
630 to set msg->im_mbz to "mbz" :-) 900 * to set msg->im_mbz to "mbz" :-)
631 */ 901 */
632 skb_push(skb, sizeof(struct iphdr)); 902 skb_push(skb, sizeof(struct iphdr));
633 skb_reset_network_header(skb); 903 skb_reset_network_header(skb);
@@ -636,7 +906,7 @@ static int ipmr_cache_report(struct net *net,
636 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); 906 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
637 msg->im_msgtype = IGMPMSG_WHOLEPKT; 907 msg->im_msgtype = IGMPMSG_WHOLEPKT;
638 msg->im_mbz = 0; 908 msg->im_mbz = 0;
639 msg->im_vif = net->ipv4.mroute_reg_vif_num; 909 msg->im_vif = mrt->mroute_reg_vif_num;
640 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; 910 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
641 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + 911 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
642 sizeof(struct iphdr)); 912 sizeof(struct iphdr));
@@ -644,39 +914,38 @@ static int ipmr_cache_report(struct net *net,
644#endif 914#endif
645 { 915 {
646 916
647 /* 917 /* Copy the IP header */
648 * Copy the IP header
649 */
650 918
651 skb->network_header = skb->tail; 919 skb->network_header = skb->tail;
652 skb_put(skb, ihl); 920 skb_put(skb, ihl);
653 skb_copy_to_linear_data(skb, pkt->data, ihl); 921 skb_copy_to_linear_data(skb, pkt->data, ihl);
654 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ 922 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
655 msg = (struct igmpmsg *)skb_network_header(skb); 923 msg = (struct igmpmsg *)skb_network_header(skb);
656 msg->im_vif = vifi; 924 msg->im_vif = vifi;
657 skb_dst_set(skb, dst_clone(skb_dst(pkt))); 925 skb_dst_set(skb, dst_clone(skb_dst(pkt)));
658 926
659 /* 927 /* Add our header */
660 * Add our header
661 */
662 928
663 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 929 igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
664 igmp->type = 930 igmp->type =
665 msg->im_msgtype = assert; 931 msg->im_msgtype = assert;
666 igmp->code = 0; 932 igmp->code = 0;
667 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ 933 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
668 skb->transport_header = skb->network_header; 934 skb->transport_header = skb->network_header;
669 } 935 }
670 936
671 if (net->ipv4.mroute_sk == NULL) { 937 rcu_read_lock();
938 mroute_sk = rcu_dereference(mrt->mroute_sk);
939 if (mroute_sk == NULL) {
940 rcu_read_unlock();
672 kfree_skb(skb); 941 kfree_skb(skb);
673 return -EINVAL; 942 return -EINVAL;
674 } 943 }
675 944
676 /* 945 /* Deliver to mrouted */
677 * Deliver to mrouted 946
678 */ 947 ret = sock_queue_rcv_skb(mroute_sk, skb);
679 ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb); 948 rcu_read_unlock();
680 if (ret < 0) { 949 if (ret < 0) {
681 if (net_ratelimit()) 950 if (net_ratelimit())
682 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 951 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -691,44 +960,42 @@ static int ipmr_cache_report(struct net *net,
691 */ 960 */
692 961
693static int 962static int
694ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) 963ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
695{ 964{
965 bool found = false;
696 int err; 966 int err;
697 struct mfc_cache *c; 967 struct mfc_cache *c;
698 const struct iphdr *iph = ip_hdr(skb); 968 const struct iphdr *iph = ip_hdr(skb);
699 969
700 spin_lock_bh(&mfc_unres_lock); 970 spin_lock_bh(&mfc_unres_lock);
701 for (c=mfc_unres_queue; c; c=c->next) { 971 list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
702 if (net_eq(mfc_net(c), net) && 972 if (c->mfc_mcastgrp == iph->daddr &&
703 c->mfc_mcastgrp == iph->daddr && 973 c->mfc_origin == iph->saddr) {
704 c->mfc_origin == iph->saddr) 974 found = true;
705 break; 975 break;
976 }
706 } 977 }
707 978
708 if (c == NULL) { 979 if (!found) {
709 /* 980 /* Create a new entry if allowable */
710 * Create a new entry if allowable
711 */
712 981
713 if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 || 982 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
714 (c = ipmr_cache_alloc_unres(net)) == NULL) { 983 (c = ipmr_cache_alloc_unres()) == NULL) {
715 spin_unlock_bh(&mfc_unres_lock); 984 spin_unlock_bh(&mfc_unres_lock);
716 985
717 kfree_skb(skb); 986 kfree_skb(skb);
718 return -ENOBUFS; 987 return -ENOBUFS;
719 } 988 }
720 989
721 /* 990 /* Fill in the new cache entry */
722 * Fill in the new cache entry 991
723 */
724 c->mfc_parent = -1; 992 c->mfc_parent = -1;
725 c->mfc_origin = iph->saddr; 993 c->mfc_origin = iph->saddr;
726 c->mfc_mcastgrp = iph->daddr; 994 c->mfc_mcastgrp = iph->daddr;
727 995
728 /* 996 /* Reflect first query at mrouted. */
729 * Reflect first query at mrouted. 997
730 */ 998 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
731 err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE);
732 if (err < 0) { 999 if (err < 0) {
733 /* If the report failed throw the cache entry 1000 /* If the report failed throw the cache entry
734 out - Brad Parker 1001 out - Brad Parker
@@ -740,17 +1007,16 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
740 return err; 1007 return err;
741 } 1008 }
742 1009
743 atomic_inc(&net->ipv4.cache_resolve_queue_len); 1010 atomic_inc(&mrt->cache_resolve_queue_len);
744 c->next = mfc_unres_queue; 1011 list_add(&c->list, &mrt->mfc_unres_queue);
745 mfc_unres_queue = c;
746 1012
747 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires); 1013 if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1014 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
748 } 1015 }
749 1016
750 /* 1017 /* See if we can append the packet */
751 * See if we can append the packet 1018
752 */ 1019 if (c->mfc_un.unres.unresolved.qlen > 3) {
753 if (c->mfc_un.unres.unresolved.qlen>3) {
754 kfree_skb(skb); 1020 kfree_skb(skb);
755 err = -ENOBUFS; 1021 err = -ENOBUFS;
756 } else { 1022 } else {
@@ -766,20 +1032,17 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
766 * MFC cache manipulation by user space mroute daemon 1032 * MFC cache manipulation by user space mroute daemon
767 */ 1033 */
768 1034
769static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc) 1035static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
770{ 1036{
771 int line; 1037 int line;
772 struct mfc_cache *c, **cp; 1038 struct mfc_cache *c, *next;
773 1039
774 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1040 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
775 1041
776 for (cp = &net->ipv4.mfc_cache_array[line]; 1042 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
777 (c = *cp) != NULL; cp = &c->next) {
778 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1043 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
779 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1044 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
780 write_lock_bh(&mrt_lock); 1045 list_del_rcu(&c->list);
781 *cp = c->next;
782 write_unlock_bh(&mrt_lock);
783 1046
784 ipmr_cache_free(c); 1047 ipmr_cache_free(c);
785 return 0; 1048 return 0;
@@ -788,24 +1051,30 @@ static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
788 return -ENOENT; 1051 return -ENOENT;
789} 1052}
790 1053
791static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) 1054static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1055 struct mfcctl *mfc, int mrtsock)
792{ 1056{
1057 bool found = false;
793 int line; 1058 int line;
794 struct mfc_cache *uc, *c, **cp; 1059 struct mfc_cache *uc, *c;
1060
1061 if (mfc->mfcc_parent >= MAXVIFS)
1062 return -ENFILE;
795 1063
796 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1064 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
797 1065
798 for (cp = &net->ipv4.mfc_cache_array[line]; 1066 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
799 (c = *cp) != NULL; cp = &c->next) {
800 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1067 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
801 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) 1068 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1069 found = true;
802 break; 1070 break;
1071 }
803 } 1072 }
804 1073
805 if (c != NULL) { 1074 if (found) {
806 write_lock_bh(&mrt_lock); 1075 write_lock_bh(&mrt_lock);
807 c->mfc_parent = mfc->mfcc_parent; 1076 c->mfc_parent = mfc->mfcc_parent;
808 ipmr_update_thresholds(c, mfc->mfcc_ttls); 1077 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
809 if (!mrtsock) 1078 if (!mrtsock)
810 c->mfc_flags |= MFC_STATIC; 1079 c->mfc_flags |= MFC_STATIC;
811 write_unlock_bh(&mrt_lock); 1080 write_unlock_bh(&mrt_lock);
@@ -815,43 +1084,40 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
815 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) 1084 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
816 return -EINVAL; 1085 return -EINVAL;
817 1086
818 c = ipmr_cache_alloc(net); 1087 c = ipmr_cache_alloc();
819 if (c == NULL) 1088 if (c == NULL)
820 return -ENOMEM; 1089 return -ENOMEM;
821 1090
822 c->mfc_origin = mfc->mfcc_origin.s_addr; 1091 c->mfc_origin = mfc->mfcc_origin.s_addr;
823 c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; 1092 c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
824 c->mfc_parent = mfc->mfcc_parent; 1093 c->mfc_parent = mfc->mfcc_parent;
825 ipmr_update_thresholds(c, mfc->mfcc_ttls); 1094 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
826 if (!mrtsock) 1095 if (!mrtsock)
827 c->mfc_flags |= MFC_STATIC; 1096 c->mfc_flags |= MFC_STATIC;
828 1097
829 write_lock_bh(&mrt_lock); 1098 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
830 c->next = net->ipv4.mfc_cache_array[line];
831 net->ipv4.mfc_cache_array[line] = c;
832 write_unlock_bh(&mrt_lock);
833 1099
834 /* 1100 /*
835 * Check to see if we resolved a queued list. If so we 1101 * Check to see if we resolved a queued list. If so we
836 * need to send on the frames and tidy up. 1102 * need to send on the frames and tidy up.
837 */ 1103 */
1104 found = false;
838 spin_lock_bh(&mfc_unres_lock); 1105 spin_lock_bh(&mfc_unres_lock);
839 for (cp = &mfc_unres_queue; (uc=*cp) != NULL; 1106 list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
840 cp = &uc->next) { 1107 if (uc->mfc_origin == c->mfc_origin &&
841 if (net_eq(mfc_net(uc), net) &&
842 uc->mfc_origin == c->mfc_origin &&
843 uc->mfc_mcastgrp == c->mfc_mcastgrp) { 1108 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
844 *cp = uc->next; 1109 list_del(&uc->list);
845 atomic_dec(&net->ipv4.cache_resolve_queue_len); 1110 atomic_dec(&mrt->cache_resolve_queue_len);
1111 found = true;
846 break; 1112 break;
847 } 1113 }
848 } 1114 }
849 if (mfc_unres_queue == NULL) 1115 if (list_empty(&mrt->mfc_unres_queue))
850 del_timer(&ipmr_expire_timer); 1116 del_timer(&mrt->ipmr_expire_timer);
851 spin_unlock_bh(&mfc_unres_lock); 1117 spin_unlock_bh(&mfc_unres_lock);
852 1118
853 if (uc) { 1119 if (found) {
854 ipmr_cache_resolve(uc, c); 1120 ipmr_cache_resolve(net, mrt, uc, c);
855 ipmr_cache_free(uc); 1121 ipmr_cache_free(uc);
856 } 1122 }
857 return 0; 1123 return 0;
@@ -861,69 +1127,56 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
861 * Close the multicast socket, and clear the vif tables etc 1127 * Close the multicast socket, and clear the vif tables etc
862 */ 1128 */
863 1129
864static void mroute_clean_tables(struct net *net) 1130static void mroute_clean_tables(struct mr_table *mrt)
865{ 1131{
866 int i; 1132 int i;
1133 LIST_HEAD(list);
1134 struct mfc_cache *c, *next;
867 1135
868 /* 1136 /* Shut down all active vif entries */
869 * Shut down all active vif entries 1137
870 */ 1138 for (i = 0; i < mrt->maxvif; i++) {
871 for (i = 0; i < net->ipv4.maxvif; i++) { 1139 if (!(mrt->vif_table[i].flags & VIFF_STATIC))
872 if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC)) 1140 vif_delete(mrt, i, 0, &list);
873 vif_delete(net, i, 0);
874 } 1141 }
1142 unregister_netdevice_many(&list);
875 1143
876 /* 1144 /* Wipe the cache */
877 * Wipe the cache
878 */
879 for (i=0; i<MFC_LINES; i++) {
880 struct mfc_cache *c, **cp;
881 1145
882 cp = &net->ipv4.mfc_cache_array[i]; 1146 for (i = 0; i < MFC_LINES; i++) {
883 while ((c = *cp) != NULL) { 1147 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
884 if (c->mfc_flags&MFC_STATIC) { 1148 if (c->mfc_flags & MFC_STATIC)
885 cp = &c->next;
886 continue; 1149 continue;
887 } 1150 list_del_rcu(&c->list);
888 write_lock_bh(&mrt_lock);
889 *cp = c->next;
890 write_unlock_bh(&mrt_lock);
891
892 ipmr_cache_free(c); 1151 ipmr_cache_free(c);
893 } 1152 }
894 } 1153 }
895 1154
896 if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) { 1155 if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
897 struct mfc_cache *c, **cp;
898
899 spin_lock_bh(&mfc_unres_lock); 1156 spin_lock_bh(&mfc_unres_lock);
900 cp = &mfc_unres_queue; 1157 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
901 while ((c = *cp) != NULL) { 1158 list_del(&c->list);
902 if (!net_eq(mfc_net(c), net)) { 1159 ipmr_destroy_unres(mrt, c);
903 cp = &c->next;
904 continue;
905 }
906 *cp = c->next;
907
908 ipmr_destroy_unres(c);
909 } 1160 }
910 spin_unlock_bh(&mfc_unres_lock); 1161 spin_unlock_bh(&mfc_unres_lock);
911 } 1162 }
912} 1163}
913 1164
1165/* called from ip_ra_control(), before an RCU grace period,
1166 * we dont need to call synchronize_rcu() here
1167 */
914static void mrtsock_destruct(struct sock *sk) 1168static void mrtsock_destruct(struct sock *sk)
915{ 1169{
916 struct net *net = sock_net(sk); 1170 struct net *net = sock_net(sk);
1171 struct mr_table *mrt;
917 1172
918 rtnl_lock(); 1173 rtnl_lock();
919 if (sk == net->ipv4.mroute_sk) { 1174 ipmr_for_each_table(mrt, net) {
920 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1175 if (sk == rtnl_dereference(mrt->mroute_sk)) {
921 1176 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
922 write_lock_bh(&mrt_lock); 1177 rcu_assign_pointer(mrt->mroute_sk, NULL);
923 net->ipv4.mroute_sk = NULL; 1178 mroute_clean_tables(mrt);
924 write_unlock_bh(&mrt_lock); 1179 }
925
926 mroute_clean_tables(net);
927 } 1180 }
928 rtnl_unlock(); 1181 rtnl_unlock();
929} 1182}
@@ -935,44 +1188,47 @@ static void mrtsock_destruct(struct sock *sk)
935 * MOSPF/PIM router set up we can clean this up. 1188 * MOSPF/PIM router set up we can clean this up.
936 */ 1189 */
937 1190
938int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen) 1191int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
939{ 1192{
940 int ret; 1193 int ret;
941 struct vifctl vif; 1194 struct vifctl vif;
942 struct mfcctl mfc; 1195 struct mfcctl mfc;
943 struct net *net = sock_net(sk); 1196 struct net *net = sock_net(sk);
1197 struct mr_table *mrt;
1198
1199 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1200 if (mrt == NULL)
1201 return -ENOENT;
944 1202
945 if (optname != MRT_INIT) { 1203 if (optname != MRT_INIT) {
946 if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN)) 1204 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1205 !capable(CAP_NET_ADMIN))
947 return -EACCES; 1206 return -EACCES;
948 } 1207 }
949 1208
950 switch (optname) { 1209 switch (optname) {
951 case MRT_INIT: 1210 case MRT_INIT:
952 if (sk->sk_type != SOCK_RAW || 1211 if (sk->sk_type != SOCK_RAW ||
953 inet_sk(sk)->num != IPPROTO_IGMP) 1212 inet_sk(sk)->inet_num != IPPROTO_IGMP)
954 return -EOPNOTSUPP; 1213 return -EOPNOTSUPP;
955 if (optlen != sizeof(int)) 1214 if (optlen != sizeof(int))
956 return -ENOPROTOOPT; 1215 return -ENOPROTOOPT;
957 1216
958 rtnl_lock(); 1217 rtnl_lock();
959 if (net->ipv4.mroute_sk) { 1218 if (rtnl_dereference(mrt->mroute_sk)) {
960 rtnl_unlock(); 1219 rtnl_unlock();
961 return -EADDRINUSE; 1220 return -EADDRINUSE;
962 } 1221 }
963 1222
964 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1223 ret = ip_ra_control(sk, 1, mrtsock_destruct);
965 if (ret == 0) { 1224 if (ret == 0) {
966 write_lock_bh(&mrt_lock); 1225 rcu_assign_pointer(mrt->mroute_sk, sk);
967 net->ipv4.mroute_sk = sk;
968 write_unlock_bh(&mrt_lock);
969
970 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1226 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
971 } 1227 }
972 rtnl_unlock(); 1228 rtnl_unlock();
973 return ret; 1229 return ret;
974 case MRT_DONE: 1230 case MRT_DONE:
975 if (sk != net->ipv4.mroute_sk) 1231 if (sk != rcu_dereference_raw(mrt->mroute_sk))
976 return -EACCES; 1232 return -EACCES;
977 return ip_ra_control(sk, 0, NULL); 1233 return ip_ra_control(sk, 0, NULL);
978 case MRT_ADD_VIF: 1234 case MRT_ADD_VIF:
@@ -985,9 +1241,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
985 return -ENFILE; 1241 return -ENFILE;
986 rtnl_lock(); 1242 rtnl_lock();
987 if (optname == MRT_ADD_VIF) { 1243 if (optname == MRT_ADD_VIF) {
988 ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk); 1244 ret = vif_add(net, mrt, &vif,
1245 sk == rtnl_dereference(mrt->mroute_sk));
989 } else { 1246 } else {
990 ret = vif_delete(net, vif.vifc_vifi, 0); 1247 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
991 } 1248 }
992 rtnl_unlock(); 1249 rtnl_unlock();
993 return ret; 1250 return ret;
@@ -1004,9 +1261,10 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
1004 return -EFAULT; 1261 return -EFAULT;
1005 rtnl_lock(); 1262 rtnl_lock();
1006 if (optname == MRT_DEL_MFC) 1263 if (optname == MRT_DEL_MFC)
1007 ret = ipmr_mfc_delete(net, &mfc); 1264 ret = ipmr_mfc_delete(mrt, &mfc);
1008 else 1265 else
1009 ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk); 1266 ret = ipmr_mfc_add(net, mrt, &mfc,
1267 sk == rtnl_dereference(mrt->mroute_sk));
1010 rtnl_unlock(); 1268 rtnl_unlock();
1011 return ret; 1269 return ret;
1012 /* 1270 /*
@@ -1015,9 +1273,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
1015 case MRT_ASSERT: 1273 case MRT_ASSERT:
1016 { 1274 {
1017 int v; 1275 int v;
1018 if (get_user(v,(int __user *)optval)) 1276 if (get_user(v, (int __user *)optval))
1019 return -EFAULT; 1277 return -EFAULT;
1020 net->ipv4.mroute_do_assert = (v) ? 1 : 0; 1278 mrt->mroute_do_assert = (v) ? 1 : 0;
1021 return 0; 1279 return 0;
1022 } 1280 }
1023#ifdef CONFIG_IP_PIMSM 1281#ifdef CONFIG_IP_PIMSM
@@ -1025,15 +1283,38 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
1025 { 1283 {
1026 int v; 1284 int v;
1027 1285
1028 if (get_user(v,(int __user *)optval)) 1286 if (get_user(v, (int __user *)optval))
1029 return -EFAULT; 1287 return -EFAULT;
1030 v = (v) ? 1 : 0; 1288 v = (v) ? 1 : 0;
1031 1289
1032 rtnl_lock(); 1290 rtnl_lock();
1033 ret = 0; 1291 ret = 0;
1034 if (v != net->ipv4.mroute_do_pim) { 1292 if (v != mrt->mroute_do_pim) {
1035 net->ipv4.mroute_do_pim = v; 1293 mrt->mroute_do_pim = v;
1036 net->ipv4.mroute_do_assert = v; 1294 mrt->mroute_do_assert = v;
1295 }
1296 rtnl_unlock();
1297 return ret;
1298 }
1299#endif
1300#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
1301 case MRT_TABLE:
1302 {
1303 u32 v;
1304
1305 if (optlen != sizeof(u32))
1306 return -EINVAL;
1307 if (get_user(v, (u32 __user *)optval))
1308 return -EFAULT;
1309
1310 rtnl_lock();
1311 ret = 0;
1312 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1313 ret = -EBUSY;
1314 } else {
1315 if (!ipmr_new_table(net, v))
1316 ret = -ENOMEM;
1317 raw_sk(sk)->ipmr_table = v;
1037 } 1318 }
1038 rtnl_unlock(); 1319 rtnl_unlock();
1039 return ret; 1320 return ret;
@@ -1057,12 +1338,17 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1057 int olr; 1338 int olr;
1058 int val; 1339 int val;
1059 struct net *net = sock_net(sk); 1340 struct net *net = sock_net(sk);
1341 struct mr_table *mrt;
1342
1343 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1344 if (mrt == NULL)
1345 return -ENOENT;
1060 1346
1061 if (optname != MRT_VERSION && 1347 if (optname != MRT_VERSION &&
1062#ifdef CONFIG_IP_PIMSM 1348#ifdef CONFIG_IP_PIMSM
1063 optname!=MRT_PIM && 1349 optname != MRT_PIM &&
1064#endif 1350#endif
1065 optname!=MRT_ASSERT) 1351 optname != MRT_ASSERT)
1066 return -ENOPROTOOPT; 1352 return -ENOPROTOOPT;
1067 1353
1068 if (get_user(olr, optlen)) 1354 if (get_user(olr, optlen))
@@ -1078,10 +1364,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1078 val = 0x0305; 1364 val = 0x0305;
1079#ifdef CONFIG_IP_PIMSM 1365#ifdef CONFIG_IP_PIMSM
1080 else if (optname == MRT_PIM) 1366 else if (optname == MRT_PIM)
1081 val = net->ipv4.mroute_do_pim; 1367 val = mrt->mroute_do_pim;
1082#endif 1368#endif
1083 else 1369 else
1084 val = net->ipv4.mroute_do_assert; 1370 val = mrt->mroute_do_assert;
1085 if (copy_to_user(optval, &val, olr)) 1371 if (copy_to_user(optval, &val, olr))
1086 return -EFAULT; 1372 return -EFAULT;
1087 return 0; 1373 return 0;
@@ -1098,16 +1384,21 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1098 struct vif_device *vif; 1384 struct vif_device *vif;
1099 struct mfc_cache *c; 1385 struct mfc_cache *c;
1100 struct net *net = sock_net(sk); 1386 struct net *net = sock_net(sk);
1387 struct mr_table *mrt;
1388
1389 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1390 if (mrt == NULL)
1391 return -ENOENT;
1101 1392
1102 switch (cmd) { 1393 switch (cmd) {
1103 case SIOCGETVIFCNT: 1394 case SIOCGETVIFCNT:
1104 if (copy_from_user(&vr, arg, sizeof(vr))) 1395 if (copy_from_user(&vr, arg, sizeof(vr)))
1105 return -EFAULT; 1396 return -EFAULT;
1106 if (vr.vifi >= net->ipv4.maxvif) 1397 if (vr.vifi >= mrt->maxvif)
1107 return -EINVAL; 1398 return -EINVAL;
1108 read_lock(&mrt_lock); 1399 read_lock(&mrt_lock);
1109 vif = &net->ipv4.vif_table[vr.vifi]; 1400 vif = &mrt->vif_table[vr.vifi];
1110 if (VIF_EXISTS(net, vr.vifi)) { 1401 if (VIF_EXISTS(mrt, vr.vifi)) {
1111 vr.icount = vif->pkt_in; 1402 vr.icount = vif->pkt_in;
1112 vr.ocount = vif->pkt_out; 1403 vr.ocount = vif->pkt_out;
1113 vr.ibytes = vif->bytes_in; 1404 vr.ibytes = vif->bytes_in;
@@ -1124,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1124 if (copy_from_user(&sr, arg, sizeof(sr))) 1415 if (copy_from_user(&sr, arg, sizeof(sr)))
1125 return -EFAULT; 1416 return -EFAULT;
1126 1417
1127 read_lock(&mrt_lock); 1418 rcu_read_lock();
1128 c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr); 1419 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1129 if (c) { 1420 if (c) {
1130 sr.pktcnt = c->mfc_un.res.pkt; 1421 sr.pktcnt = c->mfc_un.res.pkt;
1131 sr.bytecnt = c->mfc_un.res.bytes; 1422 sr.bytecnt = c->mfc_un.res.bytes;
1132 sr.wrong_if = c->mfc_un.res.wrong_if; 1423 sr.wrong_if = c->mfc_un.res.wrong_if;
1133 read_unlock(&mrt_lock); 1424 rcu_read_unlock();
1134 1425
1135 if (copy_to_user(arg, &sr, sizeof(sr))) 1426 if (copy_to_user(arg, &sr, sizeof(sr)))
1136 return -EFAULT; 1427 return -EFAULT;
1137 return 0; 1428 return 0;
1138 } 1429 }
1139 read_unlock(&mrt_lock); 1430 rcu_read_unlock();
1140 return -EADDRNOTAVAIL; 1431 return -EADDRNOTAVAIL;
1141 default: 1432 default:
1142 return -ENOIOCTLCMD; 1433 return -ENOIOCTLCMD;
@@ -1148,19 +1439,22 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1148{ 1439{
1149 struct net_device *dev = ptr; 1440 struct net_device *dev = ptr;
1150 struct net *net = dev_net(dev); 1441 struct net *net = dev_net(dev);
1442 struct mr_table *mrt;
1151 struct vif_device *v; 1443 struct vif_device *v;
1152 int ct; 1444 int ct;
1153 1445 LIST_HEAD(list);
1154 if (!net_eq(dev_net(dev), net))
1155 return NOTIFY_DONE;
1156 1446
1157 if (event != NETDEV_UNREGISTER) 1447 if (event != NETDEV_UNREGISTER)
1158 return NOTIFY_DONE; 1448 return NOTIFY_DONE;
1159 v = &net->ipv4.vif_table[0]; 1449
1160 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { 1450 ipmr_for_each_table(mrt, net) {
1161 if (v->dev == dev) 1451 v = &mrt->vif_table[0];
1162 vif_delete(net, ct, 1); 1452 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1453 if (v->dev == dev)
1454 vif_delete(mrt, ct, 1, &list);
1455 }
1163 } 1456 }
1457 unregister_netdevice_many(&list);
1164 return NOTIFY_DONE; 1458 return NOTIFY_DONE;
1165} 1459}
1166 1460
@@ -1170,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
1170}; 1464};
1171 1465
1172/* 1466/*
1173 * Encapsulate a packet by attaching a valid IPIP header to it. 1467 * Encapsulate a packet by attaching a valid IPIP header to it.
1174 * This avoids tunnel drivers and other mess and gives us the speed so 1468 * This avoids tunnel drivers and other mess and gives us the speed so
1175 * important for multicast video. 1469 * important for multicast video.
1176 */ 1470 */
@@ -1185,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1185 skb_reset_network_header(skb); 1479 skb_reset_network_header(skb);
1186 iph = ip_hdr(skb); 1480 iph = ip_hdr(skb);
1187 1481
1188 iph->version = 4; 1482 iph->version = 4;
1189 iph->tos = old_iph->tos; 1483 iph->tos = old_iph->tos;
1190 iph->ttl = old_iph->ttl; 1484 iph->ttl = old_iph->ttl;
1191 iph->frag_off = 0; 1485 iph->frag_off = 0;
@@ -1203,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1203 1497
1204static inline int ipmr_forward_finish(struct sk_buff *skb) 1498static inline int ipmr_forward_finish(struct sk_buff *skb)
1205{ 1499{
1206 struct ip_options * opt = &(IPCB(skb)->opt); 1500 struct ip_options *opt = &(IPCB(skb)->opt);
1207 1501
1208 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1502 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1209 1503
@@ -1217,11 +1511,11 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
1217 * Processing handlers for ipmr_forward 1511 * Processing handlers for ipmr_forward
1218 */ 1512 */
1219 1513
1220static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) 1514static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1515 struct sk_buff *skb, struct mfc_cache *c, int vifi)
1221{ 1516{
1222 struct net *net = mfc_net(c);
1223 const struct iphdr *iph = ip_hdr(skb); 1517 const struct iphdr *iph = ip_hdr(skb);
1224 struct vif_device *vif = &net->ipv4.vif_table[vifi]; 1518 struct vif_device *vif = &mrt->vif_table[vifi];
1225 struct net_device *dev; 1519 struct net_device *dev;
1226 struct rtable *rt; 1520 struct rtable *rt;
1227 int encap = 0; 1521 int encap = 0;
@@ -1235,37 +1529,41 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1235 vif->bytes_out += skb->len; 1529 vif->bytes_out += skb->len;
1236 vif->dev->stats.tx_bytes += skb->len; 1530 vif->dev->stats.tx_bytes += skb->len;
1237 vif->dev->stats.tx_packets++; 1531 vif->dev->stats.tx_packets++;
1238 ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT); 1532 ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1239 goto out_free; 1533 goto out_free;
1240 } 1534 }
1241#endif 1535#endif
1242 1536
1243 if (vif->flags&VIFF_TUNNEL) { 1537 if (vif->flags & VIFF_TUNNEL) {
1244 struct flowi fl = { .oif = vif->link, 1538 struct flowi fl = {
1245 .nl_u = { .ip4_u = 1539 .oif = vif->link,
1246 { .daddr = vif->remote, 1540 .fl4_dst = vif->remote,
1247 .saddr = vif->local, 1541 .fl4_src = vif->local,
1248 .tos = RT_TOS(iph->tos) } }, 1542 .fl4_tos = RT_TOS(iph->tos),
1249 .proto = IPPROTO_IPIP }; 1543 .proto = IPPROTO_IPIP
1544 };
1545
1250 if (ip_route_output_key(net, &rt, &fl)) 1546 if (ip_route_output_key(net, &rt, &fl))
1251 goto out_free; 1547 goto out_free;
1252 encap = sizeof(struct iphdr); 1548 encap = sizeof(struct iphdr);
1253 } else { 1549 } else {
1254 struct flowi fl = { .oif = vif->link, 1550 struct flowi fl = {
1255 .nl_u = { .ip4_u = 1551 .oif = vif->link,
1256 { .daddr = iph->daddr, 1552 .fl4_dst = iph->daddr,
1257 .tos = RT_TOS(iph->tos) } }, 1553 .fl4_tos = RT_TOS(iph->tos),
1258 .proto = IPPROTO_IPIP }; 1554 .proto = IPPROTO_IPIP
1555 };
1556
1259 if (ip_route_output_key(net, &rt, &fl)) 1557 if (ip_route_output_key(net, &rt, &fl))
1260 goto out_free; 1558 goto out_free;
1261 } 1559 }
1262 1560
1263 dev = rt->u.dst.dev; 1561 dev = rt->dst.dev;
1264 1562
1265 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { 1563 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1266 /* Do not fragment multicasts. Alas, IPv4 does not 1564 /* Do not fragment multicasts. Alas, IPv4 does not
1267 allow to send ICMP, so that packets will disappear 1565 * allow to send ICMP, so that packets will disappear
1268 to blackhole. 1566 * to blackhole.
1269 */ 1567 */
1270 1568
1271 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 1569 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1273,7 +1571,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1273 goto out_free; 1571 goto out_free;
1274 } 1572 }
1275 1573
1276 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; 1574 encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
1277 1575
1278 if (skb_cow(skb, encap)) { 1576 if (skb_cow(skb, encap)) {
1279 ip_rt_put(rt); 1577 ip_rt_put(rt);
@@ -1284,11 +1582,12 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1284 vif->bytes_out += skb->len; 1582 vif->bytes_out += skb->len;
1285 1583
1286 skb_dst_drop(skb); 1584 skb_dst_drop(skb);
1287 skb_dst_set(skb, &rt->u.dst); 1585 skb_dst_set(skb, &rt->dst);
1288 ip_decrease_ttl(ip_hdr(skb)); 1586 ip_decrease_ttl(ip_hdr(skb));
1289 1587
1290 /* FIXME: forward and output firewalls used to be called here. 1588 /* FIXME: forward and output firewalls used to be called here.
1291 * What do we do with netfilter? -- RR */ 1589 * What do we do with netfilter? -- RR
1590 */
1292 if (vif->flags & VIFF_TUNNEL) { 1591 if (vif->flags & VIFF_TUNNEL) {
1293 ip_encap(skb, vif->local, vif->remote); 1592 ip_encap(skb, vif->local, vif->remote);
1294 /* FIXME: extra output firewall step used to be here. --RR */ 1593 /* FIXME: extra output firewall step used to be here. --RR */
@@ -1309,21 +1608,20 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1309 * not mrouter) cannot join to more than one interface - it will 1608 * not mrouter) cannot join to more than one interface - it will
1310 * result in receiving multiple packets. 1609 * result in receiving multiple packets.
1311 */ 1610 */
1312 NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev, 1611 NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1313 ipmr_forward_finish); 1612 ipmr_forward_finish);
1314 return; 1613 return;
1315 1614
1316out_free: 1615out_free:
1317 kfree_skb(skb); 1616 kfree_skb(skb);
1318 return;
1319} 1617}
1320 1618
1321static int ipmr_find_vif(struct net_device *dev) 1619static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1322{ 1620{
1323 struct net *net = dev_net(dev);
1324 int ct; 1621 int ct;
1325 for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) { 1622
1326 if (net->ipv4.vif_table[ct].dev == dev) 1623 for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1624 if (mrt->vif_table[ct].dev == dev)
1327 break; 1625 break;
1328 } 1626 }
1329 return ct; 1627 return ct;
@@ -1331,11 +1629,12 @@ static int ipmr_find_vif(struct net_device *dev)
1331 1629
1332/* "local" means that we should preserve one skb (for local delivery) */ 1630/* "local" means that we should preserve one skb (for local delivery) */
1333 1631
1334static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) 1632static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1633 struct sk_buff *skb, struct mfc_cache *cache,
1634 int local)
1335{ 1635{
1336 int psend = -1; 1636 int psend = -1;
1337 int vif, ct; 1637 int vif, ct;
1338 struct net *net = mfc_net(cache);
1339 1638
1340 vif = cache->mfc_parent; 1639 vif = cache->mfc_parent;
1341 cache->mfc_un.res.pkt++; 1640 cache->mfc_un.res.pkt++;
@@ -1344,55 +1643,58 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1344 /* 1643 /*
1345 * Wrong interface: drop packet and (maybe) send PIM assert. 1644 * Wrong interface: drop packet and (maybe) send PIM assert.
1346 */ 1645 */
1347 if (net->ipv4.vif_table[vif].dev != skb->dev) { 1646 if (mrt->vif_table[vif].dev != skb->dev) {
1348 int true_vifi; 1647 int true_vifi;
1349 1648
1350 if (skb_rtable(skb)->fl.iif == 0) { 1649 if (rt_is_output_route(skb_rtable(skb))) {
1351 /* It is our own packet, looped back. 1650 /* It is our own packet, looped back.
1352 Very complicated situation... 1651 * Very complicated situation...
1353 1652 *
1354 The best workaround until routing daemons will be 1653 * The best workaround until routing daemons will be
1355 fixed is not to redistribute packet, if it was 1654 * fixed is not to redistribute packet, if it was
1356 send through wrong interface. It means, that 1655 * send through wrong interface. It means, that
1357 multicast applications WILL NOT work for 1656 * multicast applications WILL NOT work for
1358 (S,G), which have default multicast route pointing 1657 * (S,G), which have default multicast route pointing
1359 to wrong oif. In any case, it is not a good 1658 * to wrong oif. In any case, it is not a good
1360 idea to use multicasting applications on router. 1659 * idea to use multicasting applications on router.
1361 */ 1660 */
1362 goto dont_forward; 1661 goto dont_forward;
1363 } 1662 }
1364 1663
1365 cache->mfc_un.res.wrong_if++; 1664 cache->mfc_un.res.wrong_if++;
1366 true_vifi = ipmr_find_vif(skb->dev); 1665 true_vifi = ipmr_find_vif(mrt, skb->dev);
1367 1666
1368 if (true_vifi >= 0 && net->ipv4.mroute_do_assert && 1667 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1369 /* pimsm uses asserts, when switching from RPT to SPT, 1668 /* pimsm uses asserts, when switching from RPT to SPT,
1370 so that we cannot check that packet arrived on an oif. 1669 * so that we cannot check that packet arrived on an oif.
1371 It is bad, but otherwise we would need to move pretty 1670 * It is bad, but otherwise we would need to move pretty
1372 large chunk of pimd to kernel. Ough... --ANK 1671 * large chunk of pimd to kernel. Ough... --ANK
1373 */ 1672 */
1374 (net->ipv4.mroute_do_pim || 1673 (mrt->mroute_do_pim ||
1375 cache->mfc_un.res.ttls[true_vifi] < 255) && 1674 cache->mfc_un.res.ttls[true_vifi] < 255) &&
1376 time_after(jiffies, 1675 time_after(jiffies,
1377 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { 1676 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1378 cache->mfc_un.res.last_assert = jiffies; 1677 cache->mfc_un.res.last_assert = jiffies;
1379 ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF); 1678 ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1380 } 1679 }
1381 goto dont_forward; 1680 goto dont_forward;
1382 } 1681 }
1383 1682
1384 net->ipv4.vif_table[vif].pkt_in++; 1683 mrt->vif_table[vif].pkt_in++;
1385 net->ipv4.vif_table[vif].bytes_in += skb->len; 1684 mrt->vif_table[vif].bytes_in += skb->len;
1386 1685
1387 /* 1686 /*
1388 * Forward the frame 1687 * Forward the frame
1389 */ 1688 */
1390 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { 1689 for (ct = cache->mfc_un.res.maxvif - 1;
1690 ct >= cache->mfc_un.res.minvif; ct--) {
1391 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { 1691 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1392 if (psend != -1) { 1692 if (psend != -1) {
1393 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1693 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1694
1394 if (skb2) 1695 if (skb2)
1395 ipmr_queue_xmit(skb2, cache, psend); 1696 ipmr_queue_xmit(net, mrt, skb2, cache,
1697 psend);
1396 } 1698 }
1397 psend = ct; 1699 psend = ct;
1398 } 1700 }
@@ -1400,10 +1702,11 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1400 if (psend != -1) { 1702 if (psend != -1) {
1401 if (local) { 1703 if (local) {
1402 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1704 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1705
1403 if (skb2) 1706 if (skb2)
1404 ipmr_queue_xmit(skb2, cache, psend); 1707 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1405 } else { 1708 } else {
1406 ipmr_queue_xmit(skb, cache, psend); 1709 ipmr_queue_xmit(net, mrt, skb, cache, psend);
1407 return 0; 1710 return 0;
1408 } 1711 }
1409 } 1712 }
@@ -1417,6 +1720,7 @@ dont_forward:
1417 1720
1418/* 1721/*
1419 * Multicast packets for forwarding arrive here 1722 * Multicast packets for forwarding arrive here
1723 * Called with rcu_read_lock();
1420 */ 1724 */
1421 1725
1422int ip_mr_input(struct sk_buff *skb) 1726int ip_mr_input(struct sk_buff *skb)
@@ -1424,37 +1728,45 @@ int ip_mr_input(struct sk_buff *skb)
1424 struct mfc_cache *cache; 1728 struct mfc_cache *cache;
1425 struct net *net = dev_net(skb->dev); 1729 struct net *net = dev_net(skb->dev);
1426 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 1730 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1731 struct mr_table *mrt;
1732 int err;
1427 1733
1428 /* Packet is looped back after forward, it should not be 1734 /* Packet is looped back after forward, it should not be
1429 forwarded second time, but still can be delivered locally. 1735 * forwarded second time, but still can be delivered locally.
1430 */ 1736 */
1431 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1737 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1432 goto dont_forward; 1738 goto dont_forward;
1433 1739
1740 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1741 if (err < 0) {
1742 kfree_skb(skb);
1743 return err;
1744 }
1745
1434 if (!local) { 1746 if (!local) {
1435 if (IPCB(skb)->opt.router_alert) { 1747 if (IPCB(skb)->opt.router_alert) {
1436 if (ip_call_ra_chain(skb)) 1748 if (ip_call_ra_chain(skb))
1437 return 0; 1749 return 0;
1438 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ 1750 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1439 /* IGMPv1 (and broken IGMPv2 implementations sort of 1751 /* IGMPv1 (and broken IGMPv2 implementations sort of
1440 Cisco IOS <= 11.2(8)) do not put router alert 1752 * Cisco IOS <= 11.2(8)) do not put router alert
1441 option to IGMP packets destined to routable 1753 * option to IGMP packets destined to routable
1442 groups. It is very bad, because it means 1754 * groups. It is very bad, because it means
1443 that we can forward NO IGMP messages. 1755 * that we can forward NO IGMP messages.
1444 */ 1756 */
1445 read_lock(&mrt_lock); 1757 struct sock *mroute_sk;
1446 if (net->ipv4.mroute_sk) { 1758
1447 nf_reset(skb); 1759 mroute_sk = rcu_dereference(mrt->mroute_sk);
1448 raw_rcv(net->ipv4.mroute_sk, skb); 1760 if (mroute_sk) {
1449 read_unlock(&mrt_lock); 1761 nf_reset(skb);
1450 return 0; 1762 raw_rcv(mroute_sk, skb);
1451 } 1763 return 0;
1452 read_unlock(&mrt_lock); 1764 }
1453 } 1765 }
1454 } 1766 }
1455 1767
1456 read_lock(&mrt_lock); 1768 /* already under rcu_read_lock() */
1457 cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1769 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1458 1770
1459 /* 1771 /*
1460 * No usable cache entry 1772 * No usable cache entry
@@ -1465,27 +1777,26 @@ int ip_mr_input(struct sk_buff *skb)
1465 if (local) { 1777 if (local) {
1466 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1778 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1467 ip_local_deliver(skb); 1779 ip_local_deliver(skb);
1468 if (skb2 == NULL) { 1780 if (skb2 == NULL)
1469 read_unlock(&mrt_lock);
1470 return -ENOBUFS; 1781 return -ENOBUFS;
1471 }
1472 skb = skb2; 1782 skb = skb2;
1473 } 1783 }
1474 1784
1475 vif = ipmr_find_vif(skb->dev); 1785 read_lock(&mrt_lock);
1786 vif = ipmr_find_vif(mrt, skb->dev);
1476 if (vif >= 0) { 1787 if (vif >= 0) {
1477 int err = ipmr_cache_unresolved(net, vif, skb); 1788 int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1478 read_unlock(&mrt_lock); 1789 read_unlock(&mrt_lock);
1479 1790
1480 return err; 1791 return err2;
1481 } 1792 }
1482 read_unlock(&mrt_lock); 1793 read_unlock(&mrt_lock);
1483 kfree_skb(skb); 1794 kfree_skb(skb);
1484 return -ENODEV; 1795 return -ENODEV;
1485 } 1796 }
1486 1797
1487 ip_mr_forward(skb, cache, local); 1798 read_lock(&mrt_lock);
1488 1799 ip_mr_forward(net, mrt, skb, cache, local);
1489 read_unlock(&mrt_lock); 1800 read_unlock(&mrt_lock);
1490 1801
1491 if (local) 1802 if (local)
@@ -1501,18 +1812,19 @@ dont_forward:
1501} 1812}
1502 1813
1503#ifdef CONFIG_IP_PIMSM 1814#ifdef CONFIG_IP_PIMSM
1504static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) 1815/* called with rcu_read_lock() */
1816static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1817 unsigned int pimlen)
1505{ 1818{
1506 struct net_device *reg_dev = NULL; 1819 struct net_device *reg_dev = NULL;
1507 struct iphdr *encap; 1820 struct iphdr *encap;
1508 struct net *net = dev_net(skb->dev);
1509 1821
1510 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1822 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1511 /* 1823 /*
1512 Check that: 1824 * Check that:
1513 a. packet is really destinted to a multicast group 1825 * a. packet is really sent to a multicast group
1514 b. packet is not a NULL-REGISTER 1826 * b. packet is not a NULL-REGISTER
1515 c. packet is not truncated 1827 * c. packet is not truncated
1516 */ 1828 */
1517 if (!ipv4_is_multicast(encap->daddr) || 1829 if (!ipv4_is_multicast(encap->daddr) ||
1518 encap->tot_len == 0 || 1830 encap->tot_len == 0 ||
@@ -1520,30 +1832,25 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1520 return 1; 1832 return 1;
1521 1833
1522 read_lock(&mrt_lock); 1834 read_lock(&mrt_lock);
1523 if (net->ipv4.mroute_reg_vif_num >= 0) 1835 if (mrt->mroute_reg_vif_num >= 0)
1524 reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev; 1836 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1525 if (reg_dev)
1526 dev_hold(reg_dev);
1527 read_unlock(&mrt_lock); 1837 read_unlock(&mrt_lock);
1528 1838
1529 if (reg_dev == NULL) 1839 if (reg_dev == NULL)
1530 return 1; 1840 return 1;
1531 1841
1532 skb->mac_header = skb->network_header; 1842 skb->mac_header = skb->network_header;
1533 skb_pull(skb, (u8*)encap - skb->data); 1843 skb_pull(skb, (u8 *)encap - skb->data);
1534 skb_reset_network_header(skb); 1844 skb_reset_network_header(skb);
1535 skb->dev = reg_dev;
1536 skb->protocol = htons(ETH_P_IP); 1845 skb->protocol = htons(ETH_P_IP);
1537 skb->ip_summed = 0; 1846 skb->ip_summed = CHECKSUM_NONE;
1538 skb->pkt_type = PACKET_HOST; 1847 skb->pkt_type = PACKET_HOST;
1539 skb_dst_drop(skb); 1848
1540 reg_dev->stats.rx_bytes += skb->len; 1849 skb_tunnel_rx(skb, reg_dev);
1541 reg_dev->stats.rx_packets++; 1850
1542 nf_reset(skb);
1543 netif_rx(skb); 1851 netif_rx(skb);
1544 dev_put(reg_dev);
1545 1852
1546 return 0; 1853 return NET_RX_SUCCESS;
1547} 1854}
1548#endif 1855#endif
1549 1856
@@ -1552,21 +1859,25 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1552 * Handle IGMP messages of PIMv1 1859 * Handle IGMP messages of PIMv1
1553 */ 1860 */
1554 1861
1555int pim_rcv_v1(struct sk_buff * skb) 1862int pim_rcv_v1(struct sk_buff *skb)
1556{ 1863{
1557 struct igmphdr *pim; 1864 struct igmphdr *pim;
1558 struct net *net = dev_net(skb->dev); 1865 struct net *net = dev_net(skb->dev);
1866 struct mr_table *mrt;
1559 1867
1560 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) 1868 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1561 goto drop; 1869 goto drop;
1562 1870
1563 pim = igmp_hdr(skb); 1871 pim = igmp_hdr(skb);
1564 1872
1565 if (!net->ipv4.mroute_do_pim || 1873 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1874 goto drop;
1875
1876 if (!mrt->mroute_do_pim ||
1566 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1877 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1567 goto drop; 1878 goto drop;
1568 1879
1569 if (__pim_rcv(skb, sizeof(*pim))) { 1880 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1570drop: 1881drop:
1571 kfree_skb(skb); 1882 kfree_skb(skb);
1572 } 1883 }
@@ -1575,21 +1886,26 @@ drop:
1575#endif 1886#endif
1576 1887
1577#ifdef CONFIG_IP_PIMSM_V2 1888#ifdef CONFIG_IP_PIMSM_V2
1578static int pim_rcv(struct sk_buff * skb) 1889static int pim_rcv(struct sk_buff *skb)
1579{ 1890{
1580 struct pimreghdr *pim; 1891 struct pimreghdr *pim;
1892 struct net *net = dev_net(skb->dev);
1893 struct mr_table *mrt;
1581 1894
1582 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) 1895 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1583 goto drop; 1896 goto drop;
1584 1897
1585 pim = (struct pimreghdr *)skb_transport_header(skb); 1898 pim = (struct pimreghdr *)skb_transport_header(skb);
1586 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1899 if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
1587 (pim->flags&PIM_NULL_REGISTER) || 1900 (pim->flags & PIM_NULL_REGISTER) ||
1588 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1901 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1589 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1902 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1590 goto drop; 1903 goto drop;
1591 1904
1592 if (__pim_rcv(skb, sizeof(*pim))) { 1905 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1906 goto drop;
1907
1908 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1593drop: 1909drop:
1594 kfree_skb(skb); 1910 kfree_skb(skb);
1595 } 1911 }
@@ -1597,29 +1913,31 @@ drop:
1597} 1913}
1598#endif 1914#endif
1599 1915
1600static int 1916static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1601ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) 1917 struct mfc_cache *c, struct rtmsg *rtm)
1602{ 1918{
1603 int ct; 1919 int ct;
1604 struct rtnexthop *nhp; 1920 struct rtnexthop *nhp;
1605 struct net *net = mfc_net(c);
1606 struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
1607 u8 *b = skb_tail_pointer(skb); 1921 u8 *b = skb_tail_pointer(skb);
1608 struct rtattr *mp_head; 1922 struct rtattr *mp_head;
1609 1923
1610 if (dev) 1924 /* If cache is unresolved, don't try to parse IIF and OIF */
1611 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); 1925 if (c->mfc_parent >= MAXVIFS)
1926 return -ENOENT;
1927
1928 if (VIF_EXISTS(mrt, c->mfc_parent))
1929 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1612 1930
1613 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); 1931 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1614 1932
1615 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 1933 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1616 if (c->mfc_un.res.ttls[ct] < 255) { 1934 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1617 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 1935 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1618 goto rtattr_failure; 1936 goto rtattr_failure;
1619 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 1937 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1620 nhp->rtnh_flags = 0; 1938 nhp->rtnh_flags = 0;
1621 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 1939 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1622 nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex; 1940 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1623 nhp->rtnh_len = sizeof(*nhp); 1941 nhp->rtnh_len = sizeof(*nhp);
1624 } 1942 }
1625 } 1943 }
@@ -1637,31 +1955,41 @@ int ipmr_get_route(struct net *net,
1637 struct sk_buff *skb, struct rtmsg *rtm, int nowait) 1955 struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1638{ 1956{
1639 int err; 1957 int err;
1958 struct mr_table *mrt;
1640 struct mfc_cache *cache; 1959 struct mfc_cache *cache;
1641 struct rtable *rt = skb_rtable(skb); 1960 struct rtable *rt = skb_rtable(skb);
1642 1961
1643 read_lock(&mrt_lock); 1962 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1644 cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); 1963 if (mrt == NULL)
1964 return -ENOENT;
1965
1966 rcu_read_lock();
1967 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1645 1968
1646 if (cache == NULL) { 1969 if (cache == NULL) {
1647 struct sk_buff *skb2; 1970 struct sk_buff *skb2;
1648 struct iphdr *iph; 1971 struct iphdr *iph;
1649 struct net_device *dev; 1972 struct net_device *dev;
1650 int vif; 1973 int vif = -1;
1651 1974
1652 if (nowait) { 1975 if (nowait) {
1653 read_unlock(&mrt_lock); 1976 rcu_read_unlock();
1654 return -EAGAIN; 1977 return -EAGAIN;
1655 } 1978 }
1656 1979
1657 dev = skb->dev; 1980 dev = skb->dev;
1658 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) { 1981 read_lock(&mrt_lock);
1982 if (dev)
1983 vif = ipmr_find_vif(mrt, dev);
1984 if (vif < 0) {
1659 read_unlock(&mrt_lock); 1985 read_unlock(&mrt_lock);
1986 rcu_read_unlock();
1660 return -ENODEV; 1987 return -ENODEV;
1661 } 1988 }
1662 skb2 = skb_clone(skb, GFP_ATOMIC); 1989 skb2 = skb_clone(skb, GFP_ATOMIC);
1663 if (!skb2) { 1990 if (!skb2) {
1664 read_unlock(&mrt_lock); 1991 read_unlock(&mrt_lock);
1992 rcu_read_unlock();
1665 return -ENOMEM; 1993 return -ENOMEM;
1666 } 1994 }
1667 1995
@@ -1672,24 +2000,111 @@ int ipmr_get_route(struct net *net,
1672 iph->saddr = rt->rt_src; 2000 iph->saddr = rt->rt_src;
1673 iph->daddr = rt->rt_dst; 2001 iph->daddr = rt->rt_dst;
1674 iph->version = 0; 2002 iph->version = 0;
1675 err = ipmr_cache_unresolved(net, vif, skb2); 2003 err = ipmr_cache_unresolved(mrt, vif, skb2);
1676 read_unlock(&mrt_lock); 2004 read_unlock(&mrt_lock);
2005 rcu_read_unlock();
1677 return err; 2006 return err;
1678 } 2007 }
1679 2008
1680 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2009 read_lock(&mrt_lock);
2010 if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1681 cache->mfc_flags |= MFC_NOTIFY; 2011 cache->mfc_flags |= MFC_NOTIFY;
1682 err = ipmr_fill_mroute(skb, cache, rtm); 2012 err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
1683 read_unlock(&mrt_lock); 2013 read_unlock(&mrt_lock);
2014 rcu_read_unlock();
1684 return err; 2015 return err;
1685} 2016}
1686 2017
2018static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2019 u32 pid, u32 seq, struct mfc_cache *c)
2020{
2021 struct nlmsghdr *nlh;
2022 struct rtmsg *rtm;
2023
2024 nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2025 if (nlh == NULL)
2026 return -EMSGSIZE;
2027
2028 rtm = nlmsg_data(nlh);
2029 rtm->rtm_family = RTNL_FAMILY_IPMR;
2030 rtm->rtm_dst_len = 32;
2031 rtm->rtm_src_len = 32;
2032 rtm->rtm_tos = 0;
2033 rtm->rtm_table = mrt->id;
2034 NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2035 rtm->rtm_type = RTN_MULTICAST;
2036 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2037 rtm->rtm_protocol = RTPROT_UNSPEC;
2038 rtm->rtm_flags = 0;
2039
2040 NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2041 NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2042
2043 if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2044 goto nla_put_failure;
2045
2046 return nlmsg_end(skb, nlh);
2047
2048nla_put_failure:
2049 nlmsg_cancel(skb, nlh);
2050 return -EMSGSIZE;
2051}
2052
2053static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2054{
2055 struct net *net = sock_net(skb->sk);
2056 struct mr_table *mrt;
2057 struct mfc_cache *mfc;
2058 unsigned int t = 0, s_t;
2059 unsigned int h = 0, s_h;
2060 unsigned int e = 0, s_e;
2061
2062 s_t = cb->args[0];
2063 s_h = cb->args[1];
2064 s_e = cb->args[2];
2065
2066 rcu_read_lock();
2067 ipmr_for_each_table(mrt, net) {
2068 if (t < s_t)
2069 goto next_table;
2070 if (t > s_t)
2071 s_h = 0;
2072 for (h = s_h; h < MFC_LINES; h++) {
2073 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2074 if (e < s_e)
2075 goto next_entry;
2076 if (ipmr_fill_mroute(mrt, skb,
2077 NETLINK_CB(cb->skb).pid,
2078 cb->nlh->nlmsg_seq,
2079 mfc) < 0)
2080 goto done;
2081next_entry:
2082 e++;
2083 }
2084 e = s_e = 0;
2085 }
2086 s_h = 0;
2087next_table:
2088 t++;
2089 }
2090done:
2091 rcu_read_unlock();
2092
2093 cb->args[2] = e;
2094 cb->args[1] = h;
2095 cb->args[0] = t;
2096
2097 return skb->len;
2098}
2099
1687#ifdef CONFIG_PROC_FS 2100#ifdef CONFIG_PROC_FS
1688/* 2101/*
1689 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 2102 * The /proc interfaces to multicast routing :
2103 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
1690 */ 2104 */
1691struct ipmr_vif_iter { 2105struct ipmr_vif_iter {
1692 struct seq_net_private p; 2106 struct seq_net_private p;
2107 struct mr_table *mrt;
1693 int ct; 2108 int ct;
1694}; 2109};
1695 2110
@@ -1697,11 +2112,13 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
1697 struct ipmr_vif_iter *iter, 2112 struct ipmr_vif_iter *iter,
1698 loff_t pos) 2113 loff_t pos)
1699{ 2114{
1700 for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) { 2115 struct mr_table *mrt = iter->mrt;
1701 if (!VIF_EXISTS(net, iter->ct)) 2116
2117 for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2118 if (!VIF_EXISTS(mrt, iter->ct))
1702 continue; 2119 continue;
1703 if (pos-- == 0) 2120 if (pos-- == 0)
1704 return &net->ipv4.vif_table[iter->ct]; 2121 return &mrt->vif_table[iter->ct];
1705 } 2122 }
1706 return NULL; 2123 return NULL;
1707} 2124}
@@ -1709,7 +2126,15 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
1709static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) 2126static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1710 __acquires(mrt_lock) 2127 __acquires(mrt_lock)
1711{ 2128{
2129 struct ipmr_vif_iter *iter = seq->private;
1712 struct net *net = seq_file_net(seq); 2130 struct net *net = seq_file_net(seq);
2131 struct mr_table *mrt;
2132
2133 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2134 if (mrt == NULL)
2135 return ERR_PTR(-ENOENT);
2136
2137 iter->mrt = mrt;
1713 2138
1714 read_lock(&mrt_lock); 2139 read_lock(&mrt_lock);
1715 return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1) 2140 return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
@@ -1720,15 +2145,16 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1720{ 2145{
1721 struct ipmr_vif_iter *iter = seq->private; 2146 struct ipmr_vif_iter *iter = seq->private;
1722 struct net *net = seq_file_net(seq); 2147 struct net *net = seq_file_net(seq);
2148 struct mr_table *mrt = iter->mrt;
1723 2149
1724 ++*pos; 2150 ++*pos;
1725 if (v == SEQ_START_TOKEN) 2151 if (v == SEQ_START_TOKEN)
1726 return ipmr_vif_seq_idx(net, iter, 0); 2152 return ipmr_vif_seq_idx(net, iter, 0);
1727 2153
1728 while (++iter->ct < net->ipv4.maxvif) { 2154 while (++iter->ct < mrt->maxvif) {
1729 if (!VIF_EXISTS(net, iter->ct)) 2155 if (!VIF_EXISTS(mrt, iter->ct))
1730 continue; 2156 continue;
1731 return &net->ipv4.vif_table[iter->ct]; 2157 return &mrt->vif_table[iter->ct];
1732 } 2158 }
1733 return NULL; 2159 return NULL;
1734} 2160}
@@ -1741,7 +2167,8 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1741 2167
1742static int ipmr_vif_seq_show(struct seq_file *seq, void *v) 2168static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1743{ 2169{
1744 struct net *net = seq_file_net(seq); 2170 struct ipmr_vif_iter *iter = seq->private;
2171 struct mr_table *mrt = iter->mrt;
1745 2172
1746 if (v == SEQ_START_TOKEN) { 2173 if (v == SEQ_START_TOKEN) {
1747 seq_puts(seq, 2174 seq_puts(seq,
@@ -1752,7 +2179,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1752 2179
1753 seq_printf(seq, 2180 seq_printf(seq,
1754 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", 2181 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1755 vif - net->ipv4.vif_table, 2182 vif - mrt->vif_table,
1756 name, vif->bytes_in, vif->pkt_in, 2183 name, vif->bytes_in, vif->pkt_in,
1757 vif->bytes_out, vif->pkt_out, 2184 vif->bytes_out, vif->pkt_out,
1758 vif->flags, vif->local, vif->remote); 2185 vif->flags, vif->local, vif->remote);
@@ -1783,7 +2210,8 @@ static const struct file_operations ipmr_vif_fops = {
1783 2210
1784struct ipmr_mfc_iter { 2211struct ipmr_mfc_iter {
1785 struct seq_net_private p; 2212 struct seq_net_private p;
1786 struct mfc_cache **cache; 2213 struct mr_table *mrt;
2214 struct list_head *cache;
1787 int ct; 2215 int ct;
1788}; 2216};
1789 2217
@@ -1791,22 +2219,22 @@ struct ipmr_mfc_iter {
1791static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, 2219static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
1792 struct ipmr_mfc_iter *it, loff_t pos) 2220 struct ipmr_mfc_iter *it, loff_t pos)
1793{ 2221{
2222 struct mr_table *mrt = it->mrt;
1794 struct mfc_cache *mfc; 2223 struct mfc_cache *mfc;
1795 2224
1796 it->cache = net->ipv4.mfc_cache_array; 2225 rcu_read_lock();
1797 read_lock(&mrt_lock); 2226 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
1798 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 2227 it->cache = &mrt->mfc_cache_array[it->ct];
1799 for (mfc = net->ipv4.mfc_cache_array[it->ct]; 2228 list_for_each_entry_rcu(mfc, it->cache, list)
1800 mfc; mfc = mfc->next)
1801 if (pos-- == 0) 2229 if (pos-- == 0)
1802 return mfc; 2230 return mfc;
1803 read_unlock(&mrt_lock); 2231 }
2232 rcu_read_unlock();
1804 2233
1805 it->cache = &mfc_unres_queue;
1806 spin_lock_bh(&mfc_unres_lock); 2234 spin_lock_bh(&mfc_unres_lock);
1807 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) 2235 it->cache = &mrt->mfc_unres_queue;
1808 if (net_eq(mfc_net(mfc), net) && 2236 list_for_each_entry(mfc, it->cache, list)
1809 pos-- == 0) 2237 if (pos-- == 0)
1810 return mfc; 2238 return mfc;
1811 spin_unlock_bh(&mfc_unres_lock); 2239 spin_unlock_bh(&mfc_unres_lock);
1812 2240
@@ -1819,7 +2247,13 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1819{ 2247{
1820 struct ipmr_mfc_iter *it = seq->private; 2248 struct ipmr_mfc_iter *it = seq->private;
1821 struct net *net = seq_file_net(seq); 2249 struct net *net = seq_file_net(seq);
2250 struct mr_table *mrt;
1822 2251
2252 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2253 if (mrt == NULL)
2254 return ERR_PTR(-ENOENT);
2255
2256 it->mrt = mrt;
1823 it->cache = NULL; 2257 it->cache = NULL;
1824 it->ct = 0; 2258 it->ct = 0;
1825 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) 2259 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
@@ -1831,39 +2265,38 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1831 struct mfc_cache *mfc = v; 2265 struct mfc_cache *mfc = v;
1832 struct ipmr_mfc_iter *it = seq->private; 2266 struct ipmr_mfc_iter *it = seq->private;
1833 struct net *net = seq_file_net(seq); 2267 struct net *net = seq_file_net(seq);
2268 struct mr_table *mrt = it->mrt;
1834 2269
1835 ++*pos; 2270 ++*pos;
1836 2271
1837 if (v == SEQ_START_TOKEN) 2272 if (v == SEQ_START_TOKEN)
1838 return ipmr_mfc_seq_idx(net, seq->private, 0); 2273 return ipmr_mfc_seq_idx(net, seq->private, 0);
1839 2274
1840 if (mfc->next) 2275 if (mfc->list.next != it->cache)
1841 return mfc->next; 2276 return list_entry(mfc->list.next, struct mfc_cache, list);
1842 2277
1843 if (it->cache == &mfc_unres_queue) 2278 if (it->cache == &mrt->mfc_unres_queue)
1844 goto end_of_list; 2279 goto end_of_list;
1845 2280
1846 BUG_ON(it->cache != net->ipv4.mfc_cache_array); 2281 BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
1847 2282
1848 while (++it->ct < MFC_LINES) { 2283 while (++it->ct < MFC_LINES) {
1849 mfc = net->ipv4.mfc_cache_array[it->ct]; 2284 it->cache = &mrt->mfc_cache_array[it->ct];
1850 if (mfc) 2285 if (list_empty(it->cache))
1851 return mfc; 2286 continue;
2287 return list_first_entry(it->cache, struct mfc_cache, list);
1852 } 2288 }
1853 2289
1854 /* exhausted cache_array, show unresolved */ 2290 /* exhausted cache_array, show unresolved */
1855 read_unlock(&mrt_lock); 2291 rcu_read_unlock();
1856 it->cache = &mfc_unres_queue; 2292 it->cache = &mrt->mfc_unres_queue;
1857 it->ct = 0; 2293 it->ct = 0;
1858 2294
1859 spin_lock_bh(&mfc_unres_lock); 2295 spin_lock_bh(&mfc_unres_lock);
1860 mfc = mfc_unres_queue; 2296 if (!list_empty(it->cache))
1861 while (mfc && !net_eq(mfc_net(mfc), net)) 2297 return list_first_entry(it->cache, struct mfc_cache, list);
1862 mfc = mfc->next;
1863 if (mfc)
1864 return mfc;
1865 2298
1866 end_of_list: 2299end_of_list:
1867 spin_unlock_bh(&mfc_unres_lock); 2300 spin_unlock_bh(&mfc_unres_lock);
1868 it->cache = NULL; 2301 it->cache = NULL;
1869 2302
@@ -1873,18 +2306,17 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1873static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) 2306static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1874{ 2307{
1875 struct ipmr_mfc_iter *it = seq->private; 2308 struct ipmr_mfc_iter *it = seq->private;
1876 struct net *net = seq_file_net(seq); 2309 struct mr_table *mrt = it->mrt;
1877 2310
1878 if (it->cache == &mfc_unres_queue) 2311 if (it->cache == &mrt->mfc_unres_queue)
1879 spin_unlock_bh(&mfc_unres_lock); 2312 spin_unlock_bh(&mfc_unres_lock);
1880 else if (it->cache == net->ipv4.mfc_cache_array) 2313 else if (it->cache == &mrt->mfc_cache_array[it->ct])
1881 read_unlock(&mrt_lock); 2314 rcu_read_unlock();
1882} 2315}
1883 2316
1884static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 2317static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1885{ 2318{
1886 int n; 2319 int n;
1887 struct net *net = seq_file_net(seq);
1888 2320
1889 if (v == SEQ_START_TOKEN) { 2321 if (v == SEQ_START_TOKEN) {
1890 seq_puts(seq, 2322 seq_puts(seq,
@@ -1892,20 +2324,21 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1892 } else { 2324 } else {
1893 const struct mfc_cache *mfc = v; 2325 const struct mfc_cache *mfc = v;
1894 const struct ipmr_mfc_iter *it = seq->private; 2326 const struct ipmr_mfc_iter *it = seq->private;
2327 const struct mr_table *mrt = it->mrt;
1895 2328
1896 seq_printf(seq, "%08lX %08lX %-3hd", 2329 seq_printf(seq, "%08X %08X %-3hd",
1897 (unsigned long) mfc->mfc_mcastgrp, 2330 (__force u32) mfc->mfc_mcastgrp,
1898 (unsigned long) mfc->mfc_origin, 2331 (__force u32) mfc->mfc_origin,
1899 mfc->mfc_parent); 2332 mfc->mfc_parent);
1900 2333
1901 if (it->cache != &mfc_unres_queue) { 2334 if (it->cache != &mrt->mfc_unres_queue) {
1902 seq_printf(seq, " %8lu %8lu %8lu", 2335 seq_printf(seq, " %8lu %8lu %8lu",
1903 mfc->mfc_un.res.pkt, 2336 mfc->mfc_un.res.pkt,
1904 mfc->mfc_un.res.bytes, 2337 mfc->mfc_un.res.bytes,
1905 mfc->mfc_un.res.wrong_if); 2338 mfc->mfc_un.res.wrong_if);
1906 for (n = mfc->mfc_un.res.minvif; 2339 for (n = mfc->mfc_un.res.minvif;
1907 n < mfc->mfc_un.res.maxvif; n++ ) { 2340 n < mfc->mfc_un.res.maxvif; n++) {
1908 if (VIF_EXISTS(net, n) && 2341 if (VIF_EXISTS(mrt, n) &&
1909 mfc->mfc_un.res.ttls[n] < 255) 2342 mfc->mfc_un.res.ttls[n] < 255)
1910 seq_printf(seq, 2343 seq_printf(seq,
1911 " %2d:%-3d", 2344 " %2d:%-3d",
@@ -1945,7 +2378,7 @@ static const struct file_operations ipmr_mfc_fops = {
1945#endif 2378#endif
1946 2379
1947#ifdef CONFIG_IP_PIMSM_V2 2380#ifdef CONFIG_IP_PIMSM_V2
1948static struct net_protocol pim_protocol = { 2381static const struct net_protocol pim_protocol = {
1949 .handler = pim_rcv, 2382 .handler = pim_rcv,
1950 .netns_ok = 1, 2383 .netns_ok = 1,
1951}; 2384};
@@ -1957,27 +2390,11 @@ static struct net_protocol pim_protocol = {
1957 */ 2390 */
1958static int __net_init ipmr_net_init(struct net *net) 2391static int __net_init ipmr_net_init(struct net *net)
1959{ 2392{
1960 int err = 0; 2393 int err;
1961 2394
1962 net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device), 2395 err = ipmr_rules_init(net);
1963 GFP_KERNEL); 2396 if (err < 0)
1964 if (!net->ipv4.vif_table) {
1965 err = -ENOMEM;
1966 goto fail; 2397 goto fail;
1967 }
1968
1969 /* Forwarding cache */
1970 net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
1971 sizeof(struct mfc_cache *),
1972 GFP_KERNEL);
1973 if (!net->ipv4.mfc_cache_array) {
1974 err = -ENOMEM;
1975 goto fail_mfc_cache;
1976 }
1977
1978#ifdef CONFIG_IP_PIMSM
1979 net->ipv4.mroute_reg_vif_num = -1;
1980#endif
1981 2398
1982#ifdef CONFIG_PROC_FS 2399#ifdef CONFIG_PROC_FS
1983 err = -ENOMEM; 2400 err = -ENOMEM;
@@ -1992,10 +2409,8 @@ static int __net_init ipmr_net_init(struct net *net)
1992proc_cache_fail: 2409proc_cache_fail:
1993 proc_net_remove(net, "ip_mr_vif"); 2410 proc_net_remove(net, "ip_mr_vif");
1994proc_vif_fail: 2411proc_vif_fail:
1995 kfree(net->ipv4.mfc_cache_array); 2412 ipmr_rules_exit(net);
1996#endif 2413#endif
1997fail_mfc_cache:
1998 kfree(net->ipv4.vif_table);
1999fail: 2414fail:
2000 return err; 2415 return err;
2001} 2416}
@@ -2006,8 +2421,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
2006 proc_net_remove(net, "ip_mr_cache"); 2421 proc_net_remove(net, "ip_mr_cache");
2007 proc_net_remove(net, "ip_mr_vif"); 2422 proc_net_remove(net, "ip_mr_vif");
2008#endif 2423#endif
2009 kfree(net->ipv4.mfc_cache_array); 2424 ipmr_rules_exit(net);
2010 kfree(net->ipv4.vif_table);
2011} 2425}
2012 2426
2013static struct pernet_operations ipmr_net_ops = { 2427static struct pernet_operations ipmr_net_ops = {
@@ -2021,7 +2435,7 @@ int __init ip_mr_init(void)
2021 2435
2022 mrt_cachep = kmem_cache_create("ip_mrt_cache", 2436 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2023 sizeof(struct mfc_cache), 2437 sizeof(struct mfc_cache),
2024 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2438 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
2025 NULL); 2439 NULL);
2026 if (!mrt_cachep) 2440 if (!mrt_cachep)
2027 return -ENOMEM; 2441 return -ENOMEM;
@@ -2030,7 +2444,6 @@ int __init ip_mr_init(void)
2030 if (err) 2444 if (err)
2031 goto reg_pernet_fail; 2445 goto reg_pernet_fail;
2032 2446
2033 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
2034 err = register_netdevice_notifier(&ip_mr_notifier); 2447 err = register_netdevice_notifier(&ip_mr_notifier);
2035 if (err) 2448 if (err)
2036 goto reg_notif_fail; 2449 goto reg_notif_fail;
@@ -2041,6 +2454,7 @@ int __init ip_mr_init(void)
2041 goto add_proto_fail; 2454 goto add_proto_fail;
2042 } 2455 }
2043#endif 2456#endif
2457 rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2044 return 0; 2458 return 0;
2045 2459
2046#ifdef CONFIG_IP_PIMSM_V2 2460#ifdef CONFIG_IP_PIMSM_V2
@@ -2048,7 +2462,6 @@ add_proto_fail:
2048 unregister_netdevice_notifier(&ip_mr_notifier); 2462 unregister_netdevice_notifier(&ip_mr_notifier);
2049#endif 2463#endif
2050reg_notif_fail: 2464reg_notif_fail:
2051 del_timer(&ipmr_expire_timer);
2052 unregister_pernet_subsys(&ipmr_net_ops); 2465 unregister_pernet_subsys(&ipmr_net_ops);
2053reg_pernet_fail: 2466reg_pernet_fail:
2054 kmem_cache_destroy(mrt_cachep); 2467 kmem_cache_destroy(mrt_cachep);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 1725dc0ef688..994a1f29ebbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -4,6 +4,7 @@
4#include <linux/netfilter_ipv4.h> 4#include <linux/netfilter_ipv4.h>
5#include <linux/ip.h> 5#include <linux/ip.h>
6#include <linux/skbuff.h> 6#include <linux/skbuff.h>
7#include <linux/gfp.h>
7#include <net/route.h> 8#include <net/route.h>
8#include <net/xfrm.h> 9#include <net/xfrm.h>
9#include <net/ip.h> 10#include <net/ip.h>
@@ -16,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
17 struct rtable *rt; 18 struct rtable *rt;
18 struct flowi fl = {}; 19 struct flowi fl = {};
19 struct dst_entry *odst; 20 unsigned long orefdst;
20 unsigned int hh_len; 21 unsigned int hh_len;
21 unsigned int type; 22 unsigned int type;
22 23
@@ -30,10 +31,10 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
30 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
31 */ 32 */
32 if (addr_type == RTN_LOCAL) { 33 if (addr_type == RTN_LOCAL) {
33 fl.nl_u.ip4_u.daddr = iph->daddr; 34 fl.fl4_dst = iph->daddr;
34 if (type == RTN_LOCAL) 35 if (type == RTN_LOCAL)
35 fl.nl_u.ip4_u.saddr = iph->saddr; 36 fl.fl4_src = iph->saddr;
36 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); 37 fl.fl4_tos = RT_TOS(iph->tos);
37 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
38 fl.mark = skb->mark; 39 fl.mark = skb->mark;
39 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
@@ -42,22 +43,22 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
42 43
43 /* Drop old route. */ 44 /* Drop old route. */
44 skb_dst_drop(skb); 45 skb_dst_drop(skb);
45 skb_dst_set(skb, &rt->u.dst); 46 skb_dst_set(skb, &rt->dst);
46 } else { 47 } else {
47 /* non-local src, find valid iif to satisfy 48 /* non-local src, find valid iif to satisfy
48 * rp-filter when calling ip_route_input. */ 49 * rp-filter when calling ip_route_input. */
49 fl.nl_u.ip4_u.daddr = iph->saddr; 50 fl.fl4_dst = iph->saddr;
50 if (ip_route_output_key(net, &rt, &fl) != 0) 51 if (ip_route_output_key(net, &rt, &fl) != 0)
51 return -1; 52 return -1;
52 53
53 odst = skb_dst(skb); 54 orefdst = skb->_skb_refdst;
54 if (ip_route_input(skb, iph->daddr, iph->saddr, 55 if (ip_route_input(skb, iph->daddr, iph->saddr,
55 RT_TOS(iph->tos), rt->u.dst.dev) != 0) { 56 RT_TOS(iph->tos), rt->dst.dev) != 0) {
56 dst_release(&rt->u.dst); 57 dst_release(&rt->dst);
57 return -1; 58 return -1;
58 } 59 }
59 dst_release(&rt->u.dst); 60 dst_release(&rt->dst);
60 dst_release(odst); 61 refdst_drop(orefdst);
61 } 62 }
62 63
63 if (skb_dst(skb)->error) 64 if (skb_dst(skb)->error)
@@ -155,10 +156,10 @@ static int nf_ip_reroute(struct sk_buff *skb,
155 if (entry->hook == NF_INET_LOCAL_OUT) { 156 if (entry->hook == NF_INET_LOCAL_OUT) {
156 const struct iphdr *iph = ip_hdr(skb); 157 const struct iphdr *iph = ip_hdr(skb);
157 158
158 if (!(iph->tos == rt_info->tos 159 if (!(iph->tos == rt_info->tos &&
159 && skb->mark == rt_info->mark 160 skb->mark == rt_info->mark &&
160 && iph->daddr == rt_info->daddr 161 iph->daddr == rt_info->daddr &&
161 && iph->saddr == rt_info->saddr)) 162 iph->saddr == rt_info->saddr))
162 return ip_route_me_harder(skb, RTN_UNSPEC); 163 return ip_route_me_harder(skb, RTN_UNSPEC);
163 } 164 }
164 return 0; 165 return 0;
@@ -211,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
211 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, 212 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
212 skb->len - dataoff, 0); 213 skb->len - dataoff, 0);
213 skb->ip_summed = CHECKSUM_NONE; 214 skb->ip_summed = CHECKSUM_NONE;
214 csum = __skb_checksum_complete_head(skb, dataoff + len); 215 return __skb_checksum_complete_head(skb, dataoff + len);
215 if (!csum)
216 skb->ip_summed = CHECKSUM_UNNECESSARY;
217 } 216 }
218 return csum; 217 return csum;
219} 218}
@@ -248,9 +247,9 @@ module_exit(ipv4_netfilter_fini);
248 247
249#ifdef CONFIG_SYSCTL 248#ifdef CONFIG_SYSCTL
250struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { 249struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
251 { .procname = "net", .ctl_name = CTL_NET, }, 250 { .procname = "net", },
252 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 251 { .procname = "ipv4", },
253 { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, }, 252 { .procname = "netfilter", },
254 { } 253 { }
255}; 254};
256EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); 255EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..babd1a2bae5f 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -147,7 +147,7 @@ config IP_NF_TARGET_ULOG
147 which can only be viewed through syslog. 147 which can only be viewed through syslog.
148 148
149 The appropriate userspace logging daemon (ulogd) may be obtained from 149 The appropriate userspace logging daemon (ulogd) may be obtained from
150 <http://www.gnumonks.org/projects/ulogd/> 150 <http://www.netfilter.org/projects/ulogd/index.html>
151 151
152 To compile it as a module, choose M here. If unsure, say N. 152 To compile it as a module, choose M here. If unsure, say N.
153 153
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
324 324
325config IP_NF_TARGET_TTL 325config IP_NF_TARGET_TTL
326 tristate '"TTL" target support' 326 tristate '"TTL" target support'
327 depends on NETFILTER_ADVANCED 327 depends on NETFILTER_ADVANCED && IP_NF_MANGLE
328 select NETFILTER_XT_TARGET_HL 328 select NETFILTER_XT_TARGET_HL
329 ---help--- 329 ---help---
330 This is a backwards-compat option for the user's convenience 330 This is a backwards-compatible option for the user's convenience
331 (e.g. when running oldconfig). It selects 331 (e.g. when running oldconfig). It selects
332 CONFIG_NETFILTER_XT_TARGET_HL. 332 CONFIG_NETFILTER_XT_TARGET_HL.
333 333
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 48111594ee9b..19eb59d01037 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,15 +3,15 @@
3# 3#
4 4
5# objects for l3 independent conntrack 5# objects for l3 independent conntrack
6nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o 6nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) 7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
8ifeq ($(CONFIG_PROC_FS),y) 8ifeq ($(CONFIG_PROC_FS),y)
9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o 9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o 13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o 14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15 15
16# connection tracking 16# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 7505dff4ffdf..e855fffaed95 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -8,7 +8,7 @@
8 * Copyright (C) 2002 David S. Miller (davem@redhat.com) 8 * Copyright (C) 2002 David S. Miller (davem@redhat.com)
9 * 9 *
10 */ 10 */
11 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/skbuff.h> 13#include <linux/skbuff.h>
14#include <linux/netdevice.h> 14#include <linux/netdevice.h>
@@ -27,6 +27,7 @@
27 27
28#include <linux/netfilter/x_tables.h> 28#include <linux/netfilter/x_tables.h>
29#include <linux/netfilter_arp/arp_tables.h> 29#include <linux/netfilter_arp/arp_tables.h>
30#include "../../netfilter/xt_repldata.h"
30 31
31MODULE_LICENSE("GPL"); 32MODULE_LICENSE("GPL");
32MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); 33MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -48,16 +49,17 @@ MODULE_DESCRIPTION("arptables core");
48#endif 49#endif
49 50
50#ifdef CONFIG_NETFILTER_DEBUG 51#ifdef CONFIG_NETFILTER_DEBUG
51#define ARP_NF_ASSERT(x) \ 52#define ARP_NF_ASSERT(x) WARN_ON(!(x))
52do { \
53 if (!(x)) \
54 printk("ARP_NF_ASSERT: %s:%s:%u\n", \
55 __func__, __FILE__, __LINE__); \
56} while(0)
57#else 53#else
58#define ARP_NF_ASSERT(x) 54#define ARP_NF_ASSERT(x)
59#endif 55#endif
60 56
57void *arpt_alloc_initial_table(const struct xt_table *info)
58{
59 return xt_alloc_initial_table(arpt, ARPT);
60}
61EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
62
61static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, 63static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
62 const char *hdr_addr, int len) 64 const char *hdr_addr, int len)
63{ 65{
@@ -70,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
70 for (i = 0; i < len; i++) 72 for (i = 0; i < len; i++)
71 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; 73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
72 74
73 return (ret != 0); 75 return ret != 0;
74} 76}
75 77
76/* 78/*
@@ -217,16 +219,23 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
217} 219}
218 220
219static unsigned int 221static unsigned int
220arpt_error(struct sk_buff *skb, const struct xt_target_param *par) 222arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
221{ 223{
222 if (net_ratelimit()) 224 if (net_ratelimit())
223 printk("arp_tables: error: '%s'\n", 225 pr_err("arp_tables: error: '%s'\n",
224 (const char *)par->targinfo); 226 (const char *)par->targinfo);
225 227
226 return NF_DROP; 228 return NF_DROP;
227} 229}
228 230
229static inline struct arpt_entry *get_entry(void *base, unsigned int offset) 231static inline const struct xt_entry_target *
232arpt_get_target_c(const struct arpt_entry *e)
233{
234 return arpt_get_target((struct arpt_entry *)e);
235}
236
237static inline struct arpt_entry *
238get_entry(const void *base, unsigned int offset)
230{ 239{
231 return (struct arpt_entry *)(base + offset); 240 return (struct arpt_entry *)(base + offset);
232} 241}
@@ -246,12 +255,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
246 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); 255 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
247 unsigned int verdict = NF_DROP; 256 unsigned int verdict = NF_DROP;
248 const struct arphdr *arp; 257 const struct arphdr *arp;
249 bool hotdrop = false;
250 struct arpt_entry *e, *back; 258 struct arpt_entry *e, *back;
251 const char *indev, *outdev; 259 const char *indev, *outdev;
252 void *table_base; 260 void *table_base;
253 const struct xt_table_info *private; 261 const struct xt_table_info *private;
254 struct xt_target_param tgpar; 262 struct xt_action_param acpar;
255 263
256 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) 264 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
257 return NF_DROP; 265 return NF_DROP;
@@ -266,35 +274,33 @@ unsigned int arpt_do_table(struct sk_buff *skb,
266 e = get_entry(table_base, private->hook_entry[hook]); 274 e = get_entry(table_base, private->hook_entry[hook]);
267 back = get_entry(table_base, private->underflow[hook]); 275 back = get_entry(table_base, private->underflow[hook]);
268 276
269 tgpar.in = in; 277 acpar.in = in;
270 tgpar.out = out; 278 acpar.out = out;
271 tgpar.hooknum = hook; 279 acpar.hooknum = hook;
272 tgpar.family = NFPROTO_ARP; 280 acpar.family = NFPROTO_ARP;
281 acpar.hotdrop = false;
273 282
274 arp = arp_hdr(skb); 283 arp = arp_hdr(skb);
275 do { 284 do {
276 struct arpt_entry_target *t; 285 const struct xt_entry_target *t;
277 int hdr_len;
278 286
279 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { 287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
280 e = arpt_next_entry(e); 288 e = arpt_next_entry(e);
281 continue; 289 continue;
282 } 290 }
283 291
284 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + 292 ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
285 (2 * skb->dev->addr_len);
286 ADD_COUNTER(e->counters, hdr_len, 1);
287 293
288 t = arpt_get_target(e); 294 t = arpt_get_target_c(e);
289 295
290 /* Standard target? */ 296 /* Standard target? */
291 if (!t->u.kernel.target->target) { 297 if (!t->u.kernel.target->target) {
292 int v; 298 int v;
293 299
294 v = ((struct arpt_standard_target *)t)->verdict; 300 v = ((struct xt_standard_target *)t)->verdict;
295 if (v < 0) { 301 if (v < 0) {
296 /* Pop from stack? */ 302 /* Pop from stack? */
297 if (v != ARPT_RETURN) { 303 if (v != XT_RETURN) {
298 verdict = (unsigned)(-v) - 1; 304 verdict = (unsigned)(-v) - 1;
299 break; 305 break;
300 } 306 }
@@ -319,43 +325,39 @@ unsigned int arpt_do_table(struct sk_buff *skb,
319 /* Targets which reenter must return 325 /* Targets which reenter must return
320 * abs. verdicts 326 * abs. verdicts
321 */ 327 */
322 tgpar.target = t->u.kernel.target; 328 acpar.target = t->u.kernel.target;
323 tgpar.targinfo = t->data; 329 acpar.targinfo = t->data;
324 verdict = t->u.kernel.target->target(skb, &tgpar); 330 verdict = t->u.kernel.target->target(skb, &acpar);
325 331
326 /* Target might have changed stuff. */ 332 /* Target might have changed stuff. */
327 arp = arp_hdr(skb); 333 arp = arp_hdr(skb);
328 334
329 if (verdict == ARPT_CONTINUE) 335 if (verdict == XT_CONTINUE)
330 e = arpt_next_entry(e); 336 e = arpt_next_entry(e);
331 else 337 else
332 /* Verdict */ 338 /* Verdict */
333 break; 339 break;
334 } while (!hotdrop); 340 } while (!acpar.hotdrop);
335 xt_info_rdunlock_bh(); 341 xt_info_rdunlock_bh();
336 342
337 if (hotdrop) 343 if (acpar.hotdrop)
338 return NF_DROP; 344 return NF_DROP;
339 else 345 else
340 return verdict; 346 return verdict;
341} 347}
342 348
343/* All zeroes == unconditional rule. */ 349/* All zeroes == unconditional rule. */
344static inline int unconditional(const struct arpt_arp *arp) 350static inline bool unconditional(const struct arpt_arp *arp)
345{ 351{
346 unsigned int i; 352 static const struct arpt_arp uncond;
347 353
348 for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++) 354 return memcmp(arp, &uncond, sizeof(uncond)) == 0;
349 if (((__u32 *)arp)[i])
350 return 0;
351
352 return 1;
353} 355}
354 356
355/* Figures out from what hook each rule can be called: returns 0 if 357/* Figures out from what hook each rule can be called: returns 0 if
356 * there are loops. Puts hook bitmask in comefrom. 358 * there are loops. Puts hook bitmask in comefrom.
357 */ 359 */
358static int mark_source_chains(struct xt_table_info *newinfo, 360static int mark_source_chains(const struct xt_table_info *newinfo,
359 unsigned int valid_hooks, void *entry0) 361 unsigned int valid_hooks, void *entry0)
360{ 362{
361 unsigned int hook; 363 unsigned int hook;
@@ -375,12 +377,12 @@ static int mark_source_chains(struct xt_table_info *newinfo,
375 e->counters.pcnt = pos; 377 e->counters.pcnt = pos;
376 378
377 for (;;) { 379 for (;;) {
378 const struct arpt_standard_target *t 380 const struct xt_standard_target *t
379 = (void *)arpt_get_target(e); 381 = (void *)arpt_get_target_c(e);
380 int visited = e->comefrom & (1 << hook); 382 int visited = e->comefrom & (1 << hook);
381 383
382 if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { 384 if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
383 printk("arptables: loop hook %u pos %u %08X.\n", 385 pr_notice("arptables: loop hook %u pos %u %08X.\n",
384 hook, pos, e->comefrom); 386 hook, pos, e->comefrom);
385 return 0; 387 return 0;
386 } 388 }
@@ -388,15 +390,15 @@ static int mark_source_chains(struct xt_table_info *newinfo,
388 |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); 390 |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
389 391
390 /* Unconditional return/END. */ 392 /* Unconditional return/END. */
391 if ((e->target_offset == sizeof(struct arpt_entry) 393 if ((e->target_offset == sizeof(struct arpt_entry) &&
392 && (strcmp(t->target.u.user.name, 394 (strcmp(t->target.u.user.name,
393 ARPT_STANDARD_TARGET) == 0) 395 XT_STANDARD_TARGET) == 0) &&
394 && t->verdict < 0 396 t->verdict < 0 && unconditional(&e->arp)) ||
395 && unconditional(&e->arp)) || visited) { 397 visited) {
396 unsigned int oldpos, size; 398 unsigned int oldpos, size;
397 399
398 if ((strcmp(t->target.u.user.name, 400 if ((strcmp(t->target.u.user.name,
399 ARPT_STANDARD_TARGET) == 0) && 401 XT_STANDARD_TARGET) == 0) &&
400 t->verdict < -NF_MAX_VERDICT - 1) { 402 t->verdict < -NF_MAX_VERDICT - 1) {
401 duprintf("mark_source_chains: bad " 403 duprintf("mark_source_chains: bad "
402 "negative verdict (%i)\n", 404 "negative verdict (%i)\n",
@@ -431,8 +433,8 @@ static int mark_source_chains(struct xt_table_info *newinfo,
431 int newpos = t->verdict; 433 int newpos = t->verdict;
432 434
433 if (strcmp(t->target.u.user.name, 435 if (strcmp(t->target.u.user.name,
434 ARPT_STANDARD_TARGET) == 0 436 XT_STANDARD_TARGET) == 0 &&
435 && newpos >= 0) { 437 newpos >= 0) {
436 if (newpos > newinfo->size - 438 if (newpos > newinfo->size -
437 sizeof(struct arpt_entry)) { 439 sizeof(struct arpt_entry)) {
438 duprintf("mark_source_chains: " 440 duprintf("mark_source_chains: "
@@ -460,19 +462,19 @@ static int mark_source_chains(struct xt_table_info *newinfo,
460 return 1; 462 return 1;
461} 463}
462 464
463static inline int check_entry(struct arpt_entry *e, const char *name) 465static inline int check_entry(const struct arpt_entry *e, const char *name)
464{ 466{
465 const struct arpt_entry_target *t; 467 const struct xt_entry_target *t;
466 468
467 if (!arp_checkentry(&e->arp)) { 469 if (!arp_checkentry(&e->arp)) {
468 duprintf("arp_tables: arp check failed %p %s.\n", e, name); 470 duprintf("arp_tables: arp check failed %p %s.\n", e, name);
469 return -EINVAL; 471 return -EINVAL;
470 } 472 }
471 473
472 if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) 474 if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
473 return -EINVAL; 475 return -EINVAL;
474 476
475 t = arpt_get_target(e); 477 t = arpt_get_target_c(e);
476 if (e->target_offset + t->u.target_size > e->next_offset) 478 if (e->target_offset + t->u.target_size > e->next_offset)
477 return -EINVAL; 479 return -EINVAL;
478 480
@@ -481,7 +483,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name)
481 483
482static inline int check_target(struct arpt_entry *e, const char *name) 484static inline int check_target(struct arpt_entry *e, const char *name)
483{ 485{
484 struct arpt_entry_target *t = arpt_get_target(e); 486 struct xt_entry_target *t = arpt_get_target(e);
485 int ret; 487 int ret;
486 struct xt_tgchk_param par = { 488 struct xt_tgchk_param par = {
487 .table = name, 489 .table = name,
@@ -502,10 +504,9 @@ static inline int check_target(struct arpt_entry *e, const char *name)
502} 504}
503 505
504static inline int 506static inline int
505find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, 507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
506 unsigned int *i)
507{ 508{
508 struct arpt_entry_target *t; 509 struct xt_entry_target *t;
509 struct xt_target *target; 510 struct xt_target *target;
510 int ret; 511 int ret;
511 512
@@ -514,13 +515,11 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
514 return ret; 515 return ret;
515 516
516 t = arpt_get_target(e); 517 t = arpt_get_target(e);
517 target = try_then_request_module(xt_find_target(NFPROTO_ARP, 518 target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
518 t->u.user.name, 519 t->u.user.revision);
519 t->u.user.revision), 520 if (IS_ERR(target)) {
520 "arpt_%s", t->u.user.name);
521 if (IS_ERR(target) || !target) {
522 duprintf("find_check_entry: `%s' not found\n", t->u.user.name); 521 duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
523 ret = target ? PTR_ERR(target) : -ENOENT; 522 ret = PTR_ERR(target);
524 goto out; 523 goto out;
525 } 524 }
526 t->u.kernel.target = target; 525 t->u.kernel.target = target;
@@ -528,8 +527,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
528 ret = check_target(e, name); 527 ret = check_target(e, name);
529 if (ret) 528 if (ret)
530 goto err; 529 goto err;
531
532 (*i)++;
533 return 0; 530 return 0;
534err: 531err:
535 module_put(t->u.kernel.target->me); 532 module_put(t->u.kernel.target->me);
@@ -537,24 +534,39 @@ out:
537 return ret; 534 return ret;
538} 535}
539 536
537static bool check_underflow(const struct arpt_entry *e)
538{
539 const struct xt_entry_target *t;
540 unsigned int verdict;
541
542 if (!unconditional(&e->arp))
543 return false;
544 t = arpt_get_target_c(e);
545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
546 return false;
547 verdict = ((struct xt_standard_target *)t)->verdict;
548 verdict = -verdict - 1;
549 return verdict == NF_DROP || verdict == NF_ACCEPT;
550}
551
540static inline int check_entry_size_and_hooks(struct arpt_entry *e, 552static inline int check_entry_size_and_hooks(struct arpt_entry *e,
541 struct xt_table_info *newinfo, 553 struct xt_table_info *newinfo,
542 unsigned char *base, 554 const unsigned char *base,
543 unsigned char *limit, 555 const unsigned char *limit,
544 const unsigned int *hook_entries, 556 const unsigned int *hook_entries,
545 const unsigned int *underflows, 557 const unsigned int *underflows,
546 unsigned int *i) 558 unsigned int valid_hooks)
547{ 559{
548 unsigned int h; 560 unsigned int h;
549 561
550 if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 562 if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
551 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { 563 (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
552 duprintf("Bad offset %p\n", e); 564 duprintf("Bad offset %p\n", e);
553 return -EINVAL; 565 return -EINVAL;
554 } 566 }
555 567
556 if (e->next_offset 568 if (e->next_offset
557 < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { 569 < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
558 duprintf("checking: element %p size %u\n", 570 duprintf("checking: element %p size %u\n",
559 e, e->next_offset); 571 e, e->next_offset);
560 return -EINVAL; 572 return -EINVAL;
@@ -562,30 +574,31 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
562 574
563 /* Check hooks & underflows */ 575 /* Check hooks & underflows */
564 for (h = 0; h < NF_ARP_NUMHOOKS; h++) { 576 for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
577 if (!(valid_hooks & (1 << h)))
578 continue;
565 if ((unsigned char *)e - base == hook_entries[h]) 579 if ((unsigned char *)e - base == hook_entries[h])
566 newinfo->hook_entry[h] = hook_entries[h]; 580 newinfo->hook_entry[h] = hook_entries[h];
567 if ((unsigned char *)e - base == underflows[h]) 581 if ((unsigned char *)e - base == underflows[h]) {
582 if (!check_underflow(e)) {
583 pr_err("Underflows must be unconditional and "
584 "use the STANDARD target with "
585 "ACCEPT/DROP\n");
586 return -EINVAL;
587 }
568 newinfo->underflow[h] = underflows[h]; 588 newinfo->underflow[h] = underflows[h];
589 }
569 } 590 }
570 591
571 /* FIXME: underflows must be unconditional, standard verdicts
572 < 0 (not ARPT_RETURN). --RR */
573
574 /* Clear counters and comefrom */ 592 /* Clear counters and comefrom */
575 e->counters = ((struct xt_counters) { 0, 0 }); 593 e->counters = ((struct xt_counters) { 0, 0 });
576 e->comefrom = 0; 594 e->comefrom = 0;
577
578 (*i)++;
579 return 0; 595 return 0;
580} 596}
581 597
582static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) 598static inline void cleanup_entry(struct arpt_entry *e)
583{ 599{
584 struct xt_tgdtor_param par; 600 struct xt_tgdtor_param par;
585 struct arpt_entry_target *t; 601 struct xt_entry_target *t;
586
587 if (i && (*i)-- == 0)
588 return 1;
589 602
590 t = arpt_get_target(e); 603 t = arpt_get_target(e);
591 par.target = t->u.kernel.target; 604 par.target = t->u.kernel.target;
@@ -594,26 +607,20 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
594 if (par.target->destroy != NULL) 607 if (par.target->destroy != NULL)
595 par.target->destroy(&par); 608 par.target->destroy(&par);
596 module_put(par.target->me); 609 module_put(par.target->me);
597 return 0;
598} 610}
599 611
600/* Checks and translates the user-supplied table segment (held in 612/* Checks and translates the user-supplied table segment (held in
601 * newinfo). 613 * newinfo).
602 */ 614 */
603static int translate_table(const char *name, 615static int translate_table(struct xt_table_info *newinfo, void *entry0,
604 unsigned int valid_hooks, 616 const struct arpt_replace *repl)
605 struct xt_table_info *newinfo,
606 void *entry0,
607 unsigned int size,
608 unsigned int number,
609 const unsigned int *hook_entries,
610 const unsigned int *underflows)
611{ 617{
618 struct arpt_entry *iter;
612 unsigned int i; 619 unsigned int i;
613 int ret; 620 int ret = 0;
614 621
615 newinfo->size = size; 622 newinfo->size = repl->size;
616 newinfo->number = number; 623 newinfo->number = repl->num_entries;
617 624
618 /* Init all hooks to impossible value. */ 625 /* Init all hooks to impossible value. */
619 for (i = 0; i < NF_ARP_NUMHOOKS; i++) { 626 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
@@ -625,52 +632,66 @@ static int translate_table(const char *name,
625 i = 0; 632 i = 0;
626 633
627 /* Walk through entries, checking offsets. */ 634 /* Walk through entries, checking offsets. */
628 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, 635 xt_entry_foreach(iter, entry0, newinfo->size) {
629 check_entry_size_and_hooks, 636 ret = check_entry_size_and_hooks(iter, newinfo, entry0,
630 newinfo, 637 entry0 + repl->size,
631 entry0, 638 repl->hook_entry,
632 entry0 + size, 639 repl->underflow,
633 hook_entries, underflows, &i); 640 repl->valid_hooks);
641 if (ret != 0)
642 break;
643 ++i;
644 if (strcmp(arpt_get_target(iter)->u.user.name,
645 XT_ERROR_TARGET) == 0)
646 ++newinfo->stacksize;
647 }
634 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); 648 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
635 if (ret != 0) 649 if (ret != 0)
636 return ret; 650 return ret;
637 651
638 if (i != number) { 652 if (i != repl->num_entries) {
639 duprintf("translate_table: %u not %u entries\n", 653 duprintf("translate_table: %u not %u entries\n",
640 i, number); 654 i, repl->num_entries);
641 return -EINVAL; 655 return -EINVAL;
642 } 656 }
643 657
644 /* Check hooks all assigned */ 658 /* Check hooks all assigned */
645 for (i = 0; i < NF_ARP_NUMHOOKS; i++) { 659 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
646 /* Only hooks which are valid */ 660 /* Only hooks which are valid */
647 if (!(valid_hooks & (1 << i))) 661 if (!(repl->valid_hooks & (1 << i)))
648 continue; 662 continue;
649 if (newinfo->hook_entry[i] == 0xFFFFFFFF) { 663 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
650 duprintf("Invalid hook entry %u %u\n", 664 duprintf("Invalid hook entry %u %u\n",
651 i, hook_entries[i]); 665 i, repl->hook_entry[i]);
652 return -EINVAL; 666 return -EINVAL;
653 } 667 }
654 if (newinfo->underflow[i] == 0xFFFFFFFF) { 668 if (newinfo->underflow[i] == 0xFFFFFFFF) {
655 duprintf("Invalid underflow %u %u\n", 669 duprintf("Invalid underflow %u %u\n",
656 i, underflows[i]); 670 i, repl->underflow[i]);
657 return -EINVAL; 671 return -EINVAL;
658 } 672 }
659 } 673 }
660 674
661 if (!mark_source_chains(newinfo, valid_hooks, entry0)) { 675 if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
662 duprintf("Looping hook\n"); 676 duprintf("Looping hook\n");
663 return -ELOOP; 677 return -ELOOP;
664 } 678 }
665 679
666 /* Finally, each sanity check must pass */ 680 /* Finally, each sanity check must pass */
667 i = 0; 681 i = 0;
668 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, 682 xt_entry_foreach(iter, entry0, newinfo->size) {
669 find_check_entry, name, size, &i); 683 ret = find_check_entry(iter, repl->name, repl->size);
684 if (ret != 0)
685 break;
686 ++i;
687 }
670 688
671 if (ret != 0) { 689 if (ret != 0) {
672 ARPT_ENTRY_ITERATE(entry0, newinfo->size, 690 xt_entry_foreach(iter, entry0, newinfo->size) {
673 cleanup_entry, &i); 691 if (i-- == 0)
692 break;
693 cleanup_entry(iter);
694 }
674 return ret; 695 return ret;
675 } 696 }
676 697
@@ -683,78 +704,45 @@ static int translate_table(const char *name,
683 return ret; 704 return ret;
684} 705}
685 706
686/* Gets counters. */
687static inline int add_entry_to_counter(const struct arpt_entry *e,
688 struct xt_counters total[],
689 unsigned int *i)
690{
691 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
692
693 (*i)++;
694 return 0;
695}
696
697static inline int set_entry_to_counter(const struct arpt_entry *e,
698 struct xt_counters total[],
699 unsigned int *i)
700{
701 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
702
703 (*i)++;
704 return 0;
705}
706
707static void get_counters(const struct xt_table_info *t, 707static void get_counters(const struct xt_table_info *t,
708 struct xt_counters counters[]) 708 struct xt_counters counters[])
709{ 709{
710 struct arpt_entry *iter;
710 unsigned int cpu; 711 unsigned int cpu;
711 unsigned int i; 712 unsigned int i;
712 unsigned int curcpu;
713
714 /* Instead of clearing (by a previous call to memset())
715 * the counters and using adds, we set the counters
716 * with data used by 'current' CPU
717 *
718 * Bottom half has to be disabled to prevent deadlock
719 * if new softirq were to run and call ipt_do_table
720 */
721 local_bh_disable();
722 curcpu = smp_processor_id();
723
724 i = 0;
725 ARPT_ENTRY_ITERATE(t->entries[curcpu],
726 t->size,
727 set_entry_to_counter,
728 counters,
729 &i);
730 713
731 for_each_possible_cpu(cpu) { 714 for_each_possible_cpu(cpu) {
732 if (cpu == curcpu) 715 seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
733 continue; 716
734 i = 0; 717 i = 0;
735 xt_info_wrlock(cpu); 718 xt_entry_foreach(iter, t->entries[cpu], t->size) {
736 ARPT_ENTRY_ITERATE(t->entries[cpu], 719 u64 bcnt, pcnt;
737 t->size, 720 unsigned int start;
738 add_entry_to_counter, 721
739 counters, 722 do {
740 &i); 723 start = read_seqbegin(lock);
741 xt_info_wrunlock(cpu); 724 bcnt = iter->counters.bcnt;
725 pcnt = iter->counters.pcnt;
726 } while (read_seqretry(lock, start));
727
728 ADD_COUNTER(counters[i], bcnt, pcnt);
729 ++i;
730 }
742 } 731 }
743 local_bh_enable();
744} 732}
745 733
746static struct xt_counters *alloc_counters(struct xt_table *table) 734static struct xt_counters *alloc_counters(const struct xt_table *table)
747{ 735{
748 unsigned int countersize; 736 unsigned int countersize;
749 struct xt_counters *counters; 737 struct xt_counters *counters;
750 struct xt_table_info *private = table->private; 738 const struct xt_table_info *private = table->private;
751 739
752 /* We need atomic snapshot of counters: rest doesn't change 740 /* We need atomic snapshot of counters: rest doesn't change
753 * (other than comefrom, which userspace doesn't care 741 * (other than comefrom, which userspace doesn't care
754 * about). 742 * about).
755 */ 743 */
756 countersize = sizeof(struct xt_counters) * private->number; 744 countersize = sizeof(struct xt_counters) * private->number;
757 counters = vmalloc_node(countersize, numa_node_id()); 745 counters = vzalloc(countersize);
758 746
759 if (counters == NULL) 747 if (counters == NULL)
760 return ERR_PTR(-ENOMEM); 748 return ERR_PTR(-ENOMEM);
@@ -765,11 +753,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
765} 753}
766 754
767static int copy_entries_to_user(unsigned int total_size, 755static int copy_entries_to_user(unsigned int total_size,
768 struct xt_table *table, 756 const struct xt_table *table,
769 void __user *userptr) 757 void __user *userptr)
770{ 758{
771 unsigned int off, num; 759 unsigned int off, num;
772 struct arpt_entry *e; 760 const struct arpt_entry *e;
773 struct xt_counters *counters; 761 struct xt_counters *counters;
774 struct xt_table_info *private = table->private; 762 struct xt_table_info *private = table->private;
775 int ret = 0; 763 int ret = 0;
@@ -789,7 +777,7 @@ static int copy_entries_to_user(unsigned int total_size,
789 /* FIXME: use iterator macros --RR */ 777 /* FIXME: use iterator macros --RR */
790 /* ... then go back and fix counters and names */ 778 /* ... then go back and fix counters and names */
791 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 779 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
792 struct arpt_entry_target *t; 780 const struct xt_entry_target *t;
793 781
794 e = (struct arpt_entry *)(loc_cpu_entry + off); 782 e = (struct arpt_entry *)(loc_cpu_entry + off);
795 if (copy_to_user(userptr + off 783 if (copy_to_user(userptr + off
@@ -800,9 +788,9 @@ static int copy_entries_to_user(unsigned int total_size,
800 goto free_counters; 788 goto free_counters;
801 } 789 }
802 790
803 t = arpt_get_target(e); 791 t = arpt_get_target_c(e);
804 if (copy_to_user(userptr + off + e->target_offset 792 if (copy_to_user(userptr + off + e->target_offset
805 + offsetof(struct arpt_entry_target, 793 + offsetof(struct xt_entry_target,
806 u.user.name), 794 u.user.name),
807 t->u.kernel.target->name, 795 t->u.kernel.target->name,
808 strlen(t->u.kernel.target->name)+1) != 0) { 796 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -817,7 +805,7 @@ static int copy_entries_to_user(unsigned int total_size,
817} 805}
818 806
819#ifdef CONFIG_COMPAT 807#ifdef CONFIG_COMPAT
820static void compat_standard_from_user(void *dst, void *src) 808static void compat_standard_from_user(void *dst, const void *src)
821{ 809{
822 int v = *(compat_int_t *)src; 810 int v = *(compat_int_t *)src;
823 811
@@ -826,7 +814,7 @@ static void compat_standard_from_user(void *dst, void *src)
826 memcpy(dst, &v, sizeof(v)); 814 memcpy(dst, &v, sizeof(v));
827} 815}
828 816
829static int compat_standard_to_user(void __user *dst, void *src) 817static int compat_standard_to_user(void __user *dst, const void *src)
830{ 818{
831 compat_int_t cv = *(int *)src; 819 compat_int_t cv = *(int *)src;
832 820
@@ -835,18 +823,18 @@ static int compat_standard_to_user(void __user *dst, void *src)
835 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; 823 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
836} 824}
837 825
838static int compat_calc_entry(struct arpt_entry *e, 826static int compat_calc_entry(const struct arpt_entry *e,
839 const struct xt_table_info *info, 827 const struct xt_table_info *info,
840 void *base, struct xt_table_info *newinfo) 828 const void *base, struct xt_table_info *newinfo)
841{ 829{
842 struct arpt_entry_target *t; 830 const struct xt_entry_target *t;
843 unsigned int entry_offset; 831 unsigned int entry_offset;
844 int off, i, ret; 832 int off, i, ret;
845 833
846 off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); 834 off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
847 entry_offset = (void *)e - base; 835 entry_offset = (void *)e - base;
848 836
849 t = arpt_get_target(e); 837 t = arpt_get_target_c(e);
850 off += xt_compat_target_offset(t->u.kernel.target); 838 off += xt_compat_target_offset(t->u.kernel.target);
851 newinfo->size -= off; 839 newinfo->size -= off;
852 ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); 840 ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
@@ -867,7 +855,9 @@ static int compat_calc_entry(struct arpt_entry *e,
867static int compat_table_info(const struct xt_table_info *info, 855static int compat_table_info(const struct xt_table_info *info,
868 struct xt_table_info *newinfo) 856 struct xt_table_info *newinfo)
869{ 857{
858 struct arpt_entry *iter;
870 void *loc_cpu_entry; 859 void *loc_cpu_entry;
860 int ret;
871 861
872 if (!newinfo || !info) 862 if (!newinfo || !info)
873 return -EINVAL; 863 return -EINVAL;
@@ -876,15 +866,19 @@ static int compat_table_info(const struct xt_table_info *info,
876 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
877 newinfo->initial_entries = 0; 867 newinfo->initial_entries = 0;
878 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 868 loc_cpu_entry = info->entries[raw_smp_processor_id()];
879 return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size, 869 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
880 compat_calc_entry, info, loc_cpu_entry, 870 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
881 newinfo); 871 if (ret != 0)
872 return ret;
873 }
874 return 0;
882} 875}
883#endif 876#endif
884 877
885static int get_info(struct net *net, void __user *user, int *len, int compat) 878static int get_info(struct net *net, void __user *user,
879 const int *len, int compat)
886{ 880{
887 char name[ARPT_TABLE_MAXNAMELEN]; 881 char name[XT_TABLE_MAXNAMELEN];
888 struct xt_table *t; 882 struct xt_table *t;
889 int ret; 883 int ret;
890 884
@@ -897,7 +891,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
897 if (copy_from_user(name, user, sizeof(name)) != 0) 891 if (copy_from_user(name, user, sizeof(name)) != 0)
898 return -EFAULT; 892 return -EFAULT;
899 893
900 name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; 894 name[XT_TABLE_MAXNAMELEN-1] = '\0';
901#ifdef CONFIG_COMPAT 895#ifdef CONFIG_COMPAT
902 if (compat) 896 if (compat)
903 xt_compat_lock(NFPROTO_ARP); 897 xt_compat_lock(NFPROTO_ARP);
@@ -907,15 +901,16 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
907 if (t && !IS_ERR(t)) { 901 if (t && !IS_ERR(t)) {
908 struct arpt_getinfo info; 902 struct arpt_getinfo info;
909 const struct xt_table_info *private = t->private; 903 const struct xt_table_info *private = t->private;
910
911#ifdef CONFIG_COMPAT 904#ifdef CONFIG_COMPAT
905 struct xt_table_info tmp;
906
912 if (compat) { 907 if (compat) {
913 struct xt_table_info tmp;
914 ret = compat_table_info(private, &tmp); 908 ret = compat_table_info(private, &tmp);
915 xt_compat_flush_offsets(NFPROTO_ARP); 909 xt_compat_flush_offsets(NFPROTO_ARP);
916 private = &tmp; 910 private = &tmp;
917 } 911 }
918#endif 912#endif
913 memset(&info, 0, sizeof(info));
919 info.valid_hooks = t->valid_hooks; 914 info.valid_hooks = t->valid_hooks;
920 memcpy(info.hook_entry, private->hook_entry, 915 memcpy(info.hook_entry, private->hook_entry,
921 sizeof(info.hook_entry)); 916 sizeof(info.hook_entry));
@@ -941,7 +936,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
941} 936}
942 937
943static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, 938static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
944 int *len) 939 const int *len)
945{ 940{
946 int ret; 941 int ret;
947 struct arpt_get_entries get; 942 struct arpt_get_entries get;
@@ -992,10 +987,10 @@ static int __do_replace(struct net *net, const char *name,
992 struct xt_table_info *oldinfo; 987 struct xt_table_info *oldinfo;
993 struct xt_counters *counters; 988 struct xt_counters *counters;
994 void *loc_cpu_old_entry; 989 void *loc_cpu_old_entry;
990 struct arpt_entry *iter;
995 991
996 ret = 0; 992 ret = 0;
997 counters = vmalloc_node(num_counters * sizeof(struct xt_counters), 993 counters = vzalloc(num_counters * sizeof(struct xt_counters));
998 numa_node_id());
999 if (!counters) { 994 if (!counters) {
1000 ret = -ENOMEM; 995 ret = -ENOMEM;
1001 goto out; 996 goto out;
@@ -1035,8 +1030,8 @@ static int __do_replace(struct net *net, const char *name,
1035 1030
1036 /* Decrease module usage counts and free resource */ 1031 /* Decrease module usage counts and free resource */
1037 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1032 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1038 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1033 xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
1039 NULL); 1034 cleanup_entry(iter);
1040 1035
1041 xt_free_table_info(oldinfo); 1036 xt_free_table_info(oldinfo);
1042 if (copy_to_user(counters_ptr, counters, 1037 if (copy_to_user(counters_ptr, counters,
@@ -1055,12 +1050,14 @@ static int __do_replace(struct net *net, const char *name,
1055 return ret; 1050 return ret;
1056} 1051}
1057 1052
1058static int do_replace(struct net *net, void __user *user, unsigned int len) 1053static int do_replace(struct net *net, const void __user *user,
1054 unsigned int len)
1059{ 1055{
1060 int ret; 1056 int ret;
1061 struct arpt_replace tmp; 1057 struct arpt_replace tmp;
1062 struct xt_table_info *newinfo; 1058 struct xt_table_info *newinfo;
1063 void *loc_cpu_entry; 1059 void *loc_cpu_entry;
1060 struct arpt_entry *iter;
1064 1061
1065 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1062 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1066 return -EFAULT; 1063 return -EFAULT;
@@ -1081,9 +1078,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1081 goto free_newinfo; 1078 goto free_newinfo;
1082 } 1079 }
1083 1080
1084 ret = translate_table(tmp.name, tmp.valid_hooks, 1081 ret = translate_table(newinfo, loc_cpu_entry, &tmp);
1085 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1086 tmp.hook_entry, tmp.underflow);
1087 if (ret != 0) 1082 if (ret != 0)
1088 goto free_newinfo; 1083 goto free_newinfo;
1089 1084
@@ -1096,27 +1091,15 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1096 return 0; 1091 return 0;
1097 1092
1098 free_newinfo_untrans: 1093 free_newinfo_untrans:
1099 ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1094 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1095 cleanup_entry(iter);
1100 free_newinfo: 1096 free_newinfo:
1101 xt_free_table_info(newinfo); 1097 xt_free_table_info(newinfo);
1102 return ret; 1098 return ret;
1103} 1099}
1104 1100
1105/* We're lazy, and add to the first CPU; overflow works its fey magic 1101static int do_add_counters(struct net *net, const void __user *user,
1106 * and everything is OK. */ 1102 unsigned int len, int compat)
1107static int
1108add_counter_to_entry(struct arpt_entry *e,
1109 const struct xt_counters addme[],
1110 unsigned int *i)
1111{
1112 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1113
1114 (*i)++;
1115 return 0;
1116}
1117
1118static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1119 int compat)
1120{ 1103{
1121 unsigned int i, curcpu; 1104 unsigned int i, curcpu;
1122 struct xt_counters_info tmp; 1105 struct xt_counters_info tmp;
@@ -1129,6 +1112,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1129 const struct xt_table_info *private; 1112 const struct xt_table_info *private;
1130 int ret = 0; 1113 int ret = 0;
1131 void *loc_cpu_entry; 1114 void *loc_cpu_entry;
1115 struct arpt_entry *iter;
1132#ifdef CONFIG_COMPAT 1116#ifdef CONFIG_COMPAT
1133 struct compat_xt_counters_info compat_tmp; 1117 struct compat_xt_counters_info compat_tmp;
1134 1118
@@ -1159,7 +1143,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1159 if (len != size + num_counters * sizeof(struct xt_counters)) 1143 if (len != size + num_counters * sizeof(struct xt_counters))
1160 return -EINVAL; 1144 return -EINVAL;
1161 1145
1162 paddc = vmalloc_node(len - size, numa_node_id()); 1146 paddc = vmalloc(len - size);
1163 if (!paddc) 1147 if (!paddc)
1164 return -ENOMEM; 1148 return -ENOMEM;
1165 1149
@@ -1186,11 +1170,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1186 curcpu = smp_processor_id(); 1170 curcpu = smp_processor_id();
1187 loc_cpu_entry = private->entries[curcpu]; 1171 loc_cpu_entry = private->entries[curcpu];
1188 xt_info_wrlock(curcpu); 1172 xt_info_wrlock(curcpu);
1189 ARPT_ENTRY_ITERATE(loc_cpu_entry, 1173 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1190 private->size, 1174 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1191 add_counter_to_entry, 1175 ++i;
1192 paddc, 1176 }
1193 &i);
1194 xt_info_wrunlock(curcpu); 1177 xt_info_wrunlock(curcpu);
1195 unlock_up_free: 1178 unlock_up_free:
1196 local_bh_enable(); 1179 local_bh_enable();
@@ -1203,38 +1186,32 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1203} 1186}
1204 1187
1205#ifdef CONFIG_COMPAT 1188#ifdef CONFIG_COMPAT
1206static inline int 1189static inline void compat_release_entry(struct compat_arpt_entry *e)
1207compat_release_entry(struct compat_arpt_entry *e, unsigned int *i)
1208{ 1190{
1209 struct arpt_entry_target *t; 1191 struct xt_entry_target *t;
1210
1211 if (i && (*i)-- == 0)
1212 return 1;
1213 1192
1214 t = compat_arpt_get_target(e); 1193 t = compat_arpt_get_target(e);
1215 module_put(t->u.kernel.target->me); 1194 module_put(t->u.kernel.target->me);
1216 return 0;
1217} 1195}
1218 1196
1219static inline int 1197static inline int
1220check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, 1198check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1221 struct xt_table_info *newinfo, 1199 struct xt_table_info *newinfo,
1222 unsigned int *size, 1200 unsigned int *size,
1223 unsigned char *base, 1201 const unsigned char *base,
1224 unsigned char *limit, 1202 const unsigned char *limit,
1225 unsigned int *hook_entries, 1203 const unsigned int *hook_entries,
1226 unsigned int *underflows, 1204 const unsigned int *underflows,
1227 unsigned int *i,
1228 const char *name) 1205 const char *name)
1229{ 1206{
1230 struct arpt_entry_target *t; 1207 struct xt_entry_target *t;
1231 struct xt_target *target; 1208 struct xt_target *target;
1232 unsigned int entry_offset; 1209 unsigned int entry_offset;
1233 int ret, off, h; 1210 int ret, off, h;
1234 1211
1235 duprintf("check_compat_entry_size_and_hooks %p\n", e); 1212 duprintf("check_compat_entry_size_and_hooks %p\n", e);
1236 if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 1213 if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
1237 || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { 1214 (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
1238 duprintf("Bad offset %p, limit = %p\n", e, limit); 1215 duprintf("Bad offset %p, limit = %p\n", e, limit);
1239 return -EINVAL; 1216 return -EINVAL;
1240 } 1217 }
@@ -1255,14 +1232,12 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1255 entry_offset = (void *)e - (void *)base; 1232 entry_offset = (void *)e - (void *)base;
1256 1233
1257 t = compat_arpt_get_target(e); 1234 t = compat_arpt_get_target(e);
1258 target = try_then_request_module(xt_find_target(NFPROTO_ARP, 1235 target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
1259 t->u.user.name, 1236 t->u.user.revision);
1260 t->u.user.revision), 1237 if (IS_ERR(target)) {
1261 "arpt_%s", t->u.user.name);
1262 if (IS_ERR(target) || !target) {
1263 duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", 1238 duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
1264 t->u.user.name); 1239 t->u.user.name);
1265 ret = target ? PTR_ERR(target) : -ENOENT; 1240 ret = PTR_ERR(target);
1266 goto out; 1241 goto out;
1267 } 1242 }
1268 t->u.kernel.target = target; 1243 t->u.kernel.target = target;
@@ -1284,8 +1259,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1284 /* Clear counters and comefrom */ 1259 /* Clear counters and comefrom */
1285 memset(&e->counters, 0, sizeof(e->counters)); 1260 memset(&e->counters, 0, sizeof(e->counters));
1286 e->comefrom = 0; 1261 e->comefrom = 0;
1287
1288 (*i)++;
1289 return 0; 1262 return 0;
1290 1263
1291release_target: 1264release_target:
@@ -1299,7 +1272,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
1299 unsigned int *size, const char *name, 1272 unsigned int *size, const char *name,
1300 struct xt_table_info *newinfo, unsigned char *base) 1273 struct xt_table_info *newinfo, unsigned char *base)
1301{ 1274{
1302 struct arpt_entry_target *t; 1275 struct xt_entry_target *t;
1303 struct xt_target *target; 1276 struct xt_target *target;
1304 struct arpt_entry *de; 1277 struct arpt_entry *de;
1305 unsigned int origsize; 1278 unsigned int origsize;
@@ -1329,19 +1302,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
1329 return ret; 1302 return ret;
1330} 1303}
1331 1304
1332static inline int compat_check_entry(struct arpt_entry *e, const char *name,
1333 unsigned int *i)
1334{
1335 int ret;
1336
1337 ret = check_target(e, name);
1338 if (ret)
1339 return ret;
1340
1341 (*i)++;
1342 return 0;
1343}
1344
1345static int translate_compat_table(const char *name, 1305static int translate_compat_table(const char *name,
1346 unsigned int valid_hooks, 1306 unsigned int valid_hooks,
1347 struct xt_table_info **pinfo, 1307 struct xt_table_info **pinfo,
@@ -1354,8 +1314,10 @@ static int translate_compat_table(const char *name,
1354 unsigned int i, j; 1314 unsigned int i, j;
1355 struct xt_table_info *newinfo, *info; 1315 struct xt_table_info *newinfo, *info;
1356 void *pos, *entry0, *entry1; 1316 void *pos, *entry0, *entry1;
1317 struct compat_arpt_entry *iter0;
1318 struct arpt_entry *iter1;
1357 unsigned int size; 1319 unsigned int size;
1358 int ret; 1320 int ret = 0;
1359 1321
1360 info = *pinfo; 1322 info = *pinfo;
1361 entry0 = *pentry0; 1323 entry0 = *pentry0;
@@ -1372,13 +1334,17 @@ static int translate_compat_table(const char *name,
1372 j = 0; 1334 j = 0;
1373 xt_compat_lock(NFPROTO_ARP); 1335 xt_compat_lock(NFPROTO_ARP);
1374 /* Walk through entries, checking offsets. */ 1336 /* Walk through entries, checking offsets. */
1375 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, 1337 xt_entry_foreach(iter0, entry0, total_size) {
1376 check_compat_entry_size_and_hooks, 1338 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
1377 info, &size, entry0, 1339 entry0,
1378 entry0 + total_size, 1340 entry0 + total_size,
1379 hook_entries, underflows, &j, name); 1341 hook_entries,
1380 if (ret != 0) 1342 underflows,
1381 goto out_unlock; 1343 name);
1344 if (ret != 0)
1345 goto out_unlock;
1346 ++j;
1347 }
1382 1348
1383 ret = -EINVAL; 1349 ret = -EINVAL;
1384 if (j != number) { 1350 if (j != number) {
@@ -1417,9 +1383,12 @@ static int translate_compat_table(const char *name,
1417 entry1 = newinfo->entries[raw_smp_processor_id()]; 1383 entry1 = newinfo->entries[raw_smp_processor_id()];
1418 pos = entry1; 1384 pos = entry1;
1419 size = total_size; 1385 size = total_size;
1420 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, 1386 xt_entry_foreach(iter0, entry0, total_size) {
1421 compat_copy_entry_from_user, 1387 ret = compat_copy_entry_from_user(iter0, &pos, &size,
1422 &pos, &size, name, newinfo, entry1); 1388 name, newinfo, entry1);
1389 if (ret != 0)
1390 break;
1391 }
1423 xt_compat_flush_offsets(NFPROTO_ARP); 1392 xt_compat_flush_offsets(NFPROTO_ARP);
1424 xt_compat_unlock(NFPROTO_ARP); 1393 xt_compat_unlock(NFPROTO_ARP);
1425 if (ret) 1394 if (ret)
@@ -1430,13 +1399,35 @@ static int translate_compat_table(const char *name,
1430 goto free_newinfo; 1399 goto free_newinfo;
1431 1400
1432 i = 0; 1401 i = 0;
1433 ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, 1402 xt_entry_foreach(iter1, entry1, newinfo->size) {
1434 name, &i); 1403 ret = check_target(iter1, name);
1404 if (ret != 0)
1405 break;
1406 ++i;
1407 if (strcmp(arpt_get_target(iter1)->u.user.name,
1408 XT_ERROR_TARGET) == 0)
1409 ++newinfo->stacksize;
1410 }
1435 if (ret) { 1411 if (ret) {
1412 /*
1413 * The first i matches need cleanup_entry (calls ->destroy)
1414 * because they had called ->check already. The other j-i
1415 * entries need only release.
1416 */
1417 int skip = i;
1436 j -= i; 1418 j -= i;
1437 COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, 1419 xt_entry_foreach(iter0, entry0, newinfo->size) {
1438 compat_release_entry, &j); 1420 if (skip-- > 0)
1439 ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); 1421 continue;
1422 if (j-- == 0)
1423 break;
1424 compat_release_entry(iter0);
1425 }
1426 xt_entry_foreach(iter1, entry1, newinfo->size) {
1427 if (i-- == 0)
1428 break;
1429 cleanup_entry(iter1);
1430 }
1440 xt_free_table_info(newinfo); 1431 xt_free_table_info(newinfo);
1441 return ret; 1432 return ret;
1442 } 1433 }
@@ -1454,7 +1445,11 @@ static int translate_compat_table(const char *name,
1454free_newinfo: 1445free_newinfo:
1455 xt_free_table_info(newinfo); 1446 xt_free_table_info(newinfo);
1456out: 1447out:
1457 COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); 1448 xt_entry_foreach(iter0, entry0, total_size) {
1449 if (j-- == 0)
1450 break;
1451 compat_release_entry(iter0);
1452 }
1458 return ret; 1453 return ret;
1459out_unlock: 1454out_unlock:
1460 xt_compat_flush_offsets(NFPROTO_ARP); 1455 xt_compat_flush_offsets(NFPROTO_ARP);
@@ -1463,7 +1458,7 @@ out_unlock:
1463} 1458}
1464 1459
1465struct compat_arpt_replace { 1460struct compat_arpt_replace {
1466 char name[ARPT_TABLE_MAXNAMELEN]; 1461 char name[XT_TABLE_MAXNAMELEN];
1467 u32 valid_hooks; 1462 u32 valid_hooks;
1468 u32 num_entries; 1463 u32 num_entries;
1469 u32 size; 1464 u32 size;
@@ -1481,6 +1476,7 @@ static int compat_do_replace(struct net *net, void __user *user,
1481 struct compat_arpt_replace tmp; 1476 struct compat_arpt_replace tmp;
1482 struct xt_table_info *newinfo; 1477 struct xt_table_info *newinfo;
1483 void *loc_cpu_entry; 1478 void *loc_cpu_entry;
1479 struct arpt_entry *iter;
1484 1480
1485 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1481 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1486 return -EFAULT; 1482 return -EFAULT;
@@ -1518,7 +1514,8 @@ static int compat_do_replace(struct net *net, void __user *user,
1518 return 0; 1514 return 0;
1519 1515
1520 free_newinfo_untrans: 1516 free_newinfo_untrans:
1521 ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1517 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1518 cleanup_entry(iter);
1522 free_newinfo: 1519 free_newinfo:
1523 xt_free_table_info(newinfo); 1520 xt_free_table_info(newinfo);
1524 return ret; 1521 return ret;
@@ -1552,22 +1549,20 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
1552static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, 1549static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1553 compat_uint_t *size, 1550 compat_uint_t *size,
1554 struct xt_counters *counters, 1551 struct xt_counters *counters,
1555 unsigned int *i) 1552 unsigned int i)
1556{ 1553{
1557 struct arpt_entry_target *t; 1554 struct xt_entry_target *t;
1558 struct compat_arpt_entry __user *ce; 1555 struct compat_arpt_entry __user *ce;
1559 u_int16_t target_offset, next_offset; 1556 u_int16_t target_offset, next_offset;
1560 compat_uint_t origsize; 1557 compat_uint_t origsize;
1561 int ret; 1558 int ret;
1562 1559
1563 ret = -EFAULT;
1564 origsize = *size; 1560 origsize = *size;
1565 ce = (struct compat_arpt_entry __user *)*dstptr; 1561 ce = (struct compat_arpt_entry __user *)*dstptr;
1566 if (copy_to_user(ce, e, sizeof(struct arpt_entry))) 1562 if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
1567 goto out; 1563 copy_to_user(&ce->counters, &counters[i],
1568 1564 sizeof(counters[i])) != 0)
1569 if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) 1565 return -EFAULT;
1570 goto out;
1571 1566
1572 *dstptr += sizeof(struct compat_arpt_entry); 1567 *dstptr += sizeof(struct compat_arpt_entry);
1573 *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); 1568 *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
@@ -1577,18 +1572,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1577 t = arpt_get_target(e); 1572 t = arpt_get_target(e);
1578 ret = xt_compat_target_to_user(t, dstptr, size); 1573 ret = xt_compat_target_to_user(t, dstptr, size);
1579 if (ret) 1574 if (ret)
1580 goto out; 1575 return ret;
1581 ret = -EFAULT;
1582 next_offset = e->next_offset - (origsize - *size); 1576 next_offset = e->next_offset - (origsize - *size);
1583 if (put_user(target_offset, &ce->target_offset)) 1577 if (put_user(target_offset, &ce->target_offset) != 0 ||
1584 goto out; 1578 put_user(next_offset, &ce->next_offset) != 0)
1585 if (put_user(next_offset, &ce->next_offset)) 1579 return -EFAULT;
1586 goto out;
1587
1588 (*i)++;
1589 return 0; 1580 return 0;
1590out:
1591 return ret;
1592} 1581}
1593 1582
1594static int compat_copy_entries_to_user(unsigned int total_size, 1583static int compat_copy_entries_to_user(unsigned int total_size,
@@ -1602,6 +1591,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1602 int ret = 0; 1591 int ret = 0;
1603 void *loc_cpu_entry; 1592 void *loc_cpu_entry;
1604 unsigned int i = 0; 1593 unsigned int i = 0;
1594 struct arpt_entry *iter;
1605 1595
1606 counters = alloc_counters(table); 1596 counters = alloc_counters(table);
1607 if (IS_ERR(counters)) 1597 if (IS_ERR(counters))
@@ -1611,15 +1601,18 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1611 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1601 loc_cpu_entry = private->entries[raw_smp_processor_id()];
1612 pos = userptr; 1602 pos = userptr;
1613 size = total_size; 1603 size = total_size;
1614 ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size, 1604 xt_entry_foreach(iter, loc_cpu_entry, total_size) {
1615 compat_copy_entry_to_user, 1605 ret = compat_copy_entry_to_user(iter, &pos,
1616 &pos, &size, counters, &i); 1606 &size, counters, i++);
1607 if (ret != 0)
1608 break;
1609 }
1617 vfree(counters); 1610 vfree(counters);
1618 return ret; 1611 return ret;
1619} 1612}
1620 1613
1621struct compat_arpt_get_entries { 1614struct compat_arpt_get_entries {
1622 char name[ARPT_TABLE_MAXNAMELEN]; 1615 char name[XT_TABLE_MAXNAMELEN];
1623 compat_uint_t size; 1616 compat_uint_t size;
1624 struct compat_arpt_entry entrytable[0]; 1617 struct compat_arpt_entry entrytable[0];
1625}; 1618};
@@ -1760,13 +1753,13 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1760 return ret; 1753 return ret;
1761} 1754}
1762 1755
1763struct xt_table *arpt_register_table(struct net *net, struct xt_table *table, 1756struct xt_table *arpt_register_table(struct net *net,
1757 const struct xt_table *table,
1764 const struct arpt_replace *repl) 1758 const struct arpt_replace *repl)
1765{ 1759{
1766 int ret; 1760 int ret;
1767 struct xt_table_info *newinfo; 1761 struct xt_table_info *newinfo;
1768 struct xt_table_info bootstrap 1762 struct xt_table_info bootstrap = {0};
1769 = { 0, 0, 0, { 0 }, { 0 }, { } };
1770 void *loc_cpu_entry; 1763 void *loc_cpu_entry;
1771 struct xt_table *new_table; 1764 struct xt_table *new_table;
1772 1765
@@ -1780,12 +1773,7 @@ struct xt_table *arpt_register_table(struct net *net, struct xt_table *table,
1780 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; 1773 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1781 memcpy(loc_cpu_entry, repl->entries, repl->size); 1774 memcpy(loc_cpu_entry, repl->entries, repl->size);
1782 1775
1783 ret = translate_table(table->name, table->valid_hooks, 1776 ret = translate_table(newinfo, loc_cpu_entry, repl);
1784 newinfo, loc_cpu_entry, repl->size,
1785 repl->num_entries,
1786 repl->hook_entry,
1787 repl->underflow);
1788
1789 duprintf("arpt_register_table: translate table gives %d\n", ret); 1777 duprintf("arpt_register_table: translate table gives %d\n", ret);
1790 if (ret != 0) 1778 if (ret != 0)
1791 goto out_free; 1779 goto out_free;
@@ -1808,35 +1796,37 @@ void arpt_unregister_table(struct xt_table *table)
1808 struct xt_table_info *private; 1796 struct xt_table_info *private;
1809 void *loc_cpu_entry; 1797 void *loc_cpu_entry;
1810 struct module *table_owner = table->me; 1798 struct module *table_owner = table->me;
1799 struct arpt_entry *iter;
1811 1800
1812 private = xt_unregister_table(table); 1801 private = xt_unregister_table(table);
1813 1802
1814 /* Decrease module usage counts and free resources */ 1803 /* Decrease module usage counts and free resources */
1815 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1804 loc_cpu_entry = private->entries[raw_smp_processor_id()];
1816 ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, 1805 xt_entry_foreach(iter, loc_cpu_entry, private->size)
1817 cleanup_entry, NULL); 1806 cleanup_entry(iter);
1818 if (private->number > private->initial_entries) 1807 if (private->number > private->initial_entries)
1819 module_put(table_owner); 1808 module_put(table_owner);
1820 xt_free_table_info(private); 1809 xt_free_table_info(private);
1821} 1810}
1822 1811
1823/* The built-in targets: standard (NULL) and error. */ 1812/* The built-in targets: standard (NULL) and error. */
1824static struct xt_target arpt_standard_target __read_mostly = { 1813static struct xt_target arpt_builtin_tg[] __read_mostly = {
1825 .name = ARPT_STANDARD_TARGET, 1814 {
1826 .targetsize = sizeof(int), 1815 .name = XT_STANDARD_TARGET,
1827 .family = NFPROTO_ARP, 1816 .targetsize = sizeof(int),
1817 .family = NFPROTO_ARP,
1828#ifdef CONFIG_COMPAT 1818#ifdef CONFIG_COMPAT
1829 .compatsize = sizeof(compat_int_t), 1819 .compatsize = sizeof(compat_int_t),
1830 .compat_from_user = compat_standard_from_user, 1820 .compat_from_user = compat_standard_from_user,
1831 .compat_to_user = compat_standard_to_user, 1821 .compat_to_user = compat_standard_to_user,
1832#endif 1822#endif
1833}; 1823 },
1834 1824 {
1835static struct xt_target arpt_error_target __read_mostly = { 1825 .name = XT_ERROR_TARGET,
1836 .name = ARPT_ERROR_TARGET, 1826 .target = arpt_error,
1837 .target = arpt_error, 1827 .targetsize = XT_FUNCTION_MAXNAMELEN,
1838 .targetsize = ARPT_FUNCTION_MAXNAMELEN, 1828 .family = NFPROTO_ARP,
1839 .family = NFPROTO_ARP, 1829 },
1840}; 1830};
1841 1831
1842static struct nf_sockopt_ops arpt_sockopts = { 1832static struct nf_sockopt_ops arpt_sockopts = {
@@ -1880,12 +1870,9 @@ static int __init arp_tables_init(void)
1880 goto err1; 1870 goto err1;
1881 1871
1882 /* Noone else will be downing sem now, so we won't sleep */ 1872 /* Noone else will be downing sem now, so we won't sleep */
1883 ret = xt_register_target(&arpt_standard_target); 1873 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1884 if (ret < 0) 1874 if (ret < 0)
1885 goto err2; 1875 goto err2;
1886 ret = xt_register_target(&arpt_error_target);
1887 if (ret < 0)
1888 goto err3;
1889 1876
1890 /* Register setsockopt */ 1877 /* Register setsockopt */
1891 ret = nf_register_sockopt(&arpt_sockopts); 1878 ret = nf_register_sockopt(&arpt_sockopts);
@@ -1896,9 +1883,7 @@ static int __init arp_tables_init(void)
1896 return 0; 1883 return 0;
1897 1884
1898err4: 1885err4:
1899 xt_unregister_target(&arpt_error_target); 1886 xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1900err3:
1901 xt_unregister_target(&arpt_standard_target);
1902err2: 1887err2:
1903 unregister_pernet_subsys(&arp_tables_net_ops); 1888 unregister_pernet_subsys(&arp_tables_net_ops);
1904err1: 1889err1:
@@ -1908,8 +1893,7 @@ err1:
1908static void __exit arp_tables_fini(void) 1893static void __exit arp_tables_fini(void)
1909{ 1894{
1910 nf_unregister_sockopt(&arpt_sockopts); 1895 nf_unregister_sockopt(&arpt_sockopts);
1911 xt_unregister_target(&arpt_error_target); 1896 xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1912 xt_unregister_target(&arpt_standard_target);
1913 unregister_pernet_subsys(&arp_tables_net_ops); 1897 unregister_pernet_subsys(&arp_tables_net_ops);
1914} 1898}
1915 1899
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index b0d5b1d0a769..b8ddcc480ed9 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -9,7 +9,7 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
9MODULE_DESCRIPTION("arptables arp payload mangle target"); 9MODULE_DESCRIPTION("arptables arp payload mangle target");
10 10
11static unsigned int 11static unsigned int
12target(struct sk_buff *skb, const struct xt_target_param *par) 12target(struct sk_buff *skb, const struct xt_action_param *par)
13{ 13{
14 const struct arpt_mangle *mangle = par->targinfo; 14 const struct arpt_mangle *mangle = par->targinfo;
15 const struct arphdr *arp; 15 const struct arphdr *arp;
@@ -54,7 +54,7 @@ target(struct sk_buff *skb, const struct xt_target_param *par)
54 return mangle->target; 54 return mangle->target;
55} 55}
56 56
57static bool checkentry(const struct xt_tgchk_param *par) 57static int checkentry(const struct xt_tgchk_param *par)
58{ 58{
59 const struct arpt_mangle *mangle = par->targinfo; 59 const struct arpt_mangle *mangle = par->targinfo;
60 60
@@ -63,7 +63,7 @@ static bool checkentry(const struct xt_tgchk_param *par)
63 return false; 63 return false;
64 64
65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && 65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
66 mangle->target != ARPT_CONTINUE) 66 mangle->target != XT_CONTINUE)
67 return false; 67 return false;
68 return true; 68 return true;
69} 69}
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 6ecfdae7c589..79ca5e70d497 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -6,7 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/netfilter/x_tables.h>
9#include <linux/netfilter_arp/arp_tables.h> 10#include <linux/netfilter_arp/arp_tables.h>
11#include <linux/slab.h>
10 12
11MODULE_LICENSE("GPL"); 13MODULE_LICENSE("GPL");
12MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); 14MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -15,93 +17,37 @@ MODULE_DESCRIPTION("arptables filter table");
15#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ 17#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
16 (1 << NF_ARP_FORWARD)) 18 (1 << NF_ARP_FORWARD))
17 19
18static struct 20static const struct xt_table packet_filter = {
19{
20 struct arpt_replace repl;
21 struct arpt_standard entries[3];
22 struct arpt_error term;
23} initial_table __net_initdata = {
24 .repl = {
25 .name = "filter",
26 .valid_hooks = FILTER_VALID_HOOKS,
27 .num_entries = 4,
28 .size = sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
29 .hook_entry = {
30 [NF_ARP_IN] = 0,
31 [NF_ARP_OUT] = sizeof(struct arpt_standard),
32 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
33 },
34 .underflow = {
35 [NF_ARP_IN] = 0,
36 [NF_ARP_OUT] = sizeof(struct arpt_standard),
37 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
38 },
39 },
40 .entries = {
41 ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_IN */
42 ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_OUT */
43 ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_FORWARD */
44 },
45 .term = ARPT_ERROR_INIT,
46};
47
48static struct xt_table packet_filter = {
49 .name = "filter", 21 .name = "filter",
50 .valid_hooks = FILTER_VALID_HOOKS, 22 .valid_hooks = FILTER_VALID_HOOKS,
51 .me = THIS_MODULE, 23 .me = THIS_MODULE,
52 .af = NFPROTO_ARP, 24 .af = NFPROTO_ARP,
25 .priority = NF_IP_PRI_FILTER,
53}; 26};
54 27
55/* The work comes in here from netfilter.c */ 28/* The work comes in here from netfilter.c */
56static unsigned int arpt_in_hook(unsigned int hook, 29static unsigned int
57 struct sk_buff *skb, 30arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
58 const struct net_device *in, 31 const struct net_device *in, const struct net_device *out,
59 const struct net_device *out, 32 int (*okfn)(struct sk_buff *))
60 int (*okfn)(struct sk_buff *))
61{ 33{
62 return arpt_do_table(skb, hook, in, out, 34 const struct net *net = dev_net((in != NULL) ? in : out);
63 dev_net(in)->ipv4.arptable_filter);
64}
65 35
66static unsigned int arpt_out_hook(unsigned int hook, 36 return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
67 struct sk_buff *skb,
68 const struct net_device *in,
69 const struct net_device *out,
70 int (*okfn)(struct sk_buff *))
71{
72 return arpt_do_table(skb, hook, in, out,
73 dev_net(out)->ipv4.arptable_filter);
74} 37}
75 38
76static struct nf_hook_ops arpt_ops[] __read_mostly = { 39static struct nf_hook_ops *arpfilter_ops __read_mostly;
77 {
78 .hook = arpt_in_hook,
79 .owner = THIS_MODULE,
80 .pf = NFPROTO_ARP,
81 .hooknum = NF_ARP_IN,
82 .priority = NF_IP_PRI_FILTER,
83 },
84 {
85 .hook = arpt_out_hook,
86 .owner = THIS_MODULE,
87 .pf = NFPROTO_ARP,
88 .hooknum = NF_ARP_OUT,
89 .priority = NF_IP_PRI_FILTER,
90 },
91 {
92 .hook = arpt_in_hook,
93 .owner = THIS_MODULE,
94 .pf = NFPROTO_ARP,
95 .hooknum = NF_ARP_FORWARD,
96 .priority = NF_IP_PRI_FILTER,
97 },
98};
99 40
100static int __net_init arptable_filter_net_init(struct net *net) 41static int __net_init arptable_filter_net_init(struct net *net)
101{ 42{
102 /* Register table */ 43 struct arpt_replace *repl;
44
45 repl = arpt_alloc_initial_table(&packet_filter);
46 if (repl == NULL)
47 return -ENOMEM;
103 net->ipv4.arptable_filter = 48 net->ipv4.arptable_filter =
104 arpt_register_table(net, &packet_filter, &initial_table.repl); 49 arpt_register_table(net, &packet_filter, repl);
50 kfree(repl);
105 if (IS_ERR(net->ipv4.arptable_filter)) 51 if (IS_ERR(net->ipv4.arptable_filter))
106 return PTR_ERR(net->ipv4.arptable_filter); 52 return PTR_ERR(net->ipv4.arptable_filter);
107 return 0; 53 return 0;
@@ -125,9 +71,11 @@ static int __init arptable_filter_init(void)
125 if (ret < 0) 71 if (ret < 0)
126 return ret; 72 return ret;
127 73
128 ret = nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); 74 arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
129 if (ret < 0) 75 if (IS_ERR(arpfilter_ops)) {
76 ret = PTR_ERR(arpfilter_ops);
130 goto cleanup_table; 77 goto cleanup_table;
78 }
131 return ret; 79 return ret;
132 80
133cleanup_table: 81cleanup_table:
@@ -137,7 +85,7 @@ cleanup_table:
137 85
138static void __exit arptable_filter_fini(void) 86static void __exit arptable_filter_fini(void)
139{ 87{
140 nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); 88 xt_hook_unlink(&packet_filter, arpfilter_ops);
141 unregister_pernet_subsys(&arptable_filter_net_ops); 89 unregister_pernet_subsys(&arptable_filter_net_ops);
142} 90}
143 91
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c156db215987..d2c1311cb28d 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -26,6 +26,7 @@
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/net.h> 27#include <linux/net.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29#include <linux/slab.h>
29#include <net/net_namespace.h> 30#include <net/net_namespace.h>
30#include <net/sock.h> 31#include <net/sock.h>
31#include <net/route.h> 32#include <net/route.h>
@@ -41,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
41 42
42static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; 43static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
43static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; 44static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
44static DEFINE_RWLOCK(queue_lock); 45static DEFINE_SPINLOCK(queue_lock);
45static int peer_pid __read_mostly; 46static int peer_pid __read_mostly;
46static unsigned int copy_range __read_mostly; 47static unsigned int copy_range __read_mostly;
47static unsigned int queue_total; 48static unsigned int queue_total;
@@ -71,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
71 break; 72 break;
72 73
73 case IPQ_COPY_PACKET: 74 case IPQ_COPY_PACKET:
74 copy_mode = mode; 75 if (range > 0xFFFF)
76 range = 0xFFFF;
75 copy_range = range; 77 copy_range = range;
76 if (copy_range > 0xFFFF) 78 copy_mode = mode;
77 copy_range = 0xFFFF;
78 break; 79 break;
79 80
80 default: 81 default:
@@ -100,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id)
100{ 101{
101 struct nf_queue_entry *entry = NULL, *i; 102 struct nf_queue_entry *entry = NULL, *i;
102 103
103 write_lock_bh(&queue_lock); 104 spin_lock_bh(&queue_lock);
104 105
105 list_for_each_entry(i, &queue_list, list) { 106 list_for_each_entry(i, &queue_list, list) {
106 if ((unsigned long)i == id) { 107 if ((unsigned long)i == id) {
@@ -114,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id)
114 queue_total--; 115 queue_total--;
115 } 116 }
116 117
117 write_unlock_bh(&queue_lock); 118 spin_unlock_bh(&queue_lock);
118 return entry; 119 return entry;
119} 120}
120 121
@@ -135,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
135static void 136static void
136ipq_flush(ipq_cmpfn cmpfn, unsigned long data) 137ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
137{ 138{
138 write_lock_bh(&queue_lock); 139 spin_lock_bh(&queue_lock);
139 __ipq_flush(cmpfn, data); 140 __ipq_flush(cmpfn, data);
140 write_unlock_bh(&queue_lock); 141 spin_unlock_bh(&queue_lock);
141} 142}
142 143
143static struct sk_buff * 144static struct sk_buff *
@@ -151,37 +152,29 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
151 struct nlmsghdr *nlh; 152 struct nlmsghdr *nlh;
152 struct timeval tv; 153 struct timeval tv;
153 154
154 read_lock_bh(&queue_lock); 155 switch (ACCESS_ONCE(copy_mode)) {
155
156 switch (copy_mode) {
157 case IPQ_COPY_META: 156 case IPQ_COPY_META:
158 case IPQ_COPY_NONE: 157 case IPQ_COPY_NONE:
159 size = NLMSG_SPACE(sizeof(*pmsg)); 158 size = NLMSG_SPACE(sizeof(*pmsg));
160 break; 159 break;
161 160
162 case IPQ_COPY_PACKET: 161 case IPQ_COPY_PACKET:
163 if ((entry->skb->ip_summed == CHECKSUM_PARTIAL || 162 if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
164 entry->skb->ip_summed == CHECKSUM_COMPLETE) && 163 (*errp = skb_checksum_help(entry->skb)))
165 (*errp = skb_checksum_help(entry->skb))) {
166 read_unlock_bh(&queue_lock);
167 return NULL; 164 return NULL;
168 } 165
169 if (copy_range == 0 || copy_range > entry->skb->len) 166 data_len = ACCESS_ONCE(copy_range);
167 if (data_len == 0 || data_len > entry->skb->len)
170 data_len = entry->skb->len; 168 data_len = entry->skb->len;
171 else
172 data_len = copy_range;
173 169
174 size = NLMSG_SPACE(sizeof(*pmsg) + data_len); 170 size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
175 break; 171 break;
176 172
177 default: 173 default:
178 *errp = -EINVAL; 174 *errp = -EINVAL;
179 read_unlock_bh(&queue_lock);
180 return NULL; 175 return NULL;
181 } 176 }
182 177
183 read_unlock_bh(&queue_lock);
184
185 skb = alloc_skb(size, GFP_ATOMIC); 178 skb = alloc_skb(size, GFP_ATOMIC);
186 if (!skb) 179 if (!skb)
187 goto nlmsg_failure; 180 goto nlmsg_failure;
@@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
242 if (nskb == NULL) 235 if (nskb == NULL)
243 return status; 236 return status;
244 237
245 write_lock_bh(&queue_lock); 238 spin_lock_bh(&queue_lock);
246 239
247 if (!peer_pid) 240 if (!peer_pid)
248 goto err_out_free_nskb; 241 goto err_out_free_nskb;
@@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
266 259
267 __ipq_enqueue_entry(entry); 260 __ipq_enqueue_entry(entry);
268 261
269 write_unlock_bh(&queue_lock); 262 spin_unlock_bh(&queue_lock);
270 return status; 263 return status;
271 264
272err_out_free_nskb: 265err_out_free_nskb:
273 kfree_skb(nskb); 266 kfree_skb(nskb);
274 267
275err_out_unlock: 268err_out_unlock:
276 write_unlock_bh(&queue_lock); 269 spin_unlock_bh(&queue_lock);
277 return status; 270 return status;
278} 271}
279 272
@@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range)
342{ 335{
343 int status; 336 int status;
344 337
345 write_lock_bh(&queue_lock); 338 spin_lock_bh(&queue_lock);
346 status = __ipq_set_mode(mode, range); 339 status = __ipq_set_mode(mode, range);
347 write_unlock_bh(&queue_lock); 340 spin_unlock_bh(&queue_lock);
348 return status; 341 return status;
349} 342}
350 343
@@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb)
440 if (security_netlink_recv(skb, CAP_NET_ADMIN)) 433 if (security_netlink_recv(skb, CAP_NET_ADMIN))
441 RCV_SKB_FAIL(-EPERM); 434 RCV_SKB_FAIL(-EPERM);
442 435
443 write_lock_bh(&queue_lock); 436 spin_lock_bh(&queue_lock);
444 437
445 if (peer_pid) { 438 if (peer_pid) {
446 if (peer_pid != pid) { 439 if (peer_pid != pid) {
447 write_unlock_bh(&queue_lock); 440 spin_unlock_bh(&queue_lock);
448 RCV_SKB_FAIL(-EBUSY); 441 RCV_SKB_FAIL(-EBUSY);
449 } 442 }
450 } else { 443 } else {
@@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
452 peer_pid = pid; 445 peer_pid = pid;
453 } 446 }
454 447
455 write_unlock_bh(&queue_lock); 448 spin_unlock_bh(&queue_lock);
456 449
457 status = ipq_receive_peer(NLMSG_DATA(nlh), type, 450 status = ipq_receive_peer(NLMSG_DATA(nlh), type,
458 nlmsglen - NLMSG_LENGTH(0)); 451 nlmsglen - NLMSG_LENGTH(0));
@@ -461,7 +454,6 @@ __ipq_rcv_skb(struct sk_buff *skb)
461 454
462 if (flags & NLM_F_ACK) 455 if (flags & NLM_F_ACK)
463 netlink_ack(skb, nlh, 0); 456 netlink_ack(skb, nlh, 0);
464 return;
465} 457}
466 458
467static void 459static void
@@ -497,12 +489,11 @@ ipq_rcv_nl_event(struct notifier_block *this,
497{ 489{
498 struct netlink_notify *n = ptr; 490 struct netlink_notify *n = ptr;
499 491
500 if (event == NETLINK_URELEASE && 492 if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
501 n->protocol == NETLINK_FIREWALL && n->pid) { 493 spin_lock_bh(&queue_lock);
502 write_lock_bh(&queue_lock); 494 if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
503 if ((n->net == &init_net) && (n->pid == peer_pid))
504 __ipq_reset(); 495 __ipq_reset();
505 write_unlock_bh(&queue_lock); 496 spin_unlock_bh(&queue_lock);
506 } 497 }
507 return NOTIFY_DONE; 498 return NOTIFY_DONE;
508} 499}
@@ -516,21 +507,20 @@ static struct ctl_table_header *ipq_sysctl_header;
516 507
517static ctl_table ipq_table[] = { 508static ctl_table ipq_table[] = {
518 { 509 {
519 .ctl_name = NET_IPQ_QMAX,
520 .procname = NET_IPQ_QMAX_NAME, 510 .procname = NET_IPQ_QMAX_NAME,
521 .data = &queue_maxlen, 511 .data = &queue_maxlen,
522 .maxlen = sizeof(queue_maxlen), 512 .maxlen = sizeof(queue_maxlen),
523 .mode = 0644, 513 .mode = 0644,
524 .proc_handler = proc_dointvec 514 .proc_handler = proc_dointvec
525 }, 515 },
526 { .ctl_name = 0 } 516 { }
527}; 517};
528#endif 518#endif
529 519
530#ifdef CONFIG_PROC_FS 520#ifdef CONFIG_PROC_FS
531static int ip_queue_show(struct seq_file *m, void *v) 521static int ip_queue_show(struct seq_file *m, void *v)
532{ 522{
533 read_lock_bh(&queue_lock); 523 spin_lock_bh(&queue_lock);
534 524
535 seq_printf(m, 525 seq_printf(m,
536 "Peer PID : %d\n" 526 "Peer PID : %d\n"
@@ -548,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v)
548 queue_dropped, 538 queue_dropped,
549 queue_user_dropped); 539 queue_user_dropped);
550 540
551 read_unlock_bh(&queue_lock); 541 spin_unlock_bh(&queue_lock);
552 return 0; 542 return 0;
553} 543}
554 544
@@ -622,7 +612,7 @@ cleanup_netlink_notifier:
622static void __exit ip_queue_fini(void) 612static void __exit ip_queue_fini(void)
623{ 613{
624 nf_unregister_queue_handlers(&nfqh); 614 nf_unregister_queue_handlers(&nfqh);
625 synchronize_net(); 615
626 ipq_flush(NULL, 0); 616 ipq_flush(NULL, 0);
627 617
628#ifdef CONFIG_SYSCTL 618#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index fdefae6b5dfc..652efea013dc 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -8,6 +8,7 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/cache.h> 12#include <linux/cache.h>
12#include <linux/capability.h> 13#include <linux/capability.h>
13#include <linux/skbuff.h> 14#include <linux/skbuff.h>
@@ -27,6 +28,7 @@
27#include <linux/netfilter/x_tables.h> 28#include <linux/netfilter/x_tables.h>
28#include <linux/netfilter_ipv4/ip_tables.h> 29#include <linux/netfilter_ipv4/ip_tables.h>
29#include <net/netfilter/nf_log.h> 30#include <net/netfilter/nf_log.h>
31#include "../../netfilter/xt_repldata.h"
30 32
31MODULE_LICENSE("GPL"); 33MODULE_LICENSE("GPL");
32MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 34MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -37,24 +39,19 @@ MODULE_DESCRIPTION("IPv4 packet filter");
37/*#define DEBUG_IP_FIREWALL_USER*/ 39/*#define DEBUG_IP_FIREWALL_USER*/
38 40
39#ifdef DEBUG_IP_FIREWALL 41#ifdef DEBUG_IP_FIREWALL
40#define dprintf(format, args...) printk(format , ## args) 42#define dprintf(format, args...) pr_info(format , ## args)
41#else 43#else
42#define dprintf(format, args...) 44#define dprintf(format, args...)
43#endif 45#endif
44 46
45#ifdef DEBUG_IP_FIREWALL_USER 47#ifdef DEBUG_IP_FIREWALL_USER
46#define duprintf(format, args...) printk(format , ## args) 48#define duprintf(format, args...) pr_info(format , ## args)
47#else 49#else
48#define duprintf(format, args...) 50#define duprintf(format, args...)
49#endif 51#endif
50 52
51#ifdef CONFIG_NETFILTER_DEBUG 53#ifdef CONFIG_NETFILTER_DEBUG
52#define IP_NF_ASSERT(x) \ 54#define IP_NF_ASSERT(x) WARN_ON(!(x))
53do { \
54 if (!(x)) \
55 printk("IP_NF_ASSERT: %s:%s:%u\n", \
56 __func__, __FILE__, __LINE__); \
57} while(0)
58#else 55#else
59#define IP_NF_ASSERT(x) 56#define IP_NF_ASSERT(x)
60#endif 57#endif
@@ -65,6 +62,12 @@ do { \
65#define inline 62#define inline
66#endif 63#endif
67 64
65void *ipt_alloc_initial_table(const struct xt_table *info)
66{
67 return xt_alloc_initial_table(ipt, IPT);
68}
69EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
70
68/* 71/*
69 We keep a set of rules for each CPU, so we can avoid write-locking 72 We keep a set of rules for each CPU, so we can avoid write-locking
70 them in the softirq when updating the counters and therefore 73 them in the softirq when updating the counters and therefore
@@ -88,9 +91,9 @@ ip_packet_match(const struct iphdr *ip,
88#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) 91#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
89 92
90 if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, 93 if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
91 IPT_INV_SRCIP) 94 IPT_INV_SRCIP) ||
92 || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, 95 FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
93 IPT_INV_DSTIP)) { 96 IPT_INV_DSTIP)) {
94 dprintf("Source or dest mismatch.\n"); 97 dprintf("Source or dest mismatch.\n");
95 98
96 dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", 99 dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
@@ -121,8 +124,8 @@ ip_packet_match(const struct iphdr *ip,
121 } 124 }
122 125
123 /* Check specific protocol */ 126 /* Check specific protocol */
124 if (ipinfo->proto 127 if (ipinfo->proto &&
125 && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { 128 FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
126 dprintf("Packet protocol %hi does not match %hi.%s\n", 129 dprintf("Packet protocol %hi does not match %hi.%s\n",
127 ip->protocol, ipinfo->proto, 130 ip->protocol, ipinfo->proto,
128 ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); 131 ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
@@ -157,52 +160,38 @@ ip_checkentry(const struct ipt_ip *ip)
157} 160}
158 161
159static unsigned int 162static unsigned int
160ipt_error(struct sk_buff *skb, const struct xt_target_param *par) 163ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
161{ 164{
162 if (net_ratelimit()) 165 if (net_ratelimit())
163 printk("ip_tables: error: `%s'\n", 166 pr_info("error: `%s'\n", (const char *)par->targinfo);
164 (const char *)par->targinfo);
165 167
166 return NF_DROP; 168 return NF_DROP;
167} 169}
168 170
169/* Performance critical - called for every packet */
170static inline bool
171do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
172 struct xt_match_param *par)
173{
174 par->match = m->u.kernel.match;
175 par->matchinfo = m->data;
176
177 /* Stop iteration if it doesn't match */
178 if (!m->u.kernel.match->match(skb, par))
179 return true;
180 else
181 return false;
182}
183
184/* Performance critical */ 171/* Performance critical */
185static inline struct ipt_entry * 172static inline struct ipt_entry *
186get_entry(void *base, unsigned int offset) 173get_entry(const void *base, unsigned int offset)
187{ 174{
188 return (struct ipt_entry *)(base + offset); 175 return (struct ipt_entry *)(base + offset);
189} 176}
190 177
191/* All zeroes == unconditional rule. */ 178/* All zeroes == unconditional rule. */
192/* Mildly perf critical (only if packet tracing is on) */ 179/* Mildly perf critical (only if packet tracing is on) */
193static inline int 180static inline bool unconditional(const struct ipt_ip *ip)
194unconditional(const struct ipt_ip *ip)
195{ 181{
196 unsigned int i; 182 static const struct ipt_ip uncond;
197
198 for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
199 if (((__u32 *)ip)[i])
200 return 0;
201 183
202 return 1; 184 return memcmp(ip, &uncond, sizeof(uncond)) == 0;
203#undef FWINV 185#undef FWINV
204} 186}
205 187
188/* for const-correctness */
189static inline const struct xt_entry_target *
190ipt_get_target_c(const struct ipt_entry *e)
191{
192 return ipt_get_target((struct ipt_entry *)e);
193}
194
206#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ 195#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
207 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) 196 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
208static const char *const hooknames[] = { 197static const char *const hooknames[] = {
@@ -237,24 +226,24 @@ static struct nf_loginfo trace_loginfo = {
237 226
238/* Mildly perf critical (only if packet tracing is on) */ 227/* Mildly perf critical (only if packet tracing is on) */
239static inline int 228static inline int
240get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, 229get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
241 const char *hookname, const char **chainname, 230 const char *hookname, const char **chainname,
242 const char **comment, unsigned int *rulenum) 231 const char **comment, unsigned int *rulenum)
243{ 232{
244 struct ipt_standard_target *t = (void *)ipt_get_target(s); 233 const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
245 234
246 if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { 235 if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
247 /* Head of user chain: ERROR target with chainname */ 236 /* Head of user chain: ERROR target with chainname */
248 *chainname = t->target.data; 237 *chainname = t->target.data;
249 (*rulenum) = 0; 238 (*rulenum) = 0;
250 } else if (s == e) { 239 } else if (s == e) {
251 (*rulenum)++; 240 (*rulenum)++;
252 241
253 if (s->target_offset == sizeof(struct ipt_entry) 242 if (s->target_offset == sizeof(struct ipt_entry) &&
254 && strcmp(t->target.u.kernel.target->name, 243 strcmp(t->target.u.kernel.target->name,
255 IPT_STANDARD_TARGET) == 0 244 XT_STANDARD_TARGET) == 0 &&
256 && t->verdict < 0 245 t->verdict < 0 &&
257 && unconditional(&s->ip)) { 246 unconditional(&s->ip)) {
258 /* Tail of chains: STANDARD target (return/policy) */ 247 /* Tail of chains: STANDARD target (return/policy) */
259 *comment = *chainname == hookname 248 *comment = *chainname == hookname
260 ? comments[NF_IP_TRACE_COMMENT_POLICY] 249 ? comments[NF_IP_TRACE_COMMENT_POLICY]
@@ -267,17 +256,18 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
267 return 0; 256 return 0;
268} 257}
269 258
270static void trace_packet(struct sk_buff *skb, 259static void trace_packet(const struct sk_buff *skb,
271 unsigned int hook, 260 unsigned int hook,
272 const struct net_device *in, 261 const struct net_device *in,
273 const struct net_device *out, 262 const struct net_device *out,
274 const char *tablename, 263 const char *tablename,
275 struct xt_table_info *private, 264 const struct xt_table_info *private,
276 struct ipt_entry *e) 265 const struct ipt_entry *e)
277{ 266{
278 void *table_base; 267 const void *table_base;
279 const struct ipt_entry *root; 268 const struct ipt_entry *root;
280 const char *hookname, *chainname, *comment; 269 const char *hookname, *chainname, *comment;
270 const struct ipt_entry *iter;
281 unsigned int rulenum = 0; 271 unsigned int rulenum = 0;
282 272
283 table_base = private->entries[smp_processor_id()]; 273 table_base = private->entries[smp_processor_id()];
@@ -286,10 +276,10 @@ static void trace_packet(struct sk_buff *skb,
286 hookname = chainname = hooknames[hook]; 276 hookname = chainname = hooknames[hook];
287 comment = comments[NF_IP_TRACE_COMMENT_RULE]; 277 comment = comments[NF_IP_TRACE_COMMENT_RULE];
288 278
289 IPT_ENTRY_ITERATE(root, 279 xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
290 private->size - private->hook_entry[hook], 280 if (get_chainname_rulenum(iter, e, hookname,
291 get_chainname_rulenum, 281 &chainname, &comment, &rulenum) != 0)
292 e, hookname, &chainname, &comment, &rulenum); 282 break;
293 283
294 nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, 284 nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
295 "TRACE: %s:%s:%s:%u ", 285 "TRACE: %s:%s:%s:%u ",
@@ -311,24 +301,19 @@ ipt_do_table(struct sk_buff *skb,
311 const struct net_device *out, 301 const struct net_device *out,
312 struct xt_table *table) 302 struct xt_table *table)
313{ 303{
314#define tb_comefrom ((struct ipt_entry *)table_base)->comefrom
315
316 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); 304 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
317 const struct iphdr *ip; 305 const struct iphdr *ip;
318 u_int16_t datalen;
319 bool hotdrop = false;
320 /* Initializing verdict to NF_DROP keeps gcc happy. */ 306 /* Initializing verdict to NF_DROP keeps gcc happy. */
321 unsigned int verdict = NF_DROP; 307 unsigned int verdict = NF_DROP;
322 const char *indev, *outdev; 308 const char *indev, *outdev;
323 void *table_base; 309 const void *table_base;
324 struct ipt_entry *e, *back; 310 struct ipt_entry *e, **jumpstack;
325 struct xt_table_info *private; 311 unsigned int *stackptr, origptr, cpu;
326 struct xt_match_param mtpar; 312 const struct xt_table_info *private;
327 struct xt_target_param tgpar; 313 struct xt_action_param acpar;
328 314
329 /* Initialization */ 315 /* Initialization */
330 ip = ip_hdr(skb); 316 ip = ip_hdr(skb);
331 datalen = skb->len - ip->ihl * 4;
332 indev = in ? in->name : nulldevname; 317 indev = in ? in->name : nulldevname;
333 outdev = out ? out->name : nulldevname; 318 outdev = out ? out->name : nulldevname;
334 /* We handle fragments by dealing with the first fragment as 319 /* We handle fragments by dealing with the first fragment as
@@ -337,37 +322,49 @@ ipt_do_table(struct sk_buff *skb,
337 * things we don't know, ie. tcp syn flag or ports). If the 322 * things we don't know, ie. tcp syn flag or ports). If the
338 * rule is also a fragment-specific rule, non-fragments won't 323 * rule is also a fragment-specific rule, non-fragments won't
339 * match it. */ 324 * match it. */
340 mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; 325 acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
341 mtpar.thoff = ip_hdrlen(skb); 326 acpar.thoff = ip_hdrlen(skb);
342 mtpar.hotdrop = &hotdrop; 327 acpar.hotdrop = false;
343 mtpar.in = tgpar.in = in; 328 acpar.in = in;
344 mtpar.out = tgpar.out = out; 329 acpar.out = out;
345 mtpar.family = tgpar.family = NFPROTO_IPV4; 330 acpar.family = NFPROTO_IPV4;
346 mtpar.hooknum = tgpar.hooknum = hook; 331 acpar.hooknum = hook;
347 332
348 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 333 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
349 xt_info_rdlock_bh(); 334 xt_info_rdlock_bh();
350 private = table->private; 335 private = table->private;
351 table_base = private->entries[smp_processor_id()]; 336 cpu = smp_processor_id();
337 table_base = private->entries[cpu];
338 jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
339 stackptr = per_cpu_ptr(private->stackptr, cpu);
340 origptr = *stackptr;
352 341
353 e = get_entry(table_base, private->hook_entry[hook]); 342 e = get_entry(table_base, private->hook_entry[hook]);
354 343
355 /* For return from builtin chain */ 344 pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
356 back = get_entry(table_base, private->underflow[hook]); 345 table->name, hook, origptr,
346 get_entry(table_base, private->underflow[hook]));
357 347
358 do { 348 do {
359 struct ipt_entry_target *t; 349 const struct xt_entry_target *t;
350 const struct xt_entry_match *ematch;
360 351
361 IP_NF_ASSERT(e); 352 IP_NF_ASSERT(e);
362 IP_NF_ASSERT(back);
363 if (!ip_packet_match(ip, indev, outdev, 353 if (!ip_packet_match(ip, indev, outdev,
364 &e->ip, mtpar.fragoff) || 354 &e->ip, acpar.fragoff)) {
365 IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) { 355 no_match:
366 e = ipt_next_entry(e); 356 e = ipt_next_entry(e);
367 continue; 357 continue;
368 } 358 }
369 359
370 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); 360 xt_ematch_foreach(ematch, e) {
361 acpar.match = ematch->u.kernel.match;
362 acpar.matchinfo = ematch->data;
363 if (!acpar.match->match(skb, &acpar))
364 goto no_match;
365 }
366
367 ADD_COUNTER(e->counters, skb->len, 1);
371 368
372 t = ipt_get_target(e); 369 t = ipt_get_target(e);
373 IP_NF_ASSERT(t->u.kernel.target); 370 IP_NF_ASSERT(t->u.kernel.target);
@@ -383,75 +380,70 @@ ipt_do_table(struct sk_buff *skb,
383 if (!t->u.kernel.target->target) { 380 if (!t->u.kernel.target->target) {
384 int v; 381 int v;
385 382
386 v = ((struct ipt_standard_target *)t)->verdict; 383 v = ((struct xt_standard_target *)t)->verdict;
387 if (v < 0) { 384 if (v < 0) {
388 /* Pop from stack? */ 385 /* Pop from stack? */
389 if (v != IPT_RETURN) { 386 if (v != XT_RETURN) {
390 verdict = (unsigned)(-v) - 1; 387 verdict = (unsigned)(-v) - 1;
391 break; 388 break;
392 } 389 }
393 e = back; 390 if (*stackptr == 0) {
394 back = get_entry(table_base, back->comefrom); 391 e = get_entry(table_base,
392 private->underflow[hook]);
393 pr_debug("Underflow (this is normal) "
394 "to %p\n", e);
395 } else {
396 e = jumpstack[--*stackptr];
397 pr_debug("Pulled %p out from pos %u\n",
398 e, *stackptr);
399 e = ipt_next_entry(e);
400 }
395 continue; 401 continue;
396 } 402 }
397 if (table_base + v != ipt_next_entry(e) 403 if (table_base + v != ipt_next_entry(e) &&
398 && !(e->ip.flags & IPT_F_GOTO)) { 404 !(e->ip.flags & IPT_F_GOTO)) {
399 /* Save old back ptr in next entry */ 405 if (*stackptr >= private->stacksize) {
400 struct ipt_entry *next = ipt_next_entry(e); 406 verdict = NF_DROP;
401 next->comefrom = (void *)back - table_base; 407 break;
402 /* set back pointer to next entry */ 408 }
403 back = next; 409 jumpstack[(*stackptr)++] = e;
410 pr_debug("Pushed %p into pos %u\n",
411 e, *stackptr - 1);
404 } 412 }
405 413
406 e = get_entry(table_base, v); 414 e = get_entry(table_base, v);
407 continue; 415 continue;
408 } 416 }
409 417
410 /* Targets which reenter must return 418 acpar.target = t->u.kernel.target;
411 abs. verdicts */ 419 acpar.targinfo = t->data;
412 tgpar.target = t->u.kernel.target;
413 tgpar.targinfo = t->data;
414
415 420
416#ifdef CONFIG_NETFILTER_DEBUG 421 verdict = t->u.kernel.target->target(skb, &acpar);
417 tb_comefrom = 0xeeeeeeec;
418#endif
419 verdict = t->u.kernel.target->target(skb, &tgpar);
420#ifdef CONFIG_NETFILTER_DEBUG
421 if (tb_comefrom != 0xeeeeeeec && verdict == IPT_CONTINUE) {
422 printk("Target %s reentered!\n",
423 t->u.kernel.target->name);
424 verdict = NF_DROP;
425 }
426 tb_comefrom = 0x57acc001;
427#endif
428 /* Target might have changed stuff. */ 422 /* Target might have changed stuff. */
429 ip = ip_hdr(skb); 423 ip = ip_hdr(skb);
430 datalen = skb->len - ip->ihl * 4; 424 if (verdict == XT_CONTINUE)
431
432 if (verdict == IPT_CONTINUE)
433 e = ipt_next_entry(e); 425 e = ipt_next_entry(e);
434 else 426 else
435 /* Verdict */ 427 /* Verdict */
436 break; 428 break;
437 } while (!hotdrop); 429 } while (!acpar.hotdrop);
438 xt_info_rdunlock_bh(); 430 xt_info_rdunlock_bh();
439 431 pr_debug("Exiting %s; resetting sp from %u to %u\n",
432 __func__, *stackptr, origptr);
433 *stackptr = origptr;
440#ifdef DEBUG_ALLOW_ALL 434#ifdef DEBUG_ALLOW_ALL
441 return NF_ACCEPT; 435 return NF_ACCEPT;
442#else 436#else
443 if (hotdrop) 437 if (acpar.hotdrop)
444 return NF_DROP; 438 return NF_DROP;
445 else return verdict; 439 else return verdict;
446#endif 440#endif
447
448#undef tb_comefrom
449} 441}
450 442
451/* Figures out from what hook each rule can be called: returns 0 if 443/* Figures out from what hook each rule can be called: returns 0 if
452 there are loops. Puts hook bitmask in comefrom. */ 444 there are loops. Puts hook bitmask in comefrom. */
453static int 445static int
454mark_source_chains(struct xt_table_info *newinfo, 446mark_source_chains(const struct xt_table_info *newinfo,
455 unsigned int valid_hooks, void *entry0) 447 unsigned int valid_hooks, void *entry0)
456{ 448{
457 unsigned int hook; 449 unsigned int hook;
@@ -469,27 +461,27 @@ mark_source_chains(struct xt_table_info *newinfo,
469 e->counters.pcnt = pos; 461 e->counters.pcnt = pos;
470 462
471 for (;;) { 463 for (;;) {
472 struct ipt_standard_target *t 464 const struct xt_standard_target *t
473 = (void *)ipt_get_target(e); 465 = (void *)ipt_get_target_c(e);
474 int visited = e->comefrom & (1 << hook); 466 int visited = e->comefrom & (1 << hook);
475 467
476 if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { 468 if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
477 printk("iptables: loop hook %u pos %u %08X.\n", 469 pr_err("iptables: loop hook %u pos %u %08X.\n",
478 hook, pos, e->comefrom); 470 hook, pos, e->comefrom);
479 return 0; 471 return 0;
480 } 472 }
481 e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); 473 e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
482 474
483 /* Unconditional return/END. */ 475 /* Unconditional return/END. */
484 if ((e->target_offset == sizeof(struct ipt_entry) 476 if ((e->target_offset == sizeof(struct ipt_entry) &&
485 && (strcmp(t->target.u.user.name, 477 (strcmp(t->target.u.user.name,
486 IPT_STANDARD_TARGET) == 0) 478 XT_STANDARD_TARGET) == 0) &&
487 && t->verdict < 0 479 t->verdict < 0 && unconditional(&e->ip)) ||
488 && unconditional(&e->ip)) || visited) { 480 visited) {
489 unsigned int oldpos, size; 481 unsigned int oldpos, size;
490 482
491 if ((strcmp(t->target.u.user.name, 483 if ((strcmp(t->target.u.user.name,
492 IPT_STANDARD_TARGET) == 0) && 484 XT_STANDARD_TARGET) == 0) &&
493 t->verdict < -NF_MAX_VERDICT - 1) { 485 t->verdict < -NF_MAX_VERDICT - 1) {
494 duprintf("mark_source_chains: bad " 486 duprintf("mark_source_chains: bad "
495 "negative verdict (%i)\n", 487 "negative verdict (%i)\n",
@@ -532,8 +524,8 @@ mark_source_chains(struct xt_table_info *newinfo,
532 int newpos = t->verdict; 524 int newpos = t->verdict;
533 525
534 if (strcmp(t->target.u.user.name, 526 if (strcmp(t->target.u.user.name,
535 IPT_STANDARD_TARGET) == 0 527 XT_STANDARD_TARGET) == 0 &&
536 && newpos >= 0) { 528 newpos >= 0) {
537 if (newpos > newinfo->size - 529 if (newpos > newinfo->size -
538 sizeof(struct ipt_entry)) { 530 sizeof(struct ipt_entry)) {
539 duprintf("mark_source_chains: " 531 duprintf("mark_source_chains: "
@@ -560,38 +552,34 @@ mark_source_chains(struct xt_table_info *newinfo,
560 return 1; 552 return 1;
561} 553}
562 554
563static int 555static void cleanup_match(struct xt_entry_match *m, struct net *net)
564cleanup_match(struct ipt_entry_match *m, unsigned int *i)
565{ 556{
566 struct xt_mtdtor_param par; 557 struct xt_mtdtor_param par;
567 558
568 if (i && (*i)-- == 0) 559 par.net = net;
569 return 1;
570
571 par.match = m->u.kernel.match; 560 par.match = m->u.kernel.match;
572 par.matchinfo = m->data; 561 par.matchinfo = m->data;
573 par.family = NFPROTO_IPV4; 562 par.family = NFPROTO_IPV4;
574 if (par.match->destroy != NULL) 563 if (par.match->destroy != NULL)
575 par.match->destroy(&par); 564 par.match->destroy(&par);
576 module_put(par.match->me); 565 module_put(par.match->me);
577 return 0;
578} 566}
579 567
580static int 568static int
581check_entry(struct ipt_entry *e, const char *name) 569check_entry(const struct ipt_entry *e, const char *name)
582{ 570{
583 struct ipt_entry_target *t; 571 const struct xt_entry_target *t;
584 572
585 if (!ip_checkentry(&e->ip)) { 573 if (!ip_checkentry(&e->ip)) {
586 duprintf("ip_tables: ip check failed %p %s.\n", e, name); 574 duprintf("ip check failed %p %s.\n", e, par->match->name);
587 return -EINVAL; 575 return -EINVAL;
588 } 576 }
589 577
590 if (e->target_offset + sizeof(struct ipt_entry_target) > 578 if (e->target_offset + sizeof(struct xt_entry_target) >
591 e->next_offset) 579 e->next_offset)
592 return -EINVAL; 580 return -EINVAL;
593 581
594 t = ipt_get_target(e); 582 t = ipt_get_target_c(e);
595 if (e->target_offset + t->u.target_size > e->next_offset) 583 if (e->target_offset + t->u.target_size > e->next_offset)
596 return -EINVAL; 584 return -EINVAL;
597 585
@@ -599,8 +587,7 @@ check_entry(struct ipt_entry *e, const char *name)
599} 587}
600 588
601static int 589static int
602check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, 590check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
603 unsigned int *i)
604{ 591{
605 const struct ipt_ip *ip = par->entryinfo; 592 const struct ipt_ip *ip = par->entryinfo;
606 int ret; 593 int ret;
@@ -611,31 +598,27 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
611 ret = xt_check_match(par, m->u.match_size - sizeof(*m), 598 ret = xt_check_match(par, m->u.match_size - sizeof(*m),
612 ip->proto, ip->invflags & IPT_INV_PROTO); 599 ip->proto, ip->invflags & IPT_INV_PROTO);
613 if (ret < 0) { 600 if (ret < 0) {
614 duprintf("ip_tables: check failed for `%s'.\n", 601 duprintf("check failed for `%s'.\n", par->match->name);
615 par.match->name);
616 return ret; 602 return ret;
617 } 603 }
618 ++*i;
619 return 0; 604 return 0;
620} 605}
621 606
622static int 607static int
623find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, 608find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
624 unsigned int *i)
625{ 609{
626 struct xt_match *match; 610 struct xt_match *match;
627 int ret; 611 int ret;
628 612
629 match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, 613 match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
630 m->u.user.revision), 614 m->u.user.revision);
631 "ipt_%s", m->u.user.name); 615 if (IS_ERR(match)) {
632 if (IS_ERR(match) || !match) {
633 duprintf("find_check_match: `%s' not found\n", m->u.user.name); 616 duprintf("find_check_match: `%s' not found\n", m->u.user.name);
634 return match ? PTR_ERR(match) : -ENOENT; 617 return PTR_ERR(match);
635 } 618 }
636 m->u.kernel.match = match; 619 m->u.kernel.match = match;
637 620
638 ret = check_match(m, par, i); 621 ret = check_match(m, par);
639 if (ret) 622 if (ret)
640 goto err; 623 goto err;
641 624
@@ -645,10 +628,11 @@ err:
645 return ret; 628 return ret;
646} 629}
647 630
648static int check_target(struct ipt_entry *e, const char *name) 631static int check_target(struct ipt_entry *e, struct net *net, const char *name)
649{ 632{
650 struct ipt_entry_target *t = ipt_get_target(e); 633 struct xt_entry_target *t = ipt_get_target(e);
651 struct xt_tgchk_param par = { 634 struct xt_tgchk_param par = {
635 .net = net,
652 .table = name, 636 .table = name,
653 .entryinfo = e, 637 .entryinfo = e,
654 .target = t->u.kernel.target, 638 .target = t->u.kernel.target,
@@ -661,7 +645,7 @@ static int check_target(struct ipt_entry *e, const char *name)
661 ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 645 ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
662 e->ip.proto, e->ip.invflags & IPT_INV_PROTO); 646 e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
663 if (ret < 0) { 647 if (ret < 0) {
664 duprintf("ip_tables: check failed for `%s'.\n", 648 duprintf("check failed for `%s'.\n",
665 t->u.kernel.target->name); 649 t->u.kernel.target->name);
666 return ret; 650 return ret;
667 } 651 }
@@ -669,72 +653,92 @@ static int check_target(struct ipt_entry *e, const char *name)
669} 653}
670 654
671static int 655static int
672find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, 656find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
673 unsigned int *i) 657 unsigned int size)
674{ 658{
675 struct ipt_entry_target *t; 659 struct xt_entry_target *t;
676 struct xt_target *target; 660 struct xt_target *target;
677 int ret; 661 int ret;
678 unsigned int j; 662 unsigned int j;
679 struct xt_mtchk_param mtpar; 663 struct xt_mtchk_param mtpar;
664 struct xt_entry_match *ematch;
680 665
681 ret = check_entry(e, name); 666 ret = check_entry(e, name);
682 if (ret) 667 if (ret)
683 return ret; 668 return ret;
684 669
685 j = 0; 670 j = 0;
671 mtpar.net = net;
686 mtpar.table = name; 672 mtpar.table = name;
687 mtpar.entryinfo = &e->ip; 673 mtpar.entryinfo = &e->ip;
688 mtpar.hook_mask = e->comefrom; 674 mtpar.hook_mask = e->comefrom;
689 mtpar.family = NFPROTO_IPV4; 675 mtpar.family = NFPROTO_IPV4;
690 ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); 676 xt_ematch_foreach(ematch, e) {
691 if (ret != 0) 677 ret = find_check_match(ematch, &mtpar);
692 goto cleanup_matches; 678 if (ret != 0)
679 goto cleanup_matches;
680 ++j;
681 }
693 682
694 t = ipt_get_target(e); 683 t = ipt_get_target(e);
695 target = try_then_request_module(xt_find_target(AF_INET, 684 target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
696 t->u.user.name, 685 t->u.user.revision);
697 t->u.user.revision), 686 if (IS_ERR(target)) {
698 "ipt_%s", t->u.user.name);
699 if (IS_ERR(target) || !target) {
700 duprintf("find_check_entry: `%s' not found\n", t->u.user.name); 687 duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
701 ret = target ? PTR_ERR(target) : -ENOENT; 688 ret = PTR_ERR(target);
702 goto cleanup_matches; 689 goto cleanup_matches;
703 } 690 }
704 t->u.kernel.target = target; 691 t->u.kernel.target = target;
705 692
706 ret = check_target(e, name); 693 ret = check_target(e, net, name);
707 if (ret) 694 if (ret)
708 goto err; 695 goto err;
709
710 (*i)++;
711 return 0; 696 return 0;
712 err: 697 err:
713 module_put(t->u.kernel.target->me); 698 module_put(t->u.kernel.target->me);
714 cleanup_matches: 699 cleanup_matches:
715 IPT_MATCH_ITERATE(e, cleanup_match, &j); 700 xt_ematch_foreach(ematch, e) {
701 if (j-- == 0)
702 break;
703 cleanup_match(ematch, net);
704 }
716 return ret; 705 return ret;
717} 706}
718 707
708static bool check_underflow(const struct ipt_entry *e)
709{
710 const struct xt_entry_target *t;
711 unsigned int verdict;
712
713 if (!unconditional(&e->ip))
714 return false;
715 t = ipt_get_target_c(e);
716 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
717 return false;
718 verdict = ((struct xt_standard_target *)t)->verdict;
719 verdict = -verdict - 1;
720 return verdict == NF_DROP || verdict == NF_ACCEPT;
721}
722
719static int 723static int
720check_entry_size_and_hooks(struct ipt_entry *e, 724check_entry_size_and_hooks(struct ipt_entry *e,
721 struct xt_table_info *newinfo, 725 struct xt_table_info *newinfo,
722 unsigned char *base, 726 const unsigned char *base,
723 unsigned char *limit, 727 const unsigned char *limit,
724 const unsigned int *hook_entries, 728 const unsigned int *hook_entries,
725 const unsigned int *underflows, 729 const unsigned int *underflows,
726 unsigned int *i) 730 unsigned int valid_hooks)
727{ 731{
728 unsigned int h; 732 unsigned int h;
729 733
730 if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 734 if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
731 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { 735 (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
732 duprintf("Bad offset %p\n", e); 736 duprintf("Bad offset %p\n", e);
733 return -EINVAL; 737 return -EINVAL;
734 } 738 }
735 739
736 if (e->next_offset 740 if (e->next_offset
737 < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { 741 < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
738 duprintf("checking: element %p size %u\n", 742 duprintf("checking: element %p size %u\n",
739 e, e->next_offset); 743 e, e->next_offset);
740 return -EINVAL; 744 return -EINVAL;
@@ -742,62 +746,60 @@ check_entry_size_and_hooks(struct ipt_entry *e,
742 746
743 /* Check hooks & underflows */ 747 /* Check hooks & underflows */
744 for (h = 0; h < NF_INET_NUMHOOKS; h++) { 748 for (h = 0; h < NF_INET_NUMHOOKS; h++) {
749 if (!(valid_hooks & (1 << h)))
750 continue;
745 if ((unsigned char *)e - base == hook_entries[h]) 751 if ((unsigned char *)e - base == hook_entries[h])
746 newinfo->hook_entry[h] = hook_entries[h]; 752 newinfo->hook_entry[h] = hook_entries[h];
747 if ((unsigned char *)e - base == underflows[h]) 753 if ((unsigned char *)e - base == underflows[h]) {
754 if (!check_underflow(e)) {
755 pr_err("Underflows must be unconditional and "
756 "use the STANDARD target with "
757 "ACCEPT/DROP\n");
758 return -EINVAL;
759 }
748 newinfo->underflow[h] = underflows[h]; 760 newinfo->underflow[h] = underflows[h];
761 }
749 } 762 }
750 763
751 /* FIXME: underflows must be unconditional, standard verdicts
752 < 0 (not IPT_RETURN). --RR */
753
754 /* Clear counters and comefrom */ 764 /* Clear counters and comefrom */
755 e->counters = ((struct xt_counters) { 0, 0 }); 765 e->counters = ((struct xt_counters) { 0, 0 });
756 e->comefrom = 0; 766 e->comefrom = 0;
757
758 (*i)++;
759 return 0; 767 return 0;
760} 768}
761 769
762static int 770static void
763cleanup_entry(struct ipt_entry *e, unsigned int *i) 771cleanup_entry(struct ipt_entry *e, struct net *net)
764{ 772{
765 struct xt_tgdtor_param par; 773 struct xt_tgdtor_param par;
766 struct ipt_entry_target *t; 774 struct xt_entry_target *t;
767 775 struct xt_entry_match *ematch;
768 if (i && (*i)-- == 0)
769 return 1;
770 776
771 /* Cleanup all matches */ 777 /* Cleanup all matches */
772 IPT_MATCH_ITERATE(e, cleanup_match, NULL); 778 xt_ematch_foreach(ematch, e)
779 cleanup_match(ematch, net);
773 t = ipt_get_target(e); 780 t = ipt_get_target(e);
774 781
782 par.net = net;
775 par.target = t->u.kernel.target; 783 par.target = t->u.kernel.target;
776 par.targinfo = t->data; 784 par.targinfo = t->data;
777 par.family = NFPROTO_IPV4; 785 par.family = NFPROTO_IPV4;
778 if (par.target->destroy != NULL) 786 if (par.target->destroy != NULL)
779 par.target->destroy(&par); 787 par.target->destroy(&par);
780 module_put(par.target->me); 788 module_put(par.target->me);
781 return 0;
782} 789}
783 790
784/* Checks and translates the user-supplied table segment (held in 791/* Checks and translates the user-supplied table segment (held in
785 newinfo) */ 792 newinfo) */
786static int 793static int
787translate_table(const char *name, 794translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
788 unsigned int valid_hooks, 795 const struct ipt_replace *repl)
789 struct xt_table_info *newinfo,
790 void *entry0,
791 unsigned int size,
792 unsigned int number,
793 const unsigned int *hook_entries,
794 const unsigned int *underflows)
795{ 796{
797 struct ipt_entry *iter;
796 unsigned int i; 798 unsigned int i;
797 int ret; 799 int ret = 0;
798 800
799 newinfo->size = size; 801 newinfo->size = repl->size;
800 newinfo->number = number; 802 newinfo->number = repl->num_entries;
801 803
802 /* Init all hooks to impossible value. */ 804 /* Init all hooks to impossible value. */
803 for (i = 0; i < NF_INET_NUMHOOKS; i++) { 805 for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -808,49 +810,61 @@ translate_table(const char *name,
808 duprintf("translate_table: size %u\n", newinfo->size); 810 duprintf("translate_table: size %u\n", newinfo->size);
809 i = 0; 811 i = 0;
810 /* Walk through entries, checking offsets. */ 812 /* Walk through entries, checking offsets. */
811 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, 813 xt_entry_foreach(iter, entry0, newinfo->size) {
812 check_entry_size_and_hooks, 814 ret = check_entry_size_and_hooks(iter, newinfo, entry0,
813 newinfo, 815 entry0 + repl->size,
814 entry0, 816 repl->hook_entry,
815 entry0 + size, 817 repl->underflow,
816 hook_entries, underflows, &i); 818 repl->valid_hooks);
817 if (ret != 0) 819 if (ret != 0)
818 return ret; 820 return ret;
821 ++i;
822 if (strcmp(ipt_get_target(iter)->u.user.name,
823 XT_ERROR_TARGET) == 0)
824 ++newinfo->stacksize;
825 }
819 826
820 if (i != number) { 827 if (i != repl->num_entries) {
821 duprintf("translate_table: %u not %u entries\n", 828 duprintf("translate_table: %u not %u entries\n",
822 i, number); 829 i, repl->num_entries);
823 return -EINVAL; 830 return -EINVAL;
824 } 831 }
825 832
826 /* Check hooks all assigned */ 833 /* Check hooks all assigned */
827 for (i = 0; i < NF_INET_NUMHOOKS; i++) { 834 for (i = 0; i < NF_INET_NUMHOOKS; i++) {
828 /* Only hooks which are valid */ 835 /* Only hooks which are valid */
829 if (!(valid_hooks & (1 << i))) 836 if (!(repl->valid_hooks & (1 << i)))
830 continue; 837 continue;
831 if (newinfo->hook_entry[i] == 0xFFFFFFFF) { 838 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
832 duprintf("Invalid hook entry %u %u\n", 839 duprintf("Invalid hook entry %u %u\n",
833 i, hook_entries[i]); 840 i, repl->hook_entry[i]);
834 return -EINVAL; 841 return -EINVAL;
835 } 842 }
836 if (newinfo->underflow[i] == 0xFFFFFFFF) { 843 if (newinfo->underflow[i] == 0xFFFFFFFF) {
837 duprintf("Invalid underflow %u %u\n", 844 duprintf("Invalid underflow %u %u\n",
838 i, underflows[i]); 845 i, repl->underflow[i]);
839 return -EINVAL; 846 return -EINVAL;
840 } 847 }
841 } 848 }
842 849
843 if (!mark_source_chains(newinfo, valid_hooks, entry0)) 850 if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
844 return -ELOOP; 851 return -ELOOP;
845 852
846 /* Finally, each sanity check must pass */ 853 /* Finally, each sanity check must pass */
847 i = 0; 854 i = 0;
848 ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, 855 xt_entry_foreach(iter, entry0, newinfo->size) {
849 find_check_entry, name, size, &i); 856 ret = find_check_entry(iter, net, repl->name, repl->size);
857 if (ret != 0)
858 break;
859 ++i;
860 }
850 861
851 if (ret != 0) { 862 if (ret != 0) {
852 IPT_ENTRY_ITERATE(entry0, newinfo->size, 863 xt_entry_foreach(iter, entry0, newinfo->size) {
853 cleanup_entry, &i); 864 if (i-- == 0)
865 break;
866 cleanup_entry(iter, net);
867 }
854 return ret; 868 return ret;
855 } 869 }
856 870
@@ -863,80 +877,45 @@ translate_table(const char *name,
863 return ret; 877 return ret;
864} 878}
865 879
866/* Gets counters. */
867static inline int
868add_entry_to_counter(const struct ipt_entry *e,
869 struct xt_counters total[],
870 unsigned int *i)
871{
872 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
873
874 (*i)++;
875 return 0;
876}
877
878static inline int
879set_entry_to_counter(const struct ipt_entry *e,
880 struct ipt_counters total[],
881 unsigned int *i)
882{
883 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
884
885 (*i)++;
886 return 0;
887}
888
889static void 880static void
890get_counters(const struct xt_table_info *t, 881get_counters(const struct xt_table_info *t,
891 struct xt_counters counters[]) 882 struct xt_counters counters[])
892{ 883{
884 struct ipt_entry *iter;
893 unsigned int cpu; 885 unsigned int cpu;
894 unsigned int i; 886 unsigned int i;
895 unsigned int curcpu;
896
897 /* Instead of clearing (by a previous call to memset())
898 * the counters and using adds, we set the counters
899 * with data used by 'current' CPU.
900 *
901 * Bottom half has to be disabled to prevent deadlock
902 * if new softirq were to run and call ipt_do_table
903 */
904 local_bh_disable();
905 curcpu = smp_processor_id();
906
907 i = 0;
908 IPT_ENTRY_ITERATE(t->entries[curcpu],
909 t->size,
910 set_entry_to_counter,
911 counters,
912 &i);
913 887
914 for_each_possible_cpu(cpu) { 888 for_each_possible_cpu(cpu) {
915 if (cpu == curcpu) 889 seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
916 continue; 890
917 i = 0; 891 i = 0;
918 xt_info_wrlock(cpu); 892 xt_entry_foreach(iter, t->entries[cpu], t->size) {
919 IPT_ENTRY_ITERATE(t->entries[cpu], 893 u64 bcnt, pcnt;
920 t->size, 894 unsigned int start;
921 add_entry_to_counter, 895
922 counters, 896 do {
923 &i); 897 start = read_seqbegin(lock);
924 xt_info_wrunlock(cpu); 898 bcnt = iter->counters.bcnt;
899 pcnt = iter->counters.pcnt;
900 } while (read_seqretry(lock, start));
901
902 ADD_COUNTER(counters[i], bcnt, pcnt);
903 ++i; /* macro does multi eval of i */
904 }
925 } 905 }
926 local_bh_enable();
927} 906}
928 907
929static struct xt_counters * alloc_counters(struct xt_table *table) 908static struct xt_counters *alloc_counters(const struct xt_table *table)
930{ 909{
931 unsigned int countersize; 910 unsigned int countersize;
932 struct xt_counters *counters; 911 struct xt_counters *counters;
933 struct xt_table_info *private = table->private; 912 const struct xt_table_info *private = table->private;
934 913
935 /* We need atomic snapshot of counters: rest doesn't change 914 /* We need atomic snapshot of counters: rest doesn't change
936 (other than comefrom, which userspace doesn't care 915 (other than comefrom, which userspace doesn't care
937 about). */ 916 about). */
938 countersize = sizeof(struct xt_counters) * private->number; 917 countersize = sizeof(struct xt_counters) * private->number;
939 counters = vmalloc_node(countersize, numa_node_id()); 918 counters = vzalloc(countersize);
940 919
941 if (counters == NULL) 920 if (counters == NULL)
942 return ERR_PTR(-ENOMEM); 921 return ERR_PTR(-ENOMEM);
@@ -948,11 +927,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
948 927
949static int 928static int
950copy_entries_to_user(unsigned int total_size, 929copy_entries_to_user(unsigned int total_size,
951 struct xt_table *table, 930 const struct xt_table *table,
952 void __user *userptr) 931 void __user *userptr)
953{ 932{
954 unsigned int off, num; 933 unsigned int off, num;
955 struct ipt_entry *e; 934 const struct ipt_entry *e;
956 struct xt_counters *counters; 935 struct xt_counters *counters;
957 const struct xt_table_info *private = table->private; 936 const struct xt_table_info *private = table->private;
958 int ret = 0; 937 int ret = 0;
@@ -976,8 +955,8 @@ copy_entries_to_user(unsigned int total_size,
976 /* ... then go back and fix counters and names */ 955 /* ... then go back and fix counters and names */
977 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 956 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
978 unsigned int i; 957 unsigned int i;
979 const struct ipt_entry_match *m; 958 const struct xt_entry_match *m;
980 const struct ipt_entry_target *t; 959 const struct xt_entry_target *t;
981 960
982 e = (struct ipt_entry *)(loc_cpu_entry + off); 961 e = (struct ipt_entry *)(loc_cpu_entry + off);
983 if (copy_to_user(userptr + off 962 if (copy_to_user(userptr + off
@@ -994,7 +973,7 @@ copy_entries_to_user(unsigned int total_size,
994 m = (void *)e + i; 973 m = (void *)e + i;
995 974
996 if (copy_to_user(userptr + off + i 975 if (copy_to_user(userptr + off + i
997 + offsetof(struct ipt_entry_match, 976 + offsetof(struct xt_entry_match,
998 u.user.name), 977 u.user.name),
999 m->u.kernel.match->name, 978 m->u.kernel.match->name,
1000 strlen(m->u.kernel.match->name)+1) 979 strlen(m->u.kernel.match->name)+1)
@@ -1004,9 +983,9 @@ copy_entries_to_user(unsigned int total_size,
1004 } 983 }
1005 } 984 }
1006 985
1007 t = ipt_get_target(e); 986 t = ipt_get_target_c(e);
1008 if (copy_to_user(userptr + off + e->target_offset 987 if (copy_to_user(userptr + off + e->target_offset
1009 + offsetof(struct ipt_entry_target, 988 + offsetof(struct xt_entry_target,
1010 u.user.name), 989 u.user.name),
1011 t->u.kernel.target->name, 990 t->u.kernel.target->name,
1012 strlen(t->u.kernel.target->name)+1) != 0) { 991 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1021,7 +1000,7 @@ copy_entries_to_user(unsigned int total_size,
1021} 1000}
1022 1001
1023#ifdef CONFIG_COMPAT 1002#ifdef CONFIG_COMPAT
1024static void compat_standard_from_user(void *dst, void *src) 1003static void compat_standard_from_user(void *dst, const void *src)
1025{ 1004{
1026 int v = *(compat_int_t *)src; 1005 int v = *(compat_int_t *)src;
1027 1006
@@ -1030,7 +1009,7 @@ static void compat_standard_from_user(void *dst, void *src)
1030 memcpy(dst, &v, sizeof(v)); 1009 memcpy(dst, &v, sizeof(v));
1031} 1010}
1032 1011
1033static int compat_standard_to_user(void __user *dst, void *src) 1012static int compat_standard_to_user(void __user *dst, const void *src)
1034{ 1013{
1035 compat_int_t cv = *(int *)src; 1014 compat_int_t cv = *(int *)src;
1036 1015
@@ -1039,25 +1018,20 @@ static int compat_standard_to_user(void __user *dst, void *src)
1039 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; 1018 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
1040} 1019}
1041 1020
1042static inline int 1021static int compat_calc_entry(const struct ipt_entry *e,
1043compat_calc_match(struct ipt_entry_match *m, int *size)
1044{
1045 *size += xt_compat_match_offset(m->u.kernel.match);
1046 return 0;
1047}
1048
1049static int compat_calc_entry(struct ipt_entry *e,
1050 const struct xt_table_info *info, 1022 const struct xt_table_info *info,
1051 void *base, struct xt_table_info *newinfo) 1023 const void *base, struct xt_table_info *newinfo)
1052{ 1024{
1053 struct ipt_entry_target *t; 1025 const struct xt_entry_match *ematch;
1026 const struct xt_entry_target *t;
1054 unsigned int entry_offset; 1027 unsigned int entry_offset;
1055 int off, i, ret; 1028 int off, i, ret;
1056 1029
1057 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1030 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1058 entry_offset = (void *)e - base; 1031 entry_offset = (void *)e - base;
1059 IPT_MATCH_ITERATE(e, compat_calc_match, &off); 1032 xt_ematch_foreach(ematch, e)
1060 t = ipt_get_target(e); 1033 off += xt_compat_match_offset(ematch->u.kernel.match);
1034 t = ipt_get_target_c(e);
1061 off += xt_compat_target_offset(t->u.kernel.target); 1035 off += xt_compat_target_offset(t->u.kernel.target);
1062 newinfo->size -= off; 1036 newinfo->size -= off;
1063 ret = xt_compat_add_offset(AF_INET, entry_offset, off); 1037 ret = xt_compat_add_offset(AF_INET, entry_offset, off);
@@ -1078,7 +1052,9 @@ static int compat_calc_entry(struct ipt_entry *e,
1078static int compat_table_info(const struct xt_table_info *info, 1052static int compat_table_info(const struct xt_table_info *info,
1079 struct xt_table_info *newinfo) 1053 struct xt_table_info *newinfo)
1080{ 1054{
1055 struct ipt_entry *iter;
1081 void *loc_cpu_entry; 1056 void *loc_cpu_entry;
1057 int ret;
1082 1058
1083 if (!newinfo || !info) 1059 if (!newinfo || !info)
1084 return -EINVAL; 1060 return -EINVAL;
@@ -1087,15 +1063,19 @@ static int compat_table_info(const struct xt_table_info *info,
1087 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1088 newinfo->initial_entries = 0; 1064 newinfo->initial_entries = 0;
1089 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1065 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1090 return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, 1066 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1091 compat_calc_entry, info, loc_cpu_entry, 1067 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1092 newinfo); 1068 if (ret != 0)
1069 return ret;
1070 }
1071 return 0;
1093} 1072}
1094#endif 1073#endif
1095 1074
1096static int get_info(struct net *net, void __user *user, int *len, int compat) 1075static int get_info(struct net *net, void __user *user,
1076 const int *len, int compat)
1097{ 1077{
1098 char name[IPT_TABLE_MAXNAMELEN]; 1078 char name[XT_TABLE_MAXNAMELEN];
1099 struct xt_table *t; 1079 struct xt_table *t;
1100 int ret; 1080 int ret;
1101 1081
@@ -1108,7 +1088,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
1108 if (copy_from_user(name, user, sizeof(name)) != 0) 1088 if (copy_from_user(name, user, sizeof(name)) != 0)
1109 return -EFAULT; 1089 return -EFAULT;
1110 1090
1111 name[IPT_TABLE_MAXNAMELEN-1] = '\0'; 1091 name[XT_TABLE_MAXNAMELEN-1] = '\0';
1112#ifdef CONFIG_COMPAT 1092#ifdef CONFIG_COMPAT
1113 if (compat) 1093 if (compat)
1114 xt_compat_lock(AF_INET); 1094 xt_compat_lock(AF_INET);
@@ -1118,15 +1098,16 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
1118 if (t && !IS_ERR(t)) { 1098 if (t && !IS_ERR(t)) {
1119 struct ipt_getinfo info; 1099 struct ipt_getinfo info;
1120 const struct xt_table_info *private = t->private; 1100 const struct xt_table_info *private = t->private;
1121
1122#ifdef CONFIG_COMPAT 1101#ifdef CONFIG_COMPAT
1102 struct xt_table_info tmp;
1103
1123 if (compat) { 1104 if (compat) {
1124 struct xt_table_info tmp;
1125 ret = compat_table_info(private, &tmp); 1105 ret = compat_table_info(private, &tmp);
1126 xt_compat_flush_offsets(AF_INET); 1106 xt_compat_flush_offsets(AF_INET);
1127 private = &tmp; 1107 private = &tmp;
1128 } 1108 }
1129#endif 1109#endif
1110 memset(&info, 0, sizeof(info));
1130 info.valid_hooks = t->valid_hooks; 1111 info.valid_hooks = t->valid_hooks;
1131 memcpy(info.hook_entry, private->hook_entry, 1112 memcpy(info.hook_entry, private->hook_entry,
1132 sizeof(info.hook_entry)); 1113 sizeof(info.hook_entry));
@@ -1153,7 +1134,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
1153} 1134}
1154 1135
1155static int 1136static int
1156get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len) 1137get_entries(struct net *net, struct ipt_get_entries __user *uptr,
1138 const int *len)
1157{ 1139{
1158 int ret; 1140 int ret;
1159 struct ipt_get_entries get; 1141 struct ipt_get_entries get;
@@ -1201,9 +1183,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1201 struct xt_table_info *oldinfo; 1183 struct xt_table_info *oldinfo;
1202 struct xt_counters *counters; 1184 struct xt_counters *counters;
1203 void *loc_cpu_old_entry; 1185 void *loc_cpu_old_entry;
1186 struct ipt_entry *iter;
1204 1187
1205 ret = 0; 1188 ret = 0;
1206 counters = vmalloc(num_counters * sizeof(struct xt_counters)); 1189 counters = vzalloc(num_counters * sizeof(struct xt_counters));
1207 if (!counters) { 1190 if (!counters) {
1208 ret = -ENOMEM; 1191 ret = -ENOMEM;
1209 goto out; 1192 goto out;
@@ -1243,8 +1226,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1243 1226
1244 /* Decrease module usage counts and free resource */ 1227 /* Decrease module usage counts and free resource */
1245 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1228 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1246 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1229 xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
1247 NULL); 1230 cleanup_entry(iter, net);
1231
1248 xt_free_table_info(oldinfo); 1232 xt_free_table_info(oldinfo);
1249 if (copy_to_user(counters_ptr, counters, 1233 if (copy_to_user(counters_ptr, counters,
1250 sizeof(struct xt_counters) * num_counters) != 0) 1234 sizeof(struct xt_counters) * num_counters) != 0)
@@ -1263,12 +1247,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1263} 1247}
1264 1248
1265static int 1249static int
1266do_replace(struct net *net, void __user *user, unsigned int len) 1250do_replace(struct net *net, const void __user *user, unsigned int len)
1267{ 1251{
1268 int ret; 1252 int ret;
1269 struct ipt_replace tmp; 1253 struct ipt_replace tmp;
1270 struct xt_table_info *newinfo; 1254 struct xt_table_info *newinfo;
1271 void *loc_cpu_entry; 1255 void *loc_cpu_entry;
1256 struct ipt_entry *iter;
1272 1257
1273 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1258 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1274 return -EFAULT; 1259 return -EFAULT;
@@ -1289,13 +1274,11 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1289 goto free_newinfo; 1274 goto free_newinfo;
1290 } 1275 }
1291 1276
1292 ret = translate_table(tmp.name, tmp.valid_hooks, 1277 ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
1293 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1294 tmp.hook_entry, tmp.underflow);
1295 if (ret != 0) 1278 if (ret != 0)
1296 goto free_newinfo; 1279 goto free_newinfo;
1297 1280
1298 duprintf("ip_tables: Translated table\n"); 1281 duprintf("Translated table\n");
1299 1282
1300 ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, 1283 ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
1301 tmp.num_counters, tmp.counters); 1284 tmp.num_counters, tmp.counters);
@@ -1304,27 +1287,16 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1304 return 0; 1287 return 0;
1305 1288
1306 free_newinfo_untrans: 1289 free_newinfo_untrans:
1307 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1290 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1291 cleanup_entry(iter, net);
1308 free_newinfo: 1292 free_newinfo:
1309 xt_free_table_info(newinfo); 1293 xt_free_table_info(newinfo);
1310 return ret; 1294 return ret;
1311} 1295}
1312 1296
1313/* We're lazy, and add to the first CPU; overflow works its fey magic
1314 * and everything is OK. */
1315static int
1316add_counter_to_entry(struct ipt_entry *e,
1317 const struct xt_counters addme[],
1318 unsigned int *i)
1319{
1320 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1321
1322 (*i)++;
1323 return 0;
1324}
1325
1326static int 1297static int
1327do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) 1298do_add_counters(struct net *net, const void __user *user,
1299 unsigned int len, int compat)
1328{ 1300{
1329 unsigned int i, curcpu; 1301 unsigned int i, curcpu;
1330 struct xt_counters_info tmp; 1302 struct xt_counters_info tmp;
@@ -1337,6 +1309,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1337 const struct xt_table_info *private; 1309 const struct xt_table_info *private;
1338 int ret = 0; 1310 int ret = 0;
1339 void *loc_cpu_entry; 1311 void *loc_cpu_entry;
1312 struct ipt_entry *iter;
1340#ifdef CONFIG_COMPAT 1313#ifdef CONFIG_COMPAT
1341 struct compat_xt_counters_info compat_tmp; 1314 struct compat_xt_counters_info compat_tmp;
1342 1315
@@ -1367,7 +1340,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1367 if (len != size + num_counters * sizeof(struct xt_counters)) 1340 if (len != size + num_counters * sizeof(struct xt_counters))
1368 return -EINVAL; 1341 return -EINVAL;
1369 1342
1370 paddc = vmalloc_node(len - size, numa_node_id()); 1343 paddc = vmalloc(len - size);
1371 if (!paddc) 1344 if (!paddc)
1372 return -ENOMEM; 1345 return -ENOMEM;
1373 1346
@@ -1394,11 +1367,10 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1394 curcpu = smp_processor_id(); 1367 curcpu = smp_processor_id();
1395 loc_cpu_entry = private->entries[curcpu]; 1368 loc_cpu_entry = private->entries[curcpu];
1396 xt_info_wrlock(curcpu); 1369 xt_info_wrlock(curcpu);
1397 IPT_ENTRY_ITERATE(loc_cpu_entry, 1370 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1398 private->size, 1371 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1399 add_counter_to_entry, 1372 ++i;
1400 paddc, 1373 }
1401 &i);
1402 xt_info_wrunlock(curcpu); 1374 xt_info_wrunlock(curcpu);
1403 unlock_up_free: 1375 unlock_up_free:
1404 local_bh_enable(); 1376 local_bh_enable();
@@ -1412,130 +1384,109 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1412 1384
1413#ifdef CONFIG_COMPAT 1385#ifdef CONFIG_COMPAT
1414struct compat_ipt_replace { 1386struct compat_ipt_replace {
1415 char name[IPT_TABLE_MAXNAMELEN]; 1387 char name[XT_TABLE_MAXNAMELEN];
1416 u32 valid_hooks; 1388 u32 valid_hooks;
1417 u32 num_entries; 1389 u32 num_entries;
1418 u32 size; 1390 u32 size;
1419 u32 hook_entry[NF_INET_NUMHOOKS]; 1391 u32 hook_entry[NF_INET_NUMHOOKS];
1420 u32 underflow[NF_INET_NUMHOOKS]; 1392 u32 underflow[NF_INET_NUMHOOKS];
1421 u32 num_counters; 1393 u32 num_counters;
1422 compat_uptr_t counters; /* struct ipt_counters * */ 1394 compat_uptr_t counters; /* struct xt_counters * */
1423 struct compat_ipt_entry entries[0]; 1395 struct compat_ipt_entry entries[0];
1424}; 1396};
1425 1397
1426static int 1398static int
1427compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, 1399compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1428 unsigned int *size, struct xt_counters *counters, 1400 unsigned int *size, struct xt_counters *counters,
1429 unsigned int *i) 1401 unsigned int i)
1430{ 1402{
1431 struct ipt_entry_target *t; 1403 struct xt_entry_target *t;
1432 struct compat_ipt_entry __user *ce; 1404 struct compat_ipt_entry __user *ce;
1433 u_int16_t target_offset, next_offset; 1405 u_int16_t target_offset, next_offset;
1434 compat_uint_t origsize; 1406 compat_uint_t origsize;
1435 int ret; 1407 const struct xt_entry_match *ematch;
1408 int ret = 0;
1436 1409
1437 ret = -EFAULT;
1438 origsize = *size; 1410 origsize = *size;
1439 ce = (struct compat_ipt_entry __user *)*dstptr; 1411 ce = (struct compat_ipt_entry __user *)*dstptr;
1440 if (copy_to_user(ce, e, sizeof(struct ipt_entry))) 1412 if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
1441 goto out; 1413 copy_to_user(&ce->counters, &counters[i],
1442 1414 sizeof(counters[i])) != 0)
1443 if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) 1415 return -EFAULT;
1444 goto out;
1445 1416
1446 *dstptr += sizeof(struct compat_ipt_entry); 1417 *dstptr += sizeof(struct compat_ipt_entry);
1447 *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1418 *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1448 1419
1449 ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); 1420 xt_ematch_foreach(ematch, e) {
1421 ret = xt_compat_match_to_user(ematch, dstptr, size);
1422 if (ret != 0)
1423 return ret;
1424 }
1450 target_offset = e->target_offset - (origsize - *size); 1425 target_offset = e->target_offset - (origsize - *size);
1451 if (ret)
1452 goto out;
1453 t = ipt_get_target(e); 1426 t = ipt_get_target(e);
1454 ret = xt_compat_target_to_user(t, dstptr, size); 1427 ret = xt_compat_target_to_user(t, dstptr, size);
1455 if (ret) 1428 if (ret)
1456 goto out; 1429 return ret;
1457 ret = -EFAULT;
1458 next_offset = e->next_offset - (origsize - *size); 1430 next_offset = e->next_offset - (origsize - *size);
1459 if (put_user(target_offset, &ce->target_offset)) 1431 if (put_user(target_offset, &ce->target_offset) != 0 ||
1460 goto out; 1432 put_user(next_offset, &ce->next_offset) != 0)
1461 if (put_user(next_offset, &ce->next_offset)) 1433 return -EFAULT;
1462 goto out;
1463
1464 (*i)++;
1465 return 0; 1434 return 0;
1466out:
1467 return ret;
1468} 1435}
1469 1436
1470static int 1437static int
1471compat_find_calc_match(struct ipt_entry_match *m, 1438compat_find_calc_match(struct xt_entry_match *m,
1472 const char *name, 1439 const char *name,
1473 const struct ipt_ip *ip, 1440 const struct ipt_ip *ip,
1474 unsigned int hookmask, 1441 unsigned int hookmask,
1475 int *size, unsigned int *i) 1442 int *size)
1476{ 1443{
1477 struct xt_match *match; 1444 struct xt_match *match;
1478 1445
1479 match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, 1446 match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
1480 m->u.user.revision), 1447 m->u.user.revision);
1481 "ipt_%s", m->u.user.name); 1448 if (IS_ERR(match)) {
1482 if (IS_ERR(match) || !match) {
1483 duprintf("compat_check_calc_match: `%s' not found\n", 1449 duprintf("compat_check_calc_match: `%s' not found\n",
1484 m->u.user.name); 1450 m->u.user.name);
1485 return match ? PTR_ERR(match) : -ENOENT; 1451 return PTR_ERR(match);
1486 } 1452 }
1487 m->u.kernel.match = match; 1453 m->u.kernel.match = match;
1488 *size += xt_compat_match_offset(match); 1454 *size += xt_compat_match_offset(match);
1489
1490 (*i)++;
1491 return 0; 1455 return 0;
1492} 1456}
1493 1457
1494static int 1458static void compat_release_entry(struct compat_ipt_entry *e)
1495compat_release_match(struct ipt_entry_match *m, unsigned int *i)
1496{ 1459{
1497 if (i && (*i)-- == 0) 1460 struct xt_entry_target *t;
1498 return 1; 1461 struct xt_entry_match *ematch;
1499
1500 module_put(m->u.kernel.match->me);
1501 return 0;
1502}
1503
1504static int
1505compat_release_entry(struct compat_ipt_entry *e, unsigned int *i)
1506{
1507 struct ipt_entry_target *t;
1508
1509 if (i && (*i)-- == 0)
1510 return 1;
1511 1462
1512 /* Cleanup all matches */ 1463 /* Cleanup all matches */
1513 COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL); 1464 xt_ematch_foreach(ematch, e)
1465 module_put(ematch->u.kernel.match->me);
1514 t = compat_ipt_get_target(e); 1466 t = compat_ipt_get_target(e);
1515 module_put(t->u.kernel.target->me); 1467 module_put(t->u.kernel.target->me);
1516 return 0;
1517} 1468}
1518 1469
1519static int 1470static int
1520check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, 1471check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1521 struct xt_table_info *newinfo, 1472 struct xt_table_info *newinfo,
1522 unsigned int *size, 1473 unsigned int *size,
1523 unsigned char *base, 1474 const unsigned char *base,
1524 unsigned char *limit, 1475 const unsigned char *limit,
1525 unsigned int *hook_entries, 1476 const unsigned int *hook_entries,
1526 unsigned int *underflows, 1477 const unsigned int *underflows,
1527 unsigned int *i,
1528 const char *name) 1478 const char *name)
1529{ 1479{
1530 struct ipt_entry_target *t; 1480 struct xt_entry_match *ematch;
1481 struct xt_entry_target *t;
1531 struct xt_target *target; 1482 struct xt_target *target;
1532 unsigned int entry_offset; 1483 unsigned int entry_offset;
1533 unsigned int j; 1484 unsigned int j;
1534 int ret, off, h; 1485 int ret, off, h;
1535 1486
1536 duprintf("check_compat_entry_size_and_hooks %p\n", e); 1487 duprintf("check_compat_entry_size_and_hooks %p\n", e);
1537 if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 1488 if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
1538 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { 1489 (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
1539 duprintf("Bad offset %p, limit = %p\n", e, limit); 1490 duprintf("Bad offset %p, limit = %p\n", e, limit);
1540 return -EINVAL; 1491 return -EINVAL;
1541 } 1492 }
@@ -1555,20 +1506,21 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1555 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1506 off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1556 entry_offset = (void *)e - (void *)base; 1507 entry_offset = (void *)e - (void *)base;
1557 j = 0; 1508 j = 0;
1558 ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name, 1509 xt_ematch_foreach(ematch, e) {
1559 &e->ip, e->comefrom, &off, &j); 1510 ret = compat_find_calc_match(ematch, name,
1560 if (ret != 0) 1511 &e->ip, e->comefrom, &off);
1561 goto release_matches; 1512 if (ret != 0)
1513 goto release_matches;
1514 ++j;
1515 }
1562 1516
1563 t = compat_ipt_get_target(e); 1517 t = compat_ipt_get_target(e);
1564 target = try_then_request_module(xt_find_target(AF_INET, 1518 target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
1565 t->u.user.name, 1519 t->u.user.revision);
1566 t->u.user.revision), 1520 if (IS_ERR(target)) {
1567 "ipt_%s", t->u.user.name);
1568 if (IS_ERR(target) || !target) {
1569 duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", 1521 duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
1570 t->u.user.name); 1522 t->u.user.name);
1571 ret = target ? PTR_ERR(target) : -ENOENT; 1523 ret = PTR_ERR(target);
1572 goto release_matches; 1524 goto release_matches;
1573 } 1525 }
1574 t->u.kernel.target = target; 1526 t->u.kernel.target = target;
@@ -1590,14 +1542,16 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1590 /* Clear counters and comefrom */ 1542 /* Clear counters and comefrom */
1591 memset(&e->counters, 0, sizeof(e->counters)); 1543 memset(&e->counters, 0, sizeof(e->counters));
1592 e->comefrom = 0; 1544 e->comefrom = 0;
1593
1594 (*i)++;
1595 return 0; 1545 return 0;
1596 1546
1597out: 1547out:
1598 module_put(t->u.kernel.target->me); 1548 module_put(t->u.kernel.target->me);
1599release_matches: 1549release_matches:
1600 IPT_MATCH_ITERATE(e, compat_release_match, &j); 1550 xt_ematch_foreach(ematch, e) {
1551 if (j-- == 0)
1552 break;
1553 module_put(ematch->u.kernel.match->me);
1554 }
1601 return ret; 1555 return ret;
1602} 1556}
1603 1557
@@ -1606,11 +1560,12 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1606 unsigned int *size, const char *name, 1560 unsigned int *size, const char *name,
1607 struct xt_table_info *newinfo, unsigned char *base) 1561 struct xt_table_info *newinfo, unsigned char *base)
1608{ 1562{
1609 struct ipt_entry_target *t; 1563 struct xt_entry_target *t;
1610 struct xt_target *target; 1564 struct xt_target *target;
1611 struct ipt_entry *de; 1565 struct ipt_entry *de;
1612 unsigned int origsize; 1566 unsigned int origsize;
1613 int ret, h; 1567 int ret, h;
1568 struct xt_entry_match *ematch;
1614 1569
1615 ret = 0; 1570 ret = 0;
1616 origsize = *size; 1571 origsize = *size;
@@ -1621,10 +1576,11 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1621 *dstptr += sizeof(struct ipt_entry); 1576 *dstptr += sizeof(struct ipt_entry);
1622 *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); 1577 *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
1623 1578
1624 ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user, 1579 xt_ematch_foreach(ematch, e) {
1625 dstptr, size); 1580 ret = xt_compat_match_from_user(ematch, dstptr, size);
1626 if (ret) 1581 if (ret != 0)
1627 return ret; 1582 return ret;
1583 }
1628 de->target_offset = e->target_offset - (origsize - *size); 1584 de->target_offset = e->target_offset - (origsize - *size);
1629 t = compat_ipt_get_target(e); 1585 t = compat_ipt_get_target(e);
1630 target = t->u.kernel.target; 1586 target = t->u.kernel.target;
@@ -1641,36 +1597,43 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1641} 1597}
1642 1598
1643static int 1599static int
1644compat_check_entry(struct ipt_entry *e, const char *name, 1600compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
1645 unsigned int *i)
1646{ 1601{
1602 struct xt_entry_match *ematch;
1647 struct xt_mtchk_param mtpar; 1603 struct xt_mtchk_param mtpar;
1648 unsigned int j; 1604 unsigned int j;
1649 int ret; 1605 int ret = 0;
1650 1606
1651 j = 0; 1607 j = 0;
1608 mtpar.net = net;
1652 mtpar.table = name; 1609 mtpar.table = name;
1653 mtpar.entryinfo = &e->ip; 1610 mtpar.entryinfo = &e->ip;
1654 mtpar.hook_mask = e->comefrom; 1611 mtpar.hook_mask = e->comefrom;
1655 mtpar.family = NFPROTO_IPV4; 1612 mtpar.family = NFPROTO_IPV4;
1656 ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); 1613 xt_ematch_foreach(ematch, e) {
1657 if (ret) 1614 ret = check_match(ematch, &mtpar);
1658 goto cleanup_matches; 1615 if (ret != 0)
1616 goto cleanup_matches;
1617 ++j;
1618 }
1659 1619
1660 ret = check_target(e, name); 1620 ret = check_target(e, net, name);
1661 if (ret) 1621 if (ret)
1662 goto cleanup_matches; 1622 goto cleanup_matches;
1663
1664 (*i)++;
1665 return 0; 1623 return 0;
1666 1624
1667 cleanup_matches: 1625 cleanup_matches:
1668 IPT_MATCH_ITERATE(e, cleanup_match, &j); 1626 xt_ematch_foreach(ematch, e) {
1627 if (j-- == 0)
1628 break;
1629 cleanup_match(ematch, net);
1630 }
1669 return ret; 1631 return ret;
1670} 1632}
1671 1633
1672static int 1634static int
1673translate_compat_table(const char *name, 1635translate_compat_table(struct net *net,
1636 const char *name,
1674 unsigned int valid_hooks, 1637 unsigned int valid_hooks,
1675 struct xt_table_info **pinfo, 1638 struct xt_table_info **pinfo,
1676 void **pentry0, 1639 void **pentry0,
@@ -1682,6 +1645,8 @@ translate_compat_table(const char *name,
1682 unsigned int i, j; 1645 unsigned int i, j;
1683 struct xt_table_info *newinfo, *info; 1646 struct xt_table_info *newinfo, *info;
1684 void *pos, *entry0, *entry1; 1647 void *pos, *entry0, *entry1;
1648 struct compat_ipt_entry *iter0;
1649 struct ipt_entry *iter1;
1685 unsigned int size; 1650 unsigned int size;
1686 int ret; 1651 int ret;
1687 1652
@@ -1700,13 +1665,17 @@ translate_compat_table(const char *name,
1700 j = 0; 1665 j = 0;
1701 xt_compat_lock(AF_INET); 1666 xt_compat_lock(AF_INET);
1702 /* Walk through entries, checking offsets. */ 1667 /* Walk through entries, checking offsets. */
1703 ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, 1668 xt_entry_foreach(iter0, entry0, total_size) {
1704 check_compat_entry_size_and_hooks, 1669 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
1705 info, &size, entry0, 1670 entry0,
1706 entry0 + total_size, 1671 entry0 + total_size,
1707 hook_entries, underflows, &j, name); 1672 hook_entries,
1708 if (ret != 0) 1673 underflows,
1709 goto out_unlock; 1674 name);
1675 if (ret != 0)
1676 goto out_unlock;
1677 ++j;
1678 }
1710 1679
1711 ret = -EINVAL; 1680 ret = -EINVAL;
1712 if (j != number) { 1681 if (j != number) {
@@ -1745,9 +1714,12 @@ translate_compat_table(const char *name,
1745 entry1 = newinfo->entries[raw_smp_processor_id()]; 1714 entry1 = newinfo->entries[raw_smp_processor_id()];
1746 pos = entry1; 1715 pos = entry1;
1747 size = total_size; 1716 size = total_size;
1748 ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, 1717 xt_entry_foreach(iter0, entry0, total_size) {
1749 compat_copy_entry_from_user, 1718 ret = compat_copy_entry_from_user(iter0, &pos, &size,
1750 &pos, &size, name, newinfo, entry1); 1719 name, newinfo, entry1);
1720 if (ret != 0)
1721 break;
1722 }
1751 xt_compat_flush_offsets(AF_INET); 1723 xt_compat_flush_offsets(AF_INET);
1752 xt_compat_unlock(AF_INET); 1724 xt_compat_unlock(AF_INET);
1753 if (ret) 1725 if (ret)
@@ -1758,13 +1730,35 @@ translate_compat_table(const char *name,
1758 goto free_newinfo; 1730 goto free_newinfo;
1759 1731
1760 i = 0; 1732 i = 0;
1761 ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, 1733 xt_entry_foreach(iter1, entry1, newinfo->size) {
1762 name, &i); 1734 ret = compat_check_entry(iter1, net, name);
1735 if (ret != 0)
1736 break;
1737 ++i;
1738 if (strcmp(ipt_get_target(iter1)->u.user.name,
1739 XT_ERROR_TARGET) == 0)
1740 ++newinfo->stacksize;
1741 }
1763 if (ret) { 1742 if (ret) {
1743 /*
1744 * The first i matches need cleanup_entry (calls ->destroy)
1745 * because they had called ->check already. The other j-i
1746 * entries need only release.
1747 */
1748 int skip = i;
1764 j -= i; 1749 j -= i;
1765 COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, 1750 xt_entry_foreach(iter0, entry0, newinfo->size) {
1766 compat_release_entry, &j); 1751 if (skip-- > 0)
1767 IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); 1752 continue;
1753 if (j-- == 0)
1754 break;
1755 compat_release_entry(iter0);
1756 }
1757 xt_entry_foreach(iter1, entry1, newinfo->size) {
1758 if (i-- == 0)
1759 break;
1760 cleanup_entry(iter1, net);
1761 }
1768 xt_free_table_info(newinfo); 1762 xt_free_table_info(newinfo);
1769 return ret; 1763 return ret;
1770 } 1764 }
@@ -1782,7 +1776,11 @@ translate_compat_table(const char *name,
1782free_newinfo: 1776free_newinfo:
1783 xt_free_table_info(newinfo); 1777 xt_free_table_info(newinfo);
1784out: 1778out:
1785 COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); 1779 xt_entry_foreach(iter0, entry0, total_size) {
1780 if (j-- == 0)
1781 break;
1782 compat_release_entry(iter0);
1783 }
1786 return ret; 1784 return ret;
1787out_unlock: 1785out_unlock:
1788 xt_compat_flush_offsets(AF_INET); 1786 xt_compat_flush_offsets(AF_INET);
@@ -1797,6 +1795,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1797 struct compat_ipt_replace tmp; 1795 struct compat_ipt_replace tmp;
1798 struct xt_table_info *newinfo; 1796 struct xt_table_info *newinfo;
1799 void *loc_cpu_entry; 1797 void *loc_cpu_entry;
1798 struct ipt_entry *iter;
1800 1799
1801 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1800 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1802 return -EFAULT; 1801 return -EFAULT;
@@ -1819,7 +1818,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1819 goto free_newinfo; 1818 goto free_newinfo;
1820 } 1819 }
1821 1820
1822 ret = translate_compat_table(tmp.name, tmp.valid_hooks, 1821 ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
1823 &newinfo, &loc_cpu_entry, tmp.size, 1822 &newinfo, &loc_cpu_entry, tmp.size,
1824 tmp.num_entries, tmp.hook_entry, 1823 tmp.num_entries, tmp.hook_entry,
1825 tmp.underflow); 1824 tmp.underflow);
@@ -1835,7 +1834,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1835 return 0; 1834 return 0;
1836 1835
1837 free_newinfo_untrans: 1836 free_newinfo_untrans:
1838 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1837 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1838 cleanup_entry(iter, net);
1839 free_newinfo: 1839 free_newinfo:
1840 xt_free_table_info(newinfo); 1840 xt_free_table_info(newinfo);
1841 return ret; 1841 return ret;
@@ -1868,7 +1868,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
1868} 1868}
1869 1869
1870struct compat_ipt_get_entries { 1870struct compat_ipt_get_entries {
1871 char name[IPT_TABLE_MAXNAMELEN]; 1871 char name[XT_TABLE_MAXNAMELEN];
1872 compat_uint_t size; 1872 compat_uint_t size;
1873 struct compat_ipt_entry entrytable[0]; 1873 struct compat_ipt_entry entrytable[0];
1874}; 1874};
@@ -1884,6 +1884,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
1884 int ret = 0; 1884 int ret = 0;
1885 const void *loc_cpu_entry; 1885 const void *loc_cpu_entry;
1886 unsigned int i = 0; 1886 unsigned int i = 0;
1887 struct ipt_entry *iter;
1887 1888
1888 counters = alloc_counters(table); 1889 counters = alloc_counters(table);
1889 if (IS_ERR(counters)) 1890 if (IS_ERR(counters))
@@ -1896,9 +1897,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
1896 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1897 loc_cpu_entry = private->entries[raw_smp_processor_id()];
1897 pos = userptr; 1898 pos = userptr;
1898 size = total_size; 1899 size = total_size;
1899 ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size, 1900 xt_entry_foreach(iter, loc_cpu_entry, total_size) {
1900 compat_copy_entry_to_user, 1901 ret = compat_copy_entry_to_user(iter, &pos,
1901 &pos, &size, counters, &i); 1902 &size, counters, i++);
1903 if (ret != 0)
1904 break;
1905 }
1902 1906
1903 vfree(counters); 1907 vfree(counters);
1904 return ret; 1908 return ret;
@@ -2019,7 +2023,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2019 2023
2020 case IPT_SO_GET_REVISION_MATCH: 2024 case IPT_SO_GET_REVISION_MATCH:
2021 case IPT_SO_GET_REVISION_TARGET: { 2025 case IPT_SO_GET_REVISION_TARGET: {
2022 struct ipt_get_revision rev; 2026 struct xt_get_revision rev;
2023 int target; 2027 int target;
2024 2028
2025 if (*len != sizeof(rev)) { 2029 if (*len != sizeof(rev)) {
@@ -2051,13 +2055,13 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2051 return ret; 2055 return ret;
2052} 2056}
2053 2057
2054struct xt_table *ipt_register_table(struct net *net, struct xt_table *table, 2058struct xt_table *ipt_register_table(struct net *net,
2059 const struct xt_table *table,
2055 const struct ipt_replace *repl) 2060 const struct ipt_replace *repl)
2056{ 2061{
2057 int ret; 2062 int ret;
2058 struct xt_table_info *newinfo; 2063 struct xt_table_info *newinfo;
2059 struct xt_table_info bootstrap 2064 struct xt_table_info bootstrap = {0};
2060 = { 0, 0, 0, { 0 }, { 0 }, { } };
2061 void *loc_cpu_entry; 2065 void *loc_cpu_entry;
2062 struct xt_table *new_table; 2066 struct xt_table *new_table;
2063 2067
@@ -2071,11 +2075,7 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *table,
2071 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; 2075 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
2072 memcpy(loc_cpu_entry, repl->entries, repl->size); 2076 memcpy(loc_cpu_entry, repl->entries, repl->size);
2073 2077
2074 ret = translate_table(table->name, table->valid_hooks, 2078 ret = translate_table(net, newinfo, loc_cpu_entry, repl);
2075 newinfo, loc_cpu_entry, repl->size,
2076 repl->num_entries,
2077 repl->hook_entry,
2078 repl->underflow);
2079 if (ret != 0) 2079 if (ret != 0)
2080 goto out_free; 2080 goto out_free;
2081 2081
@@ -2093,17 +2093,19 @@ out:
2093 return ERR_PTR(ret); 2093 return ERR_PTR(ret);
2094} 2094}
2095 2095
2096void ipt_unregister_table(struct xt_table *table) 2096void ipt_unregister_table(struct net *net, struct xt_table *table)
2097{ 2097{
2098 struct xt_table_info *private; 2098 struct xt_table_info *private;
2099 void *loc_cpu_entry; 2099 void *loc_cpu_entry;
2100 struct module *table_owner = table->me; 2100 struct module *table_owner = table->me;
2101 struct ipt_entry *iter;
2101 2102
2102 private = xt_unregister_table(table); 2103 private = xt_unregister_table(table);
2103 2104
2104 /* Decrease module usage counts and free resources */ 2105 /* Decrease module usage counts and free resources */
2105 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 2106 loc_cpu_entry = private->entries[raw_smp_processor_id()];
2106 IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); 2107 xt_entry_foreach(iter, loc_cpu_entry, private->size)
2108 cleanup_entry(iter, net);
2107 if (private->number > private->initial_entries) 2109 if (private->number > private->initial_entries)
2108 module_put(table_owner); 2110 module_put(table_owner);
2109 xt_free_table_info(private); 2111 xt_free_table_info(private);
@@ -2121,7 +2123,7 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
2121} 2123}
2122 2124
2123static bool 2125static bool
2124icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) 2126icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
2125{ 2127{
2126 const struct icmphdr *ic; 2128 const struct icmphdr *ic;
2127 struct icmphdr _icmph; 2129 struct icmphdr _icmph;
@@ -2137,7 +2139,7 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
2137 * can't. Hence, no choice but to drop. 2139 * can't. Hence, no choice but to drop.
2138 */ 2140 */
2139 duprintf("Dropping evil ICMP tinygram.\n"); 2141 duprintf("Dropping evil ICMP tinygram.\n");
2140 *par->hotdrop = true; 2142 par->hotdrop = true;
2141 return false; 2143 return false;
2142 } 2144 }
2143 2145
@@ -2148,31 +2150,31 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
2148 !!(icmpinfo->invflags&IPT_ICMP_INV)); 2150 !!(icmpinfo->invflags&IPT_ICMP_INV));
2149} 2151}
2150 2152
2151static bool icmp_checkentry(const struct xt_mtchk_param *par) 2153static int icmp_checkentry(const struct xt_mtchk_param *par)
2152{ 2154{
2153 const struct ipt_icmp *icmpinfo = par->matchinfo; 2155 const struct ipt_icmp *icmpinfo = par->matchinfo;
2154 2156
2155 /* Must specify no unknown invflags */ 2157 /* Must specify no unknown invflags */
2156 return !(icmpinfo->invflags & ~IPT_ICMP_INV); 2158 return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
2157} 2159}
2158 2160
2159/* The built-in targets: standard (NULL) and error. */ 2161static struct xt_target ipt_builtin_tg[] __read_mostly = {
2160static struct xt_target ipt_standard_target __read_mostly = { 2162 {
2161 .name = IPT_STANDARD_TARGET, 2163 .name = XT_STANDARD_TARGET,
2162 .targetsize = sizeof(int), 2164 .targetsize = sizeof(int),
2163 .family = NFPROTO_IPV4, 2165 .family = NFPROTO_IPV4,
2164#ifdef CONFIG_COMPAT 2166#ifdef CONFIG_COMPAT
2165 .compatsize = sizeof(compat_int_t), 2167 .compatsize = sizeof(compat_int_t),
2166 .compat_from_user = compat_standard_from_user, 2168 .compat_from_user = compat_standard_from_user,
2167 .compat_to_user = compat_standard_to_user, 2169 .compat_to_user = compat_standard_to_user,
2168#endif 2170#endif
2169}; 2171 },
2170 2172 {
2171static struct xt_target ipt_error_target __read_mostly = { 2173 .name = XT_ERROR_TARGET,
2172 .name = IPT_ERROR_TARGET, 2174 .target = ipt_error,
2173 .target = ipt_error, 2175 .targetsize = XT_FUNCTION_MAXNAMELEN,
2174 .targetsize = IPT_FUNCTION_MAXNAMELEN, 2176 .family = NFPROTO_IPV4,
2175 .family = NFPROTO_IPV4, 2177 },
2176}; 2178};
2177 2179
2178static struct nf_sockopt_ops ipt_sockopts = { 2180static struct nf_sockopt_ops ipt_sockopts = {
@@ -2192,13 +2194,15 @@ static struct nf_sockopt_ops ipt_sockopts = {
2192 .owner = THIS_MODULE, 2194 .owner = THIS_MODULE,
2193}; 2195};
2194 2196
2195static struct xt_match icmp_matchstruct __read_mostly = { 2197static struct xt_match ipt_builtin_mt[] __read_mostly = {
2196 .name = "icmp", 2198 {
2197 .match = icmp_match, 2199 .name = "icmp",
2198 .matchsize = sizeof(struct ipt_icmp), 2200 .match = icmp_match,
2199 .checkentry = icmp_checkentry, 2201 .matchsize = sizeof(struct ipt_icmp),
2200 .proto = IPPROTO_ICMP, 2202 .checkentry = icmp_checkentry,
2201 .family = NFPROTO_IPV4, 2203 .proto = IPPROTO_ICMP,
2204 .family = NFPROTO_IPV4,
2205 },
2202}; 2206};
2203 2207
2204static int __net_init ip_tables_net_init(struct net *net) 2208static int __net_init ip_tables_net_init(struct net *net)
@@ -2225,13 +2229,10 @@ static int __init ip_tables_init(void)
2225 goto err1; 2229 goto err1;
2226 2230
2227 /* Noone else will be downing sem now, so we won't sleep */ 2231 /* Noone else will be downing sem now, so we won't sleep */
2228 ret = xt_register_target(&ipt_standard_target); 2232 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2229 if (ret < 0) 2233 if (ret < 0)
2230 goto err2; 2234 goto err2;
2231 ret = xt_register_target(&ipt_error_target); 2235 ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
2232 if (ret < 0)
2233 goto err3;
2234 ret = xt_register_match(&icmp_matchstruct);
2235 if (ret < 0) 2236 if (ret < 0)
2236 goto err4; 2237 goto err4;
2237 2238
@@ -2240,15 +2241,13 @@ static int __init ip_tables_init(void)
2240 if (ret < 0) 2241 if (ret < 0)
2241 goto err5; 2242 goto err5;
2242 2243
2243 printk(KERN_INFO "ip_tables: (C) 2000-2006 Netfilter Core Team\n"); 2244 pr_info("(C) 2000-2006 Netfilter Core Team\n");
2244 return 0; 2245 return 0;
2245 2246
2246err5: 2247err5:
2247 xt_unregister_match(&icmp_matchstruct); 2248 xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
2248err4: 2249err4:
2249 xt_unregister_target(&ipt_error_target); 2250 xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2250err3:
2251 xt_unregister_target(&ipt_standard_target);
2252err2: 2251err2:
2253 unregister_pernet_subsys(&ip_tables_net_ops); 2252 unregister_pernet_subsys(&ip_tables_net_ops);
2254err1: 2253err1:
@@ -2259,10 +2258,8 @@ static void __exit ip_tables_fini(void)
2259{ 2258{
2260 nf_unregister_sockopt(&ipt_sockopts); 2259 nf_unregister_sockopt(&ipt_sockopts);
2261 2260
2262 xt_unregister_match(&icmp_matchstruct); 2261 xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
2263 xt_unregister_target(&ipt_error_target); 2262 xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2264 xt_unregister_target(&ipt_standard_target);
2265
2266 unregister_pernet_subsys(&ip_tables_net_ops); 2263 unregister_pernet_subsys(&ip_tables_net_ops);
2267} 2264}
2268 2265
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2e4f98b85524..1e26a4897655 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -9,11 +9,13 @@
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 * 10 *
11 */ 11 */
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
14#include <linux/jhash.h> 15#include <linux/jhash.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
18#include <linux/slab.h>
17#include <linux/ip.h> 19#include <linux/ip.h>
18#include <linux/tcp.h> 20#include <linux/tcp.h>
19#include <linux/udp.h> 21#include <linux/udp.h>
@@ -27,6 +29,7 @@
27#include <net/netfilter/nf_conntrack.h> 29#include <net/netfilter/nf_conntrack.h>
28#include <net/net_namespace.h> 30#include <net/net_namespace.h>
29#include <net/checksum.h> 31#include <net/checksum.h>
32#include <net/ip.h>
30 33
31#define CLUSTERIP_VERSION "0.8" 34#define CLUSTERIP_VERSION "0.8"
32 35
@@ -51,12 +54,13 @@ struct clusterip_config {
51#endif 54#endif
52 enum clusterip_hashmode hash_mode; /* which hashing mode */ 55 enum clusterip_hashmode hash_mode; /* which hashing mode */
53 u_int32_t hash_initval; /* hash initialization */ 56 u_int32_t hash_initval; /* hash initialization */
57 struct rcu_head rcu;
54}; 58};
55 59
56static LIST_HEAD(clusterip_configs); 60static LIST_HEAD(clusterip_configs);
57 61
58/* clusterip_lock protects the clusterip_configs list */ 62/* clusterip_lock protects the clusterip_configs list */
59static DEFINE_RWLOCK(clusterip_lock); 63static DEFINE_SPINLOCK(clusterip_lock);
60 64
61#ifdef CONFIG_PROC_FS 65#ifdef CONFIG_PROC_FS
62static const struct file_operations clusterip_proc_fops; 66static const struct file_operations clusterip_proc_fops;
@@ -69,11 +73,17 @@ clusterip_config_get(struct clusterip_config *c)
69 atomic_inc(&c->refcount); 73 atomic_inc(&c->refcount);
70} 74}
71 75
76
77static void clusterip_config_rcu_free(struct rcu_head *head)
78{
79 kfree(container_of(head, struct clusterip_config, rcu));
80}
81
72static inline void 82static inline void
73clusterip_config_put(struct clusterip_config *c) 83clusterip_config_put(struct clusterip_config *c)
74{ 84{
75 if (atomic_dec_and_test(&c->refcount)) 85 if (atomic_dec_and_test(&c->refcount))
76 kfree(c); 86 call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
77} 87}
78 88
79/* decrease the count of entries using/referencing this config. If last 89/* decrease the count of entries using/referencing this config. If last
@@ -82,12 +92,13 @@ clusterip_config_put(struct clusterip_config *c)
82static inline void 92static inline void
83clusterip_config_entry_put(struct clusterip_config *c) 93clusterip_config_entry_put(struct clusterip_config *c)
84{ 94{
85 write_lock_bh(&clusterip_lock); 95 local_bh_disable();
86 if (atomic_dec_and_test(&c->entries)) { 96 if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
87 list_del(&c->list); 97 list_del_rcu(&c->list);
88 write_unlock_bh(&clusterip_lock); 98 spin_unlock(&clusterip_lock);
99 local_bh_enable();
89 100
90 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); 101 dev_mc_del(c->dev, c->clustermac);
91 dev_put(c->dev); 102 dev_put(c->dev);
92 103
93 /* In case anyone still accesses the file, the open/close 104 /* In case anyone still accesses the file, the open/close
@@ -98,7 +109,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
98#endif 109#endif
99 return; 110 return;
100 } 111 }
101 write_unlock_bh(&clusterip_lock); 112 local_bh_enable();
102} 113}
103 114
104static struct clusterip_config * 115static struct clusterip_config *
@@ -106,7 +117,7 @@ __clusterip_config_find(__be32 clusterip)
106{ 117{
107 struct clusterip_config *c; 118 struct clusterip_config *c;
108 119
109 list_for_each_entry(c, &clusterip_configs, list) { 120 list_for_each_entry_rcu(c, &clusterip_configs, list) {
110 if (c->clusterip == clusterip) 121 if (c->clusterip == clusterip)
111 return c; 122 return c;
112 } 123 }
@@ -119,16 +130,15 @@ clusterip_config_find_get(__be32 clusterip, int entry)
119{ 130{
120 struct clusterip_config *c; 131 struct clusterip_config *c;
121 132
122 read_lock_bh(&clusterip_lock); 133 rcu_read_lock_bh();
123 c = __clusterip_config_find(clusterip); 134 c = __clusterip_config_find(clusterip);
124 if (!c) { 135 if (c) {
125 read_unlock_bh(&clusterip_lock); 136 if (unlikely(!atomic_inc_not_zero(&c->refcount)))
126 return NULL; 137 c = NULL;
138 else if (entry)
139 atomic_inc(&c->entries);
127 } 140 }
128 atomic_inc(&c->refcount); 141 rcu_read_unlock_bh();
129 if (entry)
130 atomic_inc(&c->entries);
131 read_unlock_bh(&clusterip_lock);
132 142
133 return c; 143 return c;
134} 144}
@@ -179,9 +189,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
179 } 189 }
180#endif 190#endif
181 191
182 write_lock_bh(&clusterip_lock); 192 spin_lock_bh(&clusterip_lock);
183 list_add(&c->list, &clusterip_configs); 193 list_add_rcu(&c->list, &clusterip_configs);
184 write_unlock_bh(&clusterip_lock); 194 spin_unlock_bh(&clusterip_lock);
185 195
186 return c; 196 return c;
187} 197}
@@ -222,25 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
222{ 232{
223 const struct iphdr *iph = ip_hdr(skb); 233 const struct iphdr *iph = ip_hdr(skb);
224 unsigned long hashval; 234 unsigned long hashval;
225 u_int16_t sport, dport; 235 u_int16_t sport = 0, dport = 0;
226 const u_int16_t *ports; 236 int poff;
227 237
228 switch (iph->protocol) { 238 poff = proto_ports_offset(iph->protocol);
229 case IPPROTO_TCP: 239 if (poff >= 0) {
230 case IPPROTO_UDP: 240 const u_int16_t *ports;
231 case IPPROTO_UDPLITE: 241 u16 _ports[2];
232 case IPPROTO_SCTP: 242
233 case IPPROTO_DCCP: 243 ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
234 case IPPROTO_ICMP: 244 if (ports) {
235 ports = (const void *)iph+iph->ihl*4; 245 sport = ports[0];
236 sport = ports[0]; 246 dport = ports[1];
237 dport = ports[1]; 247 }
238 break; 248 } else {
239 default:
240 if (net_ratelimit()) 249 if (net_ratelimit())
241 printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n", 250 pr_info("unknown protocol %u\n", iph->protocol);
242 iph->protocol);
243 sport = dport = 0;
244 } 251 }
245 252
246 switch (config->hash_mode) { 253 switch (config->hash_mode) {
@@ -261,7 +268,7 @@ clusterip_hashfn(const struct sk_buff *skb,
261 hashval = 0; 268 hashval = 0;
262 /* This cannot happen, unless the check function wasn't called 269 /* This cannot happen, unless the check function wasn't called
263 * at rule load time */ 270 * at rule load time */
264 printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode); 271 pr_info("unknown mode %u\n", config->hash_mode);
265 BUG(); 272 BUG();
266 break; 273 break;
267 } 274 }
@@ -281,7 +288,7 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
281 ***********************************************************************/ 288 ***********************************************************************/
282 289
283static unsigned int 290static unsigned int
284clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) 291clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
285{ 292{
286 const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; 293 const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
287 struct nf_conn *ct; 294 struct nf_conn *ct;
@@ -294,7 +301,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
294 301
295 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
296 if (ct == NULL) { 303 if (ct == NULL) {
297 printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); 304 pr_info("no conntrack!\n");
298 /* FIXME: need to drop invalid ones, since replies 305 /* FIXME: need to drop invalid ones, since replies
299 * to outgoing connections of other nodes will be 306 * to outgoing connections of other nodes will be
300 * marked as INVALID */ 307 * marked as INVALID */
@@ -303,9 +310,9 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
303 310
304 /* special case: ICMP error handling. conntrack distinguishes between 311 /* special case: ICMP error handling. conntrack distinguishes between
305 * error messages (RELATED) and information requests (see below) */ 312 * error messages (RELATED) and information requests (see below) */
306 if (ip_hdr(skb)->protocol == IPPROTO_ICMP 313 if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
307 && (ctinfo == IP_CT_RELATED 314 (ctinfo == IP_CT_RELATED ||
308 || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY)) 315 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY))
309 return XT_CONTINUE; 316 return XT_CONTINUE;
310 317
311 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 318 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -347,25 +354,24 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
347 return XT_CONTINUE; 354 return XT_CONTINUE;
348} 355}
349 356
350static bool clusterip_tg_check(const struct xt_tgchk_param *par) 357static int clusterip_tg_check(const struct xt_tgchk_param *par)
351{ 358{
352 struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; 359 struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
353 const struct ipt_entry *e = par->entryinfo; 360 const struct ipt_entry *e = par->entryinfo;
354
355 struct clusterip_config *config; 361 struct clusterip_config *config;
362 int ret;
356 363
357 if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && 364 if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
358 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && 365 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
359 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { 366 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
360 printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n", 367 pr_info("unknown mode %u\n", cipinfo->hash_mode);
361 cipinfo->hash_mode); 368 return -EINVAL;
362 return false;
363 369
364 } 370 }
365 if (e->ip.dmsk.s_addr != htonl(0xffffffff) 371 if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
366 || e->ip.dst.s_addr == 0) { 372 e->ip.dst.s_addr == 0) {
367 printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); 373 pr_info("Please specify destination IP\n");
368 return false; 374 return -EINVAL;
369 } 375 }
370 376
371 /* FIXME: further sanity checks */ 377 /* FIXME: further sanity checks */
@@ -373,41 +379,41 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
373 config = clusterip_config_find_get(e->ip.dst.s_addr, 1); 379 config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
374 if (!config) { 380 if (!config) {
375 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { 381 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
376 printk(KERN_WARNING "CLUSTERIP: no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr); 382 pr_info("no config found for %pI4, need 'new'\n",
377 return false; 383 &e->ip.dst.s_addr);
384 return -EINVAL;
378 } else { 385 } else {
379 struct net_device *dev; 386 struct net_device *dev;
380 387
381 if (e->ip.iniface[0] == '\0') { 388 if (e->ip.iniface[0] == '\0') {
382 printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n"); 389 pr_info("Please specify an interface name\n");
383 return false; 390 return -EINVAL;
384 } 391 }
385 392
386 dev = dev_get_by_name(&init_net, e->ip.iniface); 393 dev = dev_get_by_name(&init_net, e->ip.iniface);
387 if (!dev) { 394 if (!dev) {
388 printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); 395 pr_info("no such interface %s\n",
389 return false; 396 e->ip.iniface);
397 return -ENOENT;
390 } 398 }
391 399
392 config = clusterip_config_init(cipinfo, 400 config = clusterip_config_init(cipinfo,
393 e->ip.dst.s_addr, dev); 401 e->ip.dst.s_addr, dev);
394 if (!config) { 402 if (!config) {
395 printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n"); 403 pr_info("cannot allocate config\n");
396 dev_put(dev); 404 dev_put(dev);
397 return false; 405 return -ENOMEM;
398 } 406 }
399 dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0); 407 dev_mc_add(config->dev, config->clustermac);
400 } 408 }
401 } 409 }
402 cipinfo->config = config; 410 cipinfo->config = config;
403 411
404 if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { 412 ret = nf_ct_l3proto_try_module_get(par->family);
405 printk(KERN_WARNING "can't load conntrack support for " 413 if (ret < 0)
406 "proto=%u\n", par->target->family); 414 pr_info("cannot load conntrack support for proto=%u\n",
407 return false; 415 par->family);
408 } 416 return ret;
409
410 return true;
411} 417}
412 418
413/* drop reference count of cluster config when rule is deleted */ 419/* drop reference count of cluster config when rule is deleted */
@@ -421,7 +427,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
421 427
422 clusterip_config_put(cipinfo->config); 428 clusterip_config_put(cipinfo->config);
423 429
424 nf_ct_l3proto_module_put(par->target->family); 430 nf_ct_l3proto_module_put(par->family);
425} 431}
426 432
427#ifdef CONFIG_COMPAT 433#ifdef CONFIG_COMPAT
@@ -462,7 +468,7 @@ struct arp_payload {
462 __be32 src_ip; 468 __be32 src_ip;
463 u_int8_t dst_hw[ETH_ALEN]; 469 u_int8_t dst_hw[ETH_ALEN];
464 __be32 dst_ip; 470 __be32 dst_ip;
465} __attribute__ ((packed)); 471} __packed;
466 472
467#ifdef DEBUG 473#ifdef DEBUG
468static void arp_print(struct arp_payload *payload) 474static void arp_print(struct arp_payload *payload)
@@ -478,8 +484,8 @@ static void arp_print(struct arp_payload *payload)
478 } 484 }
479 hbuffer[--k]='\0'; 485 hbuffer[--k]='\0';
480 486
481 printk("src %pI4@%s, dst %pI4\n", 487 pr_debug("src %pI4@%s, dst %pI4\n",
482 &payload->src_ip, hbuffer, &payload->dst_ip); 488 &payload->src_ip, hbuffer, &payload->dst_ip);
483} 489}
484#endif 490#endif
485 491
@@ -495,14 +501,14 @@ arp_mangle(unsigned int hook,
495 struct clusterip_config *c; 501 struct clusterip_config *c;
496 502
497 /* we don't care about non-ethernet and non-ipv4 ARP */ 503 /* we don't care about non-ethernet and non-ipv4 ARP */
498 if (arp->ar_hrd != htons(ARPHRD_ETHER) 504 if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
499 || arp->ar_pro != htons(ETH_P_IP) 505 arp->ar_pro != htons(ETH_P_IP) ||
500 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 506 arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
501 return NF_ACCEPT; 507 return NF_ACCEPT;
502 508
503 /* we only want to mangle arp requests and replies */ 509 /* we only want to mangle arp requests and replies */
504 if (arp->ar_op != htons(ARPOP_REPLY) 510 if (arp->ar_op != htons(ARPOP_REPLY) &&
505 && arp->ar_op != htons(ARPOP_REQUEST)) 511 arp->ar_op != htons(ARPOP_REQUEST))
506 return NF_ACCEPT; 512 return NF_ACCEPT;
507 513
508 payload = (void *)(arp+1); 514 payload = (void *)(arp+1);
@@ -518,7 +524,7 @@ arp_mangle(unsigned int hook,
518 * this wouldn't work, since we didn't subscribe the mcast group on 524 * this wouldn't work, since we didn't subscribe the mcast group on
519 * other interfaces */ 525 * other interfaces */
520 if (c->dev != out) { 526 if (c->dev != out) {
521 pr_debug("CLUSTERIP: not mangling arp reply on different " 527 pr_debug("not mangling arp reply on different "
522 "interface: cip'%s'-skb'%s'\n", 528 "interface: cip'%s'-skb'%s'\n",
523 c->dev->name, out->name); 529 c->dev->name, out->name);
524 clusterip_config_put(c); 530 clusterip_config_put(c);
@@ -529,7 +535,7 @@ arp_mangle(unsigned int hook,
529 memcpy(payload->src_hw, c->clustermac, arp->ar_hln); 535 memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
530 536
531#ifdef DEBUG 537#ifdef DEBUG
532 pr_debug(KERN_DEBUG "CLUSTERIP mangled arp reply: "); 538 pr_debug("mangled arp reply: ");
533 arp_print(payload); 539 arp_print(payload);
534#endif 540#endif
535 541
@@ -560,8 +566,7 @@ struct clusterip_seq_position {
560 566
561static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) 567static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
562{ 568{
563 const struct proc_dir_entry *pde = s->private; 569 struct clusterip_config *c = s->private;
564 struct clusterip_config *c = pde->data;
565 unsigned int weight; 570 unsigned int weight;
566 u_int32_t local_nodes; 571 u_int32_t local_nodes;
567 struct clusterip_seq_position *idx; 572 struct clusterip_seq_position *idx;
@@ -601,7 +606,8 @@ static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
601 606
602static void clusterip_seq_stop(struct seq_file *s, void *v) 607static void clusterip_seq_stop(struct seq_file *s, void *v)
603{ 608{
604 kfree(v); 609 if (!IS_ERR(v))
610 kfree(v);
605} 611}
606 612
607static int clusterip_seq_show(struct seq_file *s, void *v) 613static int clusterip_seq_show(struct seq_file *s, void *v)
@@ -632,10 +638,9 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
632 638
633 if (!ret) { 639 if (!ret) {
634 struct seq_file *sf = file->private_data; 640 struct seq_file *sf = file->private_data;
635 struct proc_dir_entry *pde = PDE(inode); 641 struct clusterip_config *c = PDE(inode)->data;
636 struct clusterip_config *c = pde->data;
637 642
638 sf->private = pde; 643 sf->private = c;
639 644
640 clusterip_config_get(c); 645 clusterip_config_get(c);
641 } 646 }
@@ -645,8 +650,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
645 650
646static int clusterip_proc_release(struct inode *inode, struct file *file) 651static int clusterip_proc_release(struct inode *inode, struct file *file)
647{ 652{
648 struct proc_dir_entry *pde = PDE(inode); 653 struct clusterip_config *c = PDE(inode)->data;
649 struct clusterip_config *c = pde->data;
650 int ret; 654 int ret;
651 655
652 ret = seq_release(inode, file); 656 ret = seq_release(inode, file);
@@ -660,10 +664,9 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
660static ssize_t clusterip_proc_write(struct file *file, const char __user *input, 664static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
661 size_t size, loff_t *ofs) 665 size_t size, loff_t *ofs)
662{ 666{
667 struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
663#define PROC_WRITELEN 10 668#define PROC_WRITELEN 10
664 char buffer[PROC_WRITELEN+1]; 669 char buffer[PROC_WRITELEN+1];
665 const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
666 struct clusterip_config *c = pde->data;
667 unsigned long nodenum; 670 unsigned long nodenum;
668 671
669 if (copy_from_user(buffer, input, PROC_WRITELEN)) 672 if (copy_from_user(buffer, input, PROC_WRITELEN))
@@ -709,13 +712,13 @@ static int __init clusterip_tg_init(void)
709#ifdef CONFIG_PROC_FS 712#ifdef CONFIG_PROC_FS
710 clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); 713 clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
711 if (!clusterip_procdir) { 714 if (!clusterip_procdir) {
712 printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); 715 pr_err("Unable to proc dir entry\n");
713 ret = -ENOMEM; 716 ret = -ENOMEM;
714 goto cleanup_hook; 717 goto cleanup_hook;
715 } 718 }
716#endif /* CONFIG_PROC_FS */ 719#endif /* CONFIG_PROC_FS */
717 720
718 printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n", 721 pr_info("ClusterIP Version %s loaded successfully\n",
719 CLUSTERIP_VERSION); 722 CLUSTERIP_VERSION);
720 return 0; 723 return 0;
721 724
@@ -730,13 +733,15 @@ cleanup_target:
730 733
731static void __exit clusterip_tg_exit(void) 734static void __exit clusterip_tg_exit(void)
732{ 735{
733 printk(KERN_NOTICE "ClusterIP Version %s unloading\n", 736 pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
734 CLUSTERIP_VERSION);
735#ifdef CONFIG_PROC_FS 737#ifdef CONFIG_PROC_FS
736 remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); 738 remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
737#endif 739#endif
738 nf_unregister_hook(&cip_arp_ops); 740 nf_unregister_hook(&cip_arp_ops);
739 xt_unregister_target(&clusterip_tg_reg); 741 xt_unregister_target(&clusterip_tg_reg);
742
743 /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
744 rcu_barrier_bh();
740} 745}
741 746
742module_init(clusterip_tg_init); 747module_init(clusterip_tg_init);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index f7e2fa0974dc..4bf3dc49ad1e 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -6,7 +6,7 @@
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8*/ 8*/
9 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/in.h> 10#include <linux/in.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/skbuff.h> 12#include <linux/skbuff.h>
@@ -50,7 +50,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
50 struct tcphdr _tcph, *tcph; 50 struct tcphdr _tcph, *tcph;
51 __be16 oldval; 51 __be16 oldval;
52 52
53 /* Not enought header? */ 53 /* Not enough header? */
54 tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); 54 tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
55 if (!tcph) 55 if (!tcph)
56 return false; 56 return false;
@@ -77,7 +77,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
77} 77}
78 78
79static unsigned int 79static unsigned int
80ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) 80ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
81{ 81{
82 const struct ipt_ECN_info *einfo = par->targinfo; 82 const struct ipt_ECN_info *einfo = par->targinfo;
83 83
@@ -85,36 +85,33 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
85 if (!set_ect_ip(skb, einfo)) 85 if (!set_ect_ip(skb, einfo))
86 return NF_DROP; 86 return NF_DROP;
87 87
88 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) 88 if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
89 && ip_hdr(skb)->protocol == IPPROTO_TCP) 89 ip_hdr(skb)->protocol == IPPROTO_TCP)
90 if (!set_ect_tcp(skb, einfo)) 90 if (!set_ect_tcp(skb, einfo))
91 return NF_DROP; 91 return NF_DROP;
92 92
93 return XT_CONTINUE; 93 return XT_CONTINUE;
94} 94}
95 95
96static bool ecn_tg_check(const struct xt_tgchk_param *par) 96static int ecn_tg_check(const struct xt_tgchk_param *par)
97{ 97{
98 const struct ipt_ECN_info *einfo = par->targinfo; 98 const struct ipt_ECN_info *einfo = par->targinfo;
99 const struct ipt_entry *e = par->entryinfo; 99 const struct ipt_entry *e = par->entryinfo;
100 100
101 if (einfo->operation & IPT_ECN_OP_MASK) { 101 if (einfo->operation & IPT_ECN_OP_MASK) {
102 printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", 102 pr_info("unsupported ECN operation %x\n", einfo->operation);
103 einfo->operation); 103 return -EINVAL;
104 return false;
105 } 104 }
106 if (einfo->ip_ect & ~IPT_ECN_IP_MASK) { 105 if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
107 printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n", 106 pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
108 einfo->ip_ect); 107 return -EINVAL;
109 return false;
110 } 108 }
111 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) 109 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
112 && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { 110 (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
113 printk(KERN_WARNING "ECN: cannot use TCP operations on a " 111 pr_info("cannot use TCP operations on a non-tcp rule\n");
114 "non-tcp rule\n"); 112 return -EINVAL;
115 return false;
116 } 113 }
117 return true; 114 return 0;
118} 115}
119 116
120static struct xt_target ecn_tg_reg __read_mostly = { 117static struct xt_target ecn_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index acc44c69eb68..72ffc8fda2e9 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -9,10 +9,11 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/if_arp.h>
16#include <linux/ip.h> 17#include <linux/ip.h>
17#include <net/icmp.h> 18#include <net/icmp.h>
18#include <net/udp.h> 19#include <net/udp.h>
@@ -23,16 +24,15 @@
23#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
24#include <linux/netfilter_ipv4/ipt_LOG.h> 25#include <linux/netfilter_ipv4/ipt_LOG.h>
25#include <net/netfilter/nf_log.h> 26#include <net/netfilter/nf_log.h>
27#include <net/netfilter/xt_log.h>
26 28
27MODULE_LICENSE("GPL"); 29MODULE_LICENSE("GPL");
28MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 30MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
29MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); 31MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
30 32
31/* Use lock to serialize, so printks don't overlap */
32static DEFINE_SPINLOCK(log_lock);
33
34/* One level of recursion won't kill us */ 33/* One level of recursion won't kill us */
35static void dump_packet(const struct nf_loginfo *info, 34static void dump_packet(struct sbuff *m,
35 const struct nf_loginfo *info,
36 const struct sk_buff *skb, 36 const struct sk_buff *skb,
37 unsigned int iphoff) 37 unsigned int iphoff)
38{ 38{
@@ -47,35 +47,35 @@ static void dump_packet(const struct nf_loginfo *info,
47 47
48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
49 if (ih == NULL) { 49 if (ih == NULL) {
50 printk("TRUNCATED"); 50 sb_add(m, "TRUNCATED");
51 return; 51 return;
52 } 52 }
53 53
54 /* Important fields: 54 /* Important fields:
55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ 55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ 56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
57 printk("SRC=%pI4 DST=%pI4 ", 57 sb_add(m, "SRC=%pI4 DST=%pI4 ",
58 &ih->saddr, &ih->daddr); 58 &ih->saddr, &ih->daddr);
59 59
60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ 60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
61 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", 61 sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
62 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, 62 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
63 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); 63 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
64 64
65 /* Max length: 6 "CE DF MF " */ 65 /* Max length: 6 "CE DF MF " */
66 if (ntohs(ih->frag_off) & IP_CE) 66 if (ntohs(ih->frag_off) & IP_CE)
67 printk("CE "); 67 sb_add(m, "CE ");
68 if (ntohs(ih->frag_off) & IP_DF) 68 if (ntohs(ih->frag_off) & IP_DF)
69 printk("DF "); 69 sb_add(m, "DF ");
70 if (ntohs(ih->frag_off) & IP_MF) 70 if (ntohs(ih->frag_off) & IP_MF)
71 printk("MF "); 71 sb_add(m, "MF ");
72 72
73 /* Max length: 11 "FRAG:65535 " */ 73 /* Max length: 11 "FRAG:65535 " */
74 if (ntohs(ih->frag_off) & IP_OFFSET) 74 if (ntohs(ih->frag_off) & IP_OFFSET)
75 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 75 sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
76 76
77 if ((logflags & IPT_LOG_IPOPT) 77 if ((logflags & IPT_LOG_IPOPT) &&
78 && ih->ihl * 4 > sizeof(struct iphdr)) { 78 ih->ihl * 4 > sizeof(struct iphdr)) {
79 const unsigned char *op; 79 const unsigned char *op;
80 unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; 80 unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
81 unsigned int i, optsize; 81 unsigned int i, optsize;
@@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
84 op = skb_header_pointer(skb, iphoff+sizeof(_iph), 84 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
85 optsize, _opt); 85 optsize, _opt);
86 if (op == NULL) { 86 if (op == NULL) {
87 printk("TRUNCATED"); 87 sb_add(m, "TRUNCATED");
88 return; 88 return;
89 } 89 }
90 90
91 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 91 /* Max length: 127 "OPT (" 15*4*2chars ") " */
92 printk("OPT ("); 92 sb_add(m, "OPT (");
93 for (i = 0; i < optsize; i++) 93 for (i = 0; i < optsize; i++)
94 printk("%02X", op[i]); 94 sb_add(m, "%02X", op[i]);
95 printk(") "); 95 sb_add(m, ") ");
96 } 96 }
97 97
98 switch (ih->protocol) { 98 switch (ih->protocol) {
@@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
101 const struct tcphdr *th; 101 const struct tcphdr *th;
102 102
103 /* Max length: 10 "PROTO=TCP " */ 103 /* Max length: 10 "PROTO=TCP " */
104 printk("PROTO=TCP "); 104 sb_add(m, "PROTO=TCP ");
105 105
106 if (ntohs(ih->frag_off) & IP_OFFSET) 106 if (ntohs(ih->frag_off) & IP_OFFSET)
107 break; 107 break;
@@ -110,44 +110,44 @@ static void dump_packet(const struct nf_loginfo *info,
110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4, 110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
111 sizeof(_tcph), &_tcph); 111 sizeof(_tcph), &_tcph);
112 if (th == NULL) { 112 if (th == NULL) {
113 printk("INCOMPLETE [%u bytes] ", 113 sb_add(m, "INCOMPLETE [%u bytes] ",
114 skb->len - iphoff - ih->ihl*4); 114 skb->len - iphoff - ih->ihl*4);
115 break; 115 break;
116 } 116 }
117 117
118 /* Max length: 20 "SPT=65535 DPT=65535 " */ 118 /* Max length: 20 "SPT=65535 DPT=65535 " */
119 printk("SPT=%u DPT=%u ", 119 sb_add(m, "SPT=%u DPT=%u ",
120 ntohs(th->source), ntohs(th->dest)); 120 ntohs(th->source), ntohs(th->dest));
121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
122 if (logflags & IPT_LOG_TCPSEQ) 122 if (logflags & IPT_LOG_TCPSEQ)
123 printk("SEQ=%u ACK=%u ", 123 sb_add(m, "SEQ=%u ACK=%u ",
124 ntohl(th->seq), ntohl(th->ack_seq)); 124 ntohl(th->seq), ntohl(th->ack_seq));
125 /* Max length: 13 "WINDOW=65535 " */ 125 /* Max length: 13 "WINDOW=65535 " */
126 printk("WINDOW=%u ", ntohs(th->window)); 126 sb_add(m, "WINDOW=%u ", ntohs(th->window));
127 /* Max length: 9 "RES=0x3F " */ 127 /* Max length: 9 "RES=0x3F " */
128 printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); 128 sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ 129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
130 if (th->cwr) 130 if (th->cwr)
131 printk("CWR "); 131 sb_add(m, "CWR ");
132 if (th->ece) 132 if (th->ece)
133 printk("ECE "); 133 sb_add(m, "ECE ");
134 if (th->urg) 134 if (th->urg)
135 printk("URG "); 135 sb_add(m, "URG ");
136 if (th->ack) 136 if (th->ack)
137 printk("ACK "); 137 sb_add(m, "ACK ");
138 if (th->psh) 138 if (th->psh)
139 printk("PSH "); 139 sb_add(m, "PSH ");
140 if (th->rst) 140 if (th->rst)
141 printk("RST "); 141 sb_add(m, "RST ");
142 if (th->syn) 142 if (th->syn)
143 printk("SYN "); 143 sb_add(m, "SYN ");
144 if (th->fin) 144 if (th->fin)
145 printk("FIN "); 145 sb_add(m, "FIN ");
146 /* Max length: 11 "URGP=65535 " */ 146 /* Max length: 11 "URGP=65535 " */
147 printk("URGP=%u ", ntohs(th->urg_ptr)); 147 sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
148 148
149 if ((logflags & IPT_LOG_TCPOPT) 149 if ((logflags & IPT_LOG_TCPOPT) &&
150 && th->doff * 4 > sizeof(struct tcphdr)) { 150 th->doff * 4 > sizeof(struct tcphdr)) {
151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; 151 unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
152 const unsigned char *op; 152 const unsigned char *op;
153 unsigned int i, optsize; 153 unsigned int i, optsize;
@@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
157 iphoff+ih->ihl*4+sizeof(_tcph), 157 iphoff+ih->ihl*4+sizeof(_tcph),
158 optsize, _opt); 158 optsize, _opt);
159 if (op == NULL) { 159 if (op == NULL) {
160 printk("TRUNCATED"); 160 sb_add(m, "TRUNCATED");
161 return; 161 return;
162 } 162 }
163 163
164 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 164 /* Max length: 127 "OPT (" 15*4*2chars ") " */
165 printk("OPT ("); 165 sb_add(m, "OPT (");
166 for (i = 0; i < optsize; i++) 166 for (i = 0; i < optsize; i++)
167 printk("%02X", op[i]); 167 sb_add(m, "%02X", op[i]);
168 printk(") "); 168 sb_add(m, ") ");
169 } 169 }
170 break; 170 break;
171 } 171 }
@@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
176 176
177 if (ih->protocol == IPPROTO_UDP) 177 if (ih->protocol == IPPROTO_UDP)
178 /* Max length: 10 "PROTO=UDP " */ 178 /* Max length: 10 "PROTO=UDP " */
179 printk("PROTO=UDP " ); 179 sb_add(m, "PROTO=UDP " );
180 else /* Max length: 14 "PROTO=UDPLITE " */ 180 else /* Max length: 14 "PROTO=UDPLITE " */
181 printk("PROTO=UDPLITE "); 181 sb_add(m, "PROTO=UDPLITE ");
182 182
183 if (ntohs(ih->frag_off) & IP_OFFSET) 183 if (ntohs(ih->frag_off) & IP_OFFSET)
184 break; 184 break;
@@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
187 uh = skb_header_pointer(skb, iphoff+ih->ihl*4, 187 uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
188 sizeof(_udph), &_udph); 188 sizeof(_udph), &_udph);
189 if (uh == NULL) { 189 if (uh == NULL) {
190 printk("INCOMPLETE [%u bytes] ", 190 sb_add(m, "INCOMPLETE [%u bytes] ",
191 skb->len - iphoff - ih->ihl*4); 191 skb->len - iphoff - ih->ihl*4);
192 break; 192 break;
193 } 193 }
194 194
195 /* Max length: 20 "SPT=65535 DPT=65535 " */ 195 /* Max length: 20 "SPT=65535 DPT=65535 " */
196 printk("SPT=%u DPT=%u LEN=%u ", 196 sb_add(m, "SPT=%u DPT=%u LEN=%u ",
197 ntohs(uh->source), ntohs(uh->dest), 197 ntohs(uh->source), ntohs(uh->dest),
198 ntohs(uh->len)); 198 ntohs(uh->len));
199 break; 199 break;
@@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
220 [ICMP_ADDRESSREPLY] = 12 }; 220 [ICMP_ADDRESSREPLY] = 12 };
221 221
222 /* Max length: 11 "PROTO=ICMP " */ 222 /* Max length: 11 "PROTO=ICMP " */
223 printk("PROTO=ICMP "); 223 sb_add(m, "PROTO=ICMP ");
224 224
225 if (ntohs(ih->frag_off) & IP_OFFSET) 225 if (ntohs(ih->frag_off) & IP_OFFSET)
226 break; 226 break;
@@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
229 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, 229 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
230 sizeof(_icmph), &_icmph); 230 sizeof(_icmph), &_icmph);
231 if (ich == NULL) { 231 if (ich == NULL) {
232 printk("INCOMPLETE [%u bytes] ", 232 sb_add(m, "INCOMPLETE [%u bytes] ",
233 skb->len - iphoff - ih->ihl*4); 233 skb->len - iphoff - ih->ihl*4);
234 break; 234 break;
235 } 235 }
236 236
237 /* Max length: 18 "TYPE=255 CODE=255 " */ 237 /* Max length: 18 "TYPE=255 CODE=255 " */
238 printk("TYPE=%u CODE=%u ", ich->type, ich->code); 238 sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
239 239
240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
241 if (ich->type <= NR_ICMP_TYPES 241 if (ich->type <= NR_ICMP_TYPES &&
242 && required_len[ich->type] 242 required_len[ich->type] &&
243 && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { 243 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
244 printk("INCOMPLETE [%u bytes] ", 244 sb_add(m, "INCOMPLETE [%u bytes] ",
245 skb->len - iphoff - ih->ihl*4); 245 skb->len - iphoff - ih->ihl*4);
246 break; 246 break;
247 } 247 }
@@ -250,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
250 case ICMP_ECHOREPLY: 250 case ICMP_ECHOREPLY:
251 case ICMP_ECHO: 251 case ICMP_ECHO:
252 /* Max length: 19 "ID=65535 SEQ=65535 " */ 252 /* Max length: 19 "ID=65535 SEQ=65535 " */
253 printk("ID=%u SEQ=%u ", 253 sb_add(m, "ID=%u SEQ=%u ",
254 ntohs(ich->un.echo.id), 254 ntohs(ich->un.echo.id),
255 ntohs(ich->un.echo.sequence)); 255 ntohs(ich->un.echo.sequence));
256 break; 256 break;
257 257
258 case ICMP_PARAMETERPROB: 258 case ICMP_PARAMETERPROB:
259 /* Max length: 14 "PARAMETER=255 " */ 259 /* Max length: 14 "PARAMETER=255 " */
260 printk("PARAMETER=%u ", 260 sb_add(m, "PARAMETER=%u ",
261 ntohl(ich->un.gateway) >> 24); 261 ntohl(ich->un.gateway) >> 24);
262 break; 262 break;
263 case ICMP_REDIRECT: 263 case ICMP_REDIRECT:
264 /* Max length: 24 "GATEWAY=255.255.255.255 " */ 264 /* Max length: 24 "GATEWAY=255.255.255.255 " */
265 printk("GATEWAY=%pI4 ", &ich->un.gateway); 265 sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
266 /* Fall through */ 266 /* Fall through */
267 case ICMP_DEST_UNREACH: 267 case ICMP_DEST_UNREACH:
268 case ICMP_SOURCE_QUENCH: 268 case ICMP_SOURCE_QUENCH:
269 case ICMP_TIME_EXCEEDED: 269 case ICMP_TIME_EXCEEDED:
270 /* Max length: 3+maxlen */ 270 /* Max length: 3+maxlen */
271 if (!iphoff) { /* Only recurse once. */ 271 if (!iphoff) { /* Only recurse once. */
272 printk("["); 272 sb_add(m, "[");
273 dump_packet(info, skb, 273 dump_packet(m, info, skb,
274 iphoff + ih->ihl*4+sizeof(_icmph)); 274 iphoff + ih->ihl*4+sizeof(_icmph));
275 printk("] "); 275 sb_add(m, "] ");
276 } 276 }
277 277
278 /* Max length: 10 "MTU=65535 " */ 278 /* Max length: 10 "MTU=65535 " */
279 if (ich->type == ICMP_DEST_UNREACH 279 if (ich->type == ICMP_DEST_UNREACH &&
280 && ich->code == ICMP_FRAG_NEEDED) 280 ich->code == ICMP_FRAG_NEEDED)
281 printk("MTU=%u ", ntohs(ich->un.frag.mtu)); 281 sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
282 } 282 }
283 break; 283 break;
284 } 284 }
@@ -291,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
291 break; 291 break;
292 292
293 /* Max length: 9 "PROTO=AH " */ 293 /* Max length: 9 "PROTO=AH " */
294 printk("PROTO=AH "); 294 sb_add(m, "PROTO=AH ");
295 295
296 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 296 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
297 ah = skb_header_pointer(skb, iphoff+ih->ihl*4, 297 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
298 sizeof(_ahdr), &_ahdr); 298 sizeof(_ahdr), &_ahdr);
299 if (ah == NULL) { 299 if (ah == NULL) {
300 printk("INCOMPLETE [%u bytes] ", 300 sb_add(m, "INCOMPLETE [%u bytes] ",
301 skb->len - iphoff - ih->ihl*4); 301 skb->len - iphoff - ih->ihl*4);
302 break; 302 break;
303 } 303 }
304 304
305 /* Length: 15 "SPI=0xF1234567 " */ 305 /* Length: 15 "SPI=0xF1234567 " */
306 printk("SPI=0x%x ", ntohl(ah->spi)); 306 sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
307 break; 307 break;
308 } 308 }
309 case IPPROTO_ESP: { 309 case IPPROTO_ESP: {
@@ -311,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
311 const struct ip_esp_hdr *eh; 311 const struct ip_esp_hdr *eh;
312 312
313 /* Max length: 10 "PROTO=ESP " */ 313 /* Max length: 10 "PROTO=ESP " */
314 printk("PROTO=ESP "); 314 sb_add(m, "PROTO=ESP ");
315 315
316 if (ntohs(ih->frag_off) & IP_OFFSET) 316 if (ntohs(ih->frag_off) & IP_OFFSET)
317 break; 317 break;
@@ -320,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
320 eh = skb_header_pointer(skb, iphoff+ih->ihl*4, 320 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
321 sizeof(_esph), &_esph); 321 sizeof(_esph), &_esph);
322 if (eh == NULL) { 322 if (eh == NULL) {
323 printk("INCOMPLETE [%u bytes] ", 323 sb_add(m, "INCOMPLETE [%u bytes] ",
324 skb->len - iphoff - ih->ihl*4); 324 skb->len - iphoff - ih->ihl*4);
325 break; 325 break;
326 } 326 }
327 327
328 /* Length: 15 "SPI=0xF1234567 " */ 328 /* Length: 15 "SPI=0xF1234567 " */
329 printk("SPI=0x%x ", ntohl(eh->spi)); 329 sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
330 break; 330 break;
331 } 331 }
332 /* Max length: 10 "PROTO 255 " */ 332 /* Max length: 10 "PROTO 255 " */
333 default: 333 default:
334 printk("PROTO=%u ", ih->protocol); 334 sb_add(m, "PROTO=%u ", ih->protocol);
335 } 335 }
336 336
337 /* Max length: 15 "UID=4294967295 " */ 337 /* Max length: 15 "UID=4294967295 " */
338 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { 338 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
339 read_lock_bh(&skb->sk->sk_callback_lock); 339 read_lock_bh(&skb->sk->sk_callback_lock);
340 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 340 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
341 printk("UID=%u GID=%u ", 341 sb_add(m, "UID=%u GID=%u ",
342 skb->sk->sk_socket->file->f_cred->fsuid, 342 skb->sk->sk_socket->file->f_cred->fsuid,
343 skb->sk->sk_socket->file->f_cred->fsgid); 343 skb->sk->sk_socket->file->f_cred->fsgid);
344 read_unlock_bh(&skb->sk->sk_callback_lock); 344 read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -346,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
346 346
347 /* Max length: 16 "MARK=0xFFFFFFFF " */ 347 /* Max length: 16 "MARK=0xFFFFFFFF " */
348 if (!iphoff && skb->mark) 348 if (!iphoff && skb->mark)
349 printk("MARK=0x%x ", skb->mark); 349 sb_add(m, "MARK=0x%x ", skb->mark);
350 350
351 /* Proto Max log string length */ 351 /* Proto Max log string length */
352 /* IP: 40+46+6+11+127 = 230 */ 352 /* IP: 40+46+6+11+127 = 230 */
@@ -363,11 +363,48 @@ static void dump_packet(const struct nf_loginfo *info,
363 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 363 /* maxlen = 230+ 91 + 230 + 252 = 803 */
364} 364}
365 365
366static void dump_mac_header(struct sbuff *m,
367 const struct nf_loginfo *info,
368 const struct sk_buff *skb)
369{
370 struct net_device *dev = skb->dev;
371 unsigned int logflags = 0;
372
373 if (info->type == NF_LOG_TYPE_LOG)
374 logflags = info->u.log.logflags;
375
376 if (!(logflags & IPT_LOG_MACDECODE))
377 goto fallback;
378
379 switch (dev->type) {
380 case ARPHRD_ETHER:
381 sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
383 ntohs(eth_hdr(skb)->h_proto));
384 return;
385 default:
386 break;
387 }
388
389fallback:
390 sb_add(m, "MAC=");
391 if (dev->hard_header_len &&
392 skb->mac_header != skb->network_header) {
393 const unsigned char *p = skb_mac_header(skb);
394 unsigned int i;
395
396 sb_add(m, "%02x", *p++);
397 for (i = 1; i < dev->hard_header_len; i++, p++)
398 sb_add(m, ":%02x", *p);
399 }
400 sb_add(m, " ");
401}
402
366static struct nf_loginfo default_loginfo = { 403static struct nf_loginfo default_loginfo = {
367 .type = NF_LOG_TYPE_LOG, 404 .type = NF_LOG_TYPE_LOG,
368 .u = { 405 .u = {
369 .log = { 406 .log = {
370 .level = 0, 407 .level = 5,
371 .logflags = NF_LOG_MASK, 408 .logflags = NF_LOG_MASK,
372 }, 409 },
373 }, 410 },
@@ -382,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
382 const struct nf_loginfo *loginfo, 419 const struct nf_loginfo *loginfo,
383 const char *prefix) 420 const char *prefix)
384{ 421{
422 struct sbuff *m = sb_open();
423
385 if (!loginfo) 424 if (!loginfo)
386 loginfo = &default_loginfo; 425 loginfo = &default_loginfo;
387 426
388 spin_lock_bh(&log_lock); 427 sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
389 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
390 prefix, 428 prefix,
391 in ? in->name : "", 429 in ? in->name : "",
392 out ? out->name : ""); 430 out ? out->name : "");
@@ -397,35 +435,24 @@ ipt_log_packet(u_int8_t pf,
397 435
398 physindev = skb->nf_bridge->physindev; 436 physindev = skb->nf_bridge->physindev;
399 if (physindev && in != physindev) 437 if (physindev && in != physindev)
400 printk("PHYSIN=%s ", physindev->name); 438 sb_add(m, "PHYSIN=%s ", physindev->name);
401 physoutdev = skb->nf_bridge->physoutdev; 439 physoutdev = skb->nf_bridge->physoutdev;
402 if (physoutdev && out != physoutdev) 440 if (physoutdev && out != physoutdev)
403 printk("PHYSOUT=%s ", physoutdev->name); 441 sb_add(m, "PHYSOUT=%s ", physoutdev->name);
404 } 442 }
405#endif 443#endif
406 444
407 if (in && !out) { 445 /* MAC logging for input path only. */
408 /* MAC logging for input chain only. */ 446 if (in && !out)
409 printk("MAC="); 447 dump_mac_header(m, loginfo, skb);
410 if (skb->dev && skb->dev->hard_header_len
411 && skb->mac_header != skb->network_header) {
412 int i;
413 const unsigned char *p = skb_mac_header(skb);
414 for (i = 0; i < skb->dev->hard_header_len; i++,p++)
415 printk("%02x%c", *p,
416 i==skb->dev->hard_header_len - 1
417 ? ' ':':');
418 } else
419 printk(" ");
420 }
421 448
422 dump_packet(loginfo, skb, 0); 449 dump_packet(m, loginfo, skb, 0);
423 printk("\n"); 450
424 spin_unlock_bh(&log_lock); 451 sb_close(m);
425} 452}
426 453
427static unsigned int 454static unsigned int
428log_tg(struct sk_buff *skb, const struct xt_target_param *par) 455log_tg(struct sk_buff *skb, const struct xt_action_param *par)
429{ 456{
430 const struct ipt_log_info *loginfo = par->targinfo; 457 const struct ipt_log_info *loginfo = par->targinfo;
431 struct nf_loginfo li; 458 struct nf_loginfo li;
@@ -439,20 +466,19 @@ log_tg(struct sk_buff *skb, const struct xt_target_param *par)
439 return XT_CONTINUE; 466 return XT_CONTINUE;
440} 467}
441 468
442static bool log_tg_check(const struct xt_tgchk_param *par) 469static int log_tg_check(const struct xt_tgchk_param *par)
443{ 470{
444 const struct ipt_log_info *loginfo = par->targinfo; 471 const struct ipt_log_info *loginfo = par->targinfo;
445 472
446 if (loginfo->level >= 8) { 473 if (loginfo->level >= 8) {
447 pr_debug("LOG: level %u >= 8\n", loginfo->level); 474 pr_debug("level %u >= 8\n", loginfo->level);
448 return false; 475 return -EINVAL;
449 } 476 }
450 if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { 477 if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
451 pr_debug("LOG: prefix term %i\n", 478 pr_debug("prefix is not null-terminated\n");
452 loginfo->prefix[sizeof(loginfo->prefix)-1]); 479 return -EINVAL;
453 return false;
454 } 480 }
455 return true; 481 return 0;
456} 482}
457 483
458static struct xt_target log_tg_reg __read_mostly = { 484static struct xt_target log_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index dada0863946d..d2ed9dc74ebc 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -8,7 +8,7 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/inetdevice.h> 13#include <linux/inetdevice.h>
14#include <linux/ip.h> 14#include <linux/ip.h>
@@ -28,23 +28,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
28MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); 28MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
29 29
30/* FIXME: Multiple targets. --RR */ 30/* FIXME: Multiple targets. --RR */
31static bool masquerade_tg_check(const struct xt_tgchk_param *par) 31static int masquerade_tg_check(const struct xt_tgchk_param *par)
32{ 32{
33 const struct nf_nat_multi_range_compat *mr = par->targinfo; 33 const struct nf_nat_multi_range_compat *mr = par->targinfo;
34 34
35 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 35 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
36 pr_debug("masquerade_check: bad MAP_IPS.\n"); 36 pr_debug("bad MAP_IPS.\n");
37 return false; 37 return -EINVAL;
38 } 38 }
39 if (mr->rangesize != 1) { 39 if (mr->rangesize != 1) {
40 pr_debug("masquerade_check: bad rangesize %u\n", mr->rangesize); 40 pr_debug("bad rangesize %u\n", mr->rangesize);
41 return false; 41 return -EINVAL;
42 } 42 }
43 return true; 43 return 0;
44} 44}
45 45
46static unsigned int 46static unsigned int
47masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) 47masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
48{ 48{
49 struct nf_conn *ct; 49 struct nf_conn *ct;
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
@@ -59,8 +59,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
59 ct = nf_ct_get(skb, &ctinfo); 59 ct = nf_ct_get(skb, &ctinfo);
60 nat = nfct_nat(ct); 60 nat = nfct_nat(ct);
61 61
62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED 62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
63 || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 63 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
64 64
65 /* Source address is 0.0.0.0 - locally generated packet that is 65 /* Source address is 0.0.0.0 - locally generated packet that is
66 * probably not supposed to be masqueraded. 66 * probably not supposed to be masqueraded.
@@ -72,7 +72,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
72 rt = skb_rtable(skb); 72 rt = skb_rtable(skb);
73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); 73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
74 if (!newsrc) { 74 if (!newsrc) {
75 printk("MASQUERADE: %s ate my IP address\n", par->out->name); 75 pr_info("%s ate my IP address\n", par->out->name);
76 return NF_DROP; 76 return NF_DROP;
77 } 77 }
78 78
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 7c29582d4ec8..6cdb298f1035 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -9,7 +9,7 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/ip.h> 13#include <linux/ip.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
@@ -22,23 +22,23 @@ MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); 22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); 23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
24 24
25static bool netmap_tg_check(const struct xt_tgchk_param *par) 25static int netmap_tg_check(const struct xt_tgchk_param *par)
26{ 26{
27 const struct nf_nat_multi_range_compat *mr = par->targinfo; 27 const struct nf_nat_multi_range_compat *mr = par->targinfo;
28 28
29 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { 29 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
30 pr_debug("NETMAP:check: bad MAP_IPS.\n"); 30 pr_debug("bad MAP_IPS.\n");
31 return false; 31 return -EINVAL;
32 } 32 }
33 if (mr->rangesize != 1) { 33 if (mr->rangesize != 1) {
34 pr_debug("NETMAP:check: bad rangesize %u.\n", mr->rangesize); 34 pr_debug("bad rangesize %u.\n", mr->rangesize);
35 return false; 35 return -EINVAL;
36 } 36 }
37 return true; 37 return 0;
38} 38}
39 39
40static unsigned int 40static unsigned int
41netmap_tg(struct sk_buff *skb, const struct xt_target_param *par) 41netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
42{ 42{
43 struct nf_conn *ct; 43 struct nf_conn *ct;
44 enum ip_conntrack_info ctinfo; 44 enum ip_conntrack_info ctinfo;
@@ -48,7 +48,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_target_param *par)
48 48
49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || 49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
50 par->hooknum == NF_INET_POST_ROUTING || 50 par->hooknum == NF_INET_POST_ROUTING ||
51 par->hooknum == NF_INET_LOCAL_OUT); 51 par->hooknum == NF_INET_LOCAL_OUT ||
52 par->hooknum == NF_INET_LOCAL_IN);
52 ct = nf_ct_get(skb, &ctinfo); 53 ct = nf_ct_get(skb, &ctinfo);
53 54
54 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 55 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
@@ -77,7 +78,8 @@ static struct xt_target netmap_tg_reg __read_mostly = {
77 .table = "nat", 78 .table = "nat",
78 .hooks = (1 << NF_INET_PRE_ROUTING) | 79 .hooks = (1 << NF_INET_PRE_ROUTING) |
79 (1 << NF_INET_POST_ROUTING) | 80 (1 << NF_INET_POST_ROUTING) |
80 (1 << NF_INET_LOCAL_OUT), 81 (1 << NF_INET_LOCAL_OUT) |
82 (1 << NF_INET_LOCAL_IN),
81 .checkentry = netmap_tg_check, 83 .checkentry = netmap_tg_check,
82 .me = THIS_MODULE 84 .me = THIS_MODULE
83}; 85};
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 698e5e78685b..18a0656505a0 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -6,7 +6,7 @@
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 */ 8 */
9 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/ip.h> 11#include <linux/ip.h>
12#include <linux/timer.h> 12#include <linux/timer.h>
@@ -26,23 +26,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); 26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
27 27
28/* FIXME: Take multiple ranges --RR */ 28/* FIXME: Take multiple ranges --RR */
29static bool redirect_tg_check(const struct xt_tgchk_param *par) 29static int redirect_tg_check(const struct xt_tgchk_param *par)
30{ 30{
31 const struct nf_nat_multi_range_compat *mr = par->targinfo; 31 const struct nf_nat_multi_range_compat *mr = par->targinfo;
32 32
33 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 33 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
34 pr_debug("redirect_check: bad MAP_IPS.\n"); 34 pr_debug("bad MAP_IPS.\n");
35 return false; 35 return -EINVAL;
36 } 36 }
37 if (mr->rangesize != 1) { 37 if (mr->rangesize != 1) {
38 pr_debug("redirect_check: bad rangesize %u.\n", mr->rangesize); 38 pr_debug("bad rangesize %u.\n", mr->rangesize);
39 return false; 39 return -EINVAL;
40 } 40 }
41 return true; 41 return 0;
42} 42}
43 43
44static unsigned int 44static unsigned int
45redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) 45redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
46{ 46{
47 struct nf_conn *ct; 47 struct nf_conn *ct;
48 enum ip_conntrack_info ctinfo; 48 enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index c93ae44bff2a..1ff79e557f96 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -9,9 +9,10 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/slab.h>
15#include <linux/ip.h> 16#include <linux/ip.h>
16#include <linux/udp.h> 17#include <linux/udp.h>
17#include <linux/icmp.h> 18#include <linux/icmp.h>
@@ -94,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
94 } 95 }
95 96
96 tcph->rst = 1; 97 tcph->rst = 1;
97 tcph->check = tcp_v4_check(sizeof(struct tcphdr), 98 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
98 niph->saddr, niph->daddr, 99 niph->daddr, 0);
99 csum_partial(tcph, 100 nskb->ip_summed = CHECKSUM_PARTIAL;
100 sizeof(struct tcphdr), 0)); 101 nskb->csum_start = (unsigned char *)tcph - nskb->head;
102 nskb->csum_offset = offsetof(struct tcphdr, check);
101 103
102 addr_type = RTN_UNSPEC; 104 addr_type = RTN_UNSPEC;
103 if (hook != NF_INET_FORWARD 105 if (hook != NF_INET_FORWARD
@@ -108,13 +110,13 @@ static void send_reset(struct sk_buff *oldskb, int hook)
108 addr_type = RTN_LOCAL; 110 addr_type = RTN_LOCAL;
109 111
110 /* ip_route_me_harder expects skb->dst to be set */ 112 /* ip_route_me_harder expects skb->dst to be set */
111 skb_dst_set(nskb, dst_clone(skb_dst(oldskb))); 113 skb_dst_set_noref(nskb, skb_dst(oldskb));
112 114
115 nskb->protocol = htons(ETH_P_IP);
113 if (ip_route_me_harder(nskb, addr_type)) 116 if (ip_route_me_harder(nskb, addr_type))
114 goto free_nskb; 117 goto free_nskb;
115 118
116 niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); 119 niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
117 nskb->ip_summed = CHECKSUM_NONE;
118 120
119 /* "Never happens" */ 121 /* "Never happens" */
120 if (nskb->len > dst_mtu(skb_dst(nskb))) 122 if (nskb->len > dst_mtu(skb_dst(nskb)))
@@ -135,13 +137,10 @@ static inline void send_unreach(struct sk_buff *skb_in, int code)
135} 137}
136 138
137static unsigned int 139static unsigned int
138reject_tg(struct sk_buff *skb, const struct xt_target_param *par) 140reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
139{ 141{
140 const struct ipt_reject_info *reject = par->targinfo; 142 const struct ipt_reject_info *reject = par->targinfo;
141 143
142 /* WARNING: This code causes reentry within iptables.
143 This means that the iptables jump stack is now crap. We
144 must return an absolute verdict. --RR */
145 switch (reject->with) { 144 switch (reject->with) {
146 case IPT_ICMP_NET_UNREACHABLE: 145 case IPT_ICMP_NET_UNREACHABLE:
147 send_unreach(skb, ICMP_NET_UNREACH); 146 send_unreach(skb, ICMP_NET_UNREACH);
@@ -174,23 +173,23 @@ reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
174 return NF_DROP; 173 return NF_DROP;
175} 174}
176 175
177static bool reject_tg_check(const struct xt_tgchk_param *par) 176static int reject_tg_check(const struct xt_tgchk_param *par)
178{ 177{
179 const struct ipt_reject_info *rejinfo = par->targinfo; 178 const struct ipt_reject_info *rejinfo = par->targinfo;
180 const struct ipt_entry *e = par->entryinfo; 179 const struct ipt_entry *e = par->entryinfo;
181 180
182 if (rejinfo->with == IPT_ICMP_ECHOREPLY) { 181 if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
183 printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); 182 pr_info("ECHOREPLY no longer supported.\n");
184 return false; 183 return -EINVAL;
185 } else if (rejinfo->with == IPT_TCP_RESET) { 184 } else if (rejinfo->with == IPT_TCP_RESET) {
186 /* Must specify that it's a TCP packet */ 185 /* Must specify that it's a TCP packet */
187 if (e->ip.proto != IPPROTO_TCP 186 if (e->ip.proto != IPPROTO_TCP ||
188 || (e->ip.invflags & XT_INV_PROTO)) { 187 (e->ip.invflags & XT_INV_PROTO)) {
189 printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); 188 pr_info("TCP_RESET invalid for non-tcp\n");
190 return false; 189 return -EINVAL;
191 } 190 }
192 } 191 }
193 return true; 192 return 0;
194} 193}
195 194
196static struct xt_target reject_tg_reg __read_mostly = { 195static struct xt_target reject_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index d32cc4bb328a..446e0f467a17 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -29,10 +29,11 @@
29 * Specify, after how many hundredths of a second the queue should be 29 * Specify, after how many hundredths of a second the queue should be
30 * flushed even if it is not full yet. 30 * flushed even if it is not full yet.
31 */ 31 */
32 32#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/socket.h> 35#include <linux/socket.h>
36#include <linux/slab.h>
36#include <linux/skbuff.h> 37#include <linux/skbuff.h>
37#include <linux/kernel.h> 38#include <linux/kernel.h>
38#include <linux/timer.h> 39#include <linux/timer.h>
@@ -56,8 +57,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
56#define ULOG_NL_EVENT 111 /* Harald's favorite number */ 57#define ULOG_NL_EVENT 111 /* Harald's favorite number */
57#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ 58#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
58 59
59#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
60
61static unsigned int nlbufsiz = NLMSG_GOODSIZE; 60static unsigned int nlbufsiz = NLMSG_GOODSIZE;
62module_param(nlbufsiz, uint, 0400); 61module_param(nlbufsiz, uint, 0400);
63MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); 62MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
@@ -90,12 +89,12 @@ static void ulog_send(unsigned int nlgroupnum)
90 ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; 89 ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
91 90
92 if (timer_pending(&ub->timer)) { 91 if (timer_pending(&ub->timer)) {
93 pr_debug("ipt_ULOG: ulog_send: timer was pending, deleting\n"); 92 pr_debug("ulog_send: timer was pending, deleting\n");
94 del_timer(&ub->timer); 93 del_timer(&ub->timer);
95 } 94 }
96 95
97 if (!ub->skb) { 96 if (!ub->skb) {
98 pr_debug("ipt_ULOG: ulog_send: nothing to send\n"); 97 pr_debug("ulog_send: nothing to send\n");
99 return; 98 return;
100 } 99 }
101 100
@@ -104,7 +103,7 @@ static void ulog_send(unsigned int nlgroupnum)
104 ub->lastnlh->nlmsg_type = NLMSG_DONE; 103 ub->lastnlh->nlmsg_type = NLMSG_DONE;
105 104
106 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; 105 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
107 pr_debug("ipt_ULOG: throwing %d packets to netlink group %u\n", 106 pr_debug("throwing %d packets to netlink group %u\n",
108 ub->qlen, nlgroupnum + 1); 107 ub->qlen, nlgroupnum + 1);
109 netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); 108 netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
110 109
@@ -117,7 +116,7 @@ static void ulog_send(unsigned int nlgroupnum)
117/* timer function to flush queue in flushtimeout time */ 116/* timer function to flush queue in flushtimeout time */
118static void ulog_timer(unsigned long data) 117static void ulog_timer(unsigned long data)
119{ 118{
120 pr_debug("ipt_ULOG: timer function called, calling ulog_send\n"); 119 pr_debug("timer function called, calling ulog_send\n");
121 120
122 /* lock to protect against somebody modifying our structure 121 /* lock to protect against somebody modifying our structure
123 * from ipt_ulog_target at the same time */ 122 * from ipt_ulog_target at the same time */
@@ -138,7 +137,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
138 n = max(size, nlbufsiz); 137 n = max(size, nlbufsiz);
139 skb = alloc_skb(n, GFP_ATOMIC); 138 skb = alloc_skb(n, GFP_ATOMIC);
140 if (!skb) { 139 if (!skb) {
141 PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n", n); 140 pr_debug("cannot alloc whole buffer %ub!\n", n);
142 141
143 if (n > size) { 142 if (n > size) {
144 /* try to allocate only as much as we need for 143 /* try to allocate only as much as we need for
@@ -146,8 +145,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
146 145
147 skb = alloc_skb(size, GFP_ATOMIC); 146 skb = alloc_skb(size, GFP_ATOMIC);
148 if (!skb) 147 if (!skb)
149 PRINTR("ipt_ULOG: can't even allocate %ub\n", 148 pr_debug("cannot even allocate %ub\n", size);
150 size);
151 } 149 }
152 } 150 }
153 151
@@ -198,8 +196,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
198 goto alloc_failure; 196 goto alloc_failure;
199 } 197 }
200 198
201 pr_debug("ipt_ULOG: qlen %d, qthreshold %Zu\n", ub->qlen, 199 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
202 loginfo->qthreshold);
203 200
204 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ 201 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
205 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 202 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
@@ -226,9 +223,9 @@ static void ipt_ulog_packet(unsigned int hooknum,
226 else 223 else
227 *(pm->prefix) = '\0'; 224 *(pm->prefix) = '\0';
228 225
229 if (in && in->hard_header_len > 0 226 if (in && in->hard_header_len > 0 &&
230 && skb->mac_header != skb->network_header 227 skb->mac_header != skb->network_header &&
231 && in->hard_header_len <= ULOG_MAC_LEN) { 228 in->hard_header_len <= ULOG_MAC_LEN) {
232 memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); 229 memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
233 pm->mac_len = in->hard_header_len; 230 pm->mac_len = in->hard_header_len;
234 } else 231 } else
@@ -272,16 +269,14 @@ static void ipt_ulog_packet(unsigned int hooknum,
272 return; 269 return;
273 270
274nlmsg_failure: 271nlmsg_failure:
275 PRINTR("ipt_ULOG: error during NLMSG_PUT\n"); 272 pr_debug("error during NLMSG_PUT\n");
276
277alloc_failure: 273alloc_failure:
278 PRINTR("ipt_ULOG: Error building netlink message\n"); 274 pr_debug("Error building netlink message\n");
279
280 spin_unlock_bh(&ulog_lock); 275 spin_unlock_bh(&ulog_lock);
281} 276}
282 277
283static unsigned int 278static unsigned int
284ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) 279ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
285{ 280{
286 ipt_ulog_packet(par->hooknum, skb, par->in, par->out, 281 ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
287 par->targinfo, NULL); 282 par->targinfo, NULL);
@@ -313,21 +308,20 @@ static void ipt_logfn(u_int8_t pf,
313 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); 308 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
314} 309}
315 310
316static bool ulog_tg_check(const struct xt_tgchk_param *par) 311static int ulog_tg_check(const struct xt_tgchk_param *par)
317{ 312{
318 const struct ipt_ulog_info *loginfo = par->targinfo; 313 const struct ipt_ulog_info *loginfo = par->targinfo;
319 314
320 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { 315 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
321 pr_debug("ipt_ULOG: prefix term %i\n", 316 pr_debug("prefix not null-terminated\n");
322 loginfo->prefix[sizeof(loginfo->prefix) - 1]); 317 return -EINVAL;
323 return false;
324 } 318 }
325 if (loginfo->qthreshold > ULOG_MAX_QLEN) { 319 if (loginfo->qthreshold > ULOG_MAX_QLEN) {
326 pr_debug("ipt_ULOG: queue threshold %Zu > MAX_QLEN\n", 320 pr_debug("queue threshold %Zu > MAX_QLEN\n",
327 loginfo->qthreshold); 321 loginfo->qthreshold);
328 return false; 322 return -EINVAL;
329 } 323 }
330 return true; 324 return 0;
331} 325}
332 326
333#ifdef CONFIG_COMPAT 327#ifdef CONFIG_COMPAT
@@ -338,7 +332,7 @@ struct compat_ipt_ulog_info {
338 char prefix[ULOG_PREFIX_LEN]; 332 char prefix[ULOG_PREFIX_LEN];
339}; 333};
340 334
341static void ulog_tg_compat_from_user(void *dst, void *src) 335static void ulog_tg_compat_from_user(void *dst, const void *src)
342{ 336{
343 const struct compat_ipt_ulog_info *cl = src; 337 const struct compat_ipt_ulog_info *cl = src;
344 struct ipt_ulog_info l = { 338 struct ipt_ulog_info l = {
@@ -351,7 +345,7 @@ static void ulog_tg_compat_from_user(void *dst, void *src)
351 memcpy(dst, &l, sizeof(l)); 345 memcpy(dst, &l, sizeof(l));
352} 346}
353 347
354static int ulog_tg_compat_to_user(void __user *dst, void *src) 348static int ulog_tg_compat_to_user(void __user *dst, const void *src)
355{ 349{
356 const struct ipt_ulog_info *l = src; 350 const struct ipt_ulog_info *l = src;
357 struct compat_ipt_ulog_info cl = { 351 struct compat_ipt_ulog_info cl = {
@@ -389,10 +383,10 @@ static int __init ulog_tg_init(void)
389{ 383{
390 int ret, i; 384 int ret, i;
391 385
392 pr_debug("ipt_ULOG: init module\n"); 386 pr_debug("init module\n");
393 387
394 if (nlbufsiz > 128*1024) { 388 if (nlbufsiz > 128*1024) {
395 printk("Netlink buffer has to be <= 128kB\n"); 389 pr_warning("Netlink buffer has to be <= 128kB\n");
396 return -EINVAL; 390 return -EINVAL;
397 } 391 }
398 392
@@ -422,7 +416,7 @@ static void __exit ulog_tg_exit(void)
422 ulog_buff_t *ub; 416 ulog_buff_t *ub;
423 int i; 417 int i;
424 418
425 pr_debug("ipt_ULOG: cleanup_module\n"); 419 pr_debug("cleanup_module\n");
426 420
427 if (nflog) 421 if (nflog)
428 nf_log_unregister(&ipt_ulog_logger); 422 nf_log_unregister(&ipt_ulog_logger);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 3b216be3bc9f..db8bff0fb86d 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -8,7 +8,7 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
@@ -30,7 +30,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
30} 30}
31 31
32static bool 32static bool
33addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) 33addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
34{ 34{
35 struct net *net = dev_net(par->in ? par->in : par->out); 35 struct net *net = dev_net(par->in ? par->in : par->out);
36 const struct ipt_addrtype_info *info = par->matchinfo; 36 const struct ipt_addrtype_info *info = par->matchinfo;
@@ -48,7 +48,7 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
48} 48}
49 49
50static bool 50static bool
51addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) 51addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
52{ 52{
53 struct net *net = dev_net(par->in ? par->in : par->out); 53 struct net *net = dev_net(par->in ? par->in : par->out);
54 const struct ipt_addrtype_info_v1 *info = par->matchinfo; 54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
@@ -70,34 +70,34 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
70 return ret; 70 return ret;
71} 71}
72 72
73static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) 73static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
74{ 74{
75 struct ipt_addrtype_info_v1 *info = par->matchinfo; 75 struct ipt_addrtype_info_v1 *info = par->matchinfo;
76 76
77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && 77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { 78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
79 printk(KERN_ERR "ipt_addrtype: both incoming and outgoing " 79 pr_info("both incoming and outgoing "
80 "interface limitation cannot be selected\n"); 80 "interface limitation cannot be selected\n");
81 return false; 81 return -EINVAL;
82 } 82 }
83 83
84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | 84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
85 (1 << NF_INET_LOCAL_IN)) && 85 (1 << NF_INET_LOCAL_IN)) &&
86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { 86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
87 printk(KERN_ERR "ipt_addrtype: output interface limitation " 87 pr_info("output interface limitation "
88 "not valid in PRE_ROUTING and INPUT\n"); 88 "not valid in PREROUTING and INPUT\n");
89 return false; 89 return -EINVAL;
90 } 90 }
91 91
92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | 92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
93 (1 << NF_INET_LOCAL_OUT)) && 93 (1 << NF_INET_LOCAL_OUT)) &&
94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { 94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
95 printk(KERN_ERR "ipt_addrtype: input interface limitation " 95 pr_info("input interface limitation "
96 "not valid in POST_ROUTING and OUTPUT\n"); 96 "not valid in POSTROUTING and OUTPUT\n");
97 return false; 97 return -EINVAL;
98 } 98 }
99 99
100 return true; 100 return 0;
101} 101}
102 102
103static struct xt_match addrtype_mt_reg[] __read_mostly = { 103static struct xt_match addrtype_mt_reg[] __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 0104c0b399de..14a2aa8b8a14 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -5,7 +5,7 @@
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 */ 7 */
8 8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9#include <linux/in.h> 9#include <linux/in.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
@@ -18,25 +18,19 @@ MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); 18MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
19MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match"); 19MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
20 20
21#ifdef DEBUG_CONNTRACK
22#define duprintf(format, args...) printk(format , ## args)
23#else
24#define duprintf(format, args...)
25#endif
26
27/* Returns 1 if the spi is matched by the range, 0 otherwise */ 21/* Returns 1 if the spi is matched by the range, 0 otherwise */
28static inline bool 22static inline bool
29spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) 23spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
30{ 24{
31 bool r; 25 bool r;
32 duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', 26 pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
33 min,spi,max); 27 invert ? '!' : ' ', min, spi, max);
34 r=(spi >= min && spi <= max) ^ invert; 28 r=(spi >= min && spi <= max) ^ invert;
35 duprintf(" result %s\n",r? "PASS" : "FAILED"); 29 pr_debug(" result %s\n", r ? "PASS" : "FAILED");
36 return r; 30 return r;
37} 31}
38 32
39static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) 33static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
40{ 34{
41 struct ip_auth_hdr _ahdr; 35 struct ip_auth_hdr _ahdr;
42 const struct ip_auth_hdr *ah; 36 const struct ip_auth_hdr *ah;
@@ -51,8 +45,8 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
51 /* We've been asked to examine this packet, and we 45 /* We've been asked to examine this packet, and we
52 * can't. Hence, no choice but to drop. 46 * can't. Hence, no choice but to drop.
53 */ 47 */
54 duprintf("Dropping evil AH tinygram.\n"); 48 pr_debug("Dropping evil AH tinygram.\n");
55 *par->hotdrop = true; 49 par->hotdrop = true;
56 return 0; 50 return 0;
57 } 51 }
58 52
@@ -61,16 +55,16 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
61 !!(ahinfo->invflags & IPT_AH_INV_SPI)); 55 !!(ahinfo->invflags & IPT_AH_INV_SPI));
62} 56}
63 57
64static bool ah_mt_check(const struct xt_mtchk_param *par) 58static int ah_mt_check(const struct xt_mtchk_param *par)
65{ 59{
66 const struct ipt_ah *ahinfo = par->matchinfo; 60 const struct ipt_ah *ahinfo = par->matchinfo;
67 61
68 /* Must specify no unknown invflags */ 62 /* Must specify no unknown invflags */
69 if (ahinfo->invflags & ~IPT_AH_INV_MASK) { 63 if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
70 duprintf("ipt_ah: unknown flags %X\n", ahinfo->invflags); 64 pr_debug("unknown flags %X\n", ahinfo->invflags);
71 return false; 65 return -EINVAL;
72 } 66 }
73 return true; 67 return 0;
74} 68}
75 69
76static struct xt_match ah_mt_reg __read_mostly = { 70static struct xt_match ah_mt_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 6289b64144c6..af6e9c778345 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -6,7 +6,7 @@
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 */ 8 */
9 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/in.h> 10#include <linux/in.h>
11#include <linux/ip.h> 11#include <linux/ip.h>
12#include <net/ip.h> 12#include <net/ip.h>
@@ -67,7 +67,7 @@ static inline bool match_tcp(const struct sk_buff *skb,
67 return true; 67 return true;
68} 68}
69 69
70static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) 70static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
71{ 71{
72 const struct ipt_ecn_info *info = par->matchinfo; 72 const struct ipt_ecn_info *info = par->matchinfo;
73 73
@@ -78,32 +78,31 @@ static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par)
78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { 78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
79 if (ip_hdr(skb)->protocol != IPPROTO_TCP) 79 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
80 return false; 80 return false;
81 if (!match_tcp(skb, info, par->hotdrop)) 81 if (!match_tcp(skb, info, &par->hotdrop))
82 return false; 82 return false;
83 } 83 }
84 84
85 return true; 85 return true;
86} 86}
87 87
88static bool ecn_mt_check(const struct xt_mtchk_param *par) 88static int ecn_mt_check(const struct xt_mtchk_param *par)
89{ 89{
90 const struct ipt_ecn_info *info = par->matchinfo; 90 const struct ipt_ecn_info *info = par->matchinfo;
91 const struct ipt_ip *ip = par->entryinfo; 91 const struct ipt_ip *ip = par->entryinfo;
92 92
93 if (info->operation & IPT_ECN_OP_MATCH_MASK) 93 if (info->operation & IPT_ECN_OP_MATCH_MASK)
94 return false; 94 return -EINVAL;
95 95
96 if (info->invert & IPT_ECN_OP_MATCH_MASK) 96 if (info->invert & IPT_ECN_OP_MATCH_MASK)
97 return false; 97 return -EINVAL;
98 98
99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) 99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
100 && ip->proto != IPPROTO_TCP) { 100 ip->proto != IPPROTO_TCP) {
101 printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for" 101 pr_info("cannot match TCP bits in rule for non-tcp packets\n");
102 " non-tcp packets\n"); 102 return -EINVAL;
103 return false;
104 } 103 }
105 104
106 return true; 105 return 0;
107} 106}
108 107
109static struct xt_match ecn_mt_reg __read_mostly = { 108static struct xt_match ecn_mt_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index c30a969724f8..c37641e819f2 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/netfilter_ipv4/ip_tables.h> 15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/slab.h>
16#include <net/ip.h> 17#include <net/ip.h>
17 18
18MODULE_LICENSE("GPL"); 19MODULE_LICENSE("GPL");
@@ -23,104 +24,32 @@ MODULE_DESCRIPTION("iptables filter table");
23 (1 << NF_INET_FORWARD) | \ 24 (1 << NF_INET_FORWARD) | \
24 (1 << NF_INET_LOCAL_OUT)) 25 (1 << NF_INET_LOCAL_OUT))
25 26
26static struct 27static const struct xt_table packet_filter = {
27{
28 struct ipt_replace repl;
29 struct ipt_standard entries[3];
30 struct ipt_error term;
31} initial_table __net_initdata = {
32 .repl = {
33 .name = "filter",
34 .valid_hooks = FILTER_VALID_HOOKS,
35 .num_entries = 4,
36 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
37 .hook_entry = {
38 [NF_INET_LOCAL_IN] = 0,
39 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
40 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
41 },
42 .underflow = {
43 [NF_INET_LOCAL_IN] = 0,
44 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
45 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
46 },
47 },
48 .entries = {
49 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
50 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
51 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
52 },
53 .term = IPT_ERROR_INIT, /* ERROR */
54};
55
56static struct xt_table packet_filter = {
57 .name = "filter", 28 .name = "filter",
58 .valid_hooks = FILTER_VALID_HOOKS, 29 .valid_hooks = FILTER_VALID_HOOKS,
59 .me = THIS_MODULE, 30 .me = THIS_MODULE,
60 .af = AF_INET, 31 .af = NFPROTO_IPV4,
32 .priority = NF_IP_PRI_FILTER,
61}; 33};
62 34
63/* The work comes in here from netfilter.c. */
64static unsigned int
65ipt_local_in_hook(unsigned int hook,
66 struct sk_buff *skb,
67 const struct net_device *in,
68 const struct net_device *out,
69 int (*okfn)(struct sk_buff *))
70{
71 return ipt_do_table(skb, hook, in, out,
72 dev_net(in)->ipv4.iptable_filter);
73}
74
75static unsigned int 35static unsigned int
76ipt_hook(unsigned int hook, 36iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
77 struct sk_buff *skb, 37 const struct net_device *in, const struct net_device *out,
78 const struct net_device *in, 38 int (*okfn)(struct sk_buff *))
79 const struct net_device *out,
80 int (*okfn)(struct sk_buff *))
81{ 39{
82 return ipt_do_table(skb, hook, in, out, 40 const struct net *net;
83 dev_net(in)->ipv4.iptable_filter);
84}
85 41
86static unsigned int 42 if (hook == NF_INET_LOCAL_OUT &&
87ipt_local_out_hook(unsigned int hook, 43 (skb->len < sizeof(struct iphdr) ||
88 struct sk_buff *skb, 44 ip_hdrlen(skb) < sizeof(struct iphdr)))
89 const struct net_device *in, 45 /* root is playing with raw sockets. */
90 const struct net_device *out,
91 int (*okfn)(struct sk_buff *))
92{
93 /* root is playing with raw sockets. */
94 if (skb->len < sizeof(struct iphdr) ||
95 ip_hdrlen(skb) < sizeof(struct iphdr))
96 return NF_ACCEPT; 46 return NF_ACCEPT;
97 return ipt_do_table(skb, hook, in, out, 47
98 dev_net(out)->ipv4.iptable_filter); 48 net = dev_net((in != NULL) ? in : out);
49 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
99} 50}
100 51
101static struct nf_hook_ops ipt_ops[] __read_mostly = { 52static struct nf_hook_ops *filter_ops __read_mostly;
102 {
103 .hook = ipt_local_in_hook,
104 .owner = THIS_MODULE,
105 .pf = PF_INET,
106 .hooknum = NF_INET_LOCAL_IN,
107 .priority = NF_IP_PRI_FILTER,
108 },
109 {
110 .hook = ipt_hook,
111 .owner = THIS_MODULE,
112 .pf = PF_INET,
113 .hooknum = NF_INET_FORWARD,
114 .priority = NF_IP_PRI_FILTER,
115 },
116 {
117 .hook = ipt_local_out_hook,
118 .owner = THIS_MODULE,
119 .pf = PF_INET,
120 .hooknum = NF_INET_LOCAL_OUT,
121 .priority = NF_IP_PRI_FILTER,
122 },
123};
124 53
125/* Default to forward because I got too much mail already. */ 54/* Default to forward because I got too much mail already. */
126static int forward = NF_ACCEPT; 55static int forward = NF_ACCEPT;
@@ -128,9 +57,18 @@ module_param(forward, bool, 0000);
128 57
129static int __net_init iptable_filter_net_init(struct net *net) 58static int __net_init iptable_filter_net_init(struct net *net)
130{ 59{
131 /* Register table */ 60 struct ipt_replace *repl;
61
62 repl = ipt_alloc_initial_table(&packet_filter);
63 if (repl == NULL)
64 return -ENOMEM;
65 /* Entry 1 is the FORWARD hook */
66 ((struct ipt_standard *)repl->entries)[1].target.verdict =
67 -forward - 1;
68
132 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
133 ipt_register_table(net, &packet_filter, &initial_table.repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl);
134 if (IS_ERR(net->ipv4.iptable_filter)) 72 if (IS_ERR(net->ipv4.iptable_filter))
135 return PTR_ERR(net->ipv4.iptable_filter); 73 return PTR_ERR(net->ipv4.iptable_filter);
136 return 0; 74 return 0;
@@ -138,7 +76,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
138 76
139static void __net_exit iptable_filter_net_exit(struct net *net) 77static void __net_exit iptable_filter_net_exit(struct net *net)
140{ 78{
141 ipt_unregister_table(net->ipv4.iptable_filter); 79 ipt_unregister_table(net, net->ipv4.iptable_filter);
142} 80}
143 81
144static struct pernet_operations iptable_filter_net_ops = { 82static struct pernet_operations iptable_filter_net_ops = {
@@ -151,21 +89,20 @@ static int __init iptable_filter_init(void)
151 int ret; 89 int ret;
152 90
153 if (forward < 0 || forward > NF_MAX_VERDICT) { 91 if (forward < 0 || forward > NF_MAX_VERDICT) {
154 printk("iptables forward must be 0 or 1\n"); 92 pr_err("iptables forward must be 0 or 1\n");
155 return -EINVAL; 93 return -EINVAL;
156 } 94 }
157 95
158 /* Entry 1 is the FORWARD hook */
159 initial_table.entries[1].target.verdict = -forward - 1;
160
161 ret = register_pernet_subsys(&iptable_filter_net_ops); 96 ret = register_pernet_subsys(&iptable_filter_net_ops);
162 if (ret < 0) 97 if (ret < 0)
163 return ret; 98 return ret;
164 99
165 /* Register hooks */ 100 /* Register hooks */
166 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 101 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
167 if (ret < 0) 102 if (IS_ERR(filter_ops)) {
103 ret = PTR_ERR(filter_ops);
168 goto cleanup_table; 104 goto cleanup_table;
105 }
169 106
170 return ret; 107 return ret;
171 108
@@ -176,7 +113,7 @@ static int __init iptable_filter_init(void)
176 113
177static void __exit iptable_filter_fini(void) 114static void __exit iptable_filter_fini(void)
178{ 115{
179 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 116 xt_hook_unlink(&packet_filter, filter_ops);
180 unregister_pernet_subsys(&iptable_filter_net_ops); 117 unregister_pernet_subsys(&iptable_filter_net_ops);
181} 118}
182 119
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 4087614d9519..294a2a32f293 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -12,6 +12,7 @@
12#include <linux/netfilter_ipv4/ip_tables.h> 12#include <linux/netfilter_ipv4/ip_tables.h>
13#include <linux/netdevice.h> 13#include <linux/netdevice.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/slab.h>
15#include <net/sock.h> 16#include <net/sock.h>
16#include <net/route.h> 17#include <net/route.h>
17#include <linux/ip.h> 18#include <linux/ip.h>
@@ -27,101 +28,16 @@ MODULE_DESCRIPTION("iptables mangle table");
27 (1 << NF_INET_LOCAL_OUT) | \ 28 (1 << NF_INET_LOCAL_OUT) | \
28 (1 << NF_INET_POST_ROUTING)) 29 (1 << NF_INET_POST_ROUTING))
29 30
30/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ 31static const struct xt_table packet_mangler = {
31static struct
32{
33 struct ipt_replace repl;
34 struct ipt_standard entries[5];
35 struct ipt_error term;
36} initial_table __net_initdata = {
37 .repl = {
38 .name = "mangle",
39 .valid_hooks = MANGLE_VALID_HOOKS,
40 .num_entries = 6,
41 .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
42 .hook_entry = {
43 [NF_INET_PRE_ROUTING] = 0,
44 [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
45 [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
46 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
47 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
48 },
49 .underflow = {
50 [NF_INET_PRE_ROUTING] = 0,
51 [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
52 [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
53 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
54 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
55 },
56 },
57 .entries = {
58 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
59 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
60 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
61 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
62 IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */
63 },
64 .term = IPT_ERROR_INIT, /* ERROR */
65};
66
67static struct xt_table packet_mangler = {
68 .name = "mangle", 32 .name = "mangle",
69 .valid_hooks = MANGLE_VALID_HOOKS, 33 .valid_hooks = MANGLE_VALID_HOOKS,
70 .me = THIS_MODULE, 34 .me = THIS_MODULE,
71 .af = AF_INET, 35 .af = NFPROTO_IPV4,
36 .priority = NF_IP_PRI_MANGLE,
72}; 37};
73 38
74/* The work comes in here from netfilter.c. */
75static unsigned int
76ipt_pre_routing_hook(unsigned int hook,
77 struct sk_buff *skb,
78 const struct net_device *in,
79 const struct net_device *out,
80 int (*okfn)(struct sk_buff *))
81{
82 return ipt_do_table(skb, hook, in, out,
83 dev_net(in)->ipv4.iptable_mangle);
84}
85
86static unsigned int
87ipt_post_routing_hook(unsigned int hook,
88 struct sk_buff *skb,
89 const struct net_device *in,
90 const struct net_device *out,
91 int (*okfn)(struct sk_buff *))
92{
93 return ipt_do_table(skb, hook, in, out,
94 dev_net(out)->ipv4.iptable_mangle);
95}
96
97static unsigned int
98ipt_local_in_hook(unsigned int hook,
99 struct sk_buff *skb,
100 const struct net_device *in,
101 const struct net_device *out,
102 int (*okfn)(struct sk_buff *))
103{
104 return ipt_do_table(skb, hook, in, out,
105 dev_net(in)->ipv4.iptable_mangle);
106}
107
108static unsigned int
109ipt_forward_hook(unsigned int hook,
110 struct sk_buff *skb,
111 const struct net_device *in,
112 const struct net_device *out,
113 int (*okfn)(struct sk_buff *))
114{
115 return ipt_do_table(skb, hook, in, out,
116 dev_net(in)->ipv4.iptable_mangle);
117}
118
119static unsigned int 39static unsigned int
120ipt_local_hook(unsigned int hook, 40ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
121 struct sk_buff *skb,
122 const struct net_device *in,
123 const struct net_device *out,
124 int (*okfn)(struct sk_buff *))
125{ 41{
126 unsigned int ret; 42 unsigned int ret;
127 const struct iphdr *iph; 43 const struct iphdr *iph;
@@ -130,8 +46,8 @@ ipt_local_hook(unsigned int hook,
130 u_int32_t mark; 46 u_int32_t mark;
131 47
132 /* root is playing with raw sockets. */ 48 /* root is playing with raw sockets. */
133 if (skb->len < sizeof(struct iphdr) 49 if (skb->len < sizeof(struct iphdr) ||
134 || ip_hdrlen(skb) < sizeof(struct iphdr)) 50 ip_hdrlen(skb) < sizeof(struct iphdr))
135 return NF_ACCEPT; 51 return NF_ACCEPT;
136 52
137 /* Save things which could affect route */ 53 /* Save things which could affect route */
@@ -141,7 +57,7 @@ ipt_local_hook(unsigned int hook,
141 daddr = iph->daddr; 57 daddr = iph->daddr;
142 tos = iph->tos; 58 tos = iph->tos;
143 59
144 ret = ipt_do_table(skb, hook, in, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
145 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
146 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
147 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
@@ -158,49 +74,36 @@ ipt_local_hook(unsigned int hook,
158 return ret; 74 return ret;
159} 75}
160 76
161static struct nf_hook_ops ipt_ops[] __read_mostly = { 77/* The work comes in here from netfilter.c. */
162 { 78static unsigned int
163 .hook = ipt_pre_routing_hook, 79iptable_mangle_hook(unsigned int hook,
164 .owner = THIS_MODULE, 80 struct sk_buff *skb,
165 .pf = PF_INET, 81 const struct net_device *in,
166 .hooknum = NF_INET_PRE_ROUTING, 82 const struct net_device *out,
167 .priority = NF_IP_PRI_MANGLE, 83 int (*okfn)(struct sk_buff *))
168 }, 84{
169 { 85 if (hook == NF_INET_LOCAL_OUT)
170 .hook = ipt_local_in_hook, 86 return ipt_mangle_out(skb, out);
171 .owner = THIS_MODULE, 87 if (hook == NF_INET_POST_ROUTING)
172 .pf = PF_INET, 88 return ipt_do_table(skb, hook, in, out,
173 .hooknum = NF_INET_LOCAL_IN, 89 dev_net(out)->ipv4.iptable_mangle);
174 .priority = NF_IP_PRI_MANGLE, 90 /* PREROUTING/INPUT/FORWARD: */
175 }, 91 return ipt_do_table(skb, hook, in, out,
176 { 92 dev_net(in)->ipv4.iptable_mangle);
177 .hook = ipt_forward_hook, 93}
178 .owner = THIS_MODULE, 94
179 .pf = PF_INET, 95static struct nf_hook_ops *mangle_ops __read_mostly;
180 .hooknum = NF_INET_FORWARD,
181 .priority = NF_IP_PRI_MANGLE,
182 },
183 {
184 .hook = ipt_local_hook,
185 .owner = THIS_MODULE,
186 .pf = PF_INET,
187 .hooknum = NF_INET_LOCAL_OUT,
188 .priority = NF_IP_PRI_MANGLE,
189 },
190 {
191 .hook = ipt_post_routing_hook,
192 .owner = THIS_MODULE,
193 .pf = PF_INET,
194 .hooknum = NF_INET_POST_ROUTING,
195 .priority = NF_IP_PRI_MANGLE,
196 },
197};
198 96
199static int __net_init iptable_mangle_net_init(struct net *net) 97static int __net_init iptable_mangle_net_init(struct net *net)
200{ 98{
201 /* Register table */ 99 struct ipt_replace *repl;
100
101 repl = ipt_alloc_initial_table(&packet_mangler);
102 if (repl == NULL)
103 return -ENOMEM;
202 net->ipv4.iptable_mangle = 104 net->ipv4.iptable_mangle =
203 ipt_register_table(net, &packet_mangler, &initial_table.repl); 105 ipt_register_table(net, &packet_mangler, repl);
106 kfree(repl);
204 if (IS_ERR(net->ipv4.iptable_mangle)) 107 if (IS_ERR(net->ipv4.iptable_mangle))
205 return PTR_ERR(net->ipv4.iptable_mangle); 108 return PTR_ERR(net->ipv4.iptable_mangle);
206 return 0; 109 return 0;
@@ -208,7 +111,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
208 111
209static void __net_exit iptable_mangle_net_exit(struct net *net) 112static void __net_exit iptable_mangle_net_exit(struct net *net)
210{ 113{
211 ipt_unregister_table(net->ipv4.iptable_mangle); 114 ipt_unregister_table(net, net->ipv4.iptable_mangle);
212} 115}
213 116
214static struct pernet_operations iptable_mangle_net_ops = { 117static struct pernet_operations iptable_mangle_net_ops = {
@@ -225,9 +128,11 @@ static int __init iptable_mangle_init(void)
225 return ret; 128 return ret;
226 129
227 /* Register hooks */ 130 /* Register hooks */
228 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 131 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
229 if (ret < 0) 132 if (IS_ERR(mangle_ops)) {
133 ret = PTR_ERR(mangle_ops);
230 goto cleanup_table; 134 goto cleanup_table;
135 }
231 136
232 return ret; 137 return ret;
233 138
@@ -238,7 +143,7 @@ static int __init iptable_mangle_init(void)
238 143
239static void __exit iptable_mangle_fini(void) 144static void __exit iptable_mangle_fini(void)
240{ 145{
241 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 146 xt_hook_unlink(&packet_mangler, mangle_ops);
242 unregister_pernet_subsys(&iptable_mangle_net_ops); 147 unregister_pernet_subsys(&iptable_mangle_net_ops);
243} 148}
244 149
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index e5356da1fb54..07fb710cd722 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -5,94 +5,49 @@
5 */ 5 */
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/netfilter_ipv4/ip_tables.h> 7#include <linux/netfilter_ipv4/ip_tables.h>
8#include <linux/slab.h>
8#include <net/ip.h> 9#include <net/ip.h>
9 10
10#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) 11#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
11 12
12static struct 13static const struct xt_table packet_raw = {
13{
14 struct ipt_replace repl;
15 struct ipt_standard entries[2];
16 struct ipt_error term;
17} initial_table __net_initdata = {
18 .repl = {
19 .name = "raw",
20 .valid_hooks = RAW_VALID_HOOKS,
21 .num_entries = 3,
22 .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
23 .hook_entry = {
24 [NF_INET_PRE_ROUTING] = 0,
25 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
26 },
27 .underflow = {
28 [NF_INET_PRE_ROUTING] = 0,
29 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
30 },
31 },
32 .entries = {
33 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
34 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
35 },
36 .term = IPT_ERROR_INIT, /* ERROR */
37};
38
39static struct xt_table packet_raw = {
40 .name = "raw", 14 .name = "raw",
41 .valid_hooks = RAW_VALID_HOOKS, 15 .valid_hooks = RAW_VALID_HOOKS,
42 .me = THIS_MODULE, 16 .me = THIS_MODULE,
43 .af = AF_INET, 17 .af = NFPROTO_IPV4,
18 .priority = NF_IP_PRI_RAW,
44}; 19};
45 20
46/* The work comes in here from netfilter.c. */ 21/* The work comes in here from netfilter.c. */
47static unsigned int 22static unsigned int
48ipt_hook(unsigned int hook, 23iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
49 struct sk_buff *skb, 24 const struct net_device *in, const struct net_device *out,
50 const struct net_device *in, 25 int (*okfn)(struct sk_buff *))
51 const struct net_device *out,
52 int (*okfn)(struct sk_buff *))
53{ 26{
54 return ipt_do_table(skb, hook, in, out, 27 const struct net *net;
55 dev_net(in)->ipv4.iptable_raw);
56}
57 28
58static unsigned int 29 if (hook == NF_INET_LOCAL_OUT &&
59ipt_local_hook(unsigned int hook, 30 (skb->len < sizeof(struct iphdr) ||
60 struct sk_buff *skb, 31 ip_hdrlen(skb) < sizeof(struct iphdr)))
61 const struct net_device *in, 32 /* root is playing with raw sockets. */
62 const struct net_device *out,
63 int (*okfn)(struct sk_buff *))
64{
65 /* root is playing with raw sockets. */
66 if (skb->len < sizeof(struct iphdr) ||
67 ip_hdrlen(skb) < sizeof(struct iphdr))
68 return NF_ACCEPT; 33 return NF_ACCEPT;
69 return ipt_do_table(skb, hook, in, out, 34
70 dev_net(out)->ipv4.iptable_raw); 35 net = dev_net((in != NULL) ? in : out);
36 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
71} 37}
72 38
73/* 'raw' is the very first table. */ 39static struct nf_hook_ops *rawtable_ops __read_mostly;
74static struct nf_hook_ops ipt_ops[] __read_mostly = {
75 {
76 .hook = ipt_hook,
77 .pf = PF_INET,
78 .hooknum = NF_INET_PRE_ROUTING,
79 .priority = NF_IP_PRI_RAW,
80 .owner = THIS_MODULE,
81 },
82 {
83 .hook = ipt_local_hook,
84 .pf = PF_INET,
85 .hooknum = NF_INET_LOCAL_OUT,
86 .priority = NF_IP_PRI_RAW,
87 .owner = THIS_MODULE,
88 },
89};
90 40
91static int __net_init iptable_raw_net_init(struct net *net) 41static int __net_init iptable_raw_net_init(struct net *net)
92{ 42{
93 /* Register table */ 43 struct ipt_replace *repl;
44
45 repl = ipt_alloc_initial_table(&packet_raw);
46 if (repl == NULL)
47 return -ENOMEM;
94 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
95 ipt_register_table(net, &packet_raw, &initial_table.repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl);
96 if (IS_ERR(net->ipv4.iptable_raw)) 51 if (IS_ERR(net->ipv4.iptable_raw))
97 return PTR_ERR(net->ipv4.iptable_raw); 52 return PTR_ERR(net->ipv4.iptable_raw);
98 return 0; 53 return 0;
@@ -100,7 +55,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
100 55
101static void __net_exit iptable_raw_net_exit(struct net *net) 56static void __net_exit iptable_raw_net_exit(struct net *net)
102{ 57{
103 ipt_unregister_table(net->ipv4.iptable_raw); 58 ipt_unregister_table(net, net->ipv4.iptable_raw);
104} 59}
105 60
106static struct pernet_operations iptable_raw_net_ops = { 61static struct pernet_operations iptable_raw_net_ops = {
@@ -117,9 +72,11 @@ static int __init iptable_raw_init(void)
117 return ret; 72 return ret;
118 73
119 /* Register hooks */ 74 /* Register hooks */
120 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 75 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
121 if (ret < 0) 76 if (IS_ERR(rawtable_ops)) {
77 ret = PTR_ERR(rawtable_ops);
122 goto cleanup_table; 78 goto cleanup_table;
79 }
123 80
124 return ret; 81 return ret;
125 82
@@ -130,7 +87,7 @@ static int __init iptable_raw_init(void)
130 87
131static void __exit iptable_raw_fini(void) 88static void __exit iptable_raw_fini(void)
132{ 89{
133 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 90 xt_hook_unlink(&packet_raw, rawtable_ops);
134 unregister_pernet_subsys(&iptable_raw_net_ops); 91 unregister_pernet_subsys(&iptable_raw_net_ops);
135} 92}
136 93
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 29ab630f240a..be45bdc4c602 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/netfilter_ipv4/ip_tables.h> 19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/slab.h>
20#include <net/ip.h> 21#include <net/ip.h>
21 22
22MODULE_LICENSE("GPL"); 23MODULE_LICENSE("GPL");
@@ -27,109 +28,44 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
27 (1 << NF_INET_FORWARD) | \ 28 (1 << NF_INET_FORWARD) | \
28 (1 << NF_INET_LOCAL_OUT) 29 (1 << NF_INET_LOCAL_OUT)
29 30
30static struct 31static const struct xt_table security_table = {
31{
32 struct ipt_replace repl;
33 struct ipt_standard entries[3];
34 struct ipt_error term;
35} initial_table __net_initdata = {
36 .repl = {
37 .name = "security",
38 .valid_hooks = SECURITY_VALID_HOOKS,
39 .num_entries = 4,
40 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
41 .hook_entry = {
42 [NF_INET_LOCAL_IN] = 0,
43 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
44 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
45 },
46 .underflow = {
47 [NF_INET_LOCAL_IN] = 0,
48 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
49 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
50 },
51 },
52 .entries = {
53 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
54 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
55 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
56 },
57 .term = IPT_ERROR_INIT, /* ERROR */
58};
59
60static struct xt_table security_table = {
61 .name = "security", 32 .name = "security",
62 .valid_hooks = SECURITY_VALID_HOOKS, 33 .valid_hooks = SECURITY_VALID_HOOKS,
63 .me = THIS_MODULE, 34 .me = THIS_MODULE,
64 .af = AF_INET, 35 .af = NFPROTO_IPV4,
36 .priority = NF_IP_PRI_SECURITY,
65}; 37};
66 38
67static unsigned int 39static unsigned int
68ipt_local_in_hook(unsigned int hook, 40iptable_security_hook(unsigned int hook, struct sk_buff *skb,
69 struct sk_buff *skb, 41 const struct net_device *in,
70 const struct net_device *in, 42 const struct net_device *out,
71 const struct net_device *out, 43 int (*okfn)(struct sk_buff *))
72 int (*okfn)(struct sk_buff *))
73{
74 return ipt_do_table(skb, hook, in, out,
75 dev_net(in)->ipv4.iptable_security);
76}
77
78static unsigned int
79ipt_forward_hook(unsigned int hook,
80 struct sk_buff *skb,
81 const struct net_device *in,
82 const struct net_device *out,
83 int (*okfn)(struct sk_buff *))
84{ 44{
85 return ipt_do_table(skb, hook, in, out, 45 const struct net *net;
86 dev_net(in)->ipv4.iptable_security);
87}
88 46
89static unsigned int 47 if (hook == NF_INET_LOCAL_OUT &&
90ipt_local_out_hook(unsigned int hook, 48 (skb->len < sizeof(struct iphdr) ||
91 struct sk_buff *skb, 49 ip_hdrlen(skb) < sizeof(struct iphdr)))
92 const struct net_device *in, 50 /* Somebody is playing with raw sockets. */
93 const struct net_device *out,
94 int (*okfn)(struct sk_buff *))
95{
96 /* Somebody is playing with raw sockets. */
97 if (skb->len < sizeof(struct iphdr)
98 || ip_hdrlen(skb) < sizeof(struct iphdr))
99 return NF_ACCEPT; 51 return NF_ACCEPT;
100 return ipt_do_table(skb, hook, in, out, 52
101 dev_net(out)->ipv4.iptable_security); 53 net = dev_net((in != NULL) ? in : out);
54 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
102} 55}
103 56
104static struct nf_hook_ops ipt_ops[] __read_mostly = { 57static struct nf_hook_ops *sectbl_ops __read_mostly;
105 {
106 .hook = ipt_local_in_hook,
107 .owner = THIS_MODULE,
108 .pf = PF_INET,
109 .hooknum = NF_INET_LOCAL_IN,
110 .priority = NF_IP_PRI_SECURITY,
111 },
112 {
113 .hook = ipt_forward_hook,
114 .owner = THIS_MODULE,
115 .pf = PF_INET,
116 .hooknum = NF_INET_FORWARD,
117 .priority = NF_IP_PRI_SECURITY,
118 },
119 {
120 .hook = ipt_local_out_hook,
121 .owner = THIS_MODULE,
122 .pf = PF_INET,
123 .hooknum = NF_INET_LOCAL_OUT,
124 .priority = NF_IP_PRI_SECURITY,
125 },
126};
127 58
128static int __net_init iptable_security_net_init(struct net *net) 59static int __net_init iptable_security_net_init(struct net *net)
129{ 60{
130 net->ipv4.iptable_security = 61 struct ipt_replace *repl;
131 ipt_register_table(net, &security_table, &initial_table.repl);
132 62
63 repl = ipt_alloc_initial_table(&security_table);
64 if (repl == NULL)
65 return -ENOMEM;
66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl);
68 kfree(repl);
133 if (IS_ERR(net->ipv4.iptable_security)) 69 if (IS_ERR(net->ipv4.iptable_security))
134 return PTR_ERR(net->ipv4.iptable_security); 70 return PTR_ERR(net->ipv4.iptable_security);
135 71
@@ -138,7 +74,7 @@ static int __net_init iptable_security_net_init(struct net *net)
138 74
139static void __net_exit iptable_security_net_exit(struct net *net) 75static void __net_exit iptable_security_net_exit(struct net *net)
140{ 76{
141 ipt_unregister_table(net->ipv4.iptable_security); 77 ipt_unregister_table(net, net->ipv4.iptable_security);
142} 78}
143 79
144static struct pernet_operations iptable_security_net_ops = { 80static struct pernet_operations iptable_security_net_ops = {
@@ -154,9 +90,11 @@ static int __init iptable_security_init(void)
154 if (ret < 0) 90 if (ret < 0)
155 return ret; 91 return ret;
156 92
157 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 93 sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
158 if (ret < 0) 94 if (IS_ERR(sectbl_ops)) {
95 ret = PTR_ERR(sectbl_ops);
159 goto cleanup_table; 96 goto cleanup_table;
97 }
160 98
161 return ret; 99 return ret;
162 100
@@ -167,7 +105,7 @@ cleanup_table:
167 105
168static void __exit iptable_security_fini(void) 106static void __exit iptable_security_fini(void)
169{ 107{
170 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 108 xt_hook_unlink(&security_table, sectbl_ops);
171 unregister_pernet_subsys(&iptable_security_net_ops); 109 unregister_pernet_subsys(&iptable_security_net_ops);
172} 110}
173 111
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 7d2ead7228ac..5a03c02af999 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -22,10 +22,12 @@
22#include <net/netfilter/nf_conntrack_helper.h> 22#include <net/netfilter/nf_conntrack_helper.h>
23#include <net/netfilter/nf_conntrack_l4proto.h> 23#include <net/netfilter/nf_conntrack_l4proto.h>
24#include <net/netfilter/nf_conntrack_l3proto.h> 24#include <net/netfilter/nf_conntrack_l3proto.h>
25#include <net/netfilter/nf_conntrack_zones.h>
25#include <net/netfilter/nf_conntrack_core.h> 26#include <net/netfilter/nf_conntrack_core.h>
26#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 27#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
27#include <net/netfilter/nf_nat_helper.h> 28#include <net/netfilter/nf_nat_helper.h>
28#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 29#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
30#include <net/netfilter/nf_log.h>
29 31
30int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, 32int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
31 struct nf_conn *ct, 33 struct nf_conn *ct,
@@ -113,8 +115,11 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
113 115
114 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), 116 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
115 ct, ctinfo); 117 ct, ctinfo);
116 if (ret != NF_ACCEPT) 118 if (ret != NF_ACCEPT) {
119 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
120 "nf_ct_%s: dropping packet", helper->name);
117 return ret; 121 return ret;
122 }
118 123
119 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { 124 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) {
120 typeof(nf_nat_seq_adjust_hook) seq_adjust; 125 typeof(nf_nat_seq_adjust_hook) seq_adjust;
@@ -158,28 +163,28 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
158 { 163 {
159 .hook = ipv4_conntrack_in, 164 .hook = ipv4_conntrack_in,
160 .owner = THIS_MODULE, 165 .owner = THIS_MODULE,
161 .pf = PF_INET, 166 .pf = NFPROTO_IPV4,
162 .hooknum = NF_INET_PRE_ROUTING, 167 .hooknum = NF_INET_PRE_ROUTING,
163 .priority = NF_IP_PRI_CONNTRACK, 168 .priority = NF_IP_PRI_CONNTRACK,
164 }, 169 },
165 { 170 {
166 .hook = ipv4_conntrack_local, 171 .hook = ipv4_conntrack_local,
167 .owner = THIS_MODULE, 172 .owner = THIS_MODULE,
168 .pf = PF_INET, 173 .pf = NFPROTO_IPV4,
169 .hooknum = NF_INET_LOCAL_OUT, 174 .hooknum = NF_INET_LOCAL_OUT,
170 .priority = NF_IP_PRI_CONNTRACK, 175 .priority = NF_IP_PRI_CONNTRACK,
171 }, 176 },
172 { 177 {
173 .hook = ipv4_confirm, 178 .hook = ipv4_confirm,
174 .owner = THIS_MODULE, 179 .owner = THIS_MODULE,
175 .pf = PF_INET, 180 .pf = NFPROTO_IPV4,
176 .hooknum = NF_INET_POST_ROUTING, 181 .hooknum = NF_INET_POST_ROUTING,
177 .priority = NF_IP_PRI_CONNTRACK_CONFIRM, 182 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
178 }, 183 },
179 { 184 {
180 .hook = ipv4_confirm, 185 .hook = ipv4_confirm,
181 .owner = THIS_MODULE, 186 .owner = THIS_MODULE,
182 .pf = PF_INET, 187 .pf = NFPROTO_IPV4,
183 .hooknum = NF_INET_LOCAL_IN, 188 .hooknum = NF_INET_LOCAL_IN,
184 .priority = NF_IP_PRI_CONNTRACK_CONFIRM, 189 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
185 }, 190 },
@@ -191,7 +196,6 @@ static int log_invalid_proto_max = 255;
191 196
192static ctl_table ip_ct_sysctl_table[] = { 197static ctl_table ip_ct_sysctl_table[] = {
193 { 198 {
194 .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
195 .procname = "ip_conntrack_max", 199 .procname = "ip_conntrack_max",
196 .data = &nf_conntrack_max, 200 .data = &nf_conntrack_max,
197 .maxlen = sizeof(int), 201 .maxlen = sizeof(int),
@@ -199,7 +203,6 @@ static ctl_table ip_ct_sysctl_table[] = {
199 .proc_handler = proc_dointvec, 203 .proc_handler = proc_dointvec,
200 }, 204 },
201 { 205 {
202 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
203 .procname = "ip_conntrack_count", 206 .procname = "ip_conntrack_count",
204 .data = &init_net.ct.count, 207 .data = &init_net.ct.count,
205 .maxlen = sizeof(int), 208 .maxlen = sizeof(int),
@@ -207,15 +210,13 @@ static ctl_table ip_ct_sysctl_table[] = {
207 .proc_handler = proc_dointvec, 210 .proc_handler = proc_dointvec,
208 }, 211 },
209 { 212 {
210 .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
211 .procname = "ip_conntrack_buckets", 213 .procname = "ip_conntrack_buckets",
212 .data = &nf_conntrack_htable_size, 214 .data = &init_net.ct.htable_size,
213 .maxlen = sizeof(unsigned int), 215 .maxlen = sizeof(unsigned int),
214 .mode = 0444, 216 .mode = 0444,
215 .proc_handler = proc_dointvec, 217 .proc_handler = proc_dointvec,
216 }, 218 },
217 { 219 {
218 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
219 .procname = "ip_conntrack_checksum", 220 .procname = "ip_conntrack_checksum",
220 .data = &init_net.ct.sysctl_checksum, 221 .data = &init_net.ct.sysctl_checksum,
221 .maxlen = sizeof(int), 222 .maxlen = sizeof(int),
@@ -223,19 +224,15 @@ static ctl_table ip_ct_sysctl_table[] = {
223 .proc_handler = proc_dointvec, 224 .proc_handler = proc_dointvec,
224 }, 225 },
225 { 226 {
226 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
227 .procname = "ip_conntrack_log_invalid", 227 .procname = "ip_conntrack_log_invalid",
228 .data = &init_net.ct.sysctl_log_invalid, 228 .data = &init_net.ct.sysctl_log_invalid,
229 .maxlen = sizeof(unsigned int), 229 .maxlen = sizeof(unsigned int),
230 .mode = 0644, 230 .mode = 0644,
231 .proc_handler = proc_dointvec_minmax, 231 .proc_handler = proc_dointvec_minmax,
232 .strategy = sysctl_intvec,
233 .extra1 = &log_invalid_proto_min, 232 .extra1 = &log_invalid_proto_min,
234 .extra2 = &log_invalid_proto_max, 233 .extra2 = &log_invalid_proto_max,
235 }, 234 },
236 { 235 { }
237 .ctl_name = 0
238 }
239}; 236};
240#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */ 237#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
241 238
@@ -251,16 +248,16 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
251 struct nf_conntrack_tuple tuple; 248 struct nf_conntrack_tuple tuple;
252 249
253 memset(&tuple, 0, sizeof(tuple)); 250 memset(&tuple, 0, sizeof(tuple));
254 tuple.src.u3.ip = inet->rcv_saddr; 251 tuple.src.u3.ip = inet->inet_rcv_saddr;
255 tuple.src.u.tcp.port = inet->sport; 252 tuple.src.u.tcp.port = inet->inet_sport;
256 tuple.dst.u3.ip = inet->daddr; 253 tuple.dst.u3.ip = inet->inet_daddr;
257 tuple.dst.u.tcp.port = inet->dport; 254 tuple.dst.u.tcp.port = inet->inet_dport;
258 tuple.src.l3num = PF_INET; 255 tuple.src.l3num = PF_INET;
259 tuple.dst.protonum = IPPROTO_TCP; 256 tuple.dst.protonum = sk->sk_protocol;
260 257
261 /* We only do TCP at the moment: is there a better way? */ 258 /* We only do TCP and SCTP at the moment: is there a better way? */
262 if (strcmp(sk->sk_prot->name, "TCP")) { 259 if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) {
263 pr_debug("SO_ORIGINAL_DST: Not a TCP socket\n"); 260 pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
264 return -ENOPROTOOPT; 261 return -ENOPROTOOPT;
265 } 262 }
266 263
@@ -270,7 +267,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
270 return -EINVAL; 267 return -EINVAL;
271 } 268 }
272 269
273 h = nf_conntrack_find_get(sock_net(sk), &tuple); 270 h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
274 if (h) { 271 if (h) {
275 struct sockaddr_in sin; 272 struct sockaddr_in sin;
276 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 273 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -385,32 +382,32 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
385 382
386 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); 383 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
387 if (ret < 0) { 384 if (ret < 0) {
388 printk("nf_conntrack_ipv4: can't register tcp.\n"); 385 pr_err("nf_conntrack_ipv4: can't register tcp.\n");
389 goto cleanup_sockopt; 386 goto cleanup_sockopt;
390 } 387 }
391 388
392 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); 389 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
393 if (ret < 0) { 390 if (ret < 0) {
394 printk("nf_conntrack_ipv4: can't register udp.\n"); 391 pr_err("nf_conntrack_ipv4: can't register udp.\n");
395 goto cleanup_tcp; 392 goto cleanup_tcp;
396 } 393 }
397 394
398 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); 395 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
399 if (ret < 0) { 396 if (ret < 0) {
400 printk("nf_conntrack_ipv4: can't register icmp.\n"); 397 pr_err("nf_conntrack_ipv4: can't register icmp.\n");
401 goto cleanup_udp; 398 goto cleanup_udp;
402 } 399 }
403 400
404 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); 401 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
405 if (ret < 0) { 402 if (ret < 0) {
406 printk("nf_conntrack_ipv4: can't register ipv4\n"); 403 pr_err("nf_conntrack_ipv4: can't register ipv4\n");
407 goto cleanup_icmp; 404 goto cleanup_icmp;
408 } 405 }
409 406
410 ret = nf_register_hooks(ipv4_conntrack_ops, 407 ret = nf_register_hooks(ipv4_conntrack_ops,
411 ARRAY_SIZE(ipv4_conntrack_ops)); 408 ARRAY_SIZE(ipv4_conntrack_ops));
412 if (ret < 0) { 409 if (ret < 0) {
413 printk("nf_conntrack_ipv4: can't register hooks.\n"); 410 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
414 goto cleanup_ipv4; 411 goto cleanup_ipv4;
415 } 412 }
416#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 413#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 8668a3defda6..63f60fc5d26a 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/security.h>
14#include <net/net_namespace.h> 15#include <net/net_namespace.h>
15 16
16#include <linux/netfilter.h> 17#include <linux/netfilter.h>
@@ -32,7 +33,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
32 struct hlist_nulls_node *n; 33 struct hlist_nulls_node *n;
33 34
34 for (st->bucket = 0; 35 for (st->bucket = 0;
35 st->bucket < nf_conntrack_htable_size; 36 st->bucket < net->ct.htable_size;
36 st->bucket++) { 37 st->bucket++) {
37 n = rcu_dereference(net->ct.hash[st->bucket].first); 38 n = rcu_dereference(net->ct.hash[st->bucket].first);
38 if (!is_a_nulls(n)) 39 if (!is_a_nulls(n))
@@ -50,7 +51,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
50 head = rcu_dereference(head->next); 51 head = rcu_dereference(head->next);
51 while (is_a_nulls(head)) { 52 while (is_a_nulls(head)) {
52 if (likely(get_nulls_value(head) == st->bucket)) { 53 if (likely(get_nulls_value(head) == st->bucket)) {
53 if (++st->bucket >= nf_conntrack_htable_size) 54 if (++st->bucket >= net->ct.htable_size)
54 return NULL; 55 return NULL;
55 } 56 }
56 head = rcu_dereference(net->ct.hash[st->bucket].first); 57 head = rcu_dereference(net->ct.hash[st->bucket].first);
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
87 rcu_read_unlock(); 88 rcu_read_unlock();
88} 89}
89 90
91#ifdef CONFIG_NF_CONNTRACK_SECMARK
92static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
93{
94 int ret;
95 u32 len;
96 char *secctx;
97
98 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
99 if (ret)
100 return 0;
101
102 ret = seq_printf(s, "secctx=%s ", secctx);
103
104 security_release_secctx(secctx, len);
105 return ret;
106}
107#else
108static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
109{
110 return 0;
111}
112#endif
113
90static int ct_seq_show(struct seq_file *s, void *v) 114static int ct_seq_show(struct seq_file *s, void *v)
91{ 115{
92 struct nf_conntrack_tuple_hash *hash = v; 116 struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
148 goto release; 172 goto release;
149#endif 173#endif
150 174
151#ifdef CONFIG_NF_CONNTRACK_SECMARK 175 if (ct_show_secctx(s, ct))
152 if (seq_printf(s, "secmark=%u ", ct->secmark))
153 goto release; 176 goto release;
154#endif
155 177
156 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 178 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
157 goto release; 179 goto release;
@@ -336,12 +358,12 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
336 const struct ip_conntrack_stat *st = v; 358 const struct ip_conntrack_stat *st = v;
337 359
338 if (v == SEQ_START_TOKEN) { 360 if (v == SEQ_START_TOKEN) {
339 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); 361 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
340 return 0; 362 return 0;
341 } 363 }
342 364
343 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " 365 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
344 "%08x %08x %08x %08x %08x %08x %08x %08x \n", 366 "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
345 nr_conntracks, 367 nr_conntracks,
346 st->searched, 368 st->searched,
347 st->found, 369 st->found,
@@ -358,7 +380,8 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
358 380
359 st->expect_new, 381 st->expect_new,
360 st->expect_create, 382 st->expect_create,
361 st->expect_delete 383 st->expect_delete,
384 st->search_restart
362 ); 385 );
363 return 0; 386 return 0;
364} 387}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d71ba7677344..7404bde95994 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -18,6 +18,7 @@
18#include <net/netfilter/nf_conntrack_tuple.h> 18#include <net/netfilter/nf_conntrack_tuple.h>
19#include <net/netfilter/nf_conntrack_l4proto.h> 19#include <net/netfilter/nf_conntrack_l4proto.h>
20#include <net/netfilter/nf_conntrack_core.h> 20#include <net/netfilter/nf_conntrack_core.h>
21#include <net/netfilter/nf_conntrack_zones.h>
21#include <net/netfilter/nf_log.h> 22#include <net/netfilter/nf_log.h>
22 23
23static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
@@ -54,8 +55,8 @@ static const u_int8_t invmap[] = {
54static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, 55static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
55 const struct nf_conntrack_tuple *orig) 56 const struct nf_conntrack_tuple *orig)
56{ 57{
57 if (orig->dst.u.icmp.type >= sizeof(invmap) 58 if (orig->dst.u.icmp.type >= sizeof(invmap) ||
58 || !invmap[orig->dst.u.icmp.type]) 59 !invmap[orig->dst.u.icmp.type])
59 return false; 60 return false;
60 61
61 tuple->src.u.icmp.id = orig->src.u.icmp.id; 62 tuple->src.u.icmp.id = orig->src.u.icmp.id;
@@ -101,8 +102,8 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
101 [ICMP_ADDRESS] = 1 102 [ICMP_ADDRESS] = 1
102 }; 103 };
103 104
104 if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) 105 if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
105 || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { 106 !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
106 /* Can't create a new ICMP `conn' with this. */ 107 /* Can't create a new ICMP `conn' with this. */
107 pr_debug("icmp: can't create new conn with type %u\n", 108 pr_debug("icmp: can't create new conn with type %u\n",
108 ct->tuplehash[0].tuple.dst.u.icmp.type); 109 ct->tuplehash[0].tuple.dst.u.icmp.type);
@@ -114,13 +115,14 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
114 115
115/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ 116/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
116static int 117static int
117icmp_error_message(struct net *net, struct sk_buff *skb, 118icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
118 enum ip_conntrack_info *ctinfo, 119 enum ip_conntrack_info *ctinfo,
119 unsigned int hooknum) 120 unsigned int hooknum)
120{ 121{
121 struct nf_conntrack_tuple innertuple, origtuple; 122 struct nf_conntrack_tuple innertuple, origtuple;
122 const struct nf_conntrack_l4proto *innerproto; 123 const struct nf_conntrack_l4proto *innerproto;
123 const struct nf_conntrack_tuple_hash *h; 124 const struct nf_conntrack_tuple_hash *h;
125 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
124 126
125 NF_CT_ASSERT(skb->nfct == NULL); 127 NF_CT_ASSERT(skb->nfct == NULL);
126 128
@@ -146,7 +148,7 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
146 148
147 *ctinfo = IP_CT_RELATED; 149 *ctinfo = IP_CT_RELATED;
148 150
149 h = nf_conntrack_find_get(net, &innertuple); 151 h = nf_conntrack_find_get(net, zone, &innertuple);
150 if (!h) { 152 if (!h) {
151 pr_debug("icmp_error_message: no match\n"); 153 pr_debug("icmp_error_message: no match\n");
152 return -NF_ACCEPT; 154 return -NF_ACCEPT;
@@ -163,7 +165,8 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
163 165
164/* Small and modified version of icmp_rcv */ 166/* Small and modified version of icmp_rcv */
165static int 167static int
166icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, 168icmp_error(struct net *net, struct nf_conn *tmpl,
169 struct sk_buff *skb, unsigned int dataoff,
167 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) 170 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
168{ 171{
169 const struct icmphdr *icmph; 172 const struct icmphdr *icmph;
@@ -201,14 +204,14 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
201 } 204 }
202 205
203 /* Need to track icmp error message? */ 206 /* Need to track icmp error message? */
204 if (icmph->type != ICMP_DEST_UNREACH 207 if (icmph->type != ICMP_DEST_UNREACH &&
205 && icmph->type != ICMP_SOURCE_QUENCH 208 icmph->type != ICMP_SOURCE_QUENCH &&
206 && icmph->type != ICMP_TIME_EXCEEDED 209 icmph->type != ICMP_TIME_EXCEEDED &&
207 && icmph->type != ICMP_PARAMETERPROB 210 icmph->type != ICMP_PARAMETERPROB &&
208 && icmph->type != ICMP_REDIRECT) 211 icmph->type != ICMP_REDIRECT)
209 return NF_ACCEPT; 212 return NF_ACCEPT;
210 213
211 return icmp_error_message(net, skb, ctinfo, hooknum); 214 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
212} 215}
213 216
214#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 217#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
@@ -238,17 +241,17 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
238static int icmp_nlattr_to_tuple(struct nlattr *tb[], 241static int icmp_nlattr_to_tuple(struct nlattr *tb[],
239 struct nf_conntrack_tuple *tuple) 242 struct nf_conntrack_tuple *tuple)
240{ 243{
241 if (!tb[CTA_PROTO_ICMP_TYPE] 244 if (!tb[CTA_PROTO_ICMP_TYPE] ||
242 || !tb[CTA_PROTO_ICMP_CODE] 245 !tb[CTA_PROTO_ICMP_CODE] ||
243 || !tb[CTA_PROTO_ICMP_ID]) 246 !tb[CTA_PROTO_ICMP_ID])
244 return -EINVAL; 247 return -EINVAL;
245 248
246 tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); 249 tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
247 tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); 250 tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
248 tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); 251 tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
249 252
250 if (tuple->dst.u.icmp.type >= sizeof(invmap) 253 if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
251 || !invmap[tuple->dst.u.icmp.type]) 254 !invmap[tuple->dst.u.icmp.type])
252 return -EINVAL; 255 return -EINVAL;
253 256
254 return 0; 257 return 0;
@@ -270,9 +273,7 @@ static struct ctl_table icmp_sysctl_table[] = {
270 .mode = 0644, 273 .mode = 0644,
271 .proc_handler = proc_dointvec_jiffies, 274 .proc_handler = proc_dointvec_jiffies,
272 }, 275 },
273 { 276 { }
274 .ctl_name = 0
275 }
276}; 277};
277#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT 278#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
278static struct ctl_table icmp_compat_sysctl_table[] = { 279static struct ctl_table icmp_compat_sysctl_table[] = {
@@ -283,9 +284,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
283 .mode = 0644, 284 .mode = 0644,
284 .proc_handler = proc_dointvec_jiffies, 285 .proc_handler = proc_dointvec_jiffies,
285 }, 286 },
286 { 287 { }
287 .ctl_name = 0
288 }
289}; 288};
290#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ 289#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
291#endif /* CONFIG_SYSCTL */ 290#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index fa2d6b6fc3e5..f3a9b42b16c6 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -14,8 +14,13 @@
14#include <net/route.h> 14#include <net/route.h>
15#include <net/ip.h> 15#include <net/ip.h>
16 16
17#include <linux/netfilter_bridge.h>
17#include <linux/netfilter_ipv4.h> 18#include <linux/netfilter_ipv4.h>
18#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 19#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
20#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
21#include <net/netfilter/nf_conntrack.h>
22#endif
23#include <net/netfilter/nf_conntrack_zones.h>
19 24
20/* Returns new sk_buff, or NULL */ 25/* Returns new sk_buff, or NULL */
21static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) 26static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -34,26 +39,52 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
34 return err; 39 return err;
35} 40}
36 41
42static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
43 struct sk_buff *skb)
44{
45 u16 zone = NF_CT_DEFAULT_ZONE;
46
47#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
48 if (skb->nfct)
49 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
50#endif
51
52#ifdef CONFIG_BRIDGE_NETFILTER
53 if (skb->nf_bridge &&
54 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
55 return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
56#endif
57 if (hooknum == NF_INET_PRE_ROUTING)
58 return IP_DEFRAG_CONNTRACK_IN + zone;
59 else
60 return IP_DEFRAG_CONNTRACK_OUT + zone;
61}
62
37static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, 63static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
38 struct sk_buff *skb, 64 struct sk_buff *skb,
39 const struct net_device *in, 65 const struct net_device *in,
40 const struct net_device *out, 66 const struct net_device *out,
41 int (*okfn)(struct sk_buff *)) 67 int (*okfn)(struct sk_buff *))
42{ 68{
69 struct sock *sk = skb->sk;
70 struct inet_sock *inet = inet_sk(skb->sk);
71
72 if (sk && (sk->sk_family == PF_INET) &&
73 inet->nodefrag)
74 return NF_ACCEPT;
75
43#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 76#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
44#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) 77#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
45 /* Previously seen (loopback)? Ignore. Do this before 78 /* Previously seen (loopback)? Ignore. Do this before
46 fragment check. */ 79 fragment check. */
47 if (skb->nfct) 80 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
48 return NF_ACCEPT; 81 return NF_ACCEPT;
49#endif 82#endif
50#endif 83#endif
51 /* Gather fragments. */ 84 /* Gather fragments. */
52 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 85 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
53 if (nf_ct_ipv4_gather_frags(skb, 86 enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
54 hooknum == NF_INET_PRE_ROUTING ? 87 if (nf_ct_ipv4_gather_frags(skb, user))
55 IP_DEFRAG_CONNTRACK_IN :
56 IP_DEFRAG_CONNTRACK_OUT))
57 return NF_STOLEN; 88 return NF_STOLEN;
58 } 89 }
59 return NF_ACCEPT; 90 return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b87668250..0f23b3f06df0 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret;
48
47 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
48 if (nf_ct_expect_related(exp) == 0) 50 ret = nf_ct_expect_related(exp);
51 if (ret == 0)
52 break;
53 else if (ret != -EBUSY) {
54 port = 0;
49 break; 55 break;
56 }
50 } 57 }
51 58
52 if (port == 0) 59 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 3229e0a81ba6..c04787ce1a71 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -12,6 +12,7 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/timer.h> 13#include <linux/timer.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/gfp.h>
15#include <net/checksum.h> 16#include <net/checksum.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -30,16 +31,14 @@
30#include <net/netfilter/nf_conntrack_helper.h> 31#include <net/netfilter/nf_conntrack_helper.h>
31#include <net/netfilter/nf_conntrack_l3proto.h> 32#include <net/netfilter/nf_conntrack_l3proto.h>
32#include <net/netfilter/nf_conntrack_l4proto.h> 33#include <net/netfilter/nf_conntrack_l4proto.h>
34#include <net/netfilter/nf_conntrack_zones.h>
33 35
34static DEFINE_SPINLOCK(nf_nat_lock); 36static DEFINE_SPINLOCK(nf_nat_lock);
35 37
36static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
37 39
38/* Calculated at init based on memory size */
39static unsigned int nf_nat_htable_size __read_mostly;
40
41#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
42static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
43 __read_mostly; 42 __read_mostly;
44 43
45static inline const struct nf_nat_protocol * 44static inline const struct nf_nat_protocol *
@@ -48,39 +47,18 @@ __nf_nat_proto_find(u_int8_t protonum)
48 return rcu_dereference(nf_nat_protos[protonum]); 47 return rcu_dereference(nf_nat_protos[protonum]);
49} 48}
50 49
51const struct nf_nat_protocol *
52nf_nat_proto_find_get(u_int8_t protonum)
53{
54 const struct nf_nat_protocol *p;
55
56 rcu_read_lock();
57 p = __nf_nat_proto_find(protonum);
58 if (!try_module_get(p->me))
59 p = &nf_nat_unknown_protocol;
60 rcu_read_unlock();
61
62 return p;
63}
64EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
65
66void
67nf_nat_proto_put(const struct nf_nat_protocol *p)
68{
69 module_put(p->me);
70}
71EXPORT_SYMBOL_GPL(nf_nat_proto_put);
72
73/* We keep an extra hash for each conntrack, for fast searching. */ 50/* We keep an extra hash for each conntrack, for fast searching. */
74static inline unsigned int 51static inline unsigned int
75hash_by_src(const struct nf_conntrack_tuple *tuple) 52hash_by_src(const struct net *net, u16 zone,
53 const struct nf_conntrack_tuple *tuple)
76{ 54{
77 unsigned int hash; 55 unsigned int hash;
78 56
79 /* Original src, to ensure we map it consistently if poss. */ 57 /* Original src, to ensure we map it consistently if poss. */
80 hash = jhash_3words((__force u32)tuple->src.u3.ip, 58 hash = jhash_3words((__force u32)tuple->src.u3.ip,
81 (__force u32)tuple->src.u.all, 59 (__force u32)tuple->src.u.all ^ zone,
82 tuple->dst.protonum, 0); 60 tuple->dst.protonum, 0);
83 return ((u64)hash * nf_nat_htable_size) >> 32; 61 return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
84} 62}
85 63
86/* Is this tuple already taken? (not by us) */ 64/* Is this tuple already taken? (not by us) */
@@ -142,12 +120,12 @@ same_src(const struct nf_conn *ct,
142 120
143/* Only called for SRC manip */ 121/* Only called for SRC manip */
144static int 122static int
145find_appropriate_src(struct net *net, 123find_appropriate_src(struct net *net, u16 zone,
146 const struct nf_conntrack_tuple *tuple, 124 const struct nf_conntrack_tuple *tuple,
147 struct nf_conntrack_tuple *result, 125 struct nf_conntrack_tuple *result,
148 const struct nf_nat_range *range) 126 const struct nf_nat_range *range)
149{ 127{
150 unsigned int h = hash_by_src(tuple); 128 unsigned int h = hash_by_src(net, zone, tuple);
151 const struct nf_conn_nat *nat; 129 const struct nf_conn_nat *nat;
152 const struct nf_conn *ct; 130 const struct nf_conn *ct;
153 const struct hlist_node *n; 131 const struct hlist_node *n;
@@ -155,7 +133,7 @@ find_appropriate_src(struct net *net,
155 rcu_read_lock(); 133 rcu_read_lock();
156 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { 134 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
157 ct = nat->ct; 135 ct = nat->ct;
158 if (same_src(ct, tuple)) { 136 if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
159 /* Copy source part from reply tuple. */ 137 /* Copy source part from reply tuple. */
160 nf_ct_invert_tuplepr(result, 138 nf_ct_invert_tuplepr(result,
161 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 139 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -178,7 +156,7 @@ find_appropriate_src(struct net *net,
178 the ip with the lowest src-ip/dst-ip/proto usage. 156 the ip with the lowest src-ip/dst-ip/proto usage.
179*/ 157*/
180static void 158static void
181find_best_ips_proto(struct nf_conntrack_tuple *tuple, 159find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
182 const struct nf_nat_range *range, 160 const struct nf_nat_range *range,
183 const struct nf_conn *ct, 161 const struct nf_conn *ct,
184 enum nf_nat_manip_type maniptype) 162 enum nf_nat_manip_type maniptype)
@@ -212,7 +190,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple,
212 maxip = ntohl(range->max_ip); 190 maxip = ntohl(range->max_ip);
213 j = jhash_2words((__force u32)tuple->src.u3.ip, 191 j = jhash_2words((__force u32)tuple->src.u3.ip,
214 range->flags & IP_NAT_RANGE_PERSISTENT ? 192 range->flags & IP_NAT_RANGE_PERSISTENT ?
215 (__force u32)tuple->dst.u3.ip : 0, 0); 193 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
216 j = ((u64)j * (maxip - minip + 1)) >> 32; 194 j = ((u64)j * (maxip - minip + 1)) >> 32;
217 *var_ipp = htonl(minip + j); 195 *var_ipp = htonl(minip + j);
218} 196}
@@ -232,6 +210,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
232{ 210{
233 struct net *net = nf_ct_net(ct); 211 struct net *net = nf_ct_net(ct);
234 const struct nf_nat_protocol *proto; 212 const struct nf_nat_protocol *proto;
213 u16 zone = nf_ct_zone(ct);
235 214
236 /* 1) If this srcip/proto/src-proto-part is currently mapped, 215 /* 1) If this srcip/proto/src-proto-part is currently mapped,
237 and that same mapping gives a unique tuple within the given 216 and that same mapping gives a unique tuple within the given
@@ -242,7 +221,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
242 manips not an issue. */ 221 manips not an issue. */
243 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
244 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
245 if (find_appropriate_src(net, orig_tuple, tuple, range)) { 224 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
246 pr_debug("get_unique_tuple: Found current src map\n"); 225 pr_debug("get_unique_tuple: Found current src map\n");
247 if (!nf_nat_used_tuple(tuple, ct)) 226 if (!nf_nat_used_tuple(tuple, ct))
248 return; 227 return;
@@ -252,7 +231,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
252 /* 2) Select the least-used IP/proto combination in the given 231 /* 2) Select the least-used IP/proto combination in the given
253 range. */ 232 range. */
254 *tuple = *orig_tuple; 233 *tuple = *orig_tuple;
255 find_best_ips_proto(tuple, range, ct, maniptype); 234 find_best_ips_proto(zone, tuple, range, ct, maniptype);
256 235
257 /* 3) The per-protocol part of the manip is made to map into 236 /* 3) The per-protocol part of the manip is made to map into
258 the range to make a unique tuple. */ 237 the range to make a unique tuple. */
@@ -260,17 +239,18 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
260 rcu_read_lock(); 239 rcu_read_lock();
261 proto = __nf_nat_proto_find(orig_tuple->dst.protonum); 240 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
262 241
263 /* Change protocol info to have some randomization */
264 if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
265 proto->unique_tuple(tuple, range, maniptype, ct);
266 goto out;
267 }
268
269 /* Only bother mapping if it's not already in range and unique */ 242 /* Only bother mapping if it's not already in range and unique */
270 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || 243 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
271 proto->in_range(tuple, maniptype, &range->min, &range->max)) && 244 if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
272 !nf_nat_used_tuple(tuple, ct)) 245 if (proto->in_range(tuple, maniptype, &range->min,
273 goto out; 246 &range->max) &&
247 (range->min.all == range->max.all ||
248 !nf_nat_used_tuple(tuple, ct)))
249 goto out;
250 } else if (!nf_nat_used_tuple(tuple, ct)) {
251 goto out;
252 }
253 }
274 254
275 /* Last change: get protocol to try to obtain unique tuple. */ 255 /* Last change: get protocol to try to obtain unique tuple. */
276 proto->unique_tuple(tuple, range, maniptype, ct); 256 proto->unique_tuple(tuple, range, maniptype, ct);
@@ -330,7 +310,8 @@ nf_nat_setup_info(struct nf_conn *ct,
330 if (have_to_hash) { 310 if (have_to_hash) {
331 unsigned int srchash; 311 unsigned int srchash;
332 312
333 srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 313 srchash = hash_by_src(net, nf_ct_zone(ct),
314 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
334 spin_lock_bh(&nf_nat_lock); 315 spin_lock_bh(&nf_nat_lock);
335 /* nf_conntrack_alter_reply might re-allocate exntension aera */ 316 /* nf_conntrack_alter_reply might re-allocate exntension aera */
336 nat = nfct_nat(ct); 317 nat = nfct_nat(ct);
@@ -438,7 +419,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
438 if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) 419 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
439 return 0; 420 return 0;
440 421
441 inside = (void *)skb->data + ip_hdrlen(skb); 422 inside = (void *)skb->data + hdrlen;
442 423
443 /* We're actually going to mangle it beyond trivial checksum 424 /* We're actually going to mangle it beyond trivial checksum
444 adjustment, so make sure the current checksum is correct. */ 425 adjustment, so make sure the current checksum is correct. */
@@ -461,6 +442,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
461 return 0; 442 return 0;
462 } 443 }
463 444
445 if (manip == IP_NAT_MANIP_SRC)
446 statusbit = IPS_SRC_NAT;
447 else
448 statusbit = IPS_DST_NAT;
449
450 /* Invert if this is reply dir. */
451 if (dir == IP_CT_DIR_REPLY)
452 statusbit ^= IPS_NAT_MASK;
453
454 if (!(ct->status & statusbit))
455 return 1;
456
464 pr_debug("icmp_reply_translation: translating error %p manip %u " 457 pr_debug("icmp_reply_translation: translating error %p manip %u "
465 "dir %s\n", skb, manip, 458 "dir %s\n", skb, manip,
466 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); 459 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -468,12 +461,10 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
468 /* rcu_read_lock()ed by nf_hook_slow */ 461 /* rcu_read_lock()ed by nf_hook_slow */
469 l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); 462 l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
470 463
471 if (!nf_ct_get_tuple(skb, 464 if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
472 ip_hdrlen(skb) + sizeof(struct icmphdr), 465 (hdrlen +
473 (ip_hdrlen(skb) +
474 sizeof(struct icmphdr) + inside->ip.ihl * 4), 466 sizeof(struct icmphdr) + inside->ip.ihl * 4),
475 (u_int16_t)AF_INET, 467 (u_int16_t)AF_INET, inside->ip.protocol,
476 inside->ip.protocol,
477 &inner, l3proto, l4proto)) 468 &inner, l3proto, l4proto))
478 return 0; 469 return 0;
479 470
@@ -482,15 +473,13 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
482 pass all hooks (locally-generated ICMP). Consider incoming 473 pass all hooks (locally-generated ICMP). Consider incoming
483 packet: PREROUTING (DST manip), routing produces ICMP, goes 474 packet: PREROUTING (DST manip), routing produces ICMP, goes
484 through POSTROUTING (which must correct the DST manip). */ 475 through POSTROUTING (which must correct the DST manip). */
485 if (!manip_pkt(inside->ip.protocol, skb, 476 if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
486 ip_hdrlen(skb) + sizeof(inside->icmp), 477 &ct->tuplehash[!dir].tuple, !manip))
487 &ct->tuplehash[!dir].tuple,
488 !manip))
489 return 0; 478 return 0;
490 479
491 if (skb->ip_summed != CHECKSUM_PARTIAL) { 480 if (skb->ip_summed != CHECKSUM_PARTIAL) {
492 /* Reloading "inside" here since manip_pkt inner. */ 481 /* Reloading "inside" here since manip_pkt inner. */
493 inside = (void *)skb->data + ip_hdrlen(skb); 482 inside = (void *)skb->data + hdrlen;
494 inside->icmp.checksum = 0; 483 inside->icmp.checksum = 0;
495 inside->icmp.checksum = 484 inside->icmp.checksum =
496 csum_fold(skb_checksum(skb, hdrlen, 485 csum_fold(skb_checksum(skb, hdrlen,
@@ -499,20 +488,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
499 488
500 /* Change outer to look the reply to an incoming packet 489 /* Change outer to look the reply to an incoming packet
501 * (proto 0 means don't invert per-proto part). */ 490 * (proto 0 means don't invert per-proto part). */
502 if (manip == IP_NAT_MANIP_SRC) 491 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
503 statusbit = IPS_SRC_NAT; 492 if (!manip_pkt(0, skb, 0, &target, manip))
504 else 493 return 0;
505 statusbit = IPS_DST_NAT;
506
507 /* Invert if this is reply dir. */
508 if (dir == IP_CT_DIR_REPLY)
509 statusbit ^= IPS_NAT_MASK;
510
511 if (ct->status & statusbit) {
512 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
513 if (!manip_pkt(0, skb, 0, &target, manip))
514 return 0;
515 }
516 494
517 return 1; 495 return 1;
518} 496}
@@ -590,6 +568,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
590#include <linux/netfilter/nfnetlink.h> 568#include <linux/netfilter/nfnetlink.h>
591#include <linux/netfilter/nfnetlink_conntrack.h> 569#include <linux/netfilter/nfnetlink_conntrack.h>
592 570
571static const struct nf_nat_protocol *
572nf_nat_proto_find_get(u_int8_t protonum)
573{
574 const struct nf_nat_protocol *p;
575
576 rcu_read_lock();
577 p = __nf_nat_proto_find(protonum);
578 if (!try_module_get(p->me))
579 p = &nf_nat_unknown_protocol;
580 rcu_read_unlock();
581
582 return p;
583}
584
585static void
586nf_nat_proto_put(const struct nf_nat_protocol *p)
587{
588 module_put(p->me);
589}
590
593static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { 591static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
594 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, 592 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
595 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, 593 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
@@ -620,7 +618,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
620}; 618};
621 619
622static int 620static int
623nfnetlink_parse_nat(struct nlattr *nat, 621nfnetlink_parse_nat(const struct nlattr *nat,
624 const struct nf_conn *ct, struct nf_nat_range *range) 622 const struct nf_conn *ct, struct nf_nat_range *range)
625{ 623{
626 struct nlattr *tb[CTA_NAT_MAX+1]; 624 struct nlattr *tb[CTA_NAT_MAX+1];
@@ -656,7 +654,7 @@ nfnetlink_parse_nat(struct nlattr *nat,
656static int 654static int
657nfnetlink_parse_nat_setup(struct nf_conn *ct, 655nfnetlink_parse_nat_setup(struct nf_conn *ct,
658 enum nf_nat_manip_type manip, 656 enum nf_nat_manip_type manip,
659 struct nlattr *attr) 657 const struct nlattr *attr)
660{ 658{
661 struct nf_nat_range range; 659 struct nf_nat_range range;
662 660
@@ -671,7 +669,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
671static int 669static int
672nfnetlink_parse_nat_setup(struct nf_conn *ct, 670nfnetlink_parse_nat_setup(struct nf_conn *ct,
673 enum nf_nat_manip_type manip, 671 enum nf_nat_manip_type manip,
674 struct nlattr *attr) 672 const struct nlattr *attr)
675{ 673{
676 return -EOPNOTSUPP; 674 return -EOPNOTSUPP;
677} 675}
@@ -679,8 +677,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
679 677
680static int __net_init nf_nat_net_init(struct net *net) 678static int __net_init nf_nat_net_init(struct net *net)
681{ 679{
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 680 /* Leave them the same for the moment. */
683 &net->ipv4.nat_vmalloced, 0); 681 net->ipv4.nat_htable_size = net->ct.htable_size;
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
683 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 684 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 685 return -ENOMEM;
686 return 0; 686 return 0;
@@ -703,7 +703,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 703 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 704 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
706 nf_nat_htable_size); 706 net->ipv4.nat_htable_size);
707} 707}
708 708
709static struct pernet_operations nf_nat_net_ops = { 709static struct pernet_operations nf_nat_net_ops = {
@@ -724,9 +724,6 @@ static int __init nf_nat_init(void)
724 return ret; 724 return ret;
725 } 725 }
726 726
727 /* Leave them the same for the moment. */
728 nf_nat_htable_size = nf_conntrack_htable_size;
729
730 ret = register_pernet_subsys(&nf_nat_net_ops); 727 ret = register_pernet_subsys(&nf_nat_net_ops);
731 if (ret < 0) 728 if (ret < 0)
732 goto cleanup_extend; 729 goto cleanup_extend;
@@ -741,7 +738,7 @@ static int __init nf_nat_init(void)
741 spin_unlock_bh(&nf_nat_lock); 738 spin_unlock_bh(&nf_nat_lock);
742 739
743 /* Initialize fake conntrack so that NAT will skip it */ 740 /* Initialize fake conntrack so that NAT will skip it */
744 nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; 741 nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
745 742
746 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); 743 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
747 744
@@ -750,6 +747,8 @@ static int __init nf_nat_init(void)
750 BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); 747 BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
751 rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, 748 rcu_assign_pointer(nfnetlink_parse_nat_setup_hook,
752 nfnetlink_parse_nat_setup); 749 nfnetlink_parse_nat_setup);
750 BUG_ON(nf_ct_nat_offset != NULL);
751 rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset);
753 return 0; 752 return 0;
754 753
755 cleanup_extend: 754 cleanup_extend:
@@ -764,6 +763,7 @@ static void __exit nf_nat_cleanup(void)
764 nf_ct_extend_unregister(&nat_extend); 763 nf_ct_extend_unregister(&nat_extend);
765 rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); 764 rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
766 rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); 765 rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL);
766 rcu_assign_pointer(nf_ct_nat_offset, NULL);
767 synchronize_net(); 767 synchronize_net();
768} 768}
769 769
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index a1d5d58a58bf..dc73abb3fe27 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -27,76 +27,29 @@ MODULE_ALIAS("ip_nat_ftp");
27 27
28/* FIXME: Time out? --RR */ 28/* FIXME: Time out? --RR */
29 29
30static int 30static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
31mangle_rfc959_packet(struct sk_buff *skb, 31 char *buffer, size_t buflen,
32 __be32 newip, 32 __be32 addr, u16 port)
33 u_int16_t port,
34 unsigned int matchoff,
35 unsigned int matchlen,
36 struct nf_conn *ct,
37 enum ip_conntrack_info ctinfo)
38{ 33{
39 char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; 34 switch (type) {
40 35 case NF_CT_FTP_PORT:
41 sprintf(buffer, "%u,%u,%u,%u,%u,%u", 36 case NF_CT_FTP_PASV:
42 NIPQUAD(newip), port>>8, port&0xFF); 37 return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
43 38 ((unsigned char *)&addr)[0],
44 pr_debug("calling nf_nat_mangle_tcp_packet\n"); 39 ((unsigned char *)&addr)[1],
45 40 ((unsigned char *)&addr)[2],
46 return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, 41 ((unsigned char *)&addr)[3],
47 matchlen, buffer, strlen(buffer)); 42 port >> 8,
48} 43 port & 0xFF);
49 44 case NF_CT_FTP_EPRT:
50/* |1|132.235.1.2|6275| */ 45 return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
51static int 46 case NF_CT_FTP_EPSV:
52mangle_eprt_packet(struct sk_buff *skb, 47 return snprintf(buffer, buflen, "|||%u|", port);
53 __be32 newip, 48 }
54 u_int16_t port,
55 unsigned int matchoff,
56 unsigned int matchlen,
57 struct nf_conn *ct,
58 enum ip_conntrack_info ctinfo)
59{
60 char buffer[sizeof("|1|255.255.255.255|65535|")];
61
62 sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
63
64 pr_debug("calling nf_nat_mangle_tcp_packet\n");
65
66 return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
67 matchlen, buffer, strlen(buffer));
68}
69
70/* |1|132.235.1.2|6275| */
71static int
72mangle_epsv_packet(struct sk_buff *skb,
73 __be32 newip,
74 u_int16_t port,
75 unsigned int matchoff,
76 unsigned int matchlen,
77 struct nf_conn *ct,
78 enum ip_conntrack_info ctinfo)
79{
80 char buffer[sizeof("|||65535|")];
81
82 sprintf(buffer, "|||%u|", port);
83
84 pr_debug("calling nf_nat_mangle_tcp_packet\n");
85 49
86 return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, 50 return 0;
87 matchlen, buffer, strlen(buffer));
88} 51}
89 52
90static int (*mangle[])(struct sk_buff *, __be32, u_int16_t,
91 unsigned int, unsigned int, struct nf_conn *,
92 enum ip_conntrack_info)
93= {
94 [NF_CT_FTP_PORT] = mangle_rfc959_packet,
95 [NF_CT_FTP_PASV] = mangle_rfc959_packet,
96 [NF_CT_FTP_EPRT] = mangle_eprt_packet,
97 [NF_CT_FTP_EPSV] = mangle_epsv_packet
98};
99
100/* So, this packet has hit the connection tracking matching code. 53/* So, this packet has hit the connection tracking matching code.
101 Mangle it, and change the expectation to match the new version. */ 54 Mangle it, and change the expectation to match the new version. */
102static unsigned int nf_nat_ftp(struct sk_buff *skb, 55static unsigned int nf_nat_ftp(struct sk_buff *skb,
@@ -110,6 +63,8 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
110 u_int16_t port; 63 u_int16_t port;
111 int dir = CTINFO2DIR(ctinfo); 64 int dir = CTINFO2DIR(ctinfo);
112 struct nf_conn *ct = exp->master; 65 struct nf_conn *ct = exp->master;
66 char buffer[sizeof("|1|255.255.255.255|65535|")];
67 unsigned int buflen;
113 68
114 pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); 69 pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
115 70
@@ -124,19 +79,36 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
124 79
125 /* Try to get same port: if not, try to change it. */ 80 /* Try to get same port: if not, try to change it. */
126 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
82 int ret;
83
127 exp->tuple.dst.u.tcp.port = htons(port); 84 exp->tuple.dst.u.tcp.port = htons(port);
128 if (nf_ct_expect_related(exp) == 0) 85 ret = nf_ct_expect_related(exp);
86 if (ret == 0)
87 break;
88 else if (ret != -EBUSY) {
89 port = 0;
129 break; 90 break;
91 }
130 } 92 }
131 93
132 if (port == 0) 94 if (port == 0)
133 return NF_DROP; 95 return NF_DROP;
134 96
135 if (!mangle[type](skb, newip, port, matchoff, matchlen, ct, ctinfo)) { 97 buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
136 nf_ct_unexpect_related(exp); 98 if (!buflen)
137 return NF_DROP; 99 goto out;
138 } 100
101 pr_debug("calling nf_nat_mangle_tcp_packet\n");
102
103 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
104 matchlen, buffer, buflen))
105 goto out;
106
139 return NF_ACCEPT; 107 return NF_ACCEPT;
108
109out:
110 nf_ct_unexpect_related(exp);
111 return NF_DROP;
140} 112}
141 113
142static void __exit nf_nat_ftp_fini(void) 114static void __exit nf_nat_ftp_fini(void)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 7e8e6fc75413..790f3160e012 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/tcp.h> 13#include <linux/tcp.h>
15#include <net/tcp.h> 14#include <net/tcp.h>
16 15
@@ -44,7 +43,7 @@ static int set_addr(struct sk_buff *skb,
44 addroff, sizeof(buf), 43 addroff, sizeof(buf),
45 (char *) &buf, sizeof(buf))) { 44 (char *) &buf, sizeof(buf))) {
46 if (net_ratelimit()) 45 if (net_ratelimit())
47 printk("nf_nat_h323: nf_nat_mangle_tcp_packet" 46 pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
48 " error\n"); 47 " error\n");
49 return -1; 48 return -1;
50 } 49 }
@@ -60,7 +59,7 @@ static int set_addr(struct sk_buff *skb,
60 addroff, sizeof(buf), 59 addroff, sizeof(buf),
61 (char *) &buf, sizeof(buf))) { 60 (char *) &buf, sizeof(buf))) {
62 if (net_ratelimit()) 61 if (net_ratelimit())
63 printk("nf_nat_h323: nf_nat_mangle_udp_packet" 62 pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
64 " error\n"); 63 " error\n");
65 return -1; 64 return -1;
66 } 65 }
@@ -216,26 +215,37 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
216 /* Run out of expectations */ 215 /* Run out of expectations */
217 if (i >= H323_RTP_CHANNEL_MAX) { 216 if (i >= H323_RTP_CHANNEL_MAX) {
218 if (net_ratelimit()) 217 if (net_ratelimit())
219 printk("nf_nat_h323: out of expectations\n"); 218 pr_notice("nf_nat_h323: out of expectations\n");
220 return 0; 219 return 0;
221 } 220 }
222 221
223 /* Try to get a pair of ports. */ 222 /* Try to get a pair of ports. */
224 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); 223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
225 nated_port != 0; nated_port += 2) { 224 nated_port != 0; nated_port += 2) {
225 int ret;
226
226 rtp_exp->tuple.dst.u.udp.port = htons(nated_port); 227 rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
227 if (nf_ct_expect_related(rtp_exp) == 0) { 228 ret = nf_ct_expect_related(rtp_exp);
229 if (ret == 0) {
228 rtcp_exp->tuple.dst.u.udp.port = 230 rtcp_exp->tuple.dst.u.udp.port =
229 htons(nated_port + 1); 231 htons(nated_port + 1);
230 if (nf_ct_expect_related(rtcp_exp) == 0) 232 ret = nf_ct_expect_related(rtcp_exp);
233 if (ret == 0)
234 break;
235 else if (ret != -EBUSY) {
236 nf_ct_unexpect_related(rtp_exp);
237 nated_port = 0;
231 break; 238 break;
232 nf_ct_unexpect_related(rtp_exp); 239 }
240 } else if (ret != -EBUSY) {
241 nated_port = 0;
242 break;
233 } 243 }
234 } 244 }
235 245
236 if (nated_port == 0) { /* No port available */ 246 if (nated_port == 0) { /* No port available */
237 if (net_ratelimit()) 247 if (net_ratelimit())
238 printk("nf_nat_h323: out of RTP ports\n"); 248 pr_notice("nf_nat_h323: out of RTP ports\n");
239 return 0; 249 return 0;
240 } 250 }
241 251
@@ -285,14 +295,21 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
285 295
286 /* Try to get same port: if not, try to change it. */ 296 /* Try to get same port: if not, try to change it. */
287 for (; nated_port != 0; nated_port++) { 297 for (; nated_port != 0; nated_port++) {
298 int ret;
299
288 exp->tuple.dst.u.tcp.port = htons(nated_port); 300 exp->tuple.dst.u.tcp.port = htons(nated_port);
289 if (nf_ct_expect_related(exp) == 0) 301 ret = nf_ct_expect_related(exp);
302 if (ret == 0)
303 break;
304 else if (ret != -EBUSY) {
305 nated_port = 0;
290 break; 306 break;
307 }
291 } 308 }
292 309
293 if (nated_port == 0) { /* No port available */ 310 if (nated_port == 0) { /* No port available */
294 if (net_ratelimit()) 311 if (net_ratelimit())
295 printk("nf_nat_h323: out of TCP ports\n"); 312 pr_notice("nf_nat_h323: out of TCP ports\n");
296 return 0; 313 return 0;
297 } 314 }
298 315
@@ -335,14 +352,21 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
335 352
336 /* Try to get same port: if not, try to change it. */ 353 /* Try to get same port: if not, try to change it. */
337 for (; nated_port != 0; nated_port++) { 354 for (; nated_port != 0; nated_port++) {
355 int ret;
356
338 exp->tuple.dst.u.tcp.port = htons(nated_port); 357 exp->tuple.dst.u.tcp.port = htons(nated_port);
339 if (nf_ct_expect_related(exp) == 0) 358 ret = nf_ct_expect_related(exp);
359 if (ret == 0)
340 break; 360 break;
361 else if (ret != -EBUSY) {
362 nated_port = 0;
363 break;
364 }
341 } 365 }
342 366
343 if (nated_port == 0) { /* No port available */ 367 if (nated_port == 0) { /* No port available */
344 if (net_ratelimit()) 368 if (net_ratelimit())
345 printk("nf_nat_q931: out of TCP ports\n"); 369 pr_notice("nf_nat_q931: out of TCP ports\n");
346 return 0; 370 return 0;
347 } 371 }
348 372
@@ -419,14 +443,21 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
419 443
420 /* Try to get same port: if not, try to change it. */ 444 /* Try to get same port: if not, try to change it. */
421 for (; nated_port != 0; nated_port++) { 445 for (; nated_port != 0; nated_port++) {
446 int ret;
447
422 exp->tuple.dst.u.tcp.port = htons(nated_port); 448 exp->tuple.dst.u.tcp.port = htons(nated_port);
423 if (nf_ct_expect_related(exp) == 0) 449 ret = nf_ct_expect_related(exp);
450 if (ret == 0)
451 break;
452 else if (ret != -EBUSY) {
453 nated_port = 0;
424 break; 454 break;
455 }
425 } 456 }
426 457
427 if (nated_port == 0) { /* No port available */ 458 if (nated_port == 0) { /* No port available */
428 if (net_ratelimit()) 459 if (net_ratelimit())
429 printk("nf_nat_ras: out of TCP ports\n"); 460 pr_notice("nf_nat_ras: out of TCP ports\n");
430 return 0; 461 return 0;
431 } 462 }
432 463
@@ -501,14 +532,21 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
501 532
502 /* Try to get same port: if not, try to change it. */ 533 /* Try to get same port: if not, try to change it. */
503 for (nated_port = ntohs(port); nated_port != 0; nated_port++) { 534 for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
535 int ret;
536
504 exp->tuple.dst.u.tcp.port = htons(nated_port); 537 exp->tuple.dst.u.tcp.port = htons(nated_port);
505 if (nf_ct_expect_related(exp) == 0) 538 ret = nf_ct_expect_related(exp);
539 if (ret == 0)
506 break; 540 break;
541 else if (ret != -EBUSY) {
542 nated_port = 0;
543 break;
544 }
507 } 545 }
508 546
509 if (nated_port == 0) { /* No port available */ 547 if (nated_port == 0) { /* No port available */
510 if (net_ratelimit()) 548 if (net_ratelimit())
511 printk("nf_nat_q931: out of TCP ports\n"); 549 pr_notice("nf_nat_q931: out of TCP ports\n");
512 return 0; 550 return 0;
513 } 551 }
514 552
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 155c008626c8..31427fb57aa8 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -8,6 +8,7 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/gfp.h>
11#include <linux/kmod.h> 12#include <linux/kmod.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/timer.h> 14#include <linux/timer.h>
@@ -41,18 +42,14 @@ adjust_tcp_sequence(u32 seq,
41 struct nf_conn *ct, 42 struct nf_conn *ct,
42 enum ip_conntrack_info ctinfo) 43 enum ip_conntrack_info ctinfo)
43{ 44{
44 int dir; 45 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
45 struct nf_nat_seq *this_way, *other_way;
46 struct nf_conn_nat *nat = nfct_nat(ct); 46 struct nf_conn_nat *nat = nfct_nat(ct);
47 struct nf_nat_seq *this_way = &nat->seq[dir];
47 48
48 pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", seq, seq); 49 pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
49 50 seq, sizediff);
50 dir = CTINFO2DIR(ctinfo);
51
52 this_way = &nat->seq[dir];
53 other_way = &nat->seq[!dir];
54 51
55 pr_debug("nf_nat_resize_packet: Seq_offset before: "); 52 pr_debug("adjust_tcp_sequence: Seq_offset before: ");
56 DUMP_OFFSET(this_way); 53 DUMP_OFFSET(this_way);
57 54
58 spin_lock_bh(&nf_nat_seqofs_lock); 55 spin_lock_bh(&nf_nat_seqofs_lock);
@@ -63,16 +60,38 @@ adjust_tcp_sequence(u32 seq,
63 * retransmit */ 60 * retransmit */
64 if (this_way->offset_before == this_way->offset_after || 61 if (this_way->offset_before == this_way->offset_after ||
65 before(this_way->correction_pos, seq)) { 62 before(this_way->correction_pos, seq)) {
66 this_way->correction_pos = seq; 63 this_way->correction_pos = seq;
67 this_way->offset_before = this_way->offset_after; 64 this_way->offset_before = this_way->offset_after;
68 this_way->offset_after += sizediff; 65 this_way->offset_after += sizediff;
69 } 66 }
70 spin_unlock_bh(&nf_nat_seqofs_lock); 67 spin_unlock_bh(&nf_nat_seqofs_lock);
71 68
72 pr_debug("nf_nat_resize_packet: Seq_offset after: "); 69 pr_debug("adjust_tcp_sequence: Seq_offset after: ");
73 DUMP_OFFSET(this_way); 70 DUMP_OFFSET(this_way);
74} 71}
75 72
73/* Get the offset value, for conntrack */
74s16 nf_nat_get_offset(const struct nf_conn *ct,
75 enum ip_conntrack_dir dir,
76 u32 seq)
77{
78 struct nf_conn_nat *nat = nfct_nat(ct);
79 struct nf_nat_seq *this_way;
80 s16 offset;
81
82 if (!nat)
83 return 0;
84
85 this_way = &nat->seq[dir];
86 spin_lock_bh(&nf_nat_seqofs_lock);
87 offset = after(seq, this_way->correction_pos)
88 ? this_way->offset_after : this_way->offset_before;
89 spin_unlock_bh(&nf_nat_seqofs_lock);
90
91 return offset;
92}
93EXPORT_SYMBOL_GPL(nf_nat_get_offset);
94
76/* Frobs data inside this packet, which is linear. */ 95/* Frobs data inside this packet, which is linear. */
77static void mangle_contents(struct sk_buff *skb, 96static void mangle_contents(struct sk_buff *skb,
78 unsigned int dataoff, 97 unsigned int dataoff,
@@ -123,6 +142,46 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
123 return 1; 142 return 1;
124} 143}
125 144
145void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
146 __be32 seq, s16 off)
147{
148 if (!off)
149 return;
150 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
151 adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
152 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155
156static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen)
158{
159 struct rtable *rt = skb_rtable(skb);
160
161 if (skb->ip_summed != CHECKSUM_PARTIAL) {
162 if (!(rt->rt_flags & RTCF_LOCAL) &&
163 skb->dev->features & NETIF_F_V4_CSUM) {
164 skb->ip_summed = CHECKSUM_PARTIAL;
165 skb->csum_start = skb_headroom(skb) +
166 skb_network_offset(skb) +
167 iph->ihl * 4;
168 skb->csum_offset = (void *)check - data;
169 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
170 datalen, iph->protocol, 0);
171 } else {
172 *check = 0;
173 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
174 datalen, iph->protocol,
175 csum_partial(data, datalen,
176 0));
177 if (iph->protocol == IPPROTO_UDP && !*check)
178 *check = CSUM_MANGLED_0;
179 }
180 } else
181 inet_proto_csum_replace2(check, skb,
182 htons(oldlen), htons(datalen), 1);
183}
184
126/* Generic function for mangling variable-length address changes inside 185/* Generic function for mangling variable-length address changes inside
127 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX 186 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
128 * command in FTP). 187 * command in FTP).
@@ -131,16 +190,14 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
131 * skb enlargement, ... 190 * skb enlargement, ...
132 * 191 *
133 * */ 192 * */
134int 193int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
135nf_nat_mangle_tcp_packet(struct sk_buff *skb, 194 struct nf_conn *ct,
136 struct nf_conn *ct, 195 enum ip_conntrack_info ctinfo,
137 enum ip_conntrack_info ctinfo, 196 unsigned int match_offset,
138 unsigned int match_offset, 197 unsigned int match_len,
139 unsigned int match_len, 198 const char *rep_buffer,
140 const char *rep_buffer, 199 unsigned int rep_len, bool adjust)
141 unsigned int rep_len)
142{ 200{
143 struct rtable *rt = skb_rtable(skb);
144 struct iphdr *iph; 201 struct iphdr *iph;
145 struct tcphdr *tcph; 202 struct tcphdr *tcph;
146 int oldlen, datalen; 203 int oldlen, datalen;
@@ -163,41 +220,15 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
163 match_offset, match_len, rep_buffer, rep_len); 220 match_offset, match_len, rep_buffer, rep_len);
164 221
165 datalen = skb->len - iph->ihl*4; 222 datalen = skb->len - iph->ihl*4;
166 if (skb->ip_summed != CHECKSUM_PARTIAL) { 223 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
167 if (!(rt->rt_flags & RTCF_LOCAL) &&
168 skb->dev->features & NETIF_F_V4_CSUM) {
169 skb->ip_summed = CHECKSUM_PARTIAL;
170 skb->csum_start = skb_headroom(skb) +
171 skb_network_offset(skb) +
172 iph->ihl * 4;
173 skb->csum_offset = offsetof(struct tcphdr, check);
174 tcph->check = ~tcp_v4_check(datalen,
175 iph->saddr, iph->daddr, 0);
176 } else {
177 tcph->check = 0;
178 tcph->check = tcp_v4_check(datalen,
179 iph->saddr, iph->daddr,
180 csum_partial(tcph,
181 datalen, 0));
182 }
183 } else
184 inet_proto_csum_replace2(&tcph->check, skb,
185 htons(oldlen), htons(datalen), 1);
186 224
187 if (rep_len != match_len) { 225 if (adjust && rep_len != match_len)
188 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); 226 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
189 adjust_tcp_sequence(ntohl(tcph->seq), 227 (int)rep_len - (int)match_len);
190 (int)rep_len - (int)match_len,
191 ct, ctinfo);
192 /* Tell TCP window tracking about seq change */
193 nf_conntrack_tcp_update(skb, ip_hdrlen(skb),
194 ct, CTINFO2DIR(ctinfo));
195 228
196 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
197 }
198 return 1; 229 return 1;
199} 230}
200EXPORT_SYMBOL(nf_nat_mangle_tcp_packet); 231EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
201 232
202/* Generic function for mangling variable-length address changes inside 233/* Generic function for mangling variable-length address changes inside
203 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX 234 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
@@ -218,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
218 const char *rep_buffer, 249 const char *rep_buffer,
219 unsigned int rep_len) 250 unsigned int rep_len)
220{ 251{
221 struct rtable *rt = skb_rtable(skb);
222 struct iphdr *iph; 252 struct iphdr *iph;
223 struct udphdr *udph; 253 struct udphdr *udph;
224 int datalen, oldlen; 254 int datalen, oldlen;
@@ -252,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
252 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) 282 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
253 return 1; 283 return 1;
254 284
255 if (skb->ip_summed != CHECKSUM_PARTIAL) { 285 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
256 if (!(rt->rt_flags & RTCF_LOCAL) &&
257 skb->dev->features & NETIF_F_V4_CSUM) {
258 skb->ip_summed = CHECKSUM_PARTIAL;
259 skb->csum_start = skb_headroom(skb) +
260 skb_network_offset(skb) +
261 iph->ihl * 4;
262 skb->csum_offset = offsetof(struct udphdr, check);
263 udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
264 datalen, IPPROTO_UDP,
265 0);
266 } else {
267 udph->check = 0;
268 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
269 datalen, IPPROTO_UDP,
270 csum_partial(udph,
271 datalen, 0));
272 if (!udph->check)
273 udph->check = CSUM_MANGLED_0;
274 }
275 } else
276 inet_proto_csum_replace2(&udph->check, skb,
277 htons(oldlen), htons(datalen), 1);
278 286
279 return 1; 287 return 1;
280} 288}
@@ -377,6 +385,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
377 struct tcphdr *tcph; 385 struct tcphdr *tcph;
378 int dir; 386 int dir;
379 __be32 newseq, newack; 387 __be32 newseq, newack;
388 s16 seqoff, ackoff;
380 struct nf_conn_nat *nat = nfct_nat(ct); 389 struct nf_conn_nat *nat = nfct_nat(ct);
381 struct nf_nat_seq *this_way, *other_way; 390 struct nf_nat_seq *this_way, *other_way;
382 391
@@ -390,15 +399,18 @@ nf_nat_seq_adjust(struct sk_buff *skb,
390 399
391 tcph = (void *)skb->data + ip_hdrlen(skb); 400 tcph = (void *)skb->data + ip_hdrlen(skb);
392 if (after(ntohl(tcph->seq), this_way->correction_pos)) 401 if (after(ntohl(tcph->seq), this_way->correction_pos))
393 newseq = htonl(ntohl(tcph->seq) + this_way->offset_after); 402 seqoff = this_way->offset_after;
394 else 403 else
395 newseq = htonl(ntohl(tcph->seq) + this_way->offset_before); 404 seqoff = this_way->offset_before;
396 405
397 if (after(ntohl(tcph->ack_seq) - other_way->offset_before, 406 if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
398 other_way->correction_pos)) 407 other_way->correction_pos))
399 newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_after); 408 ackoff = other_way->offset_after;
400 else 409 else
401 newack = htonl(ntohl(tcph->ack_seq) - other_way->offset_before); 410 ackoff = other_way->offset_before;
411
412 newseq = htonl(ntohl(tcph->seq) + seqoff);
413 newack = htonl(ntohl(tcph->ack_seq) - ackoff);
402 414
403 inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); 415 inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
404 inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); 416 inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
@@ -410,12 +422,7 @@ nf_nat_seq_adjust(struct sk_buff *skb,
410 tcph->seq = newseq; 422 tcph->seq = newseq;
411 tcph->ack_seq = newack; 423 tcph->ack_seq = newack;
412 424
413 if (!nf_nat_sack_adjust(skb, tcph, ct, ctinfo)) 425 return nf_nat_sack_adjust(skb, tcph, ct, ctinfo);
414 return 0;
415
416 nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, dir);
417
418 return 1;
419} 426}
420 427
421/* Setup NAT on this expected conntrack so it follows master. */ 428/* Setup NAT on this expected conntrack so it follows master. */
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03e..535e1a802356 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
45 45
46 /* Try to get same port: if not, try to change it. */ 46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
48 exp->tuple.dst.u.tcp.port = htons(port); 50 exp->tuple.dst.u.tcp.port = htons(port);
49 if (nf_ct_expect_related(exp) == 0) 51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
50 break; 56 break;
57 }
51 } 58 }
52 59
53 if (port == 0) 60 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 9eb171056c63..4c060038d29f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -25,6 +25,7 @@
25#include <net/netfilter/nf_nat_rule.h> 25#include <net/netfilter/nf_nat_rule.h>
26#include <net/netfilter/nf_conntrack_helper.h> 26#include <net/netfilter/nf_conntrack_helper.h>
27#include <net/netfilter/nf_conntrack_expect.h> 27#include <net/netfilter/nf_conntrack_expect.h>
28#include <net/netfilter/nf_conntrack_zones.h>
28#include <linux/netfilter/nf_conntrack_proto_gre.h> 29#include <linux/netfilter/nf_conntrack_proto_gre.h>
29#include <linux/netfilter/nf_conntrack_pptp.h> 30#include <linux/netfilter/nf_conntrack_pptp.h>
30 31
@@ -74,7 +75,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
74 75
75 pr_debug("trying to unexpect other dir: "); 76 pr_debug("trying to unexpect other dir: ");
76 nf_ct_dump_tuple_ip(&t); 77 nf_ct_dump_tuple_ip(&t);
77 other_exp = nf_ct_expect_find_get(net, &t); 78 other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
78 if (other_exp) { 79 if (other_exp) {
79 nf_ct_unexpect_related(other_exp); 80 nf_ct_unexpect_related(other_exp);
80 nf_ct_expect_put(other_exp); 81 nf_ct_expect_put(other_exp);
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 6c4f11f51446..3e61faf23a9a 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
34} 34}
35EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); 35EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
36 36
37bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, 37void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
38 const struct nf_nat_range *range, 38 const struct nf_nat_range *range,
39 enum nf_nat_manip_type maniptype, 39 enum nf_nat_manip_type maniptype,
40 const struct nf_conn *ct, 40 const struct nf_conn *ct,
@@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
53 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { 53 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
54 /* If it's dst rewrite, can't change port */ 54 /* If it's dst rewrite, can't change port */
55 if (maniptype == IP_NAT_MANIP_DST) 55 if (maniptype == IP_NAT_MANIP_DST)
56 return false; 56 return;
57 57
58 if (ntohs(*portptr) < 1024) { 58 if (ntohs(*portptr) < 1024) {
59 /* Loose convention: >> 512 is credential passing */ 59 /* Loose convention: >> 512 is credential passing */
@@ -81,15 +81,15 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
81 else 81 else
82 off = *rover; 82 off = *rover;
83 83
84 for (i = 0; i < range_size; i++, off++) { 84 for (i = 0; ; ++off) {
85 *portptr = htons(min + off % range_size); 85 *portptr = htons(min + off % range_size);
86 if (nf_nat_used_tuple(tuple, ct)) 86 if (++i != range_size && nf_nat_used_tuple(tuple, ct))
87 continue; 87 continue;
88 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) 88 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
89 *rover = off; 89 *rover = off;
90 return true; 90 return;
91 } 91 }
92 return false; 92 return;
93} 93}
94EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); 94EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
95 95
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 22485ce306d4..570faf2667b2 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -22,14 +22,14 @@
22 22
23static u_int16_t dccp_port_rover; 23static u_int16_t dccp_port_rover;
24 24
25static bool 25static void
26dccp_unique_tuple(struct nf_conntrack_tuple *tuple, 26dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
27 const struct nf_nat_range *range, 27 const struct nf_nat_range *range,
28 enum nf_nat_manip_type maniptype, 28 enum nf_nat_manip_type maniptype,
29 const struct nf_conn *ct) 29 const struct nf_conn *ct)
30{ 30{
31 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 31 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
32 &dccp_port_rover); 32 &dccp_port_rover);
33} 33}
34 34
35static bool 35static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index d7e89201351e..bc8d83a31c73 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
37MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); 37MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
38 38
39/* generate unique tuple ... */ 39/* generate unique tuple ... */
40static bool 40static void
41gre_unique_tuple(struct nf_conntrack_tuple *tuple, 41gre_unique_tuple(struct nf_conntrack_tuple *tuple,
42 const struct nf_nat_range *range, 42 const struct nf_nat_range *range,
43 enum nf_nat_manip_type maniptype, 43 enum nf_nat_manip_type maniptype,
@@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
50 /* If there is no master conntrack we are not PPTP, 50 /* If there is no master conntrack we are not PPTP,
51 do not change tuples */ 51 do not change tuples */
52 if (!ct->master) 52 if (!ct->master)
53 return false; 53 return;
54 54
55 if (maniptype == IP_NAT_MANIP_SRC) 55 if (maniptype == IP_NAT_MANIP_SRC)
56 keyptr = &tuple->src.u.gre.key; 56 keyptr = &tuple->src.u.gre.key;
@@ -68,14 +68,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
68 68
69 pr_debug("min = %u, range_size = %u\n", min, range_size); 69 pr_debug("min = %u, range_size = %u\n", min, range_size);
70 70
71 for (i = 0; i < range_size; i++, key++) { 71 for (i = 0; ; ++key) {
72 *keyptr = htons(min + key % range_size); 72 *keyptr = htons(min + key % range_size);
73 if (!nf_nat_used_tuple(tuple, ct)) 73 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
74 return true; 74 return;
75 } 75 }
76 76
77 pr_debug("%p: no NAT mapping\n", ct); 77 pr_debug("%p: no NAT mapping\n", ct);
78 return false; 78 return;
79} 79}
80 80
81/* manipulate a GRE packet according to maniptype */ 81/* manipulate a GRE packet according to maniptype */
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 19a8b0b07d8e..5744c3ec847c 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
27 ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); 27 ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
28} 28}
29 29
30static bool 30static void
31icmp_unique_tuple(struct nf_conntrack_tuple *tuple, 31icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
32 const struct nf_nat_range *range, 32 const struct nf_nat_range *range,
33 enum nf_nat_manip_type maniptype, 33 enum nf_nat_manip_type maniptype,
@@ -42,13 +42,13 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
42 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) 42 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
43 range_size = 0xFFFF; 43 range_size = 0xFFFF;
44 44
45 for (i = 0; i < range_size; i++, id++) { 45 for (i = 0; ; ++id) {
46 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + 46 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
47 (id % range_size)); 47 (id % range_size));
48 if (!nf_nat_used_tuple(tuple, ct)) 48 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
49 return true; 49 return;
50 } 50 }
51 return false; 51 return;
52} 52}
53 53
54static bool 54static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 3fc598eeeb1a..756331d42661 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -16,14 +16,14 @@
16 16
17static u_int16_t nf_sctp_port_rover; 17static u_int16_t nf_sctp_port_rover;
18 18
19static bool 19static void
20sctp_unique_tuple(struct nf_conntrack_tuple *tuple, 20sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
21 const struct nf_nat_range *range, 21 const struct nf_nat_range *range,
22 enum nf_nat_manip_type maniptype, 22 enum nf_nat_manip_type maniptype,
23 const struct nf_conn *ct) 23 const struct nf_conn *ct)
24{ 24{
25 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 25 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
26 &nf_sctp_port_rover); 26 &nf_sctp_port_rover);
27} 27}
28 28
29static bool 29static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 399e2cfa263b..aa460a595d5d 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -20,14 +20,13 @@
20 20
21static u_int16_t tcp_port_rover; 21static u_int16_t tcp_port_rover;
22 22
23static bool 23static void
24tcp_unique_tuple(struct nf_conntrack_tuple *tuple, 24tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_range *range, 25 const struct nf_nat_range *range,
26 enum nf_nat_manip_type maniptype, 26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct) 27 const struct nf_conn *ct)
28{ 28{
29 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 29 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
30 &tcp_port_rover);
31} 30}
32 31
33static bool 32static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 9e61c79492e4..dfe65c7e2925 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -19,14 +19,13 @@
19 19
20static u_int16_t udp_port_rover; 20static u_int16_t udp_port_rover;
21 21
22static bool 22static void
23udp_unique_tuple(struct nf_conntrack_tuple *tuple, 23udp_unique_tuple(struct nf_conntrack_tuple *tuple,
24 const struct nf_nat_range *range, 24 const struct nf_nat_range *range,
25 enum nf_nat_manip_type maniptype, 25 enum nf_nat_manip_type maniptype,
26 const struct nf_conn *ct) 26 const struct nf_conn *ct)
27{ 27{
28 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 28 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
29 &udp_port_rover);
30} 29}
31 30
32static bool 31static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index 440a229bbd87..3cc8c8af39ef 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -18,14 +18,14 @@
18 18
19static u_int16_t udplite_port_rover; 19static u_int16_t udplite_port_rover;
20 20
21static bool 21static void
22udplite_unique_tuple(struct nf_conntrack_tuple *tuple, 22udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
23 const struct nf_nat_range *range, 23 const struct nf_nat_range *range,
24 enum nf_nat_manip_type maniptype, 24 enum nf_nat_manip_type maniptype,
25 const struct nf_conn *ct) 25 const struct nf_conn *ct)
26{ 26{
27 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 27 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
28 &udplite_port_rover); 28 &udplite_port_rover);
29} 29}
30 30
31static bool 31static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index 14381c62acea..a50f2bc1c732 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
26 return true; 26 return true;
27} 27}
28 28
29static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple, 29static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
30 const struct nf_nat_range *range, 30 const struct nf_nat_range *range,
31 enum nf_nat_manip_type maniptype, 31 enum nf_nat_manip_type maniptype,
32 const struct nf_conn *ct) 32 const struct nf_conn *ct)
33{ 33{
34 /* Sorry: we can't help you; if it's not unique, we can't frob 34 /* Sorry: we can't help you; if it's not unique, we can't frob
35 anything. */ 35 anything. */
36 return false; 36 return;
37} 37}
38 38
39static bool 39static bool
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 6348a793936e..21c30426480b 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9/* Everything about the rules for NAT. */ 9/* Everything about the rules for NAT. */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/types.h> 11#include <linux/types.h>
11#include <linux/ip.h> 12#include <linux/ip.h>
12#include <linux/netfilter.h> 13#include <linux/netfilter.h>
@@ -15,6 +16,7 @@
15#include <linux/kmod.h> 16#include <linux/kmod.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
17#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/slab.h>
18#include <net/checksum.h> 20#include <net/checksum.h>
19#include <net/route.h> 21#include <net/route.h>
20#include <linux/bitops.h> 22#include <linux/bitops.h>
@@ -26,54 +28,26 @@
26 28
27#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ 29#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
28 (1 << NF_INET_POST_ROUTING) | \ 30 (1 << NF_INET_POST_ROUTING) | \
29 (1 << NF_INET_LOCAL_OUT)) 31 (1 << NF_INET_LOCAL_OUT) | \
32 (1 << NF_INET_LOCAL_IN))
30 33
31static struct 34static const struct xt_table nat_table = {
32{
33 struct ipt_replace repl;
34 struct ipt_standard entries[3];
35 struct ipt_error term;
36} nat_initial_table __net_initdata = {
37 .repl = {
38 .name = "nat",
39 .valid_hooks = NAT_VALID_HOOKS,
40 .num_entries = 4,
41 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
42 .hook_entry = {
43 [NF_INET_PRE_ROUTING] = 0,
44 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
45 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
46 },
47 .underflow = {
48 [NF_INET_PRE_ROUTING] = 0,
49 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
50 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
51 },
52 },
53 .entries = {
54 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
55 IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */
56 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
57 },
58 .term = IPT_ERROR_INIT, /* ERROR */
59};
60
61static struct xt_table nat_table = {
62 .name = "nat", 35 .name = "nat",
63 .valid_hooks = NAT_VALID_HOOKS, 36 .valid_hooks = NAT_VALID_HOOKS,
64 .me = THIS_MODULE, 37 .me = THIS_MODULE,
65 .af = AF_INET, 38 .af = NFPROTO_IPV4,
66}; 39};
67 40
68/* Source NAT */ 41/* Source NAT */
69static unsigned int 42static unsigned int
70ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par) 43ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
71{ 44{
72 struct nf_conn *ct; 45 struct nf_conn *ct;
73 enum ip_conntrack_info ctinfo; 46 enum ip_conntrack_info ctinfo;
74 const struct nf_nat_multi_range_compat *mr = par->targinfo; 47 const struct nf_nat_multi_range_compat *mr = par->targinfo;
75 48
76 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
50 par->hooknum == NF_INET_LOCAL_IN);
77 51
78 ct = nf_ct_get(skb, &ctinfo); 52 ct = nf_ct_get(skb, &ctinfo);
79 53
@@ -86,7 +60,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par)
86} 60}
87 61
88static unsigned int 62static unsigned int
89ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) 63ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
90{ 64{
91 struct nf_conn *ct; 65 struct nf_conn *ct;
92 enum ip_conntrack_info ctinfo; 66 enum ip_conntrack_info ctinfo;
@@ -103,45 +77,44 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
103 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); 77 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
104} 78}
105 79
106static bool ipt_snat_checkentry(const struct xt_tgchk_param *par) 80static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
107{ 81{
108 const struct nf_nat_multi_range_compat *mr = par->targinfo; 82 const struct nf_nat_multi_range_compat *mr = par->targinfo;
109 83
110 /* Must be a valid range */ 84 /* Must be a valid range */
111 if (mr->rangesize != 1) { 85 if (mr->rangesize != 1) {
112 printk("SNAT: multiple ranges no longer supported\n"); 86 pr_info("SNAT: multiple ranges no longer supported\n");
113 return false; 87 return -EINVAL;
114 } 88 }
115 return true; 89 return 0;
116} 90}
117 91
118static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par) 92static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
119{ 93{
120 const struct nf_nat_multi_range_compat *mr = par->targinfo; 94 const struct nf_nat_multi_range_compat *mr = par->targinfo;
121 95
122 /* Must be a valid range */ 96 /* Must be a valid range */
123 if (mr->rangesize != 1) { 97 if (mr->rangesize != 1) {
124 printk("DNAT: multiple ranges no longer supported\n"); 98 pr_info("DNAT: multiple ranges no longer supported\n");
125 return false; 99 return -EINVAL;
126 } 100 }
127 return true; 101 return 0;
128} 102}
129 103
130unsigned int 104static unsigned int
131alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
132{ 106{
133 /* Force range to this IP; let proto decide mapping for 107 /* Force range to this IP; let proto decide mapping for
134 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
135 Use reply in case it's already been mangled (eg local packet).
136 */ 109 */
137 __be32 ip 110 struct nf_nat_range range;
138 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC 111
139 ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip 112 range.flags = 0;
140 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); 113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
141 struct nf_nat_range range 114 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
142 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; 115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
143 116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
144 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip); 117
145 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); 118 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
146} 119}
147 120
@@ -169,7 +142,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
169 .target = ipt_snat_target, 142 .target = ipt_snat_target,
170 .targetsize = sizeof(struct nf_nat_multi_range_compat), 143 .targetsize = sizeof(struct nf_nat_multi_range_compat),
171 .table = "nat", 144 .table = "nat",
172 .hooks = 1 << NF_INET_POST_ROUTING, 145 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
173 .checkentry = ipt_snat_checkentry, 146 .checkentry = ipt_snat_checkentry,
174 .family = AF_INET, 147 .family = AF_INET,
175}; 148};
@@ -186,8 +159,13 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
186 159
187static int __net_init nf_nat_rule_net_init(struct net *net) 160static int __net_init nf_nat_rule_net_init(struct net *net)
188{ 161{
189 net->ipv4.nat_table = ipt_register_table(net, &nat_table, 162 struct ipt_replace *repl;
190 &nat_initial_table.repl); 163
164 repl = ipt_alloc_initial_table(&nat_table);
165 if (repl == NULL)
166 return -ENOMEM;
167 net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
168 kfree(repl);
191 if (IS_ERR(net->ipv4.nat_table)) 169 if (IS_ERR(net->ipv4.nat_table))
192 return PTR_ERR(net->ipv4.nat_table); 170 return PTR_ERR(net->ipv4.nat_table);
193 return 0; 171 return 0;
@@ -195,7 +173,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net)
195 173
196static void __net_exit nf_nat_rule_net_exit(struct net *net) 174static void __net_exit nf_nat_rule_net_exit(struct net *net)
197{ 175{
198 ipt_unregister_table(net->ipv4.nat_table); 176 ipt_unregister_table(net, net->ipv4.nat_table);
199} 177}
200 178
201static struct pernet_operations nf_nat_rule_net_ops = { 179static struct pernet_operations nf_nat_rule_net_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 07d61a57613c..e40cf7816fdb 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -1,4 +1,4 @@
1/* SIP extension for UDP NAT alteration. 1/* SIP extension for NAT alteration.
2 * 2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> 3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules. 4 * based on RR's ip_nat_ftp.c and other modules.
@@ -15,6 +15,7 @@
15#include <linux/ip.h> 15#include <linux/ip.h>
16#include <net/ip.h> 16#include <net/ip.h>
17#include <linux/udp.h> 17#include <linux/udp.h>
18#include <linux/tcp.h>
18 19
19#include <net/netfilter/nf_nat.h> 20#include <net/netfilter/nf_nat.h>
20#include <net/netfilter/nf_nat_helper.h> 21#include <net/netfilter/nf_nat_helper.h>
@@ -29,25 +30,42 @@ MODULE_DESCRIPTION("SIP NAT helper");
29MODULE_ALIAS("ip_nat_sip"); 30MODULE_ALIAS("ip_nat_sip");
30 31
31 32
32static unsigned int mangle_packet(struct sk_buff *skb, 33static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
33 const char **dptr, unsigned int *datalen, 34 const char **dptr, unsigned int *datalen,
34 unsigned int matchoff, unsigned int matchlen, 35 unsigned int matchoff, unsigned int matchlen,
35 const char *buffer, unsigned int buflen) 36 const char *buffer, unsigned int buflen)
36{ 37{
37 enum ip_conntrack_info ctinfo; 38 enum ip_conntrack_info ctinfo;
38 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 39 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
39 40 struct tcphdr *th;
40 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, matchoff, matchlen, 41 unsigned int baseoff;
41 buffer, buflen)) 42
42 return 0; 43 if (nf_ct_protonum(ct) == IPPROTO_TCP) {
44 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
45 baseoff = ip_hdrlen(skb) + th->doff * 4;
46 matchoff += dataoff - baseoff;
47
48 if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
49 matchoff, matchlen,
50 buffer, buflen, false))
51 return 0;
52 } else {
53 baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
54 matchoff += dataoff - baseoff;
55
56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
57 matchoff, matchlen,
58 buffer, buflen))
59 return 0;
60 }
43 61
44 /* Reload data pointer and adjust datalen value */ 62 /* Reload data pointer and adjust datalen value */
45 *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); 63 *dptr = skb->data + dataoff;
46 *datalen += buflen - matchlen; 64 *datalen += buflen - matchlen;
47 return 1; 65 return 1;
48} 66}
49 67
50static int map_addr(struct sk_buff *skb, 68static int map_addr(struct sk_buff *skb, unsigned int dataoff,
51 const char **dptr, unsigned int *datalen, 69 const char **dptr, unsigned int *datalen,
52 unsigned int matchoff, unsigned int matchlen, 70 unsigned int matchoff, unsigned int matchlen,
53 union nf_inet_addr *addr, __be16 port) 71 union nf_inet_addr *addr, __be16 port)
@@ -76,11 +94,11 @@ static int map_addr(struct sk_buff *skb,
76 94
77 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport)); 95 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
78 96
79 return mangle_packet(skb, dptr, datalen, matchoff, matchlen, 97 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
80 buffer, buflen); 98 buffer, buflen);
81} 99}
82 100
83static int map_sip_addr(struct sk_buff *skb, 101static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
84 const char **dptr, unsigned int *datalen, 102 const char **dptr, unsigned int *datalen,
85 enum sip_header_types type) 103 enum sip_header_types type)
86{ 104{
@@ -93,16 +111,18 @@ static int map_sip_addr(struct sk_buff *skb,
93 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, 111 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
94 &matchoff, &matchlen, &addr, &port) <= 0) 112 &matchoff, &matchlen, &addr, &port) <= 0)
95 return 1; 113 return 1;
96 return map_addr(skb, dptr, datalen, matchoff, matchlen, &addr, port); 114 return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
115 &addr, port);
97} 116}
98 117
99static unsigned int ip_nat_sip(struct sk_buff *skb, 118static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
100 const char **dptr, unsigned int *datalen) 119 const char **dptr, unsigned int *datalen)
101{ 120{
102 enum ip_conntrack_info ctinfo; 121 enum ip_conntrack_info ctinfo;
103 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 122 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
104 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 123 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
105 unsigned int dataoff, matchoff, matchlen; 124 unsigned int coff, matchoff, matchlen;
125 enum sip_header_types hdr;
106 union nf_inet_addr addr; 126 union nf_inet_addr addr;
107 __be16 port; 127 __be16 port;
108 int request, in_header; 128 int request, in_header;
@@ -112,16 +132,21 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
112 if (ct_sip_parse_request(ct, *dptr, *datalen, 132 if (ct_sip_parse_request(ct, *dptr, *datalen,
113 &matchoff, &matchlen, 133 &matchoff, &matchlen,
114 &addr, &port) > 0 && 134 &addr, &port) > 0 &&
115 !map_addr(skb, dptr, datalen, matchoff, matchlen, 135 !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
116 &addr, port)) 136 &addr, port))
117 return NF_DROP; 137 return NF_DROP;
118 request = 1; 138 request = 1;
119 } else 139 } else
120 request = 0; 140 request = 0;
121 141
142 if (nf_ct_protonum(ct) == IPPROTO_TCP)
143 hdr = SIP_HDR_VIA_TCP;
144 else
145 hdr = SIP_HDR_VIA_UDP;
146
122 /* Translate topmost Via header and parameters */ 147 /* Translate topmost Via header and parameters */
123 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, 148 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
124 SIP_HDR_VIA, NULL, &matchoff, &matchlen, 149 hdr, NULL, &matchoff, &matchlen,
125 &addr, &port) > 0) { 150 &addr, &port) > 0) {
126 unsigned int matchend, poff, plen, buflen, n; 151 unsigned int matchend, poff, plen, buflen, n;
127 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; 152 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
@@ -138,7 +163,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
138 goto next; 163 goto next;
139 } 164 }
140 165
141 if (!map_addr(skb, dptr, datalen, matchoff, matchlen, 166 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
142 &addr, port)) 167 &addr, port))
143 return NF_DROP; 168 return NF_DROP;
144 169
@@ -153,8 +178,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
153 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { 178 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
154 buflen = sprintf(buffer, "%pI4", 179 buflen = sprintf(buffer, "%pI4",
155 &ct->tuplehash[!dir].tuple.dst.u3.ip); 180 &ct->tuplehash[!dir].tuple.dst.u3.ip);
156 if (!mangle_packet(skb, dptr, datalen, poff, plen, 181 if (!mangle_packet(skb, dataoff, dptr, datalen,
157 buffer, buflen)) 182 poff, plen, buffer, buflen))
158 return NF_DROP; 183 return NF_DROP;
159 } 184 }
160 185
@@ -167,8 +192,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
167 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { 192 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
168 buflen = sprintf(buffer, "%pI4", 193 buflen = sprintf(buffer, "%pI4",
169 &ct->tuplehash[!dir].tuple.src.u3.ip); 194 &ct->tuplehash[!dir].tuple.src.u3.ip);
170 if (!mangle_packet(skb, dptr, datalen, poff, plen, 195 if (!mangle_packet(skb, dataoff, dptr, datalen,
171 buffer, buflen)) 196 poff, plen, buffer, buflen))
172 return NF_DROP; 197 return NF_DROP;
173 } 198 }
174 199
@@ -181,31 +206,45 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
181 htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { 206 htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
182 __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; 207 __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
183 buflen = sprintf(buffer, "%u", ntohs(p)); 208 buflen = sprintf(buffer, "%u", ntohs(p));
184 if (!mangle_packet(skb, dptr, datalen, poff, plen, 209 if (!mangle_packet(skb, dataoff, dptr, datalen,
185 buffer, buflen)) 210 poff, plen, buffer, buflen))
186 return NF_DROP; 211 return NF_DROP;
187 } 212 }
188 } 213 }
189 214
190next: 215next:
191 /* Translate Contact headers */ 216 /* Translate Contact headers */
192 dataoff = 0; 217 coff = 0;
193 in_header = 0; 218 in_header = 0;
194 while (ct_sip_parse_header_uri(ct, *dptr, &dataoff, *datalen, 219 while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
195 SIP_HDR_CONTACT, &in_header, 220 SIP_HDR_CONTACT, &in_header,
196 &matchoff, &matchlen, 221 &matchoff, &matchlen,
197 &addr, &port) > 0) { 222 &addr, &port) > 0) {
198 if (!map_addr(skb, dptr, datalen, matchoff, matchlen, 223 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
199 &addr, port)) 224 &addr, port))
200 return NF_DROP; 225 return NF_DROP;
201 } 226 }
202 227
203 if (!map_sip_addr(skb, dptr, datalen, SIP_HDR_FROM) || 228 if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
204 !map_sip_addr(skb, dptr, datalen, SIP_HDR_TO)) 229 !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
205 return NF_DROP; 230 return NF_DROP;
231
206 return NF_ACCEPT; 232 return NF_ACCEPT;
207} 233}
208 234
235static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
236{
237 enum ip_conntrack_info ctinfo;
238 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
239 const struct tcphdr *th;
240
241 if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
242 return;
243
244 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
245 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
246}
247
209/* Handles expected signalling connections and media streams */ 248/* Handles expected signalling connections and media streams */
210static void ip_nat_sip_expected(struct nf_conn *ct, 249static void ip_nat_sip_expected(struct nf_conn *ct,
211 struct nf_conntrack_expect *exp) 250 struct nf_conntrack_expect *exp)
@@ -232,7 +271,7 @@ static void ip_nat_sip_expected(struct nf_conn *ct,
232 } 271 }
233} 272}
234 273
235static unsigned int ip_nat_sip_expect(struct sk_buff *skb, 274static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
236 const char **dptr, unsigned int *datalen, 275 const char **dptr, unsigned int *datalen,
237 struct nf_conntrack_expect *exp, 276 struct nf_conntrack_expect *exp,
238 unsigned int matchoff, 277 unsigned int matchoff,
@@ -268,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
268 exp->expectfn = ip_nat_sip_expected; 307 exp->expectfn = ip_nat_sip_expected;
269 308
270 for (; port != 0; port++) { 309 for (; port != 0; port++) {
310 int ret;
311
271 exp->tuple.dst.u.udp.port = htons(port); 312 exp->tuple.dst.u.udp.port = htons(port);
272 if (nf_ct_expect_related(exp) == 0) 313 ret = nf_ct_expect_related(exp);
314 if (ret == 0)
273 break; 315 break;
316 else if (ret != -EBUSY) {
317 port = 0;
318 break;
319 }
274 } 320 }
275 321
276 if (port == 0) 322 if (port == 0)
@@ -279,8 +325,8 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
279 if (exp->tuple.dst.u3.ip != exp->saved_ip || 325 if (exp->tuple.dst.u3.ip != exp->saved_ip ||
280 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { 326 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
281 buflen = sprintf(buffer, "%pI4:%u", &newip, port); 327 buflen = sprintf(buffer, "%pI4:%u", &newip, port);
282 if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, 328 if (!mangle_packet(skb, dataoff, dptr, datalen,
283 buffer, buflen)) 329 matchoff, matchlen, buffer, buflen))
284 goto err; 330 goto err;
285 } 331 }
286 return NF_ACCEPT; 332 return NF_ACCEPT;
@@ -290,7 +336,7 @@ err:
290 return NF_DROP; 336 return NF_DROP;
291} 337}
292 338
293static int mangle_content_len(struct sk_buff *skb, 339static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
294 const char **dptr, unsigned int *datalen) 340 const char **dptr, unsigned int *datalen)
295{ 341{
296 enum ip_conntrack_info ctinfo; 342 enum ip_conntrack_info ctinfo;
@@ -312,12 +358,13 @@ static int mangle_content_len(struct sk_buff *skb,
312 return 0; 358 return 0;
313 359
314 buflen = sprintf(buffer, "%u", c_len); 360 buflen = sprintf(buffer, "%u", c_len);
315 return mangle_packet(skb, dptr, datalen, matchoff, matchlen, 361 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
316 buffer, buflen); 362 buffer, buflen);
317} 363}
318 364
319static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, 365static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
320 unsigned int dataoff, unsigned int *datalen, 366 const char **dptr, unsigned int *datalen,
367 unsigned int sdpoff,
321 enum sdp_header_types type, 368 enum sdp_header_types type,
322 enum sdp_header_types term, 369 enum sdp_header_types term,
323 char *buffer, int buflen) 370 char *buffer, int buflen)
@@ -326,16 +373,16 @@ static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr,
326 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 373 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
327 unsigned int matchlen, matchoff; 374 unsigned int matchlen, matchoff;
328 375
329 if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term, 376 if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
330 &matchoff, &matchlen) <= 0) 377 &matchoff, &matchlen) <= 0)
331 return -ENOENT; 378 return -ENOENT;
332 return mangle_packet(skb, dptr, datalen, matchoff, matchlen, 379 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
333 buffer, buflen) ? 0 : -EINVAL; 380 buffer, buflen) ? 0 : -EINVAL;
334} 381}
335 382
336static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, 383static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
337 unsigned int dataoff, 384 const char **dptr, unsigned int *datalen,
338 unsigned int *datalen, 385 unsigned int sdpoff,
339 enum sdp_header_types type, 386 enum sdp_header_types type,
340 enum sdp_header_types term, 387 enum sdp_header_types term,
341 const union nf_inet_addr *addr) 388 const union nf_inet_addr *addr)
@@ -344,16 +391,15 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr,
344 unsigned int buflen; 391 unsigned int buflen;
345 392
346 buflen = sprintf(buffer, "%pI4", &addr->ip); 393 buflen = sprintf(buffer, "%pI4", &addr->ip);
347 if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, 394 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
348 buffer, buflen)) 395 buffer, buflen))
349 return 0; 396 return 0;
350 397
351 return mangle_content_len(skb, dptr, datalen); 398 return mangle_content_len(skb, dataoff, dptr, datalen);
352} 399}
353 400
354static unsigned int ip_nat_sdp_port(struct sk_buff *skb, 401static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
355 const char **dptr, 402 const char **dptr, unsigned int *datalen,
356 unsigned int *datalen,
357 unsigned int matchoff, 403 unsigned int matchoff,
358 unsigned int matchlen, 404 unsigned int matchlen,
359 u_int16_t port) 405 u_int16_t port)
@@ -362,16 +408,16 @@ static unsigned int ip_nat_sdp_port(struct sk_buff *skb,
362 unsigned int buflen; 408 unsigned int buflen;
363 409
364 buflen = sprintf(buffer, "%u", port); 410 buflen = sprintf(buffer, "%u", port);
365 if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, 411 if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
366 buffer, buflen)) 412 buffer, buflen))
367 return 0; 413 return 0;
368 414
369 return mangle_content_len(skb, dptr, datalen); 415 return mangle_content_len(skb, dataoff, dptr, datalen);
370} 416}
371 417
372static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, 418static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
373 unsigned int dataoff, 419 const char **dptr, unsigned int *datalen,
374 unsigned int *datalen, 420 unsigned int sdpoff,
375 const union nf_inet_addr *addr) 421 const union nf_inet_addr *addr)
376{ 422{
377 char buffer[sizeof("nnn.nnn.nnn.nnn")]; 423 char buffer[sizeof("nnn.nnn.nnn.nnn")];
@@ -379,12 +425,12 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
379 425
380 /* Mangle session description owner and contact addresses */ 426 /* Mangle session description owner and contact addresses */
381 buflen = sprintf(buffer, "%pI4", &addr->ip); 427 buflen = sprintf(buffer, "%pI4", &addr->ip);
382 if (mangle_sdp_packet(skb, dptr, dataoff, datalen, 428 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
383 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, 429 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
384 buffer, buflen)) 430 buffer, buflen))
385 return 0; 431 return 0;
386 432
387 switch (mangle_sdp_packet(skb, dptr, dataoff, datalen, 433 switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
388 SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, 434 SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
389 buffer, buflen)) { 435 buffer, buflen)) {
390 case 0: 436 case 0:
@@ -401,14 +447,13 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
401 return 0; 447 return 0;
402 } 448 }
403 449
404 return mangle_content_len(skb, dptr, datalen); 450 return mangle_content_len(skb, dataoff, dptr, datalen);
405} 451}
406 452
407/* So, this packet has hit the connection tracking matching code. 453/* So, this packet has hit the connection tracking matching code.
408 Mangle it, and change the expectation to match the new version. */ 454 Mangle it, and change the expectation to match the new version. */
409static unsigned int ip_nat_sdp_media(struct sk_buff *skb, 455static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
410 const char **dptr, 456 const char **dptr, unsigned int *datalen,
411 unsigned int *datalen,
412 struct nf_conntrack_expect *rtp_exp, 457 struct nf_conntrack_expect *rtp_exp,
413 struct nf_conntrack_expect *rtcp_exp, 458 struct nf_conntrack_expect *rtcp_exp,
414 unsigned int mediaoff, 459 unsigned int mediaoff,
@@ -442,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb,
442 /* Try to get same pair of ports: if not, try to change them. */ 487 /* Try to get same pair of ports: if not, try to change them. */
443 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); 488 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
444 port != 0; port += 2) { 489 port != 0; port += 2) {
490 int ret;
491
445 rtp_exp->tuple.dst.u.udp.port = htons(port); 492 rtp_exp->tuple.dst.u.udp.port = htons(port);
446 if (nf_ct_expect_related(rtp_exp) != 0) 493 ret = nf_ct_expect_related(rtp_exp);
494 if (ret == -EBUSY)
447 continue; 495 continue;
496 else if (ret < 0) {
497 port = 0;
498 break;
499 }
448 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); 500 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
449 if (nf_ct_expect_related(rtcp_exp) == 0) 501 ret = nf_ct_expect_related(rtcp_exp);
502 if (ret == 0)
450 break; 503 break;
451 nf_ct_unexpect_related(rtp_exp); 504 else if (ret != -EBUSY) {
505 nf_ct_unexpect_related(rtp_exp);
506 port = 0;
507 break;
508 }
452 } 509 }
453 510
454 if (port == 0) 511 if (port == 0)
@@ -456,7 +513,8 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb,
456 513
457 /* Update media port. */ 514 /* Update media port. */
458 if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && 515 if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
459 !ip_nat_sdp_port(skb, dptr, datalen, mediaoff, medialen, port)) 516 !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
517 mediaoff, medialen, port))
460 goto err2; 518 goto err2;
461 519
462 return NF_ACCEPT; 520 return NF_ACCEPT;
@@ -471,6 +529,7 @@ err1:
471static void __exit nf_nat_sip_fini(void) 529static void __exit nf_nat_sip_fini(void)
472{ 530{
473 rcu_assign_pointer(nf_nat_sip_hook, NULL); 531 rcu_assign_pointer(nf_nat_sip_hook, NULL);
532 rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL);
474 rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); 533 rcu_assign_pointer(nf_nat_sip_expect_hook, NULL);
475 rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); 534 rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL);
476 rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); 535 rcu_assign_pointer(nf_nat_sdp_port_hook, NULL);
@@ -482,12 +541,14 @@ static void __exit nf_nat_sip_fini(void)
482static int __init nf_nat_sip_init(void) 541static int __init nf_nat_sip_init(void)
483{ 542{
484 BUG_ON(nf_nat_sip_hook != NULL); 543 BUG_ON(nf_nat_sip_hook != NULL);
544 BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
485 BUG_ON(nf_nat_sip_expect_hook != NULL); 545 BUG_ON(nf_nat_sip_expect_hook != NULL);
486 BUG_ON(nf_nat_sdp_addr_hook != NULL); 546 BUG_ON(nf_nat_sdp_addr_hook != NULL);
487 BUG_ON(nf_nat_sdp_port_hook != NULL); 547 BUG_ON(nf_nat_sdp_port_hook != NULL);
488 BUG_ON(nf_nat_sdp_session_hook != NULL); 548 BUG_ON(nf_nat_sdp_session_hook != NULL);
489 BUG_ON(nf_nat_sdp_media_hook != NULL); 549 BUG_ON(nf_nat_sdp_media_hook != NULL);
490 rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); 550 rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip);
551 rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
491 rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); 552 rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect);
492 rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); 553 rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
493 rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); 554 rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index d9521f6f9ed0..ee5f419d0a56 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -43,6 +43,7 @@
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/types.h> 44#include <linux/types.h>
45#include <linux/kernel.h> 45#include <linux/kernel.h>
46#include <linux/slab.h>
46#include <linux/in.h> 47#include <linux/in.h>
47#include <linux/ip.h> 48#include <linux/ip.h>
48#include <linux/udp.h> 49#include <linux/udp.h>
@@ -400,7 +401,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
400 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); 401 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
401 if (*octets == NULL) { 402 if (*octets == NULL) {
402 if (net_ratelimit()) 403 if (net_ratelimit())
403 printk("OOM in bsalg (%d)\n", __LINE__); 404 pr_notice("OOM in bsalg (%d)\n", __LINE__);
404 return 0; 405 return 0;
405 } 406 }
406 407
@@ -451,7 +452,7 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
451 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); 452 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
452 if (*oid == NULL) { 453 if (*oid == NULL) {
453 if (net_ratelimit()) 454 if (net_ratelimit())
454 printk("OOM in bsalg (%d)\n", __LINE__); 455 pr_notice("OOM in bsalg (%d)\n", __LINE__);
455 return 0; 456 return 0;
456 } 457 }
457 458
@@ -728,7 +729,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
728 if (*obj == NULL) { 729 if (*obj == NULL) {
729 kfree(id); 730 kfree(id);
730 if (net_ratelimit()) 731 if (net_ratelimit())
731 printk("OOM in bsalg (%d)\n", __LINE__); 732 pr_notice("OOM in bsalg (%d)\n", __LINE__);
732 return 0; 733 return 0;
733 } 734 }
734 (*obj)->syntax.l[0] = l; 735 (*obj)->syntax.l[0] = l;
@@ -745,7 +746,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
745 kfree(p); 746 kfree(p);
746 kfree(id); 747 kfree(id);
747 if (net_ratelimit()) 748 if (net_ratelimit())
748 printk("OOM in bsalg (%d)\n", __LINE__); 749 pr_notice("OOM in bsalg (%d)\n", __LINE__);
749 return 0; 750 return 0;
750 } 751 }
751 memcpy((*obj)->syntax.c, p, len); 752 memcpy((*obj)->syntax.c, p, len);
@@ -760,7 +761,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
760 if (*obj == NULL) { 761 if (*obj == NULL) {
761 kfree(id); 762 kfree(id);
762 if (net_ratelimit()) 763 if (net_ratelimit())
763 printk("OOM in bsalg (%d)\n", __LINE__); 764 pr_notice("OOM in bsalg (%d)\n", __LINE__);
764 return 0; 765 return 0;
765 } 766 }
766 if (!asn1_null_decode(ctx, end)) { 767 if (!asn1_null_decode(ctx, end)) {
@@ -781,7 +782,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
781 kfree(lp); 782 kfree(lp);
782 kfree(id); 783 kfree(id);
783 if (net_ratelimit()) 784 if (net_ratelimit())
784 printk("OOM in bsalg (%d)\n", __LINE__); 785 pr_notice("OOM in bsalg (%d)\n", __LINE__);
785 return 0; 786 return 0;
786 } 787 }
787 memcpy((*obj)->syntax.ul, lp, len); 788 memcpy((*obj)->syntax.ul, lp, len);
@@ -802,7 +803,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
802 kfree(p); 803 kfree(p);
803 kfree(id); 804 kfree(id);
804 if (net_ratelimit()) 805 if (net_ratelimit())
805 printk("OOM in bsalg (%d)\n", __LINE__); 806 pr_notice("OOM in bsalg (%d)\n", __LINE__);
806 return 0; 807 return 0;
807 } 808 }
808 memcpy((*obj)->syntax.uc, p, len); 809 memcpy((*obj)->syntax.uc, p, len);
@@ -820,7 +821,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
820 if (*obj == NULL) { 821 if (*obj == NULL) {
821 kfree(id); 822 kfree(id);
822 if (net_ratelimit()) 823 if (net_ratelimit())
823 printk("OOM in bsalg (%d)\n", __LINE__); 824 pr_notice("OOM in bsalg (%d)\n", __LINE__);
824 return 0; 825 return 0;
825 } 826 }
826 (*obj)->syntax.ul[0] = ul; 827 (*obj)->syntax.ul[0] = ul;
@@ -892,13 +893,15 @@ static void fast_csum(__sum16 *csum,
892 unsigned char s[4]; 893 unsigned char s[4];
893 894
894 if (offset & 1) { 895 if (offset & 1) {
895 s[0] = s[2] = 0; 896 s[0] = ~0;
896 s[1] = ~*optr; 897 s[1] = ~*optr;
898 s[2] = 0;
897 s[3] = *nptr; 899 s[3] = *nptr;
898 } else { 900 } else {
899 s[1] = s[3] = 0;
900 s[0] = ~*optr; 901 s[0] = ~*optr;
902 s[1] = ~0;
901 s[2] = *nptr; 903 s[2] = *nptr;
904 s[3] = 0;
902 } 905 }
903 906
904 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); 907 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
@@ -1038,7 +1041,7 @@ static int snmp_parse_mangle(unsigned char *msg,
1038 unsigned int cls, con, tag, vers, pdutype; 1041 unsigned int cls, con, tag, vers, pdutype;
1039 struct asn1_ctx ctx; 1042 struct asn1_ctx ctx;
1040 struct asn1_octstr comm; 1043 struct asn1_octstr comm;
1041 struct snmp_object **obj; 1044 struct snmp_object *obj;
1042 1045
1043 if (debug > 1) 1046 if (debug > 1)
1044 hex_dump(msg, len); 1047 hex_dump(msg, len);
@@ -1148,43 +1151,34 @@ static int snmp_parse_mangle(unsigned char *msg,
1148 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) 1151 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1149 return 0; 1152 return 0;
1150 1153
1151 obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
1152 if (obj == NULL) {
1153 if (net_ratelimit())
1154 printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
1155 return 0;
1156 }
1157
1158 while (!asn1_eoc_decode(&ctx, eoc)) { 1154 while (!asn1_eoc_decode(&ctx, eoc)) {
1159 unsigned int i; 1155 unsigned int i;
1160 1156
1161 if (!snmp_object_decode(&ctx, obj)) { 1157 if (!snmp_object_decode(&ctx, &obj)) {
1162 if (*obj) { 1158 if (obj) {
1163 kfree((*obj)->id); 1159 kfree(obj->id);
1164 kfree(*obj); 1160 kfree(obj);
1165 } 1161 }
1166 kfree(obj);
1167 return 0; 1162 return 0;
1168 } 1163 }
1169 1164
1170 if (debug > 1) { 1165 if (debug > 1) {
1171 printk(KERN_DEBUG "bsalg: object: "); 1166 printk(KERN_DEBUG "bsalg: object: ");
1172 for (i = 0; i < (*obj)->id_len; i++) { 1167 for (i = 0; i < obj->id_len; i++) {
1173 if (i > 0) 1168 if (i > 0)
1174 printk("."); 1169 printk(".");
1175 printk("%lu", (*obj)->id[i]); 1170 printk("%lu", obj->id[i]);
1176 } 1171 }
1177 printk(": type=%u\n", (*obj)->type); 1172 printk(": type=%u\n", obj->type);
1178 1173
1179 } 1174 }
1180 1175
1181 if ((*obj)->type == SNMP_IPADDR) 1176 if (obj->type == SNMP_IPADDR)
1182 mangle_address(ctx.begin, ctx.pointer - 4 , map, check); 1177 mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
1183 1178
1184 kfree((*obj)->id); 1179 kfree(obj->id);
1185 kfree(*obj); 1180 kfree(obj);
1186 } 1181 }
1187 kfree(obj);
1188 1182
1189 if (!asn1_eoc_decode(&ctx, eoc)) 1183 if (!asn1_eoc_decode(&ctx, eoc))
1190 return 0; 1184 return 0;
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 5567bd0d0750..95481fee8bdb 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/icmp.h> 9#include <linux/icmp.h>
10#include <linux/gfp.h>
10#include <linux/ip.h> 11#include <linux/ip.h>
11#include <linux/netfilter.h> 12#include <linux/netfilter.h>
12#include <linux/netfilter_ipv4.h> 13#include <linux/netfilter_ipv4.h>
@@ -97,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
97 return NF_ACCEPT; 98 return NF_ACCEPT;
98 99
99 /* Don't try to NAT if this packet is not conntracked */ 100 /* Don't try to NAT if this packet is not conntracked */
100 if (ct == &nf_conntrack_untracked) 101 if (nf_ct_is_untracked(ct))
101 return NF_ACCEPT; 102 return NF_ACCEPT;
102 103
103 nat = nfct_nat(ct); 104 nat = nfct_nat(ct);
@@ -130,16 +131,9 @@ nf_nat_fn(unsigned int hooknum,
130 if (!nf_nat_initialized(ct, maniptype)) { 131 if (!nf_nat_initialized(ct, maniptype)) {
131 unsigned int ret; 132 unsigned int ret;
132 133
133 if (hooknum == NF_INET_LOCAL_IN) 134 ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
134 /* LOCAL_IN hook doesn't have a chain! */ 135 if (ret != NF_ACCEPT)
135 ret = alloc_null_binding(ct, hooknum);
136 else
137 ret = nf_nat_rule_find(skb, hooknum, in, out,
138 ct);
139
140 if (ret != NF_ACCEPT) {
141 return ret; 136 return ret;
142 }
143 } else 137 } else
144 pr_debug("Already setup manip %s for ct %p\n", 138 pr_debug("Already setup manip %s for ct %p\n",
145 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", 139 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
@@ -197,11 +191,11 @@ nf_nat_out(unsigned int hooknum,
197 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 191 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
198 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 192 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
199 193
200 if (ct->tuplehash[dir].tuple.src.u3.ip != 194 if ((ct->tuplehash[dir].tuple.src.u3.ip !=
201 ct->tuplehash[!dir].tuple.dst.u3.ip 195 ct->tuplehash[!dir].tuple.dst.u3.ip) ||
202 || ct->tuplehash[dir].tuple.src.u.all != 196 (ct->tuplehash[dir].tuple.src.u.all !=
203 ct->tuplehash[!dir].tuple.dst.u.all 197 ct->tuplehash[!dir].tuple.dst.u.all)
204 ) 198 )
205 return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; 199 return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
206 } 200 }
207#endif 201#endif
@@ -251,7 +245,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
251 { 245 {
252 .hook = nf_nat_in, 246 .hook = nf_nat_in,
253 .owner = THIS_MODULE, 247 .owner = THIS_MODULE,
254 .pf = PF_INET, 248 .pf = NFPROTO_IPV4,
255 .hooknum = NF_INET_PRE_ROUTING, 249 .hooknum = NF_INET_PRE_ROUTING,
256 .priority = NF_IP_PRI_NAT_DST, 250 .priority = NF_IP_PRI_NAT_DST,
257 }, 251 },
@@ -259,7 +253,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
259 { 253 {
260 .hook = nf_nat_out, 254 .hook = nf_nat_out,
261 .owner = THIS_MODULE, 255 .owner = THIS_MODULE,
262 .pf = PF_INET, 256 .pf = NFPROTO_IPV4,
263 .hooknum = NF_INET_POST_ROUTING, 257 .hooknum = NF_INET_POST_ROUTING,
264 .priority = NF_IP_PRI_NAT_SRC, 258 .priority = NF_IP_PRI_NAT_SRC,
265 }, 259 },
@@ -267,7 +261,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
267 { 261 {
268 .hook = nf_nat_local_fn, 262 .hook = nf_nat_local_fn,
269 .owner = THIS_MODULE, 263 .owner = THIS_MODULE,
270 .pf = PF_INET, 264 .pf = NFPROTO_IPV4,
271 .hooknum = NF_INET_LOCAL_OUT, 265 .hooknum = NF_INET_LOCAL_OUT,
272 .priority = NF_IP_PRI_NAT_DST, 266 .priority = NF_IP_PRI_NAT_DST,
273 }, 267 },
@@ -275,7 +269,7 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
275 { 269 {
276 .hook = nf_nat_fn, 270 .hook = nf_nat_fn,
277 .owner = THIS_MODULE, 271 .owner = THIS_MODULE,
278 .pf = PF_INET, 272 .pf = NFPROTO_IPV4,
279 .hooknum = NF_INET_LOCAL_IN, 273 .hooknum = NF_INET_LOCAL_IN,
280 .priority = NF_IP_PRI_NAT_SRC, 274 .priority = NF_IP_PRI_NAT_SRC,
281 }, 275 },
@@ -293,12 +287,12 @@ static int __init nf_nat_standalone_init(void)
293#endif 287#endif
294 ret = nf_nat_rule_init(); 288 ret = nf_nat_rule_init();
295 if (ret < 0) { 289 if (ret < 0) {
296 printk("nf_nat_init: can't setup rules.\n"); 290 pr_err("nf_nat_init: can't setup rules.\n");
297 goto cleanup_decode_session; 291 goto cleanup_decode_session;
298 } 292 }
299 ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); 293 ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
300 if (ret < 0) { 294 if (ret < 0) {
301 printk("nf_nat_init: can't register hooks.\n"); 295 pr_err("nf_nat_init: can't register hooks.\n");
302 goto cleanup_rule_init; 296 goto cleanup_rule_init;
303 } 297 }
304 return ret; 298 return ret;
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index b096e81500ae..7274a43c7a12 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/moduleparam.h>
10#include <linux/udp.h> 9#include <linux/udp.h>
11 10
12#include <net/netfilter/nf_nat_helper.h> 11#include <net/netfilter/nf_nat_helper.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f25542c48b7d..b14ec7d03b6e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
59 local_bh_enable(); 59 local_bh_enable();
60 60
61 socket_seq_show(seq); 61 socket_seq_show(seq);
62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
63 sock_prot_inuse_get(net, &tcp_prot), orphans, 63 sock_prot_inuse_get(net, &tcp_prot), orphans,
64 tcp_death_row.tw_count, sockets, 64 tcp_death_row.tw_count, sockets,
65 atomic_read(&tcp_memory_allocated)); 65 atomic_long_read(&tcp_memory_allocated));
66 seq_printf(seq, "UDP: inuse %d mem %d\n", 66 seq_printf(seq, "UDP: inuse %d mem %ld\n",
67 sock_prot_inuse_get(net, &udp_prot), 67 sock_prot_inuse_get(net, &udp_prot),
68 atomic_read(&udp_memory_allocated)); 68 atomic_long_read(&udp_memory_allocated));
69 seq_printf(seq, "UDPLITE: inuse %d\n", 69 seq_printf(seq, "UDPLITE: inuse %d\n",
70 sock_prot_inuse_get(net, &udplite_prot)); 70 sock_prot_inuse_get(net, &udplite_prot));
71 seq_printf(seq, "RAW: inuse %d\n", 71 seq_printf(seq, "RAW: inuse %d\n",
@@ -127,8 +127,8 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
127 SNMP_MIB_SENTINEL 127 SNMP_MIB_SENTINEL
128}; 128};
129 129
130static struct { 130static const struct {
131 char *name; 131 const char *name;
132 int index; 132 int index;
133} icmpmibmap[] = { 133} icmpmibmap[] = {
134 { "DestUnreachs", ICMP_DEST_UNREACH }, 134 { "DestUnreachs", ICMP_DEST_UNREACH },
@@ -249,6 +249,11 @@ static const struct snmp_mib snmp4_net_list[] = {
249 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), 249 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
250 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), 250 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
251 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), 251 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
252 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
256 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
252 SNMP_MIB_SENTINEL 257 SNMP_MIB_SENTINEL
253}; 258};
254 259
@@ -280,7 +285,7 @@ static void icmpmsg_put(struct seq_file *seq)
280 285
281 count = 0; 286 count = 0;
282 for (i = 0; i < ICMPMSG_MIB_MAX; i++) { 287 for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
283 val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i); 288 val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
284 if (val) { 289 if (val) {
285 type[count] = i; 290 type[count] = i;
286 vals[count++] = val; 291 vals[count++] = val;
@@ -307,18 +312,18 @@ static void icmp_put(struct seq_file *seq)
307 for (i=0; icmpmibmap[i].name != NULL; i++) 312 for (i=0; icmpmibmap[i].name != NULL; i++)
308 seq_printf(seq, " Out%s", icmpmibmap[i].name); 313 seq_printf(seq, " Out%s", icmpmibmap[i].name);
309 seq_printf(seq, "\nIcmp: %lu %lu", 314 seq_printf(seq, "\nIcmp: %lu %lu",
310 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), 315 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
311 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); 316 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
312 for (i=0; icmpmibmap[i].name != NULL; i++) 317 for (i=0; icmpmibmap[i].name != NULL; i++)
313 seq_printf(seq, " %lu", 318 seq_printf(seq, " %lu",
314 snmp_fold_field((void **) net->mib.icmpmsg_statistics, 319 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
315 icmpmibmap[i].index)); 320 icmpmibmap[i].index));
316 seq_printf(seq, " %lu %lu", 321 seq_printf(seq, " %lu %lu",
317 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), 322 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
318 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); 323 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
319 for (i=0; icmpmibmap[i].name != NULL; i++) 324 for (i=0; icmpmibmap[i].name != NULL; i++)
320 seq_printf(seq, " %lu", 325 seq_printf(seq, " %lu",
321 snmp_fold_field((void **) net->mib.icmpmsg_statistics, 326 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
322 icmpmibmap[i].index | 0x100)); 327 icmpmibmap[i].index | 0x100));
323} 328}
324 329
@@ -339,10 +344,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
339 IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, 344 IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
340 sysctl_ip_default_ttl); 345 sysctl_ip_default_ttl);
341 346
347 BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
342 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) 348 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
343 seq_printf(seq, " %lu", 349 seq_printf(seq, " %llu",
344 snmp_fold_field((void **)net->mib.ip_statistics, 350 snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
345 snmp4_ipstats_list[i].entry)); 351 snmp4_ipstats_list[i].entry,
352 offsetof(struct ipstats_mib, syncp)));
346 353
347 icmp_put(seq); /* RFC 2011 compatibility */ 354 icmp_put(seq); /* RFC 2011 compatibility */
348 icmpmsg_put(seq); 355 icmpmsg_put(seq);
@@ -356,11 +363,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
356 /* MaxConn field is signed, RFC 2012 */ 363 /* MaxConn field is signed, RFC 2012 */
357 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) 364 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
358 seq_printf(seq, " %ld", 365 seq_printf(seq, " %ld",
359 snmp_fold_field((void **)net->mib.tcp_statistics, 366 snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
360 snmp4_tcp_list[i].entry)); 367 snmp4_tcp_list[i].entry));
361 else 368 else
362 seq_printf(seq, " %lu", 369 seq_printf(seq, " %lu",
363 snmp_fold_field((void **)net->mib.tcp_statistics, 370 snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
364 snmp4_tcp_list[i].entry)); 371 snmp4_tcp_list[i].entry));
365 } 372 }
366 373
@@ -371,7 +378,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
371 seq_puts(seq, "\nUdp:"); 378 seq_puts(seq, "\nUdp:");
372 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 379 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
373 seq_printf(seq, " %lu", 380 seq_printf(seq, " %lu",
374 snmp_fold_field((void **)net->mib.udp_statistics, 381 snmp_fold_field((void __percpu **)net->mib.udp_statistics,
375 snmp4_udp_list[i].entry)); 382 snmp4_udp_list[i].entry));
376 383
377 /* the UDP and UDP-Lite MIBs are the same */ 384 /* the UDP and UDP-Lite MIBs are the same */
@@ -382,7 +389,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
382 seq_puts(seq, "\nUdpLite:"); 389 seq_puts(seq, "\nUdpLite:");
383 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 390 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
384 seq_printf(seq, " %lu", 391 seq_printf(seq, " %lu",
385 snmp_fold_field((void **)net->mib.udplite_statistics, 392 snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
386 snmp4_udp_list[i].entry)); 393 snmp4_udp_list[i].entry));
387 394
388 seq_putc(seq, '\n'); 395 seq_putc(seq, '\n');
@@ -419,7 +426,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
419 seq_puts(seq, "\nTcpExt:"); 426 seq_puts(seq, "\nTcpExt:");
420 for (i = 0; snmp4_net_list[i].name != NULL; i++) 427 for (i = 0; snmp4_net_list[i].name != NULL; i++)
421 seq_printf(seq, " %lu", 428 seq_printf(seq, " %lu",
422 snmp_fold_field((void **)net->mib.net_statistics, 429 snmp_fold_field((void __percpu **)net->mib.net_statistics,
423 snmp4_net_list[i].entry)); 430 snmp4_net_list[i].entry));
424 431
425 seq_puts(seq, "\nIpExt:"); 432 seq_puts(seq, "\nIpExt:");
@@ -428,9 +435,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
428 435
429 seq_puts(seq, "\nIpExt:"); 436 seq_puts(seq, "\nIpExt:");
430 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) 437 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
431 seq_printf(seq, " %lu", 438 seq_printf(seq, " %llu",
432 snmp_fold_field((void **)net->mib.ip_statistics, 439 snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
433 snmp4_ipextstats_list[i].entry)); 440 snmp4_ipextstats_list[i].entry,
441 offsetof(struct ipstats_mib, syncp)));
434 442
435 seq_putc(seq, '\n'); 443 seq_putc(seq, '\n');
436 return 0; 444 return 0;
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index ea50da0649fd..9ae5c01cd0b2 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -22,75 +22,40 @@
22 * as published by the Free Software Foundation; either version 22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version. 23 * 2 of the License, or (at your option) any later version.
24 */ 24 */
25 25#include <linux/cache.h>
26#include <asm/uaccess.h>
27#include <asm/system.h>
28#include <linux/module.h> 26#include <linux/module.h>
29#include <linux/types.h>
30#include <linux/kernel.h>
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/in.h>
34#include <linux/inet.h>
35#include <linux/netdevice.h> 27#include <linux/netdevice.h>
36#include <linux/timer.h> 28#include <linux/spinlock.h>
37#include <net/ip.h>
38#include <net/protocol.h> 29#include <net/protocol.h>
39#include <linux/skbuff.h>
40#include <net/sock.h>
41#include <net/icmp.h>
42#include <net/udp.h>
43#include <net/ipip.h>
44#include <linux/igmp.h>
45 30
46struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; 31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
47static DEFINE_SPINLOCK(inet_proto_lock);
48 32
49/* 33/*
50 * Add a protocol handler to the hash tables 34 * Add a protocol handler to the hash tables
51 */ 35 */
52 36
53int inet_add_protocol(struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
54{ 38{
55 int hash, ret; 39 int hash = protocol & (MAX_INET_PROTOS - 1);
56
57 hash = protocol & (MAX_INET_PROTOS - 1);
58 40
59 spin_lock_bh(&inet_proto_lock); 41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
60 if (inet_protos[hash]) { 42 NULL, prot) ? 0 : -1;
61 ret = -1;
62 } else {
63 inet_protos[hash] = prot;
64 ret = 0;
65 }
66 spin_unlock_bh(&inet_proto_lock);
67
68 return ret;
69} 43}
44EXPORT_SYMBOL(inet_add_protocol);
70 45
71/* 46/*
72 * Remove a protocol from the hash tables. 47 * Remove a protocol from the hash tables.
73 */ 48 */
74 49
75int inet_del_protocol(struct net_protocol *prot, unsigned char protocol) 50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
76{ 51{
77 int hash, ret; 52 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
78
79 hash = protocol & (MAX_INET_PROTOS - 1);
80 53
81 spin_lock_bh(&inet_proto_lock); 54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
82 if (inet_protos[hash] == prot) { 55 prot, NULL) == prot) ? 0 : -1;
83 inet_protos[hash] = NULL;
84 ret = 0;
85 } else {
86 ret = -1;
87 }
88 spin_unlock_bh(&inet_proto_lock);
89 56
90 synchronize_net(); 57 synchronize_net();
91 58
92 return ret; 59 return ret;
93} 60}
94
95EXPORT_SYMBOL(inet_add_protocol);
96EXPORT_SYMBOL(inet_del_protocol); 61EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2979f14bb188..a3d5ab786e81 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -60,7 +60,6 @@
60#include <net/net_namespace.h> 60#include <net/net_namespace.h>
61#include <net/dst.h> 61#include <net/dst.h>
62#include <net/sock.h> 62#include <net/sock.h>
63#include <linux/gfp.h>
64#include <linux/ip.h> 63#include <linux/ip.h>
65#include <linux/net.h> 64#include <linux/net.h>
66#include <net/ip.h> 65#include <net/ip.h>
@@ -87,7 +86,7 @@ void raw_hash_sk(struct sock *sk)
87 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; 86 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
88 struct hlist_head *head; 87 struct hlist_head *head;
89 88
90 head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; 89 head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
91 90
92 write_lock_bh(&h->lock); 91 write_lock_bh(&h->lock);
93 sk_add_node(sk, head); 92 sk_add_node(sk, head);
@@ -115,9 +114,9 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
115 sk_for_each_from(sk, node) { 114 sk_for_each_from(sk, node) {
116 struct inet_sock *inet = inet_sk(sk); 115 struct inet_sock *inet = inet_sk(sk);
117 116
118 if (net_eq(sock_net(sk), net) && inet->num == num && 117 if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
119 !(inet->daddr && inet->daddr != raddr) && 118 !(inet->inet_daddr && inet->inet_daddr != raddr) &&
120 !(inet->rcv_saddr && inet->rcv_saddr != laddr) && 119 !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
121 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) 120 !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
122 goto found; /* gotcha */ 121 goto found; /* gotcha */
123 } 122 }
@@ -291,8 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
291{ 290{
292 /* Charge it to the socket. */ 291 /* Charge it to the socket. */
293 292
294 if (sock_queue_rcv_skb(sk, skb) < 0) { 293 if (ip_queue_rcv_skb(sk, skb) < 0) {
295 atomic_inc(&sk->sk_drops);
296 kfree_skb(skb); 294 kfree_skb(skb);
297 return NET_RX_DROP; 295 return NET_RX_DROP;
298 } 296 }
@@ -316,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
316} 314}
317 315
318static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, 316static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
319 struct rtable *rt, 317 struct rtable **rtp,
320 unsigned int flags) 318 unsigned int flags)
321{ 319{
322 struct inet_sock *inet = inet_sk(sk); 320 struct inet_sock *inet = inet_sk(sk);
@@ -325,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
325 struct sk_buff *skb; 323 struct sk_buff *skb;
326 unsigned int iphlen; 324 unsigned int iphlen;
327 int err; 325 int err;
326 struct rtable *rt = *rtp;
328 327
329 if (length > rt->u.dst.dev->mtu) { 328 if (length > rt->dst.dev->mtu) {
330 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, 329 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
331 rt->u.dst.dev->mtu); 330 rt->dst.dev->mtu);
332 return -EMSGSIZE; 331 return -EMSGSIZE;
333 } 332 }
334 if (flags&MSG_PROBE) 333 if (flags&MSG_PROBE)
335 goto out; 334 goto out;
336 335
337 skb = sock_alloc_send_skb(sk, 336 skb = sock_alloc_send_skb(sk,
338 length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, 337 length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
339 flags & MSG_DONTWAIT, &err); 338 flags & MSG_DONTWAIT, &err);
340 if (skb == NULL) 339 if (skb == NULL)
341 goto error; 340 goto error;
342 skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); 341 skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
343 342
344 skb->priority = sk->sk_priority; 343 skb->priority = sk->sk_priority;
345 skb->mark = sk->sk_mark; 344 skb->mark = sk->sk_mark;
346 skb_dst_set(skb, dst_clone(&rt->u.dst)); 345 skb_dst_set(skb, &rt->dst);
346 *rtp = NULL;
347 347
348 skb_reset_network_header(skb); 348 skb_reset_network_header(skb);
349 iph = ip_hdr(skb); 349 iph = ip_hdr(skb);
@@ -352,19 +352,30 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
352 skb->ip_summed = CHECKSUM_NONE; 352 skb->ip_summed = CHECKSUM_NONE;
353 353
354 skb->transport_header = skb->network_header; 354 skb->transport_header = skb->network_header;
355 err = memcpy_fromiovecend((void *)iph, from, 0, length); 355 err = -EFAULT;
356 if (err) 356 if (memcpy_fromiovecend((void *)iph, from, 0, length))
357 goto error_fault; 357 goto error_free;
358 358
359 /* We don't modify invalid header */
360 iphlen = iph->ihl * 4; 359 iphlen = iph->ihl * 4;
361 if (iphlen >= sizeof(*iph) && iphlen <= length) { 360
361 /*
362 * We don't want to modify the ip header, but we do need to
363 * be sure that it won't cause problems later along the network
364 * stack. Specifically we want to make sure that iph->ihl is a
365 * sane value. If ihl points beyond the length of the buffer passed
366 * in, reject the frame as invalid
367 */
368 err = -EINVAL;
369 if (iphlen > length)
370 goto error_free;
371
372 if (iphlen >= sizeof(*iph)) {
362 if (!iph->saddr) 373 if (!iph->saddr)
363 iph->saddr = rt->rt_src; 374 iph->saddr = rt->rt_src;
364 iph->check = 0; 375 iph->check = 0;
365 iph->tot_len = htons(length); 376 iph->tot_len = htons(length);
366 if (!iph->id) 377 if (!iph->id)
367 ip_select_ident(iph, &rt->u.dst, NULL); 378 ip_select_ident(iph, &rt->dst, NULL);
368 379
369 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 380 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
370 } 381 }
@@ -372,20 +383,21 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
372 icmp_out_count(net, ((struct icmphdr *) 383 icmp_out_count(net, ((struct icmphdr *)
373 skb_transport_header(skb))->type); 384 skb_transport_header(skb))->type);
374 385
375 err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, 386 err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
376 dst_output); 387 rt->dst.dev, dst_output);
377 if (err > 0) 388 if (err > 0)
378 err = inet->recverr ? net_xmit_errno(err) : 0; 389 err = net_xmit_errno(err);
379 if (err) 390 if (err)
380 goto error; 391 goto error;
381out: 392out:
382 return 0; 393 return 0;
383 394
384error_fault: 395error_free:
385 err = -EFAULT;
386 kfree_skb(skb); 396 kfree_skb(skb);
387error: 397error:
388 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 398 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
399 if (err == -ENOBUFS && !inet->recverr)
400 err = 0;
389 return err; 401 return err;
390} 402}
391 403
@@ -488,12 +500,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
488 err = -EDESTADDRREQ; 500 err = -EDESTADDRREQ;
489 if (sk->sk_state != TCP_ESTABLISHED) 501 if (sk->sk_state != TCP_ESTABLISHED)
490 goto out; 502 goto out;
491 daddr = inet->daddr; 503 daddr = inet->inet_daddr;
492 } 504 }
493 505
494 ipc.addr = inet->saddr; 506 ipc.addr = inet->inet_saddr;
495 ipc.opt = NULL; 507 ipc.opt = NULL;
496 ipc.shtx.flags = 0; 508 ipc.tx_flags = 0;
497 ipc.oif = sk->sk_bound_dev_if; 509 ipc.oif = sk->sk_bound_dev_if;
498 510
499 if (msg->msg_controllen) { 511 if (msg->msg_controllen) {
@@ -537,10 +549,9 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
537 { 549 {
538 struct flowi fl = { .oif = ipc.oif, 550 struct flowi fl = { .oif = ipc.oif,
539 .mark = sk->sk_mark, 551 .mark = sk->sk_mark,
540 .nl_u = { .ip4_u = 552 .fl4_dst = daddr,
541 { .daddr = daddr, 553 .fl4_src = saddr,
542 .saddr = saddr, 554 .fl4_tos = tos,
543 .tos = tos } },
544 .proto = inet->hdrincl ? IPPROTO_RAW : 555 .proto = inet->hdrincl ? IPPROTO_RAW :
545 sk->sk_protocol, 556 sk->sk_protocol,
546 }; 557 };
@@ -566,7 +577,7 @@ back_from_confirm:
566 577
567 if (inet->hdrincl) 578 if (inet->hdrincl)
568 err = raw_send_hdrinc(sk, msg->msg_iov, len, 579 err = raw_send_hdrinc(sk, msg->msg_iov, len,
569 rt, msg->msg_flags); 580 &rt, msg->msg_flags);
570 581
571 else { 582 else {
572 if (!ipc.addr) 583 if (!ipc.addr)
@@ -576,8 +587,11 @@ back_from_confirm:
576 &ipc, &rt, msg->msg_flags); 587 &ipc, &rt, msg->msg_flags);
577 if (err) 588 if (err)
578 ip_flush_pending_frames(sk); 589 ip_flush_pending_frames(sk);
579 else if (!(msg->msg_flags & MSG_MORE)) 590 else if (!(msg->msg_flags & MSG_MORE)) {
580 err = ip_push_pending_frames(sk); 591 err = ip_push_pending_frames(sk);
592 if (err == -ENOBUFS && !inet->recverr)
593 err = 0;
594 }
581 release_sock(sk); 595 release_sock(sk);
582 } 596 }
583done: 597done:
@@ -591,7 +605,7 @@ out:
591 return len; 605 return len;
592 606
593do_confirm: 607do_confirm:
594 dst_confirm(&rt->u.dst); 608 dst_confirm(&rt->dst);
595 if (!(msg->msg_flags & MSG_PROBE) || len) 609 if (!(msg->msg_flags & MSG_PROBE) || len)
596 goto back_from_confirm; 610 goto back_from_confirm;
597 err = 0; 611 err = 0;
@@ -630,9 +644,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
630 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && 644 if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
631 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) 645 chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
632 goto out; 646 goto out;
633 inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; 647 inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
634 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) 648 if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
635 inet->saddr = 0; /* Use device */ 649 inet->inet_saddr = 0; /* Use device */
636 sk_dst_reset(sk); 650 sk_dst_reset(sk);
637 ret = 0; 651 ret = 0;
638out: return ret; 652out: return ret;
@@ -677,7 +691,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
677 if (err) 691 if (err)
678 goto done; 692 goto done;
679 693
680 sock_recv_timestamp(msg, sk, skb); 694 sock_recv_ts_and_drops(msg, sk, skb);
681 695
682 /* Copy the address. */ 696 /* Copy the address. */
683 if (sin) { 697 if (sin) {
@@ -702,7 +716,7 @@ static int raw_init(struct sock *sk)
702{ 716{
703 struct raw_sock *rp = raw_sk(sk); 717 struct raw_sock *rp = raw_sk(sk);
704 718
705 if (inet_sk(sk)->num == IPPROTO_ICMP) 719 if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
706 memset(&rp->filter, 0, sizeof(rp->filter)); 720 memset(&rp->filter, 0, sizeof(rp->filter));
707 return 0; 721 return 0;
708} 722}
@@ -736,10 +750,10 @@ out: return ret;
736} 750}
737 751
738static int do_raw_setsockopt(struct sock *sk, int level, int optname, 752static int do_raw_setsockopt(struct sock *sk, int level, int optname,
739 char __user *optval, int optlen) 753 char __user *optval, unsigned int optlen)
740{ 754{
741 if (optname == ICMP_FILTER) { 755 if (optname == ICMP_FILTER) {
742 if (inet_sk(sk)->num != IPPROTO_ICMP) 756 if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
743 return -EOPNOTSUPP; 757 return -EOPNOTSUPP;
744 else 758 else
745 return raw_seticmpfilter(sk, optval, optlen); 759 return raw_seticmpfilter(sk, optval, optlen);
@@ -748,7 +762,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname,
748} 762}
749 763
750static int raw_setsockopt(struct sock *sk, int level, int optname, 764static int raw_setsockopt(struct sock *sk, int level, int optname,
751 char __user *optval, int optlen) 765 char __user *optval, unsigned int optlen)
752{ 766{
753 if (level != SOL_RAW) 767 if (level != SOL_RAW)
754 return ip_setsockopt(sk, level, optname, optval, optlen); 768 return ip_setsockopt(sk, level, optname, optval, optlen);
@@ -757,7 +771,7 @@ static int raw_setsockopt(struct sock *sk, int level, int optname,
757 771
758#ifdef CONFIG_COMPAT 772#ifdef CONFIG_COMPAT
759static int compat_raw_setsockopt(struct sock *sk, int level, int optname, 773static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
760 char __user *optval, int optlen) 774 char __user *optval, unsigned int optlen)
761{ 775{
762 if (level != SOL_RAW) 776 if (level != SOL_RAW)
763 return compat_ip_setsockopt(sk, level, optname, optval, optlen); 777 return compat_ip_setsockopt(sk, level, optname, optval, optlen);
@@ -769,7 +783,7 @@ static int do_raw_getsockopt(struct sock *sk, int level, int optname,
769 char __user *optval, int __user *optlen) 783 char __user *optval, int __user *optlen)
770{ 784{
771 if (optname == ICMP_FILTER) { 785 if (optname == ICMP_FILTER) {
772 if (inet_sk(sk)->num != IPPROTO_ICMP) 786 if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
773 return -EOPNOTSUPP; 787 return -EOPNOTSUPP;
774 else 788 else
775 return raw_geticmpfilter(sk, optval, optlen); 789 return raw_geticmpfilter(sk, optval, optlen);
@@ -928,10 +942,10 @@ EXPORT_SYMBOL_GPL(raw_seq_stop);
928static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 942static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
929{ 943{
930 struct inet_sock *inet = inet_sk(sp); 944 struct inet_sock *inet = inet_sk(sp);
931 __be32 dest = inet->daddr, 945 __be32 dest = inet->inet_daddr,
932 src = inet->rcv_saddr; 946 src = inet->inet_rcv_saddr;
933 __u16 destp = 0, 947 __u16 destp = 0,
934 srcp = inet->num; 948 srcp = inet->inet_num;
935 949
936 seq_printf(seq, "%4d: %08X:%04X %08X:%04X" 950 seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
937 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", 951 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 65b3a8b11a6c..351dc4e85242 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -90,6 +90,7 @@
90#include <linux/jhash.h> 90#include <linux/jhash.h>
91#include <linux/rcupdate.h> 91#include <linux/rcupdate.h>
92#include <linux/times.h> 92#include <linux/times.h>
93#include <linux/slab.h>
93#include <net/dst.h> 94#include <net/dst.h>
94#include <net/net_namespace.h> 95#include <net/net_namespace.h>
95#include <net/protocol.h> 96#include <net/protocol.h>
@@ -128,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work; 134static struct delayed_work expires_work;
@@ -139,28 +139,32 @@ static unsigned long expires_ljiffies;
139 */ 139 */
140 140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
143static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
142static void ipv4_dst_destroy(struct dst_entry *dst); 144static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb); 146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops); 148static int rt_garbage_collect(struct dst_ops *ops);
149static void rt_emergency_hash_rebuild(struct net *net);
150 149
150static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151 int how)
152{
153}
151 154
152static struct dst_ops ipv4_dst_ops = { 155static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET, 156 .family = AF_INET,
154 .protocol = cpu_to_be16(ETH_P_IP), 157 .protocol = cpu_to_be16(ETH_P_IP),
155 .gc = rt_garbage_collect, 158 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check, 159 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu,
157 .destroy = ipv4_dst_destroy, 162 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown, 163 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice, 164 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure, 165 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu, 166 .update_pmtu = ip_rt_update_pmtu,
162 .local_out = __ip_local_out, 167 .local_out = __ip_local_out,
163 .entries = ATOMIC_INIT(0),
164}; 168};
165 169
166#define ECN_OR_COST(class) TC_PRIO_##class 170#define ECN_OR_COST(class) TC_PRIO_##class
@@ -200,7 +204,7 @@ const __u8 ip_tos2prio[16] = {
200 */ 204 */
201 205
202struct rt_hash_bucket { 206struct rt_hash_bucket {
203 struct rtable *chain; 207 struct rtable __rcu *chain;
204}; 208};
205 209
206#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ 210#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -254,14 +258,12 @@ static unsigned rt_hash_mask __read_mostly;
254static unsigned int rt_hash_log __read_mostly; 258static unsigned int rt_hash_log __read_mostly;
255 259
256static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 260static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257#define RT_CACHE_STAT_INC(field) \ 261#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
258 (__raw_get_cpu_var(rt_cache_stat).field++)
259 262
260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, 263static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid) 264 int genid)
262{ 265{
263 return jhash_3words((__force u32)(__be32)(daddr), 266 return jhash_3words((__force u32)daddr, (__force u32)saddr,
264 (__force u32)(__be32)(saddr),
265 idx, genid) 267 idx, genid)
266 & rt_hash_mask; 268 & rt_hash_mask;
267} 269}
@@ -284,15 +286,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
284 struct rtable *r = NULL; 286 struct rtable *r = NULL;
285 287
286 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 288 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287 if (!rt_hash_table[st->bucket].chain) 289 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
288 continue; 290 continue;
289 rcu_read_lock_bh(); 291 rcu_read_lock_bh();
290 r = rcu_dereference(rt_hash_table[st->bucket].chain); 292 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
291 while (r) { 293 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) && 294 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid) 295 r->rt_genid == st->genid)
294 return r; 296 return r;
295 r = rcu_dereference(r->u.dst.rt_next); 297 r = rcu_dereference_bh(r->dst.rt_next);
296 } 298 }
297 rcu_read_unlock_bh(); 299 rcu_read_unlock_bh();
298 } 300 }
@@ -304,17 +306,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
304{ 306{
305 struct rt_cache_iter_state *st = seq->private; 307 struct rt_cache_iter_state *st = seq->private;
306 308
307 r = r->u.dst.rt_next; 309 r = rcu_dereference_bh(r->dst.rt_next);
308 while (!r) { 310 while (!r) {
309 rcu_read_unlock_bh(); 311 rcu_read_unlock_bh();
310 do { 312 do {
311 if (--st->bucket < 0) 313 if (--st->bucket < 0)
312 return NULL; 314 return NULL;
313 } while (!rt_hash_table[st->bucket].chain); 315 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
314 rcu_read_lock_bh(); 316 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain; 317 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
316 } 318 }
317 return rcu_dereference(r); 319 return r;
318} 320}
319 321
320static struct rtable *rt_cache_get_next(struct seq_file *seq, 322static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -322,7 +324,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq,
322{ 324{
323 struct rt_cache_iter_state *st = seq->private; 325 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) { 326 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq)) 327 if (dev_net(r->dst.dev) != seq_file_net(seq))
326 continue; 328 continue;
327 if (r->rt_genid == st->genid) 329 if (r->rt_genid == st->genid)
328 break; 330 break;
@@ -378,20 +380,20 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
378 struct rtable *r = v; 380 struct rtable *r = v;
379 int len; 381 int len;
380 382
381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" 383 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 384 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r->u.dst.dev ? r->u.dst.dev->name : "*", 385 r->dst.dev ? r->dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, 386 (__force u32)r->rt_dst,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt), 387 (__force u32)r->rt_gateway,
386 r->u.dst.__use, 0, (unsigned long)r->rt_src, 388 r->rt_flags, atomic_read(&r->dst.__refcnt),
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ? 389 r->dst.__use, 0, (__force u32)r->rt_src,
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), 390 dst_metric_advmss(&r->dst) + 40,
389 dst_metric(&r->u.dst, RTAX_WINDOW), 391 dst_metric(&r->dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + 392 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)), 393 dst_metric(&r->dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos, 394 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, 395 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output == 396 r->dst.hh ? (r->dst.hh->hh_output ==
395 dev_queue_xmit) : 0, 397 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len); 398 r->rt_spec_dst, &len);
397 399
@@ -468,7 +470,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
468 470
469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 471 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 472 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries), 473 dst_entries_get_slow(&ipv4_dst_ops),
472 st->in_hit, 474 st->in_hit,
473 st->in_slow_tot, 475 st->in_slow_tot,
474 st->in_slow_mc, 476 st->in_slow_mc,
@@ -513,43 +515,42 @@ static const struct file_operations rt_cpu_seq_fops = {
513}; 515};
514 516
515#ifdef CONFIG_NET_CLS_ROUTE 517#ifdef CONFIG_NET_CLS_ROUTE
516static int ip_rt_acct_read(char *buffer, char **start, off_t offset, 518static int rt_acct_proc_show(struct seq_file *m, void *v)
517 int length, int *eof, void *data) 519{
518{ 520 struct ip_rt_acct *dst, *src;
519 unsigned int i; 521 unsigned int i, j;
520 522
521 if ((offset & 3) || (length & 3)) 523 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 return -EIO; 524 if (!dst)
523 525 return -ENOMEM;
524 if (offset >= sizeof(struct ip_rt_acct) * 256) { 526
525 *eof = 1; 527 for_each_possible_cpu(i) {
526 return 0; 528 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 } 529 for (j = 0; j < 256; j++) {
528 530 dst[j].o_bytes += src[j].o_bytes;
529 if (offset + length >= sizeof(struct ip_rt_acct) * 256) { 531 dst[j].o_packets += src[j].o_packets;
530 length = sizeof(struct ip_rt_acct) * 256 - offset; 532 dst[j].i_bytes += src[j].i_bytes;
531 *eof = 1; 533 dst[j].i_packets += src[j].i_packets;
534 }
532 } 535 }
533 536
534 offset /= sizeof(u32); 537 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
535 538 kfree(dst);
536 if (length > 0) { 539 return 0;
537 u32 *dst = (u32 *) buffer; 540}
538
539 *start = buffer;
540 memset(dst, 0, length);
541
542 for_each_possible_cpu(i) {
543 unsigned int j;
544 u32 *src;
545 541
546 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; 542static int rt_acct_proc_open(struct inode *inode, struct file *file)
547 for (j = 0; j < length/4; j++) 543{
548 dst[j] += src[j]; 544 return single_open(file, rt_acct_proc_show, NULL);
549 }
550 }
551 return length;
552} 545}
546
547static const struct file_operations rt_acct_proc_fops = {
548 .owner = THIS_MODULE,
549 .open = rt_acct_proc_open,
550 .read = seq_read,
551 .llseek = seq_lseek,
552 .release = single_release,
553};
553#endif 554#endif
554 555
555static int __net_init ip_rt_do_proc_init(struct net *net) 556static int __net_init ip_rt_do_proc_init(struct net *net)
@@ -567,8 +568,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
567 goto err2; 568 goto err2;
568 569
569#ifdef CONFIG_NET_CLS_ROUTE 570#ifdef CONFIG_NET_CLS_ROUTE
570 pde = create_proc_read_entry("rt_acct", 0, net->proc_net, 571 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
571 ip_rt_acct_read, NULL);
572 if (!pde) 572 if (!pde)
573 goto err3; 573 goto err3;
574#endif 574#endif
@@ -588,7 +588,9 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{ 588{
589 remove_proc_entry("rt_cache", net->proc_net_stat); 589 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net); 590 remove_proc_entry("rt_cache", net->proc_net);
591#ifdef CONFIG_NET_CLS_ROUTE
591 remove_proc_entry("rt_acct", net->proc_net); 592 remove_proc_entry("rt_acct", net->proc_net);
593#endif
592} 594}
593 595
594static struct pernet_operations ip_rt_proc_ops __net_initdata = { 596static struct pernet_operations ip_rt_proc_ops __net_initdata = {
@@ -610,13 +612,13 @@ static inline int ip_rt_proc_init(void)
610 612
611static inline void rt_free(struct rtable *rt) 613static inline void rt_free(struct rtable *rt)
612{ 614{
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 615 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
614} 616}
615 617
616static inline void rt_drop(struct rtable *rt) 618static inline void rt_drop(struct rtable *rt)
617{ 619{
618 ip_rt_put(rt); 620 ip_rt_put(rt);
619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 621 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
620} 622}
621 623
622static inline int rt_fast_clean(struct rtable *rth) 624static inline int rt_fast_clean(struct rtable *rth)
@@ -624,13 +626,13 @@ static inline int rt_fast_clean(struct rtable *rth)
624 /* Kill broadcast/multicast entries very aggresively, if they 626 /* Kill broadcast/multicast entries very aggresively, if they
625 collide in hash table with more useful entries */ 627 collide in hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 628 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next; 629 rt_is_input_route(rth) && rth->dst.rt_next;
628} 630}
629 631
630static inline int rt_valuable(struct rtable *rth) 632static inline int rt_valuable(struct rtable *rth)
631{ 633{
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 634 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires; 635 rth->dst.expires;
634} 636}
635 637
636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 638static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -638,15 +640,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
638 unsigned long age; 640 unsigned long age;
639 int ret = 0; 641 int ret = 0;
640 642
641 if (atomic_read(&rth->u.dst.__refcnt)) 643 if (atomic_read(&rth->dst.__refcnt))
642 goto out; 644 goto out;
643 645
644 ret = 1; 646 ret = 1;
645 if (rth->u.dst.expires && 647 if (rth->dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires)) 648 time_after_eq(jiffies, rth->dst.expires))
647 goto out; 649 goto out;
648 650
649 age = jiffies - rth->u.dst.lastuse; 651 age = jiffies - rth->dst.lastuse;
650 ret = 0; 652 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) || 653 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth))) 654 (age <= tmo2 && rt_valuable(rth)))
@@ -662,14 +664,14 @@ out: return ret;
662 */ 664 */
663static inline u32 rt_score(struct rtable *rt) 665static inline u32 rt_score(struct rtable *rt)
664{ 666{
665 u32 score = jiffies - rt->u.dst.lastuse; 667 u32 score = jiffies - rt->dst.lastuse;
666 668
667 score = ~score & ~(3<<30); 669 score = ~score & ~(3<<30);
668 670
669 if (rt_valuable(rt)) 671 if (rt_valuable(rt))
670 score |= (1<<31); 672 score |= (1<<31);
671 673
672 if (!rt->fl.iif || 674 if (rt_is_output_route(rt) ||
673 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) 675 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674 score |= (1<<30); 676 score |= (1<<30);
675 677
@@ -685,30 +687,29 @@ static inline bool rt_caching(const struct net *net)
685static inline bool compare_hash_inputs(const struct flowi *fl1, 687static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2) 688 const struct flowi *fl2)
687{ 689{
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | 690 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | 691 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
690 (fl1->iif ^ fl2->iif)) == 0); 692 (fl1->iif ^ fl2->iif)) == 0);
691} 693}
692 694
693static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 695static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694{ 696{
695 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | 697 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
696 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | 698 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
697 (fl1->mark ^ fl2->mark) | 699 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ 700 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
699 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 (fl1->oif ^ fl2->oif) | 701 (fl1->oif ^ fl2->oif) |
701 (fl1->iif ^ fl2->iif)) == 0; 702 (fl1->iif ^ fl2->iif)) == 0;
702} 703}
703 704
704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 705static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705{ 706{
706 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); 707 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
707} 708}
708 709
709static inline int rt_is_expired(struct rtable *rth) 710static inline int rt_is_expired(struct rtable *rth)
710{ 711{
711 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); 712 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
712} 713}
713 714
714/* 715/*
@@ -716,55 +717,48 @@ static inline int rt_is_expired(struct rtable *rth)
716 * Can be called by a softirq or a process. 717 * Can be called by a softirq or a process.
717 * In the later case, we want to be reschedule if necessary 718 * In the later case, we want to be reschedule if necessary
718 */ 719 */
719static void rt_do_flush(int process_context) 720static void rt_do_flush(struct net *net, int process_context)
720{ 721{
721 unsigned int i; 722 unsigned int i;
722 struct rtable *rth, *next; 723 struct rtable *rth, *next;
723 struct rtable * tail;
724 724
725 for (i = 0; i <= rt_hash_mask; i++) { 725 for (i = 0; i <= rt_hash_mask; i++) {
726 struct rtable __rcu **pprev;
727 struct rtable *list;
728
726 if (process_context && need_resched()) 729 if (process_context && need_resched())
727 cond_resched(); 730 cond_resched();
728 rth = rt_hash_table[i].chain; 731 rth = rcu_dereference_raw(rt_hash_table[i].chain);
729 if (!rth) 732 if (!rth)
730 continue; 733 continue;
731 734
732 spin_lock_bh(rt_hash_lock_addr(i)); 735 spin_lock_bh(rt_hash_lock_addr(i));
733#ifdef CONFIG_NET_NS
734 {
735 struct rtable ** prev, * p;
736 736
737 rth = rt_hash_table[i].chain; 737 list = NULL;
738 pprev = &rt_hash_table[i].chain;
739 rth = rcu_dereference_protected(*pprev,
740 lockdep_is_held(rt_hash_lock_addr(i)));
738 741
739 /* defer releasing the head of the list after spin_unlock */ 742 while (rth) {
740 for (tail = rth; tail; tail = tail->u.dst.rt_next) 743 next = rcu_dereference_protected(rth->dst.rt_next,
741 if (!rt_is_expired(tail)) 744 lockdep_is_held(rt_hash_lock_addr(i)));
742 break; 745
743 if (rth != tail) 746 if (!net ||
744 rt_hash_table[i].chain = tail; 747 net_eq(dev_net(rth->dst.dev), net)) {
745 748 rcu_assign_pointer(*pprev, next);
746 /* call rt_free on entries after the tail requiring flush */ 749 rcu_assign_pointer(rth->dst.rt_next, list);
747 prev = &rt_hash_table[i].chain; 750 list = rth;
748 for (p = *prev; p; p = next) {
749 next = p->u.dst.rt_next;
750 if (!rt_is_expired(p)) {
751 prev = &p->u.dst.rt_next;
752 } else { 751 } else {
753 *prev = next; 752 pprev = &rth->dst.rt_next;
754 rt_free(p);
755 } 753 }
754 rth = next;
756 } 755 }
757 } 756
758#else
759 rth = rt_hash_table[i].chain;
760 rt_hash_table[i].chain = NULL;
761 tail = NULL;
762#endif
763 spin_unlock_bh(rt_hash_lock_addr(i)); 757 spin_unlock_bh(rt_hash_lock_addr(i));
764 758
765 for (; rth != tail; rth = next) { 759 for (; list; list = next) {
766 next = rth->u.dst.rt_next; 760 next = rcu_dereference_protected(list->dst.rt_next, 1);
767 rt_free(rth); 761 rt_free(list);
768 } 762 }
769 } 763 }
770} 764}
@@ -780,11 +774,31 @@ static void rt_do_flush(int process_context)
780#define FRACT_BITS 3 774#define FRACT_BITS 3
781#define ONE (1UL << FRACT_BITS) 775#define ONE (1UL << FRACT_BITS)
782 776
777/*
778 * Given a hash chain and an item in this hash chain,
779 * find if a previous entry has the same hash_inputs
780 * (but differs on tos, mark or oif)
781 * Returns 0 if an alias is found.
782 * Returns ONE if rth has no alias before itself.
783 */
784static int has_noalias(const struct rtable *head, const struct rtable *rth)
785{
786 const struct rtable *aux = head;
787
788 while (aux != rth) {
789 if (compare_hash_inputs(&aux->fl, &rth->fl))
790 return 0;
791 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
792 }
793 return ONE;
794}
795
783static void rt_check_expire(void) 796static void rt_check_expire(void)
784{ 797{
785 static unsigned int rover; 798 static unsigned int rover;
786 unsigned int i = rover, goal; 799 unsigned int i = rover, goal;
787 struct rtable *rth, *aux, **rthp; 800 struct rtable *rth;
801 struct rtable __rcu **rthp;
788 unsigned long samples = 0; 802 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0; 803 unsigned long sum = 0, sum2 = 0;
790 unsigned long delta; 804 unsigned long delta;
@@ -810,23 +824,24 @@ static void rt_check_expire(void)
810 824
811 samples++; 825 samples++;
812 826
813 if (*rthp == NULL) 827 if (rcu_dereference_raw(*rthp) == NULL)
814 continue; 828 continue;
815 length = 0; 829 length = 0;
816 spin_lock_bh(rt_hash_lock_addr(i)); 830 spin_lock_bh(rt_hash_lock_addr(i));
817 while ((rth = *rthp) != NULL) { 831 while ((rth = rcu_dereference_protected(*rthp,
818 prefetch(rth->u.dst.rt_next); 832 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
833 prefetch(rth->dst.rt_next);
819 if (rt_is_expired(rth)) { 834 if (rt_is_expired(rth)) {
820 *rthp = rth->u.dst.rt_next; 835 *rthp = rth->dst.rt_next;
821 rt_free(rth); 836 rt_free(rth);
822 continue; 837 continue;
823 } 838 }
824 if (rth->u.dst.expires) { 839 if (rth->dst.expires) {
825 /* Entry is expired even if it is in use */ 840 /* Entry is expired even if it is in use */
826 if (time_before_eq(jiffies, rth->u.dst.expires)) { 841 if (time_before_eq(jiffies, rth->dst.expires)) {
827nofree: 842nofree:
828 tmo >>= 1; 843 tmo >>= 1;
829 rthp = &rth->u.dst.rt_next; 844 rthp = &rth->dst.rt_next;
830 /* 845 /*
831 * We only count entries on 846 * We only count entries on
832 * a chain with equal hash inputs once 847 * a chain with equal hash inputs once
@@ -835,22 +850,14 @@ nofree:
835 * attributes don't unfairly skew 850 * attributes don't unfairly skew
836 * the length computation 851 * the length computation
837 */ 852 */
838 for (aux = rt_hash_table[i].chain;;) { 853 length += has_noalias(rt_hash_table[i].chain, rth);
839 if (aux == rth) {
840 length += ONE;
841 break;
842 }
843 if (compare_hash_inputs(&aux->fl, &rth->fl))
844 break;
845 aux = aux->u.dst.rt_next;
846 }
847 continue; 854 continue;
848 } 855 }
849 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) 856 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850 goto nofree; 857 goto nofree;
851 858
852 /* Cleanup aged off entries. */ 859 /* Cleanup aged off entries. */
853 *rthp = rth->u.dst.rt_next; 860 *rthp = rth->dst.rt_next;
854 rt_free(rth); 861 rt_free(rth);
855 } 862 }
856 spin_unlock_bh(rt_hash_lock_addr(i)); 863 spin_unlock_bh(rt_hash_lock_addr(i));
@@ -899,37 +906,20 @@ void rt_cache_flush(struct net *net, int delay)
899{ 906{
900 rt_cache_invalidate(net); 907 rt_cache_invalidate(net);
901 if (delay >= 0) 908 if (delay >= 0)
902 rt_do_flush(!in_softirq()); 909 rt_do_flush(net, !in_softirq());
903} 910}
904 911
905/* 912/* Flush previous cache invalidated entries from the cache */
906 * We change rt_genid and let gc do the cleanup 913void rt_cache_flush_batch(struct net *net)
907 */
908static void rt_secret_rebuild(unsigned long __net)
909{ 914{
910 struct net *net = (struct net *)__net; 915 rt_do_flush(net, !in_softirq());
911 rt_cache_invalidate(net);
912 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
913}
914
915static void rt_secret_rebuild_oneshot(struct net *net)
916{
917 del_timer_sync(&net->ipv4.rt_secret_timer);
918 rt_cache_invalidate(net);
919 if (ip_rt_secret_interval) {
920 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
921 add_timer(&net->ipv4.rt_secret_timer);
922 }
923} 916}
924 917
925static void rt_emergency_hash_rebuild(struct net *net) 918static void rt_emergency_hash_rebuild(struct net *net)
926{ 919{
927 if (net_ratelimit()) { 920 if (net_ratelimit())
928 printk(KERN_WARNING "Route hash chain too long!\n"); 921 printk(KERN_WARNING "Route hash chain too long!\n");
929 printk(KERN_WARNING "Adjust your secret_interval!\n"); 922 rt_cache_invalidate(net);
930 }
931
932 rt_secret_rebuild_oneshot(net);
933} 923}
934 924
935/* 925/*
@@ -951,9 +941,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
951 static unsigned long last_gc; 941 static unsigned long last_gc;
952 static int rover; 942 static int rover;
953 static int equilibrium; 943 static int equilibrium;
954 struct rtable *rth, **rthp; 944 struct rtable *rth;
945 struct rtable __rcu **rthp;
955 unsigned long now = jiffies; 946 unsigned long now = jiffies;
956 int goal; 947 int goal;
948 int entries = dst_entries_get_fast(&ipv4_dst_ops);
957 949
958 /* 950 /*
959 * Garbage collection is pretty expensive, 951 * Garbage collection is pretty expensive,
@@ -963,28 +955,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
963 RT_CACHE_STAT_INC(gc_total); 955 RT_CACHE_STAT_INC(gc_total);
964 956
965 if (now - last_gc < ip_rt_gc_min_interval && 957 if (now - last_gc < ip_rt_gc_min_interval &&
966 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { 958 entries < ip_rt_max_size) {
967 RT_CACHE_STAT_INC(gc_ignored); 959 RT_CACHE_STAT_INC(gc_ignored);
968 goto out; 960 goto out;
969 } 961 }
970 962
963 entries = dst_entries_get_slow(&ipv4_dst_ops);
971 /* Calculate number of entries, which we want to expire now. */ 964 /* Calculate number of entries, which we want to expire now. */
972 goal = atomic_read(&ipv4_dst_ops.entries) - 965 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
973 (ip_rt_gc_elasticity << rt_hash_log);
974 if (goal <= 0) { 966 if (goal <= 0) {
975 if (equilibrium < ipv4_dst_ops.gc_thresh) 967 if (equilibrium < ipv4_dst_ops.gc_thresh)
976 equilibrium = ipv4_dst_ops.gc_thresh; 968 equilibrium = ipv4_dst_ops.gc_thresh;
977 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 969 goal = entries - equilibrium;
978 if (goal > 0) { 970 if (goal > 0) {
979 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); 971 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
980 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 972 goal = entries - equilibrium;
981 } 973 }
982 } else { 974 } else {
983 /* We are in dangerous area. Try to reduce cache really 975 /* We are in dangerous area. Try to reduce cache really
984 * aggressively. 976 * aggressively.
985 */ 977 */
986 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); 978 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
987 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; 979 equilibrium = entries - goal;
988 } 980 }
989 981
990 if (now - last_gc >= ip_rt_gc_min_interval) 982 if (now - last_gc >= ip_rt_gc_min_interval)
@@ -1004,14 +996,15 @@ static int rt_garbage_collect(struct dst_ops *ops)
1004 k = (k + 1) & rt_hash_mask; 996 k = (k + 1) & rt_hash_mask;
1005 rthp = &rt_hash_table[k].chain; 997 rthp = &rt_hash_table[k].chain;
1006 spin_lock_bh(rt_hash_lock_addr(k)); 998 spin_lock_bh(rt_hash_lock_addr(k));
1007 while ((rth = *rthp) != NULL) { 999 while ((rth = rcu_dereference_protected(*rthp,
1000 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1008 if (!rt_is_expired(rth) && 1001 if (!rt_is_expired(rth) &&
1009 !rt_may_expire(rth, tmo, expire)) { 1002 !rt_may_expire(rth, tmo, expire)) {
1010 tmo >>= 1; 1003 tmo >>= 1;
1011 rthp = &rth->u.dst.rt_next; 1004 rthp = &rth->dst.rt_next;
1012 continue; 1005 continue;
1013 } 1006 }
1014 *rthp = rth->u.dst.rt_next; 1007 *rthp = rth->dst.rt_next;
1015 rt_free(rth); 1008 rt_free(rth);
1016 goal--; 1009 goal--;
1017 } 1010 }
@@ -1041,14 +1034,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
1041 expire >>= 1; 1034 expire >>= 1;
1042#if RT_CACHE_DEBUG >= 2 1035#if RT_CACHE_DEBUG >= 2
1043 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, 1036 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1044 atomic_read(&ipv4_dst_ops.entries), goal, i); 1037 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1045#endif 1038#endif
1046 1039
1047 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 1040 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1048 goto out; 1041 goto out;
1049 } while (!in_softirq() && time_before_eq(jiffies, now)); 1042 } while (!in_softirq() && time_before_eq(jiffies, now));
1050 1043
1051 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 1044 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1045 goto out;
1046 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1052 goto out; 1047 goto out;
1053 if (net_ratelimit()) 1048 if (net_ratelimit())
1054 printk(KERN_WARNING "dst cache overflow\n"); 1049 printk(KERN_WARNING "dst cache overflow\n");
@@ -1058,21 +1053,37 @@ static int rt_garbage_collect(struct dst_ops *ops)
1058work_done: 1053work_done:
1059 expire += ip_rt_gc_min_interval; 1054 expire += ip_rt_gc_min_interval;
1060 if (expire > ip_rt_gc_timeout || 1055 if (expire > ip_rt_gc_timeout ||
1061 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) 1056 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1057 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1062 expire = ip_rt_gc_timeout; 1058 expire = ip_rt_gc_timeout;
1063#if RT_CACHE_DEBUG >= 2 1059#if RT_CACHE_DEBUG >= 2
1064 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, 1060 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1065 atomic_read(&ipv4_dst_ops.entries), goal, rover); 1061 dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1066#endif 1062#endif
1067out: return 0; 1063out: return 0;
1068} 1064}
1069 1065
1066/*
1067 * Returns number of entries in a hash chain that have different hash_inputs
1068 */
1069static int slow_chain_length(const struct rtable *head)
1070{
1071 int length = 0;
1072 const struct rtable *rth = head;
1073
1074 while (rth) {
1075 length += has_noalias(head, rth);
1076 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1077 }
1078 return length >> FRACT_BITS;
1079}
1080
1070static int rt_intern_hash(unsigned hash, struct rtable *rt, 1081static int rt_intern_hash(unsigned hash, struct rtable *rt,
1071 struct rtable **rp, struct sk_buff *skb) 1082 struct rtable **rp, struct sk_buff *skb, int ifindex)
1072{ 1083{
1073 struct rtable *rth, **rthp; 1084 struct rtable *rth, *cand;
1085 struct rtable __rcu **rthp, **candp;
1074 unsigned long now; 1086 unsigned long now;
1075 struct rtable *cand, **candp;
1076 u32 min_score; 1087 u32 min_score;
1077 int chain_length; 1088 int chain_length;
1078 int attempts = !in_softirq(); 1089 int attempts = !in_softirq();
@@ -1084,7 +1095,7 @@ restart:
1084 candp = NULL; 1095 candp = NULL;
1085 now = jiffies; 1096 now = jiffies;
1086 1097
1087 if (!rt_caching(dev_net(rt->u.dst.dev))) { 1098 if (!rt_caching(dev_net(rt->dst.dev))) {
1088 /* 1099 /*
1089 * If we're not caching, just tell the caller we 1100 * If we're not caching, just tell the caller we
1090 * were successful and don't touch the route. The 1101 * were successful and don't touch the route. The
@@ -1093,28 +1104,48 @@ restart:
1093 * If we drop it here, the callers have no way to resolve routes 1104 * If we drop it here, the callers have no way to resolve routes
1094 * when we're not caching. Instead, just point *rp at rt, so 1105 * when we're not caching. Instead, just point *rp at rt, so
1095 * the caller gets a single use out of the route 1106 * the caller gets a single use out of the route
1107 * Note that we do rt_free on this new route entry, so that
1108 * once its refcount hits zero, we are still able to reap it
1109 * (Thanks Alexey)
1110 * Note: To avoid expensive rcu stuff for this uncached dst,
1111 * we set DST_NOCACHE so that dst_release() can free dst without
1112 * waiting a grace period.
1096 */ 1113 */
1097 goto report_and_exit; 1114
1115 rt->dst.flags |= DST_NOCACHE;
1116 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1117 int err = arp_bind_neighbour(&rt->dst);
1118 if (err) {
1119 if (net_ratelimit())
1120 printk(KERN_WARNING
1121 "Neighbour table failure & not caching routes.\n");
1122 ip_rt_put(rt);
1123 return err;
1124 }
1125 }
1126
1127 goto skip_hashing;
1098 } 1128 }
1099 1129
1100 rthp = &rt_hash_table[hash].chain; 1130 rthp = &rt_hash_table[hash].chain;
1101 1131
1102 spin_lock_bh(rt_hash_lock_addr(hash)); 1132 spin_lock_bh(rt_hash_lock_addr(hash));
1103 while ((rth = *rthp) != NULL) { 1133 while ((rth = rcu_dereference_protected(*rthp,
1134 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1104 if (rt_is_expired(rth)) { 1135 if (rt_is_expired(rth)) {
1105 *rthp = rth->u.dst.rt_next; 1136 *rthp = rth->dst.rt_next;
1106 rt_free(rth); 1137 rt_free(rth);
1107 continue; 1138 continue;
1108 } 1139 }
1109 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1140 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1110 /* Put it first */ 1141 /* Put it first */
1111 *rthp = rth->u.dst.rt_next; 1142 *rthp = rth->dst.rt_next;
1112 /* 1143 /*
1113 * Since lookup is lockfree, the deletion 1144 * Since lookup is lockfree, the deletion
1114 * must be visible to another weakly ordered CPU before 1145 * must be visible to another weakly ordered CPU before
1115 * the insertion at the start of the hash chain. 1146 * the insertion at the start of the hash chain.
1116 */ 1147 */
1117 rcu_assign_pointer(rth->u.dst.rt_next, 1148 rcu_assign_pointer(rth->dst.rt_next,
1118 rt_hash_table[hash].chain); 1149 rt_hash_table[hash].chain);
1119 /* 1150 /*
1120 * Since lookup is lockfree, the update writes 1151 * Since lookup is lockfree, the update writes
@@ -1122,18 +1153,18 @@ restart:
1122 */ 1153 */
1123 rcu_assign_pointer(rt_hash_table[hash].chain, rth); 1154 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1124 1155
1125 dst_use(&rth->u.dst, now); 1156 dst_use(&rth->dst, now);
1126 spin_unlock_bh(rt_hash_lock_addr(hash)); 1157 spin_unlock_bh(rt_hash_lock_addr(hash));
1127 1158
1128 rt_drop(rt); 1159 rt_drop(rt);
1129 if (rp) 1160 if (rp)
1130 *rp = rth; 1161 *rp = rth;
1131 else 1162 else
1132 skb_dst_set(skb, &rth->u.dst); 1163 skb_dst_set(skb, &rth->dst);
1133 return 0; 1164 return 0;
1134 } 1165 }
1135 1166
1136 if (!atomic_read(&rth->u.dst.__refcnt)) { 1167 if (!atomic_read(&rth->dst.__refcnt)) {
1137 u32 score = rt_score(rth); 1168 u32 score = rt_score(rth);
1138 1169
1139 if (score <= min_score) { 1170 if (score <= min_score) {
@@ -1145,7 +1176,7 @@ restart:
1145 1176
1146 chain_length++; 1177 chain_length++;
1147 1178
1148 rthp = &rth->u.dst.rt_next; 1179 rthp = &rth->dst.rt_next;
1149 } 1180 }
1150 1181
1151 if (cand) { 1182 if (cand) {
@@ -1156,26 +1187,32 @@ restart:
1156 * only 2 entries per bucket. We will see. 1187 * only 2 entries per bucket. We will see.
1157 */ 1188 */
1158 if (chain_length > ip_rt_gc_elasticity) { 1189 if (chain_length > ip_rt_gc_elasticity) {
1159 *candp = cand->u.dst.rt_next; 1190 *candp = cand->dst.rt_next;
1160 rt_free(cand); 1191 rt_free(cand);
1161 } 1192 }
1162 } else { 1193 } else {
1163 if (chain_length > rt_chain_length_max) { 1194 if (chain_length > rt_chain_length_max &&
1164 struct net *net = dev_net(rt->u.dst.dev); 1195 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1196 struct net *net = dev_net(rt->dst.dev);
1165 int num = ++net->ipv4.current_rt_cache_rebuild_count; 1197 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1166 if (!rt_caching(dev_net(rt->u.dst.dev))) { 1198 if (!rt_caching(net)) {
1167 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", 1199 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1168 rt->u.dst.dev->name, num); 1200 rt->dst.dev->name, num);
1169 } 1201 }
1170 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); 1202 rt_emergency_hash_rebuild(net);
1203 spin_unlock_bh(rt_hash_lock_addr(hash));
1204
1205 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1206 ifindex, rt_genid(net));
1207 goto restart;
1171 } 1208 }
1172 } 1209 }
1173 1210
1174 /* Try to bind route to arp only if it is output 1211 /* Try to bind route to arp only if it is output
1175 route or unicast forwarding path. 1212 route or unicast forwarding path.
1176 */ 1213 */
1177 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1214 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178 int err = arp_bind_neighbour(&rt->u.dst); 1215 int err = arp_bind_neighbour(&rt->dst);
1179 if (err) { 1216 if (err) {
1180 spin_unlock_bh(rt_hash_lock_addr(hash)); 1217 spin_unlock_bh(rt_hash_lock_addr(hash));
1181 1218
@@ -1200,19 +1237,20 @@ restart:
1200 } 1237 }
1201 1238
1202 if (net_ratelimit()) 1239 if (net_ratelimit())
1203 printk(KERN_WARNING "Neighbour table overflow.\n"); 1240 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1204 rt_drop(rt); 1241 rt_drop(rt);
1205 return -ENOBUFS; 1242 return -ENOBUFS;
1206 } 1243 }
1207 } 1244 }
1208 1245
1209 rt->u.dst.rt_next = rt_hash_table[hash].chain; 1246 rt->dst.rt_next = rt_hash_table[hash].chain;
1210 1247
1211#if RT_CACHE_DEBUG >= 2 1248#if RT_CACHE_DEBUG >= 2
1212 if (rt->u.dst.rt_next) { 1249 if (rt->dst.rt_next) {
1213 struct rtable *trt; 1250 struct rtable *trt;
1214 printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst); 1251 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1215 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) 1252 hash, &rt->rt_dst);
1253 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1216 printk(" . %pI4", &trt->rt_dst); 1254 printk(" . %pI4", &trt->rt_dst);
1217 printk("\n"); 1255 printk("\n");
1218 } 1256 }
@@ -1226,28 +1264,21 @@ restart:
1226 1264
1227 spin_unlock_bh(rt_hash_lock_addr(hash)); 1265 spin_unlock_bh(rt_hash_lock_addr(hash));
1228 1266
1229report_and_exit: 1267skip_hashing:
1230 if (rp) 1268 if (rp)
1231 *rp = rt; 1269 *rp = rt;
1232 else 1270 else
1233 skb_dst_set(skb, &rt->u.dst); 1271 skb_dst_set(skb, &rt->dst);
1234 return 0; 1272 return 0;
1235} 1273}
1236 1274
1237void rt_bind_peer(struct rtable *rt, int create) 1275void rt_bind_peer(struct rtable *rt, int create)
1238{ 1276{
1239 static DEFINE_SPINLOCK(rt_peer_lock);
1240 struct inet_peer *peer; 1277 struct inet_peer *peer;
1241 1278
1242 peer = inet_getpeer(rt->rt_dst, create); 1279 peer = inet_getpeer_v4(rt->rt_dst, create);
1243 1280
1244 spin_lock_bh(&rt_peer_lock); 1281 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1245 if (rt->peer == NULL) {
1246 rt->peer = peer;
1247 peer = NULL;
1248 }
1249 spin_unlock_bh(&rt_peer_lock);
1250 if (peer)
1251 inet_putpeer(peer); 1282 inet_putpeer(peer);
1252} 1283}
1253 1284
@@ -1292,31 +1323,36 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1292 1323
1293 ip_select_fb_ident(iph); 1324 ip_select_fb_ident(iph);
1294} 1325}
1326EXPORT_SYMBOL(__ip_select_ident);
1295 1327
1296static void rt_del(unsigned hash, struct rtable *rt) 1328static void rt_del(unsigned hash, struct rtable *rt)
1297{ 1329{
1298 struct rtable **rthp, *aux; 1330 struct rtable __rcu **rthp;
1331 struct rtable *aux;
1299 1332
1300 rthp = &rt_hash_table[hash].chain; 1333 rthp = &rt_hash_table[hash].chain;
1301 spin_lock_bh(rt_hash_lock_addr(hash)); 1334 spin_lock_bh(rt_hash_lock_addr(hash));
1302 ip_rt_put(rt); 1335 ip_rt_put(rt);
1303 while ((aux = *rthp) != NULL) { 1336 while ((aux = rcu_dereference_protected(*rthp,
1337 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1304 if (aux == rt || rt_is_expired(aux)) { 1338 if (aux == rt || rt_is_expired(aux)) {
1305 *rthp = aux->u.dst.rt_next; 1339 *rthp = aux->dst.rt_next;
1306 rt_free(aux); 1340 rt_free(aux);
1307 continue; 1341 continue;
1308 } 1342 }
1309 rthp = &aux->u.dst.rt_next; 1343 rthp = &aux->dst.rt_next;
1310 } 1344 }
1311 spin_unlock_bh(rt_hash_lock_addr(hash)); 1345 spin_unlock_bh(rt_hash_lock_addr(hash));
1312} 1346}
1313 1347
1348/* called in rcu_read_lock() section */
1314void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1349void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1315 __be32 saddr, struct net_device *dev) 1350 __be32 saddr, struct net_device *dev)
1316{ 1351{
1317 int i, k; 1352 int i, k;
1318 struct in_device *in_dev = in_dev_get(dev); 1353 struct in_device *in_dev = __in_dev_get_rcu(dev);
1319 struct rtable *rth, **rthp; 1354 struct rtable *rth;
1355 struct rtable __rcu **rthp;
1320 __be32 skeys[2] = { saddr, 0 }; 1356 __be32 skeys[2] = { saddr, 0 };
1321 int ikeys[2] = { dev->ifindex, 0 }; 1357 int ikeys[2] = { dev->ifindex, 0 };
1322 struct netevent_redirect netevent; 1358 struct netevent_redirect netevent;
@@ -1326,9 +1362,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1326 return; 1362 return;
1327 1363
1328 net = dev_net(dev); 1364 net = dev_net(dev);
1329 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) 1365 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1330 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) 1366 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1331 || ipv4_is_zeronet(new_gw)) 1367 ipv4_is_zeronet(new_gw))
1332 goto reject_redirect; 1368 goto reject_redirect;
1333 1369
1334 if (!rt_caching(net)) 1370 if (!rt_caching(net))
@@ -1349,55 +1385,50 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1349 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1385 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1350 rt_genid(net)); 1386 rt_genid(net));
1351 1387
1352 rthp=&rt_hash_table[hash].chain; 1388 rthp = &rt_hash_table[hash].chain;
1353 1389
1354 rcu_read_lock();
1355 while ((rth = rcu_dereference(*rthp)) != NULL) { 1390 while ((rth = rcu_dereference(*rthp)) != NULL) {
1356 struct rtable *rt; 1391 struct rtable *rt;
1357 1392
1358 if (rth->fl.fl4_dst != daddr || 1393 if (rth->fl.fl4_dst != daddr ||
1359 rth->fl.fl4_src != skeys[i] || 1394 rth->fl.fl4_src != skeys[i] ||
1360 rth->fl.oif != ikeys[k] || 1395 rth->fl.oif != ikeys[k] ||
1361 rth->fl.iif != 0 || 1396 rt_is_input_route(rth) ||
1362 rt_is_expired(rth) || 1397 rt_is_expired(rth) ||
1363 !net_eq(dev_net(rth->u.dst.dev), net)) { 1398 !net_eq(dev_net(rth->dst.dev), net)) {
1364 rthp = &rth->u.dst.rt_next; 1399 rthp = &rth->dst.rt_next;
1365 continue; 1400 continue;
1366 } 1401 }
1367 1402
1368 if (rth->rt_dst != daddr || 1403 if (rth->rt_dst != daddr ||
1369 rth->rt_src != saddr || 1404 rth->rt_src != saddr ||
1370 rth->u.dst.error || 1405 rth->dst.error ||
1371 rth->rt_gateway != old_gw || 1406 rth->rt_gateway != old_gw ||
1372 rth->u.dst.dev != dev) 1407 rth->dst.dev != dev)
1373 break; 1408 break;
1374 1409
1375 dst_hold(&rth->u.dst); 1410 dst_hold(&rth->dst);
1376 rcu_read_unlock();
1377 1411
1378 rt = dst_alloc(&ipv4_dst_ops); 1412 rt = dst_alloc(&ipv4_dst_ops);
1379 if (rt == NULL) { 1413 if (rt == NULL) {
1380 ip_rt_put(rth); 1414 ip_rt_put(rth);
1381 in_dev_put(in_dev);
1382 return; 1415 return;
1383 } 1416 }
1384 1417
1385 /* Copy all the information. */ 1418 /* Copy all the information. */
1386 *rt = *rth; 1419 *rt = *rth;
1387 rt->u.dst.__use = 1; 1420 rt->dst.__use = 1;
1388 atomic_set(&rt->u.dst.__refcnt, 1); 1421 atomic_set(&rt->dst.__refcnt, 1);
1389 rt->u.dst.child = NULL; 1422 rt->dst.child = NULL;
1390 if (rt->u.dst.dev) 1423 if (rt->dst.dev)
1391 dev_hold(rt->u.dst.dev); 1424 dev_hold(rt->dst.dev);
1392 if (rt->idev) 1425 rt->dst.obsolete = -1;
1393 in_dev_hold(rt->idev); 1426 rt->dst.lastuse = jiffies;
1394 rt->u.dst.obsolete = 0; 1427 rt->dst.path = &rt->dst;
1395 rt->u.dst.lastuse = jiffies; 1428 rt->dst.neighbour = NULL;
1396 rt->u.dst.path = &rt->u.dst; 1429 rt->dst.hh = NULL;
1397 rt->u.dst.neighbour = NULL;
1398 rt->u.dst.hh = NULL;
1399#ifdef CONFIG_XFRM 1430#ifdef CONFIG_XFRM
1400 rt->u.dst.xfrm = NULL; 1431 rt->dst.xfrm = NULL;
1401#endif 1432#endif
1402 rt->rt_genid = rt_genid(net); 1433 rt->rt_genid = rt_genid(net);
1403 rt->rt_flags |= RTCF_REDIRECTED; 1434 rt->rt_flags |= RTCF_REDIRECTED;
@@ -1406,37 +1437,35 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1406 rt->rt_gateway = new_gw; 1437 rt->rt_gateway = new_gw;
1407 1438
1408 /* Redirect received -> path was valid */ 1439 /* Redirect received -> path was valid */
1409 dst_confirm(&rth->u.dst); 1440 dst_confirm(&rth->dst);
1410 1441
1411 if (rt->peer) 1442 if (rt->peer)
1412 atomic_inc(&rt->peer->refcnt); 1443 atomic_inc(&rt->peer->refcnt);
1413 1444
1414 if (arp_bind_neighbour(&rt->u.dst) || 1445 if (arp_bind_neighbour(&rt->dst) ||
1415 !(rt->u.dst.neighbour->nud_state & 1446 !(rt->dst.neighbour->nud_state &
1416 NUD_VALID)) { 1447 NUD_VALID)) {
1417 if (rt->u.dst.neighbour) 1448 if (rt->dst.neighbour)
1418 neigh_event_send(rt->u.dst.neighbour, NULL); 1449 neigh_event_send(rt->dst.neighbour, NULL);
1419 ip_rt_put(rth); 1450 ip_rt_put(rth);
1420 rt_drop(rt); 1451 rt_drop(rt);
1421 goto do_next; 1452 goto do_next;
1422 } 1453 }
1423 1454
1424 netevent.old = &rth->u.dst; 1455 netevent.old = &rth->dst;
1425 netevent.new = &rt->u.dst; 1456 netevent.new = &rt->dst;
1426 call_netevent_notifiers(NETEVENT_REDIRECT, 1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1427 &netevent); 1458 &netevent);
1428 1459
1429 rt_del(hash, rth); 1460 rt_del(hash, rth);
1430 if (!rt_intern_hash(hash, rt, &rt, NULL)) 1461 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1431 ip_rt_put(rt); 1462 ip_rt_put(rt);
1432 goto do_next; 1463 goto do_next;
1433 } 1464 }
1434 rcu_read_unlock();
1435 do_next: 1465 do_next:
1436 ; 1466 ;
1437 } 1467 }
1438 } 1468 }
1439 in_dev_put(in_dev);
1440 return; 1469 return;
1441 1470
1442reject_redirect: 1471reject_redirect:
@@ -1447,7 +1476,7 @@ reject_redirect:
1447 &old_gw, dev->name, &new_gw, 1476 &old_gw, dev->name, &new_gw,
1448 &saddr, &daddr); 1477 &saddr, &daddr);
1449#endif 1478#endif
1450 in_dev_put(in_dev); 1479 ;
1451} 1480}
1452 1481
1453static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1482static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1456,11 +1485,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1456 struct dst_entry *ret = dst; 1485 struct dst_entry *ret = dst;
1457 1486
1458 if (rt) { 1487 if (rt) {
1459 if (dst->obsolete) { 1488 if (dst->obsolete > 0) {
1460 ip_rt_put(rt); 1489 ip_rt_put(rt);
1461 ret = NULL; 1490 ret = NULL;
1462 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1463 rt->u.dst.expires) { 1492 (rt->dst.expires &&
1493 time_after_eq(jiffies, rt->dst.expires))) {
1464 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1465 rt->fl.oif, 1495 rt->fl.oif,
1466 rt_genid(dev_net(dst->dev))); 1496 rt_genid(dev_net(dst->dev)));
@@ -1494,49 +1524,51 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1494void ip_rt_send_redirect(struct sk_buff *skb) 1524void ip_rt_send_redirect(struct sk_buff *skb)
1495{ 1525{
1496 struct rtable *rt = skb_rtable(skb); 1526 struct rtable *rt = skb_rtable(skb);
1497 struct in_device *in_dev = in_dev_get(rt->u.dst.dev); 1527 struct in_device *in_dev;
1528 int log_martians;
1498 1529
1499 if (!in_dev) 1530 rcu_read_lock();
1531 in_dev = __in_dev_get_rcu(rt->dst.dev);
1532 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1533 rcu_read_unlock();
1500 return; 1534 return;
1501 1535 }
1502 if (!IN_DEV_TX_REDIRECTS(in_dev)) 1536 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1503 goto out; 1537 rcu_read_unlock();
1504 1538
1505 /* No redirected packets during ip_rt_redirect_silence; 1539 /* No redirected packets during ip_rt_redirect_silence;
1506 * reset the algorithm. 1540 * reset the algorithm.
1507 */ 1541 */
1508 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) 1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1509 rt->u.dst.rate_tokens = 0; 1543 rt->dst.rate_tokens = 0;
1510 1544
1511 /* Too many ignored redirects; do not send anything 1545 /* Too many ignored redirects; do not send anything
1512 * set u.dst.rate_last to the last seen redirected packet. 1546 * set dst.rate_last to the last seen redirected packet.
1513 */ 1547 */
1514 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { 1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1515 rt->u.dst.rate_last = jiffies; 1549 rt->dst.rate_last = jiffies;
1516 goto out; 1550 return;
1517 } 1551 }
1518 1552
1519 /* Check for load limit; set rate_last to the latest sent 1553 /* Check for load limit; set rate_last to the latest sent
1520 * redirect. 1554 * redirect.
1521 */ 1555 */
1522 if (rt->u.dst.rate_tokens == 0 || 1556 if (rt->dst.rate_tokens == 0 ||
1523 time_after(jiffies, 1557 time_after(jiffies,
1524 (rt->u.dst.rate_last + 1558 (rt->dst.rate_last +
1525 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { 1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1526 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1527 rt->u.dst.rate_last = jiffies; 1561 rt->dst.rate_last = jiffies;
1528 ++rt->u.dst.rate_tokens; 1562 ++rt->dst.rate_tokens;
1529#ifdef CONFIG_IP_ROUTE_VERBOSE 1563#ifdef CONFIG_IP_ROUTE_VERBOSE
1530 if (IN_DEV_LOG_MARTIANS(in_dev) && 1564 if (log_martians &&
1531 rt->u.dst.rate_tokens == ip_rt_redirect_number && 1565 rt->dst.rate_tokens == ip_rt_redirect_number &&
1532 net_ratelimit()) 1566 net_ratelimit())
1533 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1534 &rt->rt_src, rt->rt_iif, 1568 &rt->rt_src, rt->rt_iif,
1535 &rt->rt_dst, &rt->rt_gateway); 1569 &rt->rt_dst, &rt->rt_gateway);
1536#endif 1570#endif
1537 } 1571 }
1538out:
1539 in_dev_put(in_dev);
1540} 1572}
1541 1573
1542static int ip_error(struct sk_buff *skb) 1574static int ip_error(struct sk_buff *skb)
@@ -1545,7 +1577,7 @@ static int ip_error(struct sk_buff *skb)
1545 unsigned long now; 1577 unsigned long now;
1546 int code; 1578 int code;
1547 1579
1548 switch (rt->u.dst.error) { 1580 switch (rt->dst.error) {
1549 case EINVAL: 1581 case EINVAL:
1550 default: 1582 default:
1551 goto out; 1583 goto out;
@@ -1554,7 +1586,7 @@ static int ip_error(struct sk_buff *skb)
1554 break; 1586 break;
1555 case ENETUNREACH: 1587 case ENETUNREACH:
1556 code = ICMP_NET_UNREACH; 1588 code = ICMP_NET_UNREACH;
1557 IP_INC_STATS_BH(dev_net(rt->u.dst.dev), 1589 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1558 IPSTATS_MIB_INNOROUTES); 1590 IPSTATS_MIB_INNOROUTES);
1559 break; 1591 break;
1560 case EACCES: 1592 case EACCES:
@@ -1563,12 +1595,12 @@ static int ip_error(struct sk_buff *skb)
1563 } 1595 }
1564 1596
1565 now = jiffies; 1597 now = jiffies;
1566 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; 1598 rt->dst.rate_tokens += now - rt->dst.rate_last;
1567 if (rt->u.dst.rate_tokens > ip_rt_error_burst) 1599 if (rt->dst.rate_tokens > ip_rt_error_burst)
1568 rt->u.dst.rate_tokens = ip_rt_error_burst; 1600 rt->dst.rate_tokens = ip_rt_error_burst;
1569 rt->u.dst.rate_last = now; 1601 rt->dst.rate_last = now;
1570 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { 1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1571 rt->u.dst.rate_tokens -= ip_rt_error_cost; 1603 rt->dst.rate_tokens -= ip_rt_error_cost;
1572 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1573 } 1605 }
1574 1606
@@ -1606,9 +1638,6 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1606 __be32 daddr = iph->daddr; 1638 __be32 daddr = iph->daddr;
1607 unsigned short est_mtu = 0; 1639 unsigned short est_mtu = 0;
1608 1640
1609 if (ipv4_config.no_pmtu_disc)
1610 return 0;
1611
1612 for (k = 0; k < 2; k++) { 1641 for (k = 0; k < 2; k++) {
1613 for (i = 0; i < 2; i++) { 1642 for (i = 0; i < 2; i++) {
1614 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1643 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
@@ -1616,7 +1645,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1616 1645
1617 rcu_read_lock(); 1646 rcu_read_lock();
1618 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1619 rth = rcu_dereference(rth->u.dst.rt_next)) { 1648 rth = rcu_dereference(rth->dst.rt_next)) {
1620 unsigned short mtu = new_mtu; 1649 unsigned short mtu = new_mtu;
1621 1650
1622 if (rth->fl.fl4_dst != daddr || 1651 if (rth->fl.fl4_dst != daddr ||
@@ -1624,9 +1653,9 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1624 rth->rt_dst != daddr || 1653 rth->rt_dst != daddr ||
1625 rth->rt_src != iph->saddr || 1654 rth->rt_src != iph->saddr ||
1626 rth->fl.oif != ikeys[k] || 1655 rth->fl.oif != ikeys[k] ||
1627 rth->fl.iif != 0 || 1656 rt_is_input_route(rth) ||
1628 dst_metric_locked(&rth->u.dst, RTAX_MTU) || 1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1629 !net_eq(dev_net(rth->u.dst.dev), net) || 1658 !net_eq(dev_net(rth->dst.dev), net) ||
1630 rt_is_expired(rth)) 1659 rt_is_expired(rth))
1631 continue; 1660 continue;
1632 1661
@@ -1634,22 +1663,25 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1634 1663
1635 /* BSD 4.2 compatibility hack :-( */ 1664 /* BSD 4.2 compatibility hack :-( */
1636 if (mtu == 0 && 1665 if (mtu == 0 &&
1637 old_mtu >= dst_mtu(&rth->u.dst) && 1666 old_mtu >= dst_mtu(&rth->dst) &&
1638 old_mtu >= 68 + (iph->ihl << 2)) 1667 old_mtu >= 68 + (iph->ihl << 2))
1639 old_mtu -= iph->ihl << 2; 1668 old_mtu -= iph->ihl << 2;
1640 1669
1641 mtu = guess_mtu(old_mtu); 1670 mtu = guess_mtu(old_mtu);
1642 } 1671 }
1643 if (mtu <= dst_mtu(&rth->u.dst)) { 1672 if (mtu <= dst_mtu(&rth->dst)) {
1644 if (mtu < dst_mtu(&rth->u.dst)) { 1673 if (mtu < dst_mtu(&rth->dst)) {
1645 dst_confirm(&rth->u.dst); 1674 dst_confirm(&rth->dst);
1646 if (mtu < ip_rt_min_pmtu) { 1675 if (mtu < ip_rt_min_pmtu) {
1676 u32 lock = dst_metric(&rth->dst,
1677 RTAX_LOCK);
1647 mtu = ip_rt_min_pmtu; 1678 mtu = ip_rt_min_pmtu;
1648 rth->u.dst.metrics[RTAX_LOCK-1] |= 1679 lock |= (1 << RTAX_MTU);
1649 (1 << RTAX_MTU); 1680 dst_metric_set(&rth->dst, RTAX_LOCK,
1681 lock);
1650 } 1682 }
1651 rth->u.dst.metrics[RTAX_MTU-1] = mtu; 1683 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1652 dst_set_expires(&rth->u.dst, 1684 dst_set_expires(&rth->dst,
1653 ip_rt_mtu_expires); 1685 ip_rt_mtu_expires);
1654 } 1686 }
1655 est_mtu = mtu; 1687 est_mtu = mtu;
@@ -1666,10 +1698,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1666 if (dst_mtu(dst) > mtu && mtu >= 68 && 1698 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1667 !(dst_metric_locked(dst, RTAX_MTU))) { 1699 !(dst_metric_locked(dst, RTAX_MTU))) {
1668 if (mtu < ip_rt_min_pmtu) { 1700 if (mtu < ip_rt_min_pmtu) {
1701 u32 lock = dst_metric(dst, RTAX_LOCK);
1669 mtu = ip_rt_min_pmtu; 1702 mtu = ip_rt_min_pmtu;
1670 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); 1703 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1671 } 1704 }
1672 dst->metrics[RTAX_MTU-1] = mtu; 1705 dst_metric_set(dst, RTAX_MTU, mtu);
1673 dst_set_expires(dst, ip_rt_mtu_expires); 1706 dst_set_expires(dst, ip_rt_mtu_expires);
1674 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1707 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1675 } 1708 }
@@ -1677,40 +1710,22 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1677 1710
1678static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1711static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1679{ 1712{
1680 return NULL; 1713 if (rt_is_expired((struct rtable *)dst))
1714 return NULL;
1715 return dst;
1681} 1716}
1682 1717
1683static void ipv4_dst_destroy(struct dst_entry *dst) 1718static void ipv4_dst_destroy(struct dst_entry *dst)
1684{ 1719{
1685 struct rtable *rt = (struct rtable *) dst; 1720 struct rtable *rt = (struct rtable *) dst;
1686 struct inet_peer *peer = rt->peer; 1721 struct inet_peer *peer = rt->peer;
1687 struct in_device *idev = rt->idev;
1688 1722
1689 if (peer) { 1723 if (peer) {
1690 rt->peer = NULL; 1724 rt->peer = NULL;
1691 inet_putpeer(peer); 1725 inet_putpeer(peer);
1692 } 1726 }
1693
1694 if (idev) {
1695 rt->idev = NULL;
1696 in_dev_put(idev);
1697 }
1698} 1727}
1699 1728
1700static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1701 int how)
1702{
1703 struct rtable *rt = (struct rtable *) dst;
1704 struct in_device *idev = rt->idev;
1705 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1706 struct in_device *loopback_idev =
1707 in_dev_get(dev_net(dev)->loopback_dev);
1708 if (loopback_idev) {
1709 rt->idev = loopback_idev;
1710 in_dev_put(idev);
1711 }
1712 }
1713}
1714 1729
1715static void ipv4_link_failure(struct sk_buff *skb) 1730static void ipv4_link_failure(struct sk_buff *skb)
1716{ 1731{
@@ -1720,7 +1735,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
1720 1735
1721 rt = skb_rtable(skb); 1736 rt = skb_rtable(skb);
1722 if (rt) 1737 if (rt)
1723 dst_set_expires(&rt->u.dst, 0); 1738 dst_set_expires(&rt->dst, 0);
1724} 1739}
1725 1740
1726static int ip_rt_bug(struct sk_buff *skb) 1741static int ip_rt_bug(struct sk_buff *skb)
@@ -1746,59 +1761,79 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1746 __be32 src; 1761 __be32 src;
1747 struct fib_result res; 1762 struct fib_result res;
1748 1763
1749 if (rt->fl.iif == 0) 1764 if (rt_is_output_route(rt))
1750 src = rt->rt_src; 1765 src = rt->rt_src;
1751 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) { 1766 else {
1752 src = FIB_RES_PREFSRC(res); 1767 rcu_read_lock();
1753 fib_res_put(&res); 1768 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1754 } else 1769 src = FIB_RES_PREFSRC(res);
1755 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, 1770 else
1771 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1756 RT_SCOPE_UNIVERSE); 1772 RT_SCOPE_UNIVERSE);
1773 rcu_read_unlock();
1774 }
1757 memcpy(addr, &src, 4); 1775 memcpy(addr, &src, 4);
1758} 1776}
1759 1777
1760#ifdef CONFIG_NET_CLS_ROUTE 1778#ifdef CONFIG_NET_CLS_ROUTE
1761static void set_class_tag(struct rtable *rt, u32 tag) 1779static void set_class_tag(struct rtable *rt, u32 tag)
1762{ 1780{
1763 if (!(rt->u.dst.tclassid & 0xFFFF)) 1781 if (!(rt->dst.tclassid & 0xFFFF))
1764 rt->u.dst.tclassid |= tag & 0xFFFF; 1782 rt->dst.tclassid |= tag & 0xFFFF;
1765 if (!(rt->u.dst.tclassid & 0xFFFF0000)) 1783 if (!(rt->dst.tclassid & 0xFFFF0000))
1766 rt->u.dst.tclassid |= tag & 0xFFFF0000; 1784 rt->dst.tclassid |= tag & 0xFFFF0000;
1767} 1785}
1768#endif 1786#endif
1769 1787
1788static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1789{
1790 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1791
1792 if (advmss == 0) {
1793 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1794 ip_rt_min_advmss);
1795 if (advmss > 65535 - 40)
1796 advmss = 65535 - 40;
1797 }
1798 return advmss;
1799}
1800
1801static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1802{
1803 unsigned int mtu = dst->dev->mtu;
1804
1805 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1806 const struct rtable *rt = (const struct rtable *) dst;
1807
1808 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1809 mtu = 576;
1810 }
1811
1812 if (mtu > IP_MAX_MTU)
1813 mtu = IP_MAX_MTU;
1814
1815 return mtu;
1816}
1817
1770static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1818static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1771{ 1819{
1820 struct dst_entry *dst = &rt->dst;
1772 struct fib_info *fi = res->fi; 1821 struct fib_info *fi = res->fi;
1773 1822
1774 if (fi) { 1823 if (fi) {
1775 if (FIB_RES_GW(*res) && 1824 if (FIB_RES_GW(*res) &&
1776 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1777 rt->rt_gateway = FIB_RES_GW(*res); 1826 rt->rt_gateway = FIB_RES_GW(*res);
1778 memcpy(rt->u.dst.metrics, fi->fib_metrics, 1827 dst_import_metrics(dst, fi->fib_metrics);
1779 sizeof(rt->u.dst.metrics));
1780 if (fi->fib_mtu == 0) {
1781 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1782 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1783 rt->rt_gateway != rt->rt_dst &&
1784 rt->u.dst.dev->mtu > 576)
1785 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1786 }
1787#ifdef CONFIG_NET_CLS_ROUTE 1828#ifdef CONFIG_NET_CLS_ROUTE
1788 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1789#endif 1830#endif
1790 } else 1831 }
1791 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; 1832
1792 1833 if (dst_mtu(dst) > IP_MAX_MTU)
1793 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0) 1834 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1794 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1795 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU) 1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1796 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1797 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1798 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1799 ip_rt_min_advmss);
1800 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1801 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1802 1837
1803#ifdef CONFIG_NET_CLS_ROUTE 1838#ifdef CONFIG_NET_CLS_ROUTE
1804#ifdef CONFIG_IP_MULTIPLE_TABLES 1839#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1809,14 +1844,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1809 rt->rt_type = res->type; 1844 rt->rt_type = res->type;
1810} 1845}
1811 1846
1847/* called in rcu_read_lock() section */
1812static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1848static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1813 u8 tos, struct net_device *dev, int our) 1849 u8 tos, struct net_device *dev, int our)
1814{ 1850{
1815 unsigned hash; 1851 unsigned int hash;
1816 struct rtable *rth; 1852 struct rtable *rth;
1817 __be32 spec_dst; 1853 __be32 spec_dst;
1818 struct in_device *in_dev = in_dev_get(dev); 1854 struct in_device *in_dev = __in_dev_get_rcu(dev);
1819 u32 itag = 0; 1855 u32 itag = 0;
1856 int err;
1820 1857
1821 /* Primary sanity checks. */ 1858 /* Primary sanity checks. */
1822 1859
@@ -1831,20 +1868,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1831 if (!ipv4_is_local_multicast(daddr)) 1868 if (!ipv4_is_local_multicast(daddr))
1832 goto e_inval; 1869 goto e_inval;
1833 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1870 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1834 } else if (fib_validate_source(saddr, 0, tos, 0, 1871 } else {
1835 dev, &spec_dst, &itag) < 0) 1872 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1836 goto e_inval; 1873 &itag, 0);
1837 1874 if (err < 0)
1875 goto e_err;
1876 }
1838 rth = dst_alloc(&ipv4_dst_ops); 1877 rth = dst_alloc(&ipv4_dst_ops);
1839 if (!rth) 1878 if (!rth)
1840 goto e_nobufs; 1879 goto e_nobufs;
1841 1880
1842 rth->u.dst.output= ip_rt_bug; 1881 rth->dst.output = ip_rt_bug;
1882 rth->dst.obsolete = -1;
1843 1883
1844 atomic_set(&rth->u.dst.__refcnt, 1); 1884 atomic_set(&rth->dst.__refcnt, 1);
1845 rth->u.dst.flags= DST_HOST; 1885 rth->dst.flags= DST_HOST;
1846 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 1886 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1847 rth->u.dst.flags |= DST_NOPOLICY; 1887 rth->dst.flags |= DST_NOPOLICY;
1848 rth->fl.fl4_dst = daddr; 1888 rth->fl.fl4_dst = daddr;
1849 rth->rt_dst = daddr; 1889 rth->rt_dst = daddr;
1850 rth->fl.fl4_tos = tos; 1890 rth->fl.fl4_tos = tos;
@@ -1852,13 +1892,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1852 rth->fl.fl4_src = saddr; 1892 rth->fl.fl4_src = saddr;
1853 rth->rt_src = saddr; 1893 rth->rt_src = saddr;
1854#ifdef CONFIG_NET_CLS_ROUTE 1894#ifdef CONFIG_NET_CLS_ROUTE
1855 rth->u.dst.tclassid = itag; 1895 rth->dst.tclassid = itag;
1856#endif 1896#endif
1857 rth->rt_iif = 1897 rth->rt_iif =
1858 rth->fl.iif = dev->ifindex; 1898 rth->fl.iif = dev->ifindex;
1859 rth->u.dst.dev = init_net.loopback_dev; 1899 rth->dst.dev = init_net.loopback_dev;
1860 dev_hold(rth->u.dst.dev); 1900 dev_hold(rth->dst.dev);
1861 rth->idev = in_dev_get(rth->u.dst.dev);
1862 rth->fl.oif = 0; 1901 rth->fl.oif = 0;
1863 rth->rt_gateway = daddr; 1902 rth->rt_gateway = daddr;
1864 rth->rt_spec_dst= spec_dst; 1903 rth->rt_spec_dst= spec_dst;
@@ -1866,27 +1905,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1866 rth->rt_flags = RTCF_MULTICAST; 1905 rth->rt_flags = RTCF_MULTICAST;
1867 rth->rt_type = RTN_MULTICAST; 1906 rth->rt_type = RTN_MULTICAST;
1868 if (our) { 1907 if (our) {
1869 rth->u.dst.input= ip_local_deliver; 1908 rth->dst.input= ip_local_deliver;
1870 rth->rt_flags |= RTCF_LOCAL; 1909 rth->rt_flags |= RTCF_LOCAL;
1871 } 1910 }
1872 1911
1873#ifdef CONFIG_IP_MROUTE 1912#ifdef CONFIG_IP_MROUTE
1874 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) 1913 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1875 rth->u.dst.input = ip_mr_input; 1914 rth->dst.input = ip_mr_input;
1876#endif 1915#endif
1877 RT_CACHE_STAT_INC(in_slow_mc); 1916 RT_CACHE_STAT_INC(in_slow_mc);
1878 1917
1879 in_dev_put(in_dev);
1880 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1918 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1881 return rt_intern_hash(hash, rth, NULL, skb); 1919 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1882 1920
1883e_nobufs: 1921e_nobufs:
1884 in_dev_put(in_dev);
1885 return -ENOBUFS; 1922 return -ENOBUFS;
1886
1887e_inval: 1923e_inval:
1888 in_dev_put(in_dev);
1889 return -EINVAL; 1924 return -EINVAL;
1925e_err:
1926 return err;
1890} 1927}
1891 1928
1892 1929
@@ -1920,22 +1957,22 @@ static void ip_handle_martian_source(struct net_device *dev,
1920#endif 1957#endif
1921} 1958}
1922 1959
1960/* called in rcu_read_lock() section */
1923static int __mkroute_input(struct sk_buff *skb, 1961static int __mkroute_input(struct sk_buff *skb,
1924 struct fib_result *res, 1962 struct fib_result *res,
1925 struct in_device *in_dev, 1963 struct in_device *in_dev,
1926 __be32 daddr, __be32 saddr, u32 tos, 1964 __be32 daddr, __be32 saddr, u32 tos,
1927 struct rtable **result) 1965 struct rtable **result)
1928{ 1966{
1929
1930 struct rtable *rth; 1967 struct rtable *rth;
1931 int err; 1968 int err;
1932 struct in_device *out_dev; 1969 struct in_device *out_dev;
1933 unsigned flags = 0; 1970 unsigned int flags = 0;
1934 __be32 spec_dst; 1971 __be32 spec_dst;
1935 u32 itag; 1972 u32 itag;
1936 1973
1937 /* get a working reference to the output device */ 1974 /* get a working reference to the output device */
1938 out_dev = in_dev_get(FIB_RES_DEV(*res)); 1975 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1939 if (out_dev == NULL) { 1976 if (out_dev == NULL) {
1940 if (net_ratelimit()) 1977 if (net_ratelimit())
1941 printk(KERN_CRIT "Bug in ip_route_input" \ 1978 printk(KERN_CRIT "Bug in ip_route_input" \
@@ -1945,12 +1982,11 @@ static int __mkroute_input(struct sk_buff *skb,
1945 1982
1946 1983
1947 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 1984 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1948 in_dev->dev, &spec_dst, &itag); 1985 in_dev->dev, &spec_dst, &itag, skb->mark);
1949 if (err < 0) { 1986 if (err < 0) {
1950 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1987 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1951 saddr); 1988 saddr);
1952 1989
1953 err = -EINVAL;
1954 goto cleanup; 1990 goto cleanup;
1955 } 1991 }
1956 1992
@@ -1965,8 +2001,13 @@ static int __mkroute_input(struct sk_buff *skb,
1965 if (skb->protocol != htons(ETH_P_IP)) { 2001 if (skb->protocol != htons(ETH_P_IP)) {
1966 /* Not IP (i.e. ARP). Do not create route, if it is 2002 /* Not IP (i.e. ARP). Do not create route, if it is
1967 * invalid for proxy arp. DNAT routes are always valid. 2003 * invalid for proxy arp. DNAT routes are always valid.
2004 *
2005 * Proxy arp feature have been extended to allow, ARP
2006 * replies back to the same interface, to support
2007 * Private VLAN switch technologies. See arp.c.
1968 */ 2008 */
1969 if (out_dev == in_dev) { 2009 if (out_dev == in_dev &&
2010 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1970 err = -EINVAL; 2011 err = -EINVAL;
1971 goto cleanup; 2012 goto cleanup;
1972 } 2013 }
@@ -1979,12 +2020,12 @@ static int __mkroute_input(struct sk_buff *skb,
1979 goto cleanup; 2020 goto cleanup;
1980 } 2021 }
1981 2022
1982 atomic_set(&rth->u.dst.__refcnt, 1); 2023 atomic_set(&rth->dst.__refcnt, 1);
1983 rth->u.dst.flags= DST_HOST; 2024 rth->dst.flags= DST_HOST;
1984 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2025 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1985 rth->u.dst.flags |= DST_NOPOLICY; 2026 rth->dst.flags |= DST_NOPOLICY;
1986 if (IN_DEV_CONF_GET(out_dev, NOXFRM)) 2027 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1987 rth->u.dst.flags |= DST_NOXFRM; 2028 rth->dst.flags |= DST_NOXFRM;
1988 rth->fl.fl4_dst = daddr; 2029 rth->fl.fl4_dst = daddr;
1989 rth->rt_dst = daddr; 2030 rth->rt_dst = daddr;
1990 rth->fl.fl4_tos = tos; 2031 rth->fl.fl4_tos = tos;
@@ -1994,15 +2035,15 @@ static int __mkroute_input(struct sk_buff *skb,
1994 rth->rt_gateway = daddr; 2035 rth->rt_gateway = daddr;
1995 rth->rt_iif = 2036 rth->rt_iif =
1996 rth->fl.iif = in_dev->dev->ifindex; 2037 rth->fl.iif = in_dev->dev->ifindex;
1997 rth->u.dst.dev = (out_dev)->dev; 2038 rth->dst.dev = (out_dev)->dev;
1998 dev_hold(rth->u.dst.dev); 2039 dev_hold(rth->dst.dev);
1999 rth->idev = in_dev_get(rth->u.dst.dev);
2000 rth->fl.oif = 0; 2040 rth->fl.oif = 0;
2001 rth->rt_spec_dst= spec_dst; 2041 rth->rt_spec_dst= spec_dst;
2002 2042
2003 rth->u.dst.input = ip_forward; 2043 rth->dst.obsolete = -1;
2004 rth->u.dst.output = ip_output; 2044 rth->dst.input = ip_forward;
2005 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); 2045 rth->dst.output = ip_output;
2046 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2006 2047
2007 rt_set_nexthop(rth, res, itag); 2048 rt_set_nexthop(rth, res, itag);
2008 2049
@@ -2011,8 +2052,6 @@ static int __mkroute_input(struct sk_buff *skb,
2011 *result = rth; 2052 *result = rth;
2012 err = 0; 2053 err = 0;
2013 cleanup: 2054 cleanup:
2014 /* release the working reference to the output device */
2015 in_dev_put(out_dev);
2016 return err; 2055 return err;
2017} 2056}
2018 2057
@@ -2038,8 +2077,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
2038 2077
2039 /* put it into the cache */ 2078 /* put it into the cache */
2040 hash = rt_hash(daddr, saddr, fl->iif, 2079 hash = rt_hash(daddr, saddr, fl->iif,
2041 rt_genid(dev_net(rth->u.dst.dev))); 2080 rt_genid(dev_net(rth->dst.dev)));
2042 return rt_intern_hash(hash, rth, NULL, skb); 2081 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2043} 2082}
2044 2083
2045/* 2084/*
@@ -2050,19 +2089,18 @@ static int ip_mkroute_input(struct sk_buff *skb,
2050 * Such approach solves two big problems: 2089 * Such approach solves two big problems:
2051 * 1. Not simplex devices are handled properly. 2090 * 1. Not simplex devices are handled properly.
2052 * 2. IP spoofing attempts are filtered with 100% of guarantee. 2091 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2092 * called with rcu_read_lock()
2053 */ 2093 */
2054 2094
2055static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2095static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2056 u8 tos, struct net_device *dev) 2096 u8 tos, struct net_device *dev)
2057{ 2097{
2058 struct fib_result res; 2098 struct fib_result res;
2059 struct in_device *in_dev = in_dev_get(dev); 2099 struct in_device *in_dev = __in_dev_get_rcu(dev);
2060 struct flowi fl = { .nl_u = { .ip4_u = 2100 struct flowi fl = { .fl4_dst = daddr,
2061 { .daddr = daddr, 2101 .fl4_src = saddr,
2062 .saddr = saddr, 2102 .fl4_tos = tos,
2063 .tos = tos, 2103 .fl4_scope = RT_SCOPE_UNIVERSE,
2064 .scope = RT_SCOPE_UNIVERSE,
2065 } },
2066 .mark = skb->mark, 2104 .mark = skb->mark,
2067 .iif = dev->ifindex }; 2105 .iif = dev->ifindex };
2068 unsigned flags = 0; 2106 unsigned flags = 0;
@@ -2071,7 +2109,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2071 unsigned hash; 2109 unsigned hash;
2072 __be32 spec_dst; 2110 __be32 spec_dst;
2073 int err = -EINVAL; 2111 int err = -EINVAL;
2074 int free_res = 0;
2075 struct net * net = dev_net(dev); 2112 struct net * net = dev_net(dev);
2076 2113
2077 /* IP on this device is disabled. */ 2114 /* IP on this device is disabled. */
@@ -2087,7 +2124,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2087 ipv4_is_loopback(saddr)) 2124 ipv4_is_loopback(saddr))
2088 goto martian_source; 2125 goto martian_source;
2089 2126
2090 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) 2127 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2091 goto brd_input; 2128 goto brd_input;
2092 2129
2093 /* Accept zero addresses only to limited broadcast; 2130 /* Accept zero addresses only to limited broadcast;
@@ -2096,19 +2133,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096 if (ipv4_is_zeronet(saddr)) 2133 if (ipv4_is_zeronet(saddr))
2097 goto martian_source; 2134 goto martian_source;
2098 2135
2099 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || 2136 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2100 ipv4_is_loopback(daddr))
2101 goto martian_destination; 2137 goto martian_destination;
2102 2138
2103 /* 2139 /*
2104 * Now we are ready to route packet. 2140 * Now we are ready to route packet.
2105 */ 2141 */
2106 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2142 err = fib_lookup(net, &fl, &res);
2143 if (err != 0) {
2107 if (!IN_DEV_FORWARD(in_dev)) 2144 if (!IN_DEV_FORWARD(in_dev))
2108 goto e_hostunreach; 2145 goto e_hostunreach;
2109 goto no_route; 2146 goto no_route;
2110 } 2147 }
2111 free_res = 1;
2112 2148
2113 RT_CACHE_STAT_INC(in_slow_tot); 2149 RT_CACHE_STAT_INC(in_slow_tot);
2114 2150
@@ -2116,13 +2152,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2116 goto brd_input; 2152 goto brd_input;
2117 2153
2118 if (res.type == RTN_LOCAL) { 2154 if (res.type == RTN_LOCAL) {
2119 int result; 2155 err = fib_validate_source(saddr, daddr, tos,
2120 result = fib_validate_source(saddr, daddr, tos, 2156 net->loopback_dev->ifindex,
2121 net->loopback_dev->ifindex, 2157 dev, &spec_dst, &itag, skb->mark);
2122 dev, &spec_dst, &itag); 2158 if (err < 0)
2123 if (result < 0) 2159 goto martian_source_keep_err;
2124 goto martian_source; 2160 if (err)
2125 if (result)
2126 flags |= RTCF_DIRECTSRC; 2161 flags |= RTCF_DIRECTSRC;
2127 spec_dst = daddr; 2162 spec_dst = daddr;
2128 goto local_input; 2163 goto local_input;
@@ -2134,10 +2169,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2134 goto martian_destination; 2169 goto martian_destination;
2135 2170
2136 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2171 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2137done:
2138 in_dev_put(in_dev);
2139 if (free_res)
2140 fib_res_put(&res);
2141out: return err; 2172out: return err;
2142 2173
2143brd_input: 2174brd_input:
@@ -2148,9 +2179,9 @@ brd_input:
2148 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2179 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2149 else { 2180 else {
2150 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 2181 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2151 &itag); 2182 &itag, skb->mark);
2152 if (err < 0) 2183 if (err < 0)
2153 goto martian_source; 2184 goto martian_source_keep_err;
2154 if (err) 2185 if (err)
2155 flags |= RTCF_DIRECTSRC; 2186 flags |= RTCF_DIRECTSRC;
2156 } 2187 }
@@ -2163,13 +2194,14 @@ local_input:
2163 if (!rth) 2194 if (!rth)
2164 goto e_nobufs; 2195 goto e_nobufs;
2165 2196
2166 rth->u.dst.output= ip_rt_bug; 2197 rth->dst.output= ip_rt_bug;
2198 rth->dst.obsolete = -1;
2167 rth->rt_genid = rt_genid(net); 2199 rth->rt_genid = rt_genid(net);
2168 2200
2169 atomic_set(&rth->u.dst.__refcnt, 1); 2201 atomic_set(&rth->dst.__refcnt, 1);
2170 rth->u.dst.flags= DST_HOST; 2202 rth->dst.flags= DST_HOST;
2171 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2203 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2172 rth->u.dst.flags |= DST_NOPOLICY; 2204 rth->dst.flags |= DST_NOPOLICY;
2173 rth->fl.fl4_dst = daddr; 2205 rth->fl.fl4_dst = daddr;
2174 rth->rt_dst = daddr; 2206 rth->rt_dst = daddr;
2175 rth->fl.fl4_tos = tos; 2207 rth->fl.fl4_tos = tos;
@@ -2177,26 +2209,25 @@ local_input:
2177 rth->fl.fl4_src = saddr; 2209 rth->fl.fl4_src = saddr;
2178 rth->rt_src = saddr; 2210 rth->rt_src = saddr;
2179#ifdef CONFIG_NET_CLS_ROUTE 2211#ifdef CONFIG_NET_CLS_ROUTE
2180 rth->u.dst.tclassid = itag; 2212 rth->dst.tclassid = itag;
2181#endif 2213#endif
2182 rth->rt_iif = 2214 rth->rt_iif =
2183 rth->fl.iif = dev->ifindex; 2215 rth->fl.iif = dev->ifindex;
2184 rth->u.dst.dev = net->loopback_dev; 2216 rth->dst.dev = net->loopback_dev;
2185 dev_hold(rth->u.dst.dev); 2217 dev_hold(rth->dst.dev);
2186 rth->idev = in_dev_get(rth->u.dst.dev);
2187 rth->rt_gateway = daddr; 2218 rth->rt_gateway = daddr;
2188 rth->rt_spec_dst= spec_dst; 2219 rth->rt_spec_dst= spec_dst;
2189 rth->u.dst.input= ip_local_deliver; 2220 rth->dst.input= ip_local_deliver;
2190 rth->rt_flags = flags|RTCF_LOCAL; 2221 rth->rt_flags = flags|RTCF_LOCAL;
2191 if (res.type == RTN_UNREACHABLE) { 2222 if (res.type == RTN_UNREACHABLE) {
2192 rth->u.dst.input= ip_error; 2223 rth->dst.input= ip_error;
2193 rth->u.dst.error= -err; 2224 rth->dst.error= -err;
2194 rth->rt_flags &= ~RTCF_LOCAL; 2225 rth->rt_flags &= ~RTCF_LOCAL;
2195 } 2226 }
2196 rth->rt_type = res.type; 2227 rth->rt_type = res.type;
2197 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2228 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2198 err = rt_intern_hash(hash, rth, NULL, skb); 2229 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2199 goto done; 2230 goto out;
2200 2231
2201no_route: 2232no_route:
2202 RT_CACHE_STAT_INC(in_no_route); 2233 RT_CACHE_STAT_INC(in_no_route);
@@ -2219,57 +2250,65 @@ martian_destination:
2219 2250
2220e_hostunreach: 2251e_hostunreach:
2221 err = -EHOSTUNREACH; 2252 err = -EHOSTUNREACH;
2222 goto done; 2253 goto out;
2223 2254
2224e_inval: 2255e_inval:
2225 err = -EINVAL; 2256 err = -EINVAL;
2226 goto done; 2257 goto out;
2227 2258
2228e_nobufs: 2259e_nobufs:
2229 err = -ENOBUFS; 2260 err = -ENOBUFS;
2230 goto done; 2261 goto out;
2231 2262
2232martian_source: 2263martian_source:
2264 err = -EINVAL;
2265martian_source_keep_err:
2233 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2266 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2234 goto e_inval; 2267 goto out;
2235} 2268}
2236 2269
2237int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2270int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2238 u8 tos, struct net_device *dev) 2271 u8 tos, struct net_device *dev, bool noref)
2239{ 2272{
2240 struct rtable * rth; 2273 struct rtable * rth;
2241 unsigned hash; 2274 unsigned hash;
2242 int iif = dev->ifindex; 2275 int iif = dev->ifindex;
2243 struct net *net; 2276 struct net *net;
2277 int res;
2244 2278
2245 net = dev_net(dev); 2279 net = dev_net(dev);
2246 2280
2281 rcu_read_lock();
2282
2247 if (!rt_caching(net)) 2283 if (!rt_caching(net))
2248 goto skip_cache; 2284 goto skip_cache;
2249 2285
2250 tos &= IPTOS_RT_MASK; 2286 tos &= IPTOS_RT_MASK;
2251 hash = rt_hash(daddr, saddr, iif, rt_genid(net)); 2287 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2252 2288
2253 rcu_read_lock();
2254 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2289 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2255 rth = rcu_dereference(rth->u.dst.rt_next)) { 2290 rth = rcu_dereference(rth->dst.rt_next)) {
2256 if (((rth->fl.fl4_dst ^ daddr) | 2291 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2257 (rth->fl.fl4_src ^ saddr) | 2292 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2258 (rth->fl.iif ^ iif) | 2293 (rth->fl.iif ^ iif) |
2259 rth->fl.oif | 2294 rth->fl.oif |
2260 (rth->fl.fl4_tos ^ tos)) == 0 && 2295 (rth->fl.fl4_tos ^ tos)) == 0 &&
2261 rth->fl.mark == skb->mark && 2296 rth->fl.mark == skb->mark &&
2262 net_eq(dev_net(rth->u.dst.dev), net) && 2297 net_eq(dev_net(rth->dst.dev), net) &&
2263 !rt_is_expired(rth)) { 2298 !rt_is_expired(rth)) {
2264 dst_use(&rth->u.dst, jiffies); 2299 if (noref) {
2300 dst_use_noref(&rth->dst, jiffies);
2301 skb_dst_set_noref(skb, &rth->dst);
2302 } else {
2303 dst_use(&rth->dst, jiffies);
2304 skb_dst_set(skb, &rth->dst);
2305 }
2265 RT_CACHE_STAT_INC(in_hit); 2306 RT_CACHE_STAT_INC(in_hit);
2266 rcu_read_unlock(); 2307 rcu_read_unlock();
2267 skb_dst_set(skb, &rth->u.dst);
2268 return 0; 2308 return 0;
2269 } 2309 }
2270 RT_CACHE_STAT_INC(in_hlist_search); 2310 RT_CACHE_STAT_INC(in_hlist_search);
2271 } 2311 }
2272 rcu_read_unlock();
2273 2312
2274skip_cache: 2313skip_cache:
2275 /* Multicast recognition logic is moved from route cache to here. 2314 /* Multicast recognition logic is moved from route cache to here.
@@ -2284,29 +2323,34 @@ skip_cache:
2284 route cache entry is created eventually. 2323 route cache entry is created eventually.
2285 */ 2324 */
2286 if (ipv4_is_multicast(daddr)) { 2325 if (ipv4_is_multicast(daddr)) {
2287 struct in_device *in_dev; 2326 struct in_device *in_dev = __in_dev_get_rcu(dev);
2288 2327
2289 rcu_read_lock(); 2328 if (in_dev) {
2290 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2291 int our = ip_check_mc(in_dev, daddr, saddr, 2329 int our = ip_check_mc(in_dev, daddr, saddr,
2292 ip_hdr(skb)->protocol); 2330 ip_hdr(skb)->protocol);
2293 if (our 2331 if (our
2294#ifdef CONFIG_IP_MROUTE 2332#ifdef CONFIG_IP_MROUTE
2295 || (!ipv4_is_local_multicast(daddr) && 2333 ||
2296 IN_DEV_MFORWARD(in_dev)) 2334 (!ipv4_is_local_multicast(daddr) &&
2335 IN_DEV_MFORWARD(in_dev))
2297#endif 2336#endif
2298 ) { 2337 ) {
2338 int res = ip_route_input_mc(skb, daddr, saddr,
2339 tos, dev, our);
2299 rcu_read_unlock(); 2340 rcu_read_unlock();
2300 return ip_route_input_mc(skb, daddr, saddr, 2341 return res;
2301 tos, dev, our);
2302 } 2342 }
2303 } 2343 }
2304 rcu_read_unlock(); 2344 rcu_read_unlock();
2305 return -EINVAL; 2345 return -EINVAL;
2306 } 2346 }
2307 return ip_route_input_slow(skb, daddr, saddr, tos, dev); 2347 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2348 rcu_read_unlock();
2349 return res;
2308} 2350}
2351EXPORT_SYMBOL(ip_route_input_common);
2309 2352
2353/* called with rcu_read_lock() */
2310static int __mkroute_output(struct rtable **result, 2354static int __mkroute_output(struct rtable **result,
2311 struct fib_result *res, 2355 struct fib_result *res,
2312 const struct flowi *fl, 2356 const struct flowi *fl,
@@ -2317,60 +2361,51 @@ static int __mkroute_output(struct rtable **result,
2317 struct rtable *rth; 2361 struct rtable *rth;
2318 struct in_device *in_dev; 2362 struct in_device *in_dev;
2319 u32 tos = RT_FL_TOS(oldflp); 2363 u32 tos = RT_FL_TOS(oldflp);
2320 int err = 0;
2321 2364
2322 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2365 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2323 return -EINVAL; 2366 return -EINVAL;
2324 2367
2325 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2368 if (ipv4_is_lbcast(fl->fl4_dst))
2326 res->type = RTN_BROADCAST; 2369 res->type = RTN_BROADCAST;
2327 else if (ipv4_is_multicast(fl->fl4_dst)) 2370 else if (ipv4_is_multicast(fl->fl4_dst))
2328 res->type = RTN_MULTICAST; 2371 res->type = RTN_MULTICAST;
2329 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) 2372 else if (ipv4_is_zeronet(fl->fl4_dst))
2330 return -EINVAL; 2373 return -EINVAL;
2331 2374
2332 if (dev_out->flags & IFF_LOOPBACK) 2375 if (dev_out->flags & IFF_LOOPBACK)
2333 flags |= RTCF_LOCAL; 2376 flags |= RTCF_LOCAL;
2334 2377
2335 /* get work reference to inet device */ 2378 in_dev = __in_dev_get_rcu(dev_out);
2336 in_dev = in_dev_get(dev_out);
2337 if (!in_dev) 2379 if (!in_dev)
2338 return -EINVAL; 2380 return -EINVAL;
2339 2381
2340 if (res->type == RTN_BROADCAST) { 2382 if (res->type == RTN_BROADCAST) {
2341 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2383 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2342 if (res->fi) { 2384 res->fi = NULL;
2343 fib_info_put(res->fi);
2344 res->fi = NULL;
2345 }
2346 } else if (res->type == RTN_MULTICAST) { 2385 } else if (res->type == RTN_MULTICAST) {
2347 flags |= RTCF_MULTICAST|RTCF_LOCAL; 2386 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2348 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2387 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2349 oldflp->proto)) 2388 oldflp->proto))
2350 flags &= ~RTCF_LOCAL; 2389 flags &= ~RTCF_LOCAL;
2351 /* If multicast route do not exist use 2390 /* If multicast route do not exist use
2352 default one, but do not gateway in this case. 2391 * default one, but do not gateway in this case.
2353 Yes, it is hack. 2392 * Yes, it is hack.
2354 */ 2393 */
2355 if (res->fi && res->prefixlen < 4) { 2394 if (res->fi && res->prefixlen < 4)
2356 fib_info_put(res->fi);
2357 res->fi = NULL; 2395 res->fi = NULL;
2358 }
2359 } 2396 }
2360 2397
2361 2398
2362 rth = dst_alloc(&ipv4_dst_ops); 2399 rth = dst_alloc(&ipv4_dst_ops);
2363 if (!rth) { 2400 if (!rth)
2364 err = -ENOBUFS; 2401 return -ENOBUFS;
2365 goto cleanup;
2366 }
2367 2402
2368 atomic_set(&rth->u.dst.__refcnt, 1); 2403 atomic_set(&rth->dst.__refcnt, 1);
2369 rth->u.dst.flags= DST_HOST; 2404 rth->dst.flags= DST_HOST;
2370 if (IN_DEV_CONF_GET(in_dev, NOXFRM)) 2405 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2371 rth->u.dst.flags |= DST_NOXFRM; 2406 rth->dst.flags |= DST_NOXFRM;
2372 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2407 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2373 rth->u.dst.flags |= DST_NOPOLICY; 2408 rth->dst.flags |= DST_NOPOLICY;
2374 2409
2375 rth->fl.fl4_dst = oldflp->fl4_dst; 2410 rth->fl.fl4_dst = oldflp->fl4_dst;
2376 rth->fl.fl4_tos = tos; 2411 rth->fl.fl4_tos = tos;
@@ -2382,34 +2417,34 @@ static int __mkroute_output(struct rtable **result,
2382 rth->rt_iif = oldflp->oif ? : dev_out->ifindex; 2417 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2383 /* get references to the devices that are to be hold by the routing 2418 /* get references to the devices that are to be hold by the routing
2384 cache entry */ 2419 cache entry */
2385 rth->u.dst.dev = dev_out; 2420 rth->dst.dev = dev_out;
2386 dev_hold(dev_out); 2421 dev_hold(dev_out);
2387 rth->idev = in_dev_get(dev_out);
2388 rth->rt_gateway = fl->fl4_dst; 2422 rth->rt_gateway = fl->fl4_dst;
2389 rth->rt_spec_dst= fl->fl4_src; 2423 rth->rt_spec_dst= fl->fl4_src;
2390 2424
2391 rth->u.dst.output=ip_output; 2425 rth->dst.output=ip_output;
2426 rth->dst.obsolete = -1;
2392 rth->rt_genid = rt_genid(dev_net(dev_out)); 2427 rth->rt_genid = rt_genid(dev_net(dev_out));
2393 2428
2394 RT_CACHE_STAT_INC(out_slow_tot); 2429 RT_CACHE_STAT_INC(out_slow_tot);
2395 2430
2396 if (flags & RTCF_LOCAL) { 2431 if (flags & RTCF_LOCAL) {
2397 rth->u.dst.input = ip_local_deliver; 2432 rth->dst.input = ip_local_deliver;
2398 rth->rt_spec_dst = fl->fl4_dst; 2433 rth->rt_spec_dst = fl->fl4_dst;
2399 } 2434 }
2400 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2435 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2401 rth->rt_spec_dst = fl->fl4_src; 2436 rth->rt_spec_dst = fl->fl4_src;
2402 if (flags & RTCF_LOCAL && 2437 if (flags & RTCF_LOCAL &&
2403 !(dev_out->flags & IFF_LOOPBACK)) { 2438 !(dev_out->flags & IFF_LOOPBACK)) {
2404 rth->u.dst.output = ip_mc_output; 2439 rth->dst.output = ip_mc_output;
2405 RT_CACHE_STAT_INC(out_slow_mc); 2440 RT_CACHE_STAT_INC(out_slow_mc);
2406 } 2441 }
2407#ifdef CONFIG_IP_MROUTE 2442#ifdef CONFIG_IP_MROUTE
2408 if (res->type == RTN_MULTICAST) { 2443 if (res->type == RTN_MULTICAST) {
2409 if (IN_DEV_MFORWARD(in_dev) && 2444 if (IN_DEV_MFORWARD(in_dev) &&
2410 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2445 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2411 rth->u.dst.input = ip_mr_input; 2446 rth->dst.input = ip_mr_input;
2412 rth->u.dst.output = ip_mc_output; 2447 rth->dst.output = ip_mc_output;
2413 } 2448 }
2414 } 2449 }
2415#endif 2450#endif
@@ -2418,15 +2453,11 @@ static int __mkroute_output(struct rtable **result,
2418 rt_set_nexthop(rth, res, 0); 2453 rt_set_nexthop(rth, res, 0);
2419 2454
2420 rth->rt_flags = flags; 2455 rth->rt_flags = flags;
2421
2422 *result = rth; 2456 *result = rth;
2423 cleanup: 2457 return 0;
2424 /* release work reference to inet device */
2425 in_dev_put(in_dev);
2426
2427 return err;
2428} 2458}
2429 2459
2460/* called with rcu_read_lock() */
2430static int ip_mkroute_output(struct rtable **rp, 2461static int ip_mkroute_output(struct rtable **rp,
2431 struct fib_result *res, 2462 struct fib_result *res,
2432 const struct flowi *fl, 2463 const struct flowi *fl,
@@ -2440,7 +2471,7 @@ static int ip_mkroute_output(struct rtable **rp,
2440 if (err == 0) { 2471 if (err == 0) {
2441 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, 2472 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2442 rt_genid(dev_net(dev_out))); 2473 rt_genid(dev_net(dev_out)));
2443 err = rt_intern_hash(hash, rth, rp, NULL); 2474 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2444 } 2475 }
2445 2476
2446 return err; 2477 return err;
@@ -2448,27 +2479,24 @@ static int ip_mkroute_output(struct rtable **rp,
2448 2479
2449/* 2480/*
2450 * Major route resolver routine. 2481 * Major route resolver routine.
2482 * called with rcu_read_lock();
2451 */ 2483 */
2452 2484
2453static int ip_route_output_slow(struct net *net, struct rtable **rp, 2485static int ip_route_output_slow(struct net *net, struct rtable **rp,
2454 const struct flowi *oldflp) 2486 const struct flowi *oldflp)
2455{ 2487{
2456 u32 tos = RT_FL_TOS(oldflp); 2488 u32 tos = RT_FL_TOS(oldflp);
2457 struct flowi fl = { .nl_u = { .ip4_u = 2489 struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2458 { .daddr = oldflp->fl4_dst, 2490 .fl4_src = oldflp->fl4_src,
2459 .saddr = oldflp->fl4_src, 2491 .fl4_tos = tos & IPTOS_RT_MASK,
2460 .tos = tos & IPTOS_RT_MASK, 2492 .fl4_scope = ((tos & RTO_ONLINK) ?
2461 .scope = ((tos & RTO_ONLINK) ? 2493 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2462 RT_SCOPE_LINK :
2463 RT_SCOPE_UNIVERSE),
2464 } },
2465 .mark = oldflp->mark, 2494 .mark = oldflp->mark,
2466 .iif = net->loopback_dev->ifindex, 2495 .iif = net->loopback_dev->ifindex,
2467 .oif = oldflp->oif }; 2496 .oif = oldflp->oif };
2468 struct fib_result res; 2497 struct fib_result res;
2469 unsigned flags = 0; 2498 unsigned int flags = 0;
2470 struct net_device *dev_out = NULL; 2499 struct net_device *dev_out = NULL;
2471 int free_res = 0;
2472 int err; 2500 int err;
2473 2501
2474 2502
@@ -2492,11 +2520,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2492 of another iface. --ANK 2520 of another iface. --ANK
2493 */ 2521 */
2494 2522
2495 if (oldflp->oif == 0 2523 if (oldflp->oif == 0 &&
2496 && (ipv4_is_multicast(oldflp->fl4_dst) || 2524 (ipv4_is_multicast(oldflp->fl4_dst) ||
2497 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2525 ipv4_is_lbcast(oldflp->fl4_dst))) {
2498 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2526 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2499 dev_out = ip_dev_find(net, oldflp->fl4_src); 2527 dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2500 if (dev_out == NULL) 2528 if (dev_out == NULL)
2501 goto out; 2529 goto out;
2502 2530
@@ -2521,29 +2549,25 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2521 2549
2522 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2550 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2523 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2551 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2524 dev_out = ip_dev_find(net, oldflp->fl4_src); 2552 if (!__ip_dev_find(net, oldflp->fl4_src, false))
2525 if (dev_out == NULL)
2526 goto out; 2553 goto out;
2527 dev_put(dev_out);
2528 dev_out = NULL;
2529 } 2554 }
2530 } 2555 }
2531 2556
2532 2557
2533 if (oldflp->oif) { 2558 if (oldflp->oif) {
2534 dev_out = dev_get_by_index(net, oldflp->oif); 2559 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2535 err = -ENODEV; 2560 err = -ENODEV;
2536 if (dev_out == NULL) 2561 if (dev_out == NULL)
2537 goto out; 2562 goto out;
2538 2563
2539 /* RACE: Check return value of inet_select_addr instead. */ 2564 /* RACE: Check return value of inet_select_addr instead. */
2540 if (__in_dev_get_rtnl(dev_out) == NULL) { 2565 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2541 dev_put(dev_out); 2566 err = -ENETUNREACH;
2542 goto out; /* Wrong error code */ 2567 goto out;
2543 } 2568 }
2544
2545 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2569 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2546 oldflp->fl4_dst == htonl(0xFFFFFFFF)) { 2570 ipv4_is_lbcast(oldflp->fl4_dst)) {
2547 if (!fl.fl4_src) 2571 if (!fl.fl4_src)
2548 fl.fl4_src = inet_select_addr(dev_out, 0, 2572 fl.fl4_src = inet_select_addr(dev_out, 0,
2549 RT_SCOPE_LINK); 2573 RT_SCOPE_LINK);
@@ -2563,10 +2587,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2563 fl.fl4_dst = fl.fl4_src; 2587 fl.fl4_dst = fl.fl4_src;
2564 if (!fl.fl4_dst) 2588 if (!fl.fl4_dst)
2565 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2589 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2566 if (dev_out)
2567 dev_put(dev_out);
2568 dev_out = net->loopback_dev; 2590 dev_out = net->loopback_dev;
2569 dev_hold(dev_out);
2570 fl.oif = net->loopback_dev->ifindex; 2591 fl.oif = net->loopback_dev->ifindex;
2571 res.type = RTN_LOCAL; 2592 res.type = RTN_LOCAL;
2572 flags |= RTCF_LOCAL; 2593 flags |= RTCF_LOCAL;
@@ -2600,23 +2621,19 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2600 res.type = RTN_UNICAST; 2621 res.type = RTN_UNICAST;
2601 goto make_route; 2622 goto make_route;
2602 } 2623 }
2603 if (dev_out)
2604 dev_put(dev_out);
2605 err = -ENETUNREACH; 2624 err = -ENETUNREACH;
2606 goto out; 2625 goto out;
2607 } 2626 }
2608 free_res = 1;
2609 2627
2610 if (res.type == RTN_LOCAL) { 2628 if (res.type == RTN_LOCAL) {
2611 if (!fl.fl4_src) 2629 if (!fl.fl4_src) {
2612 fl.fl4_src = fl.fl4_dst; 2630 if (res.fi->fib_prefsrc)
2613 if (dev_out) 2631 fl.fl4_src = res.fi->fib_prefsrc;
2614 dev_put(dev_out); 2632 else
2633 fl.fl4_src = fl.fl4_dst;
2634 }
2615 dev_out = net->loopback_dev; 2635 dev_out = net->loopback_dev;
2616 dev_hold(dev_out);
2617 fl.oif = dev_out->ifindex; 2636 fl.oif = dev_out->ifindex;
2618 if (res.fi)
2619 fib_info_put(res.fi);
2620 res.fi = NULL; 2637 res.fi = NULL;
2621 flags |= RTCF_LOCAL; 2638 flags |= RTCF_LOCAL;
2622 goto make_route; 2639 goto make_route;
@@ -2633,28 +2650,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2633 if (!fl.fl4_src) 2650 if (!fl.fl4_src)
2634 fl.fl4_src = FIB_RES_PREFSRC(res); 2651 fl.fl4_src = FIB_RES_PREFSRC(res);
2635 2652
2636 if (dev_out)
2637 dev_put(dev_out);
2638 dev_out = FIB_RES_DEV(res); 2653 dev_out = FIB_RES_DEV(res);
2639 dev_hold(dev_out);
2640 fl.oif = dev_out->ifindex; 2654 fl.oif = dev_out->ifindex;
2641 2655
2642 2656
2643make_route: 2657make_route:
2644 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2658 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2645 2659
2646
2647 if (free_res)
2648 fib_res_put(&res);
2649 if (dev_out)
2650 dev_put(dev_out);
2651out: return err; 2660out: return err;
2652} 2661}
2653 2662
2654int __ip_route_output_key(struct net *net, struct rtable **rp, 2663int __ip_route_output_key(struct net *net, struct rtable **rp,
2655 const struct flowi *flp) 2664 const struct flowi *flp)
2656{ 2665{
2657 unsigned hash; 2666 unsigned int hash;
2667 int res;
2658 struct rtable *rth; 2668 struct rtable *rth;
2659 2669
2660 if (!rt_caching(net)) 2670 if (!rt_caching(net))
@@ -2663,18 +2673,18 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2663 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2673 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2664 2674
2665 rcu_read_lock_bh(); 2675 rcu_read_lock_bh();
2666 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2676 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2667 rth = rcu_dereference(rth->u.dst.rt_next)) { 2677 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2668 if (rth->fl.fl4_dst == flp->fl4_dst && 2678 if (rth->fl.fl4_dst == flp->fl4_dst &&
2669 rth->fl.fl4_src == flp->fl4_src && 2679 rth->fl.fl4_src == flp->fl4_src &&
2670 rth->fl.iif == 0 && 2680 rt_is_output_route(rth) &&
2671 rth->fl.oif == flp->oif && 2681 rth->fl.oif == flp->oif &&
2672 rth->fl.mark == flp->mark && 2682 rth->fl.mark == flp->mark &&
2673 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2683 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2674 (IPTOS_RT_MASK | RTO_ONLINK)) && 2684 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2675 net_eq(dev_net(rth->u.dst.dev), net) && 2685 net_eq(dev_net(rth->dst.dev), net) &&
2676 !rt_is_expired(rth)) { 2686 !rt_is_expired(rth)) {
2677 dst_use(&rth->u.dst, jiffies); 2687 dst_use(&rth->dst, jiffies);
2678 RT_CACHE_STAT_INC(out_hit); 2688 RT_CACHE_STAT_INC(out_hit);
2679 rcu_read_unlock_bh(); 2689 rcu_read_unlock_bh();
2680 *rp = rth; 2690 *rp = rth;
@@ -2685,11 +2695,18 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2685 rcu_read_unlock_bh(); 2695 rcu_read_unlock_bh();
2686 2696
2687slow_output: 2697slow_output:
2688 return ip_route_output_slow(net, rp, flp); 2698 rcu_read_lock();
2699 res = ip_route_output_slow(net, rp, flp);
2700 rcu_read_unlock();
2701 return res;
2689} 2702}
2690
2691EXPORT_SYMBOL_GPL(__ip_route_output_key); 2703EXPORT_SYMBOL_GPL(__ip_route_output_key);
2692 2704
2705static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2706{
2707 return NULL;
2708}
2709
2693static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2710static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2694{ 2711{
2695} 2712}
@@ -2698,9 +2715,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2698 .family = AF_INET, 2715 .family = AF_INET,
2699 .protocol = cpu_to_be16(ETH_P_IP), 2716 .protocol = cpu_to_be16(ETH_P_IP),
2700 .destroy = ipv4_dst_destroy, 2717 .destroy = ipv4_dst_destroy,
2701 .check = ipv4_dst_check, 2718 .check = ipv4_blackhole_dst_check,
2702 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2719 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2703 .entries = ATOMIC_INIT(0),
2704}; 2720};
2705 2721
2706 2722
@@ -2711,23 +2727,20 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2711 dst_alloc(&ipv4_dst_blackhole_ops); 2727 dst_alloc(&ipv4_dst_blackhole_ops);
2712 2728
2713 if (rt) { 2729 if (rt) {
2714 struct dst_entry *new = &rt->u.dst; 2730 struct dst_entry *new = &rt->dst;
2715 2731
2716 atomic_set(&new->__refcnt, 1); 2732 atomic_set(&new->__refcnt, 1);
2717 new->__use = 1; 2733 new->__use = 1;
2718 new->input = dst_discard; 2734 new->input = dst_discard;
2719 new->output = dst_discard; 2735 new->output = dst_discard;
2720 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); 2736 dst_copy_metrics(new, &ort->dst);
2721 2737
2722 new->dev = ort->u.dst.dev; 2738 new->dev = ort->dst.dev;
2723 if (new->dev) 2739 if (new->dev)
2724 dev_hold(new->dev); 2740 dev_hold(new->dev);
2725 2741
2726 rt->fl = ort->fl; 2742 rt->fl = ort->fl;
2727 2743
2728 rt->idev = ort->idev;
2729 if (rt->idev)
2730 in_dev_hold(rt->idev);
2731 rt->rt_genid = rt_genid(net); 2744 rt->rt_genid = rt_genid(net);
2732 rt->rt_flags = ort->rt_flags; 2745 rt->rt_flags = ort->rt_flags;
2733 rt->rt_type = ort->rt_type; 2746 rt->rt_type = ort->rt_type;
@@ -2743,9 +2756,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2743 dst_free(new); 2756 dst_free(new);
2744 } 2757 }
2745 2758
2746 dst_release(&(*rp)->u.dst); 2759 dst_release(&(*rp)->dst);
2747 *rp = rt; 2760 *rp = rt;
2748 return (rt ? 0 : -ENOMEM); 2761 return rt ? 0 : -ENOMEM;
2749} 2762}
2750 2763
2751int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2764int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -2771,13 +2784,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2771 2784
2772 return 0; 2785 return 0;
2773} 2786}
2774
2775EXPORT_SYMBOL_GPL(ip_route_output_flow); 2787EXPORT_SYMBOL_GPL(ip_route_output_flow);
2776 2788
2777int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) 2789int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2778{ 2790{
2779 return ip_route_output_flow(net, rp, flp, NULL, 0); 2791 return ip_route_output_flow(net, rp, flp, NULL, 0);
2780} 2792}
2793EXPORT_SYMBOL(ip_route_output_key);
2781 2794
2782static int rt_fill_info(struct net *net, 2795static int rt_fill_info(struct net *net,
2783 struct sk_buff *skb, u32 pid, u32 seq, int event, 2796 struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2813,13 +2826,13 @@ static int rt_fill_info(struct net *net,
2813 r->rtm_src_len = 32; 2826 r->rtm_src_len = 32;
2814 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2827 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2815 } 2828 }
2816 if (rt->u.dst.dev) 2829 if (rt->dst.dev)
2817 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); 2830 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2818#ifdef CONFIG_NET_CLS_ROUTE 2831#ifdef CONFIG_NET_CLS_ROUTE
2819 if (rt->u.dst.tclassid) 2832 if (rt->dst.tclassid)
2820 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); 2833 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2821#endif 2834#endif
2822 if (rt->fl.iif) 2835 if (rt_is_input_route(rt))
2823 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2836 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2824 else if (rt->rt_src != rt->fl.fl4_src) 2837 else if (rt->rt_src != rt->fl.fl4_src)
2825 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2838 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
@@ -2827,20 +2840,24 @@ static int rt_fill_info(struct net *net,
2827 if (rt->rt_dst != rt->rt_gateway) 2840 if (rt->rt_dst != rt->rt_gateway)
2828 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2841 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2829 2842
2830 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) 2843 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2831 goto nla_put_failure; 2844 goto nla_put_failure;
2832 2845
2833 error = rt->u.dst.error; 2846 if (rt->fl.mark)
2834 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; 2847 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2848
2849 error = rt->dst.error;
2850 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2835 if (rt->peer) { 2851 if (rt->peer) {
2836 id = rt->peer->ip_id_count; 2852 inet_peer_refcheck(rt->peer);
2853 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2837 if (rt->peer->tcp_ts_stamp) { 2854 if (rt->peer->tcp_ts_stamp) {
2838 ts = rt->peer->tcp_ts; 2855 ts = rt->peer->tcp_ts;
2839 tsage = get_seconds() - rt->peer->tcp_ts_stamp; 2856 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2840 } 2857 }
2841 } 2858 }
2842 2859
2843 if (rt->fl.iif) { 2860 if (rt_is_input_route(rt)) {
2844#ifdef CONFIG_IP_MROUTE 2861#ifdef CONFIG_IP_MROUTE
2845 __be32 dst = rt->rt_dst; 2862 __be32 dst = rt->rt_dst;
2846 2863
@@ -2863,7 +2880,7 @@ static int rt_fill_info(struct net *net,
2863 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2880 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2864 } 2881 }
2865 2882
2866 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, 2883 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2867 expires, error) < 0) 2884 expires, error) < 0)
2868 goto nla_put_failure; 2885 goto nla_put_failure;
2869 2886
@@ -2884,6 +2901,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2884 __be32 src = 0; 2901 __be32 src = 0;
2885 u32 iif; 2902 u32 iif;
2886 int err; 2903 int err;
2904 int mark;
2887 struct sk_buff *skb; 2905 struct sk_buff *skb;
2888 2906
2889 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); 2907 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
@@ -2911,6 +2929,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2911 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; 2929 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2912 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; 2930 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2913 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2931 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2932 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2914 2933
2915 if (iif) { 2934 if (iif) {
2916 struct net_device *dev; 2935 struct net_device *dev;
@@ -2923,23 +2942,21 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2923 2942
2924 skb->protocol = htons(ETH_P_IP); 2943 skb->protocol = htons(ETH_P_IP);
2925 skb->dev = dev; 2944 skb->dev = dev;
2945 skb->mark = mark;
2926 local_bh_disable(); 2946 local_bh_disable();
2927 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 2947 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2928 local_bh_enable(); 2948 local_bh_enable();
2929 2949
2930 rt = skb_rtable(skb); 2950 rt = skb_rtable(skb);
2931 if (err == 0 && rt->u.dst.error) 2951 if (err == 0 && rt->dst.error)
2932 err = -rt->u.dst.error; 2952 err = -rt->dst.error;
2933 } else { 2953 } else {
2934 struct flowi fl = { 2954 struct flowi fl = {
2935 .nl_u = { 2955 .fl4_dst = dst,
2936 .ip4_u = { 2956 .fl4_src = src,
2937 .daddr = dst, 2957 .fl4_tos = rtm->rtm_tos,
2938 .saddr = src,
2939 .tos = rtm->rtm_tos,
2940 },
2941 },
2942 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, 2958 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2959 .mark = mark,
2943 }; 2960 };
2944 err = ip_route_output_key(net, &rt, &fl); 2961 err = ip_route_output_key(net, &rt, &fl);
2945 } 2962 }
@@ -2947,7 +2964,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2947 if (err) 2964 if (err)
2948 goto errout_free; 2965 goto errout_free;
2949 2966
2950 skb_dst_set(skb, &rt->u.dst); 2967 skb_dst_set(skb, &rt->dst);
2951 if (rtm->rtm_flags & RTM_F_NOTIFY) 2968 if (rtm->rtm_flags & RTM_F_NOTIFY)
2952 rt->rt_flags |= RTCF_NOTIFY; 2969 rt->rt_flags |= RTCF_NOTIFY;
2953 2970
@@ -2982,13 +2999,13 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2982 if (!rt_hash_table[h].chain) 2999 if (!rt_hash_table[h].chain)
2983 continue; 3000 continue;
2984 rcu_read_lock_bh(); 3001 rcu_read_lock_bh();
2985 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; 3002 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
2986 rt = rcu_dereference(rt->u.dst.rt_next), idx++) { 3003 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
2987 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) 3004 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
2988 continue; 3005 continue;
2989 if (rt_is_expired(rt)) 3006 if (rt_is_expired(rt))
2990 continue; 3007 continue;
2991 skb_dst_set(skb, dst_clone(&rt->u.dst)); 3008 skb_dst_set_noref(skb, &rt->dst);
2992 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, 3009 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
2993 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 3010 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2994 1, NLM_F_MULTI) <= 0) { 3011 1, NLM_F_MULTI) <= 0) {
@@ -3014,7 +3031,7 @@ void ip_rt_multicast_event(struct in_device *in_dev)
3014 3031
3015#ifdef CONFIG_SYSCTL 3032#ifdef CONFIG_SYSCTL
3016static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, 3033static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3017 struct file *filp, void __user *buffer, 3034 void __user *buffer,
3018 size_t *lenp, loff_t *ppos) 3035 size_t *lenp, loff_t *ppos)
3019{ 3036{
3020 if (write) { 3037 if (write) {
@@ -3024,7 +3041,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3024 3041
3025 memcpy(&ctl, __ctl, sizeof(ctl)); 3042 memcpy(&ctl, __ctl, sizeof(ctl));
3026 ctl.data = &flush_delay; 3043 ctl.data = &flush_delay;
3027 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); 3044 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3028 3045
3029 net = (struct net *)__ctl->extra1; 3046 net = (struct net *)__ctl->extra1;
3030 rt_cache_flush(net, flush_delay); 3047 rt_cache_flush(net, flush_delay);
@@ -3034,85 +3051,8 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3034 return -EINVAL; 3051 return -EINVAL;
3035} 3052}
3036 3053
3037static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
3038 void __user *oldval,
3039 size_t __user *oldlenp,
3040 void __user *newval,
3041 size_t newlen)
3042{
3043 int delay;
3044 struct net *net;
3045 if (newlen != sizeof(int))
3046 return -EINVAL;
3047 if (get_user(delay, (int __user *)newval))
3048 return -EFAULT;
3049 net = (struct net *)table->extra1;
3050 rt_cache_flush(net, delay);
3051 return 0;
3052}
3053
3054static void rt_secret_reschedule(int old)
3055{
3056 struct net *net;
3057 int new = ip_rt_secret_interval;
3058 int diff = new - old;
3059
3060 if (!diff)
3061 return;
3062
3063 rtnl_lock();
3064 for_each_net(net) {
3065 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3066
3067 if (!new)
3068 continue;
3069
3070 if (deleted) {
3071 long time = net->ipv4.rt_secret_timer.expires - jiffies;
3072
3073 if (time <= 0 || (time += diff) <= 0)
3074 time = 0;
3075
3076 net->ipv4.rt_secret_timer.expires = time;
3077 } else
3078 net->ipv4.rt_secret_timer.expires = new;
3079
3080 net->ipv4.rt_secret_timer.expires += jiffies;
3081 add_timer(&net->ipv4.rt_secret_timer);
3082 }
3083 rtnl_unlock();
3084}
3085
3086static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3087 struct file *filp,
3088 void __user *buffer, size_t *lenp,
3089 loff_t *ppos)
3090{
3091 int old = ip_rt_secret_interval;
3092 int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
3093
3094 rt_secret_reschedule(old);
3095
3096 return ret;
3097}
3098
3099static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
3100 void __user *oldval,
3101 size_t __user *oldlenp,
3102 void __user *newval,
3103 size_t newlen)
3104{
3105 int old = ip_rt_secret_interval;
3106 int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
3107
3108 rt_secret_reschedule(old);
3109
3110 return ret;
3111}
3112
3113static ctl_table ipv4_route_table[] = { 3054static ctl_table ipv4_route_table[] = {
3114 { 3055 {
3115 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
3116 .procname = "gc_thresh", 3056 .procname = "gc_thresh",
3117 .data = &ipv4_dst_ops.gc_thresh, 3057 .data = &ipv4_dst_ops.gc_thresh,
3118 .maxlen = sizeof(int), 3058 .maxlen = sizeof(int),
@@ -3120,7 +3060,6 @@ static ctl_table ipv4_route_table[] = {
3120 .proc_handler = proc_dointvec, 3060 .proc_handler = proc_dointvec,
3121 }, 3061 },
3122 { 3062 {
3123 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
3124 .procname = "max_size", 3063 .procname = "max_size",
3125 .data = &ip_rt_max_size, 3064 .data = &ip_rt_max_size,
3126 .maxlen = sizeof(int), 3065 .maxlen = sizeof(int),
@@ -3130,43 +3069,34 @@ static ctl_table ipv4_route_table[] = {
3130 { 3069 {
3131 /* Deprecated. Use gc_min_interval_ms */ 3070 /* Deprecated. Use gc_min_interval_ms */
3132 3071
3133 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3134 .procname = "gc_min_interval", 3072 .procname = "gc_min_interval",
3135 .data = &ip_rt_gc_min_interval, 3073 .data = &ip_rt_gc_min_interval,
3136 .maxlen = sizeof(int), 3074 .maxlen = sizeof(int),
3137 .mode = 0644, 3075 .mode = 0644,
3138 .proc_handler = proc_dointvec_jiffies, 3076 .proc_handler = proc_dointvec_jiffies,
3139 .strategy = sysctl_jiffies,
3140 }, 3077 },
3141 { 3078 {
3142 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3143 .procname = "gc_min_interval_ms", 3079 .procname = "gc_min_interval_ms",
3144 .data = &ip_rt_gc_min_interval, 3080 .data = &ip_rt_gc_min_interval,
3145 .maxlen = sizeof(int), 3081 .maxlen = sizeof(int),
3146 .mode = 0644, 3082 .mode = 0644,
3147 .proc_handler = proc_dointvec_ms_jiffies, 3083 .proc_handler = proc_dointvec_ms_jiffies,
3148 .strategy = sysctl_ms_jiffies,
3149 }, 3084 },
3150 { 3085 {
3151 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3152 .procname = "gc_timeout", 3086 .procname = "gc_timeout",
3153 .data = &ip_rt_gc_timeout, 3087 .data = &ip_rt_gc_timeout,
3154 .maxlen = sizeof(int), 3088 .maxlen = sizeof(int),
3155 .mode = 0644, 3089 .mode = 0644,
3156 .proc_handler = proc_dointvec_jiffies, 3090 .proc_handler = proc_dointvec_jiffies,
3157 .strategy = sysctl_jiffies,
3158 }, 3091 },
3159 { 3092 {
3160 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3161 .procname = "gc_interval", 3093 .procname = "gc_interval",
3162 .data = &ip_rt_gc_interval, 3094 .data = &ip_rt_gc_interval,
3163 .maxlen = sizeof(int), 3095 .maxlen = sizeof(int),
3164 .mode = 0644, 3096 .mode = 0644,
3165 .proc_handler = proc_dointvec_jiffies, 3097 .proc_handler = proc_dointvec_jiffies,
3166 .strategy = sysctl_jiffies,
3167 }, 3098 },
3168 { 3099 {
3169 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3170 .procname = "redirect_load", 3100 .procname = "redirect_load",
3171 .data = &ip_rt_redirect_load, 3101 .data = &ip_rt_redirect_load,
3172 .maxlen = sizeof(int), 3102 .maxlen = sizeof(int),
@@ -3174,7 +3104,6 @@ static ctl_table ipv4_route_table[] = {
3174 .proc_handler = proc_dointvec, 3104 .proc_handler = proc_dointvec,
3175 }, 3105 },
3176 { 3106 {
3177 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3178 .procname = "redirect_number", 3107 .procname = "redirect_number",
3179 .data = &ip_rt_redirect_number, 3108 .data = &ip_rt_redirect_number,
3180 .maxlen = sizeof(int), 3109 .maxlen = sizeof(int),
@@ -3182,7 +3111,6 @@ static ctl_table ipv4_route_table[] = {
3182 .proc_handler = proc_dointvec, 3111 .proc_handler = proc_dointvec,
3183 }, 3112 },
3184 { 3113 {
3185 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3186 .procname = "redirect_silence", 3114 .procname = "redirect_silence",
3187 .data = &ip_rt_redirect_silence, 3115 .data = &ip_rt_redirect_silence,
3188 .maxlen = sizeof(int), 3116 .maxlen = sizeof(int),
@@ -3190,7 +3118,6 @@ static ctl_table ipv4_route_table[] = {
3190 .proc_handler = proc_dointvec, 3118 .proc_handler = proc_dointvec,
3191 }, 3119 },
3192 { 3120 {
3193 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3194 .procname = "error_cost", 3121 .procname = "error_cost",
3195 .data = &ip_rt_error_cost, 3122 .data = &ip_rt_error_cost,
3196 .maxlen = sizeof(int), 3123 .maxlen = sizeof(int),
@@ -3198,7 +3125,6 @@ static ctl_table ipv4_route_table[] = {
3198 .proc_handler = proc_dointvec, 3125 .proc_handler = proc_dointvec,
3199 }, 3126 },
3200 { 3127 {
3201 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3202 .procname = "error_burst", 3128 .procname = "error_burst",
3203 .data = &ip_rt_error_burst, 3129 .data = &ip_rt_error_burst,
3204 .maxlen = sizeof(int), 3130 .maxlen = sizeof(int),
@@ -3206,7 +3132,6 @@ static ctl_table ipv4_route_table[] = {
3206 .proc_handler = proc_dointvec, 3132 .proc_handler = proc_dointvec,
3207 }, 3133 },
3208 { 3134 {
3209 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3210 .procname = "gc_elasticity", 3135 .procname = "gc_elasticity",
3211 .data = &ip_rt_gc_elasticity, 3136 .data = &ip_rt_gc_elasticity,
3212 .maxlen = sizeof(int), 3137 .maxlen = sizeof(int),
@@ -3214,16 +3139,13 @@ static ctl_table ipv4_route_table[] = {
3214 .proc_handler = proc_dointvec, 3139 .proc_handler = proc_dointvec,
3215 }, 3140 },
3216 { 3141 {
3217 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3218 .procname = "mtu_expires", 3142 .procname = "mtu_expires",
3219 .data = &ip_rt_mtu_expires, 3143 .data = &ip_rt_mtu_expires,
3220 .maxlen = sizeof(int), 3144 .maxlen = sizeof(int),
3221 .mode = 0644, 3145 .mode = 0644,
3222 .proc_handler = proc_dointvec_jiffies, 3146 .proc_handler = proc_dointvec_jiffies,
3223 .strategy = sysctl_jiffies,
3224 }, 3147 },
3225 { 3148 {
3226 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3227 .procname = "min_pmtu", 3149 .procname = "min_pmtu",
3228 .data = &ip_rt_min_pmtu, 3150 .data = &ip_rt_min_pmtu,
3229 .maxlen = sizeof(int), 3151 .maxlen = sizeof(int),
@@ -3231,58 +3153,46 @@ static ctl_table ipv4_route_table[] = {
3231 .proc_handler = proc_dointvec, 3153 .proc_handler = proc_dointvec,
3232 }, 3154 },
3233 { 3155 {
3234 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3235 .procname = "min_adv_mss", 3156 .procname = "min_adv_mss",
3236 .data = &ip_rt_min_advmss, 3157 .data = &ip_rt_min_advmss,
3237 .maxlen = sizeof(int), 3158 .maxlen = sizeof(int),
3238 .mode = 0644, 3159 .mode = 0644,
3239 .proc_handler = proc_dointvec, 3160 .proc_handler = proc_dointvec,
3240 }, 3161 },
3241 { 3162 { }
3242 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3243 .procname = "secret_interval",
3244 .data = &ip_rt_secret_interval,
3245 .maxlen = sizeof(int),
3246 .mode = 0644,
3247 .proc_handler = ipv4_sysctl_rt_secret_interval,
3248 .strategy = ipv4_sysctl_rt_secret_interval_strategy,
3249 },
3250 { .ctl_name = 0 }
3251}; 3163};
3252 3164
3253static struct ctl_table empty[1]; 3165static struct ctl_table empty[1];
3254 3166
3255static struct ctl_table ipv4_skeleton[] = 3167static struct ctl_table ipv4_skeleton[] =
3256{ 3168{
3257 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, 3169 { .procname = "route",
3258 .mode = 0555, .child = ipv4_route_table}, 3170 .mode = 0555, .child = ipv4_route_table},
3259 { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH, 3171 { .procname = "neigh",
3260 .mode = 0555, .child = empty}, 3172 .mode = 0555, .child = empty},
3261 { } 3173 { }
3262}; 3174};
3263 3175
3264static __net_initdata struct ctl_path ipv4_path[] = { 3176static __net_initdata struct ctl_path ipv4_path[] = {
3265 { .procname = "net", .ctl_name = CTL_NET, }, 3177 { .procname = "net", },
3266 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 3178 { .procname = "ipv4", },
3267 { }, 3179 { },
3268}; 3180};
3269 3181
3270static struct ctl_table ipv4_route_flush_table[] = { 3182static struct ctl_table ipv4_route_flush_table[] = {
3271 { 3183 {
3272 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3273 .procname = "flush", 3184 .procname = "flush",
3274 .maxlen = sizeof(int), 3185 .maxlen = sizeof(int),
3275 .mode = 0200, 3186 .mode = 0200,
3276 .proc_handler = ipv4_sysctl_rtcache_flush, 3187 .proc_handler = ipv4_sysctl_rtcache_flush,
3277 .strategy = ipv4_sysctl_rtcache_flush_strategy,
3278 }, 3188 },
3279 { .ctl_name = 0 }, 3189 { },
3280}; 3190};
3281 3191
3282static __net_initdata struct ctl_path ipv4_route_path[] = { 3192static __net_initdata struct ctl_path ipv4_route_path[] = {
3283 { .procname = "net", .ctl_name = CTL_NET, }, 3193 { .procname = "net", },
3284 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 3194 { .procname = "ipv4", },
3285 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, 3195 { .procname = "route", },
3286 { }, 3196 { },
3287}; 3197};
3288 3198
@@ -3291,7 +3201,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
3291 struct ctl_table *tbl; 3201 struct ctl_table *tbl;
3292 3202
3293 tbl = ipv4_route_flush_table; 3203 tbl = ipv4_route_flush_table;
3294 if (net != &init_net) { 3204 if (!net_eq(net, &init_net)) {
3295 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3205 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3296 if (tbl == NULL) 3206 if (tbl == NULL)
3297 goto err_dup; 3207 goto err_dup;
@@ -3327,39 +3237,20 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
3327}; 3237};
3328#endif 3238#endif
3329 3239
3330 3240static __net_init int rt_genid_init(struct net *net)
3331static __net_init int rt_secret_timer_init(struct net *net)
3332{ 3241{
3333 atomic_set(&net->ipv4.rt_genid, 3242 get_random_bytes(&net->ipv4.rt_genid,
3334 (int) ((num_physpages ^ (num_physpages>>8)) ^ 3243 sizeof(net->ipv4.rt_genid));
3335 (jiffies ^ (jiffies >> 7))));
3336
3337 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3338 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3339 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3340
3341 if (ip_rt_secret_interval) {
3342 net->ipv4.rt_secret_timer.expires =
3343 jiffies + net_random() % ip_rt_secret_interval +
3344 ip_rt_secret_interval;
3345 add_timer(&net->ipv4.rt_secret_timer);
3346 }
3347 return 0; 3244 return 0;
3348} 3245}
3349 3246
3350static __net_exit void rt_secret_timer_exit(struct net *net) 3247static __net_initdata struct pernet_operations rt_genid_ops = {
3351{ 3248 .init = rt_genid_init,
3352 del_timer_sync(&net->ipv4.rt_secret_timer);
3353}
3354
3355static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3356 .init = rt_secret_timer_init,
3357 .exit = rt_secret_timer_exit,
3358}; 3249};
3359 3250
3360 3251
3361#ifdef CONFIG_NET_CLS_ROUTE 3252#ifdef CONFIG_NET_CLS_ROUTE
3362struct ip_rt_acct *ip_rt_acct __read_mostly; 3253struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3363#endif /* CONFIG_NET_CLS_ROUTE */ 3254#endif /* CONFIG_NET_CLS_ROUTE */
3364 3255
3365static __initdata unsigned long rhash_entries; 3256static __initdata unsigned long rhash_entries;
@@ -3388,11 +3279,17 @@ int __init ip_rt_init(void)
3388 3279
3389 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3280 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3390 3281
3282 if (dst_entries_init(&ipv4_dst_ops) < 0)
3283 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3284
3285 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3286 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3287
3391 rt_hash_table = (struct rt_hash_bucket *) 3288 rt_hash_table = (struct rt_hash_bucket *)
3392 alloc_large_system_hash("IP route cache", 3289 alloc_large_system_hash("IP route cache",
3393 sizeof(struct rt_hash_bucket), 3290 sizeof(struct rt_hash_bucket),
3394 rhash_entries, 3291 rhash_entries,
3395 (num_physpages >= 128 * 1024) ? 3292 (totalram_pages >= 128 * 1024) ?
3396 15 : 17, 3293 15 : 17,
3397 0, 3294 0,
3398 &rt_hash_log, 3295 &rt_hash_log,
@@ -3415,20 +3312,18 @@ int __init ip_rt_init(void)
3415 schedule_delayed_work(&expires_work, 3312 schedule_delayed_work(&expires_work,
3416 net_random() % ip_rt_gc_interval + ip_rt_gc_interval); 3313 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3417 3314
3418 if (register_pernet_subsys(&rt_secret_timer_ops))
3419 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3420
3421 if (ip_rt_proc_init()) 3315 if (ip_rt_proc_init())
3422 printk(KERN_ERR "Unable to create route proc files\n"); 3316 printk(KERN_ERR "Unable to create route proc files\n");
3423#ifdef CONFIG_XFRM 3317#ifdef CONFIG_XFRM
3424 xfrm_init(); 3318 xfrm_init();
3425 xfrm4_init(); 3319 xfrm4_init(ip_rt_max_size);
3426#endif 3320#endif
3427 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); 3321 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3428 3322
3429#ifdef CONFIG_SYSCTL 3323#ifdef CONFIG_SYSCTL
3430 register_pernet_subsys(&sysctl_route_ops); 3324 register_pernet_subsys(&sysctl_route_ops);
3431#endif 3325#endif
3326 register_pernet_subsys(&rt_genid_ops);
3432 return rc; 3327 return rc;
3433} 3328}
3434 3329
@@ -3442,7 +3337,3 @@ void __init ip_static_sysctl_init(void)
3442 register_sysctl_paths(ipv4_path, ipv4_skeleton); 3337 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3443} 3338}
3444#endif 3339#endif
3445
3446EXPORT_SYMBOL(__ip_select_ident);
3447EXPORT_SYMBOL(ip_route_input);
3448EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index cd2b97f1b6e1..47519205a014 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -18,8 +18,8 @@
18#include <net/tcp.h> 18#include <net/tcp.h>
19#include <net/route.h> 19#include <net/route.h>
20 20
21/* Timestamps: lowest 9 bits store TCP options */ 21/* Timestamps: lowest bits store TCP options */
22#define TSBITS 9 22#define TSBITS 6
23#define TSMASK (((__u32)1 << TSBITS) - 1) 23#define TSMASK (((__u32)1 << TSBITS) - 1)
24 24
25extern int sysctl_tcp_syncookies; 25extern int sysctl_tcp_syncookies;
@@ -37,12 +37,13 @@ __initcall(init_syncookies);
37#define COOKIEBITS 24 /* Upper bits store count */ 37#define COOKIEBITS 24 /* Upper bits store count */
38#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) 38#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
39 39
40static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; 40static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
41 ipv4_cookie_scratch);
41 42
42static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, 43static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
43 u32 count, int c) 44 u32 count, int c)
44{ 45{
45 __u32 *tmp = __get_cpu_var(cookie_scratch); 46 __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch);
46 47
47 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); 48 memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
48 tmp[0] = (__force u32)saddr; 49 tmp[0] = (__force u32)saddr;
@@ -57,7 +58,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
57 58
58/* 59/*
59 * when syncookies are in effect and tcp timestamps are enabled we encode 60 * when syncookies are in effect and tcp timestamps are enabled we encode
60 * tcp options in the lowest 9 bits of the timestamp value that will be 61 * tcp options in the lower bits of the timestamp value that will be
61 * sent in the syn-ack. 62 * sent in the syn-ack.
62 * Since subsequent timestamps use the normal tcp_time_stamp value, we 63 * Since subsequent timestamps use the normal tcp_time_stamp value, we
63 * must make sure that the resulting initial timestamp is <= tcp_time_stamp. 64 * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
@@ -69,11 +70,10 @@ __u32 cookie_init_timestamp(struct request_sock *req)
69 u32 options = 0; 70 u32 options = 0;
70 71
71 ireq = inet_rsk(req); 72 ireq = inet_rsk(req);
72 if (ireq->wscale_ok) { 73
73 options = ireq->snd_wscale; 74 options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
74 options |= ireq->rcv_wscale << 4; 75 options |= ireq->sack_ok << 4;
75 } 76 options |= ireq->ecn_ok << 5;
76 options |= ireq->sack_ok << 8;
77 77
78 ts = ts_now & ~TSMASK; 78 ts = ts_now & ~TSMASK;
79 ts |= options; 79 ts |= options;
@@ -137,23 +137,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
137} 137}
138 138
139/* 139/*
140 * This table has to be sorted and terminated with (__u16)-1. 140 * MSS Values are taken from the 2009 paper
141 * XXX generate a better table. 141 * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
142 * Unresolved Issues: HIPPI with a 64k MSS is not well supported. 142 * - values 1440 to 1460 accounted for 80% of observed mss values
143 * - values outside the 536-1460 range are rare (<0.2%).
144 *
145 * Table must be sorted.
143 */ 146 */
144static __u16 const msstab[] = { 147static __u16 const msstab[] = {
145 64 - 1, 148 64,
146 256 - 1, 149 512,
147 512 - 1, 150 536,
148 536 - 1, 151 1024,
149 1024 - 1, 152 1440,
150 1440 - 1, 153 1460,
151 1460 - 1, 154 4312,
152 4312 - 1, 155 8960,
153 (__u16)-1
154}; 156};
155/* The number doesn't include the -1 terminator */
156#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
157 157
158/* 158/*
159 * Generate a syncookie. mssp points to the mss, which is returned 159 * Generate a syncookie. mssp points to the mss, which is returned
@@ -168,10 +168,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
168 168
169 tcp_synq_overflow(sk); 169 tcp_synq_overflow(sk);
170 170
171 /* XXX sort msstab[] by probability? Binary search? */ 171 for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
172 for (mssind = 0; mss > msstab[mssind + 1]; mssind++) 172 if (mss >= msstab[mssind])
173 ; 173 break;
174 *mssp = msstab[mssind] + 1; 174 *mssp = msstab[mssind];
175 175
176 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); 176 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
177 177
@@ -201,7 +201,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
201 jiffies / (HZ * 60), 201 jiffies / (HZ * 60),
202 COUNTER_TRIES); 202 COUNTER_TRIES);
203 203
204 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 204 return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
205} 205}
206 206
207static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 207static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
@@ -226,32 +226,46 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
226 * additional tcp options in the timestamp. 226 * additional tcp options in the timestamp.
227 * This extracts these options from the timestamp echo. 227 * This extracts these options from the timestamp echo.
228 * 228 *
229 * The lowest 4 bits are for snd_wscale 229 * The lowest 4 bits store snd_wscale.
230 * The next 4 lsb are for rcv_wscale 230 * next 2 bits indicate SACK and ECN support.
231 * The next lsb is for sack_ok 231 *
232 * return false if we decode an option that should not be.
232 */ 233 */
233void cookie_check_timestamp(struct tcp_options_received *tcp_opt) 234bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
234{ 235{
235 /* echoed timestamp, 9 lowest bits contain options */ 236 /* echoed timestamp, lowest bits contain options */
236 u32 options = tcp_opt->rcv_tsecr & TSMASK; 237 u32 options = tcp_opt->rcv_tsecr & TSMASK;
237 238
238 tcp_opt->snd_wscale = options & 0xf; 239 if (!tcp_opt->saw_tstamp) {
239 options >>= 4; 240 tcp_clear_options(tcp_opt);
240 tcp_opt->rcv_wscale = options & 0xf; 241 return true;
242 }
243
244 if (!sysctl_tcp_timestamps)
245 return false;
241 246
242 tcp_opt->sack_ok = (options >> 4) & 0x1; 247 tcp_opt->sack_ok = (options >> 4) & 0x1;
248 *ecn_ok = (options >> 5) & 1;
249 if (*ecn_ok && !sysctl_tcp_ecn)
250 return false;
251
252 if (tcp_opt->sack_ok && !sysctl_tcp_sack)
253 return false;
243 254
244 if (tcp_opt->sack_ok) 255 if ((options & 0xf) == 0xf)
245 tcp_sack_reset(tcp_opt); 256 return true; /* no window scaling */
246 257
247 if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale) 258 tcp_opt->wscale_ok = 1;
248 tcp_opt->wscale_ok = 1; 259 tcp_opt->snd_wscale = options & 0xf;
260 return sysctl_tcp_window_scaling != 0;
249} 261}
250EXPORT_SYMBOL(cookie_check_timestamp); 262EXPORT_SYMBOL(cookie_check_timestamp);
251 263
252struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, 264struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
253 struct ip_options *opt) 265 struct ip_options *opt)
254{ 266{
267 struct tcp_options_received tcp_opt;
268 u8 *hash_location;
255 struct inet_request_sock *ireq; 269 struct inet_request_sock *ireq;
256 struct tcp_request_sock *treq; 270 struct tcp_request_sock *treq;
257 struct tcp_sock *tp = tcp_sk(sk); 271 struct tcp_sock *tp = tcp_sk(sk);
@@ -262,9 +276,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
262 int mss; 276 int mss;
263 struct rtable *rt; 277 struct rtable *rt;
264 __u8 rcv_wscale; 278 __u8 rcv_wscale;
265 struct tcp_options_received tcp_opt; 279 bool ecn_ok;
266 280
267 if (!sysctl_tcp_syncookies || !th->ack) 281 if (!sysctl_tcp_syncookies || !th->ack || th->rst)
268 goto out; 282 goto out;
269 283
270 if (tcp_synq_no_recent_overflow(sk) || 284 if (tcp_synq_no_recent_overflow(sk) ||
@@ -277,10 +291,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
277 291
278 /* check for timestamp cookie support */ 292 /* check for timestamp cookie support */
279 memset(&tcp_opt, 0, sizeof(tcp_opt)); 293 memset(&tcp_opt, 0, sizeof(tcp_opt));
280 tcp_parse_options(skb, &tcp_opt, 0); 294 tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
281 295
282 if (tcp_opt.saw_tstamp) 296 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
283 cookie_check_timestamp(&tcp_opt); 297 goto out;
284 298
285 ret = NULL; 299 ret = NULL;
286 req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ 300 req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
@@ -296,9 +310,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
296 ireq->rmt_port = th->source; 310 ireq->rmt_port = th->source;
297 ireq->loc_addr = ip_hdr(skb)->daddr; 311 ireq->loc_addr = ip_hdr(skb)->daddr;
298 ireq->rmt_addr = ip_hdr(skb)->saddr; 312 ireq->rmt_addr = ip_hdr(skb)->saddr;
299 ireq->ecn_ok = 0; 313 ireq->ecn_ok = ecn_ok;
300 ireq->snd_wscale = tcp_opt.snd_wscale; 314 ireq->snd_wscale = tcp_opt.snd_wscale;
301 ireq->rcv_wscale = tcp_opt.rcv_wscale;
302 ireq->sack_ok = tcp_opt.sack_ok; 315 ireq->sack_ok = tcp_opt.sack_ok;
303 ireq->wscale_ok = tcp_opt.wscale_ok; 316 ireq->wscale_ok = tcp_opt.wscale_ok;
304 ireq->tstamp_ok = tcp_opt.saw_tstamp; 317 ireq->tstamp_ok = tcp_opt.saw_tstamp;
@@ -332,33 +345,32 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
332 * no easy way to do this. 345 * no easy way to do this.
333 */ 346 */
334 { 347 {
335 struct flowi fl = { .nl_u = { .ip4_u = 348 struct flowi fl = { .mark = sk->sk_mark,
336 { .daddr = ((opt && opt->srr) ? 349 .fl4_dst = ((opt && opt->srr) ?
337 opt->faddr : 350 opt->faddr : ireq->rmt_addr),
338 ireq->rmt_addr), 351 .fl4_src = ireq->loc_addr,
339 .saddr = ireq->loc_addr, 352 .fl4_tos = RT_CONN_FLAGS(sk),
340 .tos = RT_CONN_FLAGS(sk) } },
341 .proto = IPPROTO_TCP, 353 .proto = IPPROTO_TCP,
342 .flags = inet_sk_flowi_flags(sk), 354 .flags = inet_sk_flowi_flags(sk),
343 .uli_u = { .ports = 355 .fl_ip_sport = th->dest,
344 { .sport = th->dest, 356 .fl_ip_dport = th->source };
345 .dport = th->source } } };
346 security_req_classify_flow(req, &fl); 357 security_req_classify_flow(req, &fl);
347 if (ip_route_output_key(&init_net, &rt, &fl)) { 358 if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
348 reqsk_free(req); 359 reqsk_free(req);
349 goto out; 360 goto out;
350 } 361 }
351 } 362 }
352 363
353 /* Try to redo what tcp_v4_send_synack did. */ 364 /* Try to redo what tcp_v4_send_synack did. */
354 req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); 365 req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
355 366
356 tcp_select_initial_window(tcp_full_space(sk), req->mss, 367 tcp_select_initial_window(tcp_full_space(sk), req->mss,
357 &req->rcv_wnd, &req->window_clamp, 368 &req->rcv_wnd, &req->window_clamp,
358 ireq->wscale_ok, &rcv_wscale); 369 ireq->wscale_ok, &rcv_wscale,
370 dst_metric(&rt->dst, RTAX_INITRWND));
359 371
360 ireq->rcv_wscale = rcv_wscale; 372 ireq->rcv_wscale = rcv_wscale;
361 373
362 ret = get_cookie_sock(sk, skb, req, &rt->u.dst); 374 ret = get_cookie_sock(sk, skb, req, &rt->dst);
363out: return ret; 375out: return ret;
364} 376}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4710d219f06a..1a456652086b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
12#include <linux/inetdevice.h> 12#include <linux/inetdevice.h>
13#include <linux/seqlock.h> 13#include <linux/seqlock.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
15#include <net/snmp.h> 16#include <net/snmp.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -25,6 +26,10 @@ static int zero;
25static int tcp_retr1_max = 255; 26static int tcp_retr1_max = 255;
26static int ip_local_port_range_min[] = { 1, 1 }; 27static int ip_local_port_range_min[] = { 1, 1 };
27static int ip_local_port_range_max[] = { 65535, 65535 }; 28static int ip_local_port_range_max[] = { 65535, 65535 };
29static int tcp_adv_win_scale_min = -31;
30static int tcp_adv_win_scale_max = 31;
31static int ip_ttl_min = 1;
32static int ip_ttl_max = 255;
28 33
29/* Update system visible IP port range */ 34/* Update system visible IP port range */
30static void set_local_port_range(int range[2]) 35static void set_local_port_range(int range[2])
@@ -36,7 +41,7 @@ static void set_local_port_range(int range[2])
36} 41}
37 42
38/* Validate changes from /proc interface. */ 43/* Validate changes from /proc interface. */
39static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, 44static int ipv4_local_port_range(ctl_table *table, int write,
40 void __user *buffer, 45 void __user *buffer,
41 size_t *lenp, loff_t *ppos) 46 size_t *lenp, loff_t *ppos)
42{ 47{
@@ -51,7 +56,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
51 }; 56 };
52 57
53 inet_get_local_port_range(range, range + 1); 58 inet_get_local_port_range(range, range + 1);
54 ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); 59 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
55 60
56 if (write && ret == 0) { 61 if (write && ret == 0) {
57 if (range[1] < range[0]) 62 if (range[1] < range[0])
@@ -63,35 +68,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
63 return ret; 68 return ret;
64} 69}
65 70
66/* Validate changes from sysctl interface. */ 71static int proc_tcp_congestion_control(ctl_table *ctl, int write,
67static int ipv4_sysctl_local_port_range(ctl_table *table,
68 void __user *oldval,
69 size_t __user *oldlenp,
70 void __user *newval, size_t newlen)
71{
72 int ret;
73 int range[2];
74 ctl_table tmp = {
75 .data = &range,
76 .maxlen = sizeof(range),
77 .mode = table->mode,
78 .extra1 = &ip_local_port_range_min,
79 .extra2 = &ip_local_port_range_max,
80 };
81
82 inet_get_local_port_range(range, range + 1);
83 ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen);
84 if (ret == 0 && newval && newlen) {
85 if (range[1] < range[0])
86 ret = -EINVAL;
87 else
88 set_local_port_range(range);
89 }
90 return ret;
91}
92
93
94static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos) 72 void __user *buffer, size_t *lenp, loff_t *ppos)
96{ 73{
97 char val[TCP_CA_NAME_MAX]; 74 char val[TCP_CA_NAME_MAX];
@@ -103,33 +80,14 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file *
103 80
104 tcp_get_default_congestion_control(val); 81 tcp_get_default_congestion_control(val);
105 82
106 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); 83 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
107 if (write && ret == 0) 84 if (write && ret == 0)
108 ret = tcp_set_default_congestion_control(val); 85 ret = tcp_set_default_congestion_control(val);
109 return ret; 86 return ret;
110} 87}
111 88
112static int sysctl_tcp_congestion_control(ctl_table *table,
113 void __user *oldval,
114 size_t __user *oldlenp,
115 void __user *newval, size_t newlen)
116{
117 char val[TCP_CA_NAME_MAX];
118 ctl_table tbl = {
119 .data = val,
120 .maxlen = TCP_CA_NAME_MAX,
121 };
122 int ret;
123
124 tcp_get_default_congestion_control(val);
125 ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
126 if (ret == 1 && newval && newlen)
127 ret = tcp_set_default_congestion_control(val);
128 return ret;
129}
130
131static int proc_tcp_available_congestion_control(ctl_table *ctl, 89static int proc_tcp_available_congestion_control(ctl_table *ctl,
132 int write, struct file * filp, 90 int write,
133 void __user *buffer, size_t *lenp, 91 void __user *buffer, size_t *lenp,
134 loff_t *ppos) 92 loff_t *ppos)
135{ 93{
@@ -140,13 +98,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl,
140 if (!tbl.data) 98 if (!tbl.data)
141 return -ENOMEM; 99 return -ENOMEM;
142 tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); 100 tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
143 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); 101 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
144 kfree(tbl.data); 102 kfree(tbl.data);
145 return ret; 103 return ret;
146} 104}
147 105
148static int proc_allowed_congestion_control(ctl_table *ctl, 106static int proc_allowed_congestion_control(ctl_table *ctl,
149 int write, struct file * filp, 107 int write,
150 void __user *buffer, size_t *lenp, 108 void __user *buffer, size_t *lenp,
151 loff_t *ppos) 109 loff_t *ppos)
152{ 110{
@@ -158,39 +116,15 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
158 return -ENOMEM; 116 return -ENOMEM;
159 117
160 tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); 118 tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
161 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); 119 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
162 if (write && ret == 0) 120 if (write && ret == 0)
163 ret = tcp_set_allowed_congestion_control(tbl.data); 121 ret = tcp_set_allowed_congestion_control(tbl.data);
164 kfree(tbl.data); 122 kfree(tbl.data);
165 return ret; 123 return ret;
166} 124}
167 125
168static int strategy_allowed_congestion_control(ctl_table *table,
169 void __user *oldval,
170 size_t __user *oldlenp,
171 void __user *newval,
172 size_t newlen)
173{
174 ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
175 int ret;
176
177 tbl.data = kmalloc(tbl.maxlen, GFP_USER);
178 if (!tbl.data)
179 return -ENOMEM;
180
181 tcp_get_available_congestion_control(tbl.data, tbl.maxlen);
182 ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
183 if (ret == 1 && newval && newlen)
184 ret = tcp_set_allowed_congestion_control(tbl.data);
185 kfree(tbl.data);
186
187 return ret;
188
189}
190
191static struct ctl_table ipv4_table[] = { 126static struct ctl_table ipv4_table[] = {
192 { 127 {
193 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
194 .procname = "tcp_timestamps", 128 .procname = "tcp_timestamps",
195 .data = &sysctl_tcp_timestamps, 129 .data = &sysctl_tcp_timestamps,
196 .maxlen = sizeof(int), 130 .maxlen = sizeof(int),
@@ -198,7 +132,6 @@ static struct ctl_table ipv4_table[] = {
198 .proc_handler = proc_dointvec 132 .proc_handler = proc_dointvec
199 }, 133 },
200 { 134 {
201 .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
202 .procname = "tcp_window_scaling", 135 .procname = "tcp_window_scaling",
203 .data = &sysctl_tcp_window_scaling, 136 .data = &sysctl_tcp_window_scaling,
204 .maxlen = sizeof(int), 137 .maxlen = sizeof(int),
@@ -206,7 +139,6 @@ static struct ctl_table ipv4_table[] = {
206 .proc_handler = proc_dointvec 139 .proc_handler = proc_dointvec
207 }, 140 },
208 { 141 {
209 .ctl_name = NET_IPV4_TCP_SACK,
210 .procname = "tcp_sack", 142 .procname = "tcp_sack",
211 .data = &sysctl_tcp_sack, 143 .data = &sysctl_tcp_sack,
212 .maxlen = sizeof(int), 144 .maxlen = sizeof(int),
@@ -214,7 +146,6 @@ static struct ctl_table ipv4_table[] = {
214 .proc_handler = proc_dointvec 146 .proc_handler = proc_dointvec
215 }, 147 },
216 { 148 {
217 .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
218 .procname = "tcp_retrans_collapse", 149 .procname = "tcp_retrans_collapse",
219 .data = &sysctl_tcp_retrans_collapse, 150 .data = &sysctl_tcp_retrans_collapse,
220 .maxlen = sizeof(int), 151 .maxlen = sizeof(int),
@@ -222,17 +153,15 @@ static struct ctl_table ipv4_table[] = {
222 .proc_handler = proc_dointvec 153 .proc_handler = proc_dointvec
223 }, 154 },
224 { 155 {
225 .ctl_name = NET_IPV4_DEFAULT_TTL,
226 .procname = "ip_default_ttl", 156 .procname = "ip_default_ttl",
227 .data = &sysctl_ip_default_ttl, 157 .data = &sysctl_ip_default_ttl,
228 .maxlen = sizeof(int), 158 .maxlen = sizeof(int),
229 .mode = 0644, 159 .mode = 0644,
230 .proc_handler = ipv4_doint_and_flush, 160 .proc_handler = proc_dointvec_minmax,
231 .strategy = ipv4_doint_and_flush_strategy, 161 .extra1 = &ip_ttl_min,
232 .extra2 = &init_net, 162 .extra2 = &ip_ttl_max,
233 }, 163 },
234 { 164 {
235 .ctl_name = NET_IPV4_NO_PMTU_DISC,
236 .procname = "ip_no_pmtu_disc", 165 .procname = "ip_no_pmtu_disc",
237 .data = &ipv4_config.no_pmtu_disc, 166 .data = &ipv4_config.no_pmtu_disc,
238 .maxlen = sizeof(int), 167 .maxlen = sizeof(int),
@@ -240,7 +169,6 @@ static struct ctl_table ipv4_table[] = {
240 .proc_handler = proc_dointvec 169 .proc_handler = proc_dointvec
241 }, 170 },
242 { 171 {
243 .ctl_name = NET_IPV4_NONLOCAL_BIND,
244 .procname = "ip_nonlocal_bind", 172 .procname = "ip_nonlocal_bind",
245 .data = &sysctl_ip_nonlocal_bind, 173 .data = &sysctl_ip_nonlocal_bind,
246 .maxlen = sizeof(int), 174 .maxlen = sizeof(int),
@@ -248,7 +176,6 @@ static struct ctl_table ipv4_table[] = {
248 .proc_handler = proc_dointvec 176 .proc_handler = proc_dointvec
249 }, 177 },
250 { 178 {
251 .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
252 .procname = "tcp_syn_retries", 179 .procname = "tcp_syn_retries",
253 .data = &sysctl_tcp_syn_retries, 180 .data = &sysctl_tcp_syn_retries,
254 .maxlen = sizeof(int), 181 .maxlen = sizeof(int),
@@ -256,7 +183,6 @@ static struct ctl_table ipv4_table[] = {
256 .proc_handler = proc_dointvec 183 .proc_handler = proc_dointvec
257 }, 184 },
258 { 185 {
259 .ctl_name = NET_TCP_SYNACK_RETRIES,
260 .procname = "tcp_synack_retries", 186 .procname = "tcp_synack_retries",
261 .data = &sysctl_tcp_synack_retries, 187 .data = &sysctl_tcp_synack_retries,
262 .maxlen = sizeof(int), 188 .maxlen = sizeof(int),
@@ -264,7 +190,6 @@ static struct ctl_table ipv4_table[] = {
264 .proc_handler = proc_dointvec 190 .proc_handler = proc_dointvec
265 }, 191 },
266 { 192 {
267 .ctl_name = NET_TCP_MAX_ORPHANS,
268 .procname = "tcp_max_orphans", 193 .procname = "tcp_max_orphans",
269 .data = &sysctl_tcp_max_orphans, 194 .data = &sysctl_tcp_max_orphans,
270 .maxlen = sizeof(int), 195 .maxlen = sizeof(int),
@@ -272,7 +197,6 @@ static struct ctl_table ipv4_table[] = {
272 .proc_handler = proc_dointvec 197 .proc_handler = proc_dointvec
273 }, 198 },
274 { 199 {
275 .ctl_name = NET_TCP_MAX_TW_BUCKETS,
276 .procname = "tcp_max_tw_buckets", 200 .procname = "tcp_max_tw_buckets",
277 .data = &tcp_death_row.sysctl_max_tw_buckets, 201 .data = &tcp_death_row.sysctl_max_tw_buckets,
278 .maxlen = sizeof(int), 202 .maxlen = sizeof(int),
@@ -280,7 +204,6 @@ static struct ctl_table ipv4_table[] = {
280 .proc_handler = proc_dointvec 204 .proc_handler = proc_dointvec
281 }, 205 },
282 { 206 {
283 .ctl_name = NET_IPV4_DYNADDR,
284 .procname = "ip_dynaddr", 207 .procname = "ip_dynaddr",
285 .data = &sysctl_ip_dynaddr, 208 .data = &sysctl_ip_dynaddr,
286 .maxlen = sizeof(int), 209 .maxlen = sizeof(int),
@@ -288,16 +211,13 @@ static struct ctl_table ipv4_table[] = {
288 .proc_handler = proc_dointvec 211 .proc_handler = proc_dointvec
289 }, 212 },
290 { 213 {
291 .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
292 .procname = "tcp_keepalive_time", 214 .procname = "tcp_keepalive_time",
293 .data = &sysctl_tcp_keepalive_time, 215 .data = &sysctl_tcp_keepalive_time,
294 .maxlen = sizeof(int), 216 .maxlen = sizeof(int),
295 .mode = 0644, 217 .mode = 0644,
296 .proc_handler = proc_dointvec_jiffies, 218 .proc_handler = proc_dointvec_jiffies,
297 .strategy = sysctl_jiffies
298 }, 219 },
299 { 220 {
300 .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
301 .procname = "tcp_keepalive_probes", 221 .procname = "tcp_keepalive_probes",
302 .data = &sysctl_tcp_keepalive_probes, 222 .data = &sysctl_tcp_keepalive_probes,
303 .maxlen = sizeof(int), 223 .maxlen = sizeof(int),
@@ -305,26 +225,21 @@ static struct ctl_table ipv4_table[] = {
305 .proc_handler = proc_dointvec 225 .proc_handler = proc_dointvec
306 }, 226 },
307 { 227 {
308 .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
309 .procname = "tcp_keepalive_intvl", 228 .procname = "tcp_keepalive_intvl",
310 .data = &sysctl_tcp_keepalive_intvl, 229 .data = &sysctl_tcp_keepalive_intvl,
311 .maxlen = sizeof(int), 230 .maxlen = sizeof(int),
312 .mode = 0644, 231 .mode = 0644,
313 .proc_handler = proc_dointvec_jiffies, 232 .proc_handler = proc_dointvec_jiffies,
314 .strategy = sysctl_jiffies
315 }, 233 },
316 { 234 {
317 .ctl_name = NET_IPV4_TCP_RETRIES1,
318 .procname = "tcp_retries1", 235 .procname = "tcp_retries1",
319 .data = &sysctl_tcp_retries1, 236 .data = &sysctl_tcp_retries1,
320 .maxlen = sizeof(int), 237 .maxlen = sizeof(int),
321 .mode = 0644, 238 .mode = 0644,
322 .proc_handler = proc_dointvec_minmax, 239 .proc_handler = proc_dointvec_minmax,
323 .strategy = sysctl_intvec,
324 .extra2 = &tcp_retr1_max 240 .extra2 = &tcp_retr1_max
325 }, 241 },
326 { 242 {
327 .ctl_name = NET_IPV4_TCP_RETRIES2,
328 .procname = "tcp_retries2", 243 .procname = "tcp_retries2",
329 .data = &sysctl_tcp_retries2, 244 .data = &sysctl_tcp_retries2,
330 .maxlen = sizeof(int), 245 .maxlen = sizeof(int),
@@ -332,17 +247,14 @@ static struct ctl_table ipv4_table[] = {
332 .proc_handler = proc_dointvec 247 .proc_handler = proc_dointvec
333 }, 248 },
334 { 249 {
335 .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
336 .procname = "tcp_fin_timeout", 250 .procname = "tcp_fin_timeout",
337 .data = &sysctl_tcp_fin_timeout, 251 .data = &sysctl_tcp_fin_timeout,
338 .maxlen = sizeof(int), 252 .maxlen = sizeof(int),
339 .mode = 0644, 253 .mode = 0644,
340 .proc_handler = proc_dointvec_jiffies, 254 .proc_handler = proc_dointvec_jiffies,
341 .strategy = sysctl_jiffies
342 }, 255 },
343#ifdef CONFIG_SYN_COOKIES 256#ifdef CONFIG_SYN_COOKIES
344 { 257 {
345 .ctl_name = NET_TCP_SYNCOOKIES,
346 .procname = "tcp_syncookies", 258 .procname = "tcp_syncookies",
347 .data = &sysctl_tcp_syncookies, 259 .data = &sysctl_tcp_syncookies,
348 .maxlen = sizeof(int), 260 .maxlen = sizeof(int),
@@ -351,7 +263,6 @@ static struct ctl_table ipv4_table[] = {
351 }, 263 },
352#endif 264#endif
353 { 265 {
354 .ctl_name = NET_TCP_TW_RECYCLE,
355 .procname = "tcp_tw_recycle", 266 .procname = "tcp_tw_recycle",
356 .data = &tcp_death_row.sysctl_tw_recycle, 267 .data = &tcp_death_row.sysctl_tw_recycle,
357 .maxlen = sizeof(int), 268 .maxlen = sizeof(int),
@@ -359,7 +270,6 @@ static struct ctl_table ipv4_table[] = {
359 .proc_handler = proc_dointvec 270 .proc_handler = proc_dointvec
360 }, 271 },
361 { 272 {
362 .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
363 .procname = "tcp_abort_on_overflow", 273 .procname = "tcp_abort_on_overflow",
364 .data = &sysctl_tcp_abort_on_overflow, 274 .data = &sysctl_tcp_abort_on_overflow,
365 .maxlen = sizeof(int), 275 .maxlen = sizeof(int),
@@ -367,7 +277,6 @@ static struct ctl_table ipv4_table[] = {
367 .proc_handler = proc_dointvec 277 .proc_handler = proc_dointvec
368 }, 278 },
369 { 279 {
370 .ctl_name = NET_TCP_STDURG,
371 .procname = "tcp_stdurg", 280 .procname = "tcp_stdurg",
372 .data = &sysctl_tcp_stdurg, 281 .data = &sysctl_tcp_stdurg,
373 .maxlen = sizeof(int), 282 .maxlen = sizeof(int),
@@ -375,7 +284,6 @@ static struct ctl_table ipv4_table[] = {
375 .proc_handler = proc_dointvec 284 .proc_handler = proc_dointvec
376 }, 285 },
377 { 286 {
378 .ctl_name = NET_TCP_RFC1337,
379 .procname = "tcp_rfc1337", 287 .procname = "tcp_rfc1337",
380 .data = &sysctl_tcp_rfc1337, 288 .data = &sysctl_tcp_rfc1337,
381 .maxlen = sizeof(int), 289 .maxlen = sizeof(int),
@@ -383,7 +291,6 @@ static struct ctl_table ipv4_table[] = {
383 .proc_handler = proc_dointvec 291 .proc_handler = proc_dointvec
384 }, 292 },
385 { 293 {
386 .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
387 .procname = "tcp_max_syn_backlog", 294 .procname = "tcp_max_syn_backlog",
388 .data = &sysctl_max_syn_backlog, 295 .data = &sysctl_max_syn_backlog,
389 .maxlen = sizeof(int), 296 .maxlen = sizeof(int),
@@ -391,17 +298,21 @@ static struct ctl_table ipv4_table[] = {
391 .proc_handler = proc_dointvec 298 .proc_handler = proc_dointvec
392 }, 299 },
393 { 300 {
394 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
395 .procname = "ip_local_port_range", 301 .procname = "ip_local_port_range",
396 .data = &sysctl_local_ports.range, 302 .data = &sysctl_local_ports.range,
397 .maxlen = sizeof(sysctl_local_ports.range), 303 .maxlen = sizeof(sysctl_local_ports.range),
398 .mode = 0644, 304 .mode = 0644,
399 .proc_handler = ipv4_local_port_range, 305 .proc_handler = ipv4_local_port_range,
400 .strategy = ipv4_sysctl_local_port_range, 306 },
307 {
308 .procname = "ip_local_reserved_ports",
309 .data = NULL, /* initialized in sysctl_ipv4_init */
310 .maxlen = 65536,
311 .mode = 0644,
312 .proc_handler = proc_do_large_bitmap,
401 }, 313 },
402#ifdef CONFIG_IP_MULTICAST 314#ifdef CONFIG_IP_MULTICAST
403 { 315 {
404 .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS,
405 .procname = "igmp_max_memberships", 316 .procname = "igmp_max_memberships",
406 .data = &sysctl_igmp_max_memberships, 317 .data = &sysctl_igmp_max_memberships,
407 .maxlen = sizeof(int), 318 .maxlen = sizeof(int),
@@ -411,7 +322,6 @@ static struct ctl_table ipv4_table[] = {
411 322
412#endif 323#endif
413 { 324 {
414 .ctl_name = NET_IPV4_IGMP_MAX_MSF,
415 .procname = "igmp_max_msf", 325 .procname = "igmp_max_msf",
416 .data = &sysctl_igmp_max_msf, 326 .data = &sysctl_igmp_max_msf,
417 .maxlen = sizeof(int), 327 .maxlen = sizeof(int),
@@ -419,7 +329,6 @@ static struct ctl_table ipv4_table[] = {
419 .proc_handler = proc_dointvec 329 .proc_handler = proc_dointvec
420 }, 330 },
421 { 331 {
422 .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
423 .procname = "inet_peer_threshold", 332 .procname = "inet_peer_threshold",
424 .data = &inet_peer_threshold, 333 .data = &inet_peer_threshold,
425 .maxlen = sizeof(int), 334 .maxlen = sizeof(int),
@@ -427,43 +336,34 @@ static struct ctl_table ipv4_table[] = {
427 .proc_handler = proc_dointvec 336 .proc_handler = proc_dointvec
428 }, 337 },
429 { 338 {
430 .ctl_name = NET_IPV4_INET_PEER_MINTTL,
431 .procname = "inet_peer_minttl", 339 .procname = "inet_peer_minttl",
432 .data = &inet_peer_minttl, 340 .data = &inet_peer_minttl,
433 .maxlen = sizeof(int), 341 .maxlen = sizeof(int),
434 .mode = 0644, 342 .mode = 0644,
435 .proc_handler = proc_dointvec_jiffies, 343 .proc_handler = proc_dointvec_jiffies,
436 .strategy = sysctl_jiffies
437 }, 344 },
438 { 345 {
439 .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
440 .procname = "inet_peer_maxttl", 346 .procname = "inet_peer_maxttl",
441 .data = &inet_peer_maxttl, 347 .data = &inet_peer_maxttl,
442 .maxlen = sizeof(int), 348 .maxlen = sizeof(int),
443 .mode = 0644, 349 .mode = 0644,
444 .proc_handler = proc_dointvec_jiffies, 350 .proc_handler = proc_dointvec_jiffies,
445 .strategy = sysctl_jiffies
446 }, 351 },
447 { 352 {
448 .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
449 .procname = "inet_peer_gc_mintime", 353 .procname = "inet_peer_gc_mintime",
450 .data = &inet_peer_gc_mintime, 354 .data = &inet_peer_gc_mintime,
451 .maxlen = sizeof(int), 355 .maxlen = sizeof(int),
452 .mode = 0644, 356 .mode = 0644,
453 .proc_handler = proc_dointvec_jiffies, 357 .proc_handler = proc_dointvec_jiffies,
454 .strategy = sysctl_jiffies
455 }, 358 },
456 { 359 {
457 .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
458 .procname = "inet_peer_gc_maxtime", 360 .procname = "inet_peer_gc_maxtime",
459 .data = &inet_peer_gc_maxtime, 361 .data = &inet_peer_gc_maxtime,
460 .maxlen = sizeof(int), 362 .maxlen = sizeof(int),
461 .mode = 0644, 363 .mode = 0644,
462 .proc_handler = proc_dointvec_jiffies, 364 .proc_handler = proc_dointvec_jiffies,
463 .strategy = sysctl_jiffies
464 }, 365 },
465 { 366 {
466 .ctl_name = NET_TCP_ORPHAN_RETRIES,
467 .procname = "tcp_orphan_retries", 367 .procname = "tcp_orphan_retries",
468 .data = &sysctl_tcp_orphan_retries, 368 .data = &sysctl_tcp_orphan_retries,
469 .maxlen = sizeof(int), 369 .maxlen = sizeof(int),
@@ -471,7 +371,6 @@ static struct ctl_table ipv4_table[] = {
471 .proc_handler = proc_dointvec 371 .proc_handler = proc_dointvec
472 }, 372 },
473 { 373 {
474 .ctl_name = NET_TCP_FACK,
475 .procname = "tcp_fack", 374 .procname = "tcp_fack",
476 .data = &sysctl_tcp_fack, 375 .data = &sysctl_tcp_fack,
477 .maxlen = sizeof(int), 376 .maxlen = sizeof(int),
@@ -479,7 +378,6 @@ static struct ctl_table ipv4_table[] = {
479 .proc_handler = proc_dointvec 378 .proc_handler = proc_dointvec
480 }, 379 },
481 { 380 {
482 .ctl_name = NET_TCP_REORDERING,
483 .procname = "tcp_reordering", 381 .procname = "tcp_reordering",
484 .data = &sysctl_tcp_reordering, 382 .data = &sysctl_tcp_reordering,
485 .maxlen = sizeof(int), 383 .maxlen = sizeof(int),
@@ -487,7 +385,6 @@ static struct ctl_table ipv4_table[] = {
487 .proc_handler = proc_dointvec 385 .proc_handler = proc_dointvec
488 }, 386 },
489 { 387 {
490 .ctl_name = NET_TCP_ECN,
491 .procname = "tcp_ecn", 388 .procname = "tcp_ecn",
492 .data = &sysctl_tcp_ecn, 389 .data = &sysctl_tcp_ecn,
493 .maxlen = sizeof(int), 390 .maxlen = sizeof(int),
@@ -495,7 +392,6 @@ static struct ctl_table ipv4_table[] = {
495 .proc_handler = proc_dointvec 392 .proc_handler = proc_dointvec
496 }, 393 },
497 { 394 {
498 .ctl_name = NET_TCP_DSACK,
499 .procname = "tcp_dsack", 395 .procname = "tcp_dsack",
500 .data = &sysctl_tcp_dsack, 396 .data = &sysctl_tcp_dsack,
501 .maxlen = sizeof(int), 397 .maxlen = sizeof(int),
@@ -503,15 +399,13 @@ static struct ctl_table ipv4_table[] = {
503 .proc_handler = proc_dointvec 399 .proc_handler = proc_dointvec
504 }, 400 },
505 { 401 {
506 .ctl_name = NET_TCP_MEM,
507 .procname = "tcp_mem", 402 .procname = "tcp_mem",
508 .data = &sysctl_tcp_mem, 403 .data = &sysctl_tcp_mem,
509 .maxlen = sizeof(sysctl_tcp_mem), 404 .maxlen = sizeof(sysctl_tcp_mem),
510 .mode = 0644, 405 .mode = 0644,
511 .proc_handler = proc_dointvec 406 .proc_handler = proc_doulongvec_minmax
512 }, 407 },
513 { 408 {
514 .ctl_name = NET_TCP_WMEM,
515 .procname = "tcp_wmem", 409 .procname = "tcp_wmem",
516 .data = &sysctl_tcp_wmem, 410 .data = &sysctl_tcp_wmem,
517 .maxlen = sizeof(sysctl_tcp_wmem), 411 .maxlen = sizeof(sysctl_tcp_wmem),
@@ -519,7 +413,6 @@ static struct ctl_table ipv4_table[] = {
519 .proc_handler = proc_dointvec 413 .proc_handler = proc_dointvec
520 }, 414 },
521 { 415 {
522 .ctl_name = NET_TCP_RMEM,
523 .procname = "tcp_rmem", 416 .procname = "tcp_rmem",
524 .data = &sysctl_tcp_rmem, 417 .data = &sysctl_tcp_rmem,
525 .maxlen = sizeof(sysctl_tcp_rmem), 418 .maxlen = sizeof(sysctl_tcp_rmem),
@@ -527,7 +420,6 @@ static struct ctl_table ipv4_table[] = {
527 .proc_handler = proc_dointvec 420 .proc_handler = proc_dointvec
528 }, 421 },
529 { 422 {
530 .ctl_name = NET_TCP_APP_WIN,
531 .procname = "tcp_app_win", 423 .procname = "tcp_app_win",
532 .data = &sysctl_tcp_app_win, 424 .data = &sysctl_tcp_app_win,
533 .maxlen = sizeof(int), 425 .maxlen = sizeof(int),
@@ -535,15 +427,15 @@ static struct ctl_table ipv4_table[] = {
535 .proc_handler = proc_dointvec 427 .proc_handler = proc_dointvec
536 }, 428 },
537 { 429 {
538 .ctl_name = NET_TCP_ADV_WIN_SCALE,
539 .procname = "tcp_adv_win_scale", 430 .procname = "tcp_adv_win_scale",
540 .data = &sysctl_tcp_adv_win_scale, 431 .data = &sysctl_tcp_adv_win_scale,
541 .maxlen = sizeof(int), 432 .maxlen = sizeof(int),
542 .mode = 0644, 433 .mode = 0644,
543 .proc_handler = proc_dointvec 434 .proc_handler = proc_dointvec_minmax,
435 .extra1 = &tcp_adv_win_scale_min,
436 .extra2 = &tcp_adv_win_scale_max,
544 }, 437 },
545 { 438 {
546 .ctl_name = NET_TCP_TW_REUSE,
547 .procname = "tcp_tw_reuse", 439 .procname = "tcp_tw_reuse",
548 .data = &sysctl_tcp_tw_reuse, 440 .data = &sysctl_tcp_tw_reuse,
549 .maxlen = sizeof(int), 441 .maxlen = sizeof(int),
@@ -551,7 +443,6 @@ static struct ctl_table ipv4_table[] = {
551 .proc_handler = proc_dointvec 443 .proc_handler = proc_dointvec
552 }, 444 },
553 { 445 {
554 .ctl_name = NET_TCP_FRTO,
555 .procname = "tcp_frto", 446 .procname = "tcp_frto",
556 .data = &sysctl_tcp_frto, 447 .data = &sysctl_tcp_frto,
557 .maxlen = sizeof(int), 448 .maxlen = sizeof(int),
@@ -559,7 +450,6 @@ static struct ctl_table ipv4_table[] = {
559 .proc_handler = proc_dointvec 450 .proc_handler = proc_dointvec
560 }, 451 },
561 { 452 {
562 .ctl_name = NET_TCP_FRTO_RESPONSE,
563 .procname = "tcp_frto_response", 453 .procname = "tcp_frto_response",
564 .data = &sysctl_tcp_frto_response, 454 .data = &sysctl_tcp_frto_response,
565 .maxlen = sizeof(int), 455 .maxlen = sizeof(int),
@@ -567,7 +457,6 @@ static struct ctl_table ipv4_table[] = {
567 .proc_handler = proc_dointvec 457 .proc_handler = proc_dointvec
568 }, 458 },
569 { 459 {
570 .ctl_name = NET_TCP_LOW_LATENCY,
571 .procname = "tcp_low_latency", 460 .procname = "tcp_low_latency",
572 .data = &sysctl_tcp_low_latency, 461 .data = &sysctl_tcp_low_latency,
573 .maxlen = sizeof(int), 462 .maxlen = sizeof(int),
@@ -575,7 +464,6 @@ static struct ctl_table ipv4_table[] = {
575 .proc_handler = proc_dointvec 464 .proc_handler = proc_dointvec
576 }, 465 },
577 { 466 {
578 .ctl_name = NET_TCP_NO_METRICS_SAVE,
579 .procname = "tcp_no_metrics_save", 467 .procname = "tcp_no_metrics_save",
580 .data = &sysctl_tcp_nometrics_save, 468 .data = &sysctl_tcp_nometrics_save,
581 .maxlen = sizeof(int), 469 .maxlen = sizeof(int),
@@ -583,7 +471,6 @@ static struct ctl_table ipv4_table[] = {
583 .proc_handler = proc_dointvec, 471 .proc_handler = proc_dointvec,
584 }, 472 },
585 { 473 {
586 .ctl_name = NET_TCP_MODERATE_RCVBUF,
587 .procname = "tcp_moderate_rcvbuf", 474 .procname = "tcp_moderate_rcvbuf",
588 .data = &sysctl_tcp_moderate_rcvbuf, 475 .data = &sysctl_tcp_moderate_rcvbuf,
589 .maxlen = sizeof(int), 476 .maxlen = sizeof(int),
@@ -591,7 +478,6 @@ static struct ctl_table ipv4_table[] = {
591 .proc_handler = proc_dointvec, 478 .proc_handler = proc_dointvec,
592 }, 479 },
593 { 480 {
594 .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
595 .procname = "tcp_tso_win_divisor", 481 .procname = "tcp_tso_win_divisor",
596 .data = &sysctl_tcp_tso_win_divisor, 482 .data = &sysctl_tcp_tso_win_divisor,
597 .maxlen = sizeof(int), 483 .maxlen = sizeof(int),
@@ -599,15 +485,12 @@ static struct ctl_table ipv4_table[] = {
599 .proc_handler = proc_dointvec, 485 .proc_handler = proc_dointvec,
600 }, 486 },
601 { 487 {
602 .ctl_name = NET_TCP_CONG_CONTROL,
603 .procname = "tcp_congestion_control", 488 .procname = "tcp_congestion_control",
604 .mode = 0644, 489 .mode = 0644,
605 .maxlen = TCP_CA_NAME_MAX, 490 .maxlen = TCP_CA_NAME_MAX,
606 .proc_handler = proc_tcp_congestion_control, 491 .proc_handler = proc_tcp_congestion_control,
607 .strategy = sysctl_tcp_congestion_control,
608 }, 492 },
609 { 493 {
610 .ctl_name = NET_TCP_ABC,
611 .procname = "tcp_abc", 494 .procname = "tcp_abc",
612 .data = &sysctl_tcp_abc, 495 .data = &sysctl_tcp_abc,
613 .maxlen = sizeof(int), 496 .maxlen = sizeof(int),
@@ -615,7 +498,6 @@ static struct ctl_table ipv4_table[] = {
615 .proc_handler = proc_dointvec, 498 .proc_handler = proc_dointvec,
616 }, 499 },
617 { 500 {
618 .ctl_name = NET_TCP_MTU_PROBING,
619 .procname = "tcp_mtu_probing", 501 .procname = "tcp_mtu_probing",
620 .data = &sysctl_tcp_mtu_probing, 502 .data = &sysctl_tcp_mtu_probing,
621 .maxlen = sizeof(int), 503 .maxlen = sizeof(int),
@@ -623,7 +505,6 @@ static struct ctl_table ipv4_table[] = {
623 .proc_handler = proc_dointvec, 505 .proc_handler = proc_dointvec,
624 }, 506 },
625 { 507 {
626 .ctl_name = NET_TCP_BASE_MSS,
627 .procname = "tcp_base_mss", 508 .procname = "tcp_base_mss",
628 .data = &sysctl_tcp_base_mss, 509 .data = &sysctl_tcp_base_mss,
629 .maxlen = sizeof(int), 510 .maxlen = sizeof(int),
@@ -631,7 +512,6 @@ static struct ctl_table ipv4_table[] = {
631 .proc_handler = proc_dointvec, 512 .proc_handler = proc_dointvec,
632 }, 513 },
633 { 514 {
634 .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,
635 .procname = "tcp_workaround_signed_windows", 515 .procname = "tcp_workaround_signed_windows",
636 .data = &sysctl_tcp_workaround_signed_windows, 516 .data = &sysctl_tcp_workaround_signed_windows,
637 .maxlen = sizeof(int), 517 .maxlen = sizeof(int),
@@ -640,7 +520,6 @@ static struct ctl_table ipv4_table[] = {
640 }, 520 },
641#ifdef CONFIG_NET_DMA 521#ifdef CONFIG_NET_DMA
642 { 522 {
643 .ctl_name = NET_TCP_DMA_COPYBREAK,
644 .procname = "tcp_dma_copybreak", 523 .procname = "tcp_dma_copybreak",
645 .data = &sysctl_tcp_dma_copybreak, 524 .data = &sysctl_tcp_dma_copybreak,
646 .maxlen = sizeof(int), 525 .maxlen = sizeof(int),
@@ -649,7 +528,6 @@ static struct ctl_table ipv4_table[] = {
649 }, 528 },
650#endif 529#endif
651 { 530 {
652 .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE,
653 .procname = "tcp_slow_start_after_idle", 531 .procname = "tcp_slow_start_after_idle",
654 .data = &sysctl_tcp_slow_start_after_idle, 532 .data = &sysctl_tcp_slow_start_after_idle,
655 .maxlen = sizeof(int), 533 .maxlen = sizeof(int),
@@ -658,7 +536,6 @@ static struct ctl_table ipv4_table[] = {
658 }, 536 },
659#ifdef CONFIG_NETLABEL 537#ifdef CONFIG_NETLABEL
660 { 538 {
661 .ctl_name = NET_CIPSOV4_CACHE_ENABLE,
662 .procname = "cipso_cache_enable", 539 .procname = "cipso_cache_enable",
663 .data = &cipso_v4_cache_enabled, 540 .data = &cipso_v4_cache_enabled,
664 .maxlen = sizeof(int), 541 .maxlen = sizeof(int),
@@ -666,7 +543,6 @@ static struct ctl_table ipv4_table[] = {
666 .proc_handler = proc_dointvec, 543 .proc_handler = proc_dointvec,
667 }, 544 },
668 { 545 {
669 .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE,
670 .procname = "cipso_cache_bucket_size", 546 .procname = "cipso_cache_bucket_size",
671 .data = &cipso_v4_cache_bucketsize, 547 .data = &cipso_v4_cache_bucketsize,
672 .maxlen = sizeof(int), 548 .maxlen = sizeof(int),
@@ -674,7 +550,6 @@ static struct ctl_table ipv4_table[] = {
674 .proc_handler = proc_dointvec, 550 .proc_handler = proc_dointvec,
675 }, 551 },
676 { 552 {
677 .ctl_name = NET_CIPSOV4_RBM_OPTFMT,
678 .procname = "cipso_rbm_optfmt", 553 .procname = "cipso_rbm_optfmt",
679 .data = &cipso_v4_rbm_optfmt, 554 .data = &cipso_v4_rbm_optfmt,
680 .maxlen = sizeof(int), 555 .maxlen = sizeof(int),
@@ -682,7 +557,6 @@ static struct ctl_table ipv4_table[] = {
682 .proc_handler = proc_dointvec, 557 .proc_handler = proc_dointvec,
683 }, 558 },
684 { 559 {
685 .ctl_name = NET_CIPSOV4_RBM_STRICTVALID,
686 .procname = "cipso_rbm_strictvalid", 560 .procname = "cipso_rbm_strictvalid",
687 .data = &cipso_v4_rbm_strictvalid, 561 .data = &cipso_v4_rbm_strictvalid,
688 .maxlen = sizeof(int), 562 .maxlen = sizeof(int),
@@ -697,15 +571,12 @@ static struct ctl_table ipv4_table[] = {
697 .proc_handler = proc_tcp_available_congestion_control, 571 .proc_handler = proc_tcp_available_congestion_control,
698 }, 572 },
699 { 573 {
700 .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL,
701 .procname = "tcp_allowed_congestion_control", 574 .procname = "tcp_allowed_congestion_control",
702 .maxlen = TCP_CA_BUF_MAX, 575 .maxlen = TCP_CA_BUF_MAX,
703 .mode = 0644, 576 .mode = 0644,
704 .proc_handler = proc_allowed_congestion_control, 577 .proc_handler = proc_allowed_congestion_control,
705 .strategy = strategy_allowed_congestion_control,
706 }, 578 },
707 { 579 {
708 .ctl_name = NET_TCP_MAX_SSTHRESH,
709 .procname = "tcp_max_ssthresh", 580 .procname = "tcp_max_ssthresh",
710 .data = &sysctl_tcp_max_ssthresh, 581 .data = &sysctl_tcp_max_ssthresh,
711 .maxlen = sizeof(int), 582 .maxlen = sizeof(int),
@@ -713,41 +584,54 @@ static struct ctl_table ipv4_table[] = {
713 .proc_handler = proc_dointvec, 584 .proc_handler = proc_dointvec,
714 }, 585 },
715 { 586 {
716 .ctl_name = CTL_UNNUMBERED, 587 .procname = "tcp_cookie_size",
588 .data = &sysctl_tcp_cookie_size,
589 .maxlen = sizeof(int),
590 .mode = 0644,
591 .proc_handler = proc_dointvec
592 },
593 {
594 .procname = "tcp_thin_linear_timeouts",
595 .data = &sysctl_tcp_thin_linear_timeouts,
596 .maxlen = sizeof(int),
597 .mode = 0644,
598 .proc_handler = proc_dointvec
599 },
600 {
601 .procname = "tcp_thin_dupack",
602 .data = &sysctl_tcp_thin_dupack,
603 .maxlen = sizeof(int),
604 .mode = 0644,
605 .proc_handler = proc_dointvec
606 },
607 {
717 .procname = "udp_mem", 608 .procname = "udp_mem",
718 .data = &sysctl_udp_mem, 609 .data = &sysctl_udp_mem,
719 .maxlen = sizeof(sysctl_udp_mem), 610 .maxlen = sizeof(sysctl_udp_mem),
720 .mode = 0644, 611 .mode = 0644,
721 .proc_handler = proc_dointvec_minmax, 612 .proc_handler = proc_doulongvec_minmax,
722 .strategy = sysctl_intvec,
723 .extra1 = &zero
724 }, 613 },
725 { 614 {
726 .ctl_name = CTL_UNNUMBERED,
727 .procname = "udp_rmem_min", 615 .procname = "udp_rmem_min",
728 .data = &sysctl_udp_rmem_min, 616 .data = &sysctl_udp_rmem_min,
729 .maxlen = sizeof(sysctl_udp_rmem_min), 617 .maxlen = sizeof(sysctl_udp_rmem_min),
730 .mode = 0644, 618 .mode = 0644,
731 .proc_handler = proc_dointvec_minmax, 619 .proc_handler = proc_dointvec_minmax,
732 .strategy = sysctl_intvec,
733 .extra1 = &zero 620 .extra1 = &zero
734 }, 621 },
735 { 622 {
736 .ctl_name = CTL_UNNUMBERED,
737 .procname = "udp_wmem_min", 623 .procname = "udp_wmem_min",
738 .data = &sysctl_udp_wmem_min, 624 .data = &sysctl_udp_wmem_min,
739 .maxlen = sizeof(sysctl_udp_wmem_min), 625 .maxlen = sizeof(sysctl_udp_wmem_min),
740 .mode = 0644, 626 .mode = 0644,
741 .proc_handler = proc_dointvec_minmax, 627 .proc_handler = proc_dointvec_minmax,
742 .strategy = sysctl_intvec,
743 .extra1 = &zero 628 .extra1 = &zero
744 }, 629 },
745 { .ctl_name = 0 } 630 { }
746}; 631};
747 632
748static struct ctl_table ipv4_net_table[] = { 633static struct ctl_table ipv4_net_table[] = {
749 { 634 {
750 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
751 .procname = "icmp_echo_ignore_all", 635 .procname = "icmp_echo_ignore_all",
752 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, 636 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
753 .maxlen = sizeof(int), 637 .maxlen = sizeof(int),
@@ -755,7 +639,6 @@ static struct ctl_table ipv4_net_table[] = {
755 .proc_handler = proc_dointvec 639 .proc_handler = proc_dointvec
756 }, 640 },
757 { 641 {
758 .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
759 .procname = "icmp_echo_ignore_broadcasts", 642 .procname = "icmp_echo_ignore_broadcasts",
760 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, 643 .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
761 .maxlen = sizeof(int), 644 .maxlen = sizeof(int),
@@ -763,7 +646,6 @@ static struct ctl_table ipv4_net_table[] = {
763 .proc_handler = proc_dointvec 646 .proc_handler = proc_dointvec
764 }, 647 },
765 { 648 {
766 .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
767 .procname = "icmp_ignore_bogus_error_responses", 649 .procname = "icmp_ignore_bogus_error_responses",
768 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, 650 .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
769 .maxlen = sizeof(int), 651 .maxlen = sizeof(int),
@@ -771,7 +653,6 @@ static struct ctl_table ipv4_net_table[] = {
771 .proc_handler = proc_dointvec 653 .proc_handler = proc_dointvec
772 }, 654 },
773 { 655 {
774 .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
775 .procname = "icmp_errors_use_inbound_ifaddr", 656 .procname = "icmp_errors_use_inbound_ifaddr",
776 .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, 657 .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
777 .maxlen = sizeof(int), 658 .maxlen = sizeof(int),
@@ -779,16 +660,13 @@ static struct ctl_table ipv4_net_table[] = {
779 .proc_handler = proc_dointvec 660 .proc_handler = proc_dointvec
780 }, 661 },
781 { 662 {
782 .ctl_name = NET_IPV4_ICMP_RATELIMIT,
783 .procname = "icmp_ratelimit", 663 .procname = "icmp_ratelimit",
784 .data = &init_net.ipv4.sysctl_icmp_ratelimit, 664 .data = &init_net.ipv4.sysctl_icmp_ratelimit,
785 .maxlen = sizeof(int), 665 .maxlen = sizeof(int),
786 .mode = 0644, 666 .mode = 0644,
787 .proc_handler = proc_dointvec_ms_jiffies, 667 .proc_handler = proc_dointvec_ms_jiffies,
788 .strategy = sysctl_ms_jiffies
789 }, 668 },
790 { 669 {
791 .ctl_name = NET_IPV4_ICMP_RATEMASK,
792 .procname = "icmp_ratemask", 670 .procname = "icmp_ratemask",
793 .data = &init_net.ipv4.sysctl_icmp_ratemask, 671 .data = &init_net.ipv4.sysctl_icmp_ratemask,
794 .maxlen = sizeof(int), 672 .maxlen = sizeof(int),
@@ -796,7 +674,6 @@ static struct ctl_table ipv4_net_table[] = {
796 .proc_handler = proc_dointvec 674 .proc_handler = proc_dointvec
797 }, 675 },
798 { 676 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "rt_cache_rebuild_count", 677 .procname = "rt_cache_rebuild_count",
801 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, 678 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
802 .maxlen = sizeof(int), 679 .maxlen = sizeof(int),
@@ -807,8 +684,8 @@ static struct ctl_table ipv4_net_table[] = {
807}; 684};
808 685
809struct ctl_path net_ipv4_ctl_path[] = { 686struct ctl_path net_ipv4_ctl_path[] = {
810 { .procname = "net", .ctl_name = CTL_NET, }, 687 { .procname = "net", },
811 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 688 { .procname = "ipv4", },
812 { }, 689 { },
813}; 690};
814EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); 691EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
@@ -818,7 +695,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
818 struct ctl_table *table; 695 struct ctl_table *table;
819 696
820 table = ipv4_net_table; 697 table = ipv4_net_table;
821 if (net != &init_net) { 698 if (!net_eq(net, &init_net)) {
822 table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); 699 table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
823 if (table == NULL) 700 if (table == NULL)
824 goto err_alloc; 701 goto err_alloc;
@@ -849,7 +726,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
849 return 0; 726 return 0;
850 727
851err_reg: 728err_reg:
852 if (net != &init_net) 729 if (!net_eq(net, &init_net))
853 kfree(table); 730 kfree(table);
854err_alloc: 731err_alloc:
855 return -ENOMEM; 732 return -ENOMEM;
@@ -872,6 +749,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
872static __init int sysctl_ipv4_init(void) 749static __init int sysctl_ipv4_init(void)
873{ 750{
874 struct ctl_table_header *hdr; 751 struct ctl_table_header *hdr;
752 struct ctl_table *i;
753
754 for (i = ipv4_table; i->procname; i++) {
755 if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
756 i->data = sysctl_local_reserved_ports;
757 break;
758 }
759 }
760 if (!i->procname)
761 return -EINVAL;
875 762
876 hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); 763 hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
877 if (hdr == NULL) 764 if (hdr == NULL)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 17b89c523f9d..6c11eece262c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,8 @@
264#include <linux/cache.h> 264#include <linux/cache.h>
265#include <linux/err.h> 265#include <linux/err.h>
266#include <linux/crypto.h> 266#include <linux/crypto.h>
267#include <linux/time.h>
268#include <linux/slab.h>
267 269
268#include <net/icmp.h> 270#include <net/icmp.h>
269#include <net/tcp.h> 271#include <net/tcp.h>
@@ -280,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
280struct percpu_counter tcp_orphan_count; 282struct percpu_counter tcp_orphan_count;
281EXPORT_SYMBOL_GPL(tcp_orphan_count); 283EXPORT_SYMBOL_GPL(tcp_orphan_count);
282 284
283int sysctl_tcp_mem[3] __read_mostly; 285long sysctl_tcp_mem[3] __read_mostly;
284int sysctl_tcp_wmem[3] __read_mostly; 286int sysctl_tcp_wmem[3] __read_mostly;
285int sysctl_tcp_rmem[3] __read_mostly; 287int sysctl_tcp_rmem[3] __read_mostly;
286 288
@@ -288,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
288EXPORT_SYMBOL(sysctl_tcp_rmem); 290EXPORT_SYMBOL(sysctl_tcp_rmem);
289EXPORT_SYMBOL(sysctl_tcp_wmem); 291EXPORT_SYMBOL(sysctl_tcp_wmem);
290 292
291atomic_t tcp_memory_allocated; /* Current allocated memory. */ 293atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
292EXPORT_SYMBOL(tcp_memory_allocated); 294EXPORT_SYMBOL(tcp_memory_allocated);
293 295
294/* 296/*
@@ -313,7 +315,6 @@ struct tcp_splice_state {
313 * is strict, actions are advisory and have some latency. 315 * is strict, actions are advisory and have some latency.
314 */ 316 */
315int tcp_memory_pressure __read_mostly; 317int tcp_memory_pressure __read_mostly;
316
317EXPORT_SYMBOL(tcp_memory_pressure); 318EXPORT_SYMBOL(tcp_memory_pressure);
318 319
319void tcp_enter_memory_pressure(struct sock *sk) 320void tcp_enter_memory_pressure(struct sock *sk)
@@ -323,9 +324,45 @@ void tcp_enter_memory_pressure(struct sock *sk)
323 tcp_memory_pressure = 1; 324 tcp_memory_pressure = 1;
324 } 325 }
325} 326}
326
327EXPORT_SYMBOL(tcp_enter_memory_pressure); 327EXPORT_SYMBOL(tcp_enter_memory_pressure);
328 328
329/* Convert seconds to retransmits based on initial and max timeout */
330static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
331{
332 u8 res = 0;
333
334 if (seconds > 0) {
335 int period = timeout;
336
337 res = 1;
338 while (seconds > period && res < 255) {
339 res++;
340 timeout <<= 1;
341 if (timeout > rto_max)
342 timeout = rto_max;
343 period += timeout;
344 }
345 }
346 return res;
347}
348
349/* Convert retransmits to seconds based on initial and max timeout */
350static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
351{
352 int period = 0;
353
354 if (retrans > 0) {
355 period = timeout;
356 while (--retrans) {
357 timeout <<= 1;
358 if (timeout > rto_max)
359 timeout = rto_max;
360 period += timeout;
361 }
362 }
363 return period;
364}
365
329/* 366/*
330 * Wait for a TCP event. 367 * Wait for a TCP event.
331 * 368 *
@@ -339,7 +376,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
339 struct sock *sk = sock->sk; 376 struct sock *sk = sock->sk;
340 struct tcp_sock *tp = tcp_sk(sk); 377 struct tcp_sock *tp = tcp_sk(sk);
341 378
342 poll_wait(file, sk->sk_sleep, wait); 379 sock_poll_wait(file, sk_sleep(sk), wait);
343 if (sk->sk_state == TCP_LISTEN) 380 if (sk->sk_state == TCP_LISTEN)
344 return inet_csk_listen_poll(sk); 381 return inet_csk_listen_poll(sk);
345 382
@@ -349,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
349 */ 386 */
350 387
351 mask = 0; 388 mask = 0;
352 if (sk->sk_err)
353 mask = POLLERR;
354 389
355 /* 390 /*
356 * POLLHUP is certainly not done right. But poll() doesn't 391 * POLLHUP is certainly not done right. But poll() doesn't
@@ -391,7 +426,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
391 if (tp->urg_seq == tp->copied_seq && 426 if (tp->urg_seq == tp->copied_seq &&
392 !sock_flag(sk, SOCK_URGINLINE) && 427 !sock_flag(sk, SOCK_URGINLINE) &&
393 tp->urg_data) 428 tp->urg_data)
394 target--; 429 target++;
395 430
396 /* Potential race condition. If read of tp below will 431 /* Potential race condition. If read of tp below will
397 * escape above sk->sk_state, we can be illegally awaken 432 * escape above sk->sk_state, we can be illegally awaken
@@ -414,13 +449,20 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
414 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 449 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
415 mask |= POLLOUT | POLLWRNORM; 450 mask |= POLLOUT | POLLWRNORM;
416 } 451 }
417 } 452 } else
453 mask |= POLLOUT | POLLWRNORM;
418 454
419 if (tp->urg_data & TCP_URG_VALID) 455 if (tp->urg_data & TCP_URG_VALID)
420 mask |= POLLPRI; 456 mask |= POLLPRI;
421 } 457 }
458 /* This barrier is coupled with smp_wmb() in tcp_reset() */
459 smp_rmb();
460 if (sk->sk_err)
461 mask |= POLLERR;
462
422 return mask; 463 return mask;
423} 464}
465EXPORT_SYMBOL(tcp_poll);
424 466
425int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 467int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
426{ 468{
@@ -469,10 +511,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
469 511
470 return put_user(answ, (int __user *)arg); 512 return put_user(answ, (int __user *)arg);
471} 513}
514EXPORT_SYMBOL(tcp_ioctl);
472 515
473static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 516static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
474{ 517{
475 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 518 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
476 tp->pushed_seq = tp->write_seq; 519 tp->pushed_seq = tp->write_seq;
477} 520}
478 521
@@ -488,7 +531,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
488 531
489 skb->csum = 0; 532 skb->csum = 0;
490 tcb->seq = tcb->end_seq = tp->write_seq; 533 tcb->seq = tcb->end_seq = tp->write_seq;
491 tcb->flags = TCPCB_FLAG_ACK; 534 tcb->flags = TCPHDR_ACK;
492 tcb->sacked = 0; 535 tcb->sacked = 0;
493 skb_header_release(skb); 536 skb_header_release(skb);
494 tcp_add_write_queue_tail(sk, skb); 537 tcp_add_write_queue_tail(sk, skb);
@@ -498,8 +541,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
498 tp->nonagle &= ~TCP_NAGLE_PUSH; 541 tp->nonagle &= ~TCP_NAGLE_PUSH;
499} 542}
500 543
501static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, 544static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
502 struct sk_buff *skb)
503{ 545{
504 if (flags & MSG_OOB) 546 if (flags & MSG_OOB)
505 tp->snd_up = tp->write_seq; 547 tp->snd_up = tp->write_seq;
@@ -508,13 +550,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
508static inline void tcp_push(struct sock *sk, int flags, int mss_now, 550static inline void tcp_push(struct sock *sk, int flags, int mss_now,
509 int nonagle) 551 int nonagle)
510{ 552{
511 struct tcp_sock *tp = tcp_sk(sk);
512
513 if (tcp_send_head(sk)) { 553 if (tcp_send_head(sk)) {
514 struct sk_buff *skb = tcp_write_queue_tail(sk); 554 struct tcp_sock *tp = tcp_sk(sk);
555
515 if (!(flags & MSG_MORE) || forced_push(tp)) 556 if (!(flags & MSG_MORE) || forced_push(tp))
516 tcp_mark_push(tp, skb); 557 tcp_mark_push(tp, tcp_write_queue_tail(sk));
517 tcp_mark_urg(tp, flags, skb); 558
559 tcp_mark_urg(tp, flags);
518 __tcp_push_pending_frames(sk, mss_now, 560 __tcp_push_pending_frames(sk, mss_now,
519 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); 561 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
520 } 562 }
@@ -570,6 +612,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
570 ssize_t spliced; 612 ssize_t spliced;
571 int ret; 613 int ret;
572 614
615 sock_rps_record_flow(sk);
573 /* 616 /*
574 * We can't seek on a socket input 617 * We can't seek on a socket input
575 */ 618 */
@@ -580,7 +623,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
580 623
581 lock_sock(sk); 624 lock_sock(sk);
582 625
583 timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); 626 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
584 while (tss.len) { 627 while (tss.len) {
585 ret = __tcp_splice_read(sk, &tss); 628 ret = __tcp_splice_read(sk, &tss);
586 if (ret < 0) 629 if (ret < 0)
@@ -637,6 +680,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
637 680
638 return ret; 681 return ret;
639} 682}
683EXPORT_SYMBOL(tcp_splice_read);
640 684
641struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) 685struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
642{ 686{
@@ -777,7 +821,7 @@ new_segment:
777 skb_shinfo(skb)->gso_segs = 0; 821 skb_shinfo(skb)->gso_segs = 0;
778 822
779 if (!copied) 823 if (!copied)
780 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 824 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
781 825
782 copied += copy; 826 copied += copy;
783 poffset += copy; 827 poffset += copy;
@@ -818,15 +862,15 @@ out_err:
818 return sk_stream_error(sk, flags, err); 862 return sk_stream_error(sk, flags, err);
819} 863}
820 864
821ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, 865int tcp_sendpage(struct sock *sk, struct page *page, int offset,
822 size_t size, int flags) 866 size_t size, int flags)
823{ 867{
824 ssize_t res; 868 ssize_t res;
825 struct sock *sk = sock->sk;
826 869
827 if (!(sk->sk_route_caps & NETIF_F_SG) || 870 if (!(sk->sk_route_caps & NETIF_F_SG) ||
828 !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) 871 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
829 return sock_no_sendpage(sock, page, offset, size, flags); 872 return sock_no_sendpage(sk->sk_socket, page, offset, size,
873 flags);
830 874
831 lock_sock(sk); 875 lock_sock(sk);
832 TCP_CHECK_TIMER(sk); 876 TCP_CHECK_TIMER(sk);
@@ -835,16 +879,17 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
835 release_sock(sk); 879 release_sock(sk);
836 return res; 880 return res;
837} 881}
882EXPORT_SYMBOL(tcp_sendpage);
838 883
839#define TCP_PAGE(sk) (sk->sk_sndmsg_page) 884#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
840#define TCP_OFF(sk) (sk->sk_sndmsg_off) 885#define TCP_OFF(sk) (sk->sk_sndmsg_off)
841 886
842static inline int select_size(struct sock *sk) 887static inline int select_size(struct sock *sk, int sg)
843{ 888{
844 struct tcp_sock *tp = tcp_sk(sk); 889 struct tcp_sock *tp = tcp_sk(sk);
845 int tmp = tp->mss_cache; 890 int tmp = tp->mss_cache;
846 891
847 if (sk->sk_route_caps & NETIF_F_SG) { 892 if (sg) {
848 if (sk_can_gso(sk)) 893 if (sk_can_gso(sk))
849 tmp = 0; 894 tmp = 0;
850 else { 895 else {
@@ -859,16 +904,15 @@ static inline int select_size(struct sock *sk)
859 return tmp; 904 return tmp;
860} 905}
861 906
862int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 907int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
863 size_t size) 908 size_t size)
864{ 909{
865 struct sock *sk = sock->sk;
866 struct iovec *iov; 910 struct iovec *iov;
867 struct tcp_sock *tp = tcp_sk(sk); 911 struct tcp_sock *tp = tcp_sk(sk);
868 struct sk_buff *skb; 912 struct sk_buff *skb;
869 int iovlen, flags; 913 int iovlen, flags;
870 int mss_now, size_goal; 914 int mss_now, size_goal;
871 int err, copied; 915 int sg, err, copied;
872 long timeo; 916 long timeo;
873 917
874 lock_sock(sk); 918 lock_sock(sk);
@@ -896,20 +940,26 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
896 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 940 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
897 goto out_err; 941 goto out_err;
898 942
943 sg = sk->sk_route_caps & NETIF_F_SG;
944
899 while (--iovlen >= 0) { 945 while (--iovlen >= 0) {
900 int seglen = iov->iov_len; 946 size_t seglen = iov->iov_len;
901 unsigned char __user *from = iov->iov_base; 947 unsigned char __user *from = iov->iov_base;
902 948
903 iov++; 949 iov++;
904 950
905 while (seglen > 0) { 951 while (seglen > 0) {
906 int copy; 952 int copy = 0;
953 int max = size_goal;
907 954
908 skb = tcp_write_queue_tail(sk); 955 skb = tcp_write_queue_tail(sk);
956 if (tcp_send_head(sk)) {
957 if (skb->ip_summed == CHECKSUM_NONE)
958 max = mss_now;
959 copy = max - skb->len;
960 }
909 961
910 if (!tcp_send_head(sk) || 962 if (copy <= 0) {
911 (copy = size_goal - skb->len) <= 0) {
912
913new_segment: 963new_segment:
914 /* Allocate new segment. If the interface is SG, 964 /* Allocate new segment. If the interface is SG,
915 * allocate skb fitting to single page. 965 * allocate skb fitting to single page.
@@ -917,8 +967,9 @@ new_segment:
917 if (!sk_stream_memory_free(sk)) 967 if (!sk_stream_memory_free(sk))
918 goto wait_for_sndbuf; 968 goto wait_for_sndbuf;
919 969
920 skb = sk_stream_alloc_skb(sk, select_size(sk), 970 skb = sk_stream_alloc_skb(sk,
921 sk->sk_allocation); 971 select_size(sk, sg),
972 sk->sk_allocation);
922 if (!skb) 973 if (!skb)
923 goto wait_for_memory; 974 goto wait_for_memory;
924 975
@@ -930,6 +981,7 @@ new_segment:
930 981
931 skb_entail(sk, skb); 982 skb_entail(sk, skb);
932 copy = size_goal; 983 copy = size_goal;
984 max = size_goal;
933 } 985 }
934 986
935 /* Try to append data to the end of skb. */ 987 /* Try to append data to the end of skb. */
@@ -954,9 +1006,7 @@ new_segment:
954 /* We can extend the last page 1006 /* We can extend the last page
955 * fragment. */ 1007 * fragment. */
956 merge = 1; 1008 merge = 1;
957 } else if (i == MAX_SKB_FRAGS || 1009 } else if (i == MAX_SKB_FRAGS || !sg) {
958 (!i &&
959 !(sk->sk_route_caps & NETIF_F_SG))) {
960 /* Need to add new fragment and cannot 1010 /* Need to add new fragment and cannot
961 * do this because interface is non-SG, 1011 * do this because interface is non-SG,
962 * or because all the page slots are 1012 * or because all the page slots are
@@ -1017,7 +1067,7 @@ new_segment:
1017 } 1067 }
1018 1068
1019 if (!copied) 1069 if (!copied)
1020 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 1070 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
1021 1071
1022 tp->write_seq += copy; 1072 tp->write_seq += copy;
1023 TCP_SKB_CB(skb)->end_seq += copy; 1073 TCP_SKB_CB(skb)->end_seq += copy;
@@ -1028,7 +1078,7 @@ new_segment:
1028 if ((seglen -= copy) == 0 && iovlen == 0) 1078 if ((seglen -= copy) == 0 && iovlen == 0)
1029 goto out; 1079 goto out;
1030 1080
1031 if (skb->len < size_goal || (flags & MSG_OOB)) 1081 if (skb->len < max || (flags & MSG_OOB))
1032 continue; 1082 continue;
1033 1083
1034 if (forced_push(tp)) { 1084 if (forced_push(tp)) {
@@ -1077,6 +1127,7 @@ out_err:
1077 release_sock(sk); 1127 release_sock(sk);
1078 return err; 1128 return err;
1079} 1129}
1130EXPORT_SYMBOL(tcp_sendmsg);
1080 1131
1081/* 1132/*
1082 * Handle reading urgent data. BSD has very simple semantics for 1133 * Handle reading urgent data. BSD has very simple semantics for
@@ -1141,7 +1192,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1141#if TCP_DEBUG 1192#if TCP_DEBUG
1142 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1193 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1143 1194
1144 WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); 1195 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1196 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1197 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1145#endif 1198#endif
1146 1199
1147 if (inet_csk_ack_scheduled(sk)) { 1200 if (inet_csk_ack_scheduled(sk)) {
@@ -1209,6 +1262,39 @@ static void tcp_prequeue_process(struct sock *sk)
1209 tp->ucopy.memory = 0; 1262 tp->ucopy.memory = 0;
1210} 1263}
1211 1264
1265#ifdef CONFIG_NET_DMA
1266static void tcp_service_net_dma(struct sock *sk, bool wait)
1267{
1268 dma_cookie_t done, used;
1269 dma_cookie_t last_issued;
1270 struct tcp_sock *tp = tcp_sk(sk);
1271
1272 if (!tp->ucopy.dma_chan)
1273 return;
1274
1275 last_issued = tp->ucopy.dma_cookie;
1276 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1277
1278 do {
1279 if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1280 last_issued, &done,
1281 &used) == DMA_SUCCESS) {
1282 /* Safe to free early-copied skbs now */
1283 __skb_queue_purge(&sk->sk_async_wait_queue);
1284 break;
1285 } else {
1286 struct sk_buff *skb;
1287 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1288 (dma_async_is_complete(skb->dma_cookie, done,
1289 used) == DMA_SUCCESS)) {
1290 __skb_dequeue(&sk->sk_async_wait_queue);
1291 kfree_skb(skb);
1292 }
1293 }
1294 } while (wait);
1295}
1296#endif
1297
1212static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 1298static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1213{ 1299{
1214 struct sk_buff *skb; 1300 struct sk_buff *skb;
@@ -1290,6 +1376,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1290 sk_eat_skb(sk, skb, 0); 1376 sk_eat_skb(sk, skb, 0);
1291 if (!desc->count) 1377 if (!desc->count)
1292 break; 1378 break;
1379 tp->copied_seq = seq;
1293 } 1380 }
1294 tp->copied_seq = seq; 1381 tp->copied_seq = seq;
1295 1382
@@ -1300,6 +1387,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1300 tcp_cleanup_rbuf(sk, copied); 1387 tcp_cleanup_rbuf(sk, copied);
1301 return copied; 1388 return copied;
1302} 1389}
1390EXPORT_SYMBOL(tcp_read_sock);
1303 1391
1304/* 1392/*
1305 * This routine copies from a sock struct into the user buffer. 1393 * This routine copies from a sock struct into the user buffer.
@@ -1388,11 +1476,12 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1388 /* Now that we have two receive queues this 1476 /* Now that we have two receive queues this
1389 * shouldn't happen. 1477 * shouldn't happen.
1390 */ 1478 */
1391 if (before(*seq, TCP_SKB_CB(skb)->seq)) { 1479 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1392 printk(KERN_INFO "recvmsg bug: copied %X " 1480 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1393 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); 1481 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1482 flags))
1394 break; 1483 break;
1395 } 1484
1396 offset = *seq - TCP_SKB_CB(skb)->seq; 1485 offset = *seq - TCP_SKB_CB(skb)->seq;
1397 if (tcp_hdr(skb)->syn) 1486 if (tcp_hdr(skb)->syn)
1398 offset--; 1487 offset--;
@@ -1400,7 +1489,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1400 goto found_ok_skb; 1489 goto found_ok_skb;
1401 if (tcp_hdr(skb)->fin) 1490 if (tcp_hdr(skb)->fin)
1402 goto found_fin_ok; 1491 goto found_fin_ok;
1403 WARN_ON(!(flags & MSG_PEEK)); 1492 WARN(!(flags & MSG_PEEK),
1493 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1494 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1404 } 1495 }
1405 1496
1406 /* Well, if we have backlog, try to process it now yet. */ 1497 /* Well, if we have backlog, try to process it now yet. */
@@ -1496,6 +1587,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1496 /* __ Set realtime policy in scheduler __ */ 1587 /* __ Set realtime policy in scheduler __ */
1497 } 1588 }
1498 1589
1590#ifdef CONFIG_NET_DMA
1591 if (tp->ucopy.dma_chan)
1592 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1593#endif
1499 if (copied >= target) { 1594 if (copied >= target) {
1500 /* Do not sleep, just process backlog. */ 1595 /* Do not sleep, just process backlog. */
1501 release_sock(sk); 1596 release_sock(sk);
@@ -1504,6 +1599,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1504 sk_wait_data(sk, &timeo); 1599 sk_wait_data(sk, &timeo);
1505 1600
1506#ifdef CONFIG_NET_DMA 1601#ifdef CONFIG_NET_DMA
1602 tcp_service_net_dma(sk, false); /* Don't block */
1507 tp->ucopy.wakeup = 0; 1603 tp->ucopy.wakeup = 0;
1508#endif 1604#endif
1509 1605
@@ -1583,6 +1679,9 @@ do_prequeue:
1583 copied = -EFAULT; 1679 copied = -EFAULT;
1584 break; 1680 break;
1585 } 1681 }
1682
1683 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1684
1586 if ((offset + used) == skb->len) 1685 if ((offset + used) == skb->len)
1587 copied_early = 1; 1686 copied_early = 1;
1588 1687
@@ -1652,27 +1751,9 @@ skip_copy:
1652 } 1751 }
1653 1752
1654#ifdef CONFIG_NET_DMA 1753#ifdef CONFIG_NET_DMA
1655 if (tp->ucopy.dma_chan) { 1754 tcp_service_net_dma(sk, true); /* Wait for queue to drain */
1656 dma_cookie_t done, used; 1755 tp->ucopy.dma_chan = NULL;
1657
1658 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1659
1660 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1661 tp->ucopy.dma_cookie, &done,
1662 &used) == DMA_IN_PROGRESS) {
1663 /* do partial cleanup of sk_async_wait_queue */
1664 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1665 (dma_async_is_complete(skb->dma_cookie, done,
1666 used) == DMA_SUCCESS)) {
1667 __skb_dequeue(&sk->sk_async_wait_queue);
1668 kfree_skb(skb);
1669 }
1670 }
1671 1756
1672 /* Safe to free early-copied skbs now */
1673 __skb_queue_purge(&sk->sk_async_wait_queue);
1674 tp->ucopy.dma_chan = NULL;
1675 }
1676 if (tp->ucopy.pinned_list) { 1757 if (tp->ucopy.pinned_list) {
1677 dma_unpin_iovec_pages(tp->ucopy.pinned_list); 1758 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1678 tp->ucopy.pinned_list = NULL; 1759 tp->ucopy.pinned_list = NULL;
@@ -1699,6 +1780,7 @@ recv_urg:
1699 err = tcp_recv_urg(sk, msg, len, flags); 1780 err = tcp_recv_urg(sk, msg, len, flags);
1700 goto out; 1781 goto out;
1701} 1782}
1783EXPORT_SYMBOL(tcp_recvmsg);
1702 1784
1703void tcp_set_state(struct sock *sk, int state) 1785void tcp_set_state(struct sock *sk, int state)
1704{ 1786{
@@ -1791,6 +1873,7 @@ void tcp_shutdown(struct sock *sk, int how)
1791 tcp_send_fin(sk); 1873 tcp_send_fin(sk);
1792 } 1874 }
1793} 1875}
1876EXPORT_SYMBOL(tcp_shutdown);
1794 1877
1795void tcp_close(struct sock *sk, long timeout) 1878void tcp_close(struct sock *sk, long timeout)
1796{ 1879{
@@ -1823,6 +1906,10 @@ void tcp_close(struct sock *sk, long timeout)
1823 1906
1824 sk_mem_reclaim(sk); 1907 sk_mem_reclaim(sk);
1825 1908
1909 /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
1910 if (sk->sk_state == TCP_CLOSE)
1911 goto adjudge_to_death;
1912
1826 /* As outlined in RFC 2525, section 2.17, we send a RST here because 1913 /* As outlined in RFC 2525, section 2.17, we send a RST here because
1827 * data was lost. To witness the awful effects of the old behavior of 1914 * data was lost. To witness the awful effects of the old behavior of
1828 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk 1915 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
@@ -1834,7 +1921,7 @@ void tcp_close(struct sock *sk, long timeout)
1834 /* Unread data was tossed, zap the connection. */ 1921 /* Unread data was tossed, zap the connection. */
1835 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 1922 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
1836 tcp_set_state(sk, TCP_CLOSE); 1923 tcp_set_state(sk, TCP_CLOSE);
1837 tcp_send_active_reset(sk, GFP_KERNEL); 1924 tcp_send_active_reset(sk, sk->sk_allocation);
1838 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { 1925 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1839 /* Check zero linger _after_ checking for unread data. */ 1926 /* Check zero linger _after_ checking for unread data. */
1840 sk->sk_prot->disconnect(sk, 0); 1927 sk->sk_prot->disconnect(sk, 0);
@@ -1926,11 +2013,8 @@ adjudge_to_death:
1926 } 2013 }
1927 } 2014 }
1928 if (sk->sk_state != TCP_CLOSE) { 2015 if (sk->sk_state != TCP_CLOSE) {
1929 int orphan_count = percpu_counter_read_positive(
1930 sk->sk_prot->orphan_count);
1931
1932 sk_mem_reclaim(sk); 2016 sk_mem_reclaim(sk);
1933 if (tcp_too_many_orphans(sk, orphan_count)) { 2017 if (tcp_too_many_orphans(sk, 0)) {
1934 if (net_ratelimit()) 2018 if (net_ratelimit())
1935 printk(KERN_INFO "TCP: too many of orphaned " 2019 printk(KERN_INFO "TCP: too many of orphaned "
1936 "sockets\n"); 2020 "sockets\n");
@@ -1950,6 +2034,7 @@ out:
1950 local_bh_enable(); 2034 local_bh_enable();
1951 sock_put(sk); 2035 sock_put(sk);
1952} 2036}
2037EXPORT_SYMBOL(tcp_close);
1953 2038
1954/* These states need RST on ABORT according to RFC793 */ 2039/* These states need RST on ABORT according to RFC793 */
1955 2040
@@ -1993,7 +2078,7 @@ int tcp_disconnect(struct sock *sk, int flags)
1993 __skb_queue_purge(&sk->sk_async_wait_queue); 2078 __skb_queue_purge(&sk->sk_async_wait_queue);
1994#endif 2079#endif
1995 2080
1996 inet->dport = 0; 2081 inet->inet_dport = 0;
1997 2082
1998 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 2083 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1999 inet_reset_saddr(sk); 2084 inet_reset_saddr(sk);
@@ -2007,9 +2092,10 @@ int tcp_disconnect(struct sock *sk, int flags)
2007 tp->snd_cwnd = 2; 2092 tp->snd_cwnd = 2;
2008 icsk->icsk_probes_out = 0; 2093 icsk->icsk_probes_out = 0;
2009 tp->packets_out = 0; 2094 tp->packets_out = 0;
2010 tp->snd_ssthresh = 0x7fffffff; 2095 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2011 tp->snd_cwnd_cnt = 0; 2096 tp->snd_cwnd_cnt = 0;
2012 tp->bytes_acked = 0; 2097 tp->bytes_acked = 0;
2098 tp->window_clamp = 0;
2013 tcp_set_ca_state(sk, TCP_CA_Open); 2099 tcp_set_ca_state(sk, TCP_CA_Open);
2014 tcp_clear_retrans(tp); 2100 tcp_clear_retrans(tp);
2015 inet_csk_delack_init(sk); 2101 inet_csk_delack_init(sk);
@@ -2017,32 +2103,34 @@ int tcp_disconnect(struct sock *sk, int flags)
2017 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); 2103 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2018 __sk_dst_reset(sk); 2104 __sk_dst_reset(sk);
2019 2105
2020 WARN_ON(inet->num && !icsk->icsk_bind_hash); 2106 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2021 2107
2022 sk->sk_error_report(sk); 2108 sk->sk_error_report(sk);
2023 return err; 2109 return err;
2024} 2110}
2111EXPORT_SYMBOL(tcp_disconnect);
2025 2112
2026/* 2113/*
2027 * Socket option code for TCP. 2114 * Socket option code for TCP.
2028 */ 2115 */
2029static int do_tcp_setsockopt(struct sock *sk, int level, 2116static int do_tcp_setsockopt(struct sock *sk, int level,
2030 int optname, char __user *optval, int optlen) 2117 int optname, char __user *optval, unsigned int optlen)
2031{ 2118{
2032 struct tcp_sock *tp = tcp_sk(sk); 2119 struct tcp_sock *tp = tcp_sk(sk);
2033 struct inet_connection_sock *icsk = inet_csk(sk); 2120 struct inet_connection_sock *icsk = inet_csk(sk);
2034 int val; 2121 int val;
2035 int err = 0; 2122 int err = 0;
2036 2123
2037 /* This is a string value all the others are int's */ 2124 /* These are data/string values, all the others are ints */
2038 if (optname == TCP_CONGESTION) { 2125 switch (optname) {
2126 case TCP_CONGESTION: {
2039 char name[TCP_CA_NAME_MAX]; 2127 char name[TCP_CA_NAME_MAX];
2040 2128
2041 if (optlen < 1) 2129 if (optlen < 1)
2042 return -EINVAL; 2130 return -EINVAL;
2043 2131
2044 val = strncpy_from_user(name, optval, 2132 val = strncpy_from_user(name, optval,
2045 min(TCP_CA_NAME_MAX-1, optlen)); 2133 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2046 if (val < 0) 2134 if (val < 0)
2047 return -EFAULT; 2135 return -EFAULT;
2048 name[val] = 0; 2136 name[val] = 0;
@@ -2052,6 +2140,96 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2052 release_sock(sk); 2140 release_sock(sk);
2053 return err; 2141 return err;
2054 } 2142 }
2143 case TCP_COOKIE_TRANSACTIONS: {
2144 struct tcp_cookie_transactions ctd;
2145 struct tcp_cookie_values *cvp = NULL;
2146
2147 if (sizeof(ctd) > optlen)
2148 return -EINVAL;
2149 if (copy_from_user(&ctd, optval, sizeof(ctd)))
2150 return -EFAULT;
2151
2152 if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
2153 ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
2154 return -EINVAL;
2155
2156 if (ctd.tcpct_cookie_desired == 0) {
2157 /* default to global value */
2158 } else if ((0x1 & ctd.tcpct_cookie_desired) ||
2159 ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
2160 ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
2161 return -EINVAL;
2162 }
2163
2164 if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
2165 /* Supercedes all other values */
2166 lock_sock(sk);
2167 if (tp->cookie_values != NULL) {
2168 kref_put(&tp->cookie_values->kref,
2169 tcp_cookie_values_release);
2170 tp->cookie_values = NULL;
2171 }
2172 tp->rx_opt.cookie_in_always = 0; /* false */
2173 tp->rx_opt.cookie_out_never = 1; /* true */
2174 release_sock(sk);
2175 return err;
2176 }
2177
2178 /* Allocate ancillary memory before locking.
2179 */
2180 if (ctd.tcpct_used > 0 ||
2181 (tp->cookie_values == NULL &&
2182 (sysctl_tcp_cookie_size > 0 ||
2183 ctd.tcpct_cookie_desired > 0 ||
2184 ctd.tcpct_s_data_desired > 0))) {
2185 cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
2186 GFP_KERNEL);
2187 if (cvp == NULL)
2188 return -ENOMEM;
2189
2190 kref_init(&cvp->kref);
2191 }
2192 lock_sock(sk);
2193 tp->rx_opt.cookie_in_always =
2194 (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
2195 tp->rx_opt.cookie_out_never = 0; /* false */
2196
2197 if (tp->cookie_values != NULL) {
2198 if (cvp != NULL) {
2199 /* Changed values are recorded by a changed
2200 * pointer, ensuring the cookie will differ,
2201 * without separately hashing each value later.
2202 */
2203 kref_put(&tp->cookie_values->kref,
2204 tcp_cookie_values_release);
2205 } else {
2206 cvp = tp->cookie_values;
2207 }
2208 }
2209
2210 if (cvp != NULL) {
2211 cvp->cookie_desired = ctd.tcpct_cookie_desired;
2212
2213 if (ctd.tcpct_used > 0) {
2214 memcpy(cvp->s_data_payload, ctd.tcpct_value,
2215 ctd.tcpct_used);
2216 cvp->s_data_desired = ctd.tcpct_used;
2217 cvp->s_data_constant = 1; /* true */
2218 } else {
2219 /* No constant payload data. */
2220 cvp->s_data_desired = ctd.tcpct_s_data_desired;
2221 cvp->s_data_constant = 0; /* false */
2222 }
2223
2224 tp->cookie_values = cvp;
2225 }
2226 release_sock(sk);
2227 return err;
2228 }
2229 default:
2230 /* fallthru */
2231 break;
2232 }
2055 2233
2056 if (optlen < sizeof(int)) 2234 if (optlen < sizeof(int))
2057 return -EINVAL; 2235 return -EINVAL;
@@ -2066,7 +2244,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2066 /* Values greater than interface MTU won't take effect. However 2244 /* Values greater than interface MTU won't take effect. However
2067 * at the point when this call is done we typically don't yet 2245 * at the point when this call is done we typically don't yet
2068 * know which interface is going to be used */ 2246 * know which interface is going to be used */
2069 if (val < 8 || val > MAX_TCP_WINDOW) { 2247 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2070 err = -EINVAL; 2248 err = -EINVAL;
2071 break; 2249 break;
2072 } 2250 }
@@ -2090,6 +2268,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2090 } 2268 }
2091 break; 2269 break;
2092 2270
2271 case TCP_THIN_LINEAR_TIMEOUTS:
2272 if (val < 0 || val > 1)
2273 err = -EINVAL;
2274 else
2275 tp->thin_lto = val;
2276 break;
2277
2278 case TCP_THIN_DUPACK:
2279 if (val < 0 || val > 1)
2280 err = -EINVAL;
2281 else
2282 tp->thin_dupack = val;
2283 break;
2284
2093 case TCP_CORK: 2285 case TCP_CORK:
2094 /* When set indicates to always queue non-full frames. 2286 /* When set indicates to always queue non-full frames.
2095 * Later the user clears this option and we transmit 2287 * Later the user clears this option and we transmit
@@ -2120,7 +2312,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2120 if (sock_flag(sk, SOCK_KEEPOPEN) && 2312 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2121 !((1 << sk->sk_state) & 2313 !((1 << sk->sk_state) &
2122 (TCPF_CLOSE | TCPF_LISTEN))) { 2314 (TCPF_CLOSE | TCPF_LISTEN))) {
2123 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; 2315 u32 elapsed = keepalive_time_elapsed(tp);
2124 if (tp->keepalive_time > elapsed) 2316 if (tp->keepalive_time > elapsed)
2125 elapsed = tp->keepalive_time - elapsed; 2317 elapsed = tp->keepalive_time - elapsed;
2126 else 2318 else
@@ -2158,16 +2350,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2158 break; 2350 break;
2159 2351
2160 case TCP_DEFER_ACCEPT: 2352 case TCP_DEFER_ACCEPT:
2161 icsk->icsk_accept_queue.rskq_defer_accept = 0; 2353 /* Translate value in seconds to number of retransmits */
2162 if (val > 0) { 2354 icsk->icsk_accept_queue.rskq_defer_accept =
2163 /* Translate value in seconds to number of 2355 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2164 * retransmits */ 2356 TCP_RTO_MAX / HZ);
2165 while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
2166 val > ((TCP_TIMEOUT_INIT / HZ) <<
2167 icsk->icsk_accept_queue.rskq_defer_accept))
2168 icsk->icsk_accept_queue.rskq_defer_accept++;
2169 icsk->icsk_accept_queue.rskq_defer_accept++;
2170 }
2171 break; 2357 break;
2172 2358
2173 case TCP_WINDOW_CLAMP: 2359 case TCP_WINDOW_CLAMP:
@@ -2204,7 +2390,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2204 err = tp->af_specific->md5_parse(sk, optval, optlen); 2390 err = tp->af_specific->md5_parse(sk, optval, optlen);
2205 break; 2391 break;
2206#endif 2392#endif
2207 2393 case TCP_USER_TIMEOUT:
2394 /* Cap the max timeout in ms TCP will retry/retrans
2395 * before giving up and aborting (ETIMEDOUT) a connection.
2396 */
2397 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2398 break;
2208 default: 2399 default:
2209 err = -ENOPROTOOPT; 2400 err = -ENOPROTOOPT;
2210 break; 2401 break;
@@ -2215,7 +2406,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2215} 2406}
2216 2407
2217int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, 2408int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2218 int optlen) 2409 unsigned int optlen)
2219{ 2410{
2220 struct inet_connection_sock *icsk = inet_csk(sk); 2411 struct inet_connection_sock *icsk = inet_csk(sk);
2221 2412
@@ -2224,17 +2415,17 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2224 optval, optlen); 2415 optval, optlen);
2225 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2416 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2226} 2417}
2418EXPORT_SYMBOL(tcp_setsockopt);
2227 2419
2228#ifdef CONFIG_COMPAT 2420#ifdef CONFIG_COMPAT
2229int compat_tcp_setsockopt(struct sock *sk, int level, int optname, 2421int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2230 char __user *optval, int optlen) 2422 char __user *optval, unsigned int optlen)
2231{ 2423{
2232 if (level != SOL_TCP) 2424 if (level != SOL_TCP)
2233 return inet_csk_compat_setsockopt(sk, level, optname, 2425 return inet_csk_compat_setsockopt(sk, level, optname,
2234 optval, optlen); 2426 optval, optlen);
2235 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2427 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2236} 2428}
2237
2238EXPORT_SYMBOL(compat_tcp_setsockopt); 2429EXPORT_SYMBOL(compat_tcp_setsockopt);
2239#endif 2430#endif
2240 2431
@@ -2300,7 +2491,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2300 2491
2301 info->tcpi_total_retrans = tp->total_retrans; 2492 info->tcpi_total_retrans = tp->total_retrans;
2302} 2493}
2303
2304EXPORT_SYMBOL_GPL(tcp_get_info); 2494EXPORT_SYMBOL_GPL(tcp_get_info);
2305 2495
2306static int do_tcp_getsockopt(struct sock *sk, int level, 2496static int do_tcp_getsockopt(struct sock *sk, int level,
@@ -2331,13 +2521,13 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2331 val = !!(tp->nonagle&TCP_NAGLE_CORK); 2521 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2332 break; 2522 break;
2333 case TCP_KEEPIDLE: 2523 case TCP_KEEPIDLE:
2334 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; 2524 val = keepalive_time_when(tp) / HZ;
2335 break; 2525 break;
2336 case TCP_KEEPINTVL: 2526 case TCP_KEEPINTVL:
2337 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; 2527 val = keepalive_intvl_when(tp) / HZ;
2338 break; 2528 break;
2339 case TCP_KEEPCNT: 2529 case TCP_KEEPCNT:
2340 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; 2530 val = keepalive_probes(tp);
2341 break; 2531 break;
2342 case TCP_SYNCNT: 2532 case TCP_SYNCNT:
2343 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 2533 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
@@ -2348,8 +2538,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2348 val = (val ? : sysctl_tcp_fin_timeout) / HZ; 2538 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2349 break; 2539 break;
2350 case TCP_DEFER_ACCEPT: 2540 case TCP_DEFER_ACCEPT:
2351 val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : 2541 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2352 ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); 2542 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2353 break; 2543 break;
2354 case TCP_WINDOW_CLAMP: 2544 case TCP_WINDOW_CLAMP:
2355 val = tp->window_clamp; 2545 val = tp->window_clamp;
@@ -2382,6 +2572,52 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2382 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) 2572 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2383 return -EFAULT; 2573 return -EFAULT;
2384 return 0; 2574 return 0;
2575
2576 case TCP_COOKIE_TRANSACTIONS: {
2577 struct tcp_cookie_transactions ctd;
2578 struct tcp_cookie_values *cvp = tp->cookie_values;
2579
2580 if (get_user(len, optlen))
2581 return -EFAULT;
2582 if (len < sizeof(ctd))
2583 return -EINVAL;
2584
2585 memset(&ctd, 0, sizeof(ctd));
2586 ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
2587 TCP_COOKIE_IN_ALWAYS : 0)
2588 | (tp->rx_opt.cookie_out_never ?
2589 TCP_COOKIE_OUT_NEVER : 0);
2590
2591 if (cvp != NULL) {
2592 ctd.tcpct_flags |= (cvp->s_data_in ?
2593 TCP_S_DATA_IN : 0)
2594 | (cvp->s_data_out ?
2595 TCP_S_DATA_OUT : 0);
2596
2597 ctd.tcpct_cookie_desired = cvp->cookie_desired;
2598 ctd.tcpct_s_data_desired = cvp->s_data_desired;
2599
2600 memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
2601 cvp->cookie_pair_size);
2602 ctd.tcpct_used = cvp->cookie_pair_size;
2603 }
2604
2605 if (put_user(sizeof(ctd), optlen))
2606 return -EFAULT;
2607 if (copy_to_user(optval, &ctd, sizeof(ctd)))
2608 return -EFAULT;
2609 return 0;
2610 }
2611 case TCP_THIN_LINEAR_TIMEOUTS:
2612 val = tp->thin_lto;
2613 break;
2614 case TCP_THIN_DUPACK:
2615 val = tp->thin_dupack;
2616 break;
2617
2618 case TCP_USER_TIMEOUT:
2619 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2620 break;
2385 default: 2621 default:
2386 return -ENOPROTOOPT; 2622 return -ENOPROTOOPT;
2387 } 2623 }
@@ -2403,6 +2639,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2403 optval, optlen); 2639 optval, optlen);
2404 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2640 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2405} 2641}
2642EXPORT_SYMBOL(tcp_getsockopt);
2406 2643
2407#ifdef CONFIG_COMPAT 2644#ifdef CONFIG_COMPAT
2408int compat_tcp_getsockopt(struct sock *sk, int level, int optname, 2645int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -2413,7 +2650,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2413 optval, optlen); 2650 optval, optlen);
2414 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2651 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2415} 2652}
2416
2417EXPORT_SYMBOL(compat_tcp_getsockopt); 2653EXPORT_SYMBOL(compat_tcp_getsockopt);
2418#endif 2654#endif
2419 2655
@@ -2513,7 +2749,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2513 struct tcphdr *th2; 2749 struct tcphdr *th2;
2514 unsigned int len; 2750 unsigned int len;
2515 unsigned int thlen; 2751 unsigned int thlen;
2516 unsigned int flags; 2752 __be32 flags;
2517 unsigned int mss = 1; 2753 unsigned int mss = 1;
2518 unsigned int hlen; 2754 unsigned int hlen;
2519 unsigned int off; 2755 unsigned int off;
@@ -2563,10 +2799,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2563 2799
2564found: 2800found:
2565 flush = NAPI_GRO_CB(p)->flush; 2801 flush = NAPI_GRO_CB(p)->flush;
2566 flush |= flags & TCP_FLAG_CWR; 2802 flush |= (__force int)(flags & TCP_FLAG_CWR);
2567 flush |= (flags ^ tcp_flag_word(th2)) & 2803 flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
2568 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); 2804 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
2569 flush |= th->ack_seq ^ th2->ack_seq; 2805 flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
2570 for (i = sizeof(*th); i < thlen; i += 4) 2806 for (i = sizeof(*th); i < thlen; i += 4)
2571 flush |= *(u32 *)((u8 *)th + i) ^ 2807 flush |= *(u32 *)((u8 *)th + i) ^
2572 *(u32 *)((u8 *)th2 + i); 2808 *(u32 *)((u8 *)th2 + i);
@@ -2587,8 +2823,9 @@ found:
2587 2823
2588out_check_final: 2824out_check_final:
2589 flush = len < mss; 2825 flush = len < mss;
2590 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | 2826 flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
2591 TCP_FLAG_SYN | TCP_FLAG_FIN); 2827 TCP_FLAG_RST | TCP_FLAG_SYN |
2828 TCP_FLAG_FIN));
2592 2829
2593 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) 2830 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
2594 pp = head; 2831 pp = head;
@@ -2619,10 +2856,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
2619 2856
2620#ifdef CONFIG_TCP_MD5SIG 2857#ifdef CONFIG_TCP_MD5SIG
2621static unsigned long tcp_md5sig_users; 2858static unsigned long tcp_md5sig_users;
2622static struct tcp_md5sig_pool **tcp_md5sig_pool; 2859static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
2623static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); 2860static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2624 2861
2625static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) 2862static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
2626{ 2863{
2627 int cpu; 2864 int cpu;
2628 for_each_possible_cpu(cpu) { 2865 for_each_possible_cpu(cpu) {
@@ -2631,7 +2868,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2631 if (p->md5_desc.tfm) 2868 if (p->md5_desc.tfm)
2632 crypto_free_hash(p->md5_desc.tfm); 2869 crypto_free_hash(p->md5_desc.tfm);
2633 kfree(p); 2870 kfree(p);
2634 p = NULL;
2635 } 2871 }
2636 } 2872 }
2637 free_percpu(pool); 2873 free_percpu(pool);
@@ -2639,7 +2875,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2639 2875
2640void tcp_free_md5sig_pool(void) 2876void tcp_free_md5sig_pool(void)
2641{ 2877{
2642 struct tcp_md5sig_pool **pool = NULL; 2878 struct tcp_md5sig_pool * __percpu *pool = NULL;
2643 2879
2644 spin_lock_bh(&tcp_md5sig_pool_lock); 2880 spin_lock_bh(&tcp_md5sig_pool_lock);
2645 if (--tcp_md5sig_users == 0) { 2881 if (--tcp_md5sig_users == 0) {
@@ -2650,13 +2886,13 @@ void tcp_free_md5sig_pool(void)
2650 if (pool) 2886 if (pool)
2651 __tcp_free_md5sig_pool(pool); 2887 __tcp_free_md5sig_pool(pool);
2652} 2888}
2653
2654EXPORT_SYMBOL(tcp_free_md5sig_pool); 2889EXPORT_SYMBOL(tcp_free_md5sig_pool);
2655 2890
2656static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void) 2891static struct tcp_md5sig_pool * __percpu *
2892__tcp_alloc_md5sig_pool(struct sock *sk)
2657{ 2893{
2658 int cpu; 2894 int cpu;
2659 struct tcp_md5sig_pool **pool; 2895 struct tcp_md5sig_pool * __percpu *pool;
2660 2896
2661 pool = alloc_percpu(struct tcp_md5sig_pool *); 2897 pool = alloc_percpu(struct tcp_md5sig_pool *);
2662 if (!pool) 2898 if (!pool)
@@ -2666,7 +2902,7 @@ static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2666 struct tcp_md5sig_pool *p; 2902 struct tcp_md5sig_pool *p;
2667 struct crypto_hash *hash; 2903 struct crypto_hash *hash;
2668 2904
2669 p = kzalloc(sizeof(*p), GFP_KERNEL); 2905 p = kzalloc(sizeof(*p), sk->sk_allocation);
2670 if (!p) 2906 if (!p)
2671 goto out_free; 2907 goto out_free;
2672 *per_cpu_ptr(pool, cpu) = p; 2908 *per_cpu_ptr(pool, cpu) = p;
@@ -2683,9 +2919,9 @@ out_free:
2683 return NULL; 2919 return NULL;
2684} 2920}
2685 2921
2686struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) 2922struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
2687{ 2923{
2688 struct tcp_md5sig_pool **pool; 2924 struct tcp_md5sig_pool * __percpu *pool;
2689 int alloc = 0; 2925 int alloc = 0;
2690 2926
2691retry: 2927retry:
@@ -2704,7 +2940,9 @@ retry:
2704 2940
2705 if (alloc) { 2941 if (alloc) {
2706 /* we cannot hold spinlock here because this may sleep. */ 2942 /* we cannot hold spinlock here because this may sleep. */
2707 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); 2943 struct tcp_md5sig_pool * __percpu *p;
2944
2945 p = __tcp_alloc_md5sig_pool(sk);
2708 spin_lock_bh(&tcp_md5sig_pool_lock); 2946 spin_lock_bh(&tcp_md5sig_pool_lock);
2709 if (!p) { 2947 if (!p) {
2710 tcp_md5sig_users--; 2948 tcp_md5sig_users--;
@@ -2723,28 +2961,42 @@ retry:
2723 } 2961 }
2724 return pool; 2962 return pool;
2725} 2963}
2726
2727EXPORT_SYMBOL(tcp_alloc_md5sig_pool); 2964EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2728 2965
2729struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) 2966
2967/**
2968 * tcp_get_md5sig_pool - get md5sig_pool for this user
2969 *
2970 * We use percpu structure, so if we succeed, we exit with preemption
2971 * and BH disabled, to make sure another thread or softirq handling
2972 * wont try to get same context.
2973 */
2974struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
2730{ 2975{
2731 struct tcp_md5sig_pool **p; 2976 struct tcp_md5sig_pool * __percpu *p;
2732 spin_lock_bh(&tcp_md5sig_pool_lock); 2977
2978 local_bh_disable();
2979
2980 spin_lock(&tcp_md5sig_pool_lock);
2733 p = tcp_md5sig_pool; 2981 p = tcp_md5sig_pool;
2734 if (p) 2982 if (p)
2735 tcp_md5sig_users++; 2983 tcp_md5sig_users++;
2736 spin_unlock_bh(&tcp_md5sig_pool_lock); 2984 spin_unlock(&tcp_md5sig_pool_lock);
2737 return (p ? *per_cpu_ptr(p, cpu) : NULL);
2738}
2739 2985
2740EXPORT_SYMBOL(__tcp_get_md5sig_pool); 2986 if (p)
2987 return *this_cpu_ptr(p);
2741 2988
2742void __tcp_put_md5sig_pool(void) 2989 local_bh_enable();
2990 return NULL;
2991}
2992EXPORT_SYMBOL(tcp_get_md5sig_pool);
2993
2994void tcp_put_md5sig_pool(void)
2743{ 2995{
2996 local_bh_enable();
2744 tcp_free_md5sig_pool(); 2997 tcp_free_md5sig_pool();
2745} 2998}
2746 2999EXPORT_SYMBOL(tcp_put_md5sig_pool);
2747EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2748 3000
2749int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, 3001int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2750 struct tcphdr *th) 3002 struct tcphdr *th)
@@ -2760,7 +3012,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2760 th->check = old_checksum; 3012 th->check = old_checksum;
2761 return err; 3013 return err;
2762} 3014}
2763
2764EXPORT_SYMBOL(tcp_md5_hash_header); 3015EXPORT_SYMBOL(tcp_md5_hash_header);
2765 3016
2766int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 3017int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
@@ -2773,6 +3024,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2773 const unsigned head_data_len = skb_headlen(skb) > header_len ? 3024 const unsigned head_data_len = skb_headlen(skb) > header_len ?
2774 skb_headlen(skb) - header_len : 0; 3025 skb_headlen(skb) - header_len : 0;
2775 const struct skb_shared_info *shi = skb_shinfo(skb); 3026 const struct skb_shared_info *shi = skb_shinfo(skb);
3027 struct sk_buff *frag_iter;
2776 3028
2777 sg_init_table(&sg, 1); 3029 sg_init_table(&sg, 1);
2778 3030
@@ -2787,9 +3039,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2787 return 1; 3039 return 1;
2788 } 3040 }
2789 3041
3042 skb_walk_frags(skb, frag_iter)
3043 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3044 return 1;
3045
2790 return 0; 3046 return 0;
2791} 3047}
2792
2793EXPORT_SYMBOL(tcp_md5_hash_skb_data); 3048EXPORT_SYMBOL(tcp_md5_hash_skb_data);
2794 3049
2795int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) 3050int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
@@ -2799,11 +3054,139 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
2799 sg_init_one(&sg, key->key, key->keylen); 3054 sg_init_one(&sg, key->key, key->keylen);
2800 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); 3055 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
2801} 3056}
2802
2803EXPORT_SYMBOL(tcp_md5_hash_key); 3057EXPORT_SYMBOL(tcp_md5_hash_key);
2804 3058
2805#endif 3059#endif
2806 3060
3061/**
3062 * Each Responder maintains up to two secret values concurrently for
3063 * efficient secret rollover. Each secret value has 4 states:
3064 *
3065 * Generating. (tcp_secret_generating != tcp_secret_primary)
3066 * Generates new Responder-Cookies, but not yet used for primary
3067 * verification. This is a short-term state, typically lasting only
3068 * one round trip time (RTT).
3069 *
3070 * Primary. (tcp_secret_generating == tcp_secret_primary)
3071 * Used both for generation and primary verification.
3072 *
3073 * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
3074 * Used for verification, until the first failure that can be
3075 * verified by the newer Generating secret. At that time, this
3076 * cookie's state is changed to Secondary, and the Generating
3077 * cookie's state is changed to Primary. This is a short-term state,
3078 * typically lasting only one round trip time (RTT).
3079 *
3080 * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
3081 * Used for secondary verification, after primary verification
3082 * failures. This state lasts no more than twice the Maximum Segment
3083 * Lifetime (2MSL). Then, the secret is discarded.
3084 */
3085struct tcp_cookie_secret {
3086 /* The secret is divided into two parts. The digest part is the
3087 * equivalent of previously hashing a secret and saving the state,
3088 * and serves as an initialization vector (IV). The message part
3089 * serves as the trailing secret.
3090 */
3091 u32 secrets[COOKIE_WORKSPACE_WORDS];
3092 unsigned long expires;
3093};
3094
3095#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
3096#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
3097#define TCP_SECRET_LIFE (HZ * 600)
3098
3099static struct tcp_cookie_secret tcp_secret_one;
3100static struct tcp_cookie_secret tcp_secret_two;
3101
3102/* Essentially a circular list, without dynamic allocation. */
3103static struct tcp_cookie_secret *tcp_secret_generating;
3104static struct tcp_cookie_secret *tcp_secret_primary;
3105static struct tcp_cookie_secret *tcp_secret_retiring;
3106static struct tcp_cookie_secret *tcp_secret_secondary;
3107
3108static DEFINE_SPINLOCK(tcp_secret_locker);
3109
3110/* Select a pseudo-random word in the cookie workspace.
3111 */
3112static inline u32 tcp_cookie_work(const u32 *ws, const int n)
3113{
3114 return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
3115}
3116
3117/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
3118 * Called in softirq context.
3119 * Returns: 0 for success.
3120 */
3121int tcp_cookie_generator(u32 *bakery)
3122{
3123 unsigned long jiffy = jiffies;
3124
3125 if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
3126 spin_lock_bh(&tcp_secret_locker);
3127 if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
3128 /* refreshed by another */
3129 memcpy(bakery,
3130 &tcp_secret_generating->secrets[0],
3131 COOKIE_WORKSPACE_WORDS);
3132 } else {
3133 /* still needs refreshing */
3134 get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
3135
3136 /* The first time, paranoia assumes that the
3137 * randomization function isn't as strong. But,
3138 * this secret initialization is delayed until
3139 * the last possible moment (packet arrival).
3140 * Although that time is observable, it is
3141 * unpredictably variable. Mash in the most
3142 * volatile clock bits available, and expire the
3143 * secret extra quickly.
3144 */
3145 if (unlikely(tcp_secret_primary->expires ==
3146 tcp_secret_secondary->expires)) {
3147 struct timespec tv;
3148
3149 getnstimeofday(&tv);
3150 bakery[COOKIE_DIGEST_WORDS+0] ^=
3151 (u32)tv.tv_nsec;
3152
3153 tcp_secret_secondary->expires = jiffy
3154 + TCP_SECRET_1MSL
3155 + (0x0f & tcp_cookie_work(bakery, 0));
3156 } else {
3157 tcp_secret_secondary->expires = jiffy
3158 + TCP_SECRET_LIFE
3159 + (0xff & tcp_cookie_work(bakery, 1));
3160 tcp_secret_primary->expires = jiffy
3161 + TCP_SECRET_2MSL
3162 + (0x1f & tcp_cookie_work(bakery, 2));
3163 }
3164 memcpy(&tcp_secret_secondary->secrets[0],
3165 bakery, COOKIE_WORKSPACE_WORDS);
3166
3167 rcu_assign_pointer(tcp_secret_generating,
3168 tcp_secret_secondary);
3169 rcu_assign_pointer(tcp_secret_retiring,
3170 tcp_secret_primary);
3171 /*
3172 * Neither call_rcu() nor synchronize_rcu() needed.
3173 * Retiring data is not freed. It is replaced after
3174 * further (locked) pointer updates, and a quiet time
3175 * (minimum 1MSL, maximum LIFE - 2MSL).
3176 */
3177 }
3178 spin_unlock_bh(&tcp_secret_locker);
3179 } else {
3180 rcu_read_lock_bh();
3181 memcpy(bakery,
3182 &rcu_dereference(tcp_secret_generating)->secrets[0],
3183 COOKIE_WORKSPACE_WORDS);
3184 rcu_read_unlock_bh();
3185 }
3186 return 0;
3187}
3188EXPORT_SYMBOL(tcp_cookie_generator);
3189
2807void tcp_done(struct sock *sk) 3190void tcp_done(struct sock *sk)
2808{ 3191{
2809 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3192 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -2837,7 +3220,8 @@ void __init tcp_init(void)
2837{ 3220{
2838 struct sk_buff *skb = NULL; 3221 struct sk_buff *skb = NULL;
2839 unsigned long nr_pages, limit; 3222 unsigned long nr_pages, limit;
2840 int order, i, max_share; 3223 int i, max_share, cnt;
3224 unsigned long jiffy = jiffies;
2841 3225
2842 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3226 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
2843 3227
@@ -2857,14 +3241,13 @@ void __init tcp_init(void)
2857 alloc_large_system_hash("TCP established", 3241 alloc_large_system_hash("TCP established",
2858 sizeof(struct inet_ehash_bucket), 3242 sizeof(struct inet_ehash_bucket),
2859 thash_entries, 3243 thash_entries,
2860 (num_physpages >= 128 * 1024) ? 3244 (totalram_pages >= 128 * 1024) ?
2861 13 : 15, 3245 13 : 15,
2862 0, 3246 0,
2863 &tcp_hashinfo.ehash_size,
2864 NULL, 3247 NULL,
3248 &tcp_hashinfo.ehash_mask,
2865 thash_entries ? 0 : 512 * 1024); 3249 thash_entries ? 0 : 512 * 1024);
2866 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; 3250 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
2867 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2868 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 3251 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
2869 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); 3252 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
2870 } 3253 }
@@ -2873,8 +3256,8 @@ void __init tcp_init(void)
2873 tcp_hashinfo.bhash = 3256 tcp_hashinfo.bhash =
2874 alloc_large_system_hash("TCP bind", 3257 alloc_large_system_hash("TCP bind",
2875 sizeof(struct inet_bind_hashbucket), 3258 sizeof(struct inet_bind_hashbucket),
2876 tcp_hashinfo.ehash_size, 3259 tcp_hashinfo.ehash_mask + 1,
2877 (num_physpages >= 128 * 1024) ? 3260 (totalram_pages >= 128 * 1024) ?
2878 13 : 15, 3261 13 : 15,
2879 0, 3262 0,
2880 &tcp_hashinfo.bhash_size, 3263 &tcp_hashinfo.bhash_size,
@@ -2886,22 +3269,12 @@ void __init tcp_init(void)
2886 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3269 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2887 } 3270 }
2888 3271
2889 /* Try to be a bit smarter and adjust defaults depending 3272
2890 * on available memory. 3273 cnt = tcp_hashinfo.ehash_mask + 1;
2891 */ 3274
2892 for (order = 0; ((1 << order) << PAGE_SHIFT) < 3275 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2893 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); 3276 sysctl_tcp_max_orphans = cnt / 2;
2894 order++) 3277 sysctl_max_syn_backlog = max(128, cnt / 256);
2895 ;
2896 if (order >= 4) {
2897 tcp_death_row.sysctl_max_tw_buckets = 180000;
2898 sysctl_tcp_max_orphans = 4096 << (order - 4);
2899 sysctl_max_syn_backlog = 1024;
2900 } else if (order < 3) {
2901 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2902 sysctl_tcp_max_orphans >>= (3 - order);
2903 sysctl_max_syn_backlog = 128;
2904 }
2905 3278
2906 /* Set the pressure threshold to be a fraction of global memory that 3279 /* Set the pressure threshold to be a fraction of global memory that
2907 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of 3280 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
@@ -2928,21 +3301,17 @@ void __init tcp_init(void)
2928 sysctl_tcp_rmem[2] = max(87380, max_share); 3301 sysctl_tcp_rmem[2] = max(87380, max_share);
2929 3302
2930 printk(KERN_INFO "TCP: Hash tables configured " 3303 printk(KERN_INFO "TCP: Hash tables configured "
2931 "(established %d bind %d)\n", 3304 "(established %u bind %u)\n",
2932 tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); 3305 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
2933 3306
2934 tcp_register_congestion_control(&tcp_reno); 3307 tcp_register_congestion_control(&tcp_reno);
2935}
2936 3308
2937EXPORT_SYMBOL(tcp_close); 3309 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
2938EXPORT_SYMBOL(tcp_disconnect); 3310 memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
2939EXPORT_SYMBOL(tcp_getsockopt); 3311 tcp_secret_one.expires = jiffy; /* past due */
2940EXPORT_SYMBOL(tcp_ioctl); 3312 tcp_secret_two.expires = jiffy; /* past due */
2941EXPORT_SYMBOL(tcp_poll); 3313 tcp_secret_generating = &tcp_secret_one;
2942EXPORT_SYMBOL(tcp_read_sock); 3314 tcp_secret_primary = &tcp_secret_one;
2943EXPORT_SYMBOL(tcp_recvmsg); 3315 tcp_secret_retiring = &tcp_secret_two;
2944EXPORT_SYMBOL(tcp_sendmsg); 3316 tcp_secret_secondary = &tcp_secret_two;
2945EXPORT_SYMBOL(tcp_splice_read); 3317}
2946EXPORT_SYMBOL(tcp_sendpage);
2947EXPORT_SYMBOL(tcp_setsockopt);
2948EXPORT_SYMBOL(tcp_shutdown);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index e92beb9e55e0..850c737e08e2 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/gfp.h>
13#include <net/tcp.h> 14#include <net/tcp.h>
14 15
15int sysctl_tcp_max_ssthresh = 0; 16int sysctl_tcp_max_ssthresh = 0;
@@ -116,7 +117,7 @@ int tcp_set_default_congestion_control(const char *name)
116 spin_lock(&tcp_cong_list_lock); 117 spin_lock(&tcp_cong_list_lock);
117 ca = tcp_ca_find(name); 118 ca = tcp_ca_find(name);
118#ifdef CONFIG_MODULES 119#ifdef CONFIG_MODULES
119 if (!ca && capable(CAP_SYS_MODULE)) { 120 if (!ca && capable(CAP_NET_ADMIN)) {
120 spin_unlock(&tcp_cong_list_lock); 121 spin_unlock(&tcp_cong_list_lock);
121 122
122 request_module("tcp_%s", name); 123 request_module("tcp_%s", name);
@@ -195,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
195int tcp_set_allowed_congestion_control(char *val) 196int tcp_set_allowed_congestion_control(char *val)
196{ 197{
197 struct tcp_congestion_ops *ca; 198 struct tcp_congestion_ops *ca;
198 char *clone, *name; 199 char *saved_clone, *clone, *name;
199 int ret = 0; 200 int ret = 0;
200 201
201 clone = kstrdup(val, GFP_USER); 202 saved_clone = clone = kstrdup(val, GFP_USER);
202 if (!clone) 203 if (!clone)
203 return -ENOMEM; 204 return -ENOMEM;
204 205
@@ -225,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val)
225 } 226 }
226out: 227out:
227 spin_unlock(&tcp_cong_list_lock); 228 spin_unlock(&tcp_cong_list_lock);
229 kfree(saved_clone);
228 230
229 return ret; 231 return ret;
230} 232}
@@ -246,7 +248,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
246 248
247#ifdef CONFIG_MODULES 249#ifdef CONFIG_MODULES
248 /* not found attempt to autoload module */ 250 /* not found attempt to autoload module */
249 if (!ca && capable(CAP_SYS_MODULE)) { 251 if (!ca && capable(CAP_NET_ADMIN)) {
250 rcu_read_unlock(); 252 rcu_read_unlock();
251 request_module("tcp_%s", name); 253 request_module("tcp_%s", name);
252 rcu_read_lock(); 254 rcu_read_lock();
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index fcbcd4ff6c5f..939edb3b8e4d 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -27,7 +27,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
27 r->idiag_rqueue = sk->sk_ack_backlog; 27 r->idiag_rqueue = sk->sk_ack_backlog;
28 r->idiag_wqueue = sk->sk_max_ack_backlog; 28 r->idiag_wqueue = sk->sk_max_ack_backlog;
29 } else { 29 } else {
30 r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; 30 r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
31 r->idiag_wqueue = tp->write_seq - tp->snd_una; 31 r->idiag_wqueue = tp->write_seq - tp->snd_una;
32 } 32 }
33 if (info != NULL) 33 if (info != NULL)
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 26d5c7fc7de5..7c94a4955416 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
92 if (icsk->icsk_ca_state == TCP_CA_Open) { 92 if (icsk->icsk_ca_state == TCP_CA_Open) {
93 if (ca->maxRTT < ca->minRTT) 93 if (ca->maxRTT < ca->minRTT)
94 ca->maxRTT = ca->minRTT; 94 ca->maxRTT = ca->minRTT;
95 if (ca->maxRTT < srtt 95 if (ca->maxRTT < srtt &&
96 && srtt <= ca->maxRTT + msecs_to_jiffies(20)) 96 srtt <= ca->maxRTT + msecs_to_jiffies(20))
97 ca->maxRTT = srtt; 97 ca->maxRTT = srtt;
98 } 98 }
99} 99}
@@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
123 123
124 ca->packetcount += pkts_acked; 124 ca->packetcount += pkts_acked;
125 125
126 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) 126 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
127 && now - ca->lasttime >= ca->minRTT 127 now - ca->lasttime >= ca->minRTT &&
128 && ca->minRTT > 0) { 128 ca->minRTT > 0) {
129 __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); 129 __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
130 130
131 if (htcp_ccount(ca) <= 3) { 131 if (htcp_ccount(ca) <= 3) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index c209e054a634..377bc9349371 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -126,8 +126,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
126 * calculate 2^fract in a <<7 value. 126 * calculate 2^fract in a <<7 value.
127 */ 127 */
128 is_slowstart = 1; 128 is_slowstart = 1;
129 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) 129 increment = ((1 << min(ca->rho, 16U)) *
130 - 128; 130 hybla_fraction(rho_fractions)) - 128;
131 } else { 131 } else {
132 /* 132 /*
133 * congestion avoidance 133 * congestion avoidance
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1eba160b72dc..00ca688d8964 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -6,7 +6,7 @@
6 * The algorithm is described in: 6 * The algorithm is described in:
7 * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm 7 * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
8 * for High-Speed Networks" 8 * for High-Speed Networks"
9 * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf 9 * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
10 * 10 *
11 * Implemented from description in paper and ns-2 simulation. 11 * Implemented from description in paper and ns-2 simulation.
12 * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> 12 * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2bdb0da237e6..2549b29b062d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
62 */ 62 */
63 63
64#include <linux/mm.h> 64#include <linux/mm.h>
65#include <linux/slab.h>
65#include <linux/module.h> 66#include <linux/module.h>
66#include <linux/sysctl.h> 67#include <linux/sysctl.h>
67#include <linux/kernel.h> 68#include <linux/kernel.h>
@@ -77,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
77int sysctl_tcp_sack __read_mostly = 1; 78int sysctl_tcp_sack __read_mostly = 1;
78int sysctl_tcp_fack __read_mostly = 1; 79int sysctl_tcp_fack __read_mostly = 1;
79int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; 80int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
81EXPORT_SYMBOL(sysctl_tcp_reordering);
80int sysctl_tcp_ecn __read_mostly = 2; 82int sysctl_tcp_ecn __read_mostly = 2;
83EXPORT_SYMBOL(sysctl_tcp_ecn);
81int sysctl_tcp_dsack __read_mostly = 1; 84int sysctl_tcp_dsack __read_mostly = 1;
82int sysctl_tcp_app_win __read_mostly = 31; 85int sysctl_tcp_app_win __read_mostly = 31;
83int sysctl_tcp_adv_win_scale __read_mostly = 2; 86int sysctl_tcp_adv_win_scale __read_mostly = 2;
87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
84 88
85int sysctl_tcp_stdurg __read_mostly; 89int sysctl_tcp_stdurg __read_mostly;
86int sysctl_tcp_rfc1337 __read_mostly; 90int sysctl_tcp_rfc1337 __read_mostly;
@@ -89,6 +93,8 @@ int sysctl_tcp_frto __read_mostly = 2;
89int sysctl_tcp_frto_response __read_mostly; 93int sysctl_tcp_frto_response __read_mostly;
90int sysctl_tcp_nometrics_save __read_mostly; 94int sysctl_tcp_nometrics_save __read_mostly;
91 95
96int sysctl_tcp_thin_dupack __read_mostly;
97
92int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 98int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
93int sysctl_tcp_abc __read_mostly; 99int sysctl_tcp_abc __read_mostly;
94 100
@@ -140,7 +146,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
140 * "len" is invariant segment length, including TCP header. 146 * "len" is invariant segment length, including TCP header.
141 */ 147 */
142 len += skb->data - skb_transport_header(skb); 148 len += skb->data - skb_transport_header(skb);
143 if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || 149 if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
144 /* If PSH is not set, packet should be 150 /* If PSH is not set, packet should be
145 * full sized, provided peer TCP is not badly broken. 151 * full sized, provided peer TCP is not badly broken.
146 * This observation (if it is correct 8)) allows 152 * This observation (if it is correct 8)) allows
@@ -176,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
176 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
177} 183}
178 184
179void tcp_enter_quickack_mode(struct sock *sk) 185static void tcp_enter_quickack_mode(struct sock *sk)
180{ 186{
181 struct inet_connection_sock *icsk = inet_csk(sk); 187 struct inet_connection_sock *icsk = inet_csk(sk);
182 tcp_incr_quickack(sk); 188 tcp_incr_quickack(sk);
@@ -253,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
253 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + 259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
254 sizeof(struct sk_buff); 260 sizeof(struct sk_buff);
255 261
256 if (sk->sk_sndbuf < 3 * sndmem) 262 if (sk->sk_sndbuf < 3 * sndmem) {
257 sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); 263 sk->sk_sndbuf = 3 * sndmem;
264 if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
265 sk->sk_sndbuf = sysctl_tcp_wmem[2];
266 }
258} 267}
259 268
260/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 269/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -390,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
390 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 399 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
391 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 400 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
392 !tcp_memory_pressure && 401 !tcp_memory_pressure &&
393 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { 402 atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
394 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 403 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
395 sysctl_tcp_rmem[2]); 404 sysctl_tcp_rmem[2]);
396 } 405 }
@@ -411,20 +420,21 @@ void tcp_initialize_rcv_mss(struct sock *sk)
411 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); 420 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
412 421
413 hint = min(hint, tp->rcv_wnd / 2); 422 hint = min(hint, tp->rcv_wnd / 2);
414 hint = min(hint, TCP_MIN_RCVMSS); 423 hint = min(hint, TCP_MSS_DEFAULT);
415 hint = max(hint, TCP_MIN_MSS); 424 hint = max(hint, TCP_MIN_MSS);
416 425
417 inet_csk(sk)->icsk_ack.rcv_mss = hint; 426 inet_csk(sk)->icsk_ack.rcv_mss = hint;
418} 427}
428EXPORT_SYMBOL(tcp_initialize_rcv_mss);
419 429
420/* Receiver "autotuning" code. 430/* Receiver "autotuning" code.
421 * 431 *
422 * The algorithm for RTT estimation w/o timestamps is based on 432 * The algorithm for RTT estimation w/o timestamps is based on
423 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. 433 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
424 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> 434 * <http://public.lanl.gov/radiant/pubs.html#DRS>
425 * 435 *
426 * More detail on this code can be found at 436 * More detail on this code can be found at
427 * <http://www.psc.edu/~jheffner/senior_thesis.ps>, 437 * <http://staff.psc.edu/jheffner/>,
428 * though this reference is out of date. A new paper 438 * though this reference is out of date. A new paper
429 * is pending. 439 * is pending.
430 */ 440 */
@@ -685,7 +695,7 @@ static inline void tcp_set_rto(struct sock *sk)
685 * is invisible. Actually, Linux-2.4 also generates erratic 695 * is invisible. Actually, Linux-2.4 also generates erratic
686 * ACKs in some circumstances. 696 * ACKs in some circumstances.
687 */ 697 */
688 inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; 698 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
689 699
690 /* 2. Fixups made earlier cannot be right. 700 /* 2. Fixups made earlier cannot be right.
691 * If we do not estimate RTO correctly without them, 701 * If we do not estimate RTO correctly without them,
@@ -696,8 +706,7 @@ static inline void tcp_set_rto(struct sock *sk)
696 /* NOTE: clamping at TCP_RTO_MIN is not required, current algo 706 /* NOTE: clamping at TCP_RTO_MIN is not required, current algo
697 * guarantees that rto is higher. 707 * guarantees that rto is higher.
698 */ 708 */
699 if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) 709 tcp_bound_rto(sk);
700 inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
701} 710}
702 711
703/* Save metrics learned by this TCP session. 712/* Save metrics learned by this TCP session.
@@ -725,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
725 * Reset our results. 734 * Reset our results.
726 */ 735 */
727 if (!(dst_metric_locked(dst, RTAX_RTT))) 736 if (!(dst_metric_locked(dst, RTAX_RTT)))
728 dst->metrics[RTAX_RTT - 1] = 0; 737 dst_metric_set(dst, RTAX_RTT, 0);
729 return; 738 return;
730 } 739 }
731 740
@@ -762,62 +771,53 @@ void tcp_update_metrics(struct sock *sk)
762 set_dst_metric_rtt(dst, RTAX_RTTVAR, var); 771 set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
763 } 772 }
764 773
765 if (tp->snd_ssthresh >= 0xFFFF) { 774 if (tcp_in_initial_slowstart(tp)) {
766 /* Slow start still did not finish. */ 775 /* Slow start still did not finish. */
767 if (dst_metric(dst, RTAX_SSTHRESH) && 776 if (dst_metric(dst, RTAX_SSTHRESH) &&
768 !dst_metric_locked(dst, RTAX_SSTHRESH) && 777 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
769 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) 778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
770 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; 779 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
771 if (!dst_metric_locked(dst, RTAX_CWND) && 780 if (!dst_metric_locked(dst, RTAX_CWND) &&
772 tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
773 dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; 782 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
774 } else if (tp->snd_cwnd > tp->snd_ssthresh && 783 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
775 icsk->icsk_ca_state == TCP_CA_Open) { 784 icsk->icsk_ca_state == TCP_CA_Open) {
776 /* Cong. avoidance phase, cwnd is reliable. */ 785 /* Cong. avoidance phase, cwnd is reliable. */
777 if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 786 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
778 dst->metrics[RTAX_SSTHRESH-1] = 787 dst_metric_set(dst, RTAX_SSTHRESH,
779 max(tp->snd_cwnd >> 1, tp->snd_ssthresh); 788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
780 if (!dst_metric_locked(dst, RTAX_CWND)) 789 if (!dst_metric_locked(dst, RTAX_CWND))
781 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; 790 dst_metric_set(dst, RTAX_CWND,
791 (dst_metric(dst, RTAX_CWND) +
792 tp->snd_cwnd) >> 1);
782 } else { 793 } else {
783 /* Else slow start did not finish, cwnd is non-sense, 794 /* Else slow start did not finish, cwnd is non-sense,
784 ssthresh may be also invalid. 795 ssthresh may be also invalid.
785 */ 796 */
786 if (!dst_metric_locked(dst, RTAX_CWND)) 797 if (!dst_metric_locked(dst, RTAX_CWND))
787 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; 798 dst_metric_set(dst, RTAX_CWND,
799 (dst_metric(dst, RTAX_CWND) +
800 tp->snd_ssthresh) >> 1);
788 if (dst_metric(dst, RTAX_SSTHRESH) && 801 if (dst_metric(dst, RTAX_SSTHRESH) &&
789 !dst_metric_locked(dst, RTAX_SSTHRESH) && 802 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
790 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) 803 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
791 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; 804 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
792 } 805 }
793 806
794 if (!dst_metric_locked(dst, RTAX_REORDERING)) { 807 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
795 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && 808 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
796 tp->reordering != sysctl_tcp_reordering) 809 tp->reordering != sysctl_tcp_reordering)
797 dst->metrics[RTAX_REORDERING-1] = tp->reordering; 810 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
798 } 811 }
799 } 812 }
800} 813}
801 814
802/* Numbers are taken from RFC3390.
803 *
804 * John Heffner states:
805 *
806 * The RFC specifies a window of no more than 4380 bytes
807 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
808 * is a bit misleading because they use a clamp at 4380 bytes
809 * rather than use a multiplier in the relevant range.
810 */
811__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 815__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
812{ 816{
813 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
814 818
815 if (!cwnd) { 819 if (!cwnd)
816 if (tp->mss_cache > 1460) 820 cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
817 cwnd = 2;
818 else
819 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
820 }
821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
822} 822}
823 823
@@ -916,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
916 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 916 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
917 } 917 }
918 tcp_set_rto(sk); 918 tcp_set_rto(sk);
919 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 919 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
920 goto reset;
921
922cwnd:
923 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
924 tp->snd_cwnd_stamp = tcp_time_stamp;
925 return;
926
927reset: 920reset:
928 /* Play conservative. If timestamps are not 921 /* Play conservative. If timestamps are not
929 * supported, TCP will fail to recalculate correct 922 * supported, TCP will fail to recalculate correct
930 * rtt, if initial rto is too small. FORGET ALL AND RESET! 923 * rtt, if initial rto is too small. FORGET ALL AND RESET!
931 */ 924 */
932 if (!tp->rx_opt.saw_tstamp && tp->srtt) { 925 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
933 tp->srtt = 0; 926 tp->srtt = 0;
934 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; 927 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
935 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 928 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
929 }
936 } 930 }
937 goto cwnd; 931 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
932 tp->snd_cwnd_stamp = tcp_time_stamp;
938} 933}
939 934
940static void tcp_update_reordering(struct sock *sk, const int metric, 935static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -2301,14 +2296,14 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
2301 * they differ. Since neither occurs due to loss, TCP should really 2296 * they differ. Since neither occurs due to loss, TCP should really
2302 * ignore them. 2297 * ignore them.
2303 */ 2298 */
2304static inline int tcp_dupack_heurestics(struct tcp_sock *tp) 2299static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2305{ 2300{
2306 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2301 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2307} 2302}
2308 2303
2309static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) 2304static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2310{ 2305{
2311 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); 2306 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2312} 2307}
2313 2308
2314static inline int tcp_head_timedout(struct sock *sk) 2309static inline int tcp_head_timedout(struct sock *sk)
@@ -2426,7 +2421,7 @@ static int tcp_time_to_recover(struct sock *sk)
2426 return 1; 2421 return 1;
2427 2422
2428 /* Not-A-Trick#2 : Classic rule... */ 2423 /* Not-A-Trick#2 : Classic rule... */
2429 if (tcp_dupack_heurestics(tp) > tp->reordering) 2424 if (tcp_dupack_heuristics(tp) > tp->reordering)
2430 return 1; 2425 return 1;
2431 2426
2432 /* Trick#3 : when we use RFC2988 timer restart, fast 2427 /* Trick#3 : when we use RFC2988 timer restart, fast
@@ -2448,6 +2443,16 @@ static int tcp_time_to_recover(struct sock *sk)
2448 return 1; 2443 return 1;
2449 } 2444 }
2450 2445
2446 /* If a thin stream is detected, retransmit after first
2447 * received dupack. Employ only if SACK is supported in order
2448 * to avoid possible corner-case series of spurious retransmissions
2449 * Use only if there are no unsent data.
2450 */
2451 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2452 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2453 tcp_is_sack(tp) && !tcp_send_head(sk))
2454 return 1;
2455
2451 return 0; 2456 return 0;
2452} 2457}
2453 2458
@@ -2492,7 +2497,7 @@ static void tcp_timeout_skbs(struct sock *sk)
2492/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2497/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2493 * is against sacked "cnt", otherwise it's against facked "cnt" 2498 * is against sacked "cnt", otherwise it's against facked "cnt"
2494 */ 2499 */
2495static void tcp_mark_head_lost(struct sock *sk, int packets) 2500static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2496{ 2501{
2497 struct tcp_sock *tp = tcp_sk(sk); 2502 struct tcp_sock *tp = tcp_sk(sk);
2498 struct sk_buff *skb; 2503 struct sk_buff *skb;
@@ -2504,6 +2509,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2504 if (tp->lost_skb_hint) { 2509 if (tp->lost_skb_hint) {
2505 skb = tp->lost_skb_hint; 2510 skb = tp->lost_skb_hint;
2506 cnt = tp->lost_cnt_hint; 2511 cnt = tp->lost_cnt_hint;
2512 /* Head already handled? */
2513 if (mark_head && skb != tcp_write_queue_head(sk))
2514 return;
2507 } else { 2515 } else {
2508 skb = tcp_write_queue_head(sk); 2516 skb = tcp_write_queue_head(sk);
2509 cnt = 0; 2517 cnt = 0;
@@ -2526,7 +2534,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2526 cnt += tcp_skb_pcount(skb); 2534 cnt += tcp_skb_pcount(skb);
2527 2535
2528 if (cnt > packets) { 2536 if (cnt > packets) {
2529 if (tcp_is_sack(tp) || (oldcnt >= packets)) 2537 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2538 (oldcnt >= packets))
2530 break; 2539 break;
2531 2540
2532 mss = skb_shinfo(skb)->gso_size; 2541 mss = skb_shinfo(skb)->gso_size;
@@ -2537,6 +2546,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2537 } 2546 }
2538 2547
2539 tcp_skb_mark_lost(tp, skb); 2548 tcp_skb_mark_lost(tp, skb);
2549
2550 if (mark_head)
2551 break;
2540 } 2552 }
2541 tcp_verify_left_out(tp); 2553 tcp_verify_left_out(tp);
2542} 2554}
@@ -2548,17 +2560,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2548 struct tcp_sock *tp = tcp_sk(sk); 2560 struct tcp_sock *tp = tcp_sk(sk);
2549 2561
2550 if (tcp_is_reno(tp)) { 2562 if (tcp_is_reno(tp)) {
2551 tcp_mark_head_lost(sk, 1); 2563 tcp_mark_head_lost(sk, 1, 1);
2552 } else if (tcp_is_fack(tp)) { 2564 } else if (tcp_is_fack(tp)) {
2553 int lost = tp->fackets_out - tp->reordering; 2565 int lost = tp->fackets_out - tp->reordering;
2554 if (lost <= 0) 2566 if (lost <= 0)
2555 lost = 1; 2567 lost = 1;
2556 tcp_mark_head_lost(sk, lost); 2568 tcp_mark_head_lost(sk, lost, 0);
2557 } else { 2569 } else {
2558 int sacked_upto = tp->sacked_out - tp->reordering; 2570 int sacked_upto = tp->sacked_out - tp->reordering;
2559 if (sacked_upto < fast_rexmit) 2571 if (sacked_upto >= 0)
2560 sacked_upto = fast_rexmit; 2572 tcp_mark_head_lost(sk, sacked_upto, 0);
2561 tcp_mark_head_lost(sk, sacked_upto); 2573 else if (fast_rexmit)
2574 tcp_mark_head_lost(sk, 1, 1);
2562 } 2575 }
2563 2576
2564 tcp_timeout_skbs(sk); 2577 tcp_timeout_skbs(sk);
@@ -2624,7 +2637,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2624 if (sk->sk_family == AF_INET) { 2637 if (sk->sk_family == AF_INET) {
2625 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", 2638 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2626 msg, 2639 msg,
2627 &inet->daddr, ntohs(inet->dport), 2640 &inet->inet_daddr, ntohs(inet->inet_dport),
2628 tp->snd_cwnd, tcp_left_out(tp), 2641 tp->snd_cwnd, tcp_left_out(tp),
2629 tp->snd_ssthresh, tp->prior_ssthresh, 2642 tp->snd_ssthresh, tp->prior_ssthresh,
2630 tp->packets_out); 2643 tp->packets_out);
@@ -2634,7 +2647,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2634 struct ipv6_pinfo *np = inet6_sk(sk); 2647 struct ipv6_pinfo *np = inet6_sk(sk);
2635 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2648 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2636 msg, 2649 msg,
2637 &np->daddr, ntohs(inet->dport), 2650 &np->daddr, ntohs(inet->inet_dport),
2638 tp->snd_cwnd, tcp_left_out(tp), 2651 tp->snd_cwnd, tcp_left_out(tp),
2639 tp->snd_ssthresh, tp->prior_ssthresh, 2652 tp->snd_ssthresh, tp->prior_ssthresh,
2640 tp->packets_out); 2653 tp->packets_out);
@@ -2718,6 +2731,35 @@ static void tcp_try_undo_dsack(struct sock *sk)
2718 } 2731 }
2719} 2732}
2720 2733
2734/* We can clear retrans_stamp when there are no retransmissions in the
2735 * window. It would seem that it is trivially available for us in
2736 * tp->retrans_out, however, that kind of assumptions doesn't consider
2737 * what will happen if errors occur when sending retransmission for the
2738 * second time. ...It could the that such segment has only
2739 * TCPCB_EVER_RETRANS set at the present time. It seems that checking
2740 * the head skb is enough except for some reneging corner cases that
2741 * are not worth the effort.
2742 *
2743 * Main reason for all this complexity is the fact that connection dying
2744 * time now depends on the validity of the retrans_stamp, in particular,
2745 * that successive retransmissions of a segment must not advance
2746 * retrans_stamp under any conditions.
2747 */
2748static int tcp_any_retrans_done(struct sock *sk)
2749{
2750 struct tcp_sock *tp = tcp_sk(sk);
2751 struct sk_buff *skb;
2752
2753 if (tp->retrans_out)
2754 return 1;
2755
2756 skb = tcp_write_queue_head(sk);
2757 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2758 return 1;
2759
2760 return 0;
2761}
2762
2721/* Undo during fast recovery after partial ACK. */ 2763/* Undo during fast recovery after partial ACK. */
2722 2764
2723static int tcp_try_undo_partial(struct sock *sk, int acked) 2765static int tcp_try_undo_partial(struct sock *sk, int acked)
@@ -2730,7 +2772,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2730 /* Plain luck! Hole if filled with delayed 2772 /* Plain luck! Hole if filled with delayed
2731 * packet, rather than with a retransmit. 2773 * packet, rather than with a retransmit.
2732 */ 2774 */
2733 if (tp->retrans_out == 0) 2775 if (!tcp_any_retrans_done(sk))
2734 tp->retrans_stamp = 0; 2776 tp->retrans_stamp = 0;
2735 2777
2736 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); 2778 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
@@ -2789,7 +2831,7 @@ static void tcp_try_keep_open(struct sock *sk)
2789 struct tcp_sock *tp = tcp_sk(sk); 2831 struct tcp_sock *tp = tcp_sk(sk);
2790 int state = TCP_CA_Open; 2832 int state = TCP_CA_Open;
2791 2833
2792 if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) 2834 if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
2793 state = TCP_CA_Disorder; 2835 state = TCP_CA_Disorder;
2794 2836
2795 if (inet_csk(sk)->icsk_ca_state != state) { 2837 if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2804,7 +2846,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
2804 2846
2805 tcp_verify_left_out(tp); 2847 tcp_verify_left_out(tp);
2806 2848
2807 if (!tp->frto_counter && tp->retrans_out == 0) 2849 if (!tp->frto_counter && !tcp_any_retrans_done(sk))
2808 tp->retrans_stamp = 0; 2850 tp->retrans_stamp = 0;
2809 2851
2810 if (flag & FLAG_ECE) 2852 if (flag & FLAG_ECE)
@@ -2838,7 +2880,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
2838 icsk->icsk_mtup.probe_size; 2880 icsk->icsk_mtup.probe_size;
2839 tp->snd_cwnd_cnt = 0; 2881 tp->snd_cwnd_cnt = 0;
2840 tp->snd_cwnd_stamp = tcp_time_stamp; 2882 tp->snd_cwnd_stamp = tcp_time_stamp;
2841 tp->rcv_ssthresh = tcp_current_ssthresh(sk); 2883 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2842 2884
2843 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; 2885 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2844 icsk->icsk_mtup.probe_size = 0; 2886 icsk->icsk_mtup.probe_size = 0;
@@ -2894,6 +2936,7 @@ void tcp_simple_retransmit(struct sock *sk)
2894 } 2936 }
2895 tcp_xmit_retransmit_queue(sk); 2937 tcp_xmit_retransmit_queue(sk);
2896} 2938}
2939EXPORT_SYMBOL(tcp_simple_retransmit);
2897 2940
2898/* Process an event, which can update packets-in-flight not trivially. 2941/* Process an event, which can update packets-in-flight not trivially.
2899 * Main goal of this function is to calculate new estimate for left_out, 2942 * Main goal of this function is to calculate new estimate for left_out,
@@ -2934,7 +2977,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2934 before(tp->snd_una, tp->high_seq) && 2977 before(tp->snd_una, tp->high_seq) &&
2935 icsk->icsk_ca_state != TCP_CA_Open && 2978 icsk->icsk_ca_state != TCP_CA_Open &&
2936 tp->fackets_out > tp->reordering) { 2979 tp->fackets_out > tp->reordering) {
2937 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); 2980 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
2938 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); 2981 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
2939 } 2982 }
2940 2983
@@ -3242,7 +3285,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3242 * connection startup slow start one packet too 3285 * connection startup slow start one packet too
3243 * quickly. This is severely frowned upon behavior. 3286 * quickly. This is severely frowned upon behavior.
3244 */ 3287 */
3245 if (!(scb->flags & TCPCB_FLAG_SYN)) { 3288 if (!(scb->flags & TCPHDR_SYN)) {
3246 flag |= FLAG_DATA_ACKED; 3289 flag |= FLAG_DATA_ACKED;
3247 } else { 3290 } else {
3248 flag |= FLAG_SYN_ACKED; 3291 flag |= FLAG_SYN_ACKED;
@@ -3362,8 +3405,8 @@ static void tcp_ack_probe(struct sock *sk)
3362 3405
3363static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3406static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3364{ 3407{
3365 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3408 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3366 inet_csk(sk)->icsk_ca_state != TCP_CA_Open); 3409 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3367} 3410}
3368 3411
3369static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3412static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3380,9 +3423,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
3380 const u32 ack, const u32 ack_seq, 3423 const u32 ack, const u32 ack_seq,
3381 const u32 nwin) 3424 const u32 nwin)
3382{ 3425{
3383 return (after(ack, tp->snd_una) || 3426 return after(ack, tp->snd_una) ||
3384 after(ack_seq, tp->snd_wl1) || 3427 after(ack_seq, tp->snd_wl1) ||
3385 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); 3428 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3386} 3429}
3387 3430
3388/* Update our send window. 3431/* Update our send window.
@@ -3666,7 +3709,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3666 } 3709 }
3667 3710
3668 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3711 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3669 dst_confirm(sk->sk_dst_cache); 3712 dst_confirm(__sk_dst_get(sk));
3670 3713
3671 return 1; 3714 return 1;
3672 3715
@@ -3699,7 +3742,7 @@ old_ack:
3699 * the fast version below fails. 3742 * the fast version below fails.
3700 */ 3743 */
3701void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, 3744void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3702 int estab) 3745 u8 **hvpp, int estab)
3703{ 3746{
3704 unsigned char *ptr; 3747 unsigned char *ptr;
3705 struct tcphdr *th = tcp_hdr(skb); 3748 struct tcphdr *th = tcp_hdr(skb);
@@ -3783,6 +3826,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3783 */ 3826 */
3784 break; 3827 break;
3785#endif 3828#endif
3829 case TCPOPT_COOKIE:
3830 /* This option is variable length.
3831 */
3832 switch (opsize) {
3833 case TCPOLEN_COOKIE_BASE:
3834 /* not yet implemented */
3835 break;
3836 case TCPOLEN_COOKIE_PAIR:
3837 /* not yet implemented */
3838 break;
3839 case TCPOLEN_COOKIE_MIN+0:
3840 case TCPOLEN_COOKIE_MIN+2:
3841 case TCPOLEN_COOKIE_MIN+4:
3842 case TCPOLEN_COOKIE_MIN+6:
3843 case TCPOLEN_COOKIE_MAX:
3844 /* 16-bit multiple */
3845 opt_rx->cookie_plus = opsize;
3846 *hvpp = ptr;
3847 break;
3848 default:
3849 /* ignore option */
3850 break;
3851 }
3852 break;
3786 } 3853 }
3787 3854
3788 ptr += opsize-2; 3855 ptr += opsize-2;
@@ -3790,6 +3857,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3790 } 3857 }
3791 } 3858 }
3792} 3859}
3860EXPORT_SYMBOL(tcp_parse_options);
3793 3861
3794static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) 3862static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3795{ 3863{
@@ -3811,17 +3879,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3811 * If it is wrong it falls back on tcp_parse_options(). 3879 * If it is wrong it falls back on tcp_parse_options().
3812 */ 3880 */
3813static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, 3881static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3814 struct tcp_sock *tp) 3882 struct tcp_sock *tp, u8 **hvpp)
3815{ 3883{
3816 if (th->doff == sizeof(struct tcphdr) >> 2) { 3884 /* In the spirit of fast parsing, compare doff directly to constant
3885 * values. Because equality is used, short doff can be ignored here.
3886 */
3887 if (th->doff == (sizeof(*th) / 4)) {
3817 tp->rx_opt.saw_tstamp = 0; 3888 tp->rx_opt.saw_tstamp = 0;
3818 return 0; 3889 return 0;
3819 } else if (tp->rx_opt.tstamp_ok && 3890 } else if (tp->rx_opt.tstamp_ok &&
3820 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { 3891 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3821 if (tcp_parse_aligned_timestamp(tp, th)) 3892 if (tcp_parse_aligned_timestamp(tp, th))
3822 return 1; 3893 return 1;
3823 } 3894 }
3824 tcp_parse_options(skb, &tp->rx_opt, 1); 3895 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
3825 return 1; 3896 return 1;
3826} 3897}
3827 3898
@@ -3853,13 +3924,14 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
3853 if (opsize < 2 || opsize > length) 3924 if (opsize < 2 || opsize > length)
3854 return NULL; 3925 return NULL;
3855 if (opcode == TCPOPT_MD5SIG) 3926 if (opcode == TCPOPT_MD5SIG)
3856 return ptr; 3927 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
3857 } 3928 }
3858 ptr += opsize - 2; 3929 ptr += opsize - 2;
3859 length -= opsize; 3930 length -= opsize;
3860 } 3931 }
3861 return NULL; 3932 return NULL;
3862} 3933}
3934EXPORT_SYMBOL(tcp_parse_md5sig_option);
3863#endif 3935#endif
3864 3936
3865static inline void tcp_store_ts_recent(struct tcp_sock *tp) 3937static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -3970,6 +4042,8 @@ static void tcp_reset(struct sock *sk)
3970 default: 4042 default:
3971 sk->sk_err = ECONNRESET; 4043 sk->sk_err = ECONNRESET;
3972 } 4044 }
4045 /* This barrier is coupled with smp_rmb() in tcp_poll() */
4046 smp_wmb();
3973 4047
3974 if (!sock_flag(sk, SOCK_DEAD)) 4048 if (!sock_flag(sk, SOCK_DEAD))
3975 sk->sk_error_report(sk); 4049 sk->sk_error_report(sk);
@@ -4249,7 +4323,7 @@ static void tcp_ofo_queue(struct sock *sk)
4249 } 4323 }
4250 4324
4251 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4325 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4252 SOCK_DEBUG(sk, "ofo packet was already received \n"); 4326 SOCK_DEBUG(sk, "ofo packet was already received\n");
4253 __skb_unlink(skb, &tp->out_of_order_queue); 4327 __skb_unlink(skb, &tp->out_of_order_queue);
4254 __kfree_skb(skb); 4328 __kfree_skb(skb);
4255 continue; 4329 continue;
@@ -4297,6 +4371,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4297 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) 4371 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
4298 goto drop; 4372 goto drop;
4299 4373
4374 skb_dst_drop(skb);
4300 __skb_pull(skb, th->doff * 4); 4375 __skb_pull(skb, th->doff * 4);
4301 4376
4302 TCP_ECN_accept_cwr(tp, skb); 4377 TCP_ECN_accept_cwr(tp, skb);
@@ -4788,7 +4863,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
4788 return 0; 4863 return 0;
4789 4864
4790 /* If we are under soft global TCP memory pressure, do not expand. */ 4865 /* If we are under soft global TCP memory pressure, do not expand. */
4791 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) 4866 if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
4792 return 0; 4867 return 0;
4793 4868
4794 /* If we filled the congestion window, do not expand. */ 4869 /* If we filled the congestion window, do not expand. */
@@ -4846,11 +4921,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
4846 struct tcp_sock *tp = tcp_sk(sk); 4921 struct tcp_sock *tp = tcp_sk(sk);
4847 4922
4848 /* More than one full frame received... */ 4923 /* More than one full frame received... */
4849 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss 4924 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
4850 /* ... and right edge of window advances far enough. 4925 /* ... and right edge of window advances far enough.
4851 * (tcp_recvmsg() will send ACK otherwise). Or... 4926 * (tcp_recvmsg() will send ACK otherwise). Or...
4852 */ 4927 */
4853 && __tcp_select_window(sk) >= tp->rcv_wnd) || 4928 __tcp_select_window(sk) >= tp->rcv_wnd) ||
4854 /* We ACK each frame or... */ 4929 /* We ACK each frame or... */
4855 tcp_in_quickack_mode(sk) || 4930 tcp_in_quickack_mode(sk) ||
4856 /* We have out of order data. */ 4931 /* We have out of order data. */
@@ -5071,10 +5146,12 @@ out:
5071static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 5146static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5072 struct tcphdr *th, int syn_inerr) 5147 struct tcphdr *th, int syn_inerr)
5073{ 5148{
5149 u8 *hash_location;
5074 struct tcp_sock *tp = tcp_sk(sk); 5150 struct tcp_sock *tp = tcp_sk(sk);
5075 5151
5076 /* RFC1323: H1. Apply PAWS check first. */ 5152 /* RFC1323: H1. Apply PAWS check first. */
5077 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && 5153 if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
5154 tp->rx_opt.saw_tstamp &&
5078 tcp_paws_discard(sk, skb)) { 5155 tcp_paws_discard(sk, skb)) {
5079 if (!th->rst) { 5156 if (!th->rst) {
5080 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 5157 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5358,15 +5435,18 @@ discard:
5358 __kfree_skb(skb); 5435 __kfree_skb(skb);
5359 return 0; 5436 return 0;
5360} 5437}
5438EXPORT_SYMBOL(tcp_rcv_established);
5361 5439
5362static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5440static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5363 struct tcphdr *th, unsigned len) 5441 struct tcphdr *th, unsigned len)
5364{ 5442{
5365 struct tcp_sock *tp = tcp_sk(sk); 5443 u8 *hash_location;
5366 struct inet_connection_sock *icsk = inet_csk(sk); 5444 struct inet_connection_sock *icsk = inet_csk(sk);
5445 struct tcp_sock *tp = tcp_sk(sk);
5446 struct tcp_cookie_values *cvp = tp->cookie_values;
5367 int saved_clamp = tp->rx_opt.mss_clamp; 5447 int saved_clamp = tp->rx_opt.mss_clamp;
5368 5448
5369 tcp_parse_options(skb, &tp->rx_opt, 0); 5449 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
5370 5450
5371 if (th->ack) { 5451 if (th->ack) {
5372 /* rfc793: 5452 /* rfc793:
@@ -5463,6 +5543,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5463 * Change state from SYN-SENT only after copied_seq 5543 * Change state from SYN-SENT only after copied_seq
5464 * is initialized. */ 5544 * is initialized. */
5465 tp->copied_seq = tp->rcv_nxt; 5545 tp->copied_seq = tp->rcv_nxt;
5546
5547 if (cvp != NULL &&
5548 cvp->cookie_pair_size > 0 &&
5549 tp->rx_opt.cookie_plus > 0) {
5550 int cookie_size = tp->rx_opt.cookie_plus
5551 - TCPOLEN_COOKIE_BASE;
5552 int cookie_pair_size = cookie_size
5553 + cvp->cookie_desired;
5554
5555 /* A cookie extension option was sent and returned.
5556 * Note that each incoming SYNACK replaces the
5557 * Responder cookie. The initial exchange is most
5558 * fragile, as protection against spoofing relies
5559 * entirely upon the sequence and timestamp (above).
5560 * This replacement strategy allows the correct pair to
5561 * pass through, while any others will be filtered via
5562 * Responder verification later.
5563 */
5564 if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
5565 memcpy(&cvp->cookie_pair[cvp->cookie_desired],
5566 hash_location, cookie_size);
5567 cvp->cookie_pair_size = cookie_pair_size;
5568 }
5569 }
5570
5466 smp_mb(); 5571 smp_mb();
5467 tcp_set_state(sk, TCP_ESTABLISHED); 5572 tcp_set_state(sk, TCP_ESTABLISHED);
5468 5573
@@ -5700,11 +5805,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5700 5805
5701 /* tcp_ack considers this ACK as duplicate 5806 /* tcp_ack considers this ACK as duplicate
5702 * and does not calculate rtt. 5807 * and does not calculate rtt.
5703 * Fix it at least with timestamps. 5808 * Force it here.
5704 */ 5809 */
5705 if (tp->rx_opt.saw_tstamp && 5810 tcp_ack_update_rtt(sk, 0, 0);
5706 tp->rx_opt.rcv_tsecr && !tp->srtt)
5707 tcp_ack_saw_tstamp(sk, 0);
5708 5811
5709 if (tp->rx_opt.tstamp_ok) 5812 if (tp->rx_opt.tstamp_ok)
5710 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5813 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5736,7 +5839,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5736 if (tp->snd_una == tp->write_seq) { 5839 if (tp->snd_una == tp->write_seq) {
5737 tcp_set_state(sk, TCP_FIN_WAIT2); 5840 tcp_set_state(sk, TCP_FIN_WAIT2);
5738 sk->sk_shutdown |= SEND_SHUTDOWN; 5841 sk->sk_shutdown |= SEND_SHUTDOWN;
5739 dst_confirm(sk->sk_dst_cache); 5842 dst_confirm(__sk_dst_get(sk));
5740 5843
5741 if (!sock_flag(sk, SOCK_DEAD)) 5844 if (!sock_flag(sk, SOCK_DEAD))
5742 /* Wake up lingering close() */ 5845 /* Wake up lingering close() */
@@ -5832,14 +5935,4 @@ discard:
5832 } 5935 }
5833 return 0; 5936 return 0;
5834} 5937}
5835
5836EXPORT_SYMBOL(sysctl_tcp_ecn);
5837EXPORT_SYMBOL(sysctl_tcp_reordering);
5838EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
5839EXPORT_SYMBOL(tcp_parse_options);
5840#ifdef CONFIG_TCP_MD5SIG
5841EXPORT_SYMBOL(tcp_parse_md5sig_option);
5842#endif
5843EXPORT_SYMBOL(tcp_rcv_established);
5844EXPORT_SYMBOL(tcp_rcv_state_process); 5938EXPORT_SYMBOL(tcp_rcv_state_process);
5845EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a1ca2698c88..856f68466d49 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -60,6 +60,7 @@
60#include <linux/jhash.h> 60#include <linux/jhash.h>
61#include <linux/init.h> 61#include <linux/init.h>
62#include <linux/times.h> 62#include <linux/times.h>
63#include <linux/slab.h>
63 64
64#include <net/net_namespace.h> 65#include <net/net_namespace.h>
65#include <net/icmp.h> 66#include <net/icmp.h>
@@ -83,6 +84,7 @@
83 84
84int sysctl_tcp_tw_reuse __read_mostly; 85int sysctl_tcp_tw_reuse __read_mostly;
85int sysctl_tcp_low_latency __read_mostly; 86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
86 88
87 89
88#ifdef CONFIG_TCP_MD5SIG 90#ifdef CONFIG_TCP_MD5SIG
@@ -99,6 +101,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
99#endif 101#endif
100 102
101struct inet_hashinfo tcp_hashinfo; 103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
102 105
103static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104{ 107{
@@ -138,7 +141,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
138 141
139 return 0; 142 return 0;
140} 143}
141
142EXPORT_SYMBOL_GPL(tcp_twsk_unique); 144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143 145
144/* This will initiate an outgoing connection. */ 146/* This will initiate an outgoing connection. */
@@ -165,10 +167,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
165 nexthop = inet->opt->faddr; 167 nexthop = inet->opt->faddr;
166 } 168 }
167 169
168 tmp = ip_route_connect(&rt, nexthop, inet->saddr, 170 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
169 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 171 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170 IPPROTO_TCP, 172 IPPROTO_TCP,
171 inet->sport, usin->sin_port, sk, 1); 173 inet->inet_sport, usin->sin_port, sk, 1);
172 if (tmp < 0) { 174 if (tmp < 0) {
173 if (tmp == -ENETUNREACH) 175 if (tmp == -ENETUNREACH)
174 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 176 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -183,11 +185,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
183 if (!inet->opt || !inet->opt->srr) 185 if (!inet->opt || !inet->opt->srr)
184 daddr = rt->rt_dst; 186 daddr = rt->rt_dst;
185 187
186 if (!inet->saddr) 188 if (!inet->inet_saddr)
187 inet->saddr = rt->rt_src; 189 inet->inet_saddr = rt->rt_src;
188 inet->rcv_saddr = inet->saddr; 190 inet->inet_rcv_saddr = inet->inet_saddr;
189 191
190 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { 192 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
191 /* Reset inherited state */ 193 /* Reset inherited state */
192 tp->rx_opt.ts_recent = 0; 194 tp->rx_opt.ts_recent = 0;
193 tp->rx_opt.ts_recent_stamp = 0; 195 tp->rx_opt.ts_recent_stamp = 0;
@@ -203,21 +205,23 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 * TIME-WAIT * and initialize rx_opt.ts_recent from it, 205 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
204 * when trying new connection. 206 * when trying new connection.
205 */ 207 */
206 if (peer != NULL && 208 if (peer) {
207 peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { 209 inet_peer_refcheck(peer);
208 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; 210 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
209 tp->rx_opt.ts_recent = peer->tcp_ts; 211 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212 tp->rx_opt.ts_recent = peer->tcp_ts;
213 }
210 } 214 }
211 } 215 }
212 216
213 inet->dport = usin->sin_port; 217 inet->inet_dport = usin->sin_port;
214 inet->daddr = daddr; 218 inet->inet_daddr = daddr;
215 219
216 inet_csk(sk)->icsk_ext_hdr_len = 0; 220 inet_csk(sk)->icsk_ext_hdr_len = 0;
217 if (inet->opt) 221 if (inet->opt)
218 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; 222 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219 223
220 tp->rx_opt.mss_clamp = 536; 224 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
221 225
222 /* Socket identity is still unknown (sport may be zero). 226 /* Socket identity is still unknown (sport may be zero).
223 * However we set state to SYN-SENT and not releasing socket 227 * However we set state to SYN-SENT and not releasing socket
@@ -230,21 +234,21 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
230 goto failure; 234 goto failure;
231 235
232 err = ip_route_newports(&rt, IPPROTO_TCP, 236 err = ip_route_newports(&rt, IPPROTO_TCP,
233 inet->sport, inet->dport, sk); 237 inet->inet_sport, inet->inet_dport, sk);
234 if (err) 238 if (err)
235 goto failure; 239 goto failure;
236 240
237 /* OK, now commit destination to socket. */ 241 /* OK, now commit destination to socket. */
238 sk->sk_gso_type = SKB_GSO_TCPV4; 242 sk->sk_gso_type = SKB_GSO_TCPV4;
239 sk_setup_caps(sk, &rt->u.dst); 243 sk_setup_caps(sk, &rt->dst);
240 244
241 if (!tp->write_seq) 245 if (!tp->write_seq)
242 tp->write_seq = secure_tcp_sequence_number(inet->saddr, 246 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
243 inet->daddr, 247 inet->inet_daddr,
244 inet->sport, 248 inet->inet_sport,
245 usin->sin_port); 249 usin->sin_port);
246 250
247 inet->id = tp->write_seq ^ jiffies; 251 inet->inet_id = tp->write_seq ^ jiffies;
248 252
249 err = tcp_connect(sk); 253 err = tcp_connect(sk);
250 rt = NULL; 254 rt = NULL;
@@ -261,9 +265,10 @@ failure:
261 tcp_set_state(sk, TCP_CLOSE); 265 tcp_set_state(sk, TCP_CLOSE);
262 ip_rt_put(rt); 266 ip_rt_put(rt);
263 sk->sk_route_caps = 0; 267 sk->sk_route_caps = 0;
264 inet->dport = 0; 268 inet->inet_dport = 0;
265 return err; 269 return err;
266} 270}
271EXPORT_SYMBOL(tcp_v4_connect);
267 272
268/* 273/*
269 * This routine does path mtu discovery as defined in RFC1191. 274 * This routine does path mtu discovery as defined in RFC1191.
@@ -328,26 +333,29 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
328 * 333 *
329 */ 334 */
330 335
331void tcp_v4_err(struct sk_buff *skb, u32 info) 336void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
332{ 337{
333 struct iphdr *iph = (struct iphdr *)skb->data; 338 struct iphdr *iph = (struct iphdr *)icmp_skb->data;
334 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); 339 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
340 struct inet_connection_sock *icsk;
335 struct tcp_sock *tp; 341 struct tcp_sock *tp;
336 struct inet_sock *inet; 342 struct inet_sock *inet;
337 const int type = icmp_hdr(skb)->type; 343 const int type = icmp_hdr(icmp_skb)->type;
338 const int code = icmp_hdr(skb)->code; 344 const int code = icmp_hdr(icmp_skb)->code;
339 struct sock *sk; 345 struct sock *sk;
346 struct sk_buff *skb;
340 __u32 seq; 347 __u32 seq;
348 __u32 remaining;
341 int err; 349 int err;
342 struct net *net = dev_net(skb->dev); 350 struct net *net = dev_net(icmp_skb->dev);
343 351
344 if (skb->len < (iph->ihl << 2) + 8) { 352 if (icmp_skb->len < (iph->ihl << 2) + 8) {
345 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); 353 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
346 return; 354 return;
347 } 355 }
348 356
349 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, 357 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
350 iph->saddr, th->source, inet_iif(skb)); 358 iph->saddr, th->source, inet_iif(icmp_skb));
351 if (!sk) { 359 if (!sk) {
352 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); 360 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
353 return; 361 return;
@@ -367,6 +375,12 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
367 if (sk->sk_state == TCP_CLOSE) 375 if (sk->sk_state == TCP_CLOSE)
368 goto out; 376 goto out;
369 377
378 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
379 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
380 goto out;
381 }
382
383 icsk = inet_csk(sk);
370 tp = tcp_sk(sk); 384 tp = tcp_sk(sk);
371 seq = ntohl(th->seq); 385 seq = ntohl(th->seq);
372 if (sk->sk_state != TCP_LISTEN && 386 if (sk->sk_state != TCP_LISTEN &&
@@ -393,6 +407,37 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
393 } 407 }
394 408
395 err = icmp_err_convert[code].errno; 409 err = icmp_err_convert[code].errno;
410 /* check if icmp_skb allows revert of backoff
411 * (see draft-zimmermann-tcp-lcd) */
412 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
413 break;
414 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
415 !icsk->icsk_backoff)
416 break;
417
418 if (sock_owned_by_user(sk))
419 break;
420
421 icsk->icsk_backoff--;
422 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
423 icsk->icsk_backoff;
424 tcp_bound_rto(sk);
425
426 skb = tcp_write_queue_head(sk);
427 BUG_ON(!skb);
428
429 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
430 tcp_time_stamp - TCP_SKB_CB(skb)->when);
431
432 if (remaining) {
433 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
434 remaining, TCP_RTO_MAX);
435 } else {
436 /* RTO revert clocked out retransmission.
437 * Will retransmit now */
438 tcp_retransmit_timer(sk);
439 }
440
396 break; 441 break;
397 case ICMP_TIME_EXCEEDED: 442 case ICMP_TIME_EXCEEDED:
398 err = EHOSTUNREACH; 443 err = EHOSTUNREACH;
@@ -476,25 +521,32 @@ out:
476 sock_put(sk); 521 sock_put(sk);
477} 522}
478 523
479/* This routine computes an IPv4 TCP checksum. */ 524static void __tcp_v4_send_check(struct sk_buff *skb,
480void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) 525 __be32 saddr, __be32 daddr)
481{ 526{
482 struct inet_sock *inet = inet_sk(sk);
483 struct tcphdr *th = tcp_hdr(skb); 527 struct tcphdr *th = tcp_hdr(skb);
484 528
485 if (skb->ip_summed == CHECKSUM_PARTIAL) { 529 if (skb->ip_summed == CHECKSUM_PARTIAL) {
486 th->check = ~tcp_v4_check(len, inet->saddr, 530 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
487 inet->daddr, 0);
488 skb->csum_start = skb_transport_header(skb) - skb->head; 531 skb->csum_start = skb_transport_header(skb) - skb->head;
489 skb->csum_offset = offsetof(struct tcphdr, check); 532 skb->csum_offset = offsetof(struct tcphdr, check);
490 } else { 533 } else {
491 th->check = tcp_v4_check(len, inet->saddr, inet->daddr, 534 th->check = tcp_v4_check(skb->len, saddr, daddr,
492 csum_partial(th, 535 csum_partial(th,
493 th->doff << 2, 536 th->doff << 2,
494 skb->csum)); 537 skb->csum));
495 } 538 }
496} 539}
497 540
541/* This routine computes an IPv4 TCP checksum. */
542void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
543{
544 struct inet_sock *inet = inet_sk(sk);
545
546 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
547}
548EXPORT_SYMBOL(tcp_v4_send_check);
549
498int tcp_v4_gso_send_check(struct sk_buff *skb) 550int tcp_v4_gso_send_check(struct sk_buff *skb)
499{ 551{
500 const struct iphdr *iph; 552 const struct iphdr *iph;
@@ -507,10 +559,8 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
507 th = tcp_hdr(skb); 559 th = tcp_hdr(skb);
508 560
509 th->check = 0; 561 th->check = 0;
510 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
511 skb->csum_start = skb_transport_header(skb) - skb->head;
512 skb->csum_offset = offsetof(struct tcphdr, check);
513 skb->ip_summed = CHECKSUM_PARTIAL; 562 skb->ip_summed = CHECKSUM_PARTIAL;
563 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
514 return 0; 564 return 0;
515} 565}
516 566
@@ -704,8 +754,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
704 * This still operates on a request_sock only, not on a big 754 * This still operates on a request_sock only, not on a big
705 * socket. 755 * socket.
706 */ 756 */
707static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, 757static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
708 struct dst_entry *dst) 758 struct request_sock *req,
759 struct request_values *rvp)
709{ 760{
710 const struct inet_request_sock *ireq = inet_rsk(req); 761 const struct inet_request_sock *ireq = inet_rsk(req);
711 int err = -1; 762 int err = -1;
@@ -715,16 +766,10 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
715 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) 766 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
716 return -1; 767 return -1;
717 768
718 skb = tcp_make_synack(sk, dst, req); 769 skb = tcp_make_synack(sk, dst, req, rvp);
719 770
720 if (skb) { 771 if (skb) {
721 struct tcphdr *th = tcp_hdr(skb); 772 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
722
723 th->check = tcp_v4_check(skb->len,
724 ireq->loc_addr,
725 ireq->rmt_addr,
726 csum_partial(th, skb->len,
727 skb->csum));
728 773
729 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, 774 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
730 ireq->rmt_addr, 775 ireq->rmt_addr,
@@ -736,9 +781,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
736 return err; 781 return err;
737} 782}
738 783
739static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) 784static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
785 struct request_values *rvp)
740{ 786{
741 return __tcp_v4_send_synack(sk, req, NULL); 787 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
788 return tcp_v4_send_synack(sk, NULL, req, rvp);
742} 789}
743 790
744/* 791/*
@@ -749,19 +796,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
749 kfree(inet_rsk(req)->opt); 796 kfree(inet_rsk(req)->opt);
750} 797}
751 798
752#ifdef CONFIG_SYN_COOKIES 799static void syn_flood_warning(const struct sk_buff *skb)
753static void syn_flood_warning(struct sk_buff *skb)
754{ 800{
755 static unsigned long warntime; 801 const char *msg;
756 802
757 if (time_after(jiffies, (warntime + HZ * 60))) { 803#ifdef CONFIG_SYN_COOKIES
758 warntime = jiffies; 804 if (sysctl_tcp_syncookies)
759 printk(KERN_INFO 805 msg = "Sending cookies";
760 "possible SYN flooding on port %d. Sending cookies.\n", 806 else
761 ntohs(tcp_hdr(skb)->dest));
762 }
763}
764#endif 807#endif
808 msg = "Dropping request";
809
810 pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
811 ntohs(tcp_hdr(skb)->dest), msg);
812}
765 813
766/* 814/*
767 * Save and compile IPv4 options into the request_sock if needed. 815 * Save and compile IPv4 options into the request_sock if needed.
@@ -811,9 +859,8 @@ static struct tcp_md5sig_key *
811struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, 859struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
812 struct sock *addr_sk) 860 struct sock *addr_sk)
813{ 861{
814 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr); 862 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
815} 863}
816
817EXPORT_SYMBOL(tcp_v4_md5_lookup); 864EXPORT_SYMBOL(tcp_v4_md5_lookup);
818 865
819static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, 866static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
@@ -847,9 +894,9 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
847 kfree(newkey); 894 kfree(newkey);
848 return -ENOMEM; 895 return -ENOMEM;
849 } 896 }
850 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 897 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
851 } 898 }
852 if (tcp_alloc_md5sig_pool() == NULL) { 899 if (tcp_alloc_md5sig_pool(sk) == NULL) {
853 kfree(newkey); 900 kfree(newkey);
854 return -ENOMEM; 901 return -ENOMEM;
855 } 902 }
@@ -880,13 +927,12 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
880 } 927 }
881 return 0; 928 return 0;
882} 929}
883
884EXPORT_SYMBOL(tcp_v4_md5_do_add); 930EXPORT_SYMBOL(tcp_v4_md5_do_add);
885 931
886static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, 932static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
887 u8 *newkey, u8 newkeylen) 933 u8 *newkey, u8 newkeylen)
888{ 934{
889 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr, 935 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
890 newkey, newkeylen); 936 newkey, newkeylen);
891} 937}
892 938
@@ -918,7 +964,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
918 } 964 }
919 return -ENOENT; 965 return -ENOENT;
920} 966}
921
922EXPORT_SYMBOL(tcp_v4_md5_do_del); 967EXPORT_SYMBOL(tcp_v4_md5_do_del);
923 968
924static void tcp_v4_clear_md5_list(struct sock *sk) 969static void tcp_v4_clear_md5_list(struct sock *sk)
@@ -970,16 +1015,17 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
970 1015
971 if (!tcp_sk(sk)->md5sig_info) { 1016 if (!tcp_sk(sk)->md5sig_info) {
972 struct tcp_sock *tp = tcp_sk(sk); 1017 struct tcp_sock *tp = tcp_sk(sk);
973 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL); 1018 struct tcp_md5sig_info *p;
974 1019
1020 p = kzalloc(sizeof(*p), sk->sk_allocation);
975 if (!p) 1021 if (!p)
976 return -EINVAL; 1022 return -EINVAL;
977 1023
978 tp->md5sig_info = p; 1024 tp->md5sig_info = p;
979 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1025 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
980 } 1026 }
981 1027
982 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); 1028 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
983 if (!newkey) 1029 if (!newkey)
984 return -ENOMEM; 1030 return -ENOMEM;
985 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr, 1031 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
@@ -1051,8 +1097,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1051 __be32 saddr, daddr; 1097 __be32 saddr, daddr;
1052 1098
1053 if (sk) { 1099 if (sk) {
1054 saddr = inet_sk(sk)->saddr; 1100 saddr = inet_sk(sk)->inet_saddr;
1055 daddr = inet_sk(sk)->daddr; 1101 daddr = inet_sk(sk)->inet_daddr;
1056 } else if (req) { 1102 } else if (req) {
1057 saddr = inet_rsk(req)->loc_addr; 1103 saddr = inet_rsk(req)->loc_addr;
1058 daddr = inet_rsk(req)->rmt_addr; 1104 daddr = inet_rsk(req)->rmt_addr;
@@ -1090,7 +1136,6 @@ clear_hash_noput:
1090 memset(md5_hash, 0, 16); 1136 memset(md5_hash, 0, 16);
1091 return 1; 1137 return 1;
1092} 1138}
1093
1094EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1139EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1095 1140
1096static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) 1141static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
@@ -1151,33 +1196,32 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1151struct request_sock_ops tcp_request_sock_ops __read_mostly = { 1196struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1152 .family = PF_INET, 1197 .family = PF_INET,
1153 .obj_size = sizeof(struct tcp_request_sock), 1198 .obj_size = sizeof(struct tcp_request_sock),
1154 .rtx_syn_ack = tcp_v4_send_synack, 1199 .rtx_syn_ack = tcp_v4_rtx_synack,
1155 .send_ack = tcp_v4_reqsk_send_ack, 1200 .send_ack = tcp_v4_reqsk_send_ack,
1156 .destructor = tcp_v4_reqsk_destructor, 1201 .destructor = tcp_v4_reqsk_destructor,
1157 .send_reset = tcp_v4_send_reset, 1202 .send_reset = tcp_v4_send_reset,
1203 .syn_ack_timeout = tcp_syn_ack_timeout,
1158}; 1204};
1159 1205
1160#ifdef CONFIG_TCP_MD5SIG 1206#ifdef CONFIG_TCP_MD5SIG
1161static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { 1207static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1162 .md5_lookup = tcp_v4_reqsk_md5_lookup, 1208 .md5_lookup = tcp_v4_reqsk_md5_lookup,
1209 .calc_md5_hash = tcp_v4_md5_hash_skb,
1163}; 1210};
1164#endif 1211#endif
1165 1212
1166static struct timewait_sock_ops tcp_timewait_sock_ops = {
1167 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1168 .twsk_unique = tcp_twsk_unique,
1169 .twsk_destructor= tcp_twsk_destructor,
1170};
1171
1172int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1213int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1173{ 1214{
1174 struct inet_request_sock *ireq; 1215 struct tcp_extend_values tmp_ext;
1175 struct tcp_options_received tmp_opt; 1216 struct tcp_options_received tmp_opt;
1217 u8 *hash_location;
1176 struct request_sock *req; 1218 struct request_sock *req;
1219 struct inet_request_sock *ireq;
1220 struct tcp_sock *tp = tcp_sk(sk);
1221 struct dst_entry *dst = NULL;
1177 __be32 saddr = ip_hdr(skb)->saddr; 1222 __be32 saddr = ip_hdr(skb)->saddr;
1178 __be32 daddr = ip_hdr(skb)->daddr; 1223 __be32 daddr = ip_hdr(skb)->daddr;
1179 __u32 isn = TCP_SKB_CB(skb)->when; 1224 __u32 isn = TCP_SKB_CB(skb)->when;
1180 struct dst_entry *dst = NULL;
1181#ifdef CONFIG_SYN_COOKIES 1225#ifdef CONFIG_SYN_COOKIES
1182 int want_cookie = 0; 1226 int want_cookie = 0;
1183#else 1227#else
@@ -1193,6 +1237,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1193 * evidently real one. 1237 * evidently real one.
1194 */ 1238 */
1195 if (inet_csk_reqsk_queue_is_full(sk) && !isn) { 1239 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1240 if (net_ratelimit())
1241 syn_flood_warning(skb);
1196#ifdef CONFIG_SYN_COOKIES 1242#ifdef CONFIG_SYN_COOKIES
1197 if (sysctl_tcp_syncookies) { 1243 if (sysctl_tcp_syncookies) {
1198 want_cookie = 1; 1244 want_cookie = 1;
@@ -1218,16 +1264,50 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1218#endif 1264#endif
1219 1265
1220 tcp_clear_options(&tmp_opt); 1266 tcp_clear_options(&tmp_opt);
1221 tmp_opt.mss_clamp = 536; 1267 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1222 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; 1268 tmp_opt.user_mss = tp->rx_opt.user_mss;
1269 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1270
1271 if (tmp_opt.cookie_plus > 0 &&
1272 tmp_opt.saw_tstamp &&
1273 !tp->rx_opt.cookie_out_never &&
1274 (sysctl_tcp_cookie_size > 0 ||
1275 (tp->cookie_values != NULL &&
1276 tp->cookie_values->cookie_desired > 0))) {
1277 u8 *c;
1278 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1279 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1280
1281 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1282 goto drop_and_release;
1223 1283
1224 tcp_parse_options(skb, &tmp_opt, 0); 1284 /* Secret recipe starts with IP addresses */
1285 *mess++ ^= (__force u32)daddr;
1286 *mess++ ^= (__force u32)saddr;
1287
1288 /* plus variable length Initiator Cookie */
1289 c = (u8 *)mess;
1290 while (l-- > 0)
1291 *c++ ^= *hash_location++;
1292
1293#ifdef CONFIG_SYN_COOKIES
1294 want_cookie = 0; /* not our kind of cookie */
1295#endif
1296 tmp_ext.cookie_out_never = 0; /* false */
1297 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1298 } else if (!tp->rx_opt.cookie_in_always) {
1299 /* redundant indications, but ensure initialization. */
1300 tmp_ext.cookie_out_never = 1; /* true */
1301 tmp_ext.cookie_plus = 0;
1302 } else {
1303 goto drop_and_release;
1304 }
1305 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1225 1306
1226 if (want_cookie && !tmp_opt.saw_tstamp) 1307 if (want_cookie && !tmp_opt.saw_tstamp)
1227 tcp_clear_options(&tmp_opt); 1308 tcp_clear_options(&tmp_opt);
1228 1309
1229 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 1310 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1230
1231 tcp_openreq_init(req, &tmp_opt, skb); 1311 tcp_openreq_init(req, &tmp_opt, skb);
1232 1312
1233 ireq = inet_rsk(req); 1313 ireq = inet_rsk(req);
@@ -1239,15 +1319,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1239 if (security_inet_conn_request(sk, skb, req)) 1319 if (security_inet_conn_request(sk, skb, req))
1240 goto drop_and_free; 1320 goto drop_and_free;
1241 1321
1242 if (!want_cookie) 1322 if (!want_cookie || tmp_opt.tstamp_ok)
1243 TCP_ECN_create_request(req, tcp_hdr(skb)); 1323 TCP_ECN_create_request(req, tcp_hdr(skb));
1244 1324
1245 if (want_cookie) { 1325 if (want_cookie) {
1246#ifdef CONFIG_SYN_COOKIES
1247 syn_flood_warning(skb);
1248 req->cookie_ts = tmp_opt.tstamp_ok;
1249#endif
1250 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1326 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1327 req->cookie_ts = tmp_opt.tstamp_ok;
1251 } else if (!isn) { 1328 } else if (!isn) {
1252 struct inet_peer *peer = NULL; 1329 struct inet_peer *peer = NULL;
1253 1330
@@ -1264,8 +1341,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1264 tcp_death_row.sysctl_tw_recycle && 1341 tcp_death_row.sysctl_tw_recycle &&
1265 (dst = inet_csk_route_req(sk, req)) != NULL && 1342 (dst = inet_csk_route_req(sk, req)) != NULL &&
1266 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1343 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1267 peer->v4daddr == saddr) { 1344 peer->daddr.a4 == saddr) {
1268 if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && 1345 inet_peer_refcheck(peer);
1346 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1269 (s32)(peer->tcp_ts - req->ts_recent) > 1347 (s32)(peer->tcp_ts - req->ts_recent) >
1270 TCP_PAWS_WINDOW) { 1348 TCP_PAWS_WINDOW) {
1271 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1349 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1294,7 +1372,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1294 } 1372 }
1295 tcp_rsk(req)->snt_isn = isn; 1373 tcp_rsk(req)->snt_isn = isn;
1296 1374
1297 if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) 1375 if (tcp_v4_send_synack(sk, dst, req,
1376 (struct request_values *)&tmp_ext) ||
1377 want_cookie)
1298 goto drop_and_free; 1378 goto drop_and_free;
1299 1379
1300 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 1380 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
@@ -1307,6 +1387,7 @@ drop_and_free:
1307drop: 1387drop:
1308 return 0; 1388 return 0;
1309} 1389}
1390EXPORT_SYMBOL(tcp_v4_conn_request);
1310 1391
1311 1392
1312/* 1393/*
@@ -1333,7 +1414,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1333 1414
1334 newsk = tcp_create_openreq_child(sk, req, skb); 1415 newsk = tcp_create_openreq_child(sk, req, skb);
1335 if (!newsk) 1416 if (!newsk)
1336 goto exit; 1417 goto exit_nonewsk;
1337 1418
1338 newsk->sk_gso_type = SKB_GSO_TCPV4; 1419 newsk->sk_gso_type = SKB_GSO_TCPV4;
1339 sk_setup_caps(newsk, dst); 1420 sk_setup_caps(newsk, dst);
@@ -1341,9 +1422,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1341 newtp = tcp_sk(newsk); 1422 newtp = tcp_sk(newsk);
1342 newinet = inet_sk(newsk); 1423 newinet = inet_sk(newsk);
1343 ireq = inet_rsk(req); 1424 ireq = inet_rsk(req);
1344 newinet->daddr = ireq->rmt_addr; 1425 newinet->inet_daddr = ireq->rmt_addr;
1345 newinet->rcv_saddr = ireq->loc_addr; 1426 newinet->inet_rcv_saddr = ireq->loc_addr;
1346 newinet->saddr = ireq->loc_addr; 1427 newinet->inet_saddr = ireq->loc_addr;
1347 newinet->opt = ireq->opt; 1428 newinet->opt = ireq->opt;
1348 ireq->opt = NULL; 1429 ireq->opt = NULL;
1349 newinet->mc_index = inet_iif(skb); 1430 newinet->mc_index = inet_iif(skb);
@@ -1351,11 +1432,11 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1351 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1432 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1352 if (newinet->opt) 1433 if (newinet->opt)
1353 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; 1434 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1354 newinet->id = newtp->write_seq ^ jiffies; 1435 newinet->inet_id = newtp->write_seq ^ jiffies;
1355 1436
1356 tcp_mtup_init(newsk); 1437 tcp_mtup_init(newsk);
1357 tcp_sync_mss(newsk, dst_mtu(dst)); 1438 tcp_sync_mss(newsk, dst_mtu(dst));
1358 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1439 newtp->advmss = dst_metric_advmss(dst);
1359 if (tcp_sk(sk)->rx_opt.user_mss && 1440 if (tcp_sk(sk)->rx_opt.user_mss &&
1360 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) 1441 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1361 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1442 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
@@ -1364,7 +1445,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1364 1445
1365#ifdef CONFIG_TCP_MD5SIG 1446#ifdef CONFIG_TCP_MD5SIG
1366 /* Copy over the MD5 key from the original socket */ 1447 /* Copy over the MD5 key from the original socket */
1367 if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) { 1448 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1449 if (key != NULL) {
1368 /* 1450 /*
1369 * We're using one, so create a matching key 1451 * We're using one, so create a matching key
1370 * on the newsk structure. If we fail to get 1452 * on the newsk structure. If we fail to get
@@ -1373,24 +1455,29 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1373 */ 1455 */
1374 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); 1456 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1375 if (newkey != NULL) 1457 if (newkey != NULL)
1376 tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr, 1458 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1377 newkey, key->keylen); 1459 newkey, key->keylen);
1378 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1460 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1379 } 1461 }
1380#endif 1462#endif
1381 1463
1382 __inet_hash_nolisten(newsk); 1464 if (__inet_inherit_port(sk, newsk) < 0) {
1383 __inet_inherit_port(sk, newsk); 1465 sock_put(newsk);
1466 goto exit;
1467 }
1468 __inet_hash_nolisten(newsk, NULL);
1384 1469
1385 return newsk; 1470 return newsk;
1386 1471
1387exit_overflow: 1472exit_overflow:
1388 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1473 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1474exit_nonewsk:
1475 dst_release(dst);
1389exit: 1476exit:
1390 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1477 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1391 dst_release(dst);
1392 return NULL; 1478 return NULL;
1393} 1479}
1480EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1394 1481
1395static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) 1482static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1396{ 1483{
@@ -1417,7 +1504,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1417 } 1504 }
1418 1505
1419#ifdef CONFIG_SYN_COOKIES 1506#ifdef CONFIG_SYN_COOKIES
1420 if (!th->rst && !th->syn && th->ack) 1507 if (!th->syn)
1421 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); 1508 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1422#endif 1509#endif
1423 return sk; 1510 return sk;
@@ -1468,6 +1555,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1468#endif 1555#endif
1469 1556
1470 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1557 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1558 sock_rps_save_rxhash(sk, skb->rxhash);
1471 TCP_CHECK_TIMER(sk); 1559 TCP_CHECK_TIMER(sk);
1472 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1560 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1473 rsk = sk; 1561 rsk = sk;
@@ -1492,7 +1580,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1492 } 1580 }
1493 return 0; 1581 return 0;
1494 } 1582 }
1495 } 1583 } else
1584 sock_rps_save_rxhash(sk, skb->rxhash);
1585
1496 1586
1497 TCP_CHECK_TIMER(sk); 1587 TCP_CHECK_TIMER(sk);
1498 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1588 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
@@ -1517,6 +1607,7 @@ csum_err:
1517 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 1607 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1518 goto discard; 1608 goto discard;
1519} 1609}
1610EXPORT_SYMBOL(tcp_v4_do_rcv);
1520 1611
1521/* 1612/*
1522 * From tcp_input.c 1613 * From tcp_input.c
@@ -1571,6 +1662,11 @@ process:
1571 if (sk->sk_state == TCP_TIME_WAIT) 1662 if (sk->sk_state == TCP_TIME_WAIT)
1572 goto do_time_wait; 1663 goto do_time_wait;
1573 1664
1665 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1666 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1667 goto discard_and_relse;
1668 }
1669
1574 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) 1670 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1575 goto discard_and_relse; 1671 goto discard_and_relse;
1576 nf_reset(skb); 1672 nf_reset(skb);
@@ -1595,8 +1691,11 @@ process:
1595 if (!tcp_prequeue(sk, skb)) 1691 if (!tcp_prequeue(sk, skb))
1596 ret = tcp_v4_do_rcv(sk, skb); 1692 ret = tcp_v4_do_rcv(sk, skb);
1597 } 1693 }
1598 } else 1694 } else if (unlikely(sk_add_backlog(sk, skb))) {
1599 sk_add_backlog(sk, skb); 1695 bh_unlock_sock(sk);
1696 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1697 goto discard_and_relse;
1698 }
1600 bh_unlock_sock(sk); 1699 bh_unlock_sock(sk);
1601 1700
1602 sock_put(sk); 1701 sock_put(sk);
@@ -1658,71 +1757,48 @@ do_time_wait:
1658 goto discard_it; 1757 goto discard_it;
1659} 1758}
1660 1759
1661/* VJ's idea. Save last timestamp seen from this destination 1760struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1662 * and hold it at least for normal timewait interval to use for duplicate
1663 * segment detection in subsequent connections, before they enter synchronized
1664 * state.
1665 */
1666
1667int tcp_v4_remember_stamp(struct sock *sk)
1668{ 1761{
1762 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1669 struct inet_sock *inet = inet_sk(sk); 1763 struct inet_sock *inet = inet_sk(sk);
1670 struct tcp_sock *tp = tcp_sk(sk); 1764 struct inet_peer *peer;
1671 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1672 struct inet_peer *peer = NULL;
1673 int release_it = 0;
1674 1765
1675 if (!rt || rt->rt_dst != inet->daddr) { 1766 if (!rt || rt->rt_dst != inet->inet_daddr) {
1676 peer = inet_getpeer(inet->daddr, 1); 1767 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1677 release_it = 1; 1768 *release_it = true;
1678 } else { 1769 } else {
1679 if (!rt->peer) 1770 if (!rt->peer)
1680 rt_bind_peer(rt, 1); 1771 rt_bind_peer(rt, 1);
1681 peer = rt->peer; 1772 peer = rt->peer;
1773 *release_it = false;
1682 } 1774 }
1683 1775
1684 if (peer) { 1776 return peer;
1685 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1686 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1687 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1688 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1689 peer->tcp_ts = tp->rx_opt.ts_recent;
1690 }
1691 if (release_it)
1692 inet_putpeer(peer);
1693 return 1;
1694 }
1695
1696 return 0;
1697} 1777}
1778EXPORT_SYMBOL(tcp_v4_get_peer);
1698 1779
1699int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) 1780void *tcp_v4_tw_get_peer(struct sock *sk)
1700{ 1781{
1701 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); 1782 struct inet_timewait_sock *tw = inet_twsk(sk);
1702
1703 if (peer) {
1704 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1705
1706 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1707 (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1708 peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1709 peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1710 peer->tcp_ts = tcptw->tw_ts_recent;
1711 }
1712 inet_putpeer(peer);
1713 return 1;
1714 }
1715 1783
1716 return 0; 1784 return inet_getpeer_v4(tw->tw_daddr, 1);
1717} 1785}
1786EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1718 1787
1719struct inet_connection_sock_af_ops ipv4_specific = { 1788static struct timewait_sock_ops tcp_timewait_sock_ops = {
1789 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1790 .twsk_unique = tcp_twsk_unique,
1791 .twsk_destructor= tcp_twsk_destructor,
1792 .twsk_getpeer = tcp_v4_tw_get_peer,
1793};
1794
1795const struct inet_connection_sock_af_ops ipv4_specific = {
1720 .queue_xmit = ip_queue_xmit, 1796 .queue_xmit = ip_queue_xmit,
1721 .send_check = tcp_v4_send_check, 1797 .send_check = tcp_v4_send_check,
1722 .rebuild_header = inet_sk_rebuild_header, 1798 .rebuild_header = inet_sk_rebuild_header,
1723 .conn_request = tcp_v4_conn_request, 1799 .conn_request = tcp_v4_conn_request,
1724 .syn_recv_sock = tcp_v4_syn_recv_sock, 1800 .syn_recv_sock = tcp_v4_syn_recv_sock,
1725 .remember_stamp = tcp_v4_remember_stamp, 1801 .get_peer = tcp_v4_get_peer,
1726 .net_header_len = sizeof(struct iphdr), 1802 .net_header_len = sizeof(struct iphdr),
1727 .setsockopt = ip_setsockopt, 1803 .setsockopt = ip_setsockopt,
1728 .getsockopt = ip_getsockopt, 1804 .getsockopt = ip_getsockopt,
@@ -1734,9 +1810,10 @@ struct inet_connection_sock_af_ops ipv4_specific = {
1734 .compat_getsockopt = compat_ip_getsockopt, 1810 .compat_getsockopt = compat_ip_getsockopt,
1735#endif 1811#endif
1736}; 1812};
1813EXPORT_SYMBOL(ipv4_specific);
1737 1814
1738#ifdef CONFIG_TCP_MD5SIG 1815#ifdef CONFIG_TCP_MD5SIG
1739static struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1816static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1740 .md5_lookup = tcp_v4_md5_lookup, 1817 .md5_lookup = tcp_v4_md5_lookup,
1741 .calc_md5_hash = tcp_v4_md5_hash_skb, 1818 .calc_md5_hash = tcp_v4_md5_hash_skb,
1742 .md5_add = tcp_v4_md5_add_func, 1819 .md5_add = tcp_v4_md5_add_func,
@@ -1769,9 +1846,9 @@ static int tcp_v4_init_sock(struct sock *sk)
1769 /* See draft-stevens-tcpca-spec-01 for discussion of the 1846 /* See draft-stevens-tcpca-spec-01 for discussion of the
1770 * initialization of these values. 1847 * initialization of these values.
1771 */ 1848 */
1772 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 1849 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1773 tp->snd_cwnd_clamp = ~0; 1850 tp->snd_cwnd_clamp = ~0;
1774 tp->mss_cache = 536; 1851 tp->mss_cache = TCP_MSS_DEFAULT;
1775 1852
1776 tp->reordering = sysctl_tcp_reordering; 1853 tp->reordering = sysctl_tcp_reordering;
1777 icsk->icsk_ca_ops = &tcp_init_congestion_ops; 1854 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -1787,6 +1864,19 @@ static int tcp_v4_init_sock(struct sock *sk)
1787 tp->af_specific = &tcp_sock_ipv4_specific; 1864 tp->af_specific = &tcp_sock_ipv4_specific;
1788#endif 1865#endif
1789 1866
1867 /* TCP Cookie Transactions */
1868 if (sysctl_tcp_cookie_size > 0) {
1869 /* Default, cookies without s_data_payload. */
1870 tp->cookie_values =
1871 kzalloc(sizeof(*tp->cookie_values),
1872 sk->sk_allocation);
1873 if (tp->cookie_values != NULL)
1874 kref_init(&tp->cookie_values->kref);
1875 }
1876 /* Presumed zeroed, in order of appearance:
1877 * cookie_in_always, cookie_out_never,
1878 * s_data_constant, s_data_in, s_data_out
1879 */
1790 sk->sk_sndbuf = sysctl_tcp_wmem[1]; 1880 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1791 sk->sk_rcvbuf = sysctl_tcp_rmem[1]; 1881 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1792 1882
@@ -1840,9 +1930,15 @@ void tcp_v4_destroy_sock(struct sock *sk)
1840 sk->sk_sndmsg_page = NULL; 1930 sk->sk_sndmsg_page = NULL;
1841 } 1931 }
1842 1932
1933 /* TCP Cookie Transactions */
1934 if (tp->cookie_values != NULL) {
1935 kref_put(&tp->cookie_values->kref,
1936 tcp_cookie_values_release);
1937 tp->cookie_values = NULL;
1938 }
1939
1843 percpu_counter_dec(&tcp_sockets_allocated); 1940 percpu_counter_dec(&tcp_sockets_allocated);
1844} 1941}
1845
1846EXPORT_SYMBOL(tcp_v4_destroy_sock); 1942EXPORT_SYMBOL(tcp_v4_destroy_sock);
1847 1943
1848#ifdef CONFIG_PROC_FS 1944#ifdef CONFIG_PROC_FS
@@ -1860,6 +1956,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1860 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; 1956 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1861} 1957}
1862 1958
1959/*
1960 * Get next listener socket follow cur. If cur is NULL, get first socket
1961 * starting from bucket given in st->bucket; when st->bucket is zero the
1962 * very first socket in the hash table is returned.
1963 */
1863static void *listening_get_next(struct seq_file *seq, void *cur) 1964static void *listening_get_next(struct seq_file *seq, void *cur)
1864{ 1965{
1865 struct inet_connection_sock *icsk; 1966 struct inet_connection_sock *icsk;
@@ -1870,14 +1971,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
1870 struct net *net = seq_file_net(seq); 1971 struct net *net = seq_file_net(seq);
1871 1972
1872 if (!sk) { 1973 if (!sk) {
1873 st->bucket = 0; 1974 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1874 ilb = &tcp_hashinfo.listening_hash[0];
1875 spin_lock_bh(&ilb->lock); 1975 spin_lock_bh(&ilb->lock);
1876 sk = sk_nulls_head(&ilb->head); 1976 sk = sk_nulls_head(&ilb->head);
1977 st->offset = 0;
1877 goto get_sk; 1978 goto get_sk;
1878 } 1979 }
1879 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 1980 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1880 ++st->num; 1981 ++st->num;
1982 ++st->offset;
1881 1983
1882 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1984 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1883 struct request_sock *req = cur; 1985 struct request_sock *req = cur;
@@ -1892,12 +1994,13 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
1892 } 1994 }
1893 req = req->dl_next; 1995 req = req->dl_next;
1894 } 1996 }
1997 st->offset = 0;
1895 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) 1998 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1896 break; 1999 break;
1897get_req: 2000get_req:
1898 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; 2001 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1899 } 2002 }
1900 sk = sk_next(st->syn_wait_sk); 2003 sk = sk_nulls_next(st->syn_wait_sk);
1901 st->state = TCP_SEQ_STATE_LISTENING; 2004 st->state = TCP_SEQ_STATE_LISTENING;
1902 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2005 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1903 } else { 2006 } else {
@@ -1906,11 +2009,13 @@ get_req:
1906 if (reqsk_queue_len(&icsk->icsk_accept_queue)) 2009 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1907 goto start_req; 2010 goto start_req;
1908 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2011 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1909 sk = sk_next(sk); 2012 sk = sk_nulls_next(sk);
1910 } 2013 }
1911get_sk: 2014get_sk:
1912 sk_nulls_for_each_from(sk, node) { 2015 sk_nulls_for_each_from(sk, node) {
1913 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { 2016 if (!net_eq(sock_net(sk), net))
2017 continue;
2018 if (sk->sk_family == st->family) {
1914 cur = sk; 2019 cur = sk;
1915 goto out; 2020 goto out;
1916 } 2021 }
@@ -1927,6 +2032,7 @@ start_req:
1927 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2032 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1928 } 2033 }
1929 spin_unlock_bh(&ilb->lock); 2034 spin_unlock_bh(&ilb->lock);
2035 st->offset = 0;
1930 if (++st->bucket < INET_LHTABLE_SIZE) { 2036 if (++st->bucket < INET_LHTABLE_SIZE) {
1931 ilb = &tcp_hashinfo.listening_hash[st->bucket]; 2037 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1932 spin_lock_bh(&ilb->lock); 2038 spin_lock_bh(&ilb->lock);
@@ -1940,7 +2046,12 @@ out:
1940 2046
1941static void *listening_get_idx(struct seq_file *seq, loff_t *pos) 2047static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1942{ 2048{
1943 void *rc = listening_get_next(seq, NULL); 2049 struct tcp_iter_state *st = seq->private;
2050 void *rc;
2051
2052 st->bucket = 0;
2053 st->offset = 0;
2054 rc = listening_get_next(seq, NULL);
1944 2055
1945 while (rc && *pos) { 2056 while (rc && *pos) {
1946 rc = listening_get_next(seq, rc); 2057 rc = listening_get_next(seq, rc);
@@ -1955,13 +2066,18 @@ static inline int empty_bucket(struct tcp_iter_state *st)
1955 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 2066 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1956} 2067}
1957 2068
2069/*
2070 * Get first established socket starting from bucket given in st->bucket.
2071 * If st->bucket is zero, the very first socket in the hash is returned.
2072 */
1958static void *established_get_first(struct seq_file *seq) 2073static void *established_get_first(struct seq_file *seq)
1959{ 2074{
1960 struct tcp_iter_state *st = seq->private; 2075 struct tcp_iter_state *st = seq->private;
1961 struct net *net = seq_file_net(seq); 2076 struct net *net = seq_file_net(seq);
1962 void *rc = NULL; 2077 void *rc = NULL;
1963 2078
1964 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { 2079 st->offset = 0;
2080 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1965 struct sock *sk; 2081 struct sock *sk;
1966 struct hlist_nulls_node *node; 2082 struct hlist_nulls_node *node;
1967 struct inet_timewait_sock *tw; 2083 struct inet_timewait_sock *tw;
@@ -2006,6 +2122,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
2006 struct net *net = seq_file_net(seq); 2122 struct net *net = seq_file_net(seq);
2007 2123
2008 ++st->num; 2124 ++st->num;
2125 ++st->offset;
2009 2126
2010 if (st->state == TCP_SEQ_STATE_TIME_WAIT) { 2127 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2011 tw = cur; 2128 tw = cur;
@@ -2022,10 +2139,11 @@ get_tw:
2022 st->state = TCP_SEQ_STATE_ESTABLISHED; 2139 st->state = TCP_SEQ_STATE_ESTABLISHED;
2023 2140
2024 /* Look for next non empty bucket */ 2141 /* Look for next non empty bucket */
2025 while (++st->bucket < tcp_hashinfo.ehash_size && 2142 st->offset = 0;
2143 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2026 empty_bucket(st)) 2144 empty_bucket(st))
2027 ; 2145 ;
2028 if (st->bucket >= tcp_hashinfo.ehash_size) 2146 if (st->bucket > tcp_hashinfo.ehash_mask)
2029 return NULL; 2147 return NULL;
2030 2148
2031 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2149 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
@@ -2049,7 +2167,11 @@ out:
2049 2167
2050static void *established_get_idx(struct seq_file *seq, loff_t pos) 2168static void *established_get_idx(struct seq_file *seq, loff_t pos)
2051{ 2169{
2052 void *rc = established_get_first(seq); 2170 struct tcp_iter_state *st = seq->private;
2171 void *rc;
2172
2173 st->bucket = 0;
2174 rc = established_get_first(seq);
2053 2175
2054 while (rc && pos) { 2176 while (rc && pos) {
2055 rc = established_get_next(seq, rc); 2177 rc = established_get_next(seq, rc);
@@ -2074,24 +2196,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2074 return rc; 2196 return rc;
2075} 2197}
2076 2198
2199static void *tcp_seek_last_pos(struct seq_file *seq)
2200{
2201 struct tcp_iter_state *st = seq->private;
2202 int offset = st->offset;
2203 int orig_num = st->num;
2204 void *rc = NULL;
2205
2206 switch (st->state) {
2207 case TCP_SEQ_STATE_OPENREQ:
2208 case TCP_SEQ_STATE_LISTENING:
2209 if (st->bucket >= INET_LHTABLE_SIZE)
2210 break;
2211 st->state = TCP_SEQ_STATE_LISTENING;
2212 rc = listening_get_next(seq, NULL);
2213 while (offset-- && rc)
2214 rc = listening_get_next(seq, rc);
2215 if (rc)
2216 break;
2217 st->bucket = 0;
2218 /* Fallthrough */
2219 case TCP_SEQ_STATE_ESTABLISHED:
2220 case TCP_SEQ_STATE_TIME_WAIT:
2221 st->state = TCP_SEQ_STATE_ESTABLISHED;
2222 if (st->bucket > tcp_hashinfo.ehash_mask)
2223 break;
2224 rc = established_get_first(seq);
2225 while (offset-- && rc)
2226 rc = established_get_next(seq, rc);
2227 }
2228
2229 st->num = orig_num;
2230
2231 return rc;
2232}
2233
2077static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) 2234static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2078{ 2235{
2079 struct tcp_iter_state *st = seq->private; 2236 struct tcp_iter_state *st = seq->private;
2237 void *rc;
2238
2239 if (*pos && *pos == st->last_pos) {
2240 rc = tcp_seek_last_pos(seq);
2241 if (rc)
2242 goto out;
2243 }
2244
2080 st->state = TCP_SEQ_STATE_LISTENING; 2245 st->state = TCP_SEQ_STATE_LISTENING;
2081 st->num = 0; 2246 st->num = 0;
2082 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 2247 st->bucket = 0;
2248 st->offset = 0;
2249 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2250
2251out:
2252 st->last_pos = *pos;
2253 return rc;
2083} 2254}
2084 2255
2085static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2256static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2086{ 2257{
2258 struct tcp_iter_state *st = seq->private;
2087 void *rc = NULL; 2259 void *rc = NULL;
2088 struct tcp_iter_state *st;
2089 2260
2090 if (v == SEQ_START_TOKEN) { 2261 if (v == SEQ_START_TOKEN) {
2091 rc = tcp_get_idx(seq, 0); 2262 rc = tcp_get_idx(seq, 0);
2092 goto out; 2263 goto out;
2093 } 2264 }
2094 st = seq->private;
2095 2265
2096 switch (st->state) { 2266 switch (st->state) {
2097 case TCP_SEQ_STATE_OPENREQ: 2267 case TCP_SEQ_STATE_OPENREQ:
@@ -2099,6 +2269,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2099 rc = listening_get_next(seq, v); 2269 rc = listening_get_next(seq, v);
2100 if (!rc) { 2270 if (!rc) {
2101 st->state = TCP_SEQ_STATE_ESTABLISHED; 2271 st->state = TCP_SEQ_STATE_ESTABLISHED;
2272 st->bucket = 0;
2273 st->offset = 0;
2102 rc = established_get_first(seq); 2274 rc = established_get_first(seq);
2103 } 2275 }
2104 break; 2276 break;
@@ -2109,6 +2281,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2109 } 2281 }
2110out: 2282out:
2111 ++*pos; 2283 ++*pos;
2284 st->last_pos = *pos;
2112 return rc; 2285 return rc;
2113} 2286}
2114 2287
@@ -2147,6 +2320,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file)
2147 2320
2148 s = ((struct seq_file *)file->private_data)->private; 2321 s = ((struct seq_file *)file->private_data)->private;
2149 s->family = afinfo->family; 2322 s->family = afinfo->family;
2323 s->last_pos = 0;
2150 return 0; 2324 return 0;
2151} 2325}
2152 2326
@@ -2170,11 +2344,13 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2170 rc = -ENOMEM; 2344 rc = -ENOMEM;
2171 return rc; 2345 return rc;
2172} 2346}
2347EXPORT_SYMBOL(tcp_proc_register);
2173 2348
2174void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) 2349void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2175{ 2350{
2176 proc_net_remove(net, afinfo->name); 2351 proc_net_remove(net, afinfo->name);
2177} 2352}
2353EXPORT_SYMBOL(tcp_proc_unregister);
2178 2354
2179static void get_openreq4(struct sock *sk, struct request_sock *req, 2355static void get_openreq4(struct sock *sk, struct request_sock *req,
2180 struct seq_file *f, int i, int uid, int *len) 2356 struct seq_file *f, int i, int uid, int *len)
@@ -2186,7 +2362,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
2186 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", 2362 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2187 i, 2363 i,
2188 ireq->loc_addr, 2364 ireq->loc_addr,
2189 ntohs(inet_sk(sk)->sport), 2365 ntohs(inet_sk(sk)->inet_sport),
2190 ireq->rmt_addr, 2366 ireq->rmt_addr,
2191 ntohs(ireq->rmt_port), 2367 ntohs(ireq->rmt_port),
2192 TCP_SYN_RECV, 2368 TCP_SYN_RECV,
@@ -2209,10 +2385,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2209 struct tcp_sock *tp = tcp_sk(sk); 2385 struct tcp_sock *tp = tcp_sk(sk);
2210 const struct inet_connection_sock *icsk = inet_csk(sk); 2386 const struct inet_connection_sock *icsk = inet_csk(sk);
2211 struct inet_sock *inet = inet_sk(sk); 2387 struct inet_sock *inet = inet_sk(sk);
2212 __be32 dest = inet->daddr; 2388 __be32 dest = inet->inet_daddr;
2213 __be32 src = inet->rcv_saddr; 2389 __be32 src = inet->inet_rcv_saddr;
2214 __u16 destp = ntohs(inet->dport); 2390 __u16 destp = ntohs(inet->inet_dport);
2215 __u16 srcp = ntohs(inet->sport); 2391 __u16 srcp = ntohs(inet->inet_sport);
2392 int rx_queue;
2216 2393
2217 if (icsk->icsk_pending == ICSK_TIME_RETRANS) { 2394 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2218 timer_active = 1; 2395 timer_active = 1;
@@ -2228,12 +2405,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2228 timer_expires = jiffies; 2405 timer_expires = jiffies;
2229 } 2406 }
2230 2407
2408 if (sk->sk_state == TCP_LISTEN)
2409 rx_queue = sk->sk_ack_backlog;
2410 else
2411 /*
2412 * because we dont lock socket, we might find a transient negative value
2413 */
2414 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2415
2231 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2416 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2232 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", 2417 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2233 i, src, srcp, dest, destp, sk->sk_state, 2418 i, src, srcp, dest, destp, sk->sk_state,
2234 tp->write_seq - tp->snd_una, 2419 tp->write_seq - tp->snd_una,
2235 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : 2420 rx_queue,
2236 (tp->rcv_nxt - tp->copied_seq),
2237 timer_active, 2421 timer_active,
2238 jiffies_to_clock_t(timer_expires - jiffies), 2422 jiffies_to_clock_t(timer_expires - jiffies),
2239 icsk->icsk_retransmits, 2423 icsk->icsk_retransmits,
@@ -2245,7 +2429,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2245 jiffies_to_clock_t(icsk->icsk_ack.ato), 2429 jiffies_to_clock_t(icsk->icsk_ack.ato),
2246 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2430 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2247 tp->snd_cwnd, 2431 tp->snd_cwnd,
2248 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh, 2432 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2249 len); 2433 len);
2250} 2434}
2251 2435
@@ -2315,12 +2499,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2315 }, 2499 },
2316}; 2500};
2317 2501
2318static int tcp4_proc_init_net(struct net *net) 2502static int __net_init tcp4_proc_init_net(struct net *net)
2319{ 2503{
2320 return tcp_proc_register(net, &tcp4_seq_afinfo); 2504 return tcp_proc_register(net, &tcp4_seq_afinfo);
2321} 2505}
2322 2506
2323static void tcp4_proc_exit_net(struct net *net) 2507static void __net_exit tcp4_proc_exit_net(struct net *net)
2324{ 2508{
2325 tcp_proc_unregister(net, &tcp4_seq_afinfo); 2509 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2326} 2510}
@@ -2361,7 +2545,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2361 2545
2362 return tcp_gro_receive(head, skb); 2546 return tcp_gro_receive(head, skb);
2363} 2547}
2364EXPORT_SYMBOL(tcp4_gro_receive);
2365 2548
2366int tcp4_gro_complete(struct sk_buff *skb) 2549int tcp4_gro_complete(struct sk_buff *skb)
2367{ 2550{
@@ -2374,7 +2557,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
2374 2557
2375 return tcp_gro_complete(skb); 2558 return tcp_gro_complete(skb);
2376} 2559}
2377EXPORT_SYMBOL(tcp4_gro_complete);
2378 2560
2379struct proto tcp_prot = { 2561struct proto tcp_prot = {
2380 .name = "TCP", 2562 .name = "TCP",
@@ -2390,6 +2572,8 @@ struct proto tcp_prot = {
2390 .setsockopt = tcp_setsockopt, 2572 .setsockopt = tcp_setsockopt,
2391 .getsockopt = tcp_getsockopt, 2573 .getsockopt = tcp_getsockopt,
2392 .recvmsg = tcp_recvmsg, 2574 .recvmsg = tcp_recvmsg,
2575 .sendmsg = tcp_sendmsg,
2576 .sendpage = tcp_sendpage,
2393 .backlog_rcv = tcp_v4_do_rcv, 2577 .backlog_rcv = tcp_v4_do_rcv,
2394 .hash = inet_hash, 2578 .hash = inet_hash,
2395 .unhash = inet_unhash, 2579 .unhash = inet_unhash,
@@ -2408,11 +2592,13 @@ struct proto tcp_prot = {
2408 .twsk_prot = &tcp_timewait_sock_ops, 2592 .twsk_prot = &tcp_timewait_sock_ops,
2409 .rsk_prot = &tcp_request_sock_ops, 2593 .rsk_prot = &tcp_request_sock_ops,
2410 .h.hashinfo = &tcp_hashinfo, 2594 .h.hashinfo = &tcp_hashinfo,
2595 .no_autobind = true,
2411#ifdef CONFIG_COMPAT 2596#ifdef CONFIG_COMPAT
2412 .compat_setsockopt = compat_tcp_setsockopt, 2597 .compat_setsockopt = compat_tcp_setsockopt,
2413 .compat_getsockopt = compat_tcp_getsockopt, 2598 .compat_getsockopt = compat_tcp_getsockopt,
2414#endif 2599#endif
2415}; 2600};
2601EXPORT_SYMBOL(tcp_prot);
2416 2602
2417 2603
2418static int __net_init tcp_sk_init(struct net *net) 2604static int __net_init tcp_sk_init(struct net *net)
@@ -2424,12 +2610,17 @@ static int __net_init tcp_sk_init(struct net *net)
2424static void __net_exit tcp_sk_exit(struct net *net) 2610static void __net_exit tcp_sk_exit(struct net *net)
2425{ 2611{
2426 inet_ctl_sock_destroy(net->ipv4.tcp_sock); 2612 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2427 inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); 2613}
2614
2615static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2616{
2617 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2428} 2618}
2429 2619
2430static struct pernet_operations __net_initdata tcp_sk_ops = { 2620static struct pernet_operations __net_initdata tcp_sk_ops = {
2431 .init = tcp_sk_init, 2621 .init = tcp_sk_init,
2432 .exit = tcp_sk_exit, 2622 .exit = tcp_sk_exit,
2623 .exit_batch = tcp_sk_exit_batch,
2433}; 2624};
2434 2625
2435void __init tcp_v4_init(void) 2626void __init tcp_v4_init(void)
@@ -2438,20 +2629,3 @@ void __init tcp_v4_init(void)
2438 if (register_pernet_subsys(&tcp_sk_ops)) 2629 if (register_pernet_subsys(&tcp_sk_ops))
2439 panic("Failed to create the TCP control socket.\n"); 2630 panic("Failed to create the TCP control socket.\n");
2440} 2631}
2441
2442EXPORT_SYMBOL(ipv4_specific);
2443EXPORT_SYMBOL(tcp_hashinfo);
2444EXPORT_SYMBOL(tcp_prot);
2445EXPORT_SYMBOL(tcp_v4_conn_request);
2446EXPORT_SYMBOL(tcp_v4_connect);
2447EXPORT_SYMBOL(tcp_v4_do_rcv);
2448EXPORT_SYMBOL(tcp_v4_remember_stamp);
2449EXPORT_SYMBOL(tcp_v4_send_check);
2450EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2451
2452#ifdef CONFIG_PROC_FS
2453EXPORT_SYMBOL(tcp_proc_register);
2454EXPORT_SYMBOL(tcp_proc_unregister);
2455#endif
2456EXPORT_SYMBOL(sysctl_tcp_low_latency);
2457
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ce3c41ff50b2..de870377fbba 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
143 goto out; 143 goto out;
144 144
145 /* we can't calc remote HZ with no different!! */ 145 /* we can't calc remote HZ with no different!! */
146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time 146 if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
147 || tp->rx_opt.rcv_tsecr == lp->local_ref_time) 147 tp->rx_opt.rcv_tsecr == lp->local_ref_time)
148 goto out; 148 goto out;
149 149
150 m = HZ * (tp->rx_opt.rcv_tsval - 150 m = HZ * (tp->rx_opt.rcv_tsval -
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 43bbba7926ee..80b1f80759ab 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,19 +20,14 @@
20 20
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/slab.h>
23#include <linux/sysctl.h> 24#include <linux/sysctl.h>
24#include <linux/workqueue.h> 25#include <linux/workqueue.h>
25#include <net/tcp.h> 26#include <net/tcp.h>
26#include <net/inet_common.h> 27#include <net/inet_common.h>
27#include <net/xfrm.h> 28#include <net/xfrm.h>
28 29
29#ifdef CONFIG_SYSCTL 30int sysctl_tcp_syncookies __read_mostly = 1;
30#define SYNC_INIT 0 /* let the user enable it */
31#else
32#define SYNC_INIT 1
33#endif
34
35int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
36EXPORT_SYMBOL(sysctl_tcp_syncookies); 31EXPORT_SYMBOL(sysctl_tcp_syncookies);
37 32
38int sysctl_tcp_abort_on_overflow __read_mostly; 33int sysctl_tcp_abort_on_overflow __read_mostly;
@@ -52,16 +47,65 @@ struct inet_timewait_death_row tcp_death_row = {
52 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, 47 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
53 (unsigned long)&tcp_death_row), 48 (unsigned long)&tcp_death_row),
54}; 49};
55
56EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
57 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static int tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return 1;
76 }
77
78 return 0;
79}
80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return 1;
98 }
99 return 0;
100}
101
58static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
59{ 103{
60 if (seq == s_win) 104 if (seq == s_win)
61 return 1; 105 return 1;
62 if (after(end_seq, s_win) && before(seq, e_win)) 106 if (after(end_seq, s_win) && before(seq, e_win))
63 return 1; 107 return 1;
64 return (seq == e_win && seq == end_seq); 108 return seq == e_win && seq == end_seq;
65} 109}
66 110
67/* 111/*
@@ -96,13 +140,14 @@ enum tcp_tw_status
96tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 140tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
97 const struct tcphdr *th) 141 const struct tcphdr *th)
98{ 142{
99 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
100 struct tcp_options_received tmp_opt; 143 struct tcp_options_received tmp_opt;
144 u8 *hash_location;
145 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
101 int paws_reject = 0; 146 int paws_reject = 0;
102 147
103 tmp_opt.saw_tstamp = 0; 148 tmp_opt.saw_tstamp = 0;
104 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 149 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
105 tcp_parse_options(skb, &tmp_opt, 0); 150 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
106 151
107 if (tmp_opt.saw_tstamp) { 152 if (tmp_opt.saw_tstamp) {
108 tmp_opt.ts_recent = tcptw->tw_ts_recent; 153 tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -128,7 +173,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
128 goto kill_with_rst; 173 goto kill_with_rst;
129 174
130 /* Dup ACK? */ 175 /* Dup ACK? */
131 if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || 176 if (!th->ack ||
177 !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
132 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { 178 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
133 inet_twsk_put(tw); 179 inet_twsk_put(tw);
134 return TCP_TW_SUCCESS; 180 return TCP_TW_SUCCESS;
@@ -153,14 +199,9 @@ kill_with_rst:
153 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 199 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
154 } 200 }
155 201
156 /* I am shamed, but failed to make it more elegant. 202 if (tcp_death_row.sysctl_tw_recycle &&
157 * Yes, it is direct reference to IP, which is impossible 203 tcptw->tw_ts_recent_stamp &&
158 * to generalize to IPv6. Taking into account that IPv6 204 tcp_tw_remember_stamp(tw))
159 * do not understand recycling in any case, it not
160 * a big problem in practice. --ANK */
161 if (tw->tw_family == AF_INET &&
162 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
163 tcp_v4_tw_remember_stamp(tw))
164 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, 205 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
165 TCP_TIMEWAIT_LEN); 206 TCP_TIMEWAIT_LEN);
166 else 207 else
@@ -265,6 +306,7 @@ kill:
265 inet_twsk_put(tw); 306 inet_twsk_put(tw);
266 return TCP_TW_SUCCESS; 307 return TCP_TW_SUCCESS;
267} 308}
309EXPORT_SYMBOL(tcp_timewait_state_process);
268 310
269/* 311/*
270 * Move a socket to time-wait or dead fin-wait-2 state. 312 * Move a socket to time-wait or dead fin-wait-2 state.
@@ -277,7 +319,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
277 int recycle_ok = 0; 319 int recycle_ok = 0;
278 320
279 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
280 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
281 323
282 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 324 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
283 tw = inet_twsk_alloc(sk, state); 325 tw = inet_twsk_alloc(sk, state);
@@ -321,7 +363,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
321 if (key != NULL) { 363 if (key != NULL) {
322 memcpy(&tcptw->tw_md5_key, key->key, key->keylen); 364 memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
323 tcptw->tw_md5_keylen = key->keylen; 365 tcptw->tw_md5_keylen = key->keylen;
324 if (tcp_alloc_md5sig_pool() == NULL) 366 if (tcp_alloc_md5sig_pool(sk) == NULL)
325 BUG(); 367 BUG();
326 } 368 }
327 } while (0); 369 } while (0);
@@ -350,7 +392,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
350 * socket up. We've got bigger problems than 392 * socket up. We've got bigger problems than
351 * non-graceful socket closings. 393 * non-graceful socket closings.
352 */ 394 */
353 LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); 395 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
354 } 396 }
355 397
356 tcp_update_metrics(sk); 398 tcp_update_metrics(sk);
@@ -362,10 +404,9 @@ void tcp_twsk_destructor(struct sock *sk)
362#ifdef CONFIG_TCP_MD5SIG 404#ifdef CONFIG_TCP_MD5SIG
363 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 405 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
364 if (twsk->tw_md5_keylen) 406 if (twsk->tw_md5_keylen)
365 tcp_put_md5sig_pool(); 407 tcp_free_md5sig_pool();
366#endif 408#endif
367} 409}
368
369EXPORT_SYMBOL_GPL(tcp_twsk_destructor); 410EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
370 411
371static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, 412static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
@@ -388,14 +429,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
388 const struct inet_request_sock *ireq = inet_rsk(req); 429 const struct inet_request_sock *ireq = inet_rsk(req);
389 struct tcp_request_sock *treq = tcp_rsk(req); 430 struct tcp_request_sock *treq = tcp_rsk(req);
390 struct inet_connection_sock *newicsk = inet_csk(newsk); 431 struct inet_connection_sock *newicsk = inet_csk(newsk);
391 struct tcp_sock *newtp; 432 struct tcp_sock *newtp = tcp_sk(newsk);
433 struct tcp_sock *oldtp = tcp_sk(sk);
434 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
435
436 /* TCP Cookie Transactions require space for the cookie pair,
437 * as it differs for each connection. There is no need to
438 * copy any s_data_payload stored at the original socket.
439 * Failure will prevent resuming the connection.
440 *
441 * Presumed copied, in order of appearance:
442 * cookie_in_always, cookie_out_never
443 */
444 if (oldcvp != NULL) {
445 struct tcp_cookie_values *newcvp =
446 kzalloc(sizeof(*newtp->cookie_values),
447 GFP_ATOMIC);
448
449 if (newcvp != NULL) {
450 kref_init(&newcvp->kref);
451 newcvp->cookie_desired =
452 oldcvp->cookie_desired;
453 newtp->cookie_values = newcvp;
454 } else {
455 /* Not Yet Implemented */
456 newtp->cookie_values = NULL;
457 }
458 }
392 459
393 /* Now setup tcp_sock */ 460 /* Now setup tcp_sock */
394 newtp = tcp_sk(newsk);
395 newtp->pred_flags = 0; 461 newtp->pred_flags = 0;
396 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; 462
397 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; 463 newtp->rcv_wup = newtp->copied_seq =
398 newtp->snd_up = treq->snt_isn + 1; 464 newtp->rcv_nxt = treq->rcv_isn + 1;
465
466 newtp->snd_sml = newtp->snd_una =
467 newtp->snd_nxt = newtp->snd_up =
468 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
399 469
400 tcp_prequeue_init(newtp); 470 tcp_prequeue_init(newtp);
401 471
@@ -409,7 +479,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
409 newtp->retrans_out = 0; 479 newtp->retrans_out = 0;
410 newtp->sacked_out = 0; 480 newtp->sacked_out = 0;
411 newtp->fackets_out = 0; 481 newtp->fackets_out = 0;
412 newtp->snd_ssthresh = 0x7fffffff; 482 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
413 483
414 /* So many TCP implementations out there (incorrectly) count the 484 /* So many TCP implementations out there (incorrectly) count the
415 * initial SYN frame in their delayed-ACK and congestion control 485 * initial SYN frame in their delayed-ACK and congestion control
@@ -428,8 +498,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
428 tcp_set_ca_state(newsk, TCP_CA_Open); 498 tcp_set_ca_state(newsk, TCP_CA_Open);
429 tcp_init_xmit_timers(newsk); 499 tcp_init_xmit_timers(newsk);
430 skb_queue_head_init(&newtp->out_of_order_queue); 500 skb_queue_head_init(&newtp->out_of_order_queue);
431 newtp->write_seq = treq->snt_isn + 1; 501 newtp->write_seq = newtp->pushed_seq =
432 newtp->pushed_seq = newtp->write_seq; 502 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
433 503
434 newtp->rx_opt.saw_tstamp = 0; 504 newtp->rx_opt.saw_tstamp = 0;
435 505
@@ -475,7 +545,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
475 if (newtp->af_specific->md5_lookup(sk, newsk)) 545 if (newtp->af_specific->md5_lookup(sk, newsk))
476 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; 546 newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
477#endif 547#endif
478 if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) 548 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
479 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 549 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
480 newtp->rx_opt.mss_clamp = req->mss; 550 newtp->rx_opt.mss_clamp = req->mss;
481 TCP_ECN_openreq_child(newtp, req); 551 TCP_ECN_openreq_child(newtp, req);
@@ -484,6 +554,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
484 } 554 }
485 return newsk; 555 return newsk;
486} 556}
557EXPORT_SYMBOL(tcp_create_openreq_child);
487 558
488/* 559/*
489 * Process an incoming packet for SYN_RECV sockets represented 560 * Process an incoming packet for SYN_RECV sockets represented
@@ -494,15 +565,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
494 struct request_sock *req, 565 struct request_sock *req,
495 struct request_sock **prev) 566 struct request_sock **prev)
496{ 567{
568 struct tcp_options_received tmp_opt;
569 u8 *hash_location;
570 struct sock *child;
497 const struct tcphdr *th = tcp_hdr(skb); 571 const struct tcphdr *th = tcp_hdr(skb);
498 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 572 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
499 int paws_reject = 0; 573 int paws_reject = 0;
500 struct tcp_options_received tmp_opt;
501 struct sock *child;
502 574
503 tmp_opt.saw_tstamp = 0; 575 tmp_opt.saw_tstamp = 0;
504 if (th->doff > (sizeof(struct tcphdr)>>2)) { 576 if (th->doff > (sizeof(struct tcphdr)>>2)) {
505 tcp_parse_options(skb, &tmp_opt, 0); 577 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
506 578
507 if (tmp_opt.saw_tstamp) { 579 if (tmp_opt.saw_tstamp) {
508 tmp_opt.ts_recent = req->ts_recent; 580 tmp_opt.ts_recent = req->ts_recent;
@@ -536,7 +608,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
536 * Enforce "SYN-ACK" according to figure 8, figure 6 608 * Enforce "SYN-ACK" according to figure 8, figure 6
537 * of RFC793, fixed by RFC1122. 609 * of RFC793, fixed by RFC1122.
538 */ 610 */
539 req->rsk_ops->rtx_syn_ack(sk, req); 611 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
540 return NULL; 612 return NULL;
541 } 613 }
542 614
@@ -595,7 +667,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
595 * Invalid ACK: reset will be sent by listening socket 667 * Invalid ACK: reset will be sent by listening socket
596 */ 668 */
597 if ((flg & TCP_FLAG_ACK) && 669 if ((flg & TCP_FLAG_ACK) &&
598 (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) 670 (TCP_SKB_CB(skb)->ack_seq !=
671 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
599 return sk; 672 return sk;
600 673
601 /* Also, it would be not so bad idea to check rcv_tsecr, which 674 /* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -640,10 +713,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
640 if (!(flg & TCP_FLAG_ACK)) 713 if (!(flg & TCP_FLAG_ACK))
641 return NULL; 714 return NULL;
642 715
643 /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ 716 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
644 if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 717 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
645 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 718 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
646 inet_rsk(req)->acked = 1; 719 inet_rsk(req)->acked = 1;
720 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
647 return NULL; 721 return NULL;
648 } 722 }
649 723
@@ -656,29 +730,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
656 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); 730 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
657 if (child == NULL) 731 if (child == NULL)
658 goto listen_overflow; 732 goto listen_overflow;
659#ifdef CONFIG_TCP_MD5SIG
660 else {
661 /* Copy over the MD5 key from the original socket */
662 struct tcp_md5sig_key *key;
663 struct tcp_sock *tp = tcp_sk(sk);
664 key = tp->af_specific->md5_lookup(sk, child);
665 if (key != NULL) {
666 /*
667 * We're using one, so create a matching key on the
668 * newsk structure. If we fail to get memory then we
669 * end up not copying the key across. Shucks.
670 */
671 char *newkey = kmemdup(key->key, key->keylen,
672 GFP_ATOMIC);
673 if (newkey) {
674 if (!tcp_alloc_md5sig_pool())
675 BUG();
676 tp->af_specific->md5_add(child, child, newkey,
677 key->keylen);
678 }
679 }
680 }
681#endif
682 733
683 inet_csk_reqsk_queue_unlink(sk, req, prev); 734 inet_csk_reqsk_queue_unlink(sk, req, prev);
684 inet_csk_reqsk_queue_removed(sk, req); 735 inet_csk_reqsk_queue_removed(sk, req);
@@ -700,6 +751,7 @@ embryonic_reset:
700 inet_csk_reqsk_queue_drop(sk, req, prev); 751 inet_csk_reqsk_queue_drop(sk, req, prev);
701 return NULL; 752 return NULL;
702} 753}
754EXPORT_SYMBOL(tcp_check_req);
703 755
704/* 756/*
705 * Queue segment on the new socket if the new socket is active, 757 * Queue segment on the new socket if the new socket is active,
@@ -724,15 +776,11 @@ int tcp_child_process(struct sock *parent, struct sock *child,
724 * in main socket hash table and lock on listening 776 * in main socket hash table and lock on listening
725 * socket does not protect us more. 777 * socket does not protect us more.
726 */ 778 */
727 sk_add_backlog(child, skb); 779 __sk_add_backlog(child, skb);
728 } 780 }
729 781
730 bh_unlock_sock(child); 782 bh_unlock_sock(child);
731 sock_put(child); 783 sock_put(child);
732 return ret; 784 return ret;
733} 785}
734
735EXPORT_SYMBOL(tcp_check_req);
736EXPORT_SYMBOL(tcp_child_process); 786EXPORT_SYMBOL(tcp_child_process);
737EXPORT_SYMBOL(tcp_create_openreq_child);
738EXPORT_SYMBOL(tcp_timewait_state_process);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 416fc4c2e7eb..dc7c096ddfef 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,7 @@
37#include <net/tcp.h> 37#include <net/tcp.h>
38 38
39#include <linux/compiler.h> 39#include <linux/compiler.h>
40#include <linux/gfp.h>
40#include <linux/module.h> 41#include <linux/module.h>
41 42
42/* People can turn this off for buggy TCP's found in printers etc. */ 43/* People can turn this off for buggy TCP's found in printers etc. */
@@ -54,11 +55,16 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
54int sysctl_tcp_tso_win_divisor __read_mostly = 3; 55int sysctl_tcp_tso_win_divisor __read_mostly = 3;
55 56
56int sysctl_tcp_mtu_probing __read_mostly = 0; 57int sysctl_tcp_mtu_probing __read_mostly = 0;
57int sysctl_tcp_base_mss __read_mostly = 512; 58int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
58 59
59/* By default, RFC2861 behavior. */ 60/* By default, RFC2861 behavior. */
60int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 61int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
61 62
63int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
64EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
65
66
67/* Account for new data that has been sent to the network. */
62static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) 68static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
63{ 69{
64 struct tcp_sock *tp = tcp_sk(sk); 70 struct tcp_sock *tp = tcp_sk(sk);
@@ -113,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk)
113 struct dst_entry *dst = __sk_dst_get(sk); 119 struct dst_entry *dst = __sk_dst_get(sk);
114 int mss = tp->advmss; 120 int mss = tp->advmss;
115 121
116 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { 122 if (dst) {
117 mss = dst_metric(dst, RTAX_ADVMSS); 123 unsigned int metric = dst_metric_advmss(dst);
118 tp->advmss = mss; 124
125 if (metric < mss) {
126 mss = metric;
127 tp->advmss = mss;
128 }
119 } 129 }
120 130
121 return (__u16)mss; 131 return (__u16)mss;
@@ -142,6 +152,7 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
142 tp->snd_cwnd_used = 0; 152 tp->snd_cwnd_used = 0;
143} 153}
144 154
155/* Congestion state accounting after a packet has been sent. */
145static void tcp_event_data_sent(struct tcp_sock *tp, 156static void tcp_event_data_sent(struct tcp_sock *tp,
146 struct sk_buff *skb, struct sock *sk) 157 struct sk_buff *skb, struct sock *sk)
147{ 158{
@@ -161,6 +172,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
161 icsk->icsk_ack.pingpong = 1; 172 icsk->icsk_ack.pingpong = 1;
162} 173}
163 174
175/* Account for an ACK we sent. */
164static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) 176static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
165{ 177{
166 tcp_dec_quickack_mode(sk, pkts); 178 tcp_dec_quickack_mode(sk, pkts);
@@ -176,7 +188,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
176 */ 188 */
177void tcp_select_initial_window(int __space, __u32 mss, 189void tcp_select_initial_window(int __space, __u32 mss,
178 __u32 *rcv_wnd, __u32 *window_clamp, 190 __u32 *rcv_wnd, __u32 *window_clamp,
179 int wscale_ok, __u8 *rcv_wscale) 191 int wscale_ok, __u8 *rcv_wscale,
192 __u32 init_rcv_wnd)
180{ 193{
181 unsigned int space = (__space < 0 ? 0 : __space); 194 unsigned int space = (__space < 0 ? 0 : __space);
182 195
@@ -215,23 +228,28 @@ void tcp_select_initial_window(int __space, __u32 mss,
215 } 228 }
216 } 229 }
217 230
218 /* Set initial window to value enough for senders, 231 /* Set initial window to a value enough for senders starting with
219 * following RFC2414. Senders, not following this RFC, 232 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
220 * will be satisfied with 2. 233 * a limit on the initial window when mss is larger than 1460.
221 */ 234 */
222 if (mss > (1 << *rcv_wscale)) { 235 if (mss > (1 << *rcv_wscale)) {
223 int init_cwnd = 4; 236 int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
224 if (mss > 1460 * 3) 237 if (mss > 1460)
225 init_cwnd = 2; 238 init_cwnd =
226 else if (mss > 1460) 239 max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
227 init_cwnd = 3; 240 /* when initializing use the value from init_rcv_wnd
228 if (*rcv_wnd > init_cwnd * mss) 241 * rather than the default from above
229 *rcv_wnd = init_cwnd * mss; 242 */
243 if (init_rcv_wnd)
244 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
245 else
246 *rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
230 } 247 }
231 248
232 /* Set the clamp no higher than max representable value */ 249 /* Set the clamp no higher than max representable value */
233 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); 250 (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
234} 251}
252EXPORT_SYMBOL(tcp_select_initial_window);
235 253
236/* Chose a new window to advertise, update state in tcp_sock for the 254/* Chose a new window to advertise, update state in tcp_sock for the
237 * socket, and return result with RFC1323 scaling applied. The return 255 * socket, and return result with RFC1323 scaling applied. The return
@@ -276,20 +294,22 @@ static u16 tcp_select_window(struct sock *sk)
276 return new_win; 294 return new_win;
277} 295}
278 296
297/* Packet ECN state for a SYN-ACK */
279static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) 298static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
280{ 299{
281 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; 300 TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
282 if (!(tp->ecn_flags & TCP_ECN_OK)) 301 if (!(tp->ecn_flags & TCP_ECN_OK))
283 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; 302 TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
284} 303}
285 304
305/* Packet ECN state for a SYN. */
286static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) 306static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
287{ 307{
288 struct tcp_sock *tp = tcp_sk(sk); 308 struct tcp_sock *tp = tcp_sk(sk);
289 309
290 tp->ecn_flags = 0; 310 tp->ecn_flags = 0;
291 if (sysctl_tcp_ecn == 1) { 311 if (sysctl_tcp_ecn == 1) {
292 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; 312 TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
293 tp->ecn_flags = TCP_ECN_OK; 313 tp->ecn_flags = TCP_ECN_OK;
294 } 314 }
295} 315}
@@ -301,6 +321,9 @@ TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
301 th->ece = 1; 321 th->ece = 1;
302} 322}
303 323
324/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
325 * be sent.
326 */
304static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, 327static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
305 int tcp_header_len) 328 int tcp_header_len)
306{ 329{
@@ -330,6 +353,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
330 */ 353 */
331static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) 354static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
332{ 355{
356 skb->ip_summed = CHECKSUM_PARTIAL;
333 skb->csum = 0; 357 skb->csum = 0;
334 358
335 TCP_SKB_CB(skb)->flags = flags; 359 TCP_SKB_CB(skb)->flags = flags;
@@ -340,7 +364,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
340 skb_shinfo(skb)->gso_type = 0; 364 skb_shinfo(skb)->gso_type = 0;
341 365
342 TCP_SKB_CB(skb)->seq = seq; 366 TCP_SKB_CB(skb)->seq = seq;
343 if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) 367 if (flags & (TCPHDR_SYN | TCPHDR_FIN))
344 seq++; 368 seq++;
345 TCP_SKB_CB(skb)->end_seq = seq; 369 TCP_SKB_CB(skb)->end_seq = seq;
346} 370}
@@ -353,16 +377,52 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
353#define OPTION_SACK_ADVERTISE (1 << 0) 377#define OPTION_SACK_ADVERTISE (1 << 0)
354#define OPTION_TS (1 << 1) 378#define OPTION_TS (1 << 1)
355#define OPTION_MD5 (1 << 2) 379#define OPTION_MD5 (1 << 2)
380#define OPTION_WSCALE (1 << 3)
381#define OPTION_COOKIE_EXTENSION (1 << 4)
356 382
357struct tcp_out_options { 383struct tcp_out_options {
358 u8 options; /* bit field of OPTION_* */ 384 u8 options; /* bit field of OPTION_* */
359 u8 ws; /* window scale, 0 to disable */ 385 u8 ws; /* window scale, 0 to disable */
360 u8 num_sack_blocks; /* number of SACK blocks to include */ 386 u8 num_sack_blocks; /* number of SACK blocks to include */
387 u8 hash_size; /* bytes in hash_location */
361 u16 mss; /* 0 to disable */ 388 u16 mss; /* 0 to disable */
362 __u32 tsval, tsecr; /* need to include OPTION_TS */ 389 __u32 tsval, tsecr; /* need to include OPTION_TS */
390 __u8 *hash_location; /* temporary pointer, overloaded */
363}; 391};
364 392
365/* Beware: Something in the Internet is very sensitive to the ordering of 393/* The sysctl int routines are generic, so check consistency here.
394 */
395static u8 tcp_cookie_size_check(u8 desired)
396{
397 int cookie_size;
398
399 if (desired > 0)
400 /* previously specified */
401 return desired;
402
403 cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
404 if (cookie_size <= 0)
405 /* no default specified */
406 return 0;
407
408 if (cookie_size <= TCP_COOKIE_MIN)
409 /* value too small, specify minimum */
410 return TCP_COOKIE_MIN;
411
412 if (cookie_size >= TCP_COOKIE_MAX)
413 /* value too large, specify maximum */
414 return TCP_COOKIE_MAX;
415
416 if (cookie_size & 1)
417 /* 8-bit multiple, illegal, fix it */
418 cookie_size++;
419
420 return (u8)cookie_size;
421}
422
423/* Write previously computed TCP options to the packet.
424 *
425 * Beware: Something in the Internet is very sensitive to the ordering of
366 * TCP options, we learned this through the hard way, so be careful here. 426 * TCP options, we learned this through the hard way, so be careful here.
367 * Luckily we can at least blame others for their non-compliance but from 427 * Luckily we can at least blame others for their non-compliance but from
368 * inter-operatibility perspective it seems that we're somewhat stuck with 428 * inter-operatibility perspective it seems that we're somewhat stuck with
@@ -374,17 +434,34 @@ struct tcp_out_options {
374 * (but it may well be that other scenarios fail similarly). 434 * (but it may well be that other scenarios fail similarly).
375 */ 435 */
376static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 436static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
377 const struct tcp_out_options *opts, 437 struct tcp_out_options *opts)
378 __u8 **md5_hash) { 438{
379 if (unlikely(OPTION_MD5 & opts->options)) { 439 u8 options = opts->options; /* mungable copy */
380 *ptr++ = htonl((TCPOPT_NOP << 24) | 440
381 (TCPOPT_NOP << 16) | 441 /* Having both authentication and cookies for security is redundant,
382 (TCPOPT_MD5SIG << 8) | 442 * and there's certainly not enough room. Instead, the cookie-less
383 TCPOLEN_MD5SIG); 443 * extension variant is proposed.
384 *md5_hash = (__u8 *)ptr; 444 *
445 * Consider the pessimal case with authentication. The options
446 * could look like:
447 * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
448 */
449 if (unlikely(OPTION_MD5 & options)) {
450 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
451 *ptr++ = htonl((TCPOPT_COOKIE << 24) |
452 (TCPOLEN_COOKIE_BASE << 16) |
453 (TCPOPT_MD5SIG << 8) |
454 TCPOLEN_MD5SIG);
455 } else {
456 *ptr++ = htonl((TCPOPT_NOP << 24) |
457 (TCPOPT_NOP << 16) |
458 (TCPOPT_MD5SIG << 8) |
459 TCPOLEN_MD5SIG);
460 }
461 options &= ~OPTION_COOKIE_EXTENSION;
462 /* overload cookie hash location */
463 opts->hash_location = (__u8 *)ptr;
385 ptr += 4; 464 ptr += 4;
386 } else {
387 *md5_hash = NULL;
388 } 465 }
389 466
390 if (unlikely(opts->mss)) { 467 if (unlikely(opts->mss)) {
@@ -393,12 +470,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
393 opts->mss); 470 opts->mss);
394 } 471 }
395 472
396 if (likely(OPTION_TS & opts->options)) { 473 if (likely(OPTION_TS & options)) {
397 if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { 474 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
398 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | 475 *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
399 (TCPOLEN_SACK_PERM << 16) | 476 (TCPOLEN_SACK_PERM << 16) |
400 (TCPOPT_TIMESTAMP << 8) | 477 (TCPOPT_TIMESTAMP << 8) |
401 TCPOLEN_TIMESTAMP); 478 TCPOLEN_TIMESTAMP);
479 options &= ~OPTION_SACK_ADVERTISE;
402 } else { 480 } else {
403 *ptr++ = htonl((TCPOPT_NOP << 24) | 481 *ptr++ = htonl((TCPOPT_NOP << 24) |
404 (TCPOPT_NOP << 16) | 482 (TCPOPT_NOP << 16) |
@@ -409,15 +487,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
409 *ptr++ = htonl(opts->tsecr); 487 *ptr++ = htonl(opts->tsecr);
410 } 488 }
411 489
412 if (unlikely(OPTION_SACK_ADVERTISE & opts->options && 490 /* Specification requires after timestamp, so do it now.
413 !(OPTION_TS & opts->options))) { 491 *
492 * Consider the pessimal case without authentication. The options
493 * could look like:
494 * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
495 */
496 if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
497 __u8 *cookie_copy = opts->hash_location;
498 u8 cookie_size = opts->hash_size;
499
500 /* 8-bit multiple handled in tcp_cookie_size_check() above,
501 * and elsewhere.
502 */
503 if (0x2 & cookie_size) {
504 __u8 *p = (__u8 *)ptr;
505
506 /* 16-bit multiple */
507 *p++ = TCPOPT_COOKIE;
508 *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
509 *p++ = *cookie_copy++;
510 *p++ = *cookie_copy++;
511 ptr++;
512 cookie_size -= 2;
513 } else {
514 /* 32-bit multiple */
515 *ptr++ = htonl(((TCPOPT_NOP << 24) |
516 (TCPOPT_NOP << 16) |
517 (TCPOPT_COOKIE << 8) |
518 TCPOLEN_COOKIE_BASE) +
519 cookie_size);
520 }
521
522 if (cookie_size > 0) {
523 memcpy(ptr, cookie_copy, cookie_size);
524 ptr += (cookie_size / 4);
525 }
526 }
527
528 if (unlikely(OPTION_SACK_ADVERTISE & options)) {
414 *ptr++ = htonl((TCPOPT_NOP << 24) | 529 *ptr++ = htonl((TCPOPT_NOP << 24) |
415 (TCPOPT_NOP << 16) | 530 (TCPOPT_NOP << 16) |
416 (TCPOPT_SACK_PERM << 8) | 531 (TCPOPT_SACK_PERM << 8) |
417 TCPOLEN_SACK_PERM); 532 TCPOLEN_SACK_PERM);
418 } 533 }
419 534
420 if (unlikely(opts->ws)) { 535 if (unlikely(OPTION_WSCALE & options)) {
421 *ptr++ = htonl((TCPOPT_NOP << 24) | 536 *ptr++ = htonl((TCPOPT_NOP << 24) |
422 (TCPOPT_WINDOW << 16) | 537 (TCPOPT_WINDOW << 16) |
423 (TCPOLEN_WINDOW << 8) | 538 (TCPOLEN_WINDOW << 8) |
@@ -445,17 +560,24 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
445 } 560 }
446} 561}
447 562
563/* Compute TCP options for SYN packets. This is not the final
564 * network wire format yet.
565 */
448static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, 566static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
449 struct tcp_out_options *opts, 567 struct tcp_out_options *opts,
450 struct tcp_md5sig_key **md5) { 568 struct tcp_md5sig_key **md5) {
451 struct tcp_sock *tp = tcp_sk(sk); 569 struct tcp_sock *tp = tcp_sk(sk);
452 unsigned size = 0; 570 struct tcp_cookie_values *cvp = tp->cookie_values;
571 unsigned remaining = MAX_TCP_OPTION_SPACE;
572 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
573 tcp_cookie_size_check(cvp->cookie_desired) :
574 0;
453 575
454#ifdef CONFIG_TCP_MD5SIG 576#ifdef CONFIG_TCP_MD5SIG
455 *md5 = tp->af_specific->md5_lookup(sk, sk); 577 *md5 = tp->af_specific->md5_lookup(sk, sk);
456 if (*md5) { 578 if (*md5) {
457 opts->options |= OPTION_MD5; 579 opts->options |= OPTION_MD5;
458 size += TCPOLEN_MD5SIG_ALIGNED; 580 remaining -= TCPOLEN_MD5SIG_ALIGNED;
459 } 581 }
460#else 582#else
461 *md5 = NULL; 583 *md5 = NULL;
@@ -471,76 +593,154 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
471 * SACKs don't matter, we never delay an ACK when we have any of those 593 * SACKs don't matter, we never delay an ACK when we have any of those
472 * going out. */ 594 * going out. */
473 opts->mss = tcp_advertise_mss(sk); 595 opts->mss = tcp_advertise_mss(sk);
474 size += TCPOLEN_MSS_ALIGNED; 596 remaining -= TCPOLEN_MSS_ALIGNED;
475 597
476 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { 598 if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
477 opts->options |= OPTION_TS; 599 opts->options |= OPTION_TS;
478 opts->tsval = TCP_SKB_CB(skb)->when; 600 opts->tsval = TCP_SKB_CB(skb)->when;
479 opts->tsecr = tp->rx_opt.ts_recent; 601 opts->tsecr = tp->rx_opt.ts_recent;
480 size += TCPOLEN_TSTAMP_ALIGNED; 602 remaining -= TCPOLEN_TSTAMP_ALIGNED;
481 } 603 }
482 if (likely(sysctl_tcp_window_scaling)) { 604 if (likely(sysctl_tcp_window_scaling)) {
483 opts->ws = tp->rx_opt.rcv_wscale; 605 opts->ws = tp->rx_opt.rcv_wscale;
484 if (likely(opts->ws)) 606 opts->options |= OPTION_WSCALE;
485 size += TCPOLEN_WSCALE_ALIGNED; 607 remaining -= TCPOLEN_WSCALE_ALIGNED;
486 } 608 }
487 if (likely(sysctl_tcp_sack)) { 609 if (likely(sysctl_tcp_sack)) {
488 opts->options |= OPTION_SACK_ADVERTISE; 610 opts->options |= OPTION_SACK_ADVERTISE;
489 if (unlikely(!(OPTION_TS & opts->options))) 611 if (unlikely(!(OPTION_TS & opts->options)))
490 size += TCPOLEN_SACKPERM_ALIGNED; 612 remaining -= TCPOLEN_SACKPERM_ALIGNED;
491 } 613 }
492 614
493 return size; 615 /* Note that timestamps are required by the specification.
616 *
617 * Odd numbers of bytes are prohibited by the specification, ensuring
618 * that the cookie is 16-bit aligned, and the resulting cookie pair is
619 * 32-bit aligned.
620 */
621 if (*md5 == NULL &&
622 (OPTION_TS & opts->options) &&
623 cookie_size > 0) {
624 int need = TCPOLEN_COOKIE_BASE + cookie_size;
625
626 if (0x2 & need) {
627 /* 32-bit multiple */
628 need += 2; /* NOPs */
629
630 if (need > remaining) {
631 /* try shrinking cookie to fit */
632 cookie_size -= 2;
633 need -= 4;
634 }
635 }
636 while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
637 cookie_size -= 4;
638 need -= 4;
639 }
640 if (TCP_COOKIE_MIN <= cookie_size) {
641 opts->options |= OPTION_COOKIE_EXTENSION;
642 opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
643 opts->hash_size = cookie_size;
644
645 /* Remember for future incarnations. */
646 cvp->cookie_desired = cookie_size;
647
648 if (cvp->cookie_desired != cvp->cookie_pair_size) {
649 /* Currently use random bytes as a nonce,
650 * assuming these are completely unpredictable
651 * by hostile users of the same system.
652 */
653 get_random_bytes(&cvp->cookie_pair[0],
654 cookie_size);
655 cvp->cookie_pair_size = cookie_size;
656 }
657
658 remaining -= need;
659 }
660 }
661 return MAX_TCP_OPTION_SPACE - remaining;
494} 662}
495 663
664/* Set up TCP options for SYN-ACKs. */
496static unsigned tcp_synack_options(struct sock *sk, 665static unsigned tcp_synack_options(struct sock *sk,
497 struct request_sock *req, 666 struct request_sock *req,
498 unsigned mss, struct sk_buff *skb, 667 unsigned mss, struct sk_buff *skb,
499 struct tcp_out_options *opts, 668 struct tcp_out_options *opts,
500 struct tcp_md5sig_key **md5) { 669 struct tcp_md5sig_key **md5,
501 unsigned size = 0; 670 struct tcp_extend_values *xvp)
671{
502 struct inet_request_sock *ireq = inet_rsk(req); 672 struct inet_request_sock *ireq = inet_rsk(req);
503 char doing_ts; 673 unsigned remaining = MAX_TCP_OPTION_SPACE;
674 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
675 xvp->cookie_plus :
676 0;
504 677
505#ifdef CONFIG_TCP_MD5SIG 678#ifdef CONFIG_TCP_MD5SIG
506 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); 679 *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
507 if (*md5) { 680 if (*md5) {
508 opts->options |= OPTION_MD5; 681 opts->options |= OPTION_MD5;
509 size += TCPOLEN_MD5SIG_ALIGNED; 682 remaining -= TCPOLEN_MD5SIG_ALIGNED;
683
684 /* We can't fit any SACK blocks in a packet with MD5 + TS
685 * options. There was discussion about disabling SACK
686 * rather than TS in order to fit in better with old,
687 * buggy kernels, but that was deemed to be unnecessary.
688 */
689 ireq->tstamp_ok &= !ireq->sack_ok;
510 } 690 }
511#else 691#else
512 *md5 = NULL; 692 *md5 = NULL;
513#endif 693#endif
514 694
515 /* we can't fit any SACK blocks in a packet with MD5 + TS 695 /* We always send an MSS option. */
516 options. There was discussion about disabling SACK rather than TS in
517 order to fit in better with old, buggy kernels, but that was deemed
518 to be unnecessary. */
519 doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
520
521 opts->mss = mss; 696 opts->mss = mss;
522 size += TCPOLEN_MSS_ALIGNED; 697 remaining -= TCPOLEN_MSS_ALIGNED;
523 698
524 if (likely(ireq->wscale_ok)) { 699 if (likely(ireq->wscale_ok)) {
525 opts->ws = ireq->rcv_wscale; 700 opts->ws = ireq->rcv_wscale;
526 if (likely(opts->ws)) 701 opts->options |= OPTION_WSCALE;
527 size += TCPOLEN_WSCALE_ALIGNED; 702 remaining -= TCPOLEN_WSCALE_ALIGNED;
528 } 703 }
529 if (likely(doing_ts)) { 704 if (likely(ireq->tstamp_ok)) {
530 opts->options |= OPTION_TS; 705 opts->options |= OPTION_TS;
531 opts->tsval = TCP_SKB_CB(skb)->when; 706 opts->tsval = TCP_SKB_CB(skb)->when;
532 opts->tsecr = req->ts_recent; 707 opts->tsecr = req->ts_recent;
533 size += TCPOLEN_TSTAMP_ALIGNED; 708 remaining -= TCPOLEN_TSTAMP_ALIGNED;
534 } 709 }
535 if (likely(ireq->sack_ok)) { 710 if (likely(ireq->sack_ok)) {
536 opts->options |= OPTION_SACK_ADVERTISE; 711 opts->options |= OPTION_SACK_ADVERTISE;
537 if (unlikely(!doing_ts)) 712 if (unlikely(!ireq->tstamp_ok))
538 size += TCPOLEN_SACKPERM_ALIGNED; 713 remaining -= TCPOLEN_SACKPERM_ALIGNED;
539 } 714 }
540 715
541 return size; 716 /* Similar rationale to tcp_syn_options() applies here, too.
717 * If the <SYN> options fit, the same options should fit now!
718 */
719 if (*md5 == NULL &&
720 ireq->tstamp_ok &&
721 cookie_plus > TCPOLEN_COOKIE_BASE) {
722 int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
723
724 if (0x2 & need) {
725 /* 32-bit multiple */
726 need += 2; /* NOPs */
727 }
728 if (need <= remaining) {
729 opts->options |= OPTION_COOKIE_EXTENSION;
730 opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
731 remaining -= need;
732 } else {
733 /* There's no error return, so flag it. */
734 xvp->cookie_out_never = 1; /* true */
735 opts->hash_size = 0;
736 }
737 }
738 return MAX_TCP_OPTION_SPACE - remaining;
542} 739}
543 740
741/* Compute TCP options for ESTABLISHED sockets. This is not the
742 * final wire format yet.
743 */
544static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, 744static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
545 struct tcp_out_options *opts, 745 struct tcp_out_options *opts,
546 struct tcp_md5sig_key **md5) { 746 struct tcp_md5sig_key **md5) {
@@ -601,7 +801,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
601 struct tcp_out_options opts; 801 struct tcp_out_options opts;
602 unsigned tcp_options_size, tcp_header_size; 802 unsigned tcp_options_size, tcp_header_size;
603 struct tcp_md5sig_key *md5; 803 struct tcp_md5sig_key *md5;
604 __u8 *md5_hash_location;
605 struct tcphdr *th; 804 struct tcphdr *th;
606 int err; 805 int err;
607 806
@@ -627,15 +826,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
627 tcb = TCP_SKB_CB(skb); 826 tcb = TCP_SKB_CB(skb);
628 memset(&opts, 0, sizeof(opts)); 827 memset(&opts, 0, sizeof(opts));
629 828
630 if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) 829 if (unlikely(tcb->flags & TCPHDR_SYN))
631 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); 830 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
632 else 831 else
633 tcp_options_size = tcp_established_options(sk, skb, &opts, 832 tcp_options_size = tcp_established_options(sk, skb, &opts,
634 &md5); 833 &md5);
635 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); 834 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
636 835
637 if (tcp_packets_in_flight(tp) == 0) 836 if (tcp_packets_in_flight(tp) == 0) {
638 tcp_ca_event(sk, CA_EVENT_TX_START); 837 tcp_ca_event(sk, CA_EVENT_TX_START);
838 skb->ooo_okay = 1;
839 } else
840 skb->ooo_okay = 0;
639 841
640 skb_push(skb, tcp_header_size); 842 skb_push(skb, tcp_header_size);
641 skb_reset_transport_header(skb); 843 skb_reset_transport_header(skb);
@@ -643,14 +845,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
643 845
644 /* Build TCP header and checksum it. */ 846 /* Build TCP header and checksum it. */
645 th = tcp_hdr(skb); 847 th = tcp_hdr(skb);
646 th->source = inet->sport; 848 th->source = inet->inet_sport;
647 th->dest = inet->dport; 849 th->dest = inet->inet_dport;
648 th->seq = htonl(tcb->seq); 850 th->seq = htonl(tcb->seq);
649 th->ack_seq = htonl(tp->rcv_nxt); 851 th->ack_seq = htonl(tp->rcv_nxt);
650 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 852 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
651 tcb->flags); 853 tcb->flags);
652 854
653 if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { 855 if (unlikely(tcb->flags & TCPHDR_SYN)) {
654 /* RFC1323: The window in SYN & SYN/ACK segments 856 /* RFC1323: The window in SYN & SYN/ACK segments
655 * is never scaled. 857 * is never scaled.
656 */ 858 */
@@ -667,36 +869,37 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
667 th->urg_ptr = htons(tp->snd_up - tcb->seq); 869 th->urg_ptr = htons(tp->snd_up - tcb->seq);
668 th->urg = 1; 870 th->urg = 1;
669 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { 871 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
670 th->urg_ptr = 0xFFFF; 872 th->urg_ptr = htons(0xFFFF);
671 th->urg = 1; 873 th->urg = 1;
672 } 874 }
673 } 875 }
674 876
675 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 877 tcp_options_write((__be32 *)(th + 1), tp, &opts);
676 if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) 878 if (likely((tcb->flags & TCPHDR_SYN) == 0))
677 TCP_ECN_send(sk, skb, tcp_header_size); 879 TCP_ECN_send(sk, skb, tcp_header_size);
678 880
679#ifdef CONFIG_TCP_MD5SIG 881#ifdef CONFIG_TCP_MD5SIG
680 /* Calculate the MD5 hash, as we have all we need now */ 882 /* Calculate the MD5 hash, as we have all we need now */
681 if (md5) { 883 if (md5) {
682 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 884 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
683 tp->af_specific->calc_md5_hash(md5_hash_location, 885 tp->af_specific->calc_md5_hash(opts.hash_location,
684 md5, sk, NULL, skb); 886 md5, sk, NULL, skb);
685 } 887 }
686#endif 888#endif
687 889
688 icsk->icsk_af_ops->send_check(sk, skb->len, skb); 890 icsk->icsk_af_ops->send_check(sk, skb);
689 891
690 if (likely(tcb->flags & TCPCB_FLAG_ACK)) 892 if (likely(tcb->flags & TCPHDR_ACK))
691 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 893 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
692 894
693 if (skb->len != tcp_header_size) 895 if (skb->len != tcp_header_size)
694 tcp_event_data_sent(tp, skb, sk); 896 tcp_event_data_sent(tp, skb, sk);
695 897
696 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) 898 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
697 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); 899 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
900 tcp_skb_pcount(skb));
698 901
699 err = icsk->icsk_af_ops->queue_xmit(skb, 0); 902 err = icsk->icsk_af_ops->queue_xmit(skb);
700 if (likely(err <= 0)) 903 if (likely(err <= 0))
701 return err; 904 return err;
702 905
@@ -705,7 +908,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
705 return net_xmit_eval(err); 908 return net_xmit_eval(err);
706} 909}
707 910
708/* This routine just queue's the buffer 911/* This routine just queues the buffer for sending.
709 * 912 *
710 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, 913 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
711 * otherwise socket can stall. 914 * otherwise socket can stall.
@@ -722,10 +925,12 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
722 sk_mem_charge(sk, skb->truesize); 925 sk_mem_charge(sk, skb->truesize);
723} 926}
724 927
928/* Initialize TSO segments for a packet. */
725static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, 929static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
726 unsigned int mss_now) 930 unsigned int mss_now)
727{ 931{
728 if (skb->len <= mss_now || !sk_can_gso(sk)) { 932 if (skb->len <= mss_now || !sk_can_gso(sk) ||
933 skb->ip_summed == CHECKSUM_NONE) {
729 /* Avoid the costly divide in the normal 934 /* Avoid the costly divide in the normal
730 * non-TSO case. 935 * non-TSO case.
731 */ 936 */
@@ -827,7 +1032,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
827 1032
828 /* PSH and FIN should only be set in the second packet. */ 1033 /* PSH and FIN should only be set in the second packet. */
829 flags = TCP_SKB_CB(skb)->flags; 1034 flags = TCP_SKB_CB(skb)->flags;
830 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); 1035 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
831 TCP_SKB_CB(buff)->flags = flags; 1036 TCP_SKB_CB(buff)->flags = flags;
832 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; 1037 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
833 1038
@@ -908,6 +1113,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
908 skb->len = skb->data_len; 1113 skb->len = skb->data_len;
909} 1114}
910 1115
1116/* Remove acked data from a packet in the transmit queue. */
911int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) 1117int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
912{ 1118{
913 if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) 1119 if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
@@ -936,7 +1142,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
936 return 0; 1142 return 0;
937} 1143}
938 1144
939/* Not accounting for SACKs here. */ 1145/* Calculate MSS. Not accounting for SACKs here. */
940int tcp_mtu_to_mss(struct sock *sk, int pmtu) 1146int tcp_mtu_to_mss(struct sock *sk, int pmtu)
941{ 1147{
942 struct tcp_sock *tp = tcp_sk(sk); 1148 struct tcp_sock *tp = tcp_sk(sk);
@@ -980,6 +1186,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
980 return mtu; 1186 return mtu;
981} 1187}
982 1188
1189/* MTU probing init per socket */
983void tcp_mtup_init(struct sock *sk) 1190void tcp_mtup_init(struct sock *sk)
984{ 1191{
985 struct tcp_sock *tp = tcp_sk(sk); 1192 struct tcp_sock *tp = tcp_sk(sk);
@@ -991,6 +1198,7 @@ void tcp_mtup_init(struct sock *sk)
991 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); 1198 icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
992 icsk->icsk_mtup.probe_size = 0; 1199 icsk->icsk_mtup.probe_size = 0;
993} 1200}
1201EXPORT_SYMBOL(tcp_mtup_init);
994 1202
995/* This function synchronizes snd mss to current pmtu/exthdr set. 1203/* This function synchronizes snd mss to current pmtu/exthdr set.
996 1204
@@ -1034,6 +1242,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1034 1242
1035 return mss_now; 1243 return mss_now;
1036} 1244}
1245EXPORT_SYMBOL(tcp_sync_mss);
1037 1246
1038/* Compute the current effective MSS, taking SACKs and IP options, 1247/* Compute the current effective MSS, taking SACKs and IP options,
1039 * and even PMTU discovery events into account. 1248 * and even PMTU discovery events into account.
@@ -1130,8 +1339,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
1130 u32 in_flight, cwnd; 1339 u32 in_flight, cwnd;
1131 1340
1132 /* Don't be strict about the congestion window for the final FIN. */ 1341 /* Don't be strict about the congestion window for the final FIN. */
1133 if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && 1342 if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
1134 tcp_skb_pcount(skb) == 1)
1135 return 1; 1343 return 1;
1136 1344
1137 in_flight = tcp_packets_in_flight(tp); 1345 in_flight = tcp_packets_in_flight(tp);
@@ -1142,7 +1350,8 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
1142 return 0; 1350 return 0;
1143} 1351}
1144 1352
1145/* This must be invoked the first time we consider transmitting 1353/* Initialize TSO state of a skb.
1354 * This must be invoked the first time we consider transmitting
1146 * SKB onto the wire. 1355 * SKB onto the wire.
1147 */ 1356 */
1148static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, 1357static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
@@ -1157,6 +1366,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
1157 return tso_segs; 1366 return tso_segs;
1158} 1367}
1159 1368
1369/* Minshall's variant of the Nagle send check. */
1160static inline int tcp_minshall_check(const struct tcp_sock *tp) 1370static inline int tcp_minshall_check(const struct tcp_sock *tp)
1161{ 1371{
1162 return after(tp->snd_sml, tp->snd_una) && 1372 return after(tp->snd_sml, tp->snd_una) &&
@@ -1174,9 +1384,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
1174 const struct sk_buff *skb, 1384 const struct sk_buff *skb,
1175 unsigned mss_now, int nonagle) 1385 unsigned mss_now, int nonagle)
1176{ 1386{
1177 return (skb->len < mss_now && 1387 return skb->len < mss_now &&
1178 ((nonagle & TCP_NAGLE_CORK) || 1388 ((nonagle & TCP_NAGLE_CORK) ||
1179 (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); 1389 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1180} 1390}
1181 1391
1182/* Return non-zero if the Nagle test allows this packet to be 1392/* Return non-zero if the Nagle test allows this packet to be
@@ -1198,7 +1408,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
1198 * Nagle can be ignored during F-RTO too (see RFC4138). 1408 * Nagle can be ignored during F-RTO too (see RFC4138).
1199 */ 1409 */
1200 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || 1410 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
1201 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) 1411 (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
1202 return 1; 1412 return 1;
1203 1413
1204 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) 1414 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1241,15 +1451,16 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
1241 return cwnd_quota; 1451 return cwnd_quota;
1242} 1452}
1243 1453
1454/* Test if sending is allowed right now. */
1244int tcp_may_send_now(struct sock *sk) 1455int tcp_may_send_now(struct sock *sk)
1245{ 1456{
1246 struct tcp_sock *tp = tcp_sk(sk); 1457 struct tcp_sock *tp = tcp_sk(sk);
1247 struct sk_buff *skb = tcp_send_head(sk); 1458 struct sk_buff *skb = tcp_send_head(sk);
1248 1459
1249 return (skb && 1460 return skb &&
1250 tcp_snd_test(sk, skb, tcp_current_mss(sk), 1461 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1251 (tcp_skb_is_last(sk, skb) ? 1462 (tcp_skb_is_last(sk, skb) ?
1252 tp->nonagle : TCP_NAGLE_PUSH))); 1463 tp->nonagle : TCP_NAGLE_PUSH));
1253} 1464}
1254 1465
1255/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet 1466/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1260,7 +1471,7 @@ int tcp_may_send_now(struct sock *sk)
1260 * packet has never been sent out before (and thus is not cloned). 1471 * packet has never been sent out before (and thus is not cloned).
1261 */ 1472 */
1262static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, 1473static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1263 unsigned int mss_now) 1474 unsigned int mss_now, gfp_t gfp)
1264{ 1475{
1265 struct sk_buff *buff; 1476 struct sk_buff *buff;
1266 int nlen = skb->len - len; 1477 int nlen = skb->len - len;
@@ -1270,7 +1481,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1270 if (skb->len != skb->data_len) 1481 if (skb->len != skb->data_len)
1271 return tcp_fragment(sk, skb, len, mss_now); 1482 return tcp_fragment(sk, skb, len, mss_now);
1272 1483
1273 buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC); 1484 buff = sk_stream_alloc_skb(sk, 0, gfp);
1274 if (unlikely(buff == NULL)) 1485 if (unlikely(buff == NULL))
1275 return -ENOMEM; 1486 return -ENOMEM;
1276 1487
@@ -1286,7 +1497,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1286 1497
1287 /* PSH and FIN should only be set in the second packet. */ 1498 /* PSH and FIN should only be set in the second packet. */
1288 flags = TCP_SKB_CB(skb)->flags; 1499 flags = TCP_SKB_CB(skb)->flags;
1289 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); 1500 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1290 TCP_SKB_CB(buff)->flags = flags; 1501 TCP_SKB_CB(buff)->flags = flags;
1291 1502
1292 /* This packet was never sent out yet, so no SACK bits. */ 1503 /* This packet was never sent out yet, so no SACK bits. */
@@ -1316,8 +1527,9 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1316 struct tcp_sock *tp = tcp_sk(sk); 1527 struct tcp_sock *tp = tcp_sk(sk);
1317 const struct inet_connection_sock *icsk = inet_csk(sk); 1528 const struct inet_connection_sock *icsk = inet_csk(sk);
1318 u32 send_win, cong_win, limit, in_flight; 1529 u32 send_win, cong_win, limit, in_flight;
1530 int win_divisor;
1319 1531
1320 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) 1532 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
1321 goto send_now; 1533 goto send_now;
1322 1534
1323 if (icsk->icsk_ca_state != TCP_CA_Open) 1535 if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1347,13 +1559,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1347 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) 1559 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1348 goto send_now; 1560 goto send_now;
1349 1561
1350 if (sysctl_tcp_tso_win_divisor) { 1562 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1563 if (win_divisor) {
1351 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1564 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1352 1565
1353 /* If at least some fraction of a window is available, 1566 /* If at least some fraction of a window is available,
1354 * just use it. 1567 * just use it.
1355 */ 1568 */
1356 chunk /= sysctl_tcp_tso_win_divisor; 1569 chunk /= win_divisor;
1357 if (limit >= chunk) 1570 if (limit >= chunk)
1358 goto send_now; 1571 goto send_now;
1359 } else { 1572 } else {
@@ -1377,6 +1590,10 @@ send_now:
1377} 1590}
1378 1591
1379/* Create a new MTU probe if we are ready. 1592/* Create a new MTU probe if we are ready.
1593 * MTU probe is regularly attempting to increase the path MTU by
1594 * deliberately sending larger packets. This discovers routing
1595 * changes resulting in larger path MTUs.
1596 *
1380 * Returns 0 if we should wait to probe (no cwnd available), 1597 * Returns 0 if we should wait to probe (no cwnd available),
1381 * 1 if a probe was sent, 1598 * 1 if a probe was sent,
1382 * -1 otherwise 1599 * -1 otherwise
@@ -1439,7 +1656,7 @@ static int tcp_mtu_probe(struct sock *sk)
1439 1656
1440 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; 1657 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1441 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; 1658 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1442 TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; 1659 TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
1443 TCP_SKB_CB(nskb)->sacked = 0; 1660 TCP_SKB_CB(nskb)->sacked = 0;
1444 nskb->csum = 0; 1661 nskb->csum = 0;
1445 nskb->ip_summed = skb->ip_summed; 1662 nskb->ip_summed = skb->ip_summed;
@@ -1464,7 +1681,7 @@ static int tcp_mtu_probe(struct sock *sk)
1464 sk_wmem_free_skb(sk, skb); 1681 sk_wmem_free_skb(sk, skb);
1465 } else { 1682 } else {
1466 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & 1683 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1467 ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); 1684 ~(TCPHDR_FIN|TCPHDR_PSH);
1468 if (!skb_shinfo(skb)->nr_frags) { 1685 if (!skb_shinfo(skb)->nr_frags) {
1469 skb_pull(skb, copy); 1686 skb_pull(skb, copy);
1470 if (skb->ip_summed != CHECKSUM_PARTIAL) 1687 if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -1564,7 +1781,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1564 cwnd_quota); 1781 cwnd_quota);
1565 1782
1566 if (skb->len > limit && 1783 if (skb->len > limit &&
1567 unlikely(tso_fragment(sk, skb, limit, mss_now))) 1784 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
1568 break; 1785 break;
1569 1786
1570 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1787 TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -1598,11 +1815,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1598void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, 1815void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
1599 int nonagle) 1816 int nonagle)
1600{ 1817{
1601 struct sk_buff *skb = tcp_send_head(sk);
1602
1603 if (!skb)
1604 return;
1605
1606 /* If we are closed, the bytes will have to remain here. 1818 /* If we are closed, the bytes will have to remain here.
1607 * In time closedown will finish, we empty the write queue and 1819 * In time closedown will finish, we empty the write queue and
1608 * all will be happy. 1820 * all will be happy.
@@ -1789,6 +2001,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1789 sk_wmem_free_skb(sk, next_skb); 2001 sk_wmem_free_skb(sk, next_skb);
1790} 2002}
1791 2003
2004/* Check if coalescing SKBs is legal. */
1792static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) 2005static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
1793{ 2006{
1794 if (tcp_skb_pcount(skb) > 1) 2007 if (tcp_skb_pcount(skb) > 1)
@@ -1807,6 +2020,9 @@ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
1807 return 1; 2020 return 1;
1808} 2021}
1809 2022
2023/* Collapse packets in the retransmit queue to create
2024 * fewer packets on the wire. This is only done on retransmission.
2025 */
1810static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, 2026static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
1811 int space) 2027 int space)
1812{ 2028{
@@ -1816,7 +2032,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
1816 2032
1817 if (!sysctl_tcp_retrans_collapse) 2033 if (!sysctl_tcp_retrans_collapse)
1818 return; 2034 return;
1819 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) 2035 if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
1820 return; 2036 return;
1821 2037
1822 tcp_for_write_queue_from_safe(skb, tmp, sk) { 2038 tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -1885,8 +2101,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1885 * case, when window is shrunk to zero. In this case 2101 * case, when window is shrunk to zero. In this case
1886 * our retransmit serves as a zero window probe. 2102 * our retransmit serves as a zero window probe.
1887 */ 2103 */
1888 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) 2104 if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
1889 && TCP_SKB_CB(skb)->seq != tp->snd_una) 2105 TCP_SKB_CB(skb)->seq != tp->snd_una)
1890 return -EAGAIN; 2106 return -EAGAIN;
1891 2107
1892 if (skb->len > cur_mss) { 2108 if (skb->len > cur_mss) {
@@ -1908,7 +2124,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1908 * since it is cheap to do so and saves bytes on the network. 2124 * since it is cheap to do so and saves bytes on the network.
1909 */ 2125 */
1910 if (skb->len > 0 && 2126 if (skb->len > 0 &&
1911 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && 2127 (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
1912 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { 2128 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
1913 if (!pskb_trim(skb, 0)) { 2129 if (!pskb_trim(skb, 0)) {
1914 /* Reuse, even though it does some unnecessary work */ 2130 /* Reuse, even though it does some unnecessary work */
@@ -1956,6 +2172,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1956 return err; 2172 return err;
1957} 2173}
1958 2174
2175/* Check if forward retransmits are possible in the current
2176 * window/congestion state.
2177 */
1959static int tcp_can_forward_retransmit(struct sock *sk) 2178static int tcp_can_forward_retransmit(struct sock *sk)
1960{ 2179{
1961 const struct inet_connection_sock *icsk = inet_csk(sk); 2180 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2001,6 +2220,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2001 int mib_idx; 2220 int mib_idx;
2002 int fwd_rexmitting = 0; 2221 int fwd_rexmitting = 0;
2003 2222
2223 if (!tp->packets_out)
2224 return;
2225
2004 if (!tp->lost_out) 2226 if (!tp->lost_out)
2005 tp->retransmit_high = tp->snd_una; 2227 tp->retransmit_high = tp->snd_una;
2006 2228
@@ -2094,13 +2316,14 @@ void tcp_send_fin(struct sock *sk)
2094 mss_now = tcp_current_mss(sk); 2316 mss_now = tcp_current_mss(sk);
2095 2317
2096 if (tcp_send_head(sk) != NULL) { 2318 if (tcp_send_head(sk) != NULL) {
2097 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; 2319 TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
2098 TCP_SKB_CB(skb)->end_seq++; 2320 TCP_SKB_CB(skb)->end_seq++;
2099 tp->write_seq++; 2321 tp->write_seq++;
2100 } else { 2322 } else {
2101 /* Socket is locked, keep trying until memory is available. */ 2323 /* Socket is locked, keep trying until memory is available. */
2102 for (;;) { 2324 for (;;) {
2103 skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); 2325 skb = alloc_skb_fclone(MAX_TCP_HEADER,
2326 sk->sk_allocation);
2104 if (skb) 2327 if (skb)
2105 break; 2328 break;
2106 yield(); 2329 yield();
@@ -2110,7 +2333,7 @@ void tcp_send_fin(struct sock *sk)
2110 skb_reserve(skb, MAX_TCP_HEADER); 2333 skb_reserve(skb, MAX_TCP_HEADER);
2111 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ 2334 /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
2112 tcp_init_nondata_skb(skb, tp->write_seq, 2335 tcp_init_nondata_skb(skb, tp->write_seq,
2113 TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); 2336 TCPHDR_ACK | TCPHDR_FIN);
2114 tcp_queue_skb(sk, skb); 2337 tcp_queue_skb(sk, skb);
2115 } 2338 }
2116 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); 2339 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2135,7 +2358,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2135 /* Reserve space for headers and prepare control bits. */ 2358 /* Reserve space for headers and prepare control bits. */
2136 skb_reserve(skb, MAX_TCP_HEADER); 2359 skb_reserve(skb, MAX_TCP_HEADER);
2137 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), 2360 tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
2138 TCPCB_FLAG_ACK | TCPCB_FLAG_RST); 2361 TCPHDR_ACK | TCPHDR_RST);
2139 /* Send it off. */ 2362 /* Send it off. */
2140 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2363 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2141 if (tcp_transmit_skb(sk, skb, 0, priority)) 2364 if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2144,7 +2367,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
2144 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); 2367 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
2145} 2368}
2146 2369
2147/* WARNING: This routine must only be called when we have already sent 2370/* Send a crossed SYN-ACK during socket establishment.
2371 * WARNING: This routine must only be called when we have already sent
2148 * a SYN packet that crossed the incoming SYN that caused this routine 2372 * a SYN packet that crossed the incoming SYN that caused this routine
2149 * to get called. If this assumption fails then the initial rcv_wnd 2373 * to get called. If this assumption fails then the initial rcv_wnd
2150 * and rcv_wscale values will not be correct. 2374 * and rcv_wscale values will not be correct.
@@ -2154,11 +2378,11 @@ int tcp_send_synack(struct sock *sk)
2154 struct sk_buff *skb; 2378 struct sk_buff *skb;
2155 2379
2156 skb = tcp_write_queue_head(sk); 2380 skb = tcp_write_queue_head(sk);
2157 if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { 2381 if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
2158 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); 2382 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2159 return -EFAULT; 2383 return -EFAULT;
2160 } 2384 }
2161 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { 2385 if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
2162 if (skb_cloned(skb)) { 2386 if (skb_cloned(skb)) {
2163 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 2387 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2164 if (nskb == NULL) 2388 if (nskb == NULL)
@@ -2172,30 +2396,33 @@ int tcp_send_synack(struct sock *sk)
2172 skb = nskb; 2396 skb = nskb;
2173 } 2397 }
2174 2398
2175 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; 2399 TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
2176 TCP_ECN_send_synack(tcp_sk(sk), skb); 2400 TCP_ECN_send_synack(tcp_sk(sk), skb);
2177 } 2401 }
2178 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2402 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2179 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2403 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2180} 2404}
2181 2405
2182/* 2406/* Prepare a SYN-ACK. */
2183 * Prepare a SYN-ACK.
2184 */
2185struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2407struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2186 struct request_sock *req) 2408 struct request_sock *req,
2409 struct request_values *rvp)
2187{ 2410{
2411 struct tcp_out_options opts;
2412 struct tcp_extend_values *xvp = tcp_xv(rvp);
2188 struct inet_request_sock *ireq = inet_rsk(req); 2413 struct inet_request_sock *ireq = inet_rsk(req);
2189 struct tcp_sock *tp = tcp_sk(sk); 2414 struct tcp_sock *tp = tcp_sk(sk);
2415 const struct tcp_cookie_values *cvp = tp->cookie_values;
2190 struct tcphdr *th; 2416 struct tcphdr *th;
2191 int tcp_header_size;
2192 struct tcp_out_options opts;
2193 struct sk_buff *skb; 2417 struct sk_buff *skb;
2194 struct tcp_md5sig_key *md5; 2418 struct tcp_md5sig_key *md5;
2195 __u8 *md5_hash_location; 2419 int tcp_header_size;
2196 int mss; 2420 int mss;
2421 int s_data_desired = 0;
2197 2422
2198 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2423 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2424 s_data_desired = cvp->s_data_desired;
2425 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
2199 if (skb == NULL) 2426 if (skb == NULL)
2200 return NULL; 2427 return NULL;
2201 2428
@@ -2204,7 +2431,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2204 2431
2205 skb_dst_set(skb, dst_clone(dst)); 2432 skb_dst_set(skb, dst_clone(dst));
2206 2433
2207 mss = dst_metric(dst, RTAX_ADVMSS); 2434 mss = dst_metric_advmss(dst);
2208 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2435 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2209 mss = tp->rx_opt.user_mss; 2436 mss = tp->rx_opt.user_mss;
2210 2437
@@ -2212,13 +2439,20 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2212 __u8 rcv_wscale; 2439 __u8 rcv_wscale;
2213 /* Set this up on the first call only */ 2440 /* Set this up on the first call only */
2214 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2441 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2442
2443 /* limit the window selection if the user enforces a smaller rx buffer */
2444 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2445 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2446 req->window_clamp = tcp_full_space(sk);
2447
2215 /* tcp_full_space because it is guaranteed to be the first packet */ 2448 /* tcp_full_space because it is guaranteed to be the first packet */
2216 tcp_select_initial_window(tcp_full_space(sk), 2449 tcp_select_initial_window(tcp_full_space(sk),
2217 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2450 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2218 &req->rcv_wnd, 2451 &req->rcv_wnd,
2219 &req->window_clamp, 2452 &req->window_clamp,
2220 ireq->wscale_ok, 2453 ireq->wscale_ok,
2221 &rcv_wscale); 2454 &rcv_wscale,
2455 dst_metric(dst, RTAX_INITRWND));
2222 ireq->rcv_wscale = rcv_wscale; 2456 ireq->rcv_wscale = rcv_wscale;
2223 } 2457 }
2224 2458
@@ -2230,8 +2464,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2230#endif 2464#endif
2231 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2465 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2232 tcp_header_size = tcp_synack_options(sk, req, mss, 2466 tcp_header_size = tcp_synack_options(sk, req, mss,
2233 skb, &opts, &md5) + 2467 skb, &opts, &md5, xvp)
2234 sizeof(struct tcphdr); 2468 + sizeof(*th);
2235 2469
2236 skb_push(skb, tcp_header_size); 2470 skb_push(skb, tcp_header_size);
2237 skb_reset_transport_header(skb); 2471 skb_reset_transport_header(skb);
@@ -2247,30 +2481,64 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2247 * not even correctly set) 2481 * not even correctly set)
2248 */ 2482 */
2249 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, 2483 tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
2250 TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); 2484 TCPHDR_SYN | TCPHDR_ACK);
2485
2486 if (OPTION_COOKIE_EXTENSION & opts.options) {
2487 if (s_data_desired) {
2488 u8 *buf = skb_put(skb, s_data_desired);
2489
2490 /* copy data directly from the listening socket. */
2491 memcpy(buf, cvp->s_data_payload, s_data_desired);
2492 TCP_SKB_CB(skb)->end_seq += s_data_desired;
2493 }
2494
2495 if (opts.hash_size > 0) {
2496 __u32 workspace[SHA_WORKSPACE_WORDS];
2497 u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
2498 u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
2499
2500 /* Secret recipe depends on the Timestamp, (future)
2501 * Sequence and Acknowledgment Numbers, Initiator
2502 * Cookie, and others handled by IP variant caller.
2503 */
2504 *tail-- ^= opts.tsval;
2505 *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
2506 *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
2507
2508 /* recommended */
2509 *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
2510 *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
2511
2512 sha_transform((__u32 *)&xvp->cookie_bakery[0],
2513 (char *)mess,
2514 &workspace[0]);
2515 opts.hash_location =
2516 (__u8 *)&xvp->cookie_bakery[0];
2517 }
2518 }
2519
2251 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2520 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2252 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2521 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2253 2522
2254 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2523 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2255 th->window = htons(min(req->rcv_wnd, 65535U)); 2524 th->window = htons(min(req->rcv_wnd, 65535U));
2256 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 2525 tcp_options_write((__be32 *)(th + 1), tp, &opts);
2257 th->doff = (tcp_header_size >> 2); 2526 th->doff = (tcp_header_size >> 2);
2258 TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); 2527 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
2259 2528
2260#ifdef CONFIG_TCP_MD5SIG 2529#ifdef CONFIG_TCP_MD5SIG
2261 /* Okay, we have all we need - do the md5 hash if needed */ 2530 /* Okay, we have all we need - do the md5 hash if needed */
2262 if (md5) { 2531 if (md5) {
2263 tp->af_specific->calc_md5_hash(md5_hash_location, 2532 tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
2264 md5, NULL, req, skb); 2533 md5, NULL, req, skb);
2265 } 2534 }
2266#endif 2535#endif
2267 2536
2268 return skb; 2537 return skb;
2269} 2538}
2539EXPORT_SYMBOL(tcp_make_synack);
2270 2540
2271/* 2541/* Do all connect socket setups that can be done AF independent. */
2272 * Do all connect socket setups that can be done AF independent.
2273 */
2274static void tcp_connect_init(struct sock *sk) 2542static void tcp_connect_init(struct sock *sk)
2275{ 2543{
2276 struct dst_entry *dst = __sk_dst_get(sk); 2544 struct dst_entry *dst = __sk_dst_get(sk);
@@ -2297,18 +2565,24 @@ static void tcp_connect_init(struct sock *sk)
2297 2565
2298 if (!tp->window_clamp) 2566 if (!tp->window_clamp)
2299 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2567 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2300 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 2568 tp->advmss = dst_metric_advmss(dst);
2301 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) 2569 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2302 tp->advmss = tp->rx_opt.user_mss; 2570 tp->advmss = tp->rx_opt.user_mss;
2303 2571
2304 tcp_initialize_rcv_mss(sk); 2572 tcp_initialize_rcv_mss(sk);
2305 2573
2574 /* limit the window selection if the user enforces a smaller rx buffer */
2575 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2576 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2577 tp->window_clamp = tcp_full_space(sk);
2578
2306 tcp_select_initial_window(tcp_full_space(sk), 2579 tcp_select_initial_window(tcp_full_space(sk),
2307 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2580 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2308 &tp->rcv_wnd, 2581 &tp->rcv_wnd,
2309 &tp->window_clamp, 2582 &tp->window_clamp,
2310 sysctl_tcp_window_scaling, 2583 sysctl_tcp_window_scaling,
2311 &rcv_wscale); 2584 &rcv_wscale,
2585 dst_metric(dst, RTAX_INITRWND));
2312 2586
2313 tp->rx_opt.rcv_wscale = rcv_wscale; 2587 tp->rx_opt.rcv_wscale = rcv_wscale;
2314 tp->rcv_ssthresh = tp->rcv_wnd; 2588 tp->rcv_ssthresh = tp->rcv_wnd;
@@ -2329,13 +2603,12 @@ static void tcp_connect_init(struct sock *sk)
2329 tcp_clear_retrans(tp); 2603 tcp_clear_retrans(tp);
2330} 2604}
2331 2605
2332/* 2606/* Build a SYN and send it off. */
2333 * Build a SYN and send it off.
2334 */
2335int tcp_connect(struct sock *sk) 2607int tcp_connect(struct sock *sk)
2336{ 2608{
2337 struct tcp_sock *tp = tcp_sk(sk); 2609 struct tcp_sock *tp = tcp_sk(sk);
2338 struct sk_buff *buff; 2610 struct sk_buff *buff;
2611 int err;
2339 2612
2340 tcp_connect_init(sk); 2613 tcp_connect_init(sk);
2341 2614
@@ -2347,7 +2620,7 @@ int tcp_connect(struct sock *sk)
2347 skb_reserve(buff, MAX_TCP_HEADER); 2620 skb_reserve(buff, MAX_TCP_HEADER);
2348 2621
2349 tp->snd_nxt = tp->write_seq; 2622 tp->snd_nxt = tp->write_seq;
2350 tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); 2623 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
2351 TCP_ECN_send_syn(sk, buff); 2624 TCP_ECN_send_syn(sk, buff);
2352 2625
2353 /* Send it off. */ 2626 /* Send it off. */
@@ -2358,7 +2631,9 @@ int tcp_connect(struct sock *sk)
2358 sk->sk_wmem_queued += buff->truesize; 2631 sk->sk_wmem_queued += buff->truesize;
2359 sk_mem_charge(sk, buff->truesize); 2632 sk_mem_charge(sk, buff->truesize);
2360 tp->packets_out += tcp_skb_pcount(buff); 2633 tp->packets_out += tcp_skb_pcount(buff);
2361 tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); 2634 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2635 if (err == -ECONNREFUSED)
2636 return err;
2362 2637
2363 /* We change tp->snd_nxt after the tcp_transmit_skb() call 2638 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2364 * in order to make this packet get counted in tcpOutSegs. 2639 * in order to make this packet get counted in tcpOutSegs.
@@ -2372,6 +2647,7 @@ int tcp_connect(struct sock *sk)
2372 inet_csk(sk)->icsk_rto, TCP_RTO_MAX); 2647 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
2373 return 0; 2648 return 0;
2374} 2649}
2650EXPORT_SYMBOL(tcp_connect);
2375 2651
2376/* Send out a delayed ack, the caller does the policy checking 2652/* Send out a delayed ack, the caller does the policy checking
2377 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() 2653 * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
@@ -2453,7 +2729,7 @@ void tcp_send_ack(struct sock *sk)
2453 2729
2454 /* Reserve space for headers and prepare control bits. */ 2730 /* Reserve space for headers and prepare control bits. */
2455 skb_reserve(buff, MAX_TCP_HEADER); 2731 skb_reserve(buff, MAX_TCP_HEADER);
2456 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); 2732 tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
2457 2733
2458 /* Send it off, this clears delayed acks for us. */ 2734 /* Send it off, this clears delayed acks for us. */
2459 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2735 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2487,11 +2763,12 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
2487 * end to send an ack. Don't queue or clone SKB, just 2763 * end to send an ack. Don't queue or clone SKB, just
2488 * send it. 2764 * send it.
2489 */ 2765 */
2490 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); 2766 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
2491 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2767 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2492 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 2768 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
2493} 2769}
2494 2770
2771/* Initiate keepalive or window probe from timer. */
2495int tcp_write_wakeup(struct sock *sk) 2772int tcp_write_wakeup(struct sock *sk)
2496{ 2773{
2497 struct tcp_sock *tp = tcp_sk(sk); 2774 struct tcp_sock *tp = tcp_sk(sk);
@@ -2516,13 +2793,13 @@ int tcp_write_wakeup(struct sock *sk)
2516 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || 2793 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
2517 skb->len > mss) { 2794 skb->len > mss) {
2518 seg_size = min(seg_size, mss); 2795 seg_size = min(seg_size, mss);
2519 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2796 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
2520 if (tcp_fragment(sk, skb, seg_size, mss)) 2797 if (tcp_fragment(sk, skb, seg_size, mss))
2521 return -1; 2798 return -1;
2522 } else if (!tcp_skb_pcount(skb)) 2799 } else if (!tcp_skb_pcount(skb))
2523 tcp_set_skb_tso_segs(sk, skb, mss); 2800 tcp_set_skb_tso_segs(sk, skb, mss);
2524 2801
2525 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2802 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
2526 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2803 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2527 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2804 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2528 if (!err) 2805 if (!err)
@@ -2575,10 +2852,3 @@ void tcp_send_probe0(struct sock *sk)
2575 TCP_RTO_MAX); 2852 TCP_RTO_MAX);
2576 } 2853 }
2577} 2854}
2578
2579EXPORT_SYMBOL(tcp_select_initial_window);
2580EXPORT_SYMBOL(tcp_connect);
2581EXPORT_SYMBOL(tcp_make_synack);
2582EXPORT_SYMBOL(tcp_simple_retransmit);
2583EXPORT_SYMBOL(tcp_sync_mss);
2584EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 59f5b5e7c566..85ee7eb7e38e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -22,6 +22,7 @@
22#include <linux/kprobes.h> 22#include <linux/kprobes.h>
23#include <linux/socket.h> 23#include <linux/socket.h>
24#include <linux/tcp.h> 24#include <linux/tcp.h>
25#include <linux/slab.h>
25#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/ktime.h> 28#include <linux/ktime.h>
@@ -39,9 +40,9 @@ static int port __read_mostly = 0;
39MODULE_PARM_DESC(port, "Port to match (0=all)"); 40MODULE_PARM_DESC(port, "Port to match (0=all)");
40module_param(port, int, 0); 41module_param(port, int, 0);
41 42
42static int bufsize __read_mostly = 4096; 43static unsigned int bufsize __read_mostly = 4096;
43MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); 44MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
44module_param(bufsize, int, 0); 45module_param(bufsize, uint, 0);
45 46
46static int full __read_mostly; 47static int full __read_mostly;
47MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); 48MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
@@ -75,12 +76,12 @@ static struct {
75 76
76static inline int tcp_probe_used(void) 77static inline int tcp_probe_used(void)
77{ 78{
78 return (tcp_probe.head - tcp_probe.tail) % bufsize; 79 return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
79} 80}
80 81
81static inline int tcp_probe_avail(void) 82static inline int tcp_probe_avail(void)
82{ 83{
83 return bufsize - tcp_probe_used(); 84 return bufsize - tcp_probe_used() - 1;
84} 85}
85 86
86/* 87/*
@@ -94,8 +95,9 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
94 const struct inet_sock *inet = inet_sk(sk); 95 const struct inet_sock *inet = inet_sk(sk);
95 96
96 /* Only update if port matches */ 97 /* Only update if port matches */
97 if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) 98 if ((port == 0 || ntohs(inet->inet_dport) == port ||
98 && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { 99 ntohs(inet->inet_sport) == port) &&
100 (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
99 101
100 spin_lock(&tcp_probe.lock); 102 spin_lock(&tcp_probe.lock);
101 /* If log fills, just silently drop */ 103 /* If log fills, just silently drop */
@@ -103,10 +105,10 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
103 struct tcp_log *p = tcp_probe.log + tcp_probe.head; 105 struct tcp_log *p = tcp_probe.log + tcp_probe.head;
104 106
105 p->tstamp = ktime_get(); 107 p->tstamp = ktime_get();
106 p->saddr = inet->saddr; 108 p->saddr = inet->inet_saddr;
107 p->sport = inet->sport; 109 p->sport = inet->inet_sport;
108 p->daddr = inet->daddr; 110 p->daddr = inet->inet_daddr;
109 p->dport = inet->dport; 111 p->dport = inet->inet_dport;
110 p->length = skb->len; 112 p->length = skb->len;
111 p->snd_nxt = tp->snd_nxt; 113 p->snd_nxt = tp->snd_nxt;
112 p->snd_una = tp->snd_una; 114 p->snd_una = tp->snd_una;
@@ -115,7 +117,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
115 p->ssthresh = tcp_current_ssthresh(sk); 117 p->ssthresh = tcp_current_ssthresh(sk);
116 p->srtt = tp->srtt >> 3; 118 p->srtt = tp->srtt >> 3;
117 119
118 tcp_probe.head = (tcp_probe.head + 1) % bufsize; 120 tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
119 } 121 }
120 tcp_probe.lastcwnd = tp->snd_cwnd; 122 tcp_probe.lastcwnd = tp->snd_cwnd;
121 spin_unlock(&tcp_probe.lock); 123 spin_unlock(&tcp_probe.lock);
@@ -148,11 +150,11 @@ static int tcpprobe_open(struct inode * inode, struct file * file)
148static int tcpprobe_sprint(char *tbuf, int n) 150static int tcpprobe_sprint(char *tbuf, int n)
149{ 151{
150 const struct tcp_log *p 152 const struct tcp_log *p
151 = tcp_probe.log + tcp_probe.tail % bufsize; 153 = tcp_probe.log + tcp_probe.tail;
152 struct timespec tv 154 struct timespec tv
153 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
154 156
155 return snprintf(tbuf, n, 157 return scnprintf(tbuf, n,
156 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", 158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
157 (unsigned long) tv.tv_sec, 159 (unsigned long) tv.tv_sec,
158 (unsigned long) tv.tv_nsec, 160 (unsigned long) tv.tv_nsec,
@@ -172,7 +174,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
172 return -EINVAL; 174 return -EINVAL;
173 175
174 while (cnt < len) { 176 while (cnt < len) {
175 char tbuf[128]; 177 char tbuf[164];
176 int width; 178 int width;
177 179
178 /* Wait for data in buffer */ 180 /* Wait for data in buffer */
@@ -191,7 +193,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
191 width = tcpprobe_sprint(tbuf, sizeof(tbuf)); 193 width = tcpprobe_sprint(tbuf, sizeof(tbuf));
192 194
193 if (cnt + width < len) 195 if (cnt + width < len)
194 tcp_probe.tail = (tcp_probe.tail + 1) % bufsize; 196 tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
195 197
196 spin_unlock_bh(&tcp_probe.lock); 198 spin_unlock_bh(&tcp_probe.lock);
197 199
@@ -212,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
212 .owner = THIS_MODULE, 214 .owner = THIS_MODULE,
213 .open = tcpprobe_open, 215 .open = tcpprobe_open,
214 .read = tcpprobe_read, 216 .read = tcpprobe_read,
217 .llseek = noop_llseek,
215}; 218};
216 219
217static __init int tcpprobe_init(void) 220static __init int tcpprobe_init(void)
@@ -221,9 +224,10 @@ static __init int tcpprobe_init(void)
221 init_waitqueue_head(&tcp_probe.wait); 224 init_waitqueue_head(&tcp_probe.wait);
222 spin_lock_init(&tcp_probe.lock); 225 spin_lock_init(&tcp_probe.lock);
223 226
224 if (bufsize < 0) 227 if (bufsize == 0)
225 return -EINVAL; 228 return -EINVAL;
226 229
230 bufsize = roundup_pow_of_two(bufsize);
227 tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); 231 tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
228 if (!tcp_probe.log) 232 if (!tcp_probe.log)
229 goto err0; 233 goto err0;
@@ -235,7 +239,7 @@ static __init int tcpprobe_init(void)
235 if (ret) 239 if (ret)
236 goto err1; 240 goto err1;
237 241
238 pr_info("TCP probe registered (port=%d)\n", port); 242 pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
239 return 0; 243 return 0;
240 err1: 244 err1:
241 proc_net_remove(&init_net, procname); 245 proc_net_remove(&init_net, procname);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b144a26359bc..74a6aa003657 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/gfp.h>
22#include <net/tcp.h> 23#include <net/tcp.h>
23 24
24int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; 25int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
@@ -29,6 +30,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
29int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; 30int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
30int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; 31int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
31int sysctl_tcp_orphan_retries __read_mostly; 32int sysctl_tcp_orphan_retries __read_mostly;
33int sysctl_tcp_thin_linear_timeouts __read_mostly;
32 34
33static void tcp_write_timer(unsigned long); 35static void tcp_write_timer(unsigned long);
34static void tcp_delack_timer(unsigned long); 36static void tcp_delack_timer(unsigned long);
@@ -39,7 +41,6 @@ void tcp_init_xmit_timers(struct sock *sk)
39 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, 41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
40 &tcp_keepalive_timer); 42 &tcp_keepalive_timer);
41} 43}
42
43EXPORT_SYMBOL(tcp_init_xmit_timers); 44EXPORT_SYMBOL(tcp_init_xmit_timers);
44 45
45static void tcp_write_err(struct sock *sk) 46static void tcp_write_err(struct sock *sk)
@@ -65,18 +66,18 @@ static void tcp_write_err(struct sock *sk)
65static int tcp_out_of_resources(struct sock *sk, int do_reset) 66static int tcp_out_of_resources(struct sock *sk, int do_reset)
66{ 67{
67 struct tcp_sock *tp = tcp_sk(sk); 68 struct tcp_sock *tp = tcp_sk(sk);
68 int orphans = percpu_counter_read_positive(&tcp_orphan_count); 69 int shift = 0;
69 70
70 /* If peer does not open window for long time, or did not transmit 71 /* If peer does not open window for long time, or did not transmit
71 * anything for long time, penalize it. */ 72 * anything for long time, penalize it. */
72 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) 73 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
73 orphans <<= 1; 74 shift++;
74 75
75 /* If some dubious ICMP arrived, penalize even more. */ 76 /* If some dubious ICMP arrived, penalize even more. */
76 if (sk->sk_err_soft) 77 if (sk->sk_err_soft)
77 orphans <<= 1; 78 shift++;
78 79
79 if (tcp_too_many_orphans(sk, orphans)) { 80 if (tcp_too_many_orphans(sk, shift)) {
80 if (net_ratelimit()) 81 if (net_ratelimit())
81 printk(KERN_INFO "Out of socket memory\n"); 82 printk(KERN_INFO "Out of socket memory\n");
82 83
@@ -132,22 +133,57 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
132 } 133 }
133} 134}
134 135
136/* This function calculates a "timeout" which is equivalent to the timeout of a
137 * TCP connection after "boundary" unsuccessful, exponentially backed-off
138 * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
139 * syn_set flag is set.
140 */
141static bool retransmits_timed_out(struct sock *sk,
142 unsigned int boundary,
143 unsigned int timeout,
144 bool syn_set)
145{
146 unsigned int linear_backoff_thresh, start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
148
149 if (!inet_csk(sk)->icsk_retransmits)
150 return false;
151
152 if (unlikely(!tcp_sk(sk)->retrans_stamp))
153 start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when;
154 else
155 start_ts = tcp_sk(sk)->retrans_stamp;
156
157 if (likely(timeout == 0)) {
158 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
159
160 if (boundary <= linear_backoff_thresh)
161 timeout = ((2 << boundary) - 1) * rto_base;
162 else
163 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
164 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
165 }
166 return (tcp_time_stamp - start_ts) >= timeout;
167}
168
135/* A write timeout has occurred. Process the after effects. */ 169/* A write timeout has occurred. Process the after effects. */
136static int tcp_write_timeout(struct sock *sk) 170static int tcp_write_timeout(struct sock *sk)
137{ 171{
138 struct inet_connection_sock *icsk = inet_csk(sk); 172 struct inet_connection_sock *icsk = inet_csk(sk);
139 int retry_until; 173 int retry_until;
174 bool do_reset, syn_set = 0;
140 175
141 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 176 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
142 if (icsk->icsk_retransmits) 177 if (icsk->icsk_retransmits)
143 dst_negative_advice(&sk->sk_dst_cache); 178 dst_negative_advice(sk);
144 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 179 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
180 syn_set = 1;
145 } else { 181 } else {
146 if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { 182 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
147 /* Black hole detection */ 183 /* Black hole detection */
148 tcp_mtu_probing(icsk, sk); 184 tcp_mtu_probing(icsk, sk);
149 185
150 dst_negative_advice(&sk->sk_dst_cache); 186 dst_negative_advice(sk);
151 } 187 }
152 188
153 retry_until = sysctl_tcp_retries2; 189 retry_until = sysctl_tcp_retries2;
@@ -155,13 +191,16 @@ static int tcp_write_timeout(struct sock *sk)
155 const int alive = (icsk->icsk_rto < TCP_RTO_MAX); 191 const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
156 192
157 retry_until = tcp_orphan_retries(sk, alive); 193 retry_until = tcp_orphan_retries(sk, alive);
194 do_reset = alive ||
195 !retransmits_timed_out(sk, retry_until, 0, 0);
158 196
159 if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) 197 if (tcp_out_of_resources(sk, do_reset))
160 return 1; 198 return 1;
161 } 199 }
162 } 200 }
163 201
164 if (icsk->icsk_retransmits >= retry_until) { 202 if (retransmits_timed_out(sk, retry_until,
203 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
165 /* Has it gone just too far? */ 204 /* Has it gone just too far? */
166 tcp_write_err(sk); 205 tcp_write_err(sk);
167 return 1; 206 return 1;
@@ -279,7 +318,7 @@ static void tcp_probe_timer(struct sock *sk)
279 * The TCP retransmit timer. 318 * The TCP retransmit timer.
280 */ 319 */
281 320
282static void tcp_retransmit_timer(struct sock *sk) 321void tcp_retransmit_timer(struct sock *sk)
283{ 322{
284 struct tcp_sock *tp = tcp_sk(sk); 323 struct tcp_sock *tp = tcp_sk(sk);
285 struct inet_connection_sock *icsk = inet_csk(sk); 324 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -300,15 +339,15 @@ static void tcp_retransmit_timer(struct sock *sk)
300 struct inet_sock *inet = inet_sk(sk); 339 struct inet_sock *inet = inet_sk(sk);
301 if (sk->sk_family == AF_INET) { 340 if (sk->sk_family == AF_INET) {
302 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 341 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
303 &inet->daddr, ntohs(inet->dport), 342 &inet->inet_daddr, ntohs(inet->inet_dport),
304 inet->num, tp->snd_una, tp->snd_nxt); 343 inet->inet_num, tp->snd_una, tp->snd_nxt);
305 } 344 }
306#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 345#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
307 else if (sk->sk_family == AF_INET6) { 346 else if (sk->sk_family == AF_INET6) {
308 struct ipv6_pinfo *np = inet6_sk(sk); 347 struct ipv6_pinfo *np = inet6_sk(sk);
309 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 348 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
310 &np->daddr, ntohs(inet->dport), 349 &np->daddr, ntohs(inet->inet_dport),
311 inet->num, tp->snd_una, tp->snd_nxt); 350 inet->inet_num, tp->snd_una, tp->snd_nxt);
312 } 351 }
313#endif 352#endif
314#endif 353#endif
@@ -328,18 +367,19 @@ static void tcp_retransmit_timer(struct sock *sk)
328 if (icsk->icsk_retransmits == 0) { 367 if (icsk->icsk_retransmits == 0) {
329 int mib_idx; 368 int mib_idx;
330 369
331 if (icsk->icsk_ca_state == TCP_CA_Disorder) { 370 if (icsk->icsk_ca_state == TCP_CA_Recovery) {
332 if (tcp_is_sack(tp))
333 mib_idx = LINUX_MIB_TCPSACKFAILURES;
334 else
335 mib_idx = LINUX_MIB_TCPRENOFAILURES;
336 } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
337 if (tcp_is_sack(tp)) 371 if (tcp_is_sack(tp))
338 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; 372 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
339 else 373 else
340 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; 374 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
341 } else if (icsk->icsk_ca_state == TCP_CA_Loss) { 375 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
342 mib_idx = LINUX_MIB_TCPLOSSFAILURES; 376 mib_idx = LINUX_MIB_TCPLOSSFAILURES;
377 } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
378 tp->sacked_out) {
379 if (tcp_is_sack(tp))
380 mib_idx = LINUX_MIB_TCPSACKFAILURES;
381 else
382 mib_idx = LINUX_MIB_TCPRENOFAILURES;
343 } else { 383 } else {
344 mib_idx = LINUX_MIB_TCPTIMEOUTS; 384 mib_idx = LINUX_MIB_TCPTIMEOUTS;
345 } 385 }
@@ -383,9 +423,27 @@ static void tcp_retransmit_timer(struct sock *sk)
383 icsk->icsk_retransmits++; 423 icsk->icsk_retransmits++;
384 424
385out_reset_timer: 425out_reset_timer:
386 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 426 /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
427 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
428 * might be increased if the stream oscillates between thin and thick,
429 * thus the old value might already be too high compared to the value
430 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
431 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
432 * exponential backoff behaviour to avoid continue hammering
433 * linear-timeout retransmissions into a black hole
434 */
435 if (sk->sk_state == TCP_ESTABLISHED &&
436 (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
437 tcp_stream_is_thin(tp) &&
438 icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
439 icsk->icsk_backoff = 0;
440 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
441 } else {
442 /* Use normal (exponential) backoff */
443 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
444 }
387 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 445 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
388 if (icsk->icsk_retransmits > sysctl_tcp_retries1) 446 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
389 __sk_dst_reset(sk); 447 __sk_dst_reset(sk);
390 448
391out:; 449out:;
@@ -442,6 +500,12 @@ static void tcp_synack_timer(struct sock *sk)
442 TCP_TIMEOUT_INIT, TCP_RTO_MAX); 500 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
443} 501}
444 502
503void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
504{
505 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
506}
507EXPORT_SYMBOL(tcp_syn_ack_timeout);
508
445void tcp_set_keepalive(struct sock *sk, int val) 509void tcp_set_keepalive(struct sock *sk, int val)
446{ 510{
447 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) 511 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
@@ -459,7 +523,7 @@ static void tcp_keepalive_timer (unsigned long data)
459 struct sock *sk = (struct sock *) data; 523 struct sock *sk = (struct sock *) data;
460 struct inet_connection_sock *icsk = inet_csk(sk); 524 struct inet_connection_sock *icsk = inet_csk(sk);
461 struct tcp_sock *tp = tcp_sk(sk); 525 struct tcp_sock *tp = tcp_sk(sk);
462 __u32 elapsed; 526 u32 elapsed;
463 527
464 /* Only process if socket is not in use. */ 528 /* Only process if socket is not in use. */
465 bh_lock_sock(sk); 529 bh_lock_sock(sk);
@@ -496,11 +560,17 @@ static void tcp_keepalive_timer (unsigned long data)
496 if (tp->packets_out || tcp_send_head(sk)) 560 if (tp->packets_out || tcp_send_head(sk))
497 goto resched; 561 goto resched;
498 562
499 elapsed = tcp_time_stamp - tp->rcv_tstamp; 563 elapsed = keepalive_time_elapsed(tp);
500 564
501 if (elapsed >= keepalive_time_when(tp)) { 565 if (elapsed >= keepalive_time_when(tp)) {
502 if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || 566 /* If the TCP_USER_TIMEOUT option is enabled, use that
503 (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { 567 * to determine when to timeout instead.
568 */
569 if ((icsk->icsk_user_timeout != 0 &&
570 elapsed >= icsk->icsk_user_timeout &&
571 icsk->icsk_probes_out > 0) ||
572 (icsk->icsk_user_timeout == 0 &&
573 icsk->icsk_probes_out >= keepalive_probes(tp))) {
504 tcp_send_active_reset(sk, GFP_ATOMIC); 574 tcp_send_active_reset(sk, GFP_ATOMIC);
505 tcp_write_err(sk); 575 tcp_write_err(sk);
506 goto out; 576 goto out;
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index e9bbff746488..38bc0b52d745 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -6,7 +6,7 @@
6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." 6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
7 * IEEE Journal on Selected Areas in Communication, 7 * IEEE Journal on Selected Areas in Communication,
8 * Feb. 2003. 8 * Feb. 2003.
9 * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf 9 * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
@@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
165 * every other rtt. 165 * every other rtt.
166 */ 166 */
167 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 167 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
168 if (veno->inc 168 if (veno->inc &&
169 && tp->snd_cwnd < 169 tp->snd_cwnd < tp->snd_cwnd_clamp) {
170 tp->snd_cwnd_clamp) {
171 tp->snd_cwnd++; 170 tp->snd_cwnd++;
172 veno->inc = 0; 171 veno->inc = 0;
173 } else 172 } else
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..a534dda5456e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
80 */ 80 */
81static inline u32 westwood_do_filter(u32 a, u32 b) 81static inline u32 westwood_do_filter(u32 a, u32 b)
82{ 82{
83 return (((7 * a) + b) >> 3); 83 return ((7 * a) + b) >> 3;
84} 84}
85 85
86static void westwood_filter(struct westwood *w, u32 delta) 86static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 66b6821b984e..a0f240358892 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
157 157
158 if (queue > TCP_YEAH_ALPHA || 158 if (queue > TCP_YEAH_ALPHA ||
159 rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { 159 rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
160 if (queue > TCP_YEAH_ALPHA 160 if (queue > TCP_YEAH_ALPHA &&
161 && tp->snd_cwnd > yeah->reno_count) { 161 tp->snd_cwnd > yeah->reno_count) {
162 u32 reduction = min(queue / TCP_YEAH_GAMMA , 162 u32 reduction = min(queue / TCP_YEAH_GAMMA ,
163 tp->snd_cwnd >> TCP_YEAH_EPSILON); 163 tp->snd_cwnd >> TCP_YEAH_EPSILON);
164 164
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index cb1f0e83830b..ac3b3ee4b07c 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -8,37 +8,43 @@
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/netdevice.h> 9#include <linux/netdevice.h>
10#include <linux/skbuff.h> 10#include <linux/skbuff.h>
11#include <linux/slab.h>
11#include <net/icmp.h> 12#include <net/icmp.h>
12#include <net/ip.h> 13#include <net/ip.h>
13#include <net/protocol.h> 14#include <net/protocol.h>
14#include <net/xfrm.h> 15#include <net/xfrm.h>
15 16
16static struct xfrm_tunnel *tunnel4_handlers; 17static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
17static struct xfrm_tunnel *tunnel64_handlers; 18static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
18static DEFINE_MUTEX(tunnel4_mutex); 19static DEFINE_MUTEX(tunnel4_mutex);
19 20
20static inline struct xfrm_tunnel **fam_handlers(unsigned short family) 21static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
21{ 22{
22 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; 23 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
23} 24}
24 25
25int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) 26int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
26{ 27{
27 struct xfrm_tunnel **pprev; 28 struct xfrm_tunnel __rcu **pprev;
29 struct xfrm_tunnel *t;
30
28 int ret = -EEXIST; 31 int ret = -EEXIST;
29 int priority = handler->priority; 32 int priority = handler->priority;
30 33
31 mutex_lock(&tunnel4_mutex); 34 mutex_lock(&tunnel4_mutex);
32 35
33 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 36 for (pprev = fam_handlers(family);
34 if ((*pprev)->priority > priority) 37 (t = rcu_dereference_protected(*pprev,
38 lockdep_is_held(&tunnel4_mutex))) != NULL;
39 pprev = &t->next) {
40 if (t->priority > priority)
35 break; 41 break;
36 if ((*pprev)->priority == priority) 42 if (t->priority == priority)
37 goto err; 43 goto err;
38 } 44 }
39 45
40 handler->next = *pprev; 46 handler->next = *pprev;
41 *pprev = handler; 47 rcu_assign_pointer(*pprev, handler);
42 48
43 ret = 0; 49 ret = 0;
44 50
@@ -47,18 +53,21 @@ err:
47 53
48 return ret; 54 return ret;
49} 55}
50
51EXPORT_SYMBOL(xfrm4_tunnel_register); 56EXPORT_SYMBOL(xfrm4_tunnel_register);
52 57
53int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) 58int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
54{ 59{
55 struct xfrm_tunnel **pprev; 60 struct xfrm_tunnel __rcu **pprev;
61 struct xfrm_tunnel *t;
56 int ret = -ENOENT; 62 int ret = -ENOENT;
57 63
58 mutex_lock(&tunnel4_mutex); 64 mutex_lock(&tunnel4_mutex);
59 65
60 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 66 for (pprev = fam_handlers(family);
61 if (*pprev == handler) { 67 (t = rcu_dereference_protected(*pprev,
68 lockdep_is_held(&tunnel4_mutex))) != NULL;
69 pprev = &t->next) {
70 if (t == handler) {
62 *pprev = handler->next; 71 *pprev = handler->next;
63 ret = 0; 72 ret = 0;
64 break; 73 break;
@@ -71,9 +80,13 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
71 80
72 return ret; 81 return ret;
73} 82}
74
75EXPORT_SYMBOL(xfrm4_tunnel_deregister); 83EXPORT_SYMBOL(xfrm4_tunnel_deregister);
76 84
85#define for_each_tunnel_rcu(head, handler) \
86 for (handler = rcu_dereference(head); \
87 handler != NULL; \
88 handler = rcu_dereference(handler->next)) \
89
77static int tunnel4_rcv(struct sk_buff *skb) 90static int tunnel4_rcv(struct sk_buff *skb)
78{ 91{
79 struct xfrm_tunnel *handler; 92 struct xfrm_tunnel *handler;
@@ -81,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
81 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 94 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
82 goto drop; 95 goto drop;
83 96
84 for (handler = tunnel4_handlers; handler; handler = handler->next) 97 for_each_tunnel_rcu(tunnel4_handlers, handler)
85 if (!handler->handler(skb)) 98 if (!handler->handler(skb))
86 return 0; 99 return 0;
87 100
@@ -100,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
100 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 113 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
101 goto drop; 114 goto drop;
102 115
103 for (handler = tunnel64_handlers; handler; handler = handler->next) 116 for_each_tunnel_rcu(tunnel64_handlers, handler)
104 if (!handler->handler(skb)) 117 if (!handler->handler(skb))
105 return 0; 118 return 0;
106 119
@@ -116,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
116{ 129{
117 struct xfrm_tunnel *handler; 130 struct xfrm_tunnel *handler;
118 131
119 for (handler = tunnel4_handlers; handler; handler = handler->next) 132 for_each_tunnel_rcu(tunnel4_handlers, handler)
120 if (!handler->err_handler(skb, info)) 133 if (!handler->err_handler(skb, info))
121 break; 134 break;
122} 135}
@@ -126,13 +139,13 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
126{ 139{
127 struct xfrm_tunnel *handler; 140 struct xfrm_tunnel *handler;
128 141
129 for (handler = tunnel64_handlers; handler; handler = handler->next) 142 for_each_tunnel_rcu(tunnel64_handlers, handler)
130 if (!handler->err_handler(skb, info)) 143 if (!handler->err_handler(skb, info))
131 break; 144 break;
132} 145}
133#endif 146#endif
134 147
135static struct net_protocol tunnel4_protocol = { 148static const struct net_protocol tunnel4_protocol = {
136 .handler = tunnel4_rcv, 149 .handler = tunnel4_rcv,
137 .err_handler = tunnel4_err, 150 .err_handler = tunnel4_err,
138 .no_policy = 1, 151 .no_policy = 1,
@@ -140,7 +153,7 @@ static struct net_protocol tunnel4_protocol = {
140}; 153};
141 154
142#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 155#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
143static struct net_protocol tunnel64_protocol = { 156static const struct net_protocol tunnel64_protocol = {
144 .handler = tunnel64_rcv, 157 .handler = tunnel64_rcv,
145 .err_handler = tunnel64_err, 158 .err_handler = tunnel64_err,
146 .no_policy = 1, 159 .no_policy = 1,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 80e3812837ad..8157b17959ee 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,6 +95,7 @@
95#include <linux/mm.h> 95#include <linux/mm.h>
96#include <linux/inet.h> 96#include <linux/inet.h>
97#include <linux/netdevice.h> 97#include <linux/netdevice.h>
98#include <linux/slab.h>
98#include <net/tcp_states.h> 99#include <net/tcp_states.h>
99#include <linux/skbuff.h> 100#include <linux/skbuff.h>
100#include <linux/proc_fs.h> 101#include <linux/proc_fs.h>
@@ -106,42 +107,45 @@
106#include <net/xfrm.h> 107#include <net/xfrm.h>
107#include "udp_impl.h" 108#include "udp_impl.h"
108 109
109struct udp_table udp_table; 110struct udp_table udp_table __read_mostly;
110EXPORT_SYMBOL(udp_table); 111EXPORT_SYMBOL(udp_table);
111 112
112int sysctl_udp_mem[3] __read_mostly; 113long sysctl_udp_mem[3] __read_mostly;
113int sysctl_udp_rmem_min __read_mostly;
114int sysctl_udp_wmem_min __read_mostly;
115
116EXPORT_SYMBOL(sysctl_udp_mem); 114EXPORT_SYMBOL(sysctl_udp_mem);
115
116int sysctl_udp_rmem_min __read_mostly;
117EXPORT_SYMBOL(sysctl_udp_rmem_min); 117EXPORT_SYMBOL(sysctl_udp_rmem_min);
118
119int sysctl_udp_wmem_min __read_mostly;
118EXPORT_SYMBOL(sysctl_udp_wmem_min); 120EXPORT_SYMBOL(sysctl_udp_wmem_min);
119 121
120atomic_t udp_memory_allocated; 122atomic_long_t udp_memory_allocated;
121EXPORT_SYMBOL(udp_memory_allocated); 123EXPORT_SYMBOL(udp_memory_allocated);
122 124
123#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) 125#define MAX_UDP_PORTS 65536
126#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
124 127
125static int udp_lib_lport_inuse(struct net *net, __u16 num, 128static int udp_lib_lport_inuse(struct net *net, __u16 num,
126 const struct udp_hslot *hslot, 129 const struct udp_hslot *hslot,
127 unsigned long *bitmap, 130 unsigned long *bitmap,
128 struct sock *sk, 131 struct sock *sk,
129 int (*saddr_comp)(const struct sock *sk1, 132 int (*saddr_comp)(const struct sock *sk1,
130 const struct sock *sk2)) 133 const struct sock *sk2),
134 unsigned int log)
131{ 135{
132 struct sock *sk2; 136 struct sock *sk2;
133 struct hlist_nulls_node *node; 137 struct hlist_nulls_node *node;
134 138
135 sk_nulls_for_each(sk2, node, &hslot->head) 139 sk_nulls_for_each(sk2, node, &hslot->head)
136 if (net_eq(sock_net(sk2), net) && 140 if (net_eq(sock_net(sk2), net) &&
137 sk2 != sk && 141 sk2 != sk &&
138 (bitmap || sk2->sk_hash == num) && 142 (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
139 (!sk2->sk_reuse || !sk->sk_reuse) && 143 (!sk2->sk_reuse || !sk->sk_reuse) &&
140 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if 144 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
141 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 145 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
142 (*saddr_comp)(sk, sk2)) { 146 (*saddr_comp)(sk, sk2)) {
143 if (bitmap) 147 if (bitmap)
144 __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, 148 __set_bit(udp_sk(sk2)->udp_port_hash >> log,
145 bitmap); 149 bitmap);
146 else 150 else
147 return 1; 151 return 1;
@@ -149,18 +153,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
149 return 0; 153 return 0;
150} 154}
151 155
156/*
157 * Note: we still hold spinlock of primary hash chain, so no other writer
158 * can insert/delete a socket with local_port == num
159 */
160static int udp_lib_lport_inuse2(struct net *net, __u16 num,
161 struct udp_hslot *hslot2,
162 struct sock *sk,
163 int (*saddr_comp)(const struct sock *sk1,
164 const struct sock *sk2))
165{
166 struct sock *sk2;
167 struct hlist_nulls_node *node;
168 int res = 0;
169
170 spin_lock(&hslot2->lock);
171 udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
172 if (net_eq(sock_net(sk2), net) &&
173 sk2 != sk &&
174 (udp_sk(sk2)->udp_port_hash == num) &&
175 (!sk2->sk_reuse || !sk->sk_reuse) &&
176 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
177 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
178 (*saddr_comp)(sk, sk2)) {
179 res = 1;
180 break;
181 }
182 spin_unlock(&hslot2->lock);
183 return res;
184}
185
152/** 186/**
153 * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 187 * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
154 * 188 *
155 * @sk: socket struct in question 189 * @sk: socket struct in question
156 * @snum: port number to look up 190 * @snum: port number to look up
157 * @saddr_comp: AF-dependent comparison of bound local IP addresses 191 * @saddr_comp: AF-dependent comparison of bound local IP addresses
192 * @hash2_nulladdr: AF-dependant hash value in secondary hash chains,
193 * with NULL address
158 */ 194 */
159int udp_lib_get_port(struct sock *sk, unsigned short snum, 195int udp_lib_get_port(struct sock *sk, unsigned short snum,
160 int (*saddr_comp)(const struct sock *sk1, 196 int (*saddr_comp)(const struct sock *sk1,
161 const struct sock *sk2 ) ) 197 const struct sock *sk2),
198 unsigned int hash2_nulladdr)
162{ 199{
163 struct udp_hslot *hslot; 200 struct udp_hslot *hslot, *hslot2;
164 struct udp_table *udptable = sk->sk_prot->h.udp_table; 201 struct udp_table *udptable = sk->sk_prot->h.udp_table;
165 int error = 1; 202 int error = 1;
166 struct net *net = sock_net(sk); 203 struct net *net = sock_net(sk);
@@ -179,13 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
179 /* 216 /*
180 * force rand to be an odd multiple of UDP_HTABLE_SIZE 217 * force rand to be an odd multiple of UDP_HTABLE_SIZE
181 */ 218 */
182 rand = (rand | 1) * UDP_HTABLE_SIZE; 219 rand = (rand | 1) * (udptable->mask + 1);
183 for (last = first + UDP_HTABLE_SIZE; first != last; first++) { 220 last = first + udptable->mask + 1;
184 hslot = &udptable->hash[udp_hashfn(net, first)]; 221 do {
222 hslot = udp_hashslot(udptable, net, first);
185 bitmap_zero(bitmap, PORTS_PER_CHAIN); 223 bitmap_zero(bitmap, PORTS_PER_CHAIN);
186 spin_lock_bh(&hslot->lock); 224 spin_lock_bh(&hslot->lock);
187 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, 225 udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
188 saddr_comp); 226 saddr_comp, udptable->log);
189 227
190 snum = first; 228 snum = first;
191 /* 229 /*
@@ -195,25 +233,60 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
195 */ 233 */
196 do { 234 do {
197 if (low <= snum && snum <= high && 235 if (low <= snum && snum <= high &&
198 !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) 236 !test_bit(snum >> udptable->log, bitmap) &&
237 !inet_is_reserved_local_port(snum))
199 goto found; 238 goto found;
200 snum += rand; 239 snum += rand;
201 } while (snum != first); 240 } while (snum != first);
202 spin_unlock_bh(&hslot->lock); 241 spin_unlock_bh(&hslot->lock);
203 } 242 } while (++first != last);
204 goto fail; 243 goto fail;
205 } else { 244 } else {
206 hslot = &udptable->hash[udp_hashfn(net, snum)]; 245 hslot = udp_hashslot(udptable, net, snum);
207 spin_lock_bh(&hslot->lock); 246 spin_lock_bh(&hslot->lock);
208 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) 247 if (hslot->count > 10) {
248 int exist;
249 unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
250
251 slot2 &= udptable->mask;
252 hash2_nulladdr &= udptable->mask;
253
254 hslot2 = udp_hashslot2(udptable, slot2);
255 if (hslot->count < hslot2->count)
256 goto scan_primary_hash;
257
258 exist = udp_lib_lport_inuse2(net, snum, hslot2,
259 sk, saddr_comp);
260 if (!exist && (hash2_nulladdr != slot2)) {
261 hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
262 exist = udp_lib_lport_inuse2(net, snum, hslot2,
263 sk, saddr_comp);
264 }
265 if (exist)
266 goto fail_unlock;
267 else
268 goto found;
269 }
270scan_primary_hash:
271 if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
272 saddr_comp, 0))
209 goto fail_unlock; 273 goto fail_unlock;
210 } 274 }
211found: 275found:
212 inet_sk(sk)->num = snum; 276 inet_sk(sk)->inet_num = snum;
213 sk->sk_hash = snum; 277 udp_sk(sk)->udp_port_hash = snum;
278 udp_sk(sk)->udp_portaddr_hash ^= snum;
214 if (sk_unhashed(sk)) { 279 if (sk_unhashed(sk)) {
215 sk_nulls_add_node_rcu(sk, &hslot->head); 280 sk_nulls_add_node_rcu(sk, &hslot->head);
281 hslot->count++;
216 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 282 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
283
284 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
285 spin_lock(&hslot2->lock);
286 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
287 &hslot2->head);
288 hslot2->count++;
289 spin_unlock(&hslot2->lock);
217 } 290 }
218 error = 0; 291 error = 0;
219fail_unlock: 292fail_unlock:
@@ -221,19 +294,33 @@ fail_unlock:
221fail: 294fail:
222 return error; 295 return error;
223} 296}
297EXPORT_SYMBOL(udp_lib_get_port);
224 298
225static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) 299static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
226{ 300{
227 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); 301 struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
228 302
229 return ( !ipv6_only_sock(sk2) && 303 return (!ipv6_only_sock(sk2) &&
230 (!inet1->rcv_saddr || !inet2->rcv_saddr || 304 (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
231 inet1->rcv_saddr == inet2->rcv_saddr )); 305 inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
306}
307
308static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
309 unsigned int port)
310{
311 return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
232} 312}
233 313
234int udp_v4_get_port(struct sock *sk, unsigned short snum) 314int udp_v4_get_port(struct sock *sk, unsigned short snum)
235{ 315{
236 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); 316 unsigned int hash2_nulladdr =
317 udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
318 unsigned int hash2_partial =
319 udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
320
321 /* precompute partial secondary hash */
322 udp_sk(sk)->udp_portaddr_hash = hash2_partial;
323 return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
237} 324}
238 325
239static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, 326static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
@@ -242,23 +329,23 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
242{ 329{
243 int score = -1; 330 int score = -1;
244 331
245 if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && 332 if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
246 !ipv6_only_sock(sk)) { 333 !ipv6_only_sock(sk)) {
247 struct inet_sock *inet = inet_sk(sk); 334 struct inet_sock *inet = inet_sk(sk);
248 335
249 score = (sk->sk_family == PF_INET ? 1 : 0); 336 score = (sk->sk_family == PF_INET ? 1 : 0);
250 if (inet->rcv_saddr) { 337 if (inet->inet_rcv_saddr) {
251 if (inet->rcv_saddr != daddr) 338 if (inet->inet_rcv_saddr != daddr)
252 return -1; 339 return -1;
253 score += 2; 340 score += 2;
254 } 341 }
255 if (inet->daddr) { 342 if (inet->inet_daddr) {
256 if (inet->daddr != saddr) 343 if (inet->inet_daddr != saddr)
257 return -1; 344 return -1;
258 score += 2; 345 score += 2;
259 } 346 }
260 if (inet->dport) { 347 if (inet->inet_dport) {
261 if (inet->dport != sport) 348 if (inet->inet_dport != sport)
262 return -1; 349 return -1;
263 score += 2; 350 score += 2;
264 } 351 }
@@ -271,6 +358,89 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
271 return score; 358 return score;
272} 359}
273 360
361/*
362 * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
363 */
364#define SCORE2_MAX (1 + 2 + 2 + 2)
365static inline int compute_score2(struct sock *sk, struct net *net,
366 __be32 saddr, __be16 sport,
367 __be32 daddr, unsigned int hnum, int dif)
368{
369 int score = -1;
370
371 if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
372 struct inet_sock *inet = inet_sk(sk);
373
374 if (inet->inet_rcv_saddr != daddr)
375 return -1;
376 if (inet->inet_num != hnum)
377 return -1;
378
379 score = (sk->sk_family == PF_INET ? 1 : 0);
380 if (inet->inet_daddr) {
381 if (inet->inet_daddr != saddr)
382 return -1;
383 score += 2;
384 }
385 if (inet->inet_dport) {
386 if (inet->inet_dport != sport)
387 return -1;
388 score += 2;
389 }
390 if (sk->sk_bound_dev_if) {
391 if (sk->sk_bound_dev_if != dif)
392 return -1;
393 score += 2;
394 }
395 }
396 return score;
397}
398
399
400/* called with read_rcu_lock() */
401static struct sock *udp4_lib_lookup2(struct net *net,
402 __be32 saddr, __be16 sport,
403 __be32 daddr, unsigned int hnum, int dif,
404 struct udp_hslot *hslot2, unsigned int slot2)
405{
406 struct sock *sk, *result;
407 struct hlist_nulls_node *node;
408 int score, badness;
409
410begin:
411 result = NULL;
412 badness = -1;
413 udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
414 score = compute_score2(sk, net, saddr, sport,
415 daddr, hnum, dif);
416 if (score > badness) {
417 result = sk;
418 badness = score;
419 if (score == SCORE2_MAX)
420 goto exact_match;
421 }
422 }
423 /*
424 * if the nulls value we got at the end of this lookup is
425 * not the expected one, we must restart lookup.
426 * We probably met an item that was moved to another chain.
427 */
428 if (get_nulls_value(node) != slot2)
429 goto begin;
430
431 if (result) {
432exact_match:
433 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
434 result = NULL;
435 else if (unlikely(compute_score2(result, net, saddr, sport,
436 daddr, hnum, dif) < badness)) {
437 sock_put(result);
438 goto begin;
439 }
440 }
441 return result;
442}
443
274/* UDP is nearly always wildcards out the wazoo, it makes no sense to try 444/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
275 * harder than this. -DaveM 445 * harder than this. -DaveM
276 */ 446 */
@@ -281,11 +451,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
281 struct sock *sk, *result; 451 struct sock *sk, *result;
282 struct hlist_nulls_node *node; 452 struct hlist_nulls_node *node;
283 unsigned short hnum = ntohs(dport); 453 unsigned short hnum = ntohs(dport);
284 unsigned int hash = udp_hashfn(net, hnum); 454 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
285 struct udp_hslot *hslot = &udptable->hash[hash]; 455 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
286 int score, badness; 456 int score, badness;
287 457
288 rcu_read_lock(); 458 rcu_read_lock();
459 if (hslot->count > 10) {
460 hash2 = udp4_portaddr_hash(net, daddr, hnum);
461 slot2 = hash2 & udptable->mask;
462 hslot2 = &udptable->hash2[slot2];
463 if (hslot->count < hslot2->count)
464 goto begin;
465
466 result = udp4_lib_lookup2(net, saddr, sport,
467 daddr, hnum, dif,
468 hslot2, slot2);
469 if (!result) {
470 hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
471 slot2 = hash2 & udptable->mask;
472 hslot2 = &udptable->hash2[slot2];
473 if (hslot->count < hslot2->count)
474 goto begin;
475
476 result = udp4_lib_lookup2(net, saddr, sport,
477 htonl(INADDR_ANY), hnum, dif,
478 hslot2, slot2);
479 }
480 rcu_read_unlock();
481 return result;
482 }
289begin: 483begin:
290 result = NULL; 484 result = NULL;
291 badness = -1; 485 badness = -1;
@@ -302,11 +496,11 @@ begin:
302 * not the expected one, we must restart lookup. 496 * not the expected one, we must restart lookup.
303 * We probably met an item that was moved to another chain. 497 * We probably met an item that was moved to another chain.
304 */ 498 */
305 if (get_nulls_value(node) != hash) 499 if (get_nulls_value(node) != slot)
306 goto begin; 500 goto begin;
307 501
308 if (result) { 502 if (result) {
309 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 503 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
310 result = NULL; 504 result = NULL;
311 else if (unlikely(compute_score(result, net, saddr, hnum, sport, 505 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
312 daddr, dport, dif) < badness)) { 506 daddr, dport, dif) < badness)) {
@@ -352,12 +546,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
352 sk_nulls_for_each_from(s, node) { 546 sk_nulls_for_each_from(s, node) {
353 struct inet_sock *inet = inet_sk(s); 547 struct inet_sock *inet = inet_sk(s);
354 548
355 if (!net_eq(sock_net(s), net) || 549 if (!net_eq(sock_net(s), net) ||
356 s->sk_hash != hnum || 550 udp_sk(s)->udp_port_hash != hnum ||
357 (inet->daddr && inet->daddr != rmt_addr) || 551 (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
358 (inet->dport != rmt_port && inet->dport) || 552 (inet->inet_dport != rmt_port && inet->inet_dport) ||
359 (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || 553 (inet->inet_rcv_saddr &&
360 ipv6_only_sock(s) || 554 inet->inet_rcv_saddr != loc_addr) ||
555 ipv6_only_sock(s) ||
361 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) 556 (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
362 continue; 557 continue;
363 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) 558 if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
@@ -383,8 +578,8 @@ found:
383void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) 578void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
384{ 579{
385 struct inet_sock *inet; 580 struct inet_sock *inet;
386 struct iphdr *iph = (struct iphdr*)skb->data; 581 struct iphdr *iph = (struct iphdr *)skb->data;
387 struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); 582 struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
388 const int type = icmp_hdr(skb)->type; 583 const int type = icmp_hdr(skb)->type;
389 const int code = icmp_hdr(skb)->code; 584 const int code = icmp_hdr(skb)->code;
390 struct sock *sk; 585 struct sock *sk;
@@ -438,9 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
438 if (!inet->recverr) { 633 if (!inet->recverr) {
439 if (!harderr || sk->sk_state != TCP_ESTABLISHED) 634 if (!harderr || sk->sk_state != TCP_ESTABLISHED)
440 goto out; 635 goto out;
441 } else { 636 } else
442 ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); 637 ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
443 } 638
444 sk->sk_err = err; 639 sk->sk_err = err;
445 sk->sk_error_report(sk); 640 sk->sk_error_report(sk);
446out: 641out:
@@ -474,7 +669,7 @@ EXPORT_SYMBOL(udp_flush_pending_frames);
474 * (checksum field must be zeroed out) 669 * (checksum field must be zeroed out)
475 */ 670 */
476static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, 671static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
477 __be32 src, __be32 dst, int len ) 672 __be32 src, __be32 dst, int len)
478{ 673{
479 unsigned int offset; 674 unsigned int offset;
480 struct udphdr *uh = udp_hdr(skb); 675 struct udphdr *uh = udp_hdr(skb);
@@ -545,7 +740,7 @@ static int udp_push_pending_frames(struct sock *sk)
545 740
546 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ 741 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
547 742
548 udp4_hwcsum_outgoing(sk, skb, fl->fl4_src,fl->fl4_dst, up->len); 743 udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len);
549 goto send; 744 goto send;
550 745
551 } else /* `normal' UDP */ 746 } else /* `normal' UDP */
@@ -553,18 +748,24 @@ static int udp_push_pending_frames(struct sock *sk)
553 748
554 /* add protocol-dependent pseudo-header */ 749 /* add protocol-dependent pseudo-header */
555 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, 750 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
556 sk->sk_protocol, csum ); 751 sk->sk_protocol, csum);
557 if (uh->check == 0) 752 if (uh->check == 0)
558 uh->check = CSUM_MANGLED_0; 753 uh->check = CSUM_MANGLED_0;
559 754
560send: 755send:
561 err = ip_push_pending_frames(sk); 756 err = ip_push_pending_frames(sk);
757 if (err) {
758 if (err == -ENOBUFS && !inet->recverr) {
759 UDP_INC_STATS_USER(sock_net(sk),
760 UDP_MIB_SNDBUFERRORS, is_udplite);
761 err = 0;
762 }
763 } else
764 UDP_INC_STATS_USER(sock_net(sk),
765 UDP_MIB_OUTDATAGRAMS, is_udplite);
562out: 766out:
563 up->len = 0; 767 up->len = 0;
564 up->pending = 0; 768 up->pending = 0;
565 if (!err)
566 UDP_INC_STATS_USER(sock_net(sk),
567 UDP_MIB_OUTDATAGRAMS, is_udplite);
568 return err; 769 return err;
569} 770}
570 771
@@ -592,11 +793,11 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
592 * Check the flags. 793 * Check the flags.
593 */ 794 */
594 795
595 if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ 796 if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
596 return -EOPNOTSUPP; 797 return -EOPNOTSUPP;
597 798
598 ipc.opt = NULL; 799 ipc.opt = NULL;
599 ipc.shtx.flags = 0; 800 ipc.tx_flags = 0;
600 801
601 if (up->pending) { 802 if (up->pending) {
602 /* 803 /*
@@ -619,7 +820,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
619 * Get and verify the address. 820 * Get and verify the address.
620 */ 821 */
621 if (msg->msg_name) { 822 if (msg->msg_name) {
622 struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; 823 struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
623 if (msg->msg_namelen < sizeof(*usin)) 824 if (msg->msg_namelen < sizeof(*usin))
624 return -EINVAL; 825 return -EINVAL;
625 if (usin->sin_family != AF_INET) { 826 if (usin->sin_family != AF_INET) {
@@ -634,17 +835,17 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
634 } else { 835 } else {
635 if (sk->sk_state != TCP_ESTABLISHED) 836 if (sk->sk_state != TCP_ESTABLISHED)
636 return -EDESTADDRREQ; 837 return -EDESTADDRREQ;
637 daddr = inet->daddr; 838 daddr = inet->inet_daddr;
638 dport = inet->dport; 839 dport = inet->inet_dport;
639 /* Open fast path for connected socket. 840 /* Open fast path for connected socket.
640 Route will not be used, if at least one option is set. 841 Route will not be used, if at least one option is set.
641 */ 842 */
642 connected = 1; 843 connected = 1;
643 } 844 }
644 ipc.addr = inet->saddr; 845 ipc.addr = inet->inet_saddr;
645 846
646 ipc.oif = sk->sk_bound_dev_if; 847 ipc.oif = sk->sk_bound_dev_if;
647 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 848 err = sock_tx_timestamp(sk, &ipc.tx_flags);
648 if (err) 849 if (err)
649 return err; 850 return err;
650 if (msg->msg_controllen) { 851 if (msg->msg_controllen) {
@@ -684,19 +885,18 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
684 } 885 }
685 886
686 if (connected) 887 if (connected)
687 rt = (struct rtable*)sk_dst_check(sk, 0); 888 rt = (struct rtable *)sk_dst_check(sk, 0);
688 889
689 if (rt == NULL) { 890 if (rt == NULL) {
690 struct flowi fl = { .oif = ipc.oif, 891 struct flowi fl = { .oif = ipc.oif,
691 .nl_u = { .ip4_u = 892 .mark = sk->sk_mark,
692 { .daddr = faddr, 893 .fl4_dst = faddr,
693 .saddr = saddr, 894 .fl4_src = saddr,
694 .tos = tos } }, 895 .fl4_tos = tos,
695 .proto = sk->sk_protocol, 896 .proto = sk->sk_protocol,
696 .flags = inet_sk_flowi_flags(sk), 897 .flags = inet_sk_flowi_flags(sk),
697 .uli_u = { .ports = 898 .fl_ip_sport = inet->inet_sport,
698 { .sport = inet->sport, 899 .fl_ip_dport = dport };
699 .dport = dport } } };
700 struct net *net = sock_net(sk); 900 struct net *net = sock_net(sk);
701 901
702 security_sk_classify_flow(sk, &fl); 902 security_sk_classify_flow(sk, &fl);
@@ -712,7 +912,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
712 !sock_flag(sk, SOCK_BROADCAST)) 912 !sock_flag(sk, SOCK_BROADCAST))
713 goto out; 913 goto out;
714 if (connected) 914 if (connected)
715 sk_dst_set(sk, dst_clone(&rt->u.dst)); 915 sk_dst_set(sk, dst_clone(&rt->dst));
716 } 916 }
717 917
718 if (msg->msg_flags&MSG_CONFIRM) 918 if (msg->msg_flags&MSG_CONFIRM)
@@ -739,7 +939,7 @@ back_from_confirm:
739 inet->cork.fl.fl4_dst = daddr; 939 inet->cork.fl.fl4_dst = daddr;
740 inet->cork.fl.fl_ip_dport = dport; 940 inet->cork.fl.fl_ip_dport = dport;
741 inet->cork.fl.fl4_src = saddr; 941 inet->cork.fl.fl4_src = saddr;
742 inet->cork.fl.fl_ip_sport = inet->sport; 942 inet->cork.fl.fl_ip_sport = inet->inet_sport;
743 up->pending = AF_INET; 943 up->pending = AF_INET;
744 944
745do_append_data: 945do_append_data:
@@ -776,12 +976,13 @@ out:
776 return err; 976 return err;
777 977
778do_confirm: 978do_confirm:
779 dst_confirm(&rt->u.dst); 979 dst_confirm(&rt->dst);
780 if (!(msg->msg_flags&MSG_PROBE) || len) 980 if (!(msg->msg_flags&MSG_PROBE) || len)
781 goto back_from_confirm; 981 goto back_from_confirm;
782 err = 0; 982 err = 0;
783 goto out; 983 goto out;
784} 984}
985EXPORT_SYMBOL(udp_sendmsg);
785 986
786int udp_sendpage(struct sock *sk, struct page *page, int offset, 987int udp_sendpage(struct sock *sk, struct page *page, int offset,
787 size_t size, int flags) 988 size_t size, int flags)
@@ -831,6 +1032,44 @@ out:
831 return ret; 1032 return ret;
832} 1033}
833 1034
1035
1036/**
1037 * first_packet_length - return length of first packet in receive queue
1038 * @sk: socket
1039 *
1040 * Drops all bad checksum frames, until a valid one is found.
1041 * Returns the length of found skb, or 0 if none is found.
1042 */
1043static unsigned int first_packet_length(struct sock *sk)
1044{
1045 struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
1046 struct sk_buff *skb;
1047 unsigned int res;
1048
1049 __skb_queue_head_init(&list_kill);
1050
1051 spin_lock_bh(&rcvq->lock);
1052 while ((skb = skb_peek(rcvq)) != NULL &&
1053 udp_lib_checksum_complete(skb)) {
1054 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
1055 IS_UDPLITE(sk));
1056 atomic_inc(&sk->sk_drops);
1057 __skb_unlink(skb, rcvq);
1058 __skb_queue_tail(&list_kill, skb);
1059 }
1060 res = skb ? skb->len : 0;
1061 spin_unlock_bh(&rcvq->lock);
1062
1063 if (!skb_queue_empty(&list_kill)) {
1064 bool slow = lock_sock_fast(sk);
1065
1066 __skb_queue_purge(&list_kill);
1067 sk_mem_reclaim_partial(sk);
1068 unlock_sock_fast(sk, slow);
1069 }
1070 return res;
1071}
1072
834/* 1073/*
835 * IOCTL requests applicable to the UDP protocol 1074 * IOCTL requests applicable to the UDP protocol
836 */ 1075 */
@@ -847,21 +1086,16 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
847 1086
848 case SIOCINQ: 1087 case SIOCINQ:
849 { 1088 {
850 struct sk_buff *skb; 1089 unsigned int amount = first_packet_length(sk);
851 unsigned long amount;
852 1090
853 amount = 0; 1091 if (amount)
854 spin_lock_bh(&sk->sk_receive_queue.lock);
855 skb = skb_peek(&sk->sk_receive_queue);
856 if (skb != NULL) {
857 /* 1092 /*
858 * We will only return the amount 1093 * We will only return the amount
859 * of this packet since that is all 1094 * of this packet since that is all
860 * that will be read. 1095 * that will be read.
861 */ 1096 */
862 amount = skb->len - sizeof(struct udphdr); 1097 amount -= sizeof(struct udphdr);
863 } 1098
864 spin_unlock_bh(&sk->sk_receive_queue.lock);
865 return put_user(amount, (int __user *)arg); 1099 return put_user(amount, (int __user *)arg);
866 } 1100 }
867 1101
@@ -871,6 +1105,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
871 1105
872 return 0; 1106 return 0;
873} 1107}
1108EXPORT_SYMBOL(udp_ioctl);
874 1109
875/* 1110/*
876 * This should be easy, if there is something there we 1111 * This should be easy, if there is something there we
@@ -883,16 +1118,17 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
883 struct inet_sock *inet = inet_sk(sk); 1118 struct inet_sock *inet = inet_sk(sk);
884 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 1119 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
885 struct sk_buff *skb; 1120 struct sk_buff *skb;
886 unsigned int ulen, copied; 1121 unsigned int ulen;
887 int peeked; 1122 int peeked;
888 int err; 1123 int err;
889 int is_udplite = IS_UDPLITE(sk); 1124 int is_udplite = IS_UDPLITE(sk);
1125 bool slow;
890 1126
891 /* 1127 /*
892 * Check any passed addresses 1128 * Check any passed addresses
893 */ 1129 */
894 if (addr_len) 1130 if (addr_len)
895 *addr_len=sizeof(*sin); 1131 *addr_len = sizeof(*sin);
896 1132
897 if (flags & MSG_ERRQUEUE) 1133 if (flags & MSG_ERRQUEUE)
898 return ip_recv_error(sk, msg, len); 1134 return ip_recv_error(sk, msg, len);
@@ -904,10 +1140,9 @@ try_again:
904 goto out; 1140 goto out;
905 1141
906 ulen = skb->len - sizeof(struct udphdr); 1142 ulen = skb->len - sizeof(struct udphdr);
907 copied = len; 1143 if (len > ulen)
908 if (copied > ulen) 1144 len = ulen;
909 copied = ulen; 1145 else if (len < ulen)
910 else if (copied < ulen)
911 msg->msg_flags |= MSG_TRUNC; 1146 msg->msg_flags |= MSG_TRUNC;
912 1147
913 /* 1148 /*
@@ -916,16 +1151,18 @@ try_again:
916 * coverage checksum (UDP-Lite), do it before the copy. 1151 * coverage checksum (UDP-Lite), do it before the copy.
917 */ 1152 */
918 1153
919 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { 1154 if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
920 if (udp_lib_checksum_complete(skb)) 1155 if (udp_lib_checksum_complete(skb))
921 goto csum_copy_err; 1156 goto csum_copy_err;
922 } 1157 }
923 1158
924 if (skb_csum_unnecessary(skb)) 1159 if (skb_csum_unnecessary(skb))
925 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 1160 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
926 msg->msg_iov, copied ); 1161 msg->msg_iov, len);
927 else { 1162 else {
928 err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); 1163 err = skb_copy_and_csum_datagram_iovec(skb,
1164 sizeof(struct udphdr),
1165 msg->msg_iov);
929 1166
930 if (err == -EINVAL) 1167 if (err == -EINVAL)
931 goto csum_copy_err; 1168 goto csum_copy_err;
@@ -938,11 +1175,10 @@ try_again:
938 UDP_INC_STATS_USER(sock_net(sk), 1175 UDP_INC_STATS_USER(sock_net(sk),
939 UDP_MIB_INDATAGRAMS, is_udplite); 1176 UDP_MIB_INDATAGRAMS, is_udplite);
940 1177
941 sock_recv_timestamp(msg, sk, skb); 1178 sock_recv_ts_and_drops(msg, sk, skb);
942 1179
943 /* Copy the address. */ 1180 /* Copy the address. */
944 if (sin) 1181 if (sin) {
945 {
946 sin->sin_family = AF_INET; 1182 sin->sin_family = AF_INET;
947 sin->sin_port = udp_hdr(skb)->source; 1183 sin->sin_port = udp_hdr(skb)->source;
948 sin->sin_addr.s_addr = ip_hdr(skb)->saddr; 1184 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
@@ -951,22 +1187,20 @@ try_again:
951 if (inet->cmsg_flags) 1187 if (inet->cmsg_flags)
952 ip_cmsg_recv(msg, skb); 1188 ip_cmsg_recv(msg, skb);
953 1189
954 err = copied; 1190 err = len;
955 if (flags & MSG_TRUNC) 1191 if (flags & MSG_TRUNC)
956 err = ulen; 1192 err = ulen;
957 1193
958out_free: 1194out_free:
959 lock_sock(sk); 1195 skb_free_datagram_locked(sk, skb);
960 skb_free_datagram(sk, skb);
961 release_sock(sk);
962out: 1196out:
963 return err; 1197 return err;
964 1198
965csum_copy_err: 1199csum_copy_err:
966 lock_sock(sk); 1200 slow = lock_sock_fast(sk);
967 if (!skb_kill_datagram(sk, skb, flags)) 1201 if (!skb_kill_datagram(sk, skb, flags))
968 UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1202 UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
969 release_sock(sk); 1203 unlock_sock_fast(sk, slow);
970 1204
971 if (noblock) 1205 if (noblock)
972 return -EAGAIN; 1206 return -EAGAIN;
@@ -982,58 +1216,113 @@ int udp_disconnect(struct sock *sk, int flags)
982 */ 1216 */
983 1217
984 sk->sk_state = TCP_CLOSE; 1218 sk->sk_state = TCP_CLOSE;
985 inet->daddr = 0; 1219 inet->inet_daddr = 0;
986 inet->dport = 0; 1220 inet->inet_dport = 0;
1221 sock_rps_save_rxhash(sk, 0);
987 sk->sk_bound_dev_if = 0; 1222 sk->sk_bound_dev_if = 0;
988 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1223 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
989 inet_reset_saddr(sk); 1224 inet_reset_saddr(sk);
990 1225
991 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { 1226 if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
992 sk->sk_prot->unhash(sk); 1227 sk->sk_prot->unhash(sk);
993 inet->sport = 0; 1228 inet->inet_sport = 0;
994 } 1229 }
995 sk_dst_reset(sk); 1230 sk_dst_reset(sk);
996 return 0; 1231 return 0;
997} 1232}
1233EXPORT_SYMBOL(udp_disconnect);
998 1234
999void udp_lib_unhash(struct sock *sk) 1235void udp_lib_unhash(struct sock *sk)
1000{ 1236{
1001 if (sk_hashed(sk)) { 1237 if (sk_hashed(sk)) {
1002 struct udp_table *udptable = sk->sk_prot->h.udp_table; 1238 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1003 unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); 1239 struct udp_hslot *hslot, *hslot2;
1004 struct udp_hslot *hslot = &udptable->hash[hash]; 1240
1241 hslot = udp_hashslot(udptable, sock_net(sk),
1242 udp_sk(sk)->udp_port_hash);
1243 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1005 1244
1006 spin_lock_bh(&hslot->lock); 1245 spin_lock_bh(&hslot->lock);
1007 if (sk_nulls_del_node_init_rcu(sk)) { 1246 if (sk_nulls_del_node_init_rcu(sk)) {
1008 inet_sk(sk)->num = 0; 1247 hslot->count--;
1248 inet_sk(sk)->inet_num = 0;
1009 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 1249 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
1250
1251 spin_lock(&hslot2->lock);
1252 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1253 hslot2->count--;
1254 spin_unlock(&hslot2->lock);
1010 } 1255 }
1011 spin_unlock_bh(&hslot->lock); 1256 spin_unlock_bh(&hslot->lock);
1012 } 1257 }
1013} 1258}
1014EXPORT_SYMBOL(udp_lib_unhash); 1259EXPORT_SYMBOL(udp_lib_unhash);
1015 1260
1261/*
1262 * inet_rcv_saddr was changed, we must rehash secondary hash
1263 */
1264void udp_lib_rehash(struct sock *sk, u16 newhash)
1265{
1266 if (sk_hashed(sk)) {
1267 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1268 struct udp_hslot *hslot, *hslot2, *nhslot2;
1269
1270 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1271 nhslot2 = udp_hashslot2(udptable, newhash);
1272 udp_sk(sk)->udp_portaddr_hash = newhash;
1273 if (hslot2 != nhslot2) {
1274 hslot = udp_hashslot(udptable, sock_net(sk),
1275 udp_sk(sk)->udp_port_hash);
1276 /* we must lock primary chain too */
1277 spin_lock_bh(&hslot->lock);
1278
1279 spin_lock(&hslot2->lock);
1280 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1281 hslot2->count--;
1282 spin_unlock(&hslot2->lock);
1283
1284 spin_lock(&nhslot2->lock);
1285 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
1286 &nhslot2->head);
1287 nhslot2->count++;
1288 spin_unlock(&nhslot2->lock);
1289
1290 spin_unlock_bh(&hslot->lock);
1291 }
1292 }
1293}
1294EXPORT_SYMBOL(udp_lib_rehash);
1295
1296static void udp_v4_rehash(struct sock *sk)
1297{
1298 u16 new_hash = udp4_portaddr_hash(sock_net(sk),
1299 inet_sk(sk)->inet_rcv_saddr,
1300 inet_sk(sk)->inet_num);
1301 udp_lib_rehash(sk, new_hash);
1302}
1303
1016static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1304static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1017{ 1305{
1018 int is_udplite = IS_UDPLITE(sk);
1019 int rc; 1306 int rc;
1020 1307
1021 if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { 1308 if (inet_sk(sk)->inet_daddr)
1309 sock_rps_save_rxhash(sk, skb->rxhash);
1310
1311 rc = ip_queue_rcv_skb(sk, skb);
1312 if (rc < 0) {
1313 int is_udplite = IS_UDPLITE(sk);
1314
1022 /* Note that an ENOMEM error is charged twice */ 1315 /* Note that an ENOMEM error is charged twice */
1023 if (rc == -ENOMEM) { 1316 if (rc == -ENOMEM)
1024 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, 1317 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1025 is_udplite); 1318 is_udplite);
1026 atomic_inc(&sk->sk_drops); 1319 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1027 } 1320 kfree_skb(skb);
1028 goto drop; 1321 return -1;
1029 } 1322 }
1030 1323
1031 return 0; 1324 return 0;
1032 1325
1033drop:
1034 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1035 kfree_skb(skb);
1036 return -1;
1037} 1326}
1038 1327
1039/* returns: 1328/* returns:
@@ -1044,7 +1333,7 @@ drop:
1044 * Note that in the success and error cases, the skb is assumed to 1333 * Note that in the success and error cases, the skb is assumed to
1045 * have either been requeued or freed. 1334 * have either been requeued or freed.
1046 */ 1335 */
1047int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) 1336int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1048{ 1337{
1049 struct udp_sock *up = udp_sk(sk); 1338 struct udp_sock *up = udp_sk(sk);
1050 int rc; 1339 int rc;
@@ -1122,71 +1411,112 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
1122 } 1411 }
1123 } 1412 }
1124 1413
1125 if (sk->sk_filter) { 1414 if (rcu_dereference_raw(sk->sk_filter)) {
1126 if (udp_lib_checksum_complete(skb)) 1415 if (udp_lib_checksum_complete(skb))
1127 goto drop; 1416 goto drop;
1128 } 1417 }
1129 1418
1419
1420 if (sk_rcvqueues_full(sk, skb))
1421 goto drop;
1422
1130 rc = 0; 1423 rc = 0;
1131 1424
1132 bh_lock_sock(sk); 1425 bh_lock_sock(sk);
1133 if (!sock_owned_by_user(sk)) 1426 if (!sock_owned_by_user(sk))
1134 rc = __udp_queue_rcv_skb(sk, skb); 1427 rc = __udp_queue_rcv_skb(sk, skb);
1135 else 1428 else if (sk_add_backlog(sk, skb)) {
1136 sk_add_backlog(sk, skb); 1429 bh_unlock_sock(sk);
1430 goto drop;
1431 }
1137 bh_unlock_sock(sk); 1432 bh_unlock_sock(sk);
1138 1433
1139 return rc; 1434 return rc;
1140 1435
1141drop: 1436drop:
1142 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1437 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1438 atomic_inc(&sk->sk_drops);
1143 kfree_skb(skb); 1439 kfree_skb(skb);
1144 return -1; 1440 return -1;
1145} 1441}
1146 1442
1443
1444static void flush_stack(struct sock **stack, unsigned int count,
1445 struct sk_buff *skb, unsigned int final)
1446{
1447 unsigned int i;
1448 struct sk_buff *skb1 = NULL;
1449 struct sock *sk;
1450
1451 for (i = 0; i < count; i++) {
1452 sk = stack[i];
1453 if (likely(skb1 == NULL))
1454 skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
1455
1456 if (!skb1) {
1457 atomic_inc(&sk->sk_drops);
1458 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1459 IS_UDPLITE(sk));
1460 UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
1461 IS_UDPLITE(sk));
1462 }
1463
1464 if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
1465 skb1 = NULL;
1466 }
1467 if (unlikely(skb1))
1468 kfree_skb(skb1);
1469}
1470
1147/* 1471/*
1148 * Multicasts and broadcasts go to each listener. 1472 * Multicasts and broadcasts go to each listener.
1149 * 1473 *
1150 * Note: called only from the BH handler context, 1474 * Note: called only from the BH handler context.
1151 * so we don't need to lock the hashes.
1152 */ 1475 */
1153static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, 1476static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1154 struct udphdr *uh, 1477 struct udphdr *uh,
1155 __be32 saddr, __be32 daddr, 1478 __be32 saddr, __be32 daddr,
1156 struct udp_table *udptable) 1479 struct udp_table *udptable)
1157{ 1480{
1158 struct sock *sk; 1481 struct sock *sk, *stack[256 / sizeof(struct sock *)];
1159 struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; 1482 struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
1160 int dif; 1483 int dif;
1484 unsigned int i, count = 0;
1161 1485
1162 spin_lock(&hslot->lock); 1486 spin_lock(&hslot->lock);
1163 sk = sk_nulls_head(&hslot->head); 1487 sk = sk_nulls_head(&hslot->head);
1164 dif = skb->dev->ifindex; 1488 dif = skb->dev->ifindex;
1165 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); 1489 sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
1166 if (sk) { 1490 while (sk) {
1167 struct sock *sknext = NULL; 1491 stack[count++] = sk;
1492 sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1493 daddr, uh->source, saddr, dif);
1494 if (unlikely(count == ARRAY_SIZE(stack))) {
1495 if (!sk)
1496 break;
1497 flush_stack(stack, count, skb, ~0);
1498 count = 0;
1499 }
1500 }
1501 /*
1502 * before releasing chain lock, we must take a reference on sockets
1503 */
1504 for (i = 0; i < count; i++)
1505 sock_hold(stack[i]);
1168 1506
1169 do {
1170 struct sk_buff *skb1 = skb;
1171
1172 sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1173 daddr, uh->source, saddr,
1174 dif);
1175 if (sknext)
1176 skb1 = skb_clone(skb, GFP_ATOMIC);
1177
1178 if (skb1) {
1179 int ret = udp_queue_rcv_skb(sk, skb1);
1180 if (ret > 0)
1181 /* we should probably re-process instead
1182 * of dropping packets here. */
1183 kfree_skb(skb1);
1184 }
1185 sk = sknext;
1186 } while (sknext);
1187 } else
1188 consume_skb(skb);
1189 spin_unlock(&hslot->lock); 1507 spin_unlock(&hslot->lock);
1508
1509 /*
1510 * do the slow work with no lock held
1511 */
1512 if (count) {
1513 flush_stack(stack, count, skb, count - 1);
1514
1515 for (i = 0; i < count; i++)
1516 sock_put(stack[i]);
1517 } else {
1518 kfree_skb(skb);
1519 }
1190 return 0; 1520 return 0;
1191} 1521}
1192 1522
@@ -1214,7 +1544,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1214 if (uh->check == 0) { 1544 if (uh->check == 0) {
1215 skb->ip_summed = CHECKSUM_UNNECESSARY; 1545 skb->ip_summed = CHECKSUM_UNNECESSARY;
1216 } else if (skb->ip_summed == CHECKSUM_COMPLETE) { 1546 } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
1217 if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, 1547 if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
1218 proto, skb->csum)) 1548 proto, skb->csum))
1219 skb->ip_summed = CHECKSUM_UNNECESSARY; 1549 skb->ip_summed = CHECKSUM_UNNECESSARY;
1220 } 1550 }
@@ -1250,6 +1580,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1250 1580
1251 uh = udp_hdr(skb); 1581 uh = udp_hdr(skb);
1252 ulen = ntohs(uh->len); 1582 ulen = ntohs(uh->len);
1583 saddr = ip_hdr(skb)->saddr;
1584 daddr = ip_hdr(skb)->daddr;
1585
1253 if (ulen > skb->len) 1586 if (ulen > skb->len)
1254 goto short_packet; 1587 goto short_packet;
1255 1588
@@ -1263,9 +1596,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1263 if (udp4_csum_init(skb, uh, proto)) 1596 if (udp4_csum_init(skb, uh, proto))
1264 goto csum_error; 1597 goto csum_error;
1265 1598
1266 saddr = ip_hdr(skb)->saddr;
1267 daddr = ip_hdr(skb)->daddr;
1268
1269 if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) 1599 if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1270 return __udp4_lib_mcast_deliver(net, skb, uh, 1600 return __udp4_lib_mcast_deliver(net, skb, uh,
1271 saddr, daddr, udptable); 1601 saddr, daddr, udptable);
@@ -1338,16 +1668,16 @@ int udp_rcv(struct sk_buff *skb)
1338 1668
1339void udp_destroy_sock(struct sock *sk) 1669void udp_destroy_sock(struct sock *sk)
1340{ 1670{
1341 lock_sock(sk); 1671 bool slow = lock_sock_fast(sk);
1342 udp_flush_pending_frames(sk); 1672 udp_flush_pending_frames(sk);
1343 release_sock(sk); 1673 unlock_sock_fast(sk, slow);
1344} 1674}
1345 1675
1346/* 1676/*
1347 * Socket option code for UDP 1677 * Socket option code for UDP
1348 */ 1678 */
1349int udp_lib_setsockopt(struct sock *sk, int level, int optname, 1679int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1350 char __user *optval, int optlen, 1680 char __user *optval, unsigned int optlen,
1351 int (*push_pending_frames)(struct sock *)) 1681 int (*push_pending_frames)(struct sock *))
1352{ 1682{
1353 struct udp_sock *up = udp_sk(sk); 1683 struct udp_sock *up = udp_sk(sk);
@@ -1355,7 +1685,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1355 int err = 0; 1685 int err = 0;
1356 int is_udplite = IS_UDPLITE(sk); 1686 int is_udplite = IS_UDPLITE(sk);
1357 1687
1358 if (optlen<sizeof(int)) 1688 if (optlen < sizeof(int))
1359 return -EINVAL; 1689 return -EINVAL;
1360 1690
1361 if (get_user(val, (int __user *)optval)) 1691 if (get_user(val, (int __user *)optval))
@@ -1399,8 +1729,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1399 return -ENOPROTOOPT; 1729 return -ENOPROTOOPT;
1400 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ 1730 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
1401 val = 8; 1731 val = 8;
1402 else if (val > USHORT_MAX) 1732 else if (val > USHRT_MAX)
1403 val = USHORT_MAX; 1733 val = USHRT_MAX;
1404 up->pcslen = val; 1734 up->pcslen = val;
1405 up->pcflag |= UDPLITE_SEND_CC; 1735 up->pcflag |= UDPLITE_SEND_CC;
1406 break; 1736 break;
@@ -1413,8 +1743,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1413 return -ENOPROTOOPT; 1743 return -ENOPROTOOPT;
1414 if (val != 0 && val < 8) /* Avoid silly minimal values. */ 1744 if (val != 0 && val < 8) /* Avoid silly minimal values. */
1415 val = 8; 1745 val = 8;
1416 else if (val > USHORT_MAX) 1746 else if (val > USHRT_MAX)
1417 val = USHORT_MAX; 1747 val = USHRT_MAX;
1418 up->pcrlen = val; 1748 up->pcrlen = val;
1419 up->pcflag |= UDPLITE_RECV_CC; 1749 up->pcflag |= UDPLITE_RECV_CC;
1420 break; 1750 break;
@@ -1426,9 +1756,10 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1426 1756
1427 return err; 1757 return err;
1428} 1758}
1759EXPORT_SYMBOL(udp_lib_setsockopt);
1429 1760
1430int udp_setsockopt(struct sock *sk, int level, int optname, 1761int udp_setsockopt(struct sock *sk, int level, int optname,
1431 char __user *optval, int optlen) 1762 char __user *optval, unsigned int optlen)
1432{ 1763{
1433 if (level == SOL_UDP || level == SOL_UDPLITE) 1764 if (level == SOL_UDP || level == SOL_UDPLITE)
1434 return udp_lib_setsockopt(sk, level, optname, optval, optlen, 1765 return udp_lib_setsockopt(sk, level, optname, optval, optlen,
@@ -1438,7 +1769,7 @@ int udp_setsockopt(struct sock *sk, int level, int optname,
1438 1769
1439#ifdef CONFIG_COMPAT 1770#ifdef CONFIG_COMPAT
1440int compat_udp_setsockopt(struct sock *sk, int level, int optname, 1771int compat_udp_setsockopt(struct sock *sk, int level, int optname,
1441 char __user *optval, int optlen) 1772 char __user *optval, unsigned int optlen)
1442{ 1773{
1443 if (level == SOL_UDP || level == SOL_UDPLITE) 1774 if (level == SOL_UDP || level == SOL_UDPLITE)
1444 return udp_lib_setsockopt(sk, level, optname, optval, optlen, 1775 return udp_lib_setsockopt(sk, level, optname, optval, optlen,
@@ -1453,7 +1784,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
1453 struct udp_sock *up = udp_sk(sk); 1784 struct udp_sock *up = udp_sk(sk);
1454 int val, len; 1785 int val, len;
1455 1786
1456 if (get_user(len,optlen)) 1787 if (get_user(len, optlen))
1457 return -EFAULT; 1788 return -EFAULT;
1458 1789
1459 len = min_t(unsigned int, len, sizeof(int)); 1790 len = min_t(unsigned int, len, sizeof(int));
@@ -1486,10 +1817,11 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
1486 1817
1487 if (put_user(len, optlen)) 1818 if (put_user(len, optlen))
1488 return -EFAULT; 1819 return -EFAULT;
1489 if (copy_to_user(optval, &val,len)) 1820 if (copy_to_user(optval, &val, len))
1490 return -EFAULT; 1821 return -EFAULT;
1491 return 0; 1822 return 0;
1492} 1823}
1824EXPORT_SYMBOL(udp_lib_getsockopt);
1493 1825
1494int udp_getsockopt(struct sock *sk, int level, int optname, 1826int udp_getsockopt(struct sock *sk, int level, int optname,
1495 char __user *optval, int __user *optlen) 1827 char __user *optval, int __user *optlen)
@@ -1525,33 +1857,16 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
1525{ 1857{
1526 unsigned int mask = datagram_poll(file, sock, wait); 1858 unsigned int mask = datagram_poll(file, sock, wait);
1527 struct sock *sk = sock->sk; 1859 struct sock *sk = sock->sk;
1528 int is_lite = IS_UDPLITE(sk);
1529 1860
1530 /* Check for false positives due to checksum errors */ 1861 /* Check for false positives due to checksum errors */
1531 if ( (mask & POLLRDNORM) && 1862 if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
1532 !(file->f_flags & O_NONBLOCK) && 1863 !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
1533 !(sk->sk_shutdown & RCV_SHUTDOWN)){ 1864 mask &= ~(POLLIN | POLLRDNORM);
1534 struct sk_buff_head *rcvq = &sk->sk_receive_queue;
1535 struct sk_buff *skb;
1536
1537 spin_lock_bh(&rcvq->lock);
1538 while ((skb = skb_peek(rcvq)) != NULL &&
1539 udp_lib_checksum_complete(skb)) {
1540 UDP_INC_STATS_BH(sock_net(sk),
1541 UDP_MIB_INERRORS, is_lite);
1542 __skb_unlink(skb, rcvq);
1543 kfree_skb(skb);
1544 }
1545 spin_unlock_bh(&rcvq->lock);
1546
1547 /* nothing to see, move along */
1548 if (skb == NULL)
1549 mask &= ~(POLLIN | POLLRDNORM);
1550 }
1551 1865
1552 return mask; 1866 return mask;
1553 1867
1554} 1868}
1869EXPORT_SYMBOL(udp_poll);
1555 1870
1556struct proto udp_prot = { 1871struct proto udp_prot = {
1557 .name = "UDP", 1872 .name = "UDP",
@@ -1569,6 +1884,7 @@ struct proto udp_prot = {
1569 .backlog_rcv = __udp_queue_rcv_skb, 1884 .backlog_rcv = __udp_queue_rcv_skb,
1570 .hash = udp_lib_hash, 1885 .hash = udp_lib_hash,
1571 .unhash = udp_lib_unhash, 1886 .unhash = udp_lib_unhash,
1887 .rehash = udp_v4_rehash,
1572 .get_port = udp_v4_get_port, 1888 .get_port = udp_v4_get_port,
1573 .memory_allocated = &udp_memory_allocated, 1889 .memory_allocated = &udp_memory_allocated,
1574 .sysctl_mem = sysctl_udp_mem, 1890 .sysctl_mem = sysctl_udp_mem,
@@ -1581,7 +1897,9 @@ struct proto udp_prot = {
1581 .compat_setsockopt = compat_udp_setsockopt, 1897 .compat_setsockopt = compat_udp_setsockopt,
1582 .compat_getsockopt = compat_udp_getsockopt, 1898 .compat_getsockopt = compat_udp_getsockopt,
1583#endif 1899#endif
1900 .clear_sk = sk_prot_clear_portaddr_nulls,
1584}; 1901};
1902EXPORT_SYMBOL(udp_prot);
1585 1903
1586/* ------------------------------------------------------------------------ */ 1904/* ------------------------------------------------------------------------ */
1587#ifdef CONFIG_PROC_FS 1905#ifdef CONFIG_PROC_FS
@@ -1592,9 +1910,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
1592 struct udp_iter_state *state = seq->private; 1910 struct udp_iter_state *state = seq->private;
1593 struct net *net = seq_file_net(seq); 1911 struct net *net = seq_file_net(seq);
1594 1912
1595 for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { 1913 for (state->bucket = start; state->bucket <= state->udp_table->mask;
1914 ++state->bucket) {
1596 struct hlist_nulls_node *node; 1915 struct hlist_nulls_node *node;
1597 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; 1916 struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
1917
1918 if (hlist_nulls_empty(&hslot->head))
1919 continue;
1920
1598 spin_lock_bh(&hslot->lock); 1921 spin_lock_bh(&hslot->lock);
1599 sk_nulls_for_each(sk, node, &hslot->head) { 1922 sk_nulls_for_each(sk, node, &hslot->head) {
1600 if (!net_eq(sock_net(sk), net)) 1923 if (!net_eq(sock_net(sk), net))
@@ -1619,7 +1942,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1619 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); 1942 } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1620 1943
1621 if (!sk) { 1944 if (!sk) {
1622 if (state->bucket < UDP_HTABLE_SIZE) 1945 if (state->bucket <= state->udp_table->mask)
1623 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 1946 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1624 return udp_get_first(seq, state->bucket + 1); 1947 return udp_get_first(seq, state->bucket + 1);
1625 } 1948 }
@@ -1639,7 +1962,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1639static void *udp_seq_start(struct seq_file *seq, loff_t *pos) 1962static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1640{ 1963{
1641 struct udp_iter_state *state = seq->private; 1964 struct udp_iter_state *state = seq->private;
1642 state->bucket = UDP_HTABLE_SIZE; 1965 state->bucket = MAX_UDP_PORTS;
1643 1966
1644 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; 1967 return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
1645} 1968}
@@ -1661,7 +1984,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
1661{ 1984{
1662 struct udp_iter_state *state = seq->private; 1985 struct udp_iter_state *state = seq->private;
1663 1986
1664 if (state->bucket < UDP_HTABLE_SIZE) 1987 if (state->bucket <= state->udp_table->mask)
1665 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 1988 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1666} 1989}
1667 1990
@@ -1703,23 +2026,25 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
1703 rc = -ENOMEM; 2026 rc = -ENOMEM;
1704 return rc; 2027 return rc;
1705} 2028}
2029EXPORT_SYMBOL(udp_proc_register);
1706 2030
1707void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo) 2031void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
1708{ 2032{
1709 proc_net_remove(net, afinfo->name); 2033 proc_net_remove(net, afinfo->name);
1710} 2034}
2035EXPORT_SYMBOL(udp_proc_unregister);
1711 2036
1712/* ------------------------------------------------------------------------ */ 2037/* ------------------------------------------------------------------------ */
1713static void udp4_format_sock(struct sock *sp, struct seq_file *f, 2038static void udp4_format_sock(struct sock *sp, struct seq_file *f,
1714 int bucket, int *len) 2039 int bucket, int *len)
1715{ 2040{
1716 struct inet_sock *inet = inet_sk(sp); 2041 struct inet_sock *inet = inet_sk(sp);
1717 __be32 dest = inet->daddr; 2042 __be32 dest = inet->inet_daddr;
1718 __be32 src = inet->rcv_saddr; 2043 __be32 src = inet->inet_rcv_saddr;
1719 __u16 destp = ntohs(inet->dport); 2044 __u16 destp = ntohs(inet->inet_dport);
1720 __u16 srcp = ntohs(inet->sport); 2045 __u16 srcp = ntohs(inet->inet_sport);
1721 2046
1722 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2047 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
1723 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", 2048 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
1724 bucket, src, srcp, dest, destp, sp->sk_state, 2049 bucket, src, srcp, dest, destp, sp->sk_state,
1725 sk_wmem_alloc_get(sp), 2050 sk_wmem_alloc_get(sp),
@@ -1741,7 +2066,7 @@ int udp4_seq_show(struct seq_file *seq, void *v)
1741 int len; 2066 int len;
1742 2067
1743 udp4_format_sock(v, seq, state->bucket, &len); 2068 udp4_format_sock(v, seq, state->bucket, &len);
1744 seq_printf(seq, "%*s\n", 127 - len ,""); 2069 seq_printf(seq, "%*s\n", 127 - len, "");
1745 } 2070 }
1746 return 0; 2071 return 0;
1747} 2072}
@@ -1759,12 +2084,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = {
1759 }, 2084 },
1760}; 2085};
1761 2086
1762static int udp4_proc_init_net(struct net *net) 2087static int __net_init udp4_proc_init_net(struct net *net)
1763{ 2088{
1764 return udp_proc_register(net, &udp4_seq_afinfo); 2089 return udp_proc_register(net, &udp4_seq_afinfo);
1765} 2090}
1766 2091
1767static void udp4_proc_exit_net(struct net *net) 2092static void __net_exit udp4_proc_exit_net(struct net *net)
1768{ 2093{
1769 udp_proc_unregister(net, &udp4_seq_afinfo); 2094 udp_proc_unregister(net, &udp4_seq_afinfo);
1770} 2095}
@@ -1785,21 +2110,60 @@ void udp4_proc_exit(void)
1785} 2110}
1786#endif /* CONFIG_PROC_FS */ 2111#endif /* CONFIG_PROC_FS */
1787 2112
1788void __init udp_table_init(struct udp_table *table) 2113static __initdata unsigned long uhash_entries;
2114static int __init set_uhash_entries(char *str)
1789{ 2115{
1790 int i; 2116 if (!str)
2117 return 0;
2118 uhash_entries = simple_strtoul(str, &str, 0);
2119 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
2120 uhash_entries = UDP_HTABLE_SIZE_MIN;
2121 return 1;
2122}
2123__setup("uhash_entries=", set_uhash_entries);
1791 2124
1792 for (i = 0; i < UDP_HTABLE_SIZE; i++) { 2125void __init udp_table_init(struct udp_table *table, const char *name)
2126{
2127 unsigned int i;
2128
2129 if (!CONFIG_BASE_SMALL)
2130 table->hash = alloc_large_system_hash(name,
2131 2 * sizeof(struct udp_hslot),
2132 uhash_entries,
2133 21, /* one slot per 2 MB */
2134 0,
2135 &table->log,
2136 &table->mask,
2137 64 * 1024);
2138 /*
2139 * Make sure hash table has the minimum size
2140 */
2141 if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
2142 table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
2143 2 * sizeof(struct udp_hslot), GFP_KERNEL);
2144 if (!table->hash)
2145 panic(name);
2146 table->log = ilog2(UDP_HTABLE_SIZE_MIN);
2147 table->mask = UDP_HTABLE_SIZE_MIN - 1;
2148 }
2149 table->hash2 = table->hash + (table->mask + 1);
2150 for (i = 0; i <= table->mask; i++) {
1793 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); 2151 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
2152 table->hash[i].count = 0;
1794 spin_lock_init(&table->hash[i].lock); 2153 spin_lock_init(&table->hash[i].lock);
1795 } 2154 }
2155 for (i = 0; i <= table->mask; i++) {
2156 INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
2157 table->hash2[i].count = 0;
2158 spin_lock_init(&table->hash2[i].lock);
2159 }
1796} 2160}
1797 2161
1798void __init udp_init(void) 2162void __init udp_init(void)
1799{ 2163{
1800 unsigned long nr_pages, limit; 2164 unsigned long nr_pages, limit;
1801 2165
1802 udp_table_init(&udp_table); 2166 udp_table_init(&udp_table, "UDP");
1803 /* Set the pressure threshold up by the same strategy of TCP. It is a 2167 /* Set the pressure threshold up by the same strategy of TCP. It is a
1804 * fraction of global memory that is up to 1/2 at 256 MB, decreasing 2168 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
1805 * toward zero with the amount of memory, with a floor of 128 pages. 2169 * toward zero with the amount of memory, with a floor of 128 pages.
@@ -1816,16 +2180,64 @@ void __init udp_init(void)
1816 sysctl_udp_wmem_min = SK_MEM_QUANTUM; 2180 sysctl_udp_wmem_min = SK_MEM_QUANTUM;
1817} 2181}
1818 2182
1819EXPORT_SYMBOL(udp_disconnect); 2183int udp4_ufo_send_check(struct sk_buff *skb)
1820EXPORT_SYMBOL(udp_ioctl); 2184{
1821EXPORT_SYMBOL(udp_prot); 2185 const struct iphdr *iph;
1822EXPORT_SYMBOL(udp_sendmsg); 2186 struct udphdr *uh;
1823EXPORT_SYMBOL(udp_lib_getsockopt); 2187
1824EXPORT_SYMBOL(udp_lib_setsockopt); 2188 if (!pskb_may_pull(skb, sizeof(*uh)))
1825EXPORT_SYMBOL(udp_poll); 2189 return -EINVAL;
1826EXPORT_SYMBOL(udp_lib_get_port); 2190
2191 iph = ip_hdr(skb);
2192 uh = udp_hdr(skb);
2193
2194 uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
2195 IPPROTO_UDP, 0);
2196 skb->csum_start = skb_transport_header(skb) - skb->head;
2197 skb->csum_offset = offsetof(struct udphdr, check);
2198 skb->ip_summed = CHECKSUM_PARTIAL;
2199 return 0;
2200}
2201
2202struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
2203{
2204 struct sk_buff *segs = ERR_PTR(-EINVAL);
2205 unsigned int mss;
2206 int offset;
2207 __wsum csum;
2208
2209 mss = skb_shinfo(skb)->gso_size;
2210 if (unlikely(skb->len <= mss))
2211 goto out;
2212
2213 if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2214 /* Packet is from an untrusted source, reset gso_segs. */
2215 int type = skb_shinfo(skb)->gso_type;
2216
2217 if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
2218 !(type & (SKB_GSO_UDP))))
2219 goto out;
2220
2221 skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
2222
2223 segs = NULL;
2224 goto out;
2225 }
2226
2227 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
2228 * do checksum of UDP packets sent as multiple IP fragments.
2229 */
2230 offset = skb_checksum_start_offset(skb);
2231 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2232 offset += skb->csum_offset;
2233 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2234 skb->ip_summed = CHECKSUM_NONE;
2235
2236 /* Fragment the skb. IP headers of the fragments are updated in
2237 * inet_gso_segment()
2238 */
2239 segs = skb_segment(skb, features);
2240out:
2241 return segs;
2242}
1827 2243
1828#ifdef CONFIG_PROC_FS
1829EXPORT_SYMBOL(udp_proc_register);
1830EXPORT_SYMBOL(udp_proc_unregister);
1831#endif
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 9f4a6165f722..aaad650d47d9 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -11,13 +11,13 @@ extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
11extern int udp_v4_get_port(struct sock *sk, unsigned short snum); 11extern int udp_v4_get_port(struct sock *sk, unsigned short snum);
12 12
13extern int udp_setsockopt(struct sock *sk, int level, int optname, 13extern int udp_setsockopt(struct sock *sk, int level, int optname,
14 char __user *optval, int optlen); 14 char __user *optval, unsigned int optlen);
15extern int udp_getsockopt(struct sock *sk, int level, int optname, 15extern int udp_getsockopt(struct sock *sk, int level, int optname,
16 char __user *optval, int __user *optlen); 16 char __user *optval, int __user *optlen);
17 17
18#ifdef CONFIG_COMPAT 18#ifdef CONFIG_COMPAT
19extern int compat_udp_setsockopt(struct sock *sk, int level, int optname, 19extern int compat_udp_setsockopt(struct sock *sk, int level, int optname,
20 char __user *optval, int optlen); 20 char __user *optval, unsigned int optlen);
21extern int compat_udp_getsockopt(struct sock *sk, int level, int optname, 21extern int compat_udp_getsockopt(struct sock *sk, int level, int optname,
22 char __user *optval, int __user *optlen); 22 char __user *optval, int __user *optlen);
23#endif 23#endif
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index c784891cb7e5..aee9963f7f5a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,7 +12,7 @@
12 */ 12 */
13#include "udp_impl.h" 13#include "udp_impl.h"
14 14
15struct udp_table udplite_table; 15struct udp_table udplite_table __read_mostly;
16EXPORT_SYMBOL(udplite_table); 16EXPORT_SYMBOL(udplite_table);
17 17
18static int udplite_rcv(struct sk_buff *skb) 18static int udplite_rcv(struct sk_buff *skb)
@@ -25,7 +25,7 @@ static void udplite_err(struct sk_buff *skb, u32 info)
25 __udp4_lib_err(skb, info, &udplite_table); 25 __udp4_lib_err(skb, info, &udplite_table);
26} 26}
27 27
28static struct net_protocol udplite_protocol = { 28static const struct net_protocol udplite_protocol = {
29 .handler = udplite_rcv, 29 .handler = udplite_rcv,
30 .err_handler = udplite_err, 30 .err_handler = udplite_err,
31 .no_policy = 1, 31 .no_policy = 1,
@@ -57,14 +57,15 @@ struct proto udplite_prot = {
57 .compat_setsockopt = compat_udp_setsockopt, 57 .compat_setsockopt = compat_udp_setsockopt,
58 .compat_getsockopt = compat_udp_getsockopt, 58 .compat_getsockopt = compat_udp_getsockopt,
59#endif 59#endif
60 .clear_sk = sk_prot_clear_portaddr_nulls,
60}; 61};
62EXPORT_SYMBOL(udplite_prot);
61 63
62static struct inet_protosw udplite4_protosw = { 64static struct inet_protosw udplite4_protosw = {
63 .type = SOCK_DGRAM, 65 .type = SOCK_DGRAM,
64 .protocol = IPPROTO_UDPLITE, 66 .protocol = IPPROTO_UDPLITE,
65 .prot = &udplite_prot, 67 .prot = &udplite_prot,
66 .ops = &inet_dgram_ops, 68 .ops = &inet_dgram_ops,
67 .capability = -1,
68 .no_check = 0, /* must checksum (RFC 3828) */ 69 .no_check = 0, /* must checksum (RFC 3828) */
69 .flags = INET_PROTOSW_PERMANENT, 70 .flags = INET_PROTOSW_PERMANENT,
70}; 71};
@@ -82,12 +83,12 @@ static struct udp_seq_afinfo udplite4_seq_afinfo = {
82 }, 83 },
83}; 84};
84 85
85static int udplite4_proc_init_net(struct net *net) 86static int __net_init udplite4_proc_init_net(struct net *net)
86{ 87{
87 return udp_proc_register(net, &udplite4_seq_afinfo); 88 return udp_proc_register(net, &udplite4_seq_afinfo);
88} 89}
89 90
90static void udplite4_proc_exit_net(struct net *net) 91static void __net_exit udplite4_proc_exit_net(struct net *net)
91{ 92{
92 udp_proc_unregister(net, &udplite4_seq_afinfo); 93 udp_proc_unregister(net, &udplite4_seq_afinfo);
93} 94}
@@ -110,7 +111,7 @@ static inline int udplite4_proc_init(void)
110 111
111void __init udplite4_register(void) 112void __init udplite4_register(void)
112{ 113{
113 udp_table_init(&udplite_table); 114 udp_table_init(&udplite_table, "UDP-Lite");
114 if (proto_register(&udplite_prot, 1)) 115 if (proto_register(&udplite_prot, 1))
115 goto out_register_err; 116 goto out_register_err;
116 117
@@ -128,5 +129,3 @@ out_unregister_proto:
128out_register_err: 129out_register_err:
129 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); 130 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
130} 131}
131
132EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index f9f922a0ba88..06814b6216dc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -9,6 +9,7 @@
9 * 9 *
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/string.h> 14#include <linux/string.h>
14#include <linux/netfilter.h> 15#include <linux/netfilter.h>
@@ -26,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
26 if (skb_dst(skb) == NULL) { 27 if (skb_dst(skb) == NULL) {
27 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
28 29
29 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, 30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
30 skb->dev)) 31 iph->tos, skb->dev))
31 goto drop; 32 goto drop;
32 } 33 }
33 return dst_input(skb); 34 return dst_input(skb);
@@ -60,7 +61,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
60 iph->tot_len = htons(skb->len); 61 iph->tot_len = htons(skb->len);
61 ip_send_check(iph); 62 ip_send_check(iph);
62 63
63 NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, 64 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
64 xfrm4_rcv_encap_finish); 65 xfrm4_rcv_encap_finish);
65 return 0; 66 return 0;
66} 67}
@@ -162,5 +163,4 @@ int xfrm4_rcv(struct sk_buff *skb)
162{ 163{
163 return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); 164 return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
164} 165}
165
166EXPORT_SYMBOL(xfrm4_rcv); 166EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 3444f3b34eca..534972e114ac 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -4,6 +4,7 @@
4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> 4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
5 */ 5 */
6 6
7#include <linux/gfp.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
@@ -55,7 +56,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
55 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
56 ip_select_ident(top_iph, dst->child, NULL); 57 ip_select_ident(top_iph, dst->child, NULL);
57 58
58 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); 59 top_iph->ttl = ip4_dst_hoplimit(dst->child);
59 60
60 top_iph->saddr = x->props.saddr.a4; 61 top_iph->saddr = x->props.saddr.a4;
61 top_iph->daddr = x->id.daddr.a4; 62 top_iph->daddr = x->id.daddr.a4;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index c908bd99bcba..571aa96a175c 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -86,7 +86,7 @@ static int xfrm4_output_finish(struct sk_buff *skb)
86 86
87int xfrm4_output(struct sk_buff *skb) 87int xfrm4_output(struct sk_buff *skb)
88{ 88{
89 return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, 89 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
90 NULL, skb_dst(skb)->dev, xfrm4_output_finish, 90 NULL, skb_dst(skb)->dev, xfrm4_output_finish,
91 !(IPCB(skb)->flags & IPSKB_REROUTED)); 91 !(IPCB(skb)->flags & IPSKB_REROUTED));
92} 92}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 60d918c96a4f..b057d40addec 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -11,11 +11,11 @@
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/inetdevice.h> 13#include <linux/inetdevice.h>
14#include <linux/if_tunnel.h>
14#include <net/dst.h> 15#include <net/dst.h>
15#include <net/xfrm.h> 16#include <net/xfrm.h>
16#include <net/ip.h> 17#include <net/ip.h>
17 18
18static struct dst_ops xfrm4_dst_ops;
19static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
20 20
21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, 21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
@@ -23,12 +23,8 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
23 xfrm_address_t *daddr) 23 xfrm_address_t *daddr)
24{ 24{
25 struct flowi fl = { 25 struct flowi fl = {
26 .nl_u = { 26 .fl4_dst = daddr->a4,
27 .ip4_u = { 27 .fl4_tos = tos,
28 .tos = tos,
29 .daddr = daddr->a4,
30 },
31 },
32 }; 28 };
33 struct dst_entry *dst; 29 struct dst_entry *dst;
34 struct rtable *rt; 30 struct rtable *rt;
@@ -38,7 +34,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
38 fl.fl4_src = saddr->a4; 34 fl.fl4_src = saddr->a4;
39 35
40 err = __ip_route_output_key(net, &rt, &fl); 36 err = __ip_route_output_key(net, &rt, &fl);
41 dst = &rt->u.dst; 37 dst = &rt->dst;
42 if (err) 38 if (err)
43 dst = ERR_PTR(err); 39 dst = ERR_PTR(err);
44 return dst; 40 return dst;
@@ -60,30 +56,9 @@ static int xfrm4_get_saddr(struct net *net,
60 return 0; 56 return 0;
61} 57}
62 58
63static struct dst_entry *
64__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
65{
66 struct dst_entry *dst;
67
68 read_lock_bh(&policy->lock);
69 for (dst = policy->bundles; dst; dst = dst->next) {
70 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
71 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
72 xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
73 xdst->u.rt.fl.fl4_src == fl->fl4_src &&
74 xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
75 xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
76 dst_clone(dst);
77 break;
78 }
79 }
80 read_unlock_bh(&policy->lock);
81 return dst;
82}
83
84static int xfrm4_get_tos(struct flowi *fl) 59static int xfrm4_get_tos(struct flowi *fl)
85{ 60{
86 return fl->fl4_tos; 61 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
87} 62}
88 63
89static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 64static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -92,19 +67,16 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
92 return 0; 67 return 0;
93} 68}
94 69
95static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) 70static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
71 struct flowi *fl)
96{ 72{
97 struct rtable *rt = (struct rtable *)xdst->route; 73 struct rtable *rt = (struct rtable *)xdst->route;
98 74
99 xdst->u.rt.fl = rt->fl; 75 xdst->u.rt.fl = *fl;
100 76
101 xdst->u.dst.dev = dev; 77 xdst->u.dst.dev = dev;
102 dev_hold(dev); 78 dev_hold(dev);
103 79
104 xdst->u.rt.idev = in_dev_get(dev);
105 if (!xdst->u.rt.idev)
106 return -ENODEV;
107
108 xdst->u.rt.peer = rt->peer; 80 xdst->u.rt.peer = rt->peer;
109 if (rt->peer) 81 if (rt->peer)
110 atomic_inc(&rt->peer->refcnt); 82 atomic_inc(&rt->peer->refcnt);
@@ -129,6 +101,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
129 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 101 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
130 102
131 memset(fl, 0, sizeof(struct flowi)); 103 memset(fl, 0, sizeof(struct flowi));
104 fl->mark = skb->mark;
105
132 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 106 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
133 switch (iph->protocol) { 107 switch (iph->protocol) {
134 case IPPROTO_UDP: 108 case IPPROTO_UDP:
@@ -136,7 +110,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
136 case IPPROTO_TCP: 110 case IPPROTO_TCP:
137 case IPPROTO_SCTP: 111 case IPPROTO_SCTP:
138 case IPPROTO_DCCP: 112 case IPPROTO_DCCP:
139 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 113 if (xprth + 4 < skb->data ||
114 pskb_may_pull(skb, xprth + 4 - skb->data)) {
140 __be16 *ports = (__be16 *)xprth; 115 __be16 *ports = (__be16 *)xprth;
141 116
142 fl->fl_ip_sport = ports[!!reverse]; 117 fl->fl_ip_sport = ports[!!reverse];
@@ -176,6 +151,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
176 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 151 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
177 } 152 }
178 break; 153 break;
154
155 case IPPROTO_GRE:
156 if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
157 __be16 *greflags = (__be16 *)xprth;
158 __be32 *gre_hdr = (__be32 *)xprth;
159
160 if (greflags[0] & GRE_KEY) {
161 if (greflags[0] & GRE_CSUM)
162 gre_hdr++;
163 fl->fl_gre_key = gre_hdr[1];
164 }
165 }
166 break;
167
179 default: 168 default:
180 fl->fl_ipsec_spi = 0; 169 fl->fl_ipsec_spi = 0;
181 break; 170 break;
@@ -189,8 +178,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
189 178
190static inline int xfrm4_garbage_collect(struct dst_ops *ops) 179static inline int xfrm4_garbage_collect(struct dst_ops *ops)
191{ 180{
192 xfrm4_policy_afinfo.garbage_collect(&init_net); 181 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
193 return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); 182
183 xfrm4_policy_afinfo.garbage_collect(net);
184 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
194} 185}
195 186
196static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 187static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -205,8 +196,6 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
205{ 196{
206 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 197 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
207 198
208 if (likely(xdst->u.rt.idev))
209 in_dev_put(xdst->u.rt.idev);
210 if (likely(xdst->u.rt.peer)) 199 if (likely(xdst->u.rt.peer))
211 inet_putpeer(xdst->u.rt.peer); 200 inet_putpeer(xdst->u.rt.peer);
212 xfrm_dst_destroy(xdst); 201 xfrm_dst_destroy(xdst);
@@ -215,27 +204,9 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
215static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 204static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
216 int unregister) 205 int unregister)
217{ 206{
218 struct xfrm_dst *xdst;
219
220 if (!unregister) 207 if (!unregister)
221 return; 208 return;
222 209
223 xdst = (struct xfrm_dst *)dst;
224 if (xdst->u.rt.idev->dev == dev) {
225 struct in_device *loopback_idev =
226 in_dev_get(dev_net(dev)->loopback_dev);
227 BUG_ON(!loopback_idev);
228
229 do {
230 in_dev_put(xdst->u.rt.idev);
231 xdst->u.rt.idev = loopback_idev;
232 in_dev_hold(loopback_idev);
233 xdst = (struct xfrm_dst *)xdst->u.dst.child;
234 } while (xdst->u.dst.xfrm);
235
236 __in_dev_put(loopback_idev);
237 }
238
239 xfrm_dst_ifdown(dst, dev); 210 xfrm_dst_ifdown(dst, dev);
240} 211}
241 212
@@ -248,7 +219,6 @@ static struct dst_ops xfrm4_dst_ops = {
248 .ifdown = xfrm4_dst_ifdown, 219 .ifdown = xfrm4_dst_ifdown,
249 .local_out = __ip_local_out, 220 .local_out = __ip_local_out,
250 .gc_thresh = 1024, 221 .gc_thresh = 1024,
251 .entries = ATOMIC_INIT(0),
252}; 222};
253 223
254static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 224static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -256,13 +226,27 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
256 .dst_ops = &xfrm4_dst_ops, 226 .dst_ops = &xfrm4_dst_ops,
257 .dst_lookup = xfrm4_dst_lookup, 227 .dst_lookup = xfrm4_dst_lookup,
258 .get_saddr = xfrm4_get_saddr, 228 .get_saddr = xfrm4_get_saddr,
259 .find_bundle = __xfrm4_find_bundle,
260 .decode_session = _decode_session4, 229 .decode_session = _decode_session4,
261 .get_tos = xfrm4_get_tos, 230 .get_tos = xfrm4_get_tos,
262 .init_path = xfrm4_init_path, 231 .init_path = xfrm4_init_path,
263 .fill_dst = xfrm4_fill_dst, 232 .fill_dst = xfrm4_fill_dst,
264}; 233};
265 234
235#ifdef CONFIG_SYSCTL
236static struct ctl_table xfrm4_policy_table[] = {
237 {
238 .procname = "xfrm4_gc_thresh",
239 .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
240 .maxlen = sizeof(int),
241 .mode = 0644,
242 .proc_handler = proc_dointvec,
243 },
244 { }
245};
246
247static struct ctl_table_header *sysctl_hdr;
248#endif
249
266static void __init xfrm4_policy_init(void) 250static void __init xfrm4_policy_init(void)
267{ 251{
268 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); 252 xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
@@ -270,12 +254,33 @@ static void __init xfrm4_policy_init(void)
270 254
271static void __exit xfrm4_policy_fini(void) 255static void __exit xfrm4_policy_fini(void)
272{ 256{
257#ifdef CONFIG_SYSCTL
258 if (sysctl_hdr)
259 unregister_net_sysctl_table(sysctl_hdr);
260#endif
273 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); 261 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
274} 262}
275 263
276void __init xfrm4_init(void) 264void __init xfrm4_init(int rt_max_size)
277{ 265{
266 /*
267 * Select a default value for the gc_thresh based on the main route
268 * table hash size. It seems to me the worst case scenario is when
269 * we have ipsec operating in transport mode, in which we create a
270 * dst_entry per socket. The xfrm gc algorithm starts trying to remove
271 * entries at gc_thresh, and prevents new allocations as 2*gc_thresh
272 * so lets set an initial xfrm gc_thresh value at the rt_max_size/2.
273 * That will let us store an ipsec connection per route table entry,
274 * and start cleaning when were 1/2 full
275 */
276 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
277 dst_entries_init(&xfrm4_dst_ops);
278
278 xfrm4_state_init(); 279 xfrm4_state_init();
279 xfrm4_policy_init(); 280 xfrm4_policy_init();
281#ifdef CONFIG_SYSCTL
282 sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
283 xfrm4_policy_table);
284#endif
280} 285}
281 286
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a03..47947624eccc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
25 struct xfrm_tmpl *tmpl, 25{
26 xfrm_address_t *daddr, xfrm_address_t *saddr) 26 sel->daddr.a4 = fl->fl4_dst;
27 sel->saddr.a4 = fl->fl4_src;
28 sel->dport = xfrm_flowi_dport(fl);
29 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl);
31 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET;
33 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32;
35 sel->proto = fl->proto;
36 sel->ifindex = fl->oif;
37}
38
39static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr)
27{ 42{
28 x->sel.daddr.a4 = fl->fl4_dst;
29 x->sel.saddr.a4 = fl->fl4_src;
30 x->sel.dport = xfrm_flowi_dport(fl);
31 x->sel.dport_mask = htons(0xffff);
32 x->sel.sport = xfrm_flowi_sport(fl);
33 x->sel.sport_mask = htons(0xffff);
34 x->sel.family = AF_INET;
35 x->sel.prefixlen_d = 32;
36 x->sel.prefixlen_s = 32;
37 x->sel.proto = fl->proto;
38 x->sel.ifindex = fl->oif;
39 x->id = tmpl->id; 43 x->id = tmpl->id;
40 if (x->id.daddr.a4 == 0) 44 if (x->id.daddr.a4 == 0)
41 x->id.daddr.a4 = daddr->a4; 45 x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
70 .owner = THIS_MODULE, 74 .owner = THIS_MODULE,
71 .init_flags = xfrm4_init_flags, 75 .init_flags = xfrm4_init_flags,
72 .init_tempsel = __xfrm4_init_tempsel, 76 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop,
73 .output = xfrm4_output, 78 .output = xfrm4_output,
74 .extract_input = xfrm4_extract_input, 79 .extract_input = xfrm4_extract_input,
75 .extract_output = xfrm4_extract_output, 80 .extract_output = xfrm4_extract_output,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
58 return -ENOENT; 58 return -ENOENT;
59} 59}
60 60
61static struct xfrm_tunnel xfrm_tunnel_handler = { 61static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
62 .handler = xfrm_tunnel_rcv, 62 .handler = xfrm_tunnel_rcv,
63 .err_handler = xfrm_tunnel_err, 63 .err_handler = xfrm_tunnel_err,
64 .priority = 2, 64 .priority = 2,
65}; 65};
66 66
67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
68static struct xfrm_tunnel xfrm64_tunnel_handler = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
69 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
70 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
71 .priority = 2, 71 .priority = 2,