Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 37
-rw-r--r--  net/ipv4/af_inet.c | 187
-rw-r--r--  net/ipv4/ah4.c | 3
-rw-r--r--  net/ipv4/arp.c | 114
-rw-r--r--  net/ipv4/cipso_ipv4.c | 3
-rw-r--r--  net/ipv4/datagram.c | 9
-rw-r--r--  net/ipv4/devinet.c | 45
-rw-r--r--  net/ipv4/esp4.c | 2
-rw-r--r--  net/ipv4/fib_frontend.c | 33
-rw-r--r--  net/ipv4/fib_hash.c | 1
-rw-r--r--  net/ipv4/fib_rules.c | 22
-rw-r--r--  net/ipv4/fib_semantics.c | 81
-rw-r--r--  net/ipv4/fib_trie.c | 19
-rw-r--r--  net/ipv4/icmp.c | 51
-rw-r--r--  net/ipv4/igmp.c | 138
-rw-r--r--  net/ipv4/inet_connection_sock.c | 33
-rw-r--r--  net/ipv4/inet_diag.c | 1
-rw-r--r--  net/ipv4/inet_fragment.c | 2
-rw-r--r--  net/ipv4/inet_hashtables.c | 6
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 1
-rw-r--r--  net/ipv4/inetpeer.c | 244
-rw-r--r--  net/ipv4/ip_forward.c | 13
-rw-r--r--  net/ipv4/ip_fragment.c | 67
-rw-r--r--  net/ipv4/ip_gre.c | 69
-rw-r--r--  net/ipv4/ip_input.c | 35
-rw-r--r--  net/ipv4/ip_options.c | 11
-rw-r--r--  net/ipv4/ip_output.c | 140
-rw-r--r--  net/ipv4/ip_sockglue.c | 81
-rw-r--r--  net/ipv4/ipcomp.c | 17
-rw-r--r--  net/ipv4/ipconfig.c | 67
-rw-r--r--  net/ipv4/ipip.c | 36
-rw-r--r--  net/ipv4/ipmr.c | 952
-rw-r--r--  net/ipv4/netfilter.c | 19
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 519
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c | 4
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c | 96
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 62
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 834
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 142
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c | 23
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 73
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 18
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c | 22
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 33
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 52
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 28
-rw-r--r--  net/ipv4/netfilter/ipt_ah.c | 28
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 19
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 127
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 167
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 97
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 118
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 15
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 11
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 11
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 26
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 70
-rw-r--r--  net/ipv4/netfilter/nf_nat_ftp.c | 105
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 40
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c | 3
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_common.c | 12
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_dccp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 12
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 10
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_sctp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_tcp.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udp.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udplite.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_unknown.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c | 73
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 154
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 54
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 18
-rw-r--r--  net/ipv4/netfilter/nf_nat_tftp.c | 1
-rw-r--r--  net/ipv4/proc.c | 47
-rw-r--r--  net/ipv4/protocol.c | 3
-rw-r--r--  net/ipv4/raw.c | 27
-rw-r--r--  net/ipv4/route.c | 759
-rw-r--r--  net/ipv4/syncookies.c | 108
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 32
-rw-r--r--  net/ipv4/tcp.c | 305
-rw-r--r--  net/ipv4/tcp_cong.c | 6
-rw-r--r--  net/ipv4/tcp_hybla.c | 4
-rw-r--r--  net/ipv4/tcp_input.c | 65
-rw-r--r--  net/ipv4/tcp_ipv4.c | 261
-rw-r--r--  net/ipv4/tcp_minisocks.c | 13
-rw-r--r--  net/ipv4/tcp_output.c | 146
-rw-r--r--  net/ipv4/tcp_probe.c | 1
-rw-r--r--  net/ipv4/tcp_timer.c | 71
-rw-r--r--  net/ipv4/tunnel4.c | 3
-rw-r--r--  net/ipv4/udp.c | 130
-rw-r--r--  net/ipv4/udplite.c | 7
-rw-r--r--  net/ipv4/xfrm4_input.c | 8
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 1
-rw-r--r--  net/ipv4/xfrm4_output.c | 2
-rw-r--r--  net/ipv4/xfrm4_policy.c | 33
-rw-r--r--  net/ipv4/xfrm4_state.c | 33
99 files changed, 4213 insertions, 3533 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0c94a1ac2946..7cd7760144f7 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
 	  rp_filter on use:
 
 	  echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
-	  and
+	  or
 	  echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
 
 	  Note that some distributions enable it in startup scripts.
@@ -217,6 +217,7 @@ config NET_IPIP
 
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
+	depends on IPV6 || IPV6=n
 	help
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -250,6 +251,20 @@ config IP_MROUTE
 	  <file:Documentation/networking/multicast.txt>. If you haven't heard
 	  about it, you don't need it.
 
+config IP_MROUTE_MULTIPLE_TABLES
+	bool "IP: multicast policy routing"
+	depends on IP_MROUTE && IP_ADVANCED_ROUTER
+	select FIB_RULES
+	help
+	  Normally, a multicast router runs a userspace daemon and decides
+	  what to do with a multicast packet based on the source and
+	  destination addresses. If you say Y here, the multicast router
+	  will also be able to take interfaces and packet marks into
+	  account and run multiple instances of userspace daemons
+	  simultaneously, each one handling a single table.
+
+	  If unsure, say N.
+
 config IP_PIMSM_V1
 	bool "IP: PIM-SM version 1 support"
 	depends on IP_MROUTE
@@ -289,7 +304,7 @@ config ARPD
 	  If unsure, say N.
 
 config SYN_COOKIES
-	bool "IP: TCP syncookie support (disabled per default)"
+	bool "IP: TCP syncookie support"
 	---help---
 	  Normal TCP/IP networking is open to an attack known as "SYN
 	  flooding". This denial-of-service attack prevents legitimate remote
@@ -314,13 +329,13 @@ config SYN_COOKIES
 	  server is really overloaded. If this happens frequently better turn
 	  them off.
 
-	  If you say Y here, note that SYN cookies aren't enabled by default;
-	  you can enable them by saying Y to "/proc file system support" and
+	  If you say Y here, you can disable SYN cookies at run time by
+	  saying Y to "/proc file system support" and
 	  "Sysctl support" below and executing the command
 
-	  echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+	  echo 0 > /proc/sys/net/ipv4/tcp_syncookies
 
-	  at boot time after the /proc file system has been mounted.
+	  after the /proc file system has been mounted.
 
 	  If unsure, say N.
 
@@ -398,7 +413,7 @@ config INET_XFRM_MODE_BEET
 	  If unsure, say Y.
 
 config INET_LRO
-	bool "Large Receive Offload (ipv4/tcp)"
+	tristate "Large Receive Offload (ipv4/tcp)"
 	default y
 	---help---
 	  Support for Large Receive Offload (ipv4/tcp).
@@ -587,9 +602,15 @@ choice
 	config DEFAULT_HTCP
 		bool "Htcp" if TCP_CONG_HTCP=y
 
+	config DEFAULT_HYBLA
+		bool "Hybla" if TCP_CONG_HYBLA=y
+
 	config DEFAULT_VEGAS
 		bool "Vegas" if TCP_CONG_VEGAS=y
 
+	config DEFAULT_VENO
+		bool "Veno" if TCP_CONG_VENO=y
+
 	config DEFAULT_WESTWOOD
 		bool "Westwood" if TCP_CONG_WESTWOOD=y
 
@@ -610,8 +631,10 @@ config DEFAULT_TCP_CONG
 	default "bic" if DEFAULT_BIC
 	default "cubic" if DEFAULT_CUBIC
 	default "htcp" if DEFAULT_HTCP
+	default "hybla" if DEFAULT_HYBLA
 	default "vegas" if DEFAULT_VEGAS
 	default "westwood" if DEFAULT_WESTWOOD
+	default "veno" if DEFAULT_VENO
 	default "reno" if DEFAULT_RENO
 	default "cubic"
 
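The two congestion-control entries added above only change the build-time default; the algorithm stays selectable per socket at run time through the TCP_CONGESTION socket option, and system-wide through /proc/sys/net/ipv4/tcp_congestion_control. A minimal user-space sketch, assuming the hybla module is built in or loadable on the running kernel:

/* Select TCP Hybla for one socket; setsockopt() fails if the
 * algorithm is unavailable or not permitted. Sketch only. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	char cur[16];
	socklen_t len = sizeof(cur);

	if (fd < 0)
		return 1;
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "hybla", 5) < 0)
		perror("TCP_CONGESTION");
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cur, &len) == 0)
		printf("congestion control: %s\n", cur);
	close(fd);
	return 0;
}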
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7d12c6a9b19b..6a1100c25a9f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -86,6 +86,7 @@
 #include <linux/poll.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -153,7 +154,7 @@ void inet_sock_destruct(struct sock *sk)
 	WARN_ON(sk->sk_forward_alloc);
 
 	kfree(inet->opt);
-	dst_release(sk->sk_dst_cache);
+	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
 	sk_refcnt_debug_dec(sk);
 }
 EXPORT_SYMBOL(inet_sock_destruct);
@@ -354,6 +355,8 @@ lookup_protocol:
 	inet = inet_sk(sk);
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
+	inet->nodefrag = 0;
+
 	if (SOCK_RAW == sock->type) {
 		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
@@ -418,6 +421,8 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
+		sock_rps_reset_flow(sk);
+
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
 
@@ -530,6 +535,8 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
 {
 	struct sock *sk = sock->sk;
 
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
@@ -543,7 +550,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
 {
 	DEFINE_WAIT(wait);
 
-	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 	/* Basic assumption: if someone sets sk->sk_err, he _must_
 	 * change state of the socket from TCP_SYN_*.
@@ -556,9 +563,9 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
 		lock_sock(sk);
 		if (signal_pending(current) || !timeo)
 			break;
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	}
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	return timeo;
 }
 
@@ -573,6 +580,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;
 
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
+
 	lock_sock(sk);
 
 	if (uaddr->sa_family == AF_UNSPEC) {
@@ -714,29 +724,51 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 {
 	struct sock *sk = sock->sk;
 
+	sock_rps_record_flow(sk);
+
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+	    inet_autobind(sk))
 		return -EAGAIN;
 
 	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
-
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
-			     size_t size, int flags)
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+		      size_t size, int flags)
 {
 	struct sock *sk = sock->sk;
 
+	sock_rps_record_flow(sk);
+
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+	    inet_autobind(sk))
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
 		return sk->sk_prot->sendpage(sk, page, offset, size, flags);
 	return sock_no_sendpage(sock, page, offset, size, flags);
 }
+EXPORT_SYMBOL(inet_sendpage);
 
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		 size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int addr_len = 0;
+	int err;
+
+	sock_rps_record_flow(sk);
+
+	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+				   flags & ~MSG_DONTWAIT, &addr_len);
+	if (err >= 0)
+		msg->msg_namelen = addr_len;
+	return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
 
 int inet_shutdown(struct socket *sock, int how)
 {
@@ -865,10 +897,10 @@ const struct proto_ops inet_stream_ops = {
 	.shutdown	   = inet_shutdown,
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
-	.sendmsg	   = tcp_sendmsg,
-	.recvmsg	   = sock_common_recvmsg,
+	.sendmsg	   = inet_sendmsg,
+	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
-	.sendpage	   = tcp_sendpage,
+	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
@@ -893,7 +925,7 @@ const struct proto_ops inet_dgram_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
-	.recvmsg	   = sock_common_recvmsg,
+	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
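The inet_sendmsg/inet_recvmsg wrappers wired in above exist mainly so that every AF_INET send and receive calls sock_rps_record_flow(), which feeds Receive Flow Steering. Recording alone steers nothing until RFS is sized from user space; a hedged sketch, where the device name eth0 and queue rx-0 are placeholders:

/* Enable RFS so that sock_rps_record_flow() has an effect. */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* global socket-flow table, shared by all devices */
	if (write_str("/proc/sys/net/core/rps_sock_flow_entries", "32768") < 0)
		perror("rps_sock_flow_entries");
	/* per-RX-queue share of the table */
	if (write_str("/sys/class/net/eth0/queues/rx-0/rps_flow_cnt", "2048") < 0)
		perror("rps_flow_cnt");
	return 0;
}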
@@ -923,7 +955,7 @@ static const struct proto_ops inet_sockraw_ops = {
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
-	.recvmsg	   = sock_common_recvmsg,
+	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
 #ifdef CONFIG_COMPAT
@@ -1073,7 +1105,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	if (err)
 		return err;
 
-	sk_setup_caps(sk, &rt->u.dst);
+	sk_setup_caps(sk, &rt->dst);
 
 	new_saddr = rt->rt_src;
 
@@ -1139,7 +1171,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
 	}
 	if (!err)
-		sk_setup_caps(sk, &rt->u.dst);
+		sk_setup_caps(sk, &rt->dst);
 	else {
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
@@ -1296,8 +1328,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
 		goto out_unlock;
 
-	id = ntohl(*(u32 *)&iph->id);
-	flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+	id = ntohl(*(__be32 *)&iph->id);
+	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
 	id >>= 16;
 
 	for (p = *head; p; p = p->next) {
@@ -1310,8 +1342,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 
 		if ((iph->protocol ^ iph2->protocol) |
 		    (iph->tos ^ iph2->tos) |
-		    (iph->saddr ^ iph2->saddr) |
-		    (iph->daddr ^ iph2->daddr)) {
+		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
@@ -1385,7 +1417,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
-unsigned long snmp_fold_field(void *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib[], int offt)
 {
 	unsigned long res = 0;
 	int i;
@@ -1398,13 +1430,49 @@ unsigned long snmp_fold_field(void *mib[], int offt)
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
-int snmp_mib_init(void *ptr[2], size_t mibsize)
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+	u64 res = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *bhptr, *userptr;
+		struct u64_stats_sync *syncp;
+		u64 v_bh, v_user;
+		unsigned int start;
+
+		/* first mib used by softirq context, we must use _bh() accessors */
+		bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin_bh(syncp);
+			v_bh = *(((u64 *) bhptr) + offt);
+		} while (u64_stats_fetch_retry_bh(syncp, start));
+
+		/* second mib used in USER context */
+		userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin(syncp);
+			v_user = *(((u64 *) userptr) + offt);
+		} while (u64_stats_fetch_retry(syncp, start));
+
+		res += v_bh + v_user;
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 {
 	BUG_ON(ptr == NULL);
-	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
+	ptr[0] = __alloc_percpu(mibsize, align);
 	if (!ptr[0])
 		goto err0;
-	ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
+	ptr[1] = __alloc_percpu(mibsize, align);
 	if (!ptr[1])
 		goto err1;
 	return 0;
@@ -1416,7 +1484,7 @@ err0:
 }
 EXPORT_SYMBOL_GPL(snmp_mib_init);
 
-void snmp_mib_free(void *ptr[2])
+void snmp_mib_free(void __percpu *ptr[2])
 {
 	BUG_ON(ptr == NULL);
 	free_percpu(ptr[0]);
@@ -1460,56 +1528,63 @@ static const struct net_protocol icmp_protocol = {
 
 static __net_init int ipv4_mib_init_net(struct net *net)
 {
-	if (snmp_mib_init((void **)net->mib.tcp_statistics,
-			  sizeof(struct tcp_mib)) < 0)
+	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
+			  sizeof(struct tcp_mib),
+			  __alignof__(struct tcp_mib)) < 0)
 		goto err_tcp_mib;
-	if (snmp_mib_init((void **)net->mib.ip_statistics,
-			  sizeof(struct ipstats_mib)) < 0)
+	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
-	if (snmp_mib_init((void **)net->mib.net_statistics,
-			  sizeof(struct linux_mib)) < 0)
+	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
+			  sizeof(struct linux_mib),
+			  __alignof__(struct linux_mib)) < 0)
 		goto err_net_mib;
-	if (snmp_mib_init((void **)net->mib.udp_statistics,
-			  sizeof(struct udp_mib)) < 0)
+	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		goto err_udp_mib;
-	if (snmp_mib_init((void **)net->mib.udplite_statistics,
-			  sizeof(struct udp_mib)) < 0)
+	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		goto err_udplite_mib;
-	if (snmp_mib_init((void **)net->mib.icmp_statistics,
-			  sizeof(struct icmp_mib)) < 0)
+	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
+			  sizeof(struct icmp_mib),
+			  __alignof__(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp_mib_init((void **)net->mib.icmpmsg_statistics,
-			  sizeof(struct icmpmsg_mib)) < 0)
+	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
+			  sizeof(struct icmpmsg_mib),
+			  __alignof__(struct icmpmsg_mib)) < 0)
 		goto err_icmpmsg_mib;
 
 	tcp_mib_init(net);
 	return 0;
 
 err_icmpmsg_mib:
-	snmp_mib_free((void **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
 err_icmp_mib:
-	snmp_mib_free((void **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
 err_udplite_mib:
-	snmp_mib_free((void **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
 err_udp_mib:
-	snmp_mib_free((void **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
 err_net_mib:
-	snmp_mib_free((void **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
 err_ip_mib:
-	snmp_mib_free((void **)net->mib.tcp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
 err_tcp_mib:
 	return -ENOMEM;
 }
 
 static __net_exit void ipv4_mib_exit_net(struct net *net)
 {
-	snmp_mib_free((void **)net->mib.icmpmsg_statistics);
-	snmp_mib_free((void **)net->mib.icmp_statistics);
-	snmp_mib_free((void **)net->mib.udplite_statistics);
-	snmp_mib_free((void **)net->mib.udp_statistics);
-	snmp_mib_free((void **)net->mib.net_statistics);
-	snmp_mib_free((void **)net->mib.ip_statistics);
-	snmp_mib_free((void **)net->mib.tcp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
 }
 
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1546,9 +1621,13 @@ static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		goto out;
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out;
+		goto out_free_reserved_ports;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1647,6 +1726,8 @@ out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
 
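On 32-bit kernels a u64 counter cannot be loaded atomically, which is why the snmp_fold_field64() added above retries each read around a u64_stats_sync sequence counter. A simplified user-space sketch of the same reader/writer pattern (illustrative only: the names are not kernel API, and a production seqlock needs stronger fencing than shown here):

#include <stdatomic.h>
#include <stdint.h>

struct stat64 {
	atomic_uint seq;	/* odd while a writer is mid-update */
	uint64_t val;		/* may be two 32-bit words on 32-bit CPUs */
};

static void stat_add(struct stat64 *s, uint64_t n)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel);
	s->val += n;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel);
}

static uint64_t stat_read(struct stat64 *s)
{
	unsigned int start;
	uint64_t v;

	do {	/* retry while a writer is active or raced past us */
		start = atomic_load_explicit(&s->seq, memory_order_acquire);
		v = s->val;
	} while ((start & 1) ||
		 start != atomic_load_explicit(&s->seq, memory_order_acquire));
	return v;
}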
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 7ed3e4ae93ae..880a5ec6dce0 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,6 +1,7 @@
 #include <crypto/hash.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
 #include <net/ah.h>
@@ -393,7 +394,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
-	x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
 		return;
 	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c95cd93acf29..96c1955b3e2f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -70,6 +70,7 @@
  *		bonding can change the skb before
  *		sending (e.g. insert 8021q tag).
  *	Harald Welte	:	convert to make use of jenkins hash
+ *	Jesper D. Brouer:	Proxy ARP PVLAN RFC 3069 support.
  */
 
 #include <linux/module.h>
@@ -97,6 +98,7 @@
 #include <linux/net.h>
 #include <linux/rcupdate.h>
 #include <linux/jhash.h>
+#include <linux/slab.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -114,6 +116,7 @@
 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
 #include <net/atmclip.h>
 struct neigh_table *clip_tbl_hook;
+EXPORT_SYMBOL(clip_tbl_hook);
 #endif
 
 #include <asm/system.h>
@@ -167,6 +170,7 @@ const struct neigh_ops arp_broken_ops = {
 	.hh_output =		dev_queue_xmit,
 	.queue_xmit =		dev_queue_xmit,
 };
+EXPORT_SYMBOL(arp_broken_ops);
 
 struct neigh_table arp_tbl = {
 	.family =	AF_INET,
@@ -196,6 +200,7 @@ struct neigh_table arp_tbl = {
 	.gc_thresh2 =	512,
 	.gc_thresh3 =	1024,
 };
+EXPORT_SYMBOL(arp_tbl);
 
 int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 {
@@ -331,11 +336,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	struct net_device *dev = neigh->dev;
 	__be32 target = *(__be32*)neigh->primary_key;
 	int probes = atomic_read(&neigh->probes);
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev;
 
-	if (!in_dev)
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
 		return;
-
+	}
 	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
 	default:
 	case 0:	/* By default announce any local IP */
@@ -356,9 +364,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	case 2:	/* Avoid secondary IPs, get a primary/preferred one */
 		break;
 	}
+	rcu_read_unlock();
 
-	if (in_dev)
-		in_dev_put(in_dev);
 	if (!saddr)
 		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
 
@@ -425,7 +432,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 
 	if (ip_route_output_key(net, &rt, &fl) < 0)
 		return 1;
-	if (rt->u.dst.dev != dev) {
+	if (rt->dst.dev != dev) {
 		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
 		flag = 1;
 	}
@@ -495,6 +502,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 	kfree_skb(skb);
 	return 1;
 }
+EXPORT_SYMBOL(arp_find);
 
 /* END OF OBSOLETE FUNCTIONS */
 
@@ -524,12 +532,15 @@ int arp_bind_neighbour(struct dst_entry *dst)
 /*
  *	Check if we can use proxy ARP for this path
  */
-
-static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+				struct net_device *dev, struct rtable *rt)
 {
 	struct in_device *out_dev;
 	int imi, omi = -1;
 
+	if (rt->dst.dev == dev)
+		return 0;
+
 	if (!IN_DEV_PROXY_ARP(in_dev))
 		return 0;
 
@@ -540,14 +551,51 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
 
 	/* place to check for proxy_arp for routes */
 
-	if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+	out_dev = __in_dev_get_rcu(rt->dst.dev);
+	if (out_dev)
 		omi = IN_DEV_MEDIUM_ID(out_dev);
-		in_dev_put(out_dev);
-	}
+
 	return (omi != imi && omi != -1);
 }
 
 /*
+ *	Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ *	RFC3069 supports proxy arp replies back to the same interface. This
+ *	is done to support (ethernet) switch features, like RFC 3069, where
+ *	the individual ports are not allowed to communicate with each
+ *	other, BUT they are allowed to talk to the upstream router. As
+ *	described in RFC 3069, it is possible to allow these hosts to
+ *	communicate through the upstream router, by proxy_arp'ing.
+ *
+ *	RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ *	This technology is known by different names:
+ *	  In RFC 3069 it is called VLAN Aggregation.
+ *	  Cisco and Allied Telesyn call it Private VLAN.
+ *	  Hewlett-Packard call it Source-Port filtering or port-isolation.
+ *	  Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+				struct net_device *dev, struct rtable *rt,
+				__be32 sip, __be32 tip)
+{
+	/* Private VLAN is only concerned about the same ethernet segment */
+	if (rt->dst.dev != dev)
+		return 0;
+
+	/* Don't reply on self probes (often done by windowz boxes)*/
+	if (sip == tip)
+		return 0;
+
+	if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
+		return 1;
+	else
+		return 0;
+}
+
+/*
  *	Interface to link layer: send routine and receive handler.
  */
 
@@ -619,13 +667,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 #endif
 #endif
 
-#ifdef CONFIG_FDDI
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
 	case ARPHRD_FDDI:
 		arp->ar_hrd = htons(ARPHRD_ETHER);
 		arp->ar_pro = htons(ETH_P_IP);
 		break;
 #endif
-#ifdef CONFIG_TR
+#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
 	case ARPHRD_IEEE802_TR:
 		arp->ar_hrd = htons(ARPHRD_IEEE802);
 		arp->ar_pro = htons(ETH_P_IP);
@@ -656,6 +704,7 @@ out:
 	kfree_skb(skb);
 	return NULL;
 }
+EXPORT_SYMBOL(arp_create);
 
 /*
661 * Send an arp packet. 710 * Send an arp packet.
@@ -665,6 +714,7 @@ void arp_xmit(struct sk_buff *skb)
665 /* Send it off, maybe filter it using firewalling first. */ 714 /* Send it off, maybe filter it using firewalling first. */
666 NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); 715 NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
667} 716}
717EXPORT_SYMBOL(arp_xmit);
668 718
669/* 719/*
670 * Create and send an arp packet. 720 * Create and send an arp packet.
@@ -691,6 +741,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
691 741
692 arp_xmit(skb); 742 arp_xmit(skb);
693} 743}
744EXPORT_SYMBOL(arp_send);
694 745
695/* 746/*
696 * Process an arp request. 747 * Process an arp request.
@@ -699,7 +750,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 static int arp_process(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
 	struct rtable *rt;
@@ -812,7 +863,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
@@ -833,8 +884,11 @@ static int arp_process(struct sk_buff *skb)
 		}
 		goto out;
 	} else if (IN_DEV_FORWARD(in_dev)) {
-		if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
-		    (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
+		if (addr_type == RTN_UNICAST &&
+		    (arp_fwd_proxy(in_dev, dev, rt) ||
+		     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+		     pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
+		{
 			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 			if (n)
 				neigh_release(n);
@@ -845,7 +899,6 @@ static int arp_process(struct sk_buff *skb)
 			arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
 		} else {
 			pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
-			in_dev_put(in_dev);
 			return 0;
 		}
 		goto out;
@@ -863,7 +916,8 @@ static int arp_process(struct sk_buff *skb)
 	   devices (strip is candidate)
 	 */
 	if (n == NULL &&
-	    arp->ar_op == htons(ARPOP_REPLY) &&
+	    (arp->ar_op == htons(ARPOP_REPLY) ||
+	     (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
 	    inet_addr_type(net, sip) == RTN_UNICAST)
 		n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
 	}
@@ -890,8 +944,6 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 out:
-	if (in_dev)
-		in_dev_put(in_dev);
 	consume_skb(skb);
 	return 0;
 }
@@ -999,13 +1051,13 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 		struct rtable * rt;
 		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
 			return err;
-		dev = rt->u.dst.dev;
+		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
 			return -EINVAL;
 	}
 	switch (dev->type) {
-#ifdef CONFIG_FDDI
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
 	case ARPHRD_FDDI:
 		/*
 		 * According to RFC 1390, FDDI devices should accept ARP
@@ -1106,7 +1158,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 		struct rtable * rt;
 		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
 			return err;
-		dev = rt->u.dst.dev;
+		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
 			return -EINVAL;
@@ -1239,8 +1291,7 @@ void __init arp_init(void)
 	dev_add_pack(&arp_packet_type);
 	arp_proc_init();
 #ifdef CONFIG_SYSCTL
-	neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
-			      NET_IPV4_NEIGH, "ipv4", NULL);
+	neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
 #endif
 	register_netdevice_notifier(&arp_netdev_notifier);
 }
@@ -1408,14 +1459,3 @@ static int __init arp_proc_init(void)
 }
 
 #endif /* CONFIG_PROC_FS */
-
-EXPORT_SYMBOL(arp_broken_ops);
-EXPORT_SYMBOL(arp_find);
-EXPORT_SYMBOL(arp_create);
-EXPORT_SYMBOL(arp_xmit);
-EXPORT_SYMBOL(arp_send);
-EXPORT_SYMBOL(arp_tbl);
-
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
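The arp_fwd_pvlan() helper added above is gated by a new per-device sysctl, registered as "proxy_arp_pvlan" in the devinet.c hunk further down. A short sketch that turns it on, where eth0 is only an example device:

/* Enable RFC 3069 proxy-ARP PVLAN on one interface. */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/ipv4/conf/eth0/proxy_arp_pvlan";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);	/* kernel without the knob, or no such device */
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}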
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 1e029dc75455..3a92a76ae41d 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -44,6 +44,7 @@
 #include <linux/string.h>
 #include <linux/jhash.h>
 #include <linux/audit.h>
+#include <linux/slab.h>
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -289,8 +290,6 @@ void cipso_v4_cache_invalidate(void)
 		cipso_v4_cache[iter].size = 0;
 		spin_unlock_bh(&cipso_v4_cache[iter].lock);
 	}
-
-	return;
 }
 
 /**
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index fb2465811b48..721a8a37b45c 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,16 +62,17 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}
 	if (!inet->inet_saddr)
 		inet->inet_saddr = rt->rt_src;	/* Update source address */
-	if (!inet->inet_rcv_saddr)
+	if (!inet->inet_rcv_saddr) {
 		inet->inet_rcv_saddr = rt->rt_src;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
+	}
 	inet->inet_daddr = rt->rt_dst;
 	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
 	inet->inet_id = jiffies;
 
-	sk_dst_set(sk, &rt->u.dst);
+	sk_dst_set(sk, &rt->dst);
 	return(0);
 }
-
 EXPORT_SYMBOL(ip4_datagram_connect);
-
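The rehash above matters for UDP sockets that connect() without first binding to a local address: the kernel fixes inet_rcv_saddr from the route and now also moves the socket to the matching hash chain. The address choice is visible from user space with getsockname(); in this sketch, 192.0.2.1 is a documentation address used only as an example:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst, local;
	socklen_t len = sizeof(local);
	char buf[INET_ADDRSTRLEN];

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(53);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	connect(fd, (struct sockaddr *)&dst, sizeof(dst)); /* no bind() first */
	getsockname(fd, (struct sockaddr *)&local, &len);
	printf("local address now %s\n",
	       inet_ntop(AF_INET, &local.sin_addr, buf, sizeof(buf)));
	close(fd);
	return 0;
}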
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 040c4f05b653..da14c49284f4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -50,6 +50,7 @@
 #include <linux/notifier.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
+#include <linux/slab.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -64,20 +65,20 @@
 
 static struct ipv4_devconf ipv4_devconf = {
 	.data = {
-		[NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
 	},
 };
 
 static struct ipv4_devconf ipv4_devconf_dflt = {
 	.data = {
-		[NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
-		[NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
-		[NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+		[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+		[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
 	},
 };
83 84
@@ -1080,6 +1081,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1080 } 1081 }
1081 ip_mc_up(in_dev); 1082 ip_mc_up(in_dev);
1082 /* fall through */ 1083 /* fall through */
1084 case NETDEV_NOTIFY_PEERS:
1083 case NETDEV_CHANGEADDR: 1085 case NETDEV_CHANGEADDR:
1084 /* Send gratuitous ARP to notify of link change */ 1086 /* Send gratuitous ARP to notify of link change */
1085 if (IN_DEV_ARP_NOTIFY(in_dev)) { 1087 if (IN_DEV_ARP_NOTIFY(in_dev)) {
@@ -1095,10 +1097,10 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 	case NETDEV_DOWN:
 		ip_mc_down(in_dev);
 		break;
-	case NETDEV_BONDING_OLDTYPE:
+	case NETDEV_PRE_TYPE_CHANGE:
 		ip_mc_unmap(in_dev);
 		break;
-	case NETDEV_BONDING_NEWTYPE:
+	case NETDEV_POST_TYPE_CHANGE:
 		ip_mc_remap(in_dev);
 		break;
 	case NETDEV_CHANGEMTU:
@@ -1194,7 +1196,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
 			if (idx < s_idx)
 				goto cont;
-			if (idx > s_idx)
+			if (h > s_h || idx > s_idx)
 				s_ip_idx = 0;
 			in_dev = __in_dev_get_rcu(dev);
 			if (!in_dev)
@@ -1317,14 +1319,19 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 {
 	int *valp = ctl->data;
 	int val = *valp;
+	loff_t pos = *ppos;
 	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
 
 	if (write && *valp != val) {
 		struct net *net = ctl->extra2;
 
 		if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
-			if (!rtnl_trylock())
+			if (!rtnl_trylock()) {
+				/* Restore the original values before restarting */
+				*valp = val;
+				*ppos = pos;
 				return restart_syscall();
+			}
 			if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
 				inet_forward_change(net);
 			} else if (*valp) {
@@ -1360,7 +1367,7 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
 	{ \
 		.procname	= name, \
 		.data		= ipv4_devconf.data + \
-				  NET_IPV4_CONF_ ## attr - 1, \
+				  IPV4_DEVCONF_ ## attr - 1, \
 		.maxlen		= sizeof(int), \
 		.mode		= mval, \
 		.proc_handler	= proc, \
@@ -1381,7 +1388,7 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
 
 static struct devinet_sysctl_table {
 	struct ctl_table_header *sysctl_header;
-	struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
+	struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
 	char *dev_name;
 } devinet_sysctl = {
 	.devinet_vars = {
@@ -1408,6 +1415,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
 		DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
 		DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+		DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -1486,8 +1494,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
 
 static void devinet_sysctl_register(struct in_device *idev)
 {
-	neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4,
-			      NET_IPV4_NEIGH, "ipv4", NULL);
+	neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
 	__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
 				  &idev->cnf);
 }
@@ -1502,7 +1509,7 @@ static struct ctl_table ctl_forward_entry[] = {
 	{
 		.procname	= "ip_forward",
 		.data		= &ipv4_devconf.data[
-					NET_IPV4_CONF_FORWARDING - 1],
+					IPV4_DEVCONF_FORWARDING - 1],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= devinet_sysctl_forward,
@@ -1546,7 +1553,7 @@ static __net_init int devinet_init_net(struct net *net)
 	if (tbl == NULL)
 		goto err_alloc_ctl;
 
-	tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1];
+	tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
 	tbl[0].extra1 = all;
 	tbl[0].extra2 = net;
 #endif
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1948895beb6d..14ca1f1c3fb0 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -422,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
-	x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
 	if (!x)
 		return;
 	NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 82dbf711d6d0..7d02a9f999fa 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -34,6 +34,7 @@
 #include <linux/skbuff.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/slab.h>
 
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -174,6 +175,7 @@ out:
 	fib_res_put(&res);
 	return dev;
 }
+EXPORT_SYMBOL(ip_dev_find);
 
 /*
  * Find address type as if only "dev" was present in the system. If
@@ -213,12 +215,14 @@ unsigned int inet_addr_type(struct net *net, __be32 addr)
 {
 	return __inet_dev_addr_type(net, NULL, addr);
 }
+EXPORT_SYMBOL(inet_addr_type);
 
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
 	return __inet_dev_addr_type(net, dev, addr);
 }
+EXPORT_SYMBOL(inet_dev_addr_type);
 
 /* Given (packet source, input interface) and optional (dst, oif, tos):
    - (main) check, that source is valid i.e. not broadcast or our local
@@ -242,6 +246,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 
 	struct fib_result res;
 	int no_addr, rpf, accept_local;
+	bool dev_match;
 	int ret;
 	struct net *net;
 
@@ -269,12 +274,22 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	}
 	*spec_dst = FIB_RES_PREFSRC(res);
 	fib_combine_itag(itag, &res);
+	dev_match = false;
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+		struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+		if (nh->nh_dev == dev) {
+			dev_match = true;
+			break;
+		}
+	}
 #else
 	if (FIB_RES_DEV(res) == dev)
+		dev_match = true;
 #endif
-	{
+	if (dev_match) {
 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 		fib_res_put(&res);
 		return ret;
@@ -283,7 +298,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	if (no_addr)
 		goto last_resort;
 	if (rpf == 1)
-		goto e_inval;
+		goto e_rpf;
 	fl.oif = dev->ifindex;
 
 	ret = 0;
@@ -298,7 +313,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
298 313
299last_resort: 314last_resort:
300 if (rpf) 315 if (rpf)
301 goto e_inval; 316 goto e_rpf;
302 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 317 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
303 *itag = 0; 318 *itag = 0;
304 return 0; 319 return 0;
@@ -307,6 +322,8 @@ e_inval_res:
307 fib_res_put(&res); 322 fib_res_put(&res);
308e_inval: 323e_inval:
309 return -EINVAL; 324 return -EINVAL;
325e_rpf:
326 return -EXDEV;
310} 327}
311 328
312static inline __be32 sk_extract_addr(struct sockaddr *addr) 329static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -883,7 +900,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
 }
 
-static int nl_fib_lookup_init(struct net *net)
+static int __net_init nl_fib_lookup_init(struct net *net)
 {
 	struct sock *sk;
 	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
@@ -1004,7 +1021,7 @@ fail:
 	return err;
 }
 
-static void __net_exit ip_fib_net_exit(struct net *net)
+static void ip_fib_net_exit(struct net *net)
 {
 	unsigned int i;
 
@@ -1074,7 +1091,3 @@ void __init ip_fib_init(void)
 
 	fib_hash_init();
 }
-
-EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(inet_dev_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 14972017b9c2..4ed7e0dea1bc 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -32,6 +32,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <net/net_namespace.h>
 #include <net/ip.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index ca2d07b1c706..76daeb5ff564 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,7 +213,6 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 {
 	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-	frh->family = AF_INET;
 	frh->dst_len = rule4->dst_len;
 	frh->src_len = rule4->src_len;
 	frh->tos = rule4->tos;
@@ -234,23 +233,6 @@ nla_put_failure:
 	return -ENOBUFS;
 }
 
-static u32 fib4_rule_default_pref(struct fib_rules_ops *ops)
-{
-	struct list_head *pos;
-	struct fib_rule *rule;
-
-	if (!list_empty(&ops->rules_list)) {
-		pos = ops->rules_list.next;
-		if (pos->next != &ops->rules_list) {
-			rule = list_entry(pos->next, struct fib_rule, list);
-			if (rule->pref)
-				return rule->pref - 1;
-		}
-	}
-
-	return 0;
-}
-
 static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 {
 	return nla_total_size(4) /* dst */
@@ -263,7 +245,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
263 rt_cache_flush(ops->fro_net, -1); 245 rt_cache_flush(ops->fro_net, -1);
264} 246}
265 247
266static struct fib_rules_ops fib4_rules_ops_template = { 248static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
267 .family = AF_INET, 249 .family = AF_INET,
268 .rule_size = sizeof(struct fib4_rule), 250 .rule_size = sizeof(struct fib4_rule),
269 .addr_size = sizeof(u32), 251 .addr_size = sizeof(u32),
@@ -272,7 +254,7 @@ static struct fib_rules_ops fib4_rules_ops_template = {
272 .configure = fib4_rule_configure, 254 .configure = fib4_rule_configure,
273 .compare = fib4_rule_compare, 255 .compare = fib4_rule_compare,
274 .fill = fib4_rule_fill, 256 .fill = fib4_rule_fill,
275 .default_pref = fib4_rule_default_pref, 257 .default_pref = fib_default_rule_pref,
276 .nlmsg_payload = fib4_rule_nlmsg_payload, 258 .nlmsg_payload = fib4_rule_nlmsg_payload,
277 .flush_cache = fib4_rule_flush_cache, 259 .flush_cache = fib4_rule_flush_cache,
278 .nlgroup = RTNLGRP_IPV4_RULE, 260 .nlgroup = RTNLGRP_IPV4_RULE,
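
The fib_rules.c hunks drop the per-family fib4_rule_default_pref() in favor of the shared fib_default_rule_pref() helper, and mark the template const __net_initdata since it is only copied at namespace setup. The removed function chose a preference one below that of the second rule in the list (the head slot is where a new rule is inserted), falling back to 0 otherwise. A standalone rendering of that removed logic over a plain linked list, on the assumption that the shared core helper behaves equivalently:

#include <stdio.h>

/* Stand-in for struct fib_rule; only the preference matters here. */
struct rule { unsigned int pref; struct rule *next; };

/* Mirrors the removed fib4_rule_default_pref(): look at the second
 * rule in the list and go one below its preference, else return 0.
 */
static unsigned int default_pref(const struct rule *head)
{
	if (head && head->next && head->next->pref)
		return head->next->pref - 1;
	return 0;
}

int main(void)
{
	struct rule main_tbl  = { 32766, NULL };
	struct rule local_tbl = { 0, &main_tbl };	/* the pref-0 rule sits at the head */

	/* A freshly added rule would land between them with pref 32765. */
	printf("default pref: %u\n", default_pref(&local_tbl));
	return 0;
}
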
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ed19aa6919c2..20f09c5b31e8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -32,6 +32,7 @@
32#include <linux/proc_fs.h> 32#include <linux/proc_fs.h>
33#include <linux/skbuff.h> 33#include <linux/skbuff.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/slab.h>
35 36
36#include <net/arp.h> 37#include <net/arp.h>
37#include <net/ip.h> 38#include <net/ip.h>
@@ -62,8 +63,8 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
62#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 63#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
63for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
64 65
65#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ 66#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
66for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) 67for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
67 68
68#else /* CONFIG_IP_ROUTE_MULTIPATH */ 69#else /* CONFIG_IP_ROUTE_MULTIPATH */
69 70
@@ -72,7 +73,7 @@ for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++,
72#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ 73#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
73for (nhsel=0; nhsel < 1; nhsel++) 74for (nhsel=0; nhsel < 1; nhsel++)
74 75
75#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \ 76#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
76for (nhsel=0; nhsel < 1; nhsel++) 77for (nhsel=0; nhsel < 1; nhsel++)
77 78
78#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 79#endif /* CONFIG_IP_ROUTE_MULTIPATH */
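
The rename of the change_nexthops() iterator from nh to nexthop_nh matters because the macro declares its loop variable directly in the caller's scope: inside the loop body, any mention of nh would silently bind to the iterator instead of a caller's own local of the same name. A compilable toy of the hazard the rename removes; the structs are simplified stand-ins, not the kernel types:

#include <stdio.h>

struct nh { int weight; };
struct fi { struct nh hops[2]; int nhs; };

/* Old shape: the iterator is named "nh", shadowing any caller local. */
#define change_nexthops_old(fi) { int nhsel; struct nh *nh; \
	for (nhsel = 0, nh = (fi)->hops; nhsel < (fi)->nhs; nh++, nhsel++)

/* Patched shape: a name no caller uses. */
#define change_nexthops(fi) { int nhsel; struct nh *nexthop_nh; \
	for (nhsel = 0, nexthop_nh = (fi)->hops; nhsel < (fi)->nhs; \
	     nexthop_nh++, nhsel++)

#define endfor_nexthops(fi) }

int main(void)
{
	struct fi f = { { { 1 }, { 2 } }, 2 };
	struct nh *nh = &f.hops[0];	/* the caller's own "nh" */

	change_nexthops(&f) {
		nexthop_nh->weight *= 10;	/* the iterator, not the caller's nh */
		/* With the old macro, "nh" here would be the iterator;
		 * now it still names the caller's pointer: */
		if (nh == &f.hops[0] && nhsel == 1)
			printf("body sees caller's nh while visiting hops[%d]\n", nhsel);
	} endfor_nexthops(&f);

	printf("caller's nh still points at hops[0]; weight %d\n", nh->weight);
	return 0;
}
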
@@ -145,9 +146,9 @@ void free_fib_info(struct fib_info *fi)
145 return; 146 return;
146 } 147 }
147 change_nexthops(fi) { 148 change_nexthops(fi) {
148 if (nh->nh_dev) 149 if (nexthop_nh->nh_dev)
149 dev_put(nh->nh_dev); 150 dev_put(nexthop_nh->nh_dev);
150 nh->nh_dev = NULL; 151 nexthop_nh->nh_dev = NULL;
151 } endfor_nexthops(fi); 152 } endfor_nexthops(fi);
152 fib_info_cnt--; 153 fib_info_cnt--;
153 release_net(fi->fib_net); 154 release_net(fi->fib_net);
@@ -162,9 +163,9 @@ void fib_release_info(struct fib_info *fi)
162 if (fi->fib_prefsrc) 163 if (fi->fib_prefsrc)
163 hlist_del(&fi->fib_lhash); 164 hlist_del(&fi->fib_lhash);
164 change_nexthops(fi) { 165 change_nexthops(fi) {
165 if (!nh->nh_dev) 166 if (!nexthop_nh->nh_dev)
166 continue; 167 continue;
167 hlist_del(&nh->nh_hash); 168 hlist_del(&nexthop_nh->nh_hash);
168 } endfor_nexthops(fi) 169 } endfor_nexthops(fi)
169 fi->fib_dead = 1; 170 fi->fib_dead = 1;
170 fib_info_put(fi); 171 fib_info_put(fi);
@@ -395,19 +396,20 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
395 if (!rtnh_ok(rtnh, remaining)) 396 if (!rtnh_ok(rtnh, remaining))
396 return -EINVAL; 397 return -EINVAL;
397 398
398 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; 399 nexthop_nh->nh_flags =
399 nh->nh_oif = rtnh->rtnh_ifindex; 400 (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
400 nh->nh_weight = rtnh->rtnh_hops + 1; 401 nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
402 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
401 403
402 attrlen = rtnh_attrlen(rtnh); 404 attrlen = rtnh_attrlen(rtnh);
403 if (attrlen > 0) { 405 if (attrlen > 0) {
404 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 406 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
405 407
406 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 408 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
407 nh->nh_gw = nla ? nla_get_be32(nla) : 0; 409 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
408#ifdef CONFIG_NET_CLS_ROUTE 410#ifdef CONFIG_NET_CLS_ROUTE
409 nla = nla_find(attrs, attrlen, RTA_FLOW); 411 nla = nla_find(attrs, attrlen, RTA_FLOW);
410 nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 412 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
411#endif 413#endif
412 } 414 }
413 415
@@ -527,10 +529,6 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
527 if (nh->nh_gw) { 529 if (nh->nh_gw) {
528 struct fib_result res; 530 struct fib_result res;
529 531
530#ifdef CONFIG_IP_ROUTE_PERVASIVE
531 if (nh->nh_flags&RTNH_F_PERVASIVE)
532 return 0;
533#endif
534 if (nh->nh_flags&RTNH_F_ONLINK) { 532 if (nh->nh_flags&RTNH_F_ONLINK) {
535 struct net_device *dev; 533 struct net_device *dev;
536 534
@@ -738,7 +736,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
738 736
739 fi->fib_nhs = nhs; 737 fi->fib_nhs = nhs;
740 change_nexthops(fi) { 738 change_nexthops(fi) {
741 nh->nh_parent = fi; 739 nexthop_nh->nh_parent = fi;
742 } endfor_nexthops(fi) 740 } endfor_nexthops(fi)
743 741
744 if (cfg->fc_mx) { 742 if (cfg->fc_mx) {
@@ -808,7 +806,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
808 goto failure; 806 goto failure;
809 } else { 807 } else {
810 change_nexthops(fi) { 808 change_nexthops(fi) {
811 if ((err = fib_check_nh(cfg, fi, nh)) != 0) 809 if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
812 goto failure; 810 goto failure;
813 } endfor_nexthops(fi) 811 } endfor_nexthops(fi)
814 } 812 }
@@ -843,11 +841,11 @@ link_it:
843 struct hlist_head *head; 841 struct hlist_head *head;
844 unsigned int hash; 842 unsigned int hash;
845 843
846 if (!nh->nh_dev) 844 if (!nexthop_nh->nh_dev)
847 continue; 845 continue;
848 hash = fib_devindex_hashfn(nh->nh_dev->ifindex); 846 hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
849 head = &fib_info_devhash[hash]; 847 head = &fib_info_devhash[hash];
850 hlist_add_head(&nh->nh_hash, head); 848 hlist_add_head(&nexthop_nh->nh_hash, head);
851 } endfor_nexthops(fi) 849 } endfor_nexthops(fi)
852 spin_unlock_bh(&fib_info_lock); 850 spin_unlock_bh(&fib_info_lock);
853 return fi; 851 return fi;
@@ -1080,21 +1078,21 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1080 prev_fi = fi; 1078 prev_fi = fi;
1081 dead = 0; 1079 dead = 0;
1082 change_nexthops(fi) { 1080 change_nexthops(fi) {
1083 if (nh->nh_flags&RTNH_F_DEAD) 1081 if (nexthop_nh->nh_flags&RTNH_F_DEAD)
1084 dead++; 1082 dead++;
1085 else if (nh->nh_dev == dev && 1083 else if (nexthop_nh->nh_dev == dev &&
1086 nh->nh_scope != scope) { 1084 nexthop_nh->nh_scope != scope) {
1087 nh->nh_flags |= RTNH_F_DEAD; 1085 nexthop_nh->nh_flags |= RTNH_F_DEAD;
1088#ifdef CONFIG_IP_ROUTE_MULTIPATH 1086#ifdef CONFIG_IP_ROUTE_MULTIPATH
1089 spin_lock_bh(&fib_multipath_lock); 1087 spin_lock_bh(&fib_multipath_lock);
1090 fi->fib_power -= nh->nh_power; 1088 fi->fib_power -= nexthop_nh->nh_power;
1091 nh->nh_power = 0; 1089 nexthop_nh->nh_power = 0;
1092 spin_unlock_bh(&fib_multipath_lock); 1090 spin_unlock_bh(&fib_multipath_lock);
1093#endif 1091#endif
1094 dead++; 1092 dead++;
1095 } 1093 }
1096#ifdef CONFIG_IP_ROUTE_MULTIPATH 1094#ifdef CONFIG_IP_ROUTE_MULTIPATH
1097 if (force > 1 && nh->nh_dev == dev) { 1095 if (force > 1 && nexthop_nh->nh_dev == dev) {
1098 dead = fi->fib_nhs; 1096 dead = fi->fib_nhs;
1099 break; 1097 break;
1100 } 1098 }
@@ -1144,18 +1142,20 @@ int fib_sync_up(struct net_device *dev)
1144 prev_fi = fi; 1142 prev_fi = fi;
1145 alive = 0; 1143 alive = 0;
1146 change_nexthops(fi) { 1144 change_nexthops(fi) {
1147 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1145 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1148 alive++; 1146 alive++;
1149 continue; 1147 continue;
1150 } 1148 }
1151 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) 1149 if (nexthop_nh->nh_dev == NULL ||
1150 !(nexthop_nh->nh_dev->flags&IFF_UP))
1152 continue; 1151 continue;
1153 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) 1152 if (nexthop_nh->nh_dev != dev ||
1153 !__in_dev_get_rtnl(dev))
1154 continue; 1154 continue;
1155 alive++; 1155 alive++;
1156 spin_lock_bh(&fib_multipath_lock); 1156 spin_lock_bh(&fib_multipath_lock);
1157 nh->nh_power = 0; 1157 nexthop_nh->nh_power = 0;
1158 nh->nh_flags &= ~RTNH_F_DEAD; 1158 nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
1159 spin_unlock_bh(&fib_multipath_lock); 1159 spin_unlock_bh(&fib_multipath_lock);
1160 } endfor_nexthops(fi) 1160 } endfor_nexthops(fi)
1161 1161
@@ -1182,9 +1182,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 if (fi->fib_power <= 0) { 1182 if (fi->fib_power <= 0) {
1183 int power = 0; 1183 int power = 0;
1184 change_nexthops(fi) { 1184 change_nexthops(fi) {
1185 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1185 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1186 power += nh->nh_weight; 1186 power += nexthop_nh->nh_weight;
1187 nh->nh_power = nh->nh_weight; 1187 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188 } 1188 }
1189 } endfor_nexthops(fi); 1189 } endfor_nexthops(fi);
1190 fi->fib_power = power; 1190 fi->fib_power = power;
@@ -1204,9 +1204,10 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1204 w = jiffies % fi->fib_power; 1204 w = jiffies % fi->fib_power;
1205 1205
1206 change_nexthops(fi) { 1206 change_nexthops(fi) {
1207 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { 1207 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
1208 if ((w -= nh->nh_power) <= 0) { 1208 nexthop_nh->nh_power) {
1209 nh->nh_power--; 1209 if ((w -= nexthop_nh->nh_power) <= 0) {
1210 nexthop_nh->nh_power--;
1210 fi->fib_power--; 1211 fi->fib_power--;
1211 res->nh_sel = nhsel; 1212 res->nh_sel = nhsel;
1212 spin_unlock_bh(&fib_multipath_lock); 1213 spin_unlock_bh(&fib_multipath_lock);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index af5d89792860..4a8e370862bc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -71,6 +71,7 @@
71#include <linux/netlink.h> 71#include <linux/netlink.h>
72#include <linux/init.h> 72#include <linux/init.h>
73#include <linux/list.h> 73#include <linux/list.h>
74#include <linux/slab.h>
74#include <net/net_namespace.h> 75#include <net/net_namespace.h>
75#include <net/ip.h> 76#include <net/ip.h>
76#include <net/protocol.h> 77#include <net/protocol.h>
@@ -185,7 +186,9 @@ static inline struct tnode *node_parent_rcu(struct node *node)
185{ 186{
186 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
187 188
188 return rcu_dereference(ret); 189 return rcu_dereference_check(ret,
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
189} 192}
190 193
191/* Same as rcu_assign_pointer 194/* Same as rcu_assign_pointer
@@ -208,7 +211,9 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
208{ 211{
209 struct node *ret = tnode_get_child(tn, i); 212 struct node *ret = tnode_get_child(tn, i);
210 213
211 return rcu_dereference(ret); 214 return rcu_dereference_check(ret,
215 rcu_read_lock_held() ||
216 lockdep_rtnl_is_held());
212} 217}
213 218
214static inline int tnode_child_length(const struct tnode *tn) 219static inline int tnode_child_length(const struct tnode *tn)
@@ -961,7 +966,9 @@ fib_find_node(struct trie *t, u32 key)
961 struct node *n; 966 struct node *n;
962 967
963 pos = 0; 968 pos = 0;
964 n = rcu_dereference(t->trie); 969 n = rcu_dereference_check(t->trie,
970 rcu_read_lock_held() ||
971 lockdep_rtnl_is_held());
965 972
966 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 973 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
967 tn = (struct tnode *) n; 974 tn = (struct tnode *) n;
@@ -1017,8 +1024,6 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1017 1024
1018 rcu_assign_pointer(t->trie, (struct node *)tn); 1025 rcu_assign_pointer(t->trie, (struct node *)tn);
1019 tnode_free_flush(); 1026 tnode_free_flush();
1020
1021 return;
1022} 1027}
1023 1028
1024/* only used from updater-side */ 1029/* only used from updater-side */
@@ -1750,7 +1755,9 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1750 1755
1751static struct leaf *trie_firstleaf(struct trie *t) 1756static struct leaf *trie_firstleaf(struct trie *t)
1752{ 1757{
1753 struct tnode *n = (struct tnode *) rcu_dereference(t->trie); 1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
1754 1761
1755 if (!n) 1762 if (!n)
1756 return NULL; 1763 return NULL;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index fe11f60ce41b..a0d847c7cba5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -74,6 +74,7 @@
74#include <linux/netdevice.h> 74#include <linux/netdevice.h>
75#include <linux/string.h> 75#include <linux/string.h>
76#include <linux/netfilter_ipv4.h> 76#include <linux/netfilter_ipv4.h>
77#include <linux/slab.h>
77#include <net/snmp.h> 78#include <net/snmp.h>
78#include <net/ip.h> 79#include <net/ip.h>
79#include <net/route.h> 80#include <net/route.h>
@@ -114,7 +115,7 @@ struct icmp_bxm {
114/* An array of errno for error messages from dest unreach. */ 115/* An array of errno for error messages from dest unreach. */
115/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ 116/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
116 117
117struct icmp_err icmp_err_convert[] = { 118const struct icmp_err icmp_err_convert[] = {
118 { 119 {
119 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ 120 .errno = ENETUNREACH, /* ICMP_NET_UNREACH */
120 .fatal = 0, 121 .fatal = 0,
@@ -180,6 +181,7 @@ struct icmp_err icmp_err_convert[] = {
180 .fatal = 1, 181 .fatal = 1,
181 }, 182 },
182}; 183};
184EXPORT_SYMBOL(icmp_err_convert);
183 185
184/* 186/*
185 * ICMP control array. This specifies what to do with each ICMP. 187 * ICMP control array. This specifies what to do with each ICMP.
@@ -266,11 +268,12 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
266 dst->rate_tokens = token; 268 dst->rate_tokens = token;
267 return rc; 269 return rc;
268} 270}
271EXPORT_SYMBOL(xrlim_allow);
269 272
270static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, 273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
271 int type, int code) 274 int type, int code)
272{ 275{
273 struct dst_entry *dst = &rt->u.dst; 276 struct dst_entry *dst = &rt->dst;
274 int rc = 1; 277 int rc = 1;
275 278
276 if (type > NR_ICMP_TYPES) 279 if (type > NR_ICMP_TYPES)
@@ -326,13 +329,14 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
326 struct sock *sk; 329 struct sock *sk;
327 struct sk_buff *skb; 330 struct sk_buff *skb;
328 331
329 sk = icmp_sk(dev_net((*rt)->u.dst.dev)); 332 sk = icmp_sk(dev_net((*rt)->dst.dev));
330 if (ip_append_data(sk, icmp_glue_bits, icmp_param, 333 if (ip_append_data(sk, icmp_glue_bits, icmp_param,
331 icmp_param->data_len+icmp_param->head_len, 334 icmp_param->data_len+icmp_param->head_len,
332 icmp_param->head_len, 335 icmp_param->head_len,
333 ipc, rt, MSG_DONTWAIT) < 0) 336 ipc, rt, MSG_DONTWAIT) < 0) {
337 ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
334 ip_flush_pending_frames(sk); 338 ip_flush_pending_frames(sk);
335 else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 339 } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
336 struct icmphdr *icmph = icmp_hdr(skb); 340 struct icmphdr *icmph = icmp_hdr(skb);
337 __wsum csum = 0; 341 __wsum csum = 0;
338 struct sk_buff *skb1; 342 struct sk_buff *skb1;
@@ -357,7 +361,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
357{ 361{
358 struct ipcm_cookie ipc; 362 struct ipcm_cookie ipc;
359 struct rtable *rt = skb_rtable(skb); 363 struct rtable *rt = skb_rtable(skb);
360 struct net *net = dev_net(rt->u.dst.dev); 364 struct net *net = dev_net(rt->dst.dev);
361 struct sock *sk; 365 struct sock *sk;
362 struct inet_sock *inet; 366 struct inet_sock *inet;
363 __be32 daddr; 367 __be32 daddr;
@@ -425,7 +429,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
425 429
426 if (!rt) 430 if (!rt)
427 goto out; 431 goto out;
428 net = dev_net(rt->u.dst.dev); 432 net = dev_net(rt->dst.dev);
429 433
430 /* 434 /*
431 * Find the original header. It is expected to be valid, of course. 435 * Find the original header. It is expected to be valid, of course.
@@ -585,20 +589,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
585 err = __ip_route_output_key(net, &rt2, &fl); 589 err = __ip_route_output_key(net, &rt2, &fl);
586 else { 590 else {
587 struct flowi fl2 = {}; 591 struct flowi fl2 = {};
588 struct dst_entry *odst; 592 unsigned long orefdst;
589 593
590 fl2.fl4_dst = fl.fl4_src; 594 fl2.fl4_dst = fl.fl4_src;
591 if (ip_route_output_key(net, &rt2, &fl2)) 595 if (ip_route_output_key(net, &rt2, &fl2))
592 goto relookup_failed; 596 goto relookup_failed;
593 597
594 /* Ugh! */ 598 /* Ugh! */
595 odst = skb_dst(skb_in); 599 orefdst = skb_in->_skb_refdst; /* save old refdst */
596 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, 600 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
597 RT_TOS(tos), rt2->u.dst.dev); 601 RT_TOS(tos), rt2->dst.dev);
598 602
599 dst_release(&rt2->u.dst); 603 dst_release(&rt2->dst);
600 rt2 = skb_rtable(skb_in); 604 rt2 = skb_rtable(skb_in);
601 skb_dst_set(skb_in, odst); 605 skb_in->_skb_refdst = orefdst; /* restore old refdst */
602 } 606 }
603 607
604 if (err) 608 if (err)
@@ -608,7 +612,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
608 XFRM_LOOKUP_ICMP); 612 XFRM_LOOKUP_ICMP);
609 switch (err) { 613 switch (err) {
610 case 0: 614 case 0:
611 dst_release(&rt->u.dst); 615 dst_release(&rt->dst);
612 rt = rt2; 616 rt = rt2;
613 break; 617 break;
614 case -EPERM: 618 case -EPERM:
@@ -627,7 +631,7 @@ route_done:
627 631
628 /* RFC says return as much as we can without exceeding 576 bytes. */ 632 /* RFC says return as much as we can without exceeding 576 bytes. */
629 633
630 room = dst_mtu(&rt->u.dst); 634 room = dst_mtu(&rt->dst);
631 if (room > 576) 635 if (room > 576)
632 room = 576; 636 room = 576;
633 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; 637 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
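
Worked numbers for the clamp above, assuming the reply carries no IP options (replyopts.optlen == 0): room starts at min(path MTU, 576) = 576, loses the 20-byte IPv4 header on this line and, just past this hunk in the unchanged code, a further 8 bytes for the ICMP header, leaving at most 548 bytes of the offending datagram to quote back under RFC 1122's 576-byte ceiling.
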
@@ -645,6 +649,7 @@ out_unlock:
645 icmp_xmit_unlock(sk); 649 icmp_xmit_unlock(sk);
646out:; 650out:;
647} 651}
652EXPORT_SYMBOL(icmp_send);
648 653
649 654
650/* 655/*
@@ -923,6 +928,7 @@ static void icmp_address(struct sk_buff *skb)
923/* 928/*
924 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain 929 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
925 * loudly if an inconsistency is found. 930 * loudly if an inconsistency is found.
931 * called with rcu_read_lock()
926 */ 932 */
927 933
928static void icmp_address_reply(struct sk_buff *skb) 934static void icmp_address_reply(struct sk_buff *skb)
@@ -933,12 +939,12 @@ static void icmp_address_reply(struct sk_buff *skb)
933 struct in_ifaddr *ifa; 939 struct in_ifaddr *ifa;
934 940
935 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) 941 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
936 goto out; 942 return;
937 943
938 in_dev = in_dev_get(dev); 944 in_dev = __in_dev_get_rcu(dev);
939 if (!in_dev) 945 if (!in_dev)
940 goto out; 946 return;
941 rcu_read_lock(); 947
942 if (in_dev->ifa_list && 948 if (in_dev->ifa_list &&
943 IN_DEV_LOG_MARTIANS(in_dev) && 949 IN_DEV_LOG_MARTIANS(in_dev) &&
944 IN_DEV_FORWARD(in_dev)) { 950 IN_DEV_FORWARD(in_dev)) {
@@ -956,9 +962,6 @@ static void icmp_address_reply(struct sk_buff *skb)
956 mp, dev->name, &rt->rt_src); 962 mp, dev->name, &rt->rt_src);
957 } 963 }
958 } 964 }
959 rcu_read_unlock();
960 in_dev_put(in_dev);
961out:;
962} 965}
963 966
964static void icmp_discard(struct sk_buff *skb) 967static void icmp_discard(struct sk_buff *skb)
@@ -972,7 +975,7 @@ int icmp_rcv(struct sk_buff *skb)
972{ 975{
973 struct icmphdr *icmph; 976 struct icmphdr *icmph;
974 struct rtable *rt = skb_rtable(skb); 977 struct rtable *rt = skb_rtable(skb);
975 struct net *net = dev_net(rt->u.dst.dev); 978 struct net *net = dev_net(rt->dst.dev);
976 979
977 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 980 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
978 struct sec_path *sp = skb_sec_path(skb); 981 struct sec_path *sp = skb_sec_path(skb);
@@ -1214,7 +1217,3 @@ int __init icmp_init(void)
1214{ 1217{
1215 return register_pernet_subsys(&icmp_sk_ops); 1218 return register_pernet_subsys(&icmp_sk_ops);
1216} 1219}
1217
1218EXPORT_SYMBOL(icmp_err_convert);
1219EXPORT_SYMBOL(icmp_send);
1220EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 76c08402c933..2a4bb76f2132 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -71,6 +71,7 @@
71 */ 71 */
72 72
73#include <linux/module.h> 73#include <linux/module.h>
74#include <linux/slab.h>
74#include <asm/uaccess.h> 75#include <asm/uaccess.h>
75#include <asm/system.h> 76#include <asm/system.h>
76#include <linux/types.h> 77#include <linux/types.h>
@@ -311,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
311 return NULL; 312 return NULL;
312 } 313 }
313 314
314 skb_dst_set(skb, &rt->u.dst); 315 skb_dst_set(skb, &rt->dst);
315 skb->dev = dev; 316 skb->dev = dev;
316 317
317 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 318 skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -329,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
329 pip->saddr = rt->rt_src; 330 pip->saddr = rt->rt_src;
330 pip->protocol = IPPROTO_IGMP; 331 pip->protocol = IPPROTO_IGMP;
331 pip->tot_len = 0; /* filled in later */ 332 pip->tot_len = 0; /* filled in later */
332 ip_select_ident(pip, &rt->u.dst, NULL); 333 ip_select_ident(pip, &rt->dst, NULL);
333 ((u8*)&pip[1])[0] = IPOPT_RA; 334 ((u8*)&pip[1])[0] = IPOPT_RA;
334 ((u8*)&pip[1])[1] = 4; 335 ((u8*)&pip[1])[1] = 4;
335 ((u8*)&pip[1])[2] = 0; 336 ((u8*)&pip[1])[2] = 0;
@@ -659,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
659 return -1; 660 return -1;
660 } 661 }
661 662
662 skb_dst_set(skb, &rt->u.dst); 663 skb_dst_set(skb, &rt->dst);
663 664
664 skb_reserve(skb, LL_RESERVED_SPACE(dev)); 665 skb_reserve(skb, LL_RESERVED_SPACE(dev));
665 666
@@ -675,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
675 iph->daddr = dst; 676 iph->daddr = dst;
676 iph->saddr = rt->rt_src; 677 iph->saddr = rt->rt_src;
677 iph->protocol = IPPROTO_IGMP; 678 iph->protocol = IPPROTO_IGMP;
678 ip_select_ident(iph, &rt->u.dst, NULL); 679 ip_select_ident(iph, &rt->dst, NULL);
679 ((u8*)&iph[1])[0] = IPOPT_RA; 680 ((u8*)&iph[1])[0] = IPOPT_RA;
680 ((u8*)&iph[1])[1] = 4; 681 ((u8*)&iph[1])[1] = 4;
681 ((u8*)&iph[1])[2] = 0; 682 ((u8*)&iph[1])[2] = 0;
@@ -855,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
855 igmpv3_clear_delrec(in_dev); 856 igmpv3_clear_delrec(in_dev);
856 } else if (len < 12) { 857 } else if (len < 12) {
857 return; /* ignore bogus packet; freed by caller */ 858 return; /* ignore bogus packet; freed by caller */
859 } else if (IGMP_V1_SEEN(in_dev)) {
860 /* This is a v3 query with v1 queriers present */
861 max_delay = IGMP_Query_Response_Interval;
862 group = 0;
863 } else if (IGMP_V2_SEEN(in_dev)) {
864 /* this is a v3 query with v2 queriers present;
865 * Interpretation of the max_delay code is problematic here.
866 * A real v2 host would use ih_code directly, while v3 has a
867 * different encoding. We use the v3 encoding as more likely
868 * to be intended in a v3 query.
869 */
870 max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
858 } else { /* v3 */ 871 } else { /* v3 */
859 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 872 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
860 return; 873 return;
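
A worked instance of the added IGMP_V2_SEEN() branch, assuming HZ=100 (IGMP_TIMER_SCALE is 10 because the wire field counts tenths of a second): a Max Resp Code of 100 is below 128, so IGMPV3_MRC() decodes it linearly to 10.0 seconds and max_delay becomes 100 * (100/10) = 1000 jiffies. Codes of 128 and above instead use RFC 3376's mantissa/exponent encoding, which is exactly the v2-versus-v3 ambiguity the new comment is flagging.
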
@@ -915,18 +928,19 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
915 read_unlock(&in_dev->mc_list_lock); 928 read_unlock(&in_dev->mc_list_lock);
916} 929}
917 930
931/* called in rcu_read_lock() section */
918int igmp_rcv(struct sk_buff *skb) 932int igmp_rcv(struct sk_buff *skb)
919{ 933{
920 /* This basically follows the spec line by line -- see RFC1112 */ 934 /* This basically follows the spec line by line -- see RFC1112 */
921 struct igmphdr *ih; 935 struct igmphdr *ih;
922 struct in_device *in_dev = in_dev_get(skb->dev); 936 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
923 int len = skb->len; 937 int len = skb->len;
924 938
925 if (in_dev == NULL) 939 if (in_dev == NULL)
926 goto drop; 940 goto drop;
927 941
928 if (!pskb_may_pull(skb, sizeof(struct igmphdr))) 942 if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
929 goto drop_ref; 943 goto drop;
930 944
931 switch (skb->ip_summed) { 945 switch (skb->ip_summed) {
932 case CHECKSUM_COMPLETE: 946 case CHECKSUM_COMPLETE:
@@ -936,7 +950,7 @@ int igmp_rcv(struct sk_buff *skb)
936 case CHECKSUM_NONE: 950 case CHECKSUM_NONE:
937 skb->csum = 0; 951 skb->csum = 0;
938 if (__skb_checksum_complete(skb)) 952 if (__skb_checksum_complete(skb))
939 goto drop_ref; 953 goto drop;
940 } 954 }
941 955
942 ih = igmp_hdr(skb); 956 ih = igmp_hdr(skb);
@@ -946,7 +960,6 @@ int igmp_rcv(struct sk_buff *skb)
946 break; 960 break;
947 case IGMP_HOST_MEMBERSHIP_REPORT: 961 case IGMP_HOST_MEMBERSHIP_REPORT:
948 case IGMPV2_HOST_MEMBERSHIP_REPORT: 962 case IGMPV2_HOST_MEMBERSHIP_REPORT:
949 case IGMPV3_HOST_MEMBERSHIP_REPORT:
950 /* Is it our report looped back? */ 963 /* Is it our report looped back? */
951 if (skb_rtable(skb)->fl.iif == 0) 964 if (skb_rtable(skb)->fl.iif == 0)
952 break; 965 break;
@@ -957,9 +970,9 @@ int igmp_rcv(struct sk_buff *skb)
957 break; 970 break;
958 case IGMP_PIM: 971 case IGMP_PIM:
959#ifdef CONFIG_IP_PIMSM_V1 972#ifdef CONFIG_IP_PIMSM_V1
960 in_dev_put(in_dev);
961 return pim_rcv_v1(skb); 973 return pim_rcv_v1(skb);
962#endif 974#endif
975 case IGMPV3_HOST_MEMBERSHIP_REPORT:
963 case IGMP_DVMRP: 976 case IGMP_DVMRP:
964 case IGMP_TRACE: 977 case IGMP_TRACE:
965 case IGMP_HOST_LEAVE_MESSAGE: 978 case IGMP_HOST_LEAVE_MESSAGE:
@@ -970,8 +983,6 @@ int igmp_rcv(struct sk_buff *skb)
970 break; 983 break;
971 } 984 }
972 985
973drop_ref:
974 in_dev_put(in_dev);
975drop: 986drop:
976 kfree_skb(skb); 987 kfree_skb(skb);
977 return 0; 988 return 0;
@@ -997,7 +1008,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
997 --ANK 1008 --ANK
998 */ 1009 */
999 if (arp_mc_map(addr, buf, dev, 0) == 0) 1010 if (arp_mc_map(addr, buf, dev, 0) == 0)
1000 dev_mc_add(dev, buf, dev->addr_len, 0); 1011 dev_mc_add(dev, buf);
1001} 1012}
1002 1013
1003/* 1014/*
@@ -1010,7 +1021,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
1010 struct net_device *dev = in_dev->dev; 1021 struct net_device *dev = in_dev->dev;
1011 1022
1012 if (arp_mc_map(addr, buf, dev, 0) == 0) 1023 if (arp_mc_map(addr, buf, dev, 0) == 0)
1013 dev_mc_delete(dev, buf, dev->addr_len, 0); 1024 dev_mc_del(dev, buf);
1014} 1025}
1015 1026
1016#ifdef CONFIG_IP_MULTICAST 1027#ifdef CONFIG_IP_MULTICAST
@@ -1245,6 +1256,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1245out: 1256out:
1246 return; 1257 return;
1247} 1258}
1259EXPORT_SYMBOL(ip_mc_inc_group);
1248 1260
1249/* 1261/*
1250 * Resend IGMP JOIN report; used for bonding. 1262 * Resend IGMP JOIN report; used for bonding.
@@ -1267,6 +1279,7 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
1267 igmp_ifc_event(in_dev); 1279 igmp_ifc_event(in_dev);
1268#endif 1280#endif
1269} 1281}
1282EXPORT_SYMBOL(ip_mc_rejoin_group);
1270 1283
1271/* 1284/*
1272 * A socket has left a multicast group on device dev 1285 * A socket has left a multicast group on device dev
@@ -1297,6 +1310,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
1297 } 1310 }
1298 } 1311 }
1299} 1312}
1313EXPORT_SYMBOL(ip_mc_dec_group);
1300 1314
1301/* Device changing type */ 1315/* Device changing type */
1302 1316
@@ -1426,7 +1440,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1426 } 1440 }
1427 1441
1428 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1442 if (!dev && !ip_route_output_key(net, &rt, &fl)) {
1429 dev = rt->u.dst.dev; 1443 dev = rt->dst.dev;
1430 ip_rt_put(rt); 1444 ip_rt_put(rt);
1431 } 1445 }
1432 if (dev) { 1446 if (dev) {
@@ -1645,8 +1659,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
1645 if (dpsf->sf_inaddr == psf->sf_inaddr) 1659 if (dpsf->sf_inaddr == psf->sf_inaddr)
1646 break; 1660 break;
1647 if (!dpsf) { 1661 if (!dpsf) {
1648 dpsf = (struct ip_sf_list *) 1662 dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
1649 kmalloc(sizeof(*dpsf), GFP_ATOMIC);
1650 if (!dpsf) 1663 if (!dpsf)
1651 continue; 1664 continue;
1652 *dpsf = *psf; 1665 *dpsf = *psf;
@@ -1799,32 +1812,55 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1799 iml->next = inet->mc_list; 1812 iml->next = inet->mc_list;
1800 iml->sflist = NULL; 1813 iml->sflist = NULL;
1801 iml->sfmode = MCAST_EXCLUDE; 1814 iml->sfmode = MCAST_EXCLUDE;
1802 inet->mc_list = iml; 1815 rcu_assign_pointer(inet->mc_list, iml);
1803 ip_mc_inc_group(in_dev, addr); 1816 ip_mc_inc_group(in_dev, addr);
1804 err = 0; 1817 err = 0;
1805done: 1818done:
1806 rtnl_unlock(); 1819 rtnl_unlock();
1807 return err; 1820 return err;
1808} 1821}
1822EXPORT_SYMBOL(ip_mc_join_group);
1823
1824static void ip_sf_socklist_reclaim(struct rcu_head *rp)
1825{
1826 struct ip_sf_socklist *psf;
1827
1828 psf = container_of(rp, struct ip_sf_socklist, rcu);
1829 /* sk_omem_alloc should have been decreased by the caller*/
1830 kfree(psf);
1831}
1809 1832
1810static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, 1833static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1811 struct in_device *in_dev) 1834 struct in_device *in_dev)
1812{ 1835{
1836 struct ip_sf_socklist *psf = iml->sflist;
1813 int err; 1837 int err;
1814 1838
1815 if (iml->sflist == NULL) { 1839 if (psf == NULL) {
1816 /* any-source empty exclude case */ 1840 /* any-source empty exclude case */
1817 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1841 return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1818 iml->sfmode, 0, NULL, 0); 1842 iml->sfmode, 0, NULL, 0);
1819 } 1843 }
1820 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1844 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1821 iml->sfmode, iml->sflist->sl_count, 1845 iml->sfmode, psf->sl_count, psf->sl_addr, 0);
1822 iml->sflist->sl_addr, 0); 1846 rcu_assign_pointer(iml->sflist, NULL);
1823 sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); 1847 /* decrease mem now to avoid the memleak warning */
1824 iml->sflist = NULL; 1848 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
1849 call_rcu(&psf->rcu, ip_sf_socklist_reclaim);
1825 return err; 1850 return err;
1826} 1851}
1827 1852
1853
1854static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1855{
1856 struct ip_mc_socklist *iml;
1857
1858 iml = container_of(rp, struct ip_mc_socklist, rcu);
1859 /* sk_omem_alloc should have been decreased by the caller*/
1860 kfree(iml);
1861}
1862
1863
1828/* 1864/*
1829 * Ask a socket to leave a group. 1865 * Ask a socket to leave a group.
1830 */ 1866 */
@@ -1854,12 +1890,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1854 1890
1855 (void) ip_mc_leave_src(sk, iml, in_dev); 1891 (void) ip_mc_leave_src(sk, iml, in_dev);
1856 1892
1857 *imlp = iml->next; 1893 rcu_assign_pointer(*imlp, iml->next);
1858 1894
1859 if (in_dev) 1895 if (in_dev)
1860 ip_mc_dec_group(in_dev, group); 1896 ip_mc_dec_group(in_dev, group);
1861 rtnl_unlock(); 1897 rtnl_unlock();
1862 sock_kfree_s(sk, iml, sizeof(*iml)); 1898 /* decrease mem now to avoid the memleak warning */
1899 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
1900 call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
1863 return 0; 1901 return 0;
1864 } 1902 }
1865 if (!in_dev) 1903 if (!in_dev)
@@ -1974,9 +2012,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1974 if (psl) { 2012 if (psl) {
1975 for (i=0; i<psl->sl_count; i++) 2013 for (i=0; i<psl->sl_count; i++)
1976 newpsl->sl_addr[i] = psl->sl_addr[i]; 2014 newpsl->sl_addr[i] = psl->sl_addr[i];
1977 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); 2015 /* decrease mem now to avoid the memleak warning */
2016 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2017 call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
1978 } 2018 }
1979 pmc->sflist = psl = newpsl; 2019 rcu_assign_pointer(pmc->sflist, newpsl);
2020 psl = newpsl;
1980 } 2021 }
1981 rv = 1; /* > 0 for insert logic below if sl_count is 0 */ 2022 rv = 1; /* > 0 for insert logic below if sl_count is 0 */
1982 for (i=0; i<psl->sl_count; i++) { 2023 for (i=0; i<psl->sl_count; i++) {
@@ -2072,11 +2113,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2072 if (psl) { 2113 if (psl) {
2073 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2114 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2074 psl->sl_count, psl->sl_addr, 0); 2115 psl->sl_count, psl->sl_addr, 0);
2075 sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); 2116 /* decrease mem now to avoid the memleak warning */
2117 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2118 call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
2076 } else 2119 } else
2077 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2120 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2078 0, NULL, 0); 2121 0, NULL, 0);
2079 pmc->sflist = newpsl; 2122 rcu_assign_pointer(pmc->sflist, newpsl);
2080 pmc->sfmode = msf->imsf_fmode; 2123 pmc->sfmode = msf->imsf_fmode;
2081 err = 0; 2124 err = 0;
2082done: 2125done:
@@ -2209,30 +2252,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2209 struct ip_mc_socklist *pmc; 2252 struct ip_mc_socklist *pmc;
2210 struct ip_sf_socklist *psl; 2253 struct ip_sf_socklist *psl;
2211 int i; 2254 int i;
2255 int ret;
2212 2256
2257 ret = 1;
2213 if (!ipv4_is_multicast(loc_addr)) 2258 if (!ipv4_is_multicast(loc_addr))
2214 return 1; 2259 goto out;
2215 2260
2216 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2261 rcu_read_lock();
2262 for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) {
2217 if (pmc->multi.imr_multiaddr.s_addr == loc_addr && 2263 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2218 pmc->multi.imr_ifindex == dif) 2264 pmc->multi.imr_ifindex == dif)
2219 break; 2265 break;
2220 } 2266 }
2267 ret = inet->mc_all;
2221 if (!pmc) 2268 if (!pmc)
2222 return inet->mc_all; 2269 goto unlock;
2223 psl = pmc->sflist; 2270 psl = pmc->sflist;
2271 ret = (pmc->sfmode == MCAST_EXCLUDE);
2224 if (!psl) 2272 if (!psl)
2225 return pmc->sfmode == MCAST_EXCLUDE; 2273 goto unlock;
2226 2274
2227 for (i=0; i<psl->sl_count; i++) { 2275 for (i=0; i<psl->sl_count; i++) {
2228 if (psl->sl_addr[i] == rmt_addr) 2276 if (psl->sl_addr[i] == rmt_addr)
2229 break; 2277 break;
2230 } 2278 }
2279 ret = 0;
2231 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) 2280 if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
2232 return 0; 2281 goto unlock;
2233 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) 2282 if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
2234 return 0; 2283 goto unlock;
2235 return 1; 2284 ret = 1;
2285unlock:
2286 rcu_read_unlock();
2287out:
2288 return ret;
2236} 2289}
2237 2290
2238/* 2291/*
@@ -2251,7 +2304,7 @@ void ip_mc_drop_socket(struct sock *sk)
2251 rtnl_lock(); 2304 rtnl_lock();
2252 while ((iml = inet->mc_list) != NULL) { 2305 while ((iml = inet->mc_list) != NULL) {
2253 struct in_device *in_dev; 2306 struct in_device *in_dev;
2254 inet->mc_list = iml->next; 2307 rcu_assign_pointer(inet->mc_list, iml->next);
2255 2308
2256 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); 2309 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
2257 (void) ip_mc_leave_src(sk, iml, in_dev); 2310 (void) ip_mc_leave_src(sk, iml, in_dev);
@@ -2259,7 +2312,9 @@ void ip_mc_drop_socket(struct sock *sk)
2259 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); 2312 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2260 in_dev_put(in_dev); 2313 in_dev_put(in_dev);
2261 } 2314 }
2262 sock_kfree_s(sk, iml, sizeof(*iml)); 2315 /* decrease mem now to avoid the memleak warning */
2316 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
2317 call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
2263 } 2318 }
2264 rtnl_unlock(); 2319 rtnl_unlock();
2265} 2320}
@@ -2603,7 +2658,7 @@ static const struct file_operations igmp_mcf_seq_fops = {
2603 .release = seq_release_net, 2658 .release = seq_release_net,
2604}; 2659};
2605 2660
2606static int igmp_net_init(struct net *net) 2661static int __net_init igmp_net_init(struct net *net)
2607{ 2662{
2608 struct proc_dir_entry *pde; 2663 struct proc_dir_entry *pde;
2609 2664
@@ -2621,7 +2676,7 @@ out_igmp:
2621 return -ENOMEM; 2676 return -ENOMEM;
2622} 2677}
2623 2678
2624static void igmp_net_exit(struct net *net) 2679static void __net_exit igmp_net_exit(struct net *net)
2625{ 2680{
2626 proc_net_remove(net, "mcfilter"); 2681 proc_net_remove(net, "mcfilter");
2627 proc_net_remove(net, "igmp"); 2682 proc_net_remove(net, "igmp");
@@ -2637,8 +2692,3 @@ int __init igmp_mc_proc_init(void)
2637 return register_pernet_subsys(&igmp_net_ops); 2692 return register_pernet_subsys(&igmp_net_ops);
2638} 2693}
2639#endif 2694#endif
2640
2641EXPORT_SYMBOL(ip_mc_dec_group);
2642EXPORT_SYMBOL(ip_mc_inc_group);
2643EXPORT_SYMBOL(ip_mc_join_group);
2644EXPORT_SYMBOL(ip_mc_rejoin_group);
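
The igmp.c conversion above stops freeing multicast socklists synchronously with sock_kfree_s(): the socket memory charge is dropped immediately with atomic_sub() (keeping the sk_omem_alloc leak check quiet), while the actual kfree() is deferred past an RCU grace period through call_rcu() and a container_of()-based reclaim callback, so the lockless readers added to ip_mc_sf_allow() never touch freed memory. A self-contained toy of the callback shape, with the caveat that this toy call_rcu() runs the callback immediately where the real one waits out the grace period:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct rcu_head { void (*func)(struct rcu_head *); };

/* Toy: runs the callback at once; the kernel queues it and runs it
 * only after every current RCU reader has finished.
 */
static void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
	head->func = func;
	head->func(head);
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Same shape as the patch's ip_mc_socklist_reclaim(). */
struct socklist_demo {
	int ifindex;
	struct rcu_head rcu;
};

static void socklist_reclaim(struct rcu_head *rp)
{
	struct socklist_demo *iml =
		container_of(rp, struct socklist_demo, rcu);

	printf("reclaiming socklist for ifindex %d\n", iml->ifindex);
	free(iml);	/* memory was already uncharged by the caller */
}

int main(void)
{
	struct socklist_demo *iml = malloc(sizeof(*iml));

	if (!iml)
		return 1;
	iml->ifindex = 2;
	/* The patch's order: unpublish the pointer, uncharge
	 * sk_omem_alloc, then hand the object to RCU for freeing.
	 */
	call_rcu(&iml->rcu, socklist_reclaim);
	return 0;
}
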
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ee16475f8fc3..7174370b1195 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = {
37 .range = { 32768, 61000 }, 37 .range = { 32768, 61000 },
38}; 38};
39 39
40unsigned long *sysctl_local_reserved_ports;
41EXPORT_SYMBOL(sysctl_local_reserved_ports);
42
40void inet_get_local_port_range(int *low, int *high) 43void inet_get_local_port_range(int *low, int *high)
41{ 44{
42 unsigned seq; 45 unsigned seq;
@@ -81,7 +84,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
81 } 84 }
82 return node != NULL; 85 return node != NULL;
83} 86}
84
85EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); 87EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
86 88
87/* Obtain a reference to a local port for the given sock, 89/* Obtain a reference to a local port for the given sock,
@@ -108,6 +110,8 @@ again:
108 110
109 smallest_size = -1; 111 smallest_size = -1;
110 do { 112 do {
113 if (inet_is_reserved_local_port(rover))
114 goto next_nolock;
111 head = &hashinfo->bhash[inet_bhashfn(net, rover, 115 head = &hashinfo->bhash[inet_bhashfn(net, rover,
112 hashinfo->bhash_size)]; 116 hashinfo->bhash_size)];
113 spin_lock(&head->lock); 117 spin_lock(&head->lock);
@@ -130,6 +134,7 @@ again:
130 break; 134 break;
131 next: 135 next:
132 spin_unlock(&head->lock); 136 spin_unlock(&head->lock);
137 next_nolock:
133 if (++rover > high) 138 if (++rover > high)
134 rover = low; 139 rover = low;
135 } while (--remaining > 0); 140 } while (--remaining > 0);
@@ -206,7 +211,6 @@ fail:
206 local_bh_enable(); 211 local_bh_enable();
207 return ret; 212 return ret;
208} 213}
209
210EXPORT_SYMBOL_GPL(inet_csk_get_port); 214EXPORT_SYMBOL_GPL(inet_csk_get_port);
211 215
212/* 216/*
@@ -234,7 +238,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
234 * having to remove and re-insert us on the wait queue. 238 * having to remove and re-insert us on the wait queue.
235 */ 239 */
236 for (;;) { 240 for (;;) {
237 prepare_to_wait_exclusive(sk->sk_sleep, &wait, 241 prepare_to_wait_exclusive(sk_sleep(sk), &wait,
238 TASK_INTERRUPTIBLE); 242 TASK_INTERRUPTIBLE);
239 release_sock(sk); 243 release_sock(sk);
240 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) 244 if (reqsk_queue_empty(&icsk->icsk_accept_queue))
@@ -253,7 +257,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
253 if (!timeo) 257 if (!timeo)
254 break; 258 break;
255 } 259 }
256 finish_wait(sk->sk_sleep, &wait); 260 finish_wait(sk_sleep(sk), &wait);
257 return err; 261 return err;
258} 262}
259 263
@@ -299,7 +303,6 @@ out_err:
299 *err = error; 303 *err = error;
300 goto out; 304 goto out;
301} 305}
302
303EXPORT_SYMBOL(inet_csk_accept); 306EXPORT_SYMBOL(inet_csk_accept);
304 307
305/* 308/*
@@ -321,7 +324,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
321 setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); 324 setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
322 icsk->icsk_pending = icsk->icsk_ack.pending = 0; 325 icsk->icsk_pending = icsk->icsk_ack.pending = 0;
323} 326}
324
325EXPORT_SYMBOL(inet_csk_init_xmit_timers); 327EXPORT_SYMBOL(inet_csk_init_xmit_timers);
326 328
327void inet_csk_clear_xmit_timers(struct sock *sk) 329void inet_csk_clear_xmit_timers(struct sock *sk)
@@ -334,21 +336,18 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
334 sk_stop_timer(sk, &icsk->icsk_delack_timer); 336 sk_stop_timer(sk, &icsk->icsk_delack_timer);
335 sk_stop_timer(sk, &sk->sk_timer); 337 sk_stop_timer(sk, &sk->sk_timer);
336} 338}
337
338EXPORT_SYMBOL(inet_csk_clear_xmit_timers); 339EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
339 340
340void inet_csk_delete_keepalive_timer(struct sock *sk) 341void inet_csk_delete_keepalive_timer(struct sock *sk)
341{ 342{
342 sk_stop_timer(sk, &sk->sk_timer); 343 sk_stop_timer(sk, &sk->sk_timer);
343} 344}
344
345EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); 345EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
346 346
347void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) 347void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
348{ 348{
349 sk_reset_timer(sk, &sk->sk_timer, jiffies + len); 349 sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
350} 350}
351
352EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); 351EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
353 352
354struct dst_entry *inet_csk_route_req(struct sock *sk, 353struct dst_entry *inet_csk_route_req(struct sock *sk,
@@ -377,7 +376,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
377 goto no_route; 376 goto no_route;
378 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 377 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
379 goto route_err; 378 goto route_err;
380 return &rt->u.dst; 379 return &rt->dst;
381 380
382route_err: 381route_err:
383 ip_rt_put(rt); 382 ip_rt_put(rt);
@@ -385,7 +384,6 @@ no_route:
385 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 384 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
386 return NULL; 385 return NULL;
387} 386}
388
389EXPORT_SYMBOL_GPL(inet_csk_route_req); 387EXPORT_SYMBOL_GPL(inet_csk_route_req);
390 388
391static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, 389static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
@@ -427,7 +425,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
427 425
428 return req; 426 return req;
429} 427}
430
431EXPORT_SYMBOL_GPL(inet_csk_search_req); 428EXPORT_SYMBOL_GPL(inet_csk_search_req);
432 429
433void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, 430void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
@@ -441,11 +438,11 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
441 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); 438 reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
442 inet_csk_reqsk_queue_added(sk, timeout); 439 inet_csk_reqsk_queue_added(sk, timeout);
443} 440}
441EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
444 442
445/* Only thing we need from tcp.h */ 443/* Only thing we need from tcp.h */
446extern int sysctl_tcp_synack_retries; 444extern int sysctl_tcp_synack_retries;
447 445
448EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
449 446
450/* Decide when to expire the request and when to resend SYN-ACK */ 447/* Decide when to expire the request and when to resend SYN-ACK */
451static inline void syn_ack_recalc(struct request_sock *req, const int thresh, 448static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
@@ -529,6 +526,8 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
529 syn_ack_recalc(req, thresh, max_retries, 526 syn_ack_recalc(req, thresh, max_retries,
530 queue->rskq_defer_accept, 527 queue->rskq_defer_accept,
531 &expire, &resend); 528 &expire, &resend);
529 if (req->rsk_ops->syn_ack_timeout)
530 req->rsk_ops->syn_ack_timeout(parent, req);
532 if (!expire && 531 if (!expire &&
533 (!resend || 532 (!resend ||
534 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || 533 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
@@ -561,7 +560,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
561 if (lopt->qlen) 560 if (lopt->qlen)
562 inet_csk_reset_keepalive_timer(parent, interval); 561 inet_csk_reset_keepalive_timer(parent, interval);
563} 562}
564
565EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); 563EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
566 564
567struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, 565struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
@@ -591,7 +589,6 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
591 } 589 }
592 return newsk; 590 return newsk;
593} 591}
594
595EXPORT_SYMBOL_GPL(inet_csk_clone); 592EXPORT_SYMBOL_GPL(inet_csk_clone);
596 593
597/* 594/*
@@ -622,7 +619,6 @@ void inet_csk_destroy_sock(struct sock *sk)
622 percpu_counter_dec(sk->sk_prot->orphan_count); 619 percpu_counter_dec(sk->sk_prot->orphan_count);
623 sock_put(sk); 620 sock_put(sk);
624} 621}
625
626EXPORT_SYMBOL(inet_csk_destroy_sock); 622EXPORT_SYMBOL(inet_csk_destroy_sock);
627 623
628int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) 624int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
@@ -657,7 +653,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
657 __reqsk_queue_destroy(&icsk->icsk_accept_queue); 653 __reqsk_queue_destroy(&icsk->icsk_accept_queue);
658 return -EADDRINUSE; 654 return -EADDRINUSE;
659} 655}
660
661EXPORT_SYMBOL_GPL(inet_csk_listen_start); 656EXPORT_SYMBOL_GPL(inet_csk_listen_start);
662 657
663/* 658/*
@@ -712,7 +707,6 @@ void inet_csk_listen_stop(struct sock *sk)
712 } 707 }
713 WARN_ON(sk->sk_ack_backlog); 708 WARN_ON(sk->sk_ack_backlog);
714} 709}
715
716EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 710EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
717 711
718void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) 712void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
@@ -724,7 +718,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
724 sin->sin_addr.s_addr = inet->inet_daddr; 718 sin->sin_addr.s_addr = inet->inet_daddr;
725 sin->sin_port = inet->inet_dport; 719 sin->sin_port = inet->inet_dport;
726} 720}
727
728EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); 721EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
729 722
730#ifdef CONFIG_COMPAT 723#ifdef CONFIG_COMPAT
@@ -739,7 +732,6 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
739 return icsk->icsk_af_ops->getsockopt(sk, level, optname, 732 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
740 optval, optlen); 733 optval, optlen);
741} 734}
742
743EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); 735EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
744 736
745int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, 737int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
@@ -753,6 +745,5 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
753 return icsk->icsk_af_ops->setsockopt(sk, level, optname, 745 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
754 optval, optlen); 746 optval, optlen);
755} 747}
756
757EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); 748EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
758#endif 749#endif
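
Both the bind loop above and the connect-side search in inet_hashtables.c (next file) now skip ports the administrator reserved through the new sysctl_local_reserved_ports bitmap. The body of inet_is_reserved_local_port() is not part of these hunks; a plain bitmap test is the plausible shape, approximated in userspace below (the array and is_reserved() are stand-ins, not the kernel code):

#include <stdio.h>
#include <limits.h>

#define PORT_MAX 65536
#define BITS (sizeof(unsigned long) * CHAR_BIT)

static unsigned long reserved[PORT_MAX / BITS];	/* the sysctl-backed bitmap */

static int is_reserved(unsigned int port)
{
	return (reserved[port / BITS] >> (port % BITS)) & 1UL;
}

int main(void)
{
	unsigned int port;

	reserved[49100 / BITS] |= 1UL << (49100 % BITS);	/* admin reserves 49100 */

	/* Mirrors the search loops: reserved ports are skipped, not tried. */
	for (port = 49099; port <= 49101; port++)
		printf("port %u: %s\n", port, is_reserved(port) ? "skip" : "try");
	return 0;
}
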
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 1aaa8110d84b..e5fa2ddce320 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -14,6 +14,7 @@
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/fcntl.h> 15#include <linux/fcntl.h>
16#include <linux/random.h> 16#include <linux/random.h>
17#include <linux/slab.h>
17#include <linux/cache.h> 18#include <linux/cache.h>
18#include <linux/init.h> 19#include <linux/init.h>
19#include <linux/time.h> 20#include <linux/time.h>
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index eaf3e2c8646a..5ff2a51b6d0c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -19,6 +19,7 @@
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/skbuff.h> 20#include <linux/skbuff.h>
21#include <linux/rtnetlink.h> 21#include <linux/rtnetlink.h>
22#include <linux/slab.h>
22 23
23#include <net/inet_frag.h> 24#include <net/inet_frag.h>
24 25
@@ -113,7 +114,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
113 fq->last_in |= INET_FRAG_COMPLETE; 114 fq->last_in |= INET_FRAG_COMPLETE;
114 } 115 }
115} 116}
116
117EXPORT_SYMBOL(inet_frag_kill); 117EXPORT_SYMBOL(inet_frag_kill);
118 118
119static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f, 119static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2b79377b468d..fb7ad5a21ff3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -99,7 +99,6 @@ void inet_put_port(struct sock *sk)
99 __inet_put_port(sk); 99 __inet_put_port(sk);
100 local_bh_enable(); 100 local_bh_enable();
101} 101}
102
103EXPORT_SYMBOL(inet_put_port); 102EXPORT_SYMBOL(inet_put_port);
104 103
105void __inet_inherit_port(struct sock *sk, struct sock *child) 104void __inet_inherit_port(struct sock *sk, struct sock *child)
@@ -116,7 +115,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
116 inet_csk(child)->icsk_bind_hash = tb; 115 inet_csk(child)->icsk_bind_hash = tb;
117 spin_unlock(&head->lock); 116 spin_unlock(&head->lock);
118} 117}
119
120EXPORT_SYMBOL_GPL(__inet_inherit_port); 118EXPORT_SYMBOL_GPL(__inet_inherit_port);
121 119
122static inline int compute_score(struct sock *sk, struct net *net, 120static inline int compute_score(struct sock *sk, struct net *net,
@@ -456,6 +454,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
456 local_bh_disable(); 454 local_bh_disable();
457 for (i = 1; i <= remaining; i++) { 455 for (i = 1; i <= remaining; i++) {
458 port = low + (i + offset) % remaining; 456 port = low + (i + offset) % remaining;
457 if (inet_is_reserved_local_port(port))
458 continue;
459 head = &hinfo->bhash[inet_bhashfn(net, port, 459 head = &hinfo->bhash[inet_bhashfn(net, port,
460 hinfo->bhash_size)]; 460 hinfo->bhash_size)];
461 spin_lock(&head->lock); 461 spin_lock(&head->lock);
@@ -544,7 +544,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
544 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), 544 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
545 __inet_check_established, __inet_hash_nolisten); 545 __inet_check_established, __inet_hash_nolisten);
546} 546}
547
548EXPORT_SYMBOL_GPL(inet_hash_connect); 547EXPORT_SYMBOL_GPL(inet_hash_connect);
549 548
550void inet_hashinfo_init(struct inet_hashinfo *h) 549void inet_hashinfo_init(struct inet_hashinfo *h)
@@ -558,5 +557,4 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
558 i + LISTENING_NULLS_BASE); 557 i + LISTENING_NULLS_BASE);
559 } 558 }
560} 559}
561
562EXPORT_SYMBOL_GPL(inet_hashinfo_init); 560EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index cc94cc2d8b2d..c5af909cf701 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/kmemcheck.h> 12#include <linux/kmemcheck.h>
13#include <linux/slab.h>
13#include <net/inet_hashtables.h> 14#include <net/inet_hashtables.h>
14#include <net/inet_timewait_sock.h> 15#include <net/inet_timewait_sock.h>
15#include <net/ip.h> 16#include <net/ip.h>
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 6bcfe52a9c87..9ffa24b9a804 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -51,8 +51,8 @@
51 * lookups performed with disabled BHs. 51 * lookups performed with disabled BHs.
52 * 52 *
53 * Serialisation issues. 53 * Serialisation issues.
54 * 1. Nodes may appear in the tree only with the pool write lock held. 54 * 1. Nodes may appear in the tree only with the pool lock held.
55 * 2. Nodes may disappear from the tree only with the pool write lock held 55 * 2. Nodes may disappear from the tree only with the pool lock held
56 * AND reference count being 0. 56 * AND reference count being 0.
57 * 3. Nodes appears and disappears from unused node list only under 57 * 3. Nodes appears and disappears from unused node list only under
58 * "inet_peer_unused_lock". 58 * "inet_peer_unused_lock".
@@ -64,23 +64,31 @@
64 * usually under some other lock to prevent node disappearing 64 * usually under some other lock to prevent node disappearing
65 * dtime: unused node list lock 65 * dtime: unused node list lock
66 * v4daddr: unchangeable 66 * v4daddr: unchangeable
67 * ip_id_count: idlock 67 * ip_id_count: atomic value (no lock needed)
68 */ 68 */
69 69
70static struct kmem_cache *peer_cachep __read_mostly; 70static struct kmem_cache *peer_cachep __read_mostly;
71 71
72#define node_height(x) x->avl_height 72#define node_height(x) x->avl_height
73static struct inet_peer peer_fake_node = { 73
74 .avl_left = &peer_fake_node, 74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
75 .avl_right = &peer_fake_node, 75static const struct inet_peer peer_fake_node = {
76 .avl_left = peer_avl_empty,
77 .avl_right = peer_avl_empty,
76 .avl_height = 0 78 .avl_height = 0
77}; 79};
78#define peer_avl_empty (&peer_fake_node) 80
79static struct inet_peer *peer_root = peer_avl_empty; 81static struct {
80static DEFINE_RWLOCK(peer_pool_lock); 82 struct inet_peer *root;
83 spinlock_t lock;
84 int total;
85} peers = {
86 .root = peer_avl_empty,
87 .lock = __SPIN_LOCK_UNLOCKED(peers.lock),
88 .total = 0,
89};
81#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 90#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
82 91
83static int peer_total;
84/* Exported for sysctl_net_ipv4. */ 92/* Exported for sysctl_net_ipv4. */
85int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more 93int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
86 * aggressively at this stage */ 94 * aggressively at this stage */
@@ -89,8 +97,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
89int inet_peer_gc_mintime __read_mostly = 10 * HZ; 97int inet_peer_gc_mintime __read_mostly = 10 * HZ;
90int inet_peer_gc_maxtime __read_mostly = 120 * HZ; 98int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
91 99
92static LIST_HEAD(unused_peers); 100static struct {
93static DEFINE_SPINLOCK(inet_peer_unused_lock); 101 struct list_head list;
102 spinlock_t lock;
103} unused_peers = {
104 .list = LIST_HEAD_INIT(unused_peers.list),
105 .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock),
106};
94 107
95static void peer_check_expire(unsigned long dummy); 108static void peer_check_expire(unsigned long dummy);
96static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0); 109static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
@@ -116,7 +129,7 @@ void __init inet_initpeers(void)
116 129
117 peer_cachep = kmem_cache_create("inet_peer_cache", 130 peer_cachep = kmem_cache_create("inet_peer_cache",
118 sizeof(struct inet_peer), 131 sizeof(struct inet_peer),
119 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 132 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
120 NULL); 133 NULL);
121 134
122 /* All the timers, started at system startup tend 135 /* All the timers, started at system startup tend
@@ -131,38 +144,69 @@ void __init inet_initpeers(void)
131/* Called with or without local BH being disabled. */ 144/* Called with or without local BH being disabled. */
132static void unlink_from_unused(struct inet_peer *p) 145static void unlink_from_unused(struct inet_peer *p)
133{ 146{
134 spin_lock_bh(&inet_peer_unused_lock); 147 if (!list_empty(&p->unused)) {
135 list_del_init(&p->unused); 148 spin_lock_bh(&unused_peers.lock);
136 spin_unlock_bh(&inet_peer_unused_lock); 149 list_del_init(&p->unused);
150 spin_unlock_bh(&unused_peers.lock);
151 }
137} 152}
138 153
139/* 154/*
140 * Called with local BH disabled and the pool lock held. 155 * Called with local BH disabled and the pool lock held.
141 * _stack is known to be NULL or not at compile time,
142 * so compiler will optimize the if (_stack) tests.
143 */ 156 */
144#define lookup(_daddr, _stack) \ 157#define lookup(_daddr, _stack) \
145({ \ 158({ \
146 struct inet_peer *u, **v; \ 159 struct inet_peer *u, **v; \
147 if (_stack != NULL) { \ 160 \
148 stackptr = _stack; \ 161 stackptr = _stack; \
149 *stackptr++ = &peer_root; \ 162 *stackptr++ = &peers.root; \
150 } \ 163 for (u = peers.root; u != peer_avl_empty; ) { \
151 for (u = peer_root; u != peer_avl_empty; ) { \
152 if (_daddr == u->v4daddr) \ 164 if (_daddr == u->v4daddr) \
153 break; \ 165 break; \
154 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 166 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
155 v = &u->avl_left; \ 167 v = &u->avl_left; \
156 else \ 168 else \
157 v = &u->avl_right; \ 169 v = &u->avl_right; \
158 if (_stack != NULL) \ 170 *stackptr++ = v; \
159 *stackptr++ = v; \
160 u = *v; \ 171 u = *v; \
161 } \ 172 } \
162 u; \ 173 u; \
163}) 174})
164 175
165/* Called with local BH disabled and the pool write lock held. */ 176/*
177 * Called with rcu_read_lock_bh()
178 * Because we hold no lock against a writer, it's quite possible we fall
179 * into an endless loop.
180 * But every pointer we follow is guaranteed to be valid thanks to RCU.
181 * We exit from this function if the number of links exceeds PEER_MAXDEPTH.
182 */
183static struct inet_peer *lookup_rcu_bh(__be32 daddr)
184{
185 struct inet_peer *u = rcu_dereference_bh(peers.root);
186 int count = 0;
187
188 while (u != peer_avl_empty) {
189 if (daddr == u->v4daddr) {
190 /* Before taking a reference, check if this entry was
191 * deleted: unlink_from_pool() sets refcnt=-1 to
192 * distinguish between an unused entry (refcnt=0) and
193 * a freed one.
194 */
195 if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
196 u = NULL;
197 return u;
198 }
199 if ((__force __u32)daddr < (__force __u32)u->v4daddr)
200 u = rcu_dereference_bh(u->avl_left);
201 else
202 u = rcu_dereference_bh(u->avl_right);
203 if (unlikely(++count == PEER_MAXDEPTH))
204 break;
205 }
206 return NULL;
207}
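lookup_rcu_bh() is the heart of the conversion: it walks the AVL tree with no lock at all, tolerating a concurrent rebalance by bounding the walk at PEER_MAXDEPTH and taking a reference only if the node is not already tombstoned. A simplified, compilable model of the same bounded lockless walk, with C11 atomics standing in for rcu_dereference_bh() and atomic_add_unless() (illustrative only, not the kernel code):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stddef.h>

    #define MAXDEPTH 40

    struct peer {
            uint32_t key;
            _Atomic(struct peer *) left, right;
            atomic_int refcnt;                  /* -1 marks a deleted node */
    };

    /* Lockless search: every pointer loaded stays valid (RCU keeps freed
     * nodes around for a grace period), but a concurrent rebalance can
     * misroute the walk, so bound it and let the caller retry under the
     * lock instead of looping forever. */
    static struct peer *lookup_lockless(_Atomic(struct peer *) *root,
                                        uint32_t key)
    {
            struct peer *u = atomic_load(root);
            int depth = 0;

            while (u != NULL) {
                    if (u->key == key) {
                            /* Take a reference unless the node is tombstoned:
                             * the equivalent of atomic_add_unless(.., 1, -1). */
                            int old = atomic_load(&u->refcnt);
                            while (old != -1 &&
                                   !atomic_compare_exchange_weak(&u->refcnt,
                                                                 &old, old + 1))
                                    ;
                            return old == -1 ? NULL : u;
                    }
                    u = key < u->key ? atomic_load(&u->left)
                                     : atomic_load(&u->right);
                    if (++depth == MAXDEPTH)
                            break;          /* possible livelock: bail out */
            }
            return NULL;
    }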
208
209/* Called with local BH disabled and the pool lock held. */
166#define lookup_rightempty(start) \ 210#define lookup_rightempty(start) \
167({ \ 211({ \
168 struct inet_peer *u, **v; \ 212 struct inet_peer *u, **v; \
@@ -176,9 +220,10 @@ static void unlink_from_unused(struct inet_peer *p)
176 u; \ 220 u; \
177}) 221})
178 222
179/* Called with local BH disabled and the pool write lock held. 223/* Called with local BH disabled and the pool lock held.
180 * Variable names are the proof of operation correctness. 224 * Variable names are the proof of operation correctness.
181 * Look into mm/map_avl.c for a more detailed description of the ideas. */ 225 * Look into mm/map_avl.c for a more detailed description of the ideas.
226 */
182static void peer_avl_rebalance(struct inet_peer **stack[], 227static void peer_avl_rebalance(struct inet_peer **stack[],
183 struct inet_peer ***stackend) 228 struct inet_peer ***stackend)
184{ 229{
@@ -254,15 +299,21 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
254 } 299 }
255} 300}
256 301
257/* Called with local BH disabled and the pool write lock held. */ 302/* Called with local BH disabled and the pool lock held. */
258#define link_to_pool(n) \ 303#define link_to_pool(n) \
259do { \ 304do { \
260 n->avl_height = 1; \ 305 n->avl_height = 1; \
261 n->avl_left = peer_avl_empty; \ 306 n->avl_left = peer_avl_empty; \
262 n->avl_right = peer_avl_empty; \ 307 n->avl_right = peer_avl_empty; \
308 smp_wmb(); /* lockless readers can catch us now */ \
263 **--stackptr = n; \ 309 **--stackptr = n; \
264 peer_avl_rebalance(stack, stackptr); \ 310 peer_avl_rebalance(stack, stackptr); \
265} while(0) 311} while (0)
312
313static void inetpeer_free_rcu(struct rcu_head *head)
314{
315 kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
316}
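The smp_wmb() in link_to_pool() is the classic publish barrier: a node must be fully initialised before lockless readers can reach it. A compilable sketch of the publish pattern, with a C11 release store in place of smp_wmb(), modelled on a simple linked list rather than the AVL tree:

    #include <stdatomic.h>

    struct lnode {
            int key;
            int payload;
            _Atomic(struct lnode *) next;
    };

    /* Initialise every field of the node first, then make it reachable
     * with a release store, so a lockless reader can never observe
     * uninitialised contents. */
    static void publish(_Atomic(struct lnode *) *head, struct lnode *n,
                        int key, int payload)
    {
            n->key = key;
            n->payload = payload;
            atomic_store_explicit(&n->next,
                                  atomic_load_explicit(head, memory_order_relaxed),
                                  memory_order_relaxed);
            /* release store = "lockless readers can catch us now" */
            atomic_store_explicit(head, n, memory_order_release);
    }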
266 317
267/* May be called with local BH enabled. */ 318/* May be called with local BH enabled. */
268static void unlink_from_pool(struct inet_peer *p) 319static void unlink_from_pool(struct inet_peer *p)
@@ -271,13 +322,14 @@ static void unlink_from_pool(struct inet_peer *p)
271 322
272 do_free = 0; 323 do_free = 0;
273 324
274 write_lock_bh(&peer_pool_lock); 325 spin_lock_bh(&peers.lock);
275 /* Check the reference counter. It was artificially incremented by 1 326 /* Check the reference counter. It was artificially incremented by 1
276 * in the cleanup() function to prevent it from suddenly disappearing. If the 327 * in the cleanup() function to prevent it from suddenly disappearing. If we can
277 * reference count is still 1 then the node is referenced only as `p' 328 * atomically (because of lockless readers) take this last reference,
278 * here and from the pool. So under the exclusive pool lock it's safe 329 * it's safe to remove the node and free it later.
279 * to remove the node and free it later. */ 330 * We use refcnt=-1 to alert lockless readers that this entry is deleted.
280 if (atomic_read(&p->refcnt) == 1) { 331 */
332 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
281 struct inet_peer **stack[PEER_MAXDEPTH]; 333 struct inet_peer **stack[PEER_MAXDEPTH];
282 struct inet_peer ***stackptr, ***delp; 334 struct inet_peer ***stackptr, ***delp;
283 if (lookup(p->v4daddr, stack) != p) 335 if (lookup(p->v4daddr, stack) != p)
@@ -303,20 +355,21 @@ static void unlink_from_pool(struct inet_peer *p)
303 delp[1] = &t->avl_left; /* was &p->avl_left */ 355 delp[1] = &t->avl_left; /* was &p->avl_left */
304 } 356 }
305 peer_avl_rebalance(stack, stackptr); 357 peer_avl_rebalance(stack, stackptr);
306 peer_total--; 358 peers.total--;
307 do_free = 1; 359 do_free = 1;
308 } 360 }
309 write_unlock_bh(&peer_pool_lock); 361 spin_unlock_bh(&peers.lock);
310 362
311 if (do_free) 363 if (do_free)
312 kmem_cache_free(peer_cachep, p); 364 call_rcu_bh(&p->rcu, inetpeer_free_rcu);
313 else 365 else
314 /* The node is used again. Decrease the reference counter 366 /* The node is used again. Decrease the reference counter
315 * back. The loop "cleanup -> unlink_from_unused 367 * back. The loop "cleanup -> unlink_from_unused
316 * -> unlink_from_pool -> putpeer -> link_to_unused 368 * -> unlink_from_pool -> putpeer -> link_to_unused
317 * -> cleanup (for the same node)" 369 * -> cleanup (for the same node)"
318 * doesn't really exist because the entry will have a 370 * doesn't really exist because the entry will have a
319 * recent deletion time and will not be cleaned again soon. */ 371 * recent deletion time and will not be cleaned again soon.
372 */
320 inet_putpeer(p); 373 inet_putpeer(p);
321} 374}
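The writer side mirrors the reader: atomic_cmpxchg(&p->refcnt, 1, -1) succeeds only when the pool's artificial reference is the last one, and the -1 it plants is exactly what makes the reader's atomic_add_unless(..., 1, -1) fail. A one-function userspace model of the claim-and-tombstone step (an assumed simplification, not the kernel helper):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Claim the last (artificial) reference and tombstone the node in a
     * single atomic step, the moral equivalent of
     * atomic_cmpxchg(&p->refcnt, 1, -1).  Returns true if the caller now
     * owns the node and may unlink it and free it after a grace period. */
    static bool try_tombstone(atomic_int *refcnt)
    {
            int expected = 1;

            return atomic_compare_exchange_strong(refcnt, &expected, -1);
    }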
322 375
@@ -326,16 +379,16 @@ static int cleanup_once(unsigned long ttl)
326 struct inet_peer *p = NULL; 379 struct inet_peer *p = NULL;
327 380
328 /* Remove the first entry from the list of unused nodes. */ 381 /* Remove the first entry from the list of unused nodes. */
329 spin_lock_bh(&inet_peer_unused_lock); 382 spin_lock_bh(&unused_peers.lock);
330 if (!list_empty(&unused_peers)) { 383 if (!list_empty(&unused_peers.list)) {
331 __u32 delta; 384 __u32 delta;
332 385
333 p = list_first_entry(&unused_peers, struct inet_peer, unused); 386 p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
334 delta = (__u32)jiffies - p->dtime; 387 delta = (__u32)jiffies - p->dtime;
335 388
336 if (delta < ttl) { 389 if (delta < ttl) {
337 /* Do not prune fresh entries. */ 390 /* Do not prune fresh entries. */
338 spin_unlock_bh(&inet_peer_unused_lock); 391 spin_unlock_bh(&unused_peers.lock);
339 return -1; 392 return -1;
340 } 393 }
341 394
@@ -345,7 +398,7 @@ static int cleanup_once(unsigned long ttl)
345 * before unlink_from_pool() call. */ 398 * before unlink_from_pool() call. */
346 atomic_inc(&p->refcnt); 399 atomic_inc(&p->refcnt);
347 } 400 }
348 spin_unlock_bh(&inet_peer_unused_lock); 401 spin_unlock_bh(&unused_peers.lock);
349 402
350 if (p == NULL) 403 if (p == NULL)
351 /* It means that the total number of USED entries has 404 /* It means that the total number of USED entries has
@@ -360,62 +413,56 @@ static int cleanup_once(unsigned long ttl)
360/* Called with or without local BH being disabled. */ 413/* Called with or without local BH being disabled. */
361struct inet_peer *inet_getpeer(__be32 daddr, int create) 414struct inet_peer *inet_getpeer(__be32 daddr, int create)
362{ 415{
363 struct inet_peer *p, *n; 416 struct inet_peer *p;
364 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 417 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
365 418
366 /* Look up the address quickly. */ 419 /* Look up the address quickly, locklessly.
367 read_lock_bh(&peer_pool_lock); 420 * Because of a concurrent writer, we might not find an existing entry.
368 p = lookup(daddr, NULL); 421 */
369 if (p != peer_avl_empty) 422 rcu_read_lock_bh();
370 atomic_inc(&p->refcnt); 423 p = lookup_rcu_bh(daddr);
371 read_unlock_bh(&peer_pool_lock); 424 rcu_read_unlock_bh();
425
426 if (p) {
427 /* The existing node has been found.
428 * Remove the entry from unused list if it was there.
429 */
430 unlink_from_unused(p);
431 return p;
432 }
372 433
 434 /* Retry an exact lookup, this time taking the lock first.
 435 * At least the nodes should be hot in our cache.
436 */
437 spin_lock_bh(&peers.lock);
438 p = lookup(daddr, stack);
373 if (p != peer_avl_empty) { 439 if (p != peer_avl_empty) {
374 /* The existing node has been found. */ 440 atomic_inc(&p->refcnt);
441 spin_unlock_bh(&peers.lock);
375 /* Remove the entry from unused list if it was there. */ 442 /* Remove the entry from unused list if it was there. */
376 unlink_from_unused(p); 443 unlink_from_unused(p);
377 return p; 444 return p;
378 } 445 }
446 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
447 if (p) {
448 p->v4daddr = daddr;
449 atomic_set(&p->refcnt, 1);
450 atomic_set(&p->rid, 0);
451 atomic_set(&p->ip_id_count, secure_ip_id(daddr));
452 p->tcp_ts_stamp = 0;
453 INIT_LIST_HEAD(&p->unused);
454
455
456 /* Link the node. */
457 link_to_pool(p);
458 peers.total++;
459 }
460 spin_unlock_bh(&peers.lock);
379 461
380 if (!create) 462 if (peers.total >= inet_peer_threshold)
381 return NULL;
382
383 /* Allocate the space outside the locked region. */
384 n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
385 if (n == NULL)
386 return NULL;
387 n->v4daddr = daddr;
388 atomic_set(&n->refcnt, 1);
389 atomic_set(&n->rid, 0);
390 atomic_set(&n->ip_id_count, secure_ip_id(daddr));
391 n->tcp_ts_stamp = 0;
392
393 write_lock_bh(&peer_pool_lock);
394 /* Check if an entry has suddenly appeared. */
395 p = lookup(daddr, stack);
396 if (p != peer_avl_empty)
397 goto out_free;
398
399 /* Link the node. */
400 link_to_pool(n);
401 INIT_LIST_HEAD(&n->unused);
402 peer_total++;
403 write_unlock_bh(&peer_pool_lock);
404
405 if (peer_total >= inet_peer_threshold)
406 /* Remove one less-recently-used entry. */ 463 /* Remove one less-recently-used entry. */
407 cleanup_once(0); 464 cleanup_once(0);
408 465
409 return n;
410
411out_free:
412 /* The appropriate node is already in the pool. */
413 atomic_inc(&p->refcnt);
414 write_unlock_bh(&peer_pool_lock);
415 /* Remove the entry from unused list if it was there. */
416 unlink_from_unused(p);
417 /* Free the preallocated node. */
418 kmem_cache_free(peer_cachep, n);
419 return p; 466 return p;
420} 467}
421 468
@@ -425,12 +472,12 @@ static void peer_check_expire(unsigned long dummy)
425 unsigned long now = jiffies; 472 unsigned long now = jiffies;
426 int ttl; 473 int ttl;
427 474
428 if (peer_total >= inet_peer_threshold) 475 if (peers.total >= inet_peer_threshold)
429 ttl = inet_peer_minttl; 476 ttl = inet_peer_minttl;
430 else 477 else
431 ttl = inet_peer_maxttl 478 ttl = inet_peer_maxttl
432 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 479 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
433 peer_total / inet_peer_threshold * HZ; 480 peers.total / inet_peer_threshold * HZ;
434 while (!cleanup_once(ttl)) { 481 while (!cleanup_once(ttl)) {
435 if (jiffies != now) 482 if (jiffies != now)
436 break; 483 break;
@@ -439,22 +486,25 @@ static void peer_check_expire(unsigned long dummy)
439 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 486 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
440 * interval depending on the total number of entries (more entries, 487 * interval depending on the total number of entries (more entries,
441 * less interval). */ 488 * less interval). */
442 if (peer_total >= inet_peer_threshold) 489 if (peers.total >= inet_peer_threshold)
443 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; 490 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
444 else 491 else
445 peer_periodic_timer.expires = jiffies 492 peer_periodic_timer.expires = jiffies
446 + inet_peer_gc_maxtime 493 + inet_peer_gc_maxtime
447 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 494 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
448 peer_total / inet_peer_threshold * HZ; 495 peers.total / inet_peer_threshold * HZ;
449 add_timer(&peer_periodic_timer); 496 add_timer(&peer_periodic_timer);
450} 497}
451 498
452void inet_putpeer(struct inet_peer *p) 499void inet_putpeer(struct inet_peer *p)
453{ 500{
454 spin_lock_bh(&inet_peer_unused_lock); 501 local_bh_disable();
455 if (atomic_dec_and_test(&p->refcnt)) { 502
456 list_add_tail(&p->unused, &unused_peers); 503 if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
504 list_add_tail(&p->unused, &unused_peers.list);
457 p->dtime = (__u32)jiffies; 505 p->dtime = (__u32)jiffies;
506 spin_unlock(&unused_peers.lock);
458 } 507 }
459 spin_unlock_bh(&inet_peer_unused_lock); 508
509 local_bh_enable();
460} 510}
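inet_putpeer() switches to atomic_dec_and_lock(), so the unused-list lock is taken only on the final 1 -> 0 transition rather than on every put. A userspace model of that idiom, with a pthread mutex standing in for the BH-safe spinlock (a hypothetical helper, not the kernel's implementation):

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdbool.h>

    /* Decrement; return true with the lock held only on the 1 -> 0
     * transition, so every other caller skips the lock entirely. */
    static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
    {
            int old = atomic_load(cnt);

            /* Fast path: above 1 we can decrement locklessly. */
            while (old > 1) {
                    if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                            return false;
            }
            /* Slow path: take the lock, then recheck under it. */
            pthread_mutex_lock(lock);
            if (atomic_fetch_sub(cnt, 1) == 1)
                    return true;            /* caller must unlock */
            pthread_mutex_unlock(lock);
            return false;
    }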
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index a2991bc8e32e..99461f09320f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -25,6 +25,7 @@
25#include <linux/ip.h> 25#include <linux/ip.h>
26#include <linux/icmp.h> 26#include <linux/icmp.h>
27#include <linux/netdevice.h> 27#include <linux/netdevice.h>
28#include <linux/slab.h>
28#include <net/sock.h> 29#include <net/sock.h>
29#include <net/ip.h> 30#include <net/ip.h>
30#include <net/tcp.h> 31#include <net/tcp.h>
@@ -86,16 +87,16 @@ int ip_forward(struct sk_buff *skb)
86 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
87 goto sr_failed; 88 goto sr_failed;
88 89
89 if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && 90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
90 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { 91 (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
91 IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); 92 IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
92 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 93 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
93 htonl(dst_mtu(&rt->u.dst))); 94 htonl(dst_mtu(&rt->dst)));
94 goto drop; 95 goto drop;
95 } 96 }
96 97
97 /* We are about to mangle packet. Copy it! */ 98 /* We are about to mangle packet. Copy it! */
98 if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) 99 if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
99 goto drop; 100 goto drop;
100 iph = ip_hdr(skb); 101 iph = ip_hdr(skb);
101 102
@@ -111,8 +112,8 @@ int ip_forward(struct sk_buff *skb)
111 112
112 skb->priority = rt_tos2priority(iph->tos); 113 skb->priority = rt_tos2priority(iph->tos);
113 114
114 return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, 115 return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
115 ip_forward_finish); 116 rt->dst.dev, ip_forward_finish);
116 117
117sr_failed: 118sr_failed:
118 /* 119 /*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 86964b353c31..b7c41654dde5 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -32,6 +32,9 @@
32#include <linux/netdevice.h> 32#include <linux/netdevice.h>
33#include <linux/jhash.h> 33#include <linux/jhash.h>
34#include <linux/random.h> 34#include <linux/random.h>
35#include <linux/slab.h>
36#include <net/route.h>
37#include <net/dst.h>
35#include <net/sock.h> 38#include <net/sock.h>
36#include <net/ip.h> 39#include <net/ip.h>
37#include <net/icmp.h> 40#include <net/icmp.h>
@@ -121,11 +124,8 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
121} 124}
122 125
123/* Memory Tracking Functions. */ 126/* Memory Tracking Functions. */
124static __inline__ void frag_kfree_skb(struct netns_frags *nf, 127static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
125 struct sk_buff *skb, int *work)
126{ 128{
127 if (work)
128 *work -= skb->truesize;
129 atomic_sub(skb->truesize, &nf->mem); 129 atomic_sub(skb->truesize, &nf->mem);
130 kfree_skb(skb); 130 kfree_skb(skb);
131} 131}
@@ -205,11 +205,34 @@ static void ip_expire(unsigned long arg)
205 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 205 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
206 struct sk_buff *head = qp->q.fragments; 206 struct sk_buff *head = qp->q.fragments;
207 207
208 /* Send an ICMP "Fragment Reassembly Timeout" message. */
209 rcu_read_lock(); 208 rcu_read_lock();
210 head->dev = dev_get_by_index_rcu(net, qp->iif); 209 head->dev = dev_get_by_index_rcu(net, qp->iif);
211 if (head->dev) 210 if (!head->dev)
212 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 211 goto out_rcu_unlock;
212
213 /*
 214 * Only search the routing table for the head fragment
 215 * when a defrag timeout occurs at the PRE_ROUTING hook.
216 */
217 if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
218 const struct iphdr *iph = ip_hdr(head);
219 int err = ip_route_input(head, iph->daddr, iph->saddr,
220 iph->tos, head->dev);
221 if (unlikely(err))
222 goto out_rcu_unlock;
223
224 /*
225 * Only an end host needs to send an ICMP
226 * "Fragment Reassembly Timeout" message, per RFC792.
227 */
228 if (skb_rtable(head)->rt_type != RTN_LOCAL)
229 goto out_rcu_unlock;
230
231 }
232
233 /* Send an ICMP "Fragment Reassembly Timeout" message. */
234 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
235out_rcu_unlock:
213 rcu_read_unlock(); 236 rcu_read_unlock();
214 } 237 }
215out: 238out:
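Per RFC 792 only the final destination of a datagram may answer a reassembly timeout with ICMP_TIME_EXCEEDED/ICMP_EXC_FRAGTIME; a box that merely defragments at PRE_ROUTING for conntrack must route the head fragment first and stay silent unless the route is local. The decision in isolation (a minimal sketch, enum names hypothetical):

    #include <stdbool.h>

    enum defrag_user { DEFRAG_LOCAL, DEFRAG_CONNTRACK_IN };
    enum route_kind  { ROUTE_LOCAL, ROUTE_FORWARD, ROUTE_OTHER };

    /* Only the end host answers a fragment-reassembly timeout; a
     * middlebox defragmenting at PRE_ROUTING must check where the head
     * fragment would actually be delivered. */
    static bool may_send_frag_timeout(enum defrag_user user,
                                      enum route_kind kind)
    {
            if (user != DEFRAG_CONNTRACK_IN)
                    return true;        /* local delivery: we are the host */
            return kind == ROUTE_LOCAL; /* end host only */
    }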
@@ -283,7 +306,7 @@ static int ip_frag_reinit(struct ipq *qp)
283 fp = qp->q.fragments; 306 fp = qp->q.fragments;
284 do { 307 do {
285 struct sk_buff *xp = fp->next; 308 struct sk_buff *xp = fp->next;
286 frag_kfree_skb(qp->q.net, fp, NULL); 309 frag_kfree_skb(qp->q.net, fp);
287 fp = xp; 310 fp = xp;
288 } while (fp); 311 } while (fp);
289 312
@@ -291,6 +314,7 @@ static int ip_frag_reinit(struct ipq *qp)
291 qp->q.len = 0; 314 qp->q.len = 0;
292 qp->q.meat = 0; 315 qp->q.meat = 0;
293 qp->q.fragments = NULL; 316 qp->q.fragments = NULL;
317 qp->q.fragments_tail = NULL;
294 qp->iif = 0; 318 qp->iif = 0;
295 319
296 return 0; 320 return 0;
@@ -363,6 +387,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
363 * in the chain of fragments so far. We must know where to put 387 * in the chain of fragments so far. We must know where to put
364 * this fragment, right? 388 * this fragment, right?
365 */ 389 */
390 prev = qp->q.fragments_tail;
391 if (!prev || FRAG_CB(prev)->offset < offset) {
392 next = NULL;
393 goto found;
394 }
366 prev = NULL; 395 prev = NULL;
367 for (next = qp->q.fragments; next != NULL; next = next->next) { 396 for (next = qp->q.fragments; next != NULL; next = next->next) {
368 if (FRAG_CB(next)->offset >= offset) 397 if (FRAG_CB(next)->offset >= offset)
@@ -370,6 +399,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
370 prev = next; 399 prev = next;
371 } 400 }
372 401
402found:
373 /* We found where to put this one. Check for overlap with 403 /* We found where to put this one. Check for overlap with
374 * preceding fragment, and, if needed, align things so that 404 * preceding fragment, and, if needed, align things so that
375 * any overlaps are eliminated. 405 * any overlaps are eliminated.
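Fragments overwhelmingly arrive in order, so the new fragments_tail pointer turns the common insertion into an O(1) append and leaves the linear scan for reordered traffic only; the tail must then be kept fresh at every site that rewrites the list (reinit, reasm). A standalone sketch of the fast-path shape, without the overlap trimming the real code performs next (simplified types):

    #include <stddef.h>

    struct frag {
            struct frag *next;
            unsigned int offset;
    };

    struct frag_queue {
            struct frag *head;
            struct frag *tail;      /* new: makes in-order arrival O(1) */
    };

    /* Insert by offset: test the tail first and fall back to the linear
     * scan only when fragments arrive out of order. */
    static void frag_insert(struct frag_queue *q, struct frag *f)
    {
            struct frag *prev = q->tail, *next = NULL;

            if (!prev || prev->offset < f->offset) {
                    next = NULL;            /* fast path: append at tail */
            } else {
                    prev = NULL;            /* slow path: walk from head */
                    for (next = q->head; next; next = next->next) {
                            if (next->offset >= f->offset)
                                    break;
                            prev = next;
                    }
            }

            f->next = next;
            if (!next)
                    q->tail = f;            /* keep the tail pointer fresh */
            if (prev)
                    prev->next = f;
            else
                    q->head = f;
    }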
@@ -420,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
420 qp->q.fragments = next; 450 qp->q.fragments = next;
421 451
422 qp->q.meat -= free_it->len; 452 qp->q.meat -= free_it->len;
423 frag_kfree_skb(qp->q.net, free_it, NULL); 453 frag_kfree_skb(qp->q.net, free_it);
424 } 454 }
425 } 455 }
426 456
@@ -428,6 +458,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
428 458
429 /* Insert this fragment in the chain of fragments. */ 459 /* Insert this fragment in the chain of fragments. */
430 skb->next = next; 460 skb->next = next;
461 if (!next)
462 qp->q.fragments_tail = skb;
431 if (prev) 463 if (prev)
432 prev->next = skb; 464 prev->next = skb;
433 else 465 else
@@ -481,6 +513,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
481 goto out_nomem; 513 goto out_nomem;
482 514
483 fp->next = head->next; 515 fp->next = head->next;
516 if (!fp->next)
517 qp->q.fragments_tail = fp;
484 prev->next = fp; 518 prev->next = fp;
485 519
486 skb_morph(head, qp->q.fragments); 520 skb_morph(head, qp->q.fragments);
@@ -530,7 +564,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
530 564
531 skb_shinfo(head)->frag_list = head->next; 565 skb_shinfo(head)->frag_list = head->next;
532 skb_push(head, head->data - skb_network_header(head)); 566 skb_push(head, head->data - skb_network_header(head));
533 atomic_sub(head->truesize, &qp->q.net->mem);
534 567
535 for (fp=head->next; fp; fp = fp->next) { 568 for (fp=head->next; fp; fp = fp->next) {
536 head->data_len += fp->len; 569 head->data_len += fp->len;
@@ -540,8 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
540 else if (head->ip_summed == CHECKSUM_COMPLETE) 573 else if (head->ip_summed == CHECKSUM_COMPLETE)
541 head->csum = csum_add(head->csum, fp->csum); 574 head->csum = csum_add(head->csum, fp->csum);
542 head->truesize += fp->truesize; 575 head->truesize += fp->truesize;
543 atomic_sub(fp->truesize, &qp->q.net->mem);
544 } 576 }
577 atomic_sub(head->truesize, &qp->q.net->mem);
545 578
546 head->next = NULL; 579 head->next = NULL;
547 head->dev = dev; 580 head->dev = dev;
@@ -552,6 +585,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
552 iph->tot_len = htons(len); 585 iph->tot_len = htons(len);
553 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 586 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
554 qp->q.fragments = NULL; 587 qp->q.fragments = NULL;
588 qp->q.fragments_tail = NULL;
555 return 0; 589 return 0;
556 590
557out_nomem: 591out_nomem:
@@ -598,6 +632,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
598 kfree_skb(skb); 632 kfree_skb(skb);
599 return -ENOMEM; 633 return -ENOMEM;
600} 634}
635EXPORT_SYMBOL(ip_defrag);
601 636
602#ifdef CONFIG_SYSCTL 637#ifdef CONFIG_SYSCTL
603static int zero; 638static int zero;
@@ -646,7 +681,7 @@ static struct ctl_table ip4_frags_ctl_table[] = {
646 { } 681 { }
647}; 682};
648 683
649static int ip4_frags_ns_ctl_register(struct net *net) 684static int __net_init ip4_frags_ns_ctl_register(struct net *net)
650{ 685{
651 struct ctl_table *table; 686 struct ctl_table *table;
652 struct ctl_table_header *hdr; 687 struct ctl_table_header *hdr;
@@ -676,7 +711,7 @@ err_alloc:
676 return -ENOMEM; 711 return -ENOMEM;
677} 712}
678 713
679static void ip4_frags_ns_ctl_unregister(struct net *net) 714static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
680{ 715{
681 struct ctl_table *table; 716 struct ctl_table *table;
682 717
@@ -704,7 +739,7 @@ static inline void ip4_frags_ctl_register(void)
704} 739}
705#endif 740#endif
706 741
707static int ipv4_frags_init_net(struct net *net) 742static int __net_init ipv4_frags_init_net(struct net *net)
708{ 743{
709 /* 744 /*
710 * Fragment cache limits. We will commit 256K at one time. Should we 745 * Fragment cache limits. We will commit 256K at one time. Should we
@@ -726,7 +761,7 @@ static int ipv4_frags_init_net(struct net *net)
726 return ip4_frags_ns_ctl_register(net); 761 return ip4_frags_ns_ctl_register(net);
727} 762}
728 763
729static void ipv4_frags_exit_net(struct net *net) 764static void __net_exit ipv4_frags_exit_net(struct net *net)
730{ 765{
731 ip4_frags_ns_ctl_unregister(net); 766 ip4_frags_ns_ctl_unregister(net);
732 inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); 767 inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
@@ -751,5 +786,3 @@ void __init ipfrag_init(void)
751 ip4_frags.secret_interval = 10 * 60 * HZ; 786 ip4_frags.secret_interval = 10 * 60 * HZ;
752 inet_frags_init(&ip4_frags); 787 inet_frags_init(&ip4_frags);
753} 788}
754
755EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f36ce156cac6..35c93e8b6a46 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/types.h> 15#include <linux/types.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/slab.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18#include <linux/skbuff.h> 19#include <linux/skbuff.h>
19#include <linux/netdevice.h> 20#include <linux/netdevice.h>
@@ -44,7 +45,7 @@
44#include <net/netns/generic.h> 45#include <net/netns/generic.h>
45#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
46 47
47#ifdef CONFIG_IPV6 48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
48#include <net/ipv6.h> 49#include <net/ipv6.h>
49#include <net/ip6_fib.h> 50#include <net/ip6_fib.h>
50#include <net/ip6_route.h> 51#include <net/ip6_route.h>
@@ -501,7 +502,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
501 t->err_time = jiffies; 502 t->err_time = jiffies;
502out: 503out:
503 rcu_read_unlock(); 504 rcu_read_unlock();
504 return;
505} 505}
506 506
507static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) 507static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
@@ -537,7 +537,6 @@ static int ipgre_rcv(struct sk_buff *skb)
537 struct ip_tunnel *tunnel; 537 struct ip_tunnel *tunnel;
538 int offset = 4; 538 int offset = 4;
539 __be16 gre_proto; 539 __be16 gre_proto;
540 unsigned int len;
541 540
542 if (!pskb_may_pull(skb, 16)) 541 if (!pskb_may_pull(skb, 16))
543 goto drop_nolock; 542 goto drop_nolock;
@@ -628,8 +627,6 @@ static int ipgre_rcv(struct sk_buff *skb)
628 tunnel->i_seqno = seqno + 1; 627 tunnel->i_seqno = seqno + 1;
629 } 628 }
630 629
631 len = skb->len;
632
633 /* Warning: All skb pointers will be invalidated! */ 630 /* Warning: All skb pointers will be invalidated! */
634 if (tunnel->dev->type == ARPHRD_ETHER) { 631 if (tunnel->dev->type == ARPHRD_ETHER) {
635 if (!pskb_may_pull(skb, ETH_HLEN)) { 632 if (!pskb_may_pull(skb, ETH_HLEN)) {
@@ -643,11 +640,7 @@ static int ipgre_rcv(struct sk_buff *skb)
643 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
644 } 641 }
645 642
646 stats->rx_packets++; 643 skb_tunnel_rx(skb, tunnel->dev);
647 stats->rx_bytes += len;
648 skb->dev = tunnel->dev;
649 skb_dst_drop(skb);
650 nf_reset(skb);
651 644
652 skb_reset_network_header(skb); 645 skb_reset_network_header(skb);
653 ipgre_ecn_decapsulate(iph, skb); 646 ipgre_ecn_decapsulate(iph, skb);
@@ -706,7 +699,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
706 if ((dst = rt->rt_gateway) == 0) 699 if ((dst = rt->rt_gateway) == 0)
707 goto tx_error_icmp; 700 goto tx_error_icmp;
708 } 701 }
709#ifdef CONFIG_IPV6 702#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
710 else if (skb->protocol == htons(ETH_P_IPV6)) { 703 else if (skb->protocol == htons(ETH_P_IPV6)) {
711 struct in6_addr *addr6; 704 struct in6_addr *addr6;
712 int addr_type; 705 int addr_type;
@@ -738,6 +731,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
738 tos = 0; 731 tos = 0;
739 if (skb->protocol == htons(ETH_P_IP)) 732 if (skb->protocol == htons(ETH_P_IP))
740 tos = old_iph->tos; 733 tos = old_iph->tos;
734 else if (skb->protocol == htons(ETH_P_IPV6))
735 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
741 } 736 }
742 737
743 { 738 {
@@ -752,7 +747,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
752 goto tx_error; 747 goto tx_error;
753 } 748 }
754 } 749 }
755 tdev = rt->u.dst.dev; 750 tdev = rt->dst.dev;
756 751
757 if (tdev == dev) { 752 if (tdev == dev) {
758 ip_rt_put(rt); 753 ip_rt_put(rt);
@@ -762,7 +757,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
762 757
763 df = tiph->frag_off; 758 df = tiph->frag_off;
764 if (df) 759 if (df)
765 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen; 760 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
766 else 761 else
767 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 762 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
768 763
@@ -779,7 +774,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
779 goto tx_error; 774 goto tx_error;
780 } 775 }
781 } 776 }
782#ifdef CONFIG_IPV6 777#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
783 else if (skb->protocol == htons(ETH_P_IPV6)) { 778 else if (skb->protocol == htons(ETH_P_IPV6)) {
784 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 779 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
785 780
@@ -793,7 +788,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
793 } 788 }
794 789
795 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { 790 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
796 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); 791 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
797 ip_rt_put(rt); 792 ip_rt_put(rt);
798 goto tx_error; 793 goto tx_error;
799 } 794 }
@@ -810,11 +805,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
810 tunnel->err_count = 0; 805 tunnel->err_count = 0;
811 } 806 }
812 807
813 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; 808 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
814 809
815 if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| 810 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
816 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { 811 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
817 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 812 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
813 if (max_headroom > dev->needed_headroom)
814 dev->needed_headroom = max_headroom;
818 if (!new_skb) { 815 if (!new_skb) {
819 ip_rt_put(rt); 816 ip_rt_put(rt);
820 txq->tx_dropped++; 817 txq->tx_dropped++;
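Bumping dev->needed_headroom when the tunnel falls short means later skbs for this device are allocated with enough headroom up front, so the skb_realloc_headroom() copy is paid at most once per requirement increase. In miniature (hypothetical type, not the kernel struct):

    struct netdev_model { unsigned int needed_headroom; };

    /* Remember the largest headroom ever required so future buffers are
     * allocated big enough and the reallocate-and-copy path is taken at
     * most once per increase (simplified model). */
    static void note_headroom(struct netdev_model *dev, unsigned int need)
    {
            if (need > dev->needed_headroom)
                    dev->needed_headroom = need;
    }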
@@ -835,7 +832,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
835 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 832 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
836 IPSKB_REROUTED); 833 IPSKB_REROUTED);
837 skb_dst_drop(skb); 834 skb_dst_drop(skb);
838 skb_dst_set(skb, &rt->u.dst); 835 skb_dst_set(skb, &rt->dst);
839 836
840 /* 837 /*
841 * Push down and install the IPIP header. 838 * Push down and install the IPIP header.
@@ -853,12 +850,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
853 if ((iph->ttl = tiph->ttl) == 0) { 850 if ((iph->ttl = tiph->ttl) == 0) {
854 if (skb->protocol == htons(ETH_P_IP)) 851 if (skb->protocol == htons(ETH_P_IP))
855 iph->ttl = old_iph->ttl; 852 iph->ttl = old_iph->ttl;
856#ifdef CONFIG_IPV6 853#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
857 else if (skb->protocol == htons(ETH_P_IPV6)) 854 else if (skb->protocol == htons(ETH_P_IPV6))
858 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
859#endif 856#endif
860 else 857 else
861 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); 858 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
862 } 859 }
863 860
864 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 861 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -920,7 +917,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
920 .proto = IPPROTO_GRE }; 917 .proto = IPPROTO_GRE };
921 struct rtable *rt; 918 struct rtable *rt;
922 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
923 tdev = rt->u.dst.dev; 920 tdev = rt->dst.dev;
924 ip_rt_put(rt); 921 ip_rt_put(rt);
925 } 922 }
926 923
@@ -1144,12 +1141,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1144 1141
1145 if (saddr) 1142 if (saddr)
1146 memcpy(&iph->saddr, saddr, 4); 1143 memcpy(&iph->saddr, saddr, 4);
1147 1144 if (daddr)
1148 if (daddr) {
1149 memcpy(&iph->daddr, daddr, 4); 1145 memcpy(&iph->daddr, daddr, 4);
1150 return t->hlen; 1146 if (iph->daddr)
1151 }
1152 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1153 return t->hlen; 1147 return t->hlen;
1154 1148
1155 return -t->hlen; 1149 return -t->hlen;
@@ -1182,7 +1176,7 @@ static int ipgre_open(struct net_device *dev)
1182 struct rtable *rt; 1176 struct rtable *rt;
1183 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1177 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1184 return -EADDRNOTAVAIL; 1178 return -EADDRNOTAVAIL;
1185 dev = rt->u.dst.dev; 1179 dev = rt->dst.dev;
1186 ip_rt_put(rt); 1180 ip_rt_put(rt);
1187 if (__in_dev_get_rtnl(dev) == NULL) 1181 if (__in_dev_get_rtnl(dev) == NULL)
1188 return -EADDRNOTAVAIL; 1182 return -EADDRNOTAVAIL;
@@ -1307,7 +1301,7 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1307 } 1301 }
1308} 1302}
1309 1303
1310static int ipgre_init_net(struct net *net) 1304static int __net_init ipgre_init_net(struct net *net)
1311{ 1305{
1312 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 1306 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1313 int err; 1307 int err;
@@ -1334,7 +1328,7 @@ err_alloc_dev:
1334 return err; 1328 return err;
1335} 1329}
1336 1330
1337static void ipgre_exit_net(struct net *net) 1331static void __net_exit ipgre_exit_net(struct net *net)
1338{ 1332{
1339 struct ipgre_net *ign; 1333 struct ipgre_net *ign;
1340 LIST_HEAD(list); 1334 LIST_HEAD(list);
@@ -1665,14 +1659,15 @@ static int __init ipgre_init(void)
1665 1659
1666 printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); 1660 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1667 1661
1668 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1669 printk(KERN_INFO "ipgre init: can't add protocol\n");
1670 return -EAGAIN;
1671 }
1672
1673 err = register_pernet_device(&ipgre_net_ops); 1662 err = register_pernet_device(&ipgre_net_ops);
1674 if (err < 0) 1663 if (err < 0)
1675 goto gen_device_failed; 1664 return err;
1665
1666 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
1667 if (err < 0) {
1668 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669 goto add_proto_failed;
1670 }
1676 1671
1677 err = rtnl_link_register(&ipgre_link_ops); 1672 err = rtnl_link_register(&ipgre_link_ops);
1678 if (err < 0) 1673 if (err < 0)
@@ -1688,9 +1683,9 @@ out:
1688tap_ops_failed: 1683tap_ops_failed:
1689 rtnl_link_unregister(&ipgre_link_ops); 1684 rtnl_link_unregister(&ipgre_link_ops);
1690rtnl_link_failed: 1685rtnl_link_failed:
1691 unregister_pernet_device(&ipgre_net_ops);
1692gen_device_failed:
1693 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1686 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1687add_proto_failed:
1688 unregister_pernet_device(&ipgre_net_ops);
1694 goto out; 1689 goto out;
1695} 1690}
1696 1691
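The init order is the real fix here: the pernet state is registered before the GRE protocol handler, so no packet can enter the handler before its per-namespace data exists, and the error path unwinds in exact reverse order (ipgre_fini() mirrors it). The generic shape of the idiom, with placeholder subsystem names:

    /* Placeholder registration steps standing in for pernet/protocol/
     * rtnl_link registration; each returns 0 on success. */
    static int register_state(void)   { return 0; }  /* pernet-style setup */
    static int register_handler(void) { return 0; }  /* may receive traffic */
    static int register_links(void)   { return 0; }
    static void unregister_state(void)   { }
    static void unregister_handler(void) { }

    /* Bring the pieces up dependency-first (state before anything that
     * can be entered asynchronously) and tear down in reverse on error. */
    static int subsystem_init(void)
    {
            int err;

            err = register_state();
            if (err < 0)
                    return err;

            err = register_handler();       /* traffic possible after this */
            if (err < 0)
                    goto handler_failed;

            err = register_links();
            if (err < 0)
                    goto links_failed;

            return 0;

    links_failed:
            unregister_handler();
    handler_failed:
            unregister_state();
            return err;
    }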
@@ -1698,9 +1693,9 @@ static void __exit ipgre_fini(void)
1698{ 1693{
1699 rtnl_link_unregister(&ipgre_tap_ops); 1694 rtnl_link_unregister(&ipgre_tap_ops);
1700 rtnl_link_unregister(&ipgre_link_ops); 1695 rtnl_link_unregister(&ipgre_link_ops);
1701 unregister_pernet_device(&ipgre_net_ops);
1702 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1696 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1703 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1697 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698 unregister_pernet_device(&ipgre_net_ops);
1704} 1699}
1705 1700
1706module_init(ipgre_init); 1701module_init(ipgre_init);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c29de9879fda..d859bcc26cb7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -119,6 +119,7 @@
119#include <linux/kernel.h> 119#include <linux/kernel.h>
120#include <linux/string.h> 120#include <linux/string.h>
121#include <linux/errno.h> 121#include <linux/errno.h>
122#include <linux/slab.h>
122 123
123#include <linux/net.h> 124#include <linux/net.h>
124#include <linux/socket.h> 125#include <linux/socket.h>
@@ -145,7 +146,7 @@
145#include <linux/netlink.h> 146#include <linux/netlink.h>
146 147
147/* 148/*
148 * Process Router Attention IP option 149 * Process Router Attention IP option (RFC 2113)
149 */ 150 */
150int ip_call_ra_chain(struct sk_buff *skb) 151int ip_call_ra_chain(struct sk_buff *skb)
151{ 152{
@@ -154,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
154 struct sock *last = NULL; 155 struct sock *last = NULL;
155 struct net_device *dev = skb->dev; 156 struct net_device *dev = skb->dev;
156 157
157 read_lock(&ip_ra_lock); 158 for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
158 for (ra = ip_ra_chain; ra; ra = ra->next) {
159 struct sock *sk = ra->sk; 159 struct sock *sk = ra->sk;
160 160
161 /* If socket is bound to an interface, only report 161 /* If socket is bound to an interface, only report
@@ -166,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
166 sk->sk_bound_dev_if == dev->ifindex) && 166 sk->sk_bound_dev_if == dev->ifindex) &&
167 net_eq(sock_net(sk), dev_net(dev))) { 167 net_eq(sock_net(sk), dev_net(dev))) {
168 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 168 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { 169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
170 read_unlock(&ip_ra_lock);
171 return 1; 170 return 1;
172 }
173 } 171 }
174 if (last) { 172 if (last) {
175 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 173 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -182,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
182 180
183 if (last) { 181 if (last) {
184 raw_rcv(last, skb); 182 raw_rcv(last, skb);
185 read_unlock(&ip_ra_lock);
186 return 1; 183 return 1;
187 } 184 }
188 read_unlock(&ip_ra_lock);
189 return 0; 185 return 0;
190} 186}
191 187
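ip_call_ra_chain() now walks the router-alert chain under RCU, so the hot receive path takes no lock at all; readers follow rcu_dereference()'d pointers while writers publish or unlink entries and defer freeing for a grace period. A compilable model of the read side, with C11 acquire loads standing in for rcu_dereference() and the grace-period machinery omitted:

    #include <stdatomic.h>
    #include <stddef.h>

    struct ra_entry {
            int sk_id;                          /* stand-in for the socket */
            _Atomic(struct ra_entry *) next;
    };

    static _Atomic(struct ra_entry *) ra_chain;

    /* Reader: no lock taken; each pointer is loaded with acquire
     * semantics so a concurrently published node is seen fully
     * initialised.  Freeing must wait for a grace period, which this
     * model omits. */
    static int ra_chain_visit(void (*cb)(struct ra_entry *))
    {
            int n = 0;
            struct ra_entry *ra;

            for (ra = atomic_load_explicit(&ra_chain, memory_order_acquire);
                 ra != NULL;
                 ra = atomic_load_explicit(&ra->next, memory_order_acquire)) {
                    cb(ra);
                    n++;
            }
            return n;
    }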
@@ -265,7 +261,7 @@ int ip_local_deliver(struct sk_buff *skb)
265 return 0; 261 return 0;
266 } 262 }
267 263
268 return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL, 264 return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
269 ip_local_deliver_finish); 265 ip_local_deliver_finish);
270} 266}
271 267
@@ -297,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb)
297 } 293 }
298 294
299 if (unlikely(opt->srr)) { 295 if (unlikely(opt->srr)) {
300 struct in_device *in_dev = in_dev_get(dev); 296 struct in_device *in_dev = __in_dev_get_rcu(dev);
297
301 if (in_dev) { 298 if (in_dev) {
302 if (!IN_DEV_SOURCE_ROUTE(in_dev)) { 299 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
303 if (IN_DEV_LOG_MARTIANS(in_dev) && 300 if (IN_DEV_LOG_MARTIANS(in_dev) &&
304 net_ratelimit()) 301 net_ratelimit())
305 printk(KERN_INFO "source route option %pI4 -> %pI4\n", 302 printk(KERN_INFO "source route option %pI4 -> %pI4\n",
306 &iph->saddr, &iph->daddr); 303 &iph->saddr, &iph->daddr);
307 in_dev_put(in_dev);
308 goto drop; 304 goto drop;
309 } 305 }
310
311 in_dev_put(in_dev);
312 } 306 }
313 307
314 if (ip_options_rcv_srr(skb)) 308 if (ip_options_rcv_srr(skb))
@@ -330,8 +324,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
330 * how the packet travels inside Linux networking. 324 * how the packet travels inside Linux networking.
331 */ 325 */
332 if (skb_dst(skb) == NULL) { 326 if (skb_dst(skb) == NULL) {
333 int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, 327 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
334 skb->dev); 328 iph->tos, skb->dev);
335 if (unlikely(err)) { 329 if (unlikely(err)) {
336 if (err == -EHOSTUNREACH) 330 if (err == -EHOSTUNREACH)
337 IP_INC_STATS_BH(dev_net(skb->dev), 331 IP_INC_STATS_BH(dev_net(skb->dev),
@@ -339,13 +333,16 @@ static int ip_rcv_finish(struct sk_buff *skb)
339 else if (err == -ENETUNREACH) 333 else if (err == -ENETUNREACH)
340 IP_INC_STATS_BH(dev_net(skb->dev), 334 IP_INC_STATS_BH(dev_net(skb->dev),
341 IPSTATS_MIB_INNOROUTES); 335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
337 NET_INC_STATS_BH(dev_net(skb->dev),
338 LINUX_MIB_IPRPFILTER);
342 goto drop; 339 goto drop;
343 } 340 }
344 } 341 }
345 342
346#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_NET_CLS_ROUTE
347 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
348 struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
349 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
350 st[idx&0xFF].o_packets++; 347 st[idx&0xFF].o_packets++;
351 st[idx&0xFF].o_bytes += skb->len; 348 st[idx&0xFF].o_bytes += skb->len;
@@ -359,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb)
359 356
360 rt = skb_rtable(skb); 357 rt = skb_rtable(skb);
361 if (rt->rt_type == RTN_MULTICAST) { 358 if (rt->rt_type == RTN_MULTICAST) {
362 IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, 359 IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
363 skb->len); 360 skb->len);
364 } else if (rt->rt_type == RTN_BROADCAST) 361 } else if (rt->rt_type == RTN_BROADCAST)
365 IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, 362 IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
366 skb->len); 363 skb->len);
367 364
368 return dst_input(skb); 365 return dst_input(skb);
@@ -443,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
443 /* Must drop socket now because of tproxy. */ 440 /* Must drop socket now because of tproxy. */
444 skb_orphan(skb); 441 skb_orphan(skb);
445 442
446 return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, 443 return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
447 ip_rcv_finish); 444 ip_rcv_finish);
448 445
449inhdr_error: 446inhdr_error:
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 94bf105ef3c9..ba9836c488ed 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/slab.h>
14#include <linux/types.h> 15#include <linux/types.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
@@ -237,7 +238,6 @@ void ip_options_fragment(struct sk_buff * skb)
237 opt->rr_needaddr = 0; 238 opt->rr_needaddr = 0;
238 opt->ts_needaddr = 0; 239 opt->ts_needaddr = 0;
239 opt->ts_needtime = 0; 240 opt->ts_needtime = 0;
240 return;
241} 241}
242 242
243/* 243/*
@@ -600,6 +600,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
600 unsigned char *optptr = skb_network_header(skb) + opt->srr; 600 unsigned char *optptr = skb_network_header(skb) + opt->srr;
601 struct rtable *rt = skb_rtable(skb); 601 struct rtable *rt = skb_rtable(skb);
602 struct rtable *rt2; 602 struct rtable *rt2;
603 unsigned long orefdst;
603 int err; 604 int err;
604 605
605 if (!opt->srr) 606 if (!opt->srr)
@@ -623,16 +624,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
623 } 624 }
624 memcpy(&nexthop, &optptr[srrptr-1], 4); 625 memcpy(&nexthop, &optptr[srrptr-1], 4);
625 626
626 rt = skb_rtable(skb); 627 orefdst = skb->_skb_refdst;
627 skb_dst_set(skb, NULL); 628 skb_dst_set(skb, NULL);
628 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); 629 err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
629 rt2 = skb_rtable(skb); 630 rt2 = skb_rtable(skb);
630 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { 631 if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
631 ip_rt_put(rt2); 632 skb_dst_drop(skb);
632 skb_dst_set(skb, &rt->u.dst); 633 skb->_skb_refdst = orefdst;
633 return -EINVAL; 634 return -EINVAL;
634 } 635 }
635 ip_rt_put(rt); 636 refdst_drop(orefdst);
636 if (rt2->rt_type != RTN_LOCAL) 637 if (rt2->rt_type != RTN_LOCAL)
637 break; 638 break;
638 /* Superfast 8) loopback forward */ 639 /* Superfast 8) loopback forward */
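With noref dst entries the old ip_rt_put()/skb_dst_set() dance is no longer safe: the route behind skb->_skb_refdst may be held without a reference, so the source-route code saves the raw _skb_refdst word, lets ip_route_input() install a new dst, restores the word verbatim on failure, and drops the old dst only once the new route is accepted. A sketch of that save/restore shape (types and helpers are simplified stand-ins):

    #include <stdbool.h>
    #include <stddef.h>

    struct pkt {
            unsigned long refdst;   /* dst pointer plus "noref" flag bit */
    };

    /* Stubs standing in for refdst_drop() and the re-routing call. */
    static void refdst_release(unsigned long refdst) { (void)refdst; }
    static bool reroute(struct pkt *p) { (void)p; return false; }

    /* Try an alternative route while keeping the original dst word
     * intact, so a failed attempt restores the packet exactly as it
     * was, reference state included. */
    static bool try_reroute(struct pkt *p)
    {
            unsigned long orefdst = p->refdst; /* save pointer + flag */

            p->refdst = 0;
            if (!reroute(p)) {
                    p->refdst = orefdst;       /* failure: restore verbatim */
                    return false;
            }
            refdst_release(orefdst);           /* success: drop the old dst */
            return true;
    }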
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3451799e3dbf..7649d7750075 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -51,6 +51,7 @@
51#include <linux/string.h> 51#include <linux/string.h>
52#include <linux/errno.h> 52#include <linux/errno.h>
53#include <linux/highmem.h> 53#include <linux/highmem.h>
54#include <linux/slab.h>
54 55
55#include <linux/socket.h> 56#include <linux/socket.h>
56#include <linux/sockios.h> 57#include <linux/sockios.h>
@@ -88,6 +89,7 @@ __inline__ void ip_send_check(struct iphdr *iph)
88 iph->check = 0; 89 iph->check = 0;
89 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
90} 91}
92EXPORT_SYMBOL(ip_send_check);
91 93
92int __ip_local_out(struct sk_buff *skb) 94int __ip_local_out(struct sk_buff *skb)
93{ 95{
@@ -95,8 +97,8 @@ int __ip_local_out(struct sk_buff *skb)
95 97
96 iph->tot_len = htons(skb->len); 98 iph->tot_len = htons(skb->len);
97 ip_send_check(iph); 99 ip_send_check(iph);
98 return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev, 100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
99 dst_output); 101 skb_dst(skb)->dev, dst_output);
100} 102}
101 103
102int ip_local_out(struct sk_buff *skb) 104int ip_local_out(struct sk_buff *skb)
@@ -119,7 +121,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 newskb->pkt_type = PACKET_LOOPBACK; 121 newskb->pkt_type = PACKET_LOOPBACK;
120 newskb->ip_summed = CHECKSUM_UNNECESSARY; 122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
121 WARN_ON(!skb_dst(newskb)); 123 WARN_ON(!skb_dst(newskb));
122 netif_rx(newskb); 124 netif_rx_ni(newskb);
123 return 0; 125 return 0;
124} 126}
125 127
@@ -150,15 +152,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
150 iph->version = 4; 152 iph->version = 4;
151 iph->ihl = 5; 153 iph->ihl = 5;
152 iph->tos = inet->tos; 154 iph->tos = inet->tos;
153 if (ip_dont_fragment(sk, &rt->u.dst)) 155 if (ip_dont_fragment(sk, &rt->dst))
154 iph->frag_off = htons(IP_DF); 156 iph->frag_off = htons(IP_DF);
155 else 157 else
156 iph->frag_off = 0; 158 iph->frag_off = 0;
157 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 159 iph->ttl = ip_select_ttl(inet, &rt->dst);
158 iph->daddr = rt->rt_dst; 160 iph->daddr = rt->rt_dst;
159 iph->saddr = rt->rt_src; 161 iph->saddr = rt->rt_src;
160 iph->protocol = sk->sk_protocol; 162 iph->protocol = sk->sk_protocol;
161 ip_select_ident(iph, &rt->u.dst, sk); 163 ip_select_ident(iph, &rt->dst, sk);
162 164
163 if (opt && opt->optlen) { 165 if (opt && opt->optlen) {
164 iph->ihl += opt->optlen>>2; 166 iph->ihl += opt->optlen>>2;
@@ -171,7 +173,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
171 /* Send it out. */ 173 /* Send it out. */
172 return ip_local_out(skb); 174 return ip_local_out(skb);
173} 175}
174
175EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); 176EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
176 177
177static inline int ip_finish_output2(struct sk_buff *skb) 178static inline int ip_finish_output2(struct sk_buff *skb)
@@ -239,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb)
239{ 240{
240 struct sock *sk = skb->sk; 241 struct sock *sk = skb->sk;
241 struct rtable *rt = skb_rtable(skb); 242 struct rtable *rt = skb_rtable(skb);
242 struct net_device *dev = rt->u.dst.dev; 243 struct net_device *dev = rt->dst.dev;
243 244
244 /* 245 /*
245 * If the indicated interface is up and running, send the packet. 246 * If the indicated interface is up and running, send the packet.
@@ -271,8 +272,8 @@ int ip_mc_output(struct sk_buff *skb)
271 ) { 272 ) {
272 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 273 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
273 if (newskb) 274 if (newskb)
274 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, 275 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
275 NULL, newskb->dev, 276 newskb, NULL, newskb->dev,
276 ip_dev_loopback_xmit); 277 ip_dev_loopback_xmit);
277 } 278 }
278 279
@@ -287,12 +288,12 @@ int ip_mc_output(struct sk_buff *skb)
287 if (rt->rt_flags&RTCF_BROADCAST) { 288 if (rt->rt_flags&RTCF_BROADCAST) {
288 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 289 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
289 if (newskb) 290 if (newskb)
290 NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL, 291 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
291 newskb->dev, ip_dev_loopback_xmit); 292 NULL, newskb->dev, ip_dev_loopback_xmit);
292 } 293 }
293 294
294 return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev, 295 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
295 ip_finish_output, 296 skb->dev, ip_finish_output,
296 !(IPCB(skb)->flags & IPSKB_REROUTED)); 297 !(IPCB(skb)->flags & IPSKB_REROUTED));
297} 298}
298 299
@@ -305,22 +306,24 @@ int ip_output(struct sk_buff *skb)
305 skb->dev = dev; 306 skb->dev = dev;
306 skb->protocol = htons(ETH_P_IP); 307 skb->protocol = htons(ETH_P_IP);
307 308
308 return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, 309 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
309 ip_finish_output, 310 ip_finish_output,
310 !(IPCB(skb)->flags & IPSKB_REROUTED)); 311 !(IPCB(skb)->flags & IPSKB_REROUTED));
311} 312}
312 313
313int ip_queue_xmit(struct sk_buff *skb, int ipfragok) 314int ip_queue_xmit(struct sk_buff *skb)
314{ 315{
315 struct sock *sk = skb->sk; 316 struct sock *sk = skb->sk;
316 struct inet_sock *inet = inet_sk(sk); 317 struct inet_sock *inet = inet_sk(sk);
317 struct ip_options *opt = inet->opt; 318 struct ip_options *opt = inet->opt;
318 struct rtable *rt; 319 struct rtable *rt;
319 struct iphdr *iph; 320 struct iphdr *iph;
321 int res;
320 322
321 /* Skip all of this if the packet is already routed, 323 /* Skip all of this if the packet is already routed,
322 * f.e. by something like SCTP. 324 * f.e. by something like SCTP.
323 */ 325 */
326 rcu_read_lock();
324 rt = skb_rtable(skb); 327 rt = skb_rtable(skb);
325 if (rt != NULL) 328 if (rt != NULL)
326 goto packet_routed; 329 goto packet_routed;
@@ -356,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
356 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) 359 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
357 goto no_route; 360 goto no_route;
358 } 361 }
359 sk_setup_caps(sk, &rt->u.dst); 362 sk_setup_caps(sk, &rt->dst);
360 } 363 }
361 skb_dst_set(skb, dst_clone(&rt->u.dst)); 364 skb_dst_set_noref(skb, &rt->dst);
362 365
363packet_routed: 366packet_routed:
364 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 367 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -369,11 +372,11 @@ packet_routed:
369 skb_reset_network_header(skb); 372 skb_reset_network_header(skb);
370 iph = ip_hdr(skb); 373 iph = ip_hdr(skb);
371 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 374 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
372 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) 375 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
373 iph->frag_off = htons(IP_DF); 376 iph->frag_off = htons(IP_DF);
374 else 377 else
375 iph->frag_off = 0; 378 iph->frag_off = 0;
376 iph->ttl = ip_select_ttl(inet, &rt->u.dst); 379 iph->ttl = ip_select_ttl(inet, &rt->dst);
377 iph->protocol = sk->sk_protocol; 380 iph->protocol = sk->sk_protocol;
378 iph->saddr = rt->rt_src; 381 iph->saddr = rt->rt_src;
379 iph->daddr = rt->rt_dst; 382 iph->daddr = rt->rt_dst;
@@ -384,19 +387,23 @@ packet_routed:
384 ip_options_build(skb, opt, inet->inet_daddr, rt, 0); 387 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
385 } 388 }
386 389
387 ip_select_ident_more(iph, &rt->u.dst, sk, 390 ip_select_ident_more(iph, &rt->dst, sk,
388 (skb_shinfo(skb)->gso_segs ?: 1) - 1); 391 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
389 392
390 skb->priority = sk->sk_priority; 393 skb->priority = sk->sk_priority;
391 skb->mark = sk->sk_mark; 394 skb->mark = sk->sk_mark;
392 395
393 return ip_local_out(skb); 396 res = ip_local_out(skb);
397 rcu_read_unlock();
398 return res;
394 399
395no_route: 400no_route:
401 rcu_read_unlock();
396 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 402 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
397 kfree_skb(skb); 403 kfree_skb(skb);
398 return -EHOSTUNREACH; 404 return -EHOSTUNREACH;
399} 405}
406EXPORT_SYMBOL(ip_queue_xmit);
400 407
401 408
402static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) 409static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
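ip_queue_xmit() now runs inside rcu_read_lock() and attaches the route with skb_dst_set_noref(): the skb borrows the socket's cached route without touching its refcount, which is legal only because the packet is transmitted before the read-side section ends, hence the unlock moving below ip_local_out(). A loose userspace model of the borrow window (stubs, not kernel APIs):

    #include <stdio.h>

    struct route  { int id; int refcnt; };
    struct packet { struct route *dst; int dst_is_noref; };

    static void rcu_read_lock_stub(void)   { }  /* models rcu_read_lock()   */
    static void rcu_read_unlock_stub(void) { }  /* models rcu_read_unlock() */

    /* Borrow the cached route without bumping its refcount; valid only
     * while the read-side section is held, so the unlock must come
     * after the transmit completes. */
    static int queue_xmit_model(struct packet *pkt, struct route *cached)
    {
            int res;

            rcu_read_lock_stub();
            pkt->dst = cached;          /* no cached->refcnt++ here */
            pkt->dst_is_noref = 1;
            res = printf("xmit via route %d\n", cached->id); /* "transmit" */
            rcu_read_unlock_stub();     /* borrow window ends after xmit */
            return res < 0 ? -1 : 0;
    }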
@@ -405,7 +412,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
405 to->priority = from->priority; 412 to->priority = from->priority;
406 to->protocol = from->protocol; 413 to->protocol = from->protocol;
407 skb_dst_drop(to); 414 skb_dst_drop(to);
408 skb_dst_set(to, dst_clone(skb_dst(from))); 415 skb_dst_copy(to, from);
409 to->dev = from->dev; 416 to->dev = from->dev;
410 to->mark = from->mark; 417 to->mark = from->mark;
411 418
@@ -436,17 +443,16 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
436int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) 443int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
437{ 444{
438 struct iphdr *iph; 445 struct iphdr *iph;
439 int raw = 0;
440 int ptr; 446 int ptr;
441 struct net_device *dev; 447 struct net_device *dev;
442 struct sk_buff *skb2; 448 struct sk_buff *skb2;
443 unsigned int mtu, hlen, left, len, ll_rs, pad; 449 unsigned int mtu, hlen, left, len, ll_rs;
444 int offset; 450 int offset;
445 __be16 not_last_frag; 451 __be16 not_last_frag;
446 struct rtable *rt = skb_rtable(skb); 452 struct rtable *rt = skb_rtable(skb);
447 int err = 0; 453 int err = 0;
448 454
449 dev = rt->u.dst.dev; 455 dev = rt->dst.dev;
450 456
451 /* 457 /*
452 * Point into the IP datagram header. 458 * Point into the IP datagram header.
@@ -467,7 +473,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
467 */ 473 */
468 474
469 hlen = iph->ihl * 4; 475 hlen = iph->ihl * 4;
470 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ 476 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
477#ifdef CONFIG_BRIDGE_NETFILTER
478 if (skb->nf_bridge)
479 mtu -= nf_bridge_mtu_reduction(skb);
480#endif
471 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; 481 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
472 482
473 /* When frag_list is given, use it. First, check its validity: 483 /* When frag_list is given, use it. First, check its validity:
@@ -478,9 +488,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
478 * we can switch to copy when see the first bad fragment. 488 * we can switch to copy when see the first bad fragment.
479 */ 489 */
480 if (skb_has_frags(skb)) { 490 if (skb_has_frags(skb)) {
481 struct sk_buff *frag; 491 struct sk_buff *frag, *frag2;
482 int first_len = skb_pagelen(skb); 492 int first_len = skb_pagelen(skb);
483 int truesizes = 0;
484 493
485 if (first_len - hlen > mtu || 494 if (first_len - hlen > mtu ||
486 ((first_len - hlen) & 7) || 495 ((first_len - hlen) & 7) ||
@@ -493,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
493 if (frag->len > mtu || 502 if (frag->len > mtu ||
494 ((frag->len & 7) && frag->next) || 503 ((frag->len & 7) && frag->next) ||
495 skb_headroom(frag) < hlen) 504 skb_headroom(frag) < hlen)
496 goto slow_path; 505 goto slow_path_clean;
497 506
498 /* Partially cloned skb? */ 507 /* Partially cloned skb? */
499 if (skb_shared(frag)) 508 if (skb_shared(frag))
500 goto slow_path; 509 goto slow_path_clean;
501 510
502 BUG_ON(frag->sk); 511 BUG_ON(frag->sk);
503 if (skb->sk) { 512 if (skb->sk) {
504 frag->sk = skb->sk; 513 frag->sk = skb->sk;
505 frag->destructor = sock_wfree; 514 frag->destructor = sock_wfree;
506 } 515 }
507 truesizes += frag->truesize; 516 skb->truesize -= frag->truesize;
508 } 517 }
509 518
510 /* Everything is OK. Generate! */ 519 /* Everything is OK. Generate! */
@@ -514,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
514 frag = skb_shinfo(skb)->frag_list; 523 frag = skb_shinfo(skb)->frag_list;
515 skb_frag_list_init(skb); 524 skb_frag_list_init(skb);
516 skb->data_len = first_len - skb_headlen(skb); 525 skb->data_len = first_len - skb_headlen(skb);
517 skb->truesize -= truesizes;
518 skb->len = first_len; 526 skb->len = first_len;
519 iph->tot_len = htons(first_len); 527 iph->tot_len = htons(first_len);
520 iph->frag_off = htons(IP_MF); 528 iph->frag_off = htons(IP_MF);
@@ -566,18 +574,25 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
566 } 574 }
567 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 575 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
568 return err; 576 return err;
577
578slow_path_clean:
579 skb_walk_frags(skb, frag2) {
580 if (frag2 == frag)
581 break;
582 frag2->sk = NULL;
583 frag2->destructor = NULL;
584 skb->truesize += frag2->truesize;
585 }
569 } 586 }
570 587
571slow_path: 588slow_path:
572 left = skb->len - hlen; /* Space per frame */ 589 left = skb->len - hlen; /* Space per frame */
573 ptr = raw + hlen; /* Where to start from */ 590 ptr = hlen; /* Where to start from */
574 591
575 /* for bridged IP traffic encapsulated inside f.e. a vlan header, 592 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
576 * we need to make room for the encapsulating header 593 * we need to make room for the encapsulating header
577 */ 594 */
578 pad = nf_bridge_pad(skb); 595 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
579 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
580 mtu -= pad;
581 596
582 /* 597 /*
583 * Fragment the datagram. 598 * Fragment the datagram.
@@ -687,7 +702,6 @@ fail:
687 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 702 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
688 return err; 703 return err;
689} 704}
690
691EXPORT_SYMBOL(ip_fragment); 705EXPORT_SYMBOL(ip_fragment);
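
Two fixes meet in the frag_list fast path above: the parent skb's truesize is now decremented as each fragment's ownership moves to the socket (instead of accumulating into a local "truesizes" that was applied only on success), and the new slow_path_clean label undoes exactly the fragments already converted when validation fails mid-walk, before falling back to the copying slow path. The unwind idiom, as a self-contained sketch with hypothetical names:

	#include <stdbool.h>
	#include <stddef.h>

	struct frag { bool ok; bool converted; };

	/* Convert items until one fails validation, then undo only the
	 * prefix converted so far, the same shape as slow_path_clean. */
	static bool convert_all(struct frag *v, size_t n)
	{
		size_t i;

		for (i = 0; i < n; i++) {
			if (!v[i].ok)
				goto unwind;
			v[i].converted = true;	/* side effect to undo */
		}
		return true;			/* take the fast path */

	unwind:
		while (i--)
			v[i].converted = false;	/* only items before the failure */
		return false;			/* fall back to the slow path */
	}
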
692 706
693int 707int
@@ -706,6 +720,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
706 } 720 }
707 return 0; 721 return 0;
708} 722}
723EXPORT_SYMBOL(ip_generic_getfrag);
709 724
710static inline __wsum 725static inline __wsum
711csum_page(struct page *page, int offset, int copy) 726csum_page(struct page *page, int offset, int copy)
@@ -823,13 +838,13 @@ int ip_append_data(struct sock *sk,
823 */ 838 */
824 *rtp = NULL; 839 *rtp = NULL;
825 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? 840 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
826 rt->u.dst.dev->mtu : 841 rt->dst.dev->mtu :
827 dst_mtu(rt->u.dst.path); 842 dst_mtu(rt->dst.path);
828 inet->cork.dst = &rt->u.dst; 843 inet->cork.dst = &rt->dst;
829 inet->cork.length = 0; 844 inet->cork.length = 0;
830 sk->sk_sndmsg_page = NULL; 845 sk->sk_sndmsg_page = NULL;
831 sk->sk_sndmsg_off = 0; 846 sk->sk_sndmsg_off = 0;
832 if ((exthdrlen = rt->u.dst.header_len) != 0) { 847 if ((exthdrlen = rt->dst.header_len) != 0) {
833 length += exthdrlen; 848 length += exthdrlen;
834 transhdrlen += exthdrlen; 849 transhdrlen += exthdrlen;
835 } 850 }
@@ -842,7 +857,7 @@ int ip_append_data(struct sock *sk,
842 exthdrlen = 0; 857 exthdrlen = 0;
843 mtu = inet->cork.fragsize; 858 mtu = inet->cork.fragsize;
844 } 859 }
845 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 860 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
846 861
847 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 862 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
848 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 863 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -859,14 +874,16 @@ int ip_append_data(struct sock *sk,
859 */ 874 */
860 if (transhdrlen && 875 if (transhdrlen &&
861 length + fragheaderlen <= mtu && 876 length + fragheaderlen <= mtu &&
862 rt->u.dst.dev->features & NETIF_F_V4_CSUM && 877 rt->dst.dev->features & NETIF_F_V4_CSUM &&
863 !exthdrlen) 878 !exthdrlen)
864 csummode = CHECKSUM_PARTIAL; 879 csummode = CHECKSUM_PARTIAL;
865 880
881 skb = skb_peek_tail(&sk->sk_write_queue);
882
866 inet->cork.length += length; 883 inet->cork.length += length;
867 if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) && 884 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
868 (sk->sk_protocol == IPPROTO_UDP) && 885 (sk->sk_protocol == IPPROTO_UDP) &&
869 (rt->u.dst.dev->features & NETIF_F_UFO)) { 886 (rt->dst.dev->features & NETIF_F_UFO)) {
870 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 887 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
871 fragheaderlen, transhdrlen, mtu, 888 fragheaderlen, transhdrlen, mtu,
872 flags); 889 flags);
@@ -882,7 +899,7 @@ int ip_append_data(struct sock *sk,
882 * adding appropriate IP header. 899 * adding appropriate IP header.
883 */ 900 */
884 901
885 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 902 if (!skb)
886 goto alloc_new_skb; 903 goto alloc_new_skb;
887 904
888 while (length > 0) { 905 while (length > 0) {
@@ -914,7 +931,7 @@ alloc_new_skb:
914 fraglen = datalen + fragheaderlen; 931 fraglen = datalen + fragheaderlen;
915 932
916 if ((flags & MSG_MORE) && 933 if ((flags & MSG_MORE) &&
917 !(rt->u.dst.dev->features&NETIF_F_SG)) 934 !(rt->dst.dev->features&NETIF_F_SG))
918 alloclen = mtu; 935 alloclen = mtu;
919 else 936 else
920 alloclen = datalen + fragheaderlen; 937 alloclen = datalen + fragheaderlen;
@@ -925,7 +942,7 @@ alloc_new_skb:
925 * the last. 942 * the last.
926 */ 943 */
927 if (datalen == length + fraggap) 944 if (datalen == length + fraggap)
928 alloclen += rt->u.dst.trailer_len; 945 alloclen += rt->dst.trailer_len;
929 946
930 if (transhdrlen) { 947 if (transhdrlen) {
931 skb = sock_alloc_send_skb(sk, 948 skb = sock_alloc_send_skb(sk,
@@ -998,7 +1015,7 @@ alloc_new_skb:
998 if (copy > length) 1015 if (copy > length)
999 copy = length; 1016 copy = length;
1000 1017
1001 if (!(rt->u.dst.dev->features&NETIF_F_SG)) { 1018 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1002 unsigned int off; 1019 unsigned int off;
1003 1020
1004 off = skb->len; 1021 off = skb->len;
@@ -1093,10 +1110,10 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1093 if (inet->cork.flags & IPCORK_OPT) 1110 if (inet->cork.flags & IPCORK_OPT)
1094 opt = inet->cork.opt; 1111 opt = inet->cork.opt;
1095 1112
1096 if (!(rt->u.dst.dev->features&NETIF_F_SG)) 1113 if (!(rt->dst.dev->features&NETIF_F_SG))
1097 return -EOPNOTSUPP; 1114 return -EOPNOTSUPP;
1098 1115
1099 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); 1116 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1100 mtu = inet->cork.fragsize; 1117 mtu = inet->cork.fragsize;
1101 1118
1102 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1119 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
@@ -1111,8 +1128,9 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1111 return -EINVAL; 1128 return -EINVAL;
1112 1129
1113 inet->cork.length += size; 1130 inet->cork.length += size;
1114 if ((sk->sk_protocol == IPPROTO_UDP) && 1131 if ((size + skb->len > mtu) &&
1115 (rt->u.dst.dev->features & NETIF_F_UFO)) { 1132 (sk->sk_protocol == IPPROTO_UDP) &&
1133 (rt->dst.dev->features & NETIF_F_UFO)) {
1116 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 1134 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1117 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 1135 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1118 } 1136 }
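
This hunk and the matching ip_append_data() change above stop tagging corked UDP data as UFO merely because the write queue is non-empty; GSO state is set only once the datagram can no longer fit in a single frame (or the tail skb is already GSO). The combined condition, pulled out as a hypothetical helper for clarity (the real checks stay open-coded in the hunks above):

	static bool udp_wants_ufo(unsigned int queued_len, unsigned int add_len,
				  unsigned int mtu, const struct sock *sk,
				  const struct net_device *dev)
	{
		if (queued_len + add_len <= mtu)	/* still fits one frame */
			return false;
		return sk->sk_protocol == IPPROTO_UDP &&
		       (dev->features & NETIF_F_UFO);
	}
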
@@ -1264,8 +1282,8 @@ int ip_push_pending_frames(struct sock *sk)
1264 * If local_df is set too, we still allow to fragment this frame 1282 * If local_df is set too, we still allow to fragment this frame
1265 * locally. */ 1283 * locally. */
1266 if (inet->pmtudisc >= IP_PMTUDISC_DO || 1284 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1267 (skb->len <= dst_mtu(&rt->u.dst) && 1285 (skb->len <= dst_mtu(&rt->dst) &&
1268 ip_dont_fragment(sk, &rt->u.dst))) 1286 ip_dont_fragment(sk, &rt->dst)))
1269 df = htons(IP_DF); 1287 df = htons(IP_DF);
1270 1288
1271 if (inet->cork.flags & IPCORK_OPT) 1289 if (inet->cork.flags & IPCORK_OPT)
@@ -1274,7 +1292,7 @@ int ip_push_pending_frames(struct sock *sk)
1274 if (rt->rt_type == RTN_MULTICAST) 1292 if (rt->rt_type == RTN_MULTICAST)
1275 ttl = inet->mc_ttl; 1293 ttl = inet->mc_ttl;
1276 else 1294 else
1277 ttl = ip_select_ttl(inet, &rt->u.dst); 1295 ttl = ip_select_ttl(inet, &rt->dst);
1278 1296
1279 iph = (struct iphdr *)skb->data; 1297 iph = (struct iphdr *)skb->data;
1280 iph->version = 4; 1298 iph->version = 4;
@@ -1285,7 +1303,7 @@ int ip_push_pending_frames(struct sock *sk)
1285 } 1303 }
1286 iph->tos = inet->tos; 1304 iph->tos = inet->tos;
1287 iph->frag_off = df; 1305 iph->frag_off = df;
1288 ip_select_ident(iph, &rt->u.dst, sk); 1306 ip_select_ident(iph, &rt->dst, sk);
1289 iph->ttl = ttl; 1307 iph->ttl = ttl;
1290 iph->protocol = sk->sk_protocol; 1308 iph->protocol = sk->sk_protocol;
1291 iph->saddr = rt->rt_src; 1309 iph->saddr = rt->rt_src;
@@ -1298,7 +1316,7 @@ int ip_push_pending_frames(struct sock *sk)
1298 * on dst refcount 1316 * on dst refcount
1299 */ 1317 */
1300 inet->cork.dst = NULL; 1318 inet->cork.dst = NULL;
1301 skb_dst_set(skb, &rt->u.dst); 1319 skb_dst_set(skb, &rt->dst);
1302 1320
1303 if (iph->protocol == IPPROTO_ICMP) 1321 if (iph->protocol == IPPROTO_ICMP)
1304 icmp_out_count(net, ((struct icmphdr *) 1322 icmp_out_count(net, ((struct icmphdr *)
@@ -1435,7 +1453,3 @@ void __init ip_init(void)
1435 igmp_mc_proc_init(); 1453 igmp_mc_proc_init();
1436#endif 1454#endif
1437} 1455}
1438
1439EXPORT_SYMBOL(ip_generic_getfrag);
1440EXPORT_SYMBOL(ip_queue_xmit);
1441EXPORT_SYMBOL(ip_send_check);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cafad9baff03..64b70ad162e3 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -23,6 +23,7 @@
23#include <linux/icmp.h> 23#include <linux/icmp.h>
24#include <linux/inetdevice.h> 24#include <linux/inetdevice.h>
25#include <linux/netdevice.h> 25#include <linux/netdevice.h>
26#include <linux/slab.h>
26#include <net/sock.h> 27#include <net/sock.h>
27#include <net/ip.h> 28#include <net/ip.h>
28#include <net/icmp.h> 29#include <net/icmp.h>
@@ -238,7 +239,16 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
238 sent to multicast group to reach destination designated router. 239 sent to multicast group to reach destination designated router.
239 */ 240 */
240struct ip_ra_chain *ip_ra_chain; 241struct ip_ra_chain *ip_ra_chain;
241DEFINE_RWLOCK(ip_ra_lock); 242static DEFINE_SPINLOCK(ip_ra_lock);
243
244
245static void ip_ra_destroy_rcu(struct rcu_head *head)
246{
247 struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
248
249 sock_put(ra->saved_sk);
250 kfree(ra);
251}
242 252
243int ip_ra_control(struct sock *sk, unsigned char on, 253int ip_ra_control(struct sock *sk, unsigned char on,
244 void (*destructor)(struct sock *)) 254 void (*destructor)(struct sock *))
@@ -250,35 +260,42 @@ int ip_ra_control(struct sock *sk, unsigned char on,
250 260
251 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 261 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
252 262
253 write_lock_bh(&ip_ra_lock); 263 spin_lock_bh(&ip_ra_lock);
254 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { 264 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
255 if (ra->sk == sk) { 265 if (ra->sk == sk) {
256 if (on) { 266 if (on) {
257 write_unlock_bh(&ip_ra_lock); 267 spin_unlock_bh(&ip_ra_lock);
258 kfree(new_ra); 268 kfree(new_ra);
259 return -EADDRINUSE; 269 return -EADDRINUSE;
260 } 270 }
261 *rap = ra->next; 271 /* don't let ip_call_ra_chain() use sk again */
262 write_unlock_bh(&ip_ra_lock); 272 ra->sk = NULL;
273 rcu_assign_pointer(*rap, ra->next);
274 spin_unlock_bh(&ip_ra_lock);
263 275
264 if (ra->destructor) 276 if (ra->destructor)
265 ra->destructor(sk); 277 ra->destructor(sk);
266 sock_put(sk); 278 /*
267 kfree(ra); 279 * Delay sock_put(sk) and kfree(ra) until after one rcu grace
280 * period. This guarantees ip_call_ra_chain() doesn't need
281 * to mess with socket refcounts.
282 */
283 ra->saved_sk = sk;
284 call_rcu(&ra->rcu, ip_ra_destroy_rcu);
268 return 0; 285 return 0;
269 } 286 }
270 } 287 }
271 if (new_ra == NULL) { 288 if (new_ra == NULL) {
272 write_unlock_bh(&ip_ra_lock); 289 spin_unlock_bh(&ip_ra_lock);
273 return -ENOBUFS; 290 return -ENOBUFS;
274 } 291 }
275 new_ra->sk = sk; 292 new_ra->sk = sk;
276 new_ra->destructor = destructor; 293 new_ra->destructor = destructor;
277 294
278 new_ra->next = ra; 295 new_ra->next = ra;
279 *rap = new_ra; 296 rcu_assign_pointer(*rap, new_ra);
280 sock_hold(sk); 297 sock_hold(sk);
281 write_unlock_bh(&ip_ra_lock); 298 spin_unlock_bh(&ip_ra_lock);
282 299
283 return 0; 300 return 0;
284} 301}
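
With the conversion above, entries are unlinked with rcu_assign_pointer() under a plain spinlock and freed via call_rcu(), so a reader traversing ip_ra_chain needs only rcu_read_lock() and never has to take or drop socket refcounts. A sketch of what the matching lockless reader can look like; the real one lives in ip_call_ra_chain() in ip_input.c, outside this excerpt, so treat this as schematic:

	static bool ra_deliver_sketch(struct sk_buff *skb)
	{
		struct ip_ra_chain *ra;
		bool delivered = false;

		rcu_read_lock();
		for (ra = rcu_dereference(ip_ra_chain); ra;
		     ra = rcu_dereference(ra->next)) {
			struct sock *sk = ra->sk;

			if (sk)		/* NULL while an unlink is in flight */
				delivered = true; /* deliver a clone to sk */
		}
		rcu_read_unlock();
		return delivered;
	}
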
@@ -286,12 +303,8 @@ int ip_ra_control(struct sock *sk, unsigned char on,
286void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, 303void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
287 __be16 port, u32 info, u8 *payload) 304 __be16 port, u32 info, u8 *payload)
288{ 305{
289 struct inet_sock *inet = inet_sk(sk);
290 struct sock_exterr_skb *serr; 306 struct sock_exterr_skb *serr;
291 307
292 if (!inet->recverr)
293 return;
294
295 skb = skb_clone(skb, GFP_ATOMIC); 308 skb = skb_clone(skb, GFP_ATOMIC);
296 if (!skb) 309 if (!skb)
297 return; 310 return;
@@ -451,7 +464,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
451 (1<<IP_TTL) | (1<<IP_HDRINCL) | 464 (1<<IP_TTL) | (1<<IP_HDRINCL) |
452 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | 465 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
453 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | 466 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
454 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || 467 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
468 (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
455 optname == IP_MULTICAST_TTL || 469 optname == IP_MULTICAST_TTL ||
456 optname == IP_MULTICAST_ALL || 470 optname == IP_MULTICAST_ALL ||
457 optname == IP_MULTICAST_LOOP || 471 optname == IP_MULTICAST_LOOP ||
@@ -574,6 +588,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
574 } 588 }
575 inet->hdrincl = val ? 1 : 0; 589 inet->hdrincl = val ? 1 : 0;
576 break; 590 break;
591 case IP_NODEFRAG:
592 if (sk->sk_type != SOCK_RAW) {
593 err = -ENOPROTOOPT;
594 break;
595 }
596 inet->nodefrag = val ? 1 : 0;
597 break;
577 case IP_MTU_DISCOVER: 598 case IP_MTU_DISCOVER:
578 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) 599 if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
579 goto e_inval; 600 goto e_inval;
@@ -936,6 +957,14 @@ mc_msf_out:
936 inet->transparent = !!val; 957 inet->transparent = !!val;
937 break; 958 break;
938 959
960 case IP_MINTTL:
961 if (optlen < 1)
962 goto e_inval;
963 if (val < 0 || val > 255)
964 goto e_inval;
965 inet->min_ttl = val;
966 break;
967
939 default: 968 default:
940 err = -ENOPROTOOPT; 969 err = -ENOPROTOOPT;
941 break; 970 break;
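
IP_MINTTL implements the Generalized TTL Security Mechanism (GTSM, RFC 5082): once a floor is set, packets arriving with a smaller TTL are discarded before delivery, and a floor of 255 restricts the socket to directly connected peers. (IP_NODEFRAG, added in the earlier hunk, is likewise a per-socket flag but is accepted only on SOCK_RAW sockets.) A minimal userspace sketch; the fallback #define matches the value in linux/in.h and is only needed with older headers:

	#include <netinet/in.h>
	#include <sys/socket.h>

	#ifndef IP_MINTTL
	#define IP_MINTTL 21		/* from linux/in.h */
	#endif

	static int enable_gtsm(int fd)
	{
		int minttl = 255;	/* accept only on-link senders */

		return setsockopt(fd, IPPROTO_IP, IP_MINTTL,
				  &minttl, sizeof(minttl));
	}
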
@@ -948,6 +977,22 @@ e_inval:
948 return -EINVAL; 977 return -EINVAL;
949} 978}
950 979
980/**
981 * ip_queue_rcv_skb - Queue an skb into sock receive queue
982 * @sk: socket
983 * @skb: buffer
984 *
985 * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
986 * is not set, we drop skb dst entry now, while dst cache line is hot.
987 */
988int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
989{
990 if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
991 skb_dst_drop(skb);
992 return sock_queue_rcv_skb(sk, skb);
993}
994EXPORT_SYMBOL(ip_queue_rcv_skb);
995
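
The new helper drops the skb's dst while its cache line is still hot, unless IP_CMSG_PKTINFO means the dst will be needed to build the control message. Receive paths can switch to it as a drop-in for sock_queue_rcv_skb(); a schematic caller, with a hypothetical name (the real conversions of the UDP and raw input paths happen elsewhere in this series):

	static int proto_rcv_sketch(struct sock *sk, struct sk_buff *skb)
	{
		if (ip_queue_rcv_skb(sk, skb) < 0) {
			kfree_skb(skb);		/* queue full or filtered */
			return -1;
		}
		return 0;
	}
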
951int ip_setsockopt(struct sock *sk, int level, 996int ip_setsockopt(struct sock *sk, int level,
952 int optname, char __user *optval, unsigned int optlen) 997 int optname, char __user *optval, unsigned int optlen)
953{ 998{
@@ -1084,6 +1129,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1084 case IP_HDRINCL: 1129 case IP_HDRINCL:
1085 val = inet->hdrincl; 1130 val = inet->hdrincl;
1086 break; 1131 break;
1132 case IP_NODEFRAG:
1133 val = inet->nodefrag;
1134 break;
1087 case IP_MTU_DISCOVER: 1135 case IP_MTU_DISCOVER:
1088 val = inet->pmtudisc; 1136 val = inet->pmtudisc;
1089 break; 1137 break;
@@ -1198,6 +1246,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1198 case IP_TRANSPARENT: 1246 case IP_TRANSPARENT:
1199 val = inet->transparent; 1247 val = inet->transparent;
1200 break; 1248 break;
1249 case IP_MINTTL:
1250 val = inet->min_ttl;
1251 break;
1201 default: 1252 default:
1202 release_sock(sk); 1253 release_sock(sk);
1203 return -ENOPROTOOPT; 1254 return -ENOPROTOOPT;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 38fbf04150ae..629067571f02 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -25,6 +25,7 @@
25 25
26static void ipcomp4_err(struct sk_buff *skb, u32 info) 26static void ipcomp4_err(struct sk_buff *skb, u32 info)
27{ 27{
28 struct net *net = dev_net(skb->dev);
28 __be32 spi; 29 __be32 spi;
29 struct iphdr *iph = (struct iphdr *)skb->data; 30 struct iphdr *iph = (struct iphdr *)skb->data;
30 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
@@ -35,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
35 return; 36 return;
36 37
37 spi = htonl(ntohs(ipch->cpi)); 38 spi = htonl(ntohs(ipch->cpi));
38 x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, 39 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr,
39 spi, IPPROTO_COMP, AF_INET); 40 spi, IPPROTO_COMP, AF_INET);
40 if (!x) 41 if (!x)
41 return; 42 return;
@@ -47,9 +48,10 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
47/* We always hold one tunnel user reference to indicate a tunnel */ 48/* We always hold one tunnel user reference to indicate a tunnel */
48static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) 49static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
49{ 50{
51 struct net *net = xs_net(x);
50 struct xfrm_state *t; 52 struct xfrm_state *t;
51 53
52 t = xfrm_state_alloc(&init_net); 54 t = xfrm_state_alloc(net);
53 if (t == NULL) 55 if (t == NULL)
54 goto out; 56 goto out;
55 57
@@ -61,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
61 t->props.mode = x->props.mode; 63 t->props.mode = x->props.mode;
62 t->props.saddr.a4 = x->props.saddr.a4; 64 t->props.saddr.a4 = x->props.saddr.a4;
63 t->props.flags = x->props.flags; 65 t->props.flags = x->props.flags;
66 memcpy(&t->mark, &x->mark, sizeof(t->mark));
64 67
65 if (xfrm_init_state(t)) 68 if (xfrm_init_state(t))
66 goto error; 69 goto error;
@@ -82,10 +85,12 @@ error:
82 */ 85 */
83static int ipcomp_tunnel_attach(struct xfrm_state *x) 86static int ipcomp_tunnel_attach(struct xfrm_state *x)
84{ 87{
88 struct net *net = xs_net(x);
85 int err = 0; 89 int err = 0;
86 struct xfrm_state *t; 90 struct xfrm_state *t;
91 u32 mark = x->mark.v & x->mark.m;
87 92
88 t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4, 93 t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
89 x->props.saddr.a4, IPPROTO_IPIP, AF_INET); 94 x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
90 if (!t) { 95 if (!t) {
91 t = ipcomp_tunnel_create(x); 96 t = ipcomp_tunnel_create(x);
@@ -124,16 +129,12 @@ static int ipcomp4_init_state(struct xfrm_state *x)
124 if (x->props.mode == XFRM_MODE_TUNNEL) { 129 if (x->props.mode == XFRM_MODE_TUNNEL) {
125 err = ipcomp_tunnel_attach(x); 130 err = ipcomp_tunnel_attach(x);
126 if (err) 131 if (err)
127 goto error_tunnel; 132 goto out;
128 } 133 }
129 134
130 err = 0; 135 err = 0;
131out: 136out:
132 return err; 137 return err;
133
134error_tunnel:
135 ipcomp_destroy(x);
136 goto out;
137} 138}
138 139
139static const struct xfrm_type ipcomp_type = { 140static const struct xfrm_type ipcomp_type = {
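
The ipcomp changes make state lookups netns- and mark-aware: the error handler resolves the state in the skb's own namespace using skb->mark, and the tunnel attach path masks the state's configured mark before searching for an IPPROTO_IPIP companion, copying the mark into any tunnel state it creates. How the lookup key is derived, shown as a hypothetical helper (the real code is open-coded in the hunks above):

	static struct xfrm_state *find_ipip_peer(struct xfrm_state *x)
	{
		struct net *net = xs_net(x);
		u32 mark = x->mark.v & x->mark.m;	/* value AND mask */

		return xfrm_state_lookup(net, mark,
					 (xfrm_address_t *)&x->id.daddr.a4,
					 x->props.saddr.a4,
					 IPPROTO_IPIP, AF_INET);
	}
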
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 10a6a604bf32..3a6e1ec5e9ae 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -53,6 +53,7 @@
53#include <linux/root_dev.h> 53#include <linux/root_dev.h>
54#include <linux/delay.h> 54#include <linux/delay.h>
55#include <linux/nfs_fs.h> 55#include <linux/nfs_fs.h>
56#include <linux/slab.h>
56#include <net/net_namespace.h> 57#include <net/net_namespace.h>
57#include <net/arp.h> 58#include <net/arp.h>
58#include <net/ip.h> 59#include <net/ip.h>
@@ -187,6 +188,16 @@ struct ic_device {
187static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ 188static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
188static struct net_device *ic_dev __initdata = NULL; /* Selected device */ 189static struct net_device *ic_dev __initdata = NULL; /* Selected device */
189 190
191static bool __init ic_device_match(struct net_device *dev)
192{
193 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
194 (!(dev->flags & IFF_LOOPBACK) &&
195 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
196 strncmp(dev->name, "dummy", 5)))
197 return true;
198 return false;
199}
200
190static int __init ic_open_devs(void) 201static int __init ic_open_devs(void)
191{ 202{
192 struct ic_device *d, **last; 203 struct ic_device *d, **last;
@@ -207,10 +218,7 @@ static int __init ic_open_devs(void)
207 for_each_netdev(&init_net, dev) { 218 for_each_netdev(&init_net, dev) {
208 if (dev->flags & IFF_LOOPBACK) 219 if (dev->flags & IFF_LOOPBACK)
209 continue; 220 continue;
210 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : 221 if (ic_device_match(dev)) {
211 (!(dev->flags & IFF_LOOPBACK) &&
212 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
213 strncmp(dev->name, "dummy", 5))) {
214 int able = 0; 222 int able = 0;
215 if (dev->mtu >= 364) 223 if (dev->mtu >= 364)
216 able |= IC_BOOTP; 224 able |= IC_BOOTP;
@@ -228,7 +236,7 @@ static int __init ic_open_devs(void)
228 } 236 }
229 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { 237 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
230 rtnl_unlock(); 238 rtnl_unlock();
231 return -1; 239 return -ENOMEM;
232 } 240 }
233 d->dev = dev; 241 d->dev = dev;
234 *last = d; 242 *last = d;
@@ -253,7 +261,7 @@ static int __init ic_open_devs(void)
253 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); 261 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
254 else 262 else
255 printk(KERN_ERR "IP-Config: No network devices available.\n"); 263 printk(KERN_ERR "IP-Config: No network devices available.\n");
256 return -1; 264 return -ENODEV;
257 } 265 }
258 return 0; 266 return 0;
259} 267}
@@ -657,6 +665,13 @@ ic_dhcp_init_options(u8 *options)
657 memcpy(e, ic_req_params, sizeof(ic_req_params)); 665 memcpy(e, ic_req_params, sizeof(ic_req_params));
658 e += sizeof(ic_req_params); 666 e += sizeof(ic_req_params);
659 667
668 if (ic_host_name_set) {
669 *e++ = 12; /* host-name */
670 len = strlen(utsname()->nodename);
671 *e++ = len;
672 memcpy(e, utsname()->nodename, len);
673 e += len;
674 }
660 if (*vendor_class_identifier) { 675 if (*vendor_class_identifier) {
661 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n", 676 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
662 vendor_class_identifier); 677 vendor_class_identifier);
@@ -968,7 +983,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
968 /* Is it a reply for the device we are configuring? */ 983 /* Is it a reply for the device we are configuring? */
969 if (b->xid != ic_dev_xid) { 984 if (b->xid != ic_dev_xid) {
970 if (net_ratelimit()) 985 if (net_ratelimit())
971 printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n"); 986 printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
972 goto drop_unlock; 987 goto drop_unlock;
973 } 988 }
974 989
@@ -1303,6 +1318,32 @@ __be32 __init root_nfs_parse_addr(char *name)
1303 return addr; 1318 return addr;
1304} 1319}
1305 1320
1321#define DEVICE_WAIT_MAX 12 /* 12 seconds */
1322
1323static int __init wait_for_devices(void)
1324{
1325 int i;
1326
1327 msleep(CONF_PRE_OPEN);
1328 for (i = 0; i < DEVICE_WAIT_MAX; i++) {
1329 struct net_device *dev;
1330 int found = 0;
1331
1332 rtnl_lock();
1333 for_each_netdev(&init_net, dev) {
1334 if (ic_device_match(dev)) {
1335 found = 1;
1336 break;
1337 }
1338 }
1339 rtnl_unlock();
1340 if (found)
1341 return 0;
1342 ssleep(1);
1343 }
1344 return -ENODEV;
1345}
1346
1306/* 1347/*
1307 * IP Autoconfig dispatcher. 1348 * IP Autoconfig dispatcher.
1308 */ 1349 */
@@ -1313,6 +1354,7 @@ static int __init ip_auto_config(void)
1313#ifdef IPCONFIG_DYNAMIC 1354#ifdef IPCONFIG_DYNAMIC
1314 int retries = CONF_OPEN_RETRIES; 1355 int retries = CONF_OPEN_RETRIES;
1315#endif 1356#endif
1357 int err;
1316 1358
1317#ifdef CONFIG_PROC_FS 1359#ifdef CONFIG_PROC_FS
1318 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1360 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1325,12 +1367,15 @@ static int __init ip_auto_config(void)
1325#ifdef IPCONFIG_DYNAMIC 1367#ifdef IPCONFIG_DYNAMIC
1326 try_try_again: 1368 try_try_again:
1327#endif 1369#endif
1328 /* Give hardware a chance to settle */ 1370 /* Wait for devices to appear */
1329 msleep(CONF_PRE_OPEN); 1371 err = wait_for_devices();
1372 if (err)
1373 return err;
1330 1374
1331 /* Setup all network devices */ 1375 /* Setup all network devices */
1332 if (ic_open_devs() < 0) 1376 err = ic_open_devs();
1333 return -1; 1377 if (err)
1378 return err;
1334 1379
1335 /* Give drivers a chance to settle */ 1380 /* Give drivers a chance to settle */
1336 ssleep(CONF_POST_OPEN); 1381 ssleep(CONF_POST_OPEN);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index eda04fed3379..ec036731a70b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -95,6 +95,7 @@
95#include <linux/module.h> 95#include <linux/module.h>
96#include <linux/types.h> 96#include <linux/types.h>
97#include <linux/kernel.h> 97#include <linux/kernel.h>
98#include <linux/slab.h>
98#include <asm/uaccess.h> 99#include <asm/uaccess.h>
99#include <linux/skbuff.h> 100#include <linux/skbuff.h>
100#include <linux/netdevice.h> 101#include <linux/netdevice.h>
@@ -130,7 +131,6 @@ struct ipip_net {
130 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
131}; 132};
132 133
133static void ipip_fb_tunnel_init(struct net_device *dev);
134static void ipip_tunnel_init(struct net_device *dev); 134static void ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136 136
@@ -374,11 +374,8 @@ static int ipip_rcv(struct sk_buff *skb)
374 skb->protocol = htons(ETH_P_IP); 374 skb->protocol = htons(ETH_P_IP);
375 skb->pkt_type = PACKET_HOST; 375 skb->pkt_type = PACKET_HOST;
376 376
377 tunnel->dev->stats.rx_packets++; 377 skb_tunnel_rx(skb, tunnel->dev);
378 tunnel->dev->stats.rx_bytes += skb->len; 378
379 skb->dev = tunnel->dev;
380 skb_dst_drop(skb);
381 nf_reset(skb);
382 ipip_ecn_decapsulate(iph, skb); 379 ipip_ecn_decapsulate(iph, skb);
383 netif_rx(skb); 380 netif_rx(skb);
384 rcu_read_unlock(); 381 rcu_read_unlock();
@@ -438,7 +435,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
438 goto tx_error_icmp; 435 goto tx_error_icmp;
439 } 436 }
440 } 437 }
441 tdev = rt->u.dst.dev; 438 tdev = rt->dst.dev;
442 439
443 if (tdev == dev) { 440 if (tdev == dev) {
444 ip_rt_put(rt); 441 ip_rt_put(rt);
@@ -449,7 +446,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
449 df |= old_iph->frag_off & htons(IP_DF); 446 df |= old_iph->frag_off & htons(IP_DF);
450 447
451 if (df) { 448 if (df) {
452 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); 449 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
453 450
454 if (mtu < 68) { 451 if (mtu < 68) {
455 stats->collisions++; 452 stats->collisions++;
@@ -506,7 +503,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
506 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 503 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
507 IPSKB_REROUTED); 504 IPSKB_REROUTED);
508 skb_dst_drop(skb); 505 skb_dst_drop(skb);
509 skb_dst_set(skb, &rt->u.dst); 506 skb_dst_set(skb, &rt->dst);
510 507
511 /* 508 /*
512 * Push down and install the IPIP header. 509 * Push down and install the IPIP header.
@@ -555,7 +552,7 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
555 .proto = IPPROTO_IPIP }; 552 .proto = IPPROTO_IPIP };
556 struct rtable *rt; 553 struct rtable *rt;
557 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
558 tdev = rt->u.dst.dev; 555 tdev = rt->dst.dev;
559 ip_rt_put(rt); 556 ip_rt_put(rt);
560 } 557 }
561 dev->flags |= IFF_POINTOPOINT; 558 dev->flags |= IFF_POINTOPOINT;
@@ -730,7 +727,7 @@ static void ipip_tunnel_init(struct net_device *dev)
730 ipip_tunnel_bind_dev(dev); 727 ipip_tunnel_bind_dev(dev);
731} 728}
732 729
733static void ipip_fb_tunnel_init(struct net_device *dev) 730static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
734{ 731{
735 struct ip_tunnel *tunnel = netdev_priv(dev); 732 struct ip_tunnel *tunnel = netdev_priv(dev);
736 struct iphdr *iph = &tunnel->parms.iph; 733 struct iphdr *iph = &tunnel->parms.iph;
@@ -773,7 +770,7 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
773 } 770 }
774} 771}
775 772
776static int ipip_init_net(struct net *net) 773static int __net_init ipip_init_net(struct net *net)
777{ 774{
778 struct ipip_net *ipn = net_generic(net, ipip_net_id); 775 struct ipip_net *ipn = net_generic(net, ipip_net_id);
779 int err; 776 int err;
@@ -806,7 +803,7 @@ err_alloc_dev:
806 return err; 803 return err;
807} 804}
808 805
809static void ipip_exit_net(struct net *net) 806static void __net_exit ipip_exit_net(struct net *net)
810{ 807{
811 struct ipip_net *ipn = net_generic(net, ipip_net_id); 808 struct ipip_net *ipn = net_generic(net, ipip_net_id);
812 LIST_HEAD(list); 809 LIST_HEAD(list);
@@ -831,15 +828,14 @@ static int __init ipip_init(void)
831 828
832 printk(banner); 829 printk(banner);
833 830
834 if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { 831 err = register_pernet_device(&ipip_net_ops);
832 if (err < 0)
833 return err;
834 err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
835 if (err < 0) {
836 unregister_pernet_device(&ipip_net_ops);
835 printk(KERN_INFO "ipip init: can't register tunnel\n"); 837 printk(KERN_INFO "ipip init: can't register tunnel\n");
836 return -EAGAIN;
837 } 838 }
838
839 err = register_pernet_device(&ipip_net_ops);
840 if (err)
841 xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
842
843 return err; 839 return err;
844} 840}
845 841
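
The reordered ipip_init() registers per-namespace state before the xfrm tunnel handler, so no packet can be decapsulated into a namespace whose state does not exist yet, and a handler failure unwinds the pernet registration instead of the other way around. The general idiom, as a sketch with hypothetical demo_* names:

	/* Register in dependency order, unwind in reverse on failure. */
	static int __init demo_init(void)
	{
		int err;

		err = register_pernet_device(&demo_net_ops);  /* state first */
		if (err < 0)
			return err;

		err = xfrm4_tunnel_register(&demo_handler, AF_INET); /* traffic last */
		if (err < 0)
			unregister_pernet_device(&demo_net_ops); /* unwind */

		return err;
	}
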
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 54596f73eff5..179fcab866fc 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -22,7 +22,7 @@
22 * overflow. 22 * overflow.
23 * Carlos Picoto : PIMv1 Support 23 * Carlos Picoto : PIMv1 Support
24 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header 24 * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
25 * Relax this requrement to work with older peers. 25 * Relax this requirement to work with older peers.
26 * 26 *
27 */ 27 */
28 28
@@ -47,6 +47,7 @@
47#include <linux/mroute.h> 47#include <linux/mroute.h>
48#include <linux/init.h> 48#include <linux/init.h>
49#include <linux/if_ether.h> 49#include <linux/if_ether.h>
50#include <linux/slab.h>
50#include <net/net_namespace.h> 51#include <net/net_namespace.h>
51#include <net/ip.h> 52#include <net/ip.h>
52#include <net/protocol.h> 53#include <net/protocol.h>
@@ -62,11 +63,40 @@
62#include <net/ipip.h> 63#include <net/ipip.h>
63#include <net/checksum.h> 64#include <net/checksum.h>
64#include <net/netlink.h> 65#include <net/netlink.h>
66#include <net/fib_rules.h>
65 67
66#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) 68#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67#define CONFIG_IP_PIMSM 1 69#define CONFIG_IP_PIMSM 1
68#endif 70#endif
69 71
72struct mr_table {
73 struct list_head list;
74#ifdef CONFIG_NET_NS
75 struct net *net;
76#endif
77 u32 id;
78 struct sock *mroute_sk;
79 struct timer_list ipmr_expire_timer;
80 struct list_head mfc_unres_queue;
81 struct list_head mfc_cache_array[MFC_LINES];
82 struct vif_device vif_table[MAXVIFS];
83 int maxvif;
84 atomic_t cache_resolve_queue_len;
85 int mroute_do_assert;
86 int mroute_do_pim;
87#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
88 int mroute_reg_vif_num;
89#endif
90};
91
92struct ipmr_rule {
93 struct fib_rule common;
94};
95
96struct ipmr_result {
97 struct mr_table *mrt;
98};
99
70/* Big lock, protecting vif table, mrt cache and mroute socket state. 100/* Big lock, protecting vif table, mrt cache and mroute socket state.
71 Note that the changes are semaphored via rtnl_lock. 101 Note that the changes are semaphored via rtnl_lock.
72 */ 102 */
@@ -77,9 +107,7 @@ static DEFINE_RWLOCK(mrt_lock);
77 * Multicast router control variables 107 * Multicast router control variables
78 */ 108 */
79 109
80#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL) 110#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
81
82static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
83 111
84/* Special spinlock for queue of unresolved entries */ 112/* Special spinlock for queue of unresolved entries */
85static DEFINE_SPINLOCK(mfc_unres_lock); 113static DEFINE_SPINLOCK(mfc_unres_lock);
@@ -94,12 +122,217 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
94 122
95static struct kmem_cache *mrt_cachep __read_mostly; 123static struct kmem_cache *mrt_cachep __read_mostly;
96 124
97static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); 125static struct mr_table *ipmr_new_table(struct net *net, u32 id);
98static int ipmr_cache_report(struct net *net, 126static int ip_mr_forward(struct net *net, struct mr_table *mrt,
127 struct sk_buff *skb, struct mfc_cache *cache,
128 int local);
129static int ipmr_cache_report(struct mr_table *mrt,
99 struct sk_buff *pkt, vifi_t vifi, int assert); 130 struct sk_buff *pkt, vifi_t vifi, int assert);
100static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); 131static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
132 struct mfc_cache *c, struct rtmsg *rtm);
133static void ipmr_expire_process(unsigned long arg);
134
135#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
136#define ipmr_for_each_table(mrt, net) \
137 list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
138
139static struct mr_table *ipmr_get_table(struct net *net, u32 id)
140{
141 struct mr_table *mrt;
142
143 ipmr_for_each_table(mrt, net) {
144 if (mrt->id == id)
145 return mrt;
146 }
147 return NULL;
148}
149
150static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
151 struct mr_table **mrt)
152{
153 struct ipmr_result res;
154 struct fib_lookup_arg arg = { .result = &res, };
155 int err;
156
157 err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
158 if (err < 0)
159 return err;
160 *mrt = res.mrt;
161 return 0;
162}
163
164static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
165 int flags, struct fib_lookup_arg *arg)
166{
167 struct ipmr_result *res = arg->result;
168 struct mr_table *mrt;
169
170 switch (rule->action) {
171 case FR_ACT_TO_TBL:
172 break;
173 case FR_ACT_UNREACHABLE:
174 return -ENETUNREACH;
175 case FR_ACT_PROHIBIT:
176 return -EACCES;
177 case FR_ACT_BLACKHOLE:
178 default:
179 return -EINVAL;
180 }
181
182 mrt = ipmr_get_table(rule->fr_net, rule->table);
183 if (mrt == NULL)
184 return -EAGAIN;
185 res->mrt = mrt;
186 return 0;
187}
188
189static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
190{
191 return 1;
192}
193
194static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
195 FRA_GENERIC_POLICY,
196};
197
198static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
199 struct fib_rule_hdr *frh, struct nlattr **tb)
200{
201 return 0;
202}
203
204static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
205 struct nlattr **tb)
206{
207 return 1;
208}
209
210static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
211 struct fib_rule_hdr *frh)
212{
213 frh->dst_len = 0;
214 frh->src_len = 0;
215 frh->tos = 0;
216 return 0;
217}
101 218
102static struct timer_list ipmr_expire_timer; 219static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
220 .family = RTNL_FAMILY_IPMR,
221 .rule_size = sizeof(struct ipmr_rule),
222 .addr_size = sizeof(u32),
223 .action = ipmr_rule_action,
224 .match = ipmr_rule_match,
225 .configure = ipmr_rule_configure,
226 .compare = ipmr_rule_compare,
227 .default_pref = fib_default_rule_pref,
228 .fill = ipmr_rule_fill,
229 .nlgroup = RTNLGRP_IPV4_RULE,
230 .policy = ipmr_rule_policy,
231 .owner = THIS_MODULE,
232};
233
234static int __net_init ipmr_rules_init(struct net *net)
235{
236 struct fib_rules_ops *ops;
237 struct mr_table *mrt;
238 int err;
239
240 ops = fib_rules_register(&ipmr_rules_ops_template, net);
241 if (IS_ERR(ops))
242 return PTR_ERR(ops);
243
244 INIT_LIST_HEAD(&net->ipv4.mr_tables);
245
246 mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
247 if (mrt == NULL) {
248 err = -ENOMEM;
249 goto err1;
250 }
251
252 err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
253 if (err < 0)
254 goto err2;
255
256 net->ipv4.mr_rules_ops = ops;
257 return 0;
258
259err2:
260 kfree(mrt);
261err1:
262 fib_rules_unregister(ops);
263 return err;
264}
265
266static void __net_exit ipmr_rules_exit(struct net *net)
267{
268 struct mr_table *mrt, *next;
269
270 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
271 list_del(&mrt->list);
272 kfree(mrt);
273 }
274 fib_rules_unregister(net->ipv4.mr_rules_ops);
275}
276#else
277#define ipmr_for_each_table(mrt, net) \
278 for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
279
280static struct mr_table *ipmr_get_table(struct net *net, u32 id)
281{
282 return net->ipv4.mrt;
283}
284
285static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
286 struct mr_table **mrt)
287{
288 *mrt = net->ipv4.mrt;
289 return 0;
290}
291
292static int __net_init ipmr_rules_init(struct net *net)
293{
294 net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
295 return net->ipv4.mrt ? 0 : -ENOMEM;
296}
297
298static void __net_exit ipmr_rules_exit(struct net *net)
299{
300 kfree(net->ipv4.mrt);
301}
302#endif
303
304static struct mr_table *ipmr_new_table(struct net *net, u32 id)
305{
306 struct mr_table *mrt;
307 unsigned int i;
308
309 mrt = ipmr_get_table(net, id);
310 if (mrt != NULL)
311 return mrt;
312
313 mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
314 if (mrt == NULL)
315 return NULL;
316 write_pnet(&mrt->net, net);
317 mrt->id = id;
318
319 /* Forwarding cache */
320 for (i = 0; i < MFC_LINES; i++)
321 INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
322
323 INIT_LIST_HEAD(&mrt->mfc_unres_queue);
324
325 setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
326 (unsigned long)mrt);
327
328#ifdef CONFIG_IP_PIMSM
329 mrt->mroute_reg_vif_num = -1;
330#endif
331#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
332 list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
333#endif
334 return mrt;
335}
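
ipmr_new_table() is get-or-create: an existing id is returned as-is, otherwise a table is allocated with its own expire timer, unresolved queue and per-hash-line cache lists, and (with multiple tables enabled) linked into the per-netns RCU list. Callers now resolve a table before touching any multicast state, as reg_vif_xmit() does below; a schematic of that pattern, with a hypothetical function name:

	static void demo_use_table(struct net *net, struct sk_buff *skb,
				   struct net_device *dev)
	{
		struct mr_table *mrt;
		struct flowi fl = {
			.oif  = dev->ifindex,
			.iif  = skb->skb_iif,
			.mark = skb->mark,
		};

		if (ipmr_fib_lookup(net, &fl, &mrt) < 0)
			return;		/* no rule maps this flow to a table */
		/* ... operate on mrt->vif_table / mrt->mfc_cache_array ... */
	}
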
103 336
104/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ 337/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
105 338
@@ -200,12 +433,24 @@ failure:
200static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) 433static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
201{ 434{
202 struct net *net = dev_net(dev); 435 struct net *net = dev_net(dev);
436 struct mr_table *mrt;
437 struct flowi fl = {
438 .oif = dev->ifindex,
439 .iif = skb->skb_iif,
440 .mark = skb->mark,
441 };
442 int err;
443
444 err = ipmr_fib_lookup(net, &fl, &mrt);
445 if (err < 0) {
446 kfree_skb(skb);
447 return err;
448 }
203 449
204 read_lock(&mrt_lock); 450 read_lock(&mrt_lock);
205 dev->stats.tx_bytes += skb->len; 451 dev->stats.tx_bytes += skb->len;
206 dev->stats.tx_packets++; 452 dev->stats.tx_packets++;
207 ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num, 453 ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
208 IGMPMSG_WHOLEPKT);
209 read_unlock(&mrt_lock); 454 read_unlock(&mrt_lock);
210 kfree_skb(skb); 455 kfree_skb(skb);
211 return NETDEV_TX_OK; 456 return NETDEV_TX_OK;
@@ -225,12 +470,18 @@ static void reg_vif_setup(struct net_device *dev)
225 dev->features |= NETIF_F_NETNS_LOCAL; 470 dev->features |= NETIF_F_NETNS_LOCAL;
226} 471}
227 472
228static struct net_device *ipmr_reg_vif(struct net *net) 473static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
229{ 474{
230 struct net_device *dev; 475 struct net_device *dev;
231 struct in_device *in_dev; 476 struct in_device *in_dev;
477 char name[IFNAMSIZ];
478
479 if (mrt->id == RT_TABLE_DEFAULT)
480 sprintf(name, "pimreg");
481 else
482 sprintf(name, "pimreg%u", mrt->id);
232 483
233 dev = alloc_netdev(0, "pimreg", reg_vif_setup); 484 dev = alloc_netdev(0, name, reg_vif_setup);
234 485
235 if (dev == NULL) 486 if (dev == NULL)
236 return NULL; 487 return NULL;
@@ -275,17 +526,17 @@ failure:
275 * @notify: Set to 1, if the caller is a notifier_call 526 * @notify: Set to 1, if the caller is a notifier_call
276 */ 527 */
277 528
278static int vif_delete(struct net *net, int vifi, int notify, 529static int vif_delete(struct mr_table *mrt, int vifi, int notify,
279 struct list_head *head) 530 struct list_head *head)
280{ 531{
281 struct vif_device *v; 532 struct vif_device *v;
282 struct net_device *dev; 533 struct net_device *dev;
283 struct in_device *in_dev; 534 struct in_device *in_dev;
284 535
285 if (vifi < 0 || vifi >= net->ipv4.maxvif) 536 if (vifi < 0 || vifi >= mrt->maxvif)
286 return -EADDRNOTAVAIL; 537 return -EADDRNOTAVAIL;
287 538
288 v = &net->ipv4.vif_table[vifi]; 539 v = &mrt->vif_table[vifi];
289 540
290 write_lock_bh(&mrt_lock); 541 write_lock_bh(&mrt_lock);
291 dev = v->dev; 542 dev = v->dev;
@@ -297,17 +548,17 @@ static int vif_delete(struct net *net, int vifi, int notify,
297 } 548 }
298 549
299#ifdef CONFIG_IP_PIMSM 550#ifdef CONFIG_IP_PIMSM
300 if (vifi == net->ipv4.mroute_reg_vif_num) 551 if (vifi == mrt->mroute_reg_vif_num)
301 net->ipv4.mroute_reg_vif_num = -1; 552 mrt->mroute_reg_vif_num = -1;
302#endif 553#endif
303 554
304 if (vifi+1 == net->ipv4.maxvif) { 555 if (vifi+1 == mrt->maxvif) {
305 int tmp; 556 int tmp;
306 for (tmp=vifi-1; tmp>=0; tmp--) { 557 for (tmp=vifi-1; tmp>=0; tmp--) {
307 if (VIF_EXISTS(net, tmp)) 558 if (VIF_EXISTS(mrt, tmp))
308 break; 559 break;
309 } 560 }
310 net->ipv4.maxvif = tmp+1; 561 mrt->maxvif = tmp+1;
311 } 562 }
312 563
313 write_unlock_bh(&mrt_lock); 564 write_unlock_bh(&mrt_lock);
@@ -328,7 +579,6 @@ static int vif_delete(struct net *net, int vifi, int notify,
328 579
329static inline void ipmr_cache_free(struct mfc_cache *c) 580static inline void ipmr_cache_free(struct mfc_cache *c)
330{ 581{
331 release_net(mfc_net(c));
332 kmem_cache_free(mrt_cachep, c); 582 kmem_cache_free(mrt_cachep, c);
333} 583}
334 584
@@ -336,13 +586,13 @@ static inline void ipmr_cache_free(struct mfc_cache *c)
336 and reporting error to netlink readers. 586 and reporting error to netlink readers.
337 */ 587 */
338 588
339static void ipmr_destroy_unres(struct mfc_cache *c) 589static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
340{ 590{
591 struct net *net = read_pnet(&mrt->net);
341 struct sk_buff *skb; 592 struct sk_buff *skb;
342 struct nlmsgerr *e; 593 struct nlmsgerr *e;
343 struct net *net = mfc_net(c);
344 594
345 atomic_dec(&net->ipv4.cache_resolve_queue_len); 595 atomic_dec(&mrt->cache_resolve_queue_len);
346 596
347 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { 597 while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
348 if (ip_hdr(skb)->version == 0) { 598 if (ip_hdr(skb)->version == 0) {
@@ -363,42 +613,40 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
363} 613}
364 614
365 615
366/* Single timer process for all the unresolved queue. */ 616/* Timer process for the unresolved queue. */
367 617
368static void ipmr_expire_process(unsigned long dummy) 618static void ipmr_expire_process(unsigned long arg)
369{ 619{
620 struct mr_table *mrt = (struct mr_table *)arg;
370 unsigned long now; 621 unsigned long now;
371 unsigned long expires; 622 unsigned long expires;
372 struct mfc_cache *c, **cp; 623 struct mfc_cache *c, *next;
373 624
374 if (!spin_trylock(&mfc_unres_lock)) { 625 if (!spin_trylock(&mfc_unres_lock)) {
375 mod_timer(&ipmr_expire_timer, jiffies+HZ/10); 626 mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
376 return; 627 return;
377 } 628 }
378 629
379 if (mfc_unres_queue == NULL) 630 if (list_empty(&mrt->mfc_unres_queue))
380 goto out; 631 goto out;
381 632
382 now = jiffies; 633 now = jiffies;
383 expires = 10*HZ; 634 expires = 10*HZ;
384 cp = &mfc_unres_queue;
385 635
386 while ((c=*cp) != NULL) { 636 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
387 if (time_after(c->mfc_un.unres.expires, now)) { 637 if (time_after(c->mfc_un.unres.expires, now)) {
388 unsigned long interval = c->mfc_un.unres.expires - now; 638 unsigned long interval = c->mfc_un.unres.expires - now;
389 if (interval < expires) 639 if (interval < expires)
390 expires = interval; 640 expires = interval;
391 cp = &c->next;
392 continue; 641 continue;
393 } 642 }
394 643
395 *cp = c->next; 644 list_del(&c->list);
396 645 ipmr_destroy_unres(mrt, c);
397 ipmr_destroy_unres(c);
398 } 646 }
399 647
400 if (mfc_unres_queue != NULL) 648 if (!list_empty(&mrt->mfc_unres_queue))
401 mod_timer(&ipmr_expire_timer, jiffies + expires); 649 mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
402 650
403out: 651out:
404 spin_unlock(&mfc_unres_lock); 652 spin_unlock(&mfc_unres_lock);
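
The expire timer now walks the per-table unresolved queue with list_for_each_entry_safe(), which caches the next node up front so the current entry can be unlinked and freed mid-walk; the old open-coded **cp pointer chasing is gone. The idiom in isolation, as a generic kernel-style sketch (struct entry and expire_entries are illustrative names):

	#include <linux/jiffies.h>
	#include <linux/list.h>
	#include <linux/slab.h>

	struct entry {
		struct list_head list;
		unsigned long expires;
	};

	static void expire_entries(struct list_head *head, unsigned long now)
	{
		struct entry *pos, *next;

		list_for_each_entry_safe(pos, next, head, list) {
			if (time_after(pos->expires, now))
				continue;	/* not due yet */
			list_del(&pos->list);	/* safe: 'next' was cached */
			kfree(pos);
		}
	}
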
@@ -406,17 +654,17 @@ out:
406 654
407/* Fill oifs list. It is called under write locked mrt_lock. */ 655/* Fill oifs list. It is called under write locked mrt_lock. */
408 656
409static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls) 657static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
658 unsigned char *ttls)
410{ 659{
411 int vifi; 660 int vifi;
412 struct net *net = mfc_net(cache);
413 661
414 cache->mfc_un.res.minvif = MAXVIFS; 662 cache->mfc_un.res.minvif = MAXVIFS;
415 cache->mfc_un.res.maxvif = 0; 663 cache->mfc_un.res.maxvif = 0;
416 memset(cache->mfc_un.res.ttls, 255, MAXVIFS); 664 memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
417 665
418 for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) { 666 for (vifi = 0; vifi < mrt->maxvif; vifi++) {
419 if (VIF_EXISTS(net, vifi) && 667 if (VIF_EXISTS(mrt, vifi) &&
420 ttls[vifi] && ttls[vifi] < 255) { 668 ttls[vifi] && ttls[vifi] < 255) {
421 cache->mfc_un.res.ttls[vifi] = ttls[vifi]; 669 cache->mfc_un.res.ttls[vifi] = ttls[vifi];
422 if (cache->mfc_un.res.minvif > vifi) 670 if (cache->mfc_un.res.minvif > vifi)
@@ -427,16 +675,17 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
427 } 675 }
428} 676}
429 677
430static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) 678static int vif_add(struct net *net, struct mr_table *mrt,
679 struct vifctl *vifc, int mrtsock)
431{ 680{
432 int vifi = vifc->vifc_vifi; 681 int vifi = vifc->vifc_vifi;
433 struct vif_device *v = &net->ipv4.vif_table[vifi]; 682 struct vif_device *v = &mrt->vif_table[vifi];
434 struct net_device *dev; 683 struct net_device *dev;
435 struct in_device *in_dev; 684 struct in_device *in_dev;
436 int err; 685 int err;
437 686
438 /* Is vif busy ? */ 687 /* Is vif busy ? */
439 if (VIF_EXISTS(net, vifi)) 688 if (VIF_EXISTS(mrt, vifi))
440 return -EADDRINUSE; 689 return -EADDRINUSE;
441 690
442 switch (vifc->vifc_flags) { 691 switch (vifc->vifc_flags) {
@@ -446,9 +695,9 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
446 * Special Purpose VIF in PIM 695 * Special Purpose VIF in PIM
447 * All the packets will be sent to the daemon 696 * All the packets will be sent to the daemon
448 */ 697 */
449 if (net->ipv4.mroute_reg_vif_num >= 0) 698 if (mrt->mroute_reg_vif_num >= 0)
450 return -EADDRINUSE; 699 return -EADDRINUSE;
451 dev = ipmr_reg_vif(net); 700 dev = ipmr_reg_vif(net, mrt);
452 if (!dev) 701 if (!dev)
453 return -ENOBUFS; 702 return -ENOBUFS;
454 err = dev_set_allmulti(dev, 1); 703 err = dev_set_allmulti(dev, 1);
@@ -524,49 +773,47 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
524 v->dev = dev; 773 v->dev = dev;
525#ifdef CONFIG_IP_PIMSM 774#ifdef CONFIG_IP_PIMSM
526 if (v->flags&VIFF_REGISTER) 775 if (v->flags&VIFF_REGISTER)
527 net->ipv4.mroute_reg_vif_num = vifi; 776 mrt->mroute_reg_vif_num = vifi;
528#endif 777#endif
529 if (vifi+1 > net->ipv4.maxvif) 778 if (vifi+1 > mrt->maxvif)
530 net->ipv4.maxvif = vifi+1; 779 mrt->maxvif = vifi+1;
531 write_unlock_bh(&mrt_lock); 780 write_unlock_bh(&mrt_lock);
532 return 0; 781 return 0;
533} 782}
534 783
535static struct mfc_cache *ipmr_cache_find(struct net *net, 784static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
536 __be32 origin, 785 __be32 origin,
537 __be32 mcastgrp) 786 __be32 mcastgrp)
538{ 787{
539 int line = MFC_HASH(mcastgrp, origin); 788 int line = MFC_HASH(mcastgrp, origin);
540 struct mfc_cache *c; 789 struct mfc_cache *c;
541 790
542 for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) { 791 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
543 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) 792 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
544 break; 793 return c;
545 } 794 }
546 return c; 795 return NULL;
547} 796}
548 797
549/* 798/*
550 * Allocate a multicast cache entry 799 * Allocate a multicast cache entry
551 */ 800 */
552static struct mfc_cache *ipmr_cache_alloc(struct net *net) 801static struct mfc_cache *ipmr_cache_alloc(void)
553{ 802{
554 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 803 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
555 if (c == NULL) 804 if (c == NULL)
556 return NULL; 805 return NULL;
557 c->mfc_un.res.minvif = MAXVIFS; 806 c->mfc_un.res.minvif = MAXVIFS;
558 mfc_net_set(c, net);
559 return c; 807 return c;
560} 808}
561 809
562static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net) 810static struct mfc_cache *ipmr_cache_alloc_unres(void)
563{ 811{
564 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 812 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
565 if (c == NULL) 813 if (c == NULL)
566 return NULL; 814 return NULL;
567 skb_queue_head_init(&c->mfc_un.unres.unresolved); 815 skb_queue_head_init(&c->mfc_un.unres.unresolved);
568 c->mfc_un.unres.expires = jiffies + 10*HZ; 816 c->mfc_un.unres.expires = jiffies + 10*HZ;
569 mfc_net_set(c, net);
570 return c; 817 return c;
571} 818}
572 819
@@ -574,7 +821,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
574 * A cache entry has gone into a resolved state from queued 821 * A cache entry has gone into a resolved state from queued
575 */ 822 */
576 823
577static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 824static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
825 struct mfc_cache *uc, struct mfc_cache *c)
578{ 826{
579 struct sk_buff *skb; 827 struct sk_buff *skb;
580 struct nlmsgerr *e; 828 struct nlmsgerr *e;
@@ -587,7 +835,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
587 if (ip_hdr(skb)->version == 0) { 835 if (ip_hdr(skb)->version == 0) {
588 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 836 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
589 837
590 if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { 838 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
591 nlh->nlmsg_len = (skb_tail_pointer(skb) - 839 nlh->nlmsg_len = (skb_tail_pointer(skb) -
592 (u8 *)nlh); 840 (u8 *)nlh);
593 } else { 841 } else {
@@ -599,9 +847,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
599 memset(&e->msg, 0, sizeof(e->msg)); 847 memset(&e->msg, 0, sizeof(e->msg));
600 } 848 }
601 849
602 rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid); 850 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
603 } else 851 } else
604 ip_mr_forward(skb, c, 0); 852 ip_mr_forward(net, mrt, skb, c, 0);
605 } 853 }
606} 854}
607 855
@@ -612,7 +860,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
612 * Called under mrt_lock. 860 * Called under mrt_lock.
613 */ 861 */
614 862
615static int ipmr_cache_report(struct net *net, 863static int ipmr_cache_report(struct mr_table *mrt,
616 struct sk_buff *pkt, vifi_t vifi, int assert) 864 struct sk_buff *pkt, vifi_t vifi, int assert)
617{ 865{
618 struct sk_buff *skb; 866 struct sk_buff *skb;
@@ -645,7 +893,7 @@ static int ipmr_cache_report(struct net *net,
645 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); 893 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
646 msg->im_msgtype = IGMPMSG_WHOLEPKT; 894 msg->im_msgtype = IGMPMSG_WHOLEPKT;
647 msg->im_mbz = 0; 895 msg->im_mbz = 0;
648 msg->im_vif = net->ipv4.mroute_reg_vif_num; 896 msg->im_vif = mrt->mroute_reg_vif_num;
649 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; 897 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
650 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + 898 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
651 sizeof(struct iphdr)); 899 sizeof(struct iphdr));
@@ -677,7 +925,7 @@ static int ipmr_cache_report(struct net *net,
677 skb->transport_header = skb->network_header; 925 skb->transport_header = skb->network_header;
678 } 926 }
679 927
680 if (net->ipv4.mroute_sk == NULL) { 928 if (mrt->mroute_sk == NULL) {
681 kfree_skb(skb); 929 kfree_skb(skb);
682 return -EINVAL; 930 return -EINVAL;
683 } 931 }
@@ -685,7 +933,7 @@ static int ipmr_cache_report(struct net *net,
685 /* 933 /*
686 * Deliver to mrouted 934 * Deliver to mrouted
687 */ 935 */
688 ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb); 936 ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
689 if (ret < 0) { 937 if (ret < 0) {
690 if (net_ratelimit()) 938 if (net_ratelimit())
691 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 939 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -700,27 +948,29 @@ static int ipmr_cache_report(struct net *net,
700 */ 948 */
701 949
702static int 950static int
703ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb) 951ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
704{ 952{
953 bool found = false;
705 int err; 954 int err;
706 struct mfc_cache *c; 955 struct mfc_cache *c;
707 const struct iphdr *iph = ip_hdr(skb); 956 const struct iphdr *iph = ip_hdr(skb);
708 957
709 spin_lock_bh(&mfc_unres_lock); 958 spin_lock_bh(&mfc_unres_lock);
710 for (c=mfc_unres_queue; c; c=c->next) { 959 list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
711 if (net_eq(mfc_net(c), net) && 960 if (c->mfc_mcastgrp == iph->daddr &&
712 c->mfc_mcastgrp == iph->daddr && 961 c->mfc_origin == iph->saddr) {
713 c->mfc_origin == iph->saddr) 962 found = true;
714 break; 963 break;
964 }
715 } 965 }
716 966
717 if (c == NULL) { 967 if (!found) {
718 /* 968 /*
719 * Create a new entry if allowable 969 * Create a new entry if allowable
720 */ 970 */
721 971
722 if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 || 972 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
723 (c = ipmr_cache_alloc_unres(net)) == NULL) { 973 (c = ipmr_cache_alloc_unres()) == NULL) {
724 spin_unlock_bh(&mfc_unres_lock); 974 spin_unlock_bh(&mfc_unres_lock);
725 975
726 kfree_skb(skb); 976 kfree_skb(skb);
@@ -737,7 +987,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
737 /* 987 /*
738 * Reflect first query at mrouted. 988 * Reflect first query at mrouted.
739 */ 989 */
740 err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE); 990 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
741 if (err < 0) { 991 if (err < 0) {
742 /* If the report failed throw the cache entry 992 /* If the report failed throw the cache entry
743 out - Brad Parker 993 out - Brad Parker
@@ -749,11 +999,11 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
749 return err; 999 return err;
750 } 1000 }
751 1001
752 atomic_inc(&net->ipv4.cache_resolve_queue_len); 1002 atomic_inc(&mrt->cache_resolve_queue_len);
753 c->next = mfc_unres_queue; 1003 list_add(&c->list, &mrt->mfc_unres_queue);
754 mfc_unres_queue = c;
755 1004
756 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires); 1005 if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1006 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
757 } 1007 }
758 1008
759 /* 1009 /*
@@ -775,19 +1025,18 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
775 * MFC cache manipulation by user space mroute daemon 1025 * MFC cache manipulation by user space mroute daemon
776 */ 1026 */
777 1027
778static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc) 1028static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
779{ 1029{
780 int line; 1030 int line;
781 struct mfc_cache *c, **cp; 1031 struct mfc_cache *c, *next;
782 1032
783 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1033 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
784 1034
785 for (cp = &net->ipv4.mfc_cache_array[line]; 1035 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
786 (c = *cp) != NULL; cp = &c->next) {
787 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1036 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
788 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1037 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
789 write_lock_bh(&mrt_lock); 1038 write_lock_bh(&mrt_lock);
790 *cp = c->next; 1039 list_del(&c->list);
791 write_unlock_bh(&mrt_lock); 1040 write_unlock_bh(&mrt_lock);
792 1041
793 ipmr_cache_free(c); 1042 ipmr_cache_free(c);
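[Annotation: this hunk is part of the file-wide move from hand-rolled singly linked chains (the struct mfc_cache **cp walk) to the kernel's doubly linked list_head API. Below is a minimal userspace sketch of that deletion idiom; list_head, list_del() and list_for_each_entry_safe() are simplified re-implementations of the kernel helpers, written out only to make the pattern concrete:]

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_for_each_entry_safe(pos, n, head, member)			\
	for (pos = list_entry((head)->next, typeof(*pos), member),	\
	     n = list_entry(pos->member.next, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.next, typeof(*n), member))

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

struct mfc_entry {			/* stand-in for struct mfc_cache */
	unsigned int origin;
	struct list_head list;
};

int main(void)
{
	struct list_head bucket;	/* one mfc_cache_array[] line */
	struct mfc_entry *c, *next;
	unsigned int i;

	INIT_LIST_HEAD(&bucket);
	for (i = 0; i < 3; i++) {
		c = malloc(sizeof(*c));
		c->origin = i;
		list_add(&c->list, &bucket);
	}

	/* No **cp back-pointer dance: just unlink and free the node,
	 * as the rewritten ipmr_mfc_delete() does under mrt_lock. */
	list_for_each_entry_safe(c, next, &bucket, list)
		if (c->origin == 1) {
			list_del(&c->list);
			free(c);
		}
	return 0;
}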
@@ -797,24 +1046,30 @@ static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
797 return -ENOENT; 1046 return -ENOENT;
798} 1047}
799 1048
800static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) 1049static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1050 struct mfcctl *mfc, int mrtsock)
801{ 1051{
1052 bool found = false;
802 int line; 1053 int line;
803 struct mfc_cache *uc, *c, **cp; 1054 struct mfc_cache *uc, *c;
1055
1056 if (mfc->mfcc_parent >= MAXVIFS)
1057 return -ENFILE;
804 1058
805 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); 1059 line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
806 1060
807 for (cp = &net->ipv4.mfc_cache_array[line]; 1061 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
808 (c = *cp) != NULL; cp = &c->next) {
809 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1062 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
810 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) 1063 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1064 found = true;
811 break; 1065 break;
1066 }
812 } 1067 }
813 1068
814 if (c != NULL) { 1069 if (found) {
815 write_lock_bh(&mrt_lock); 1070 write_lock_bh(&mrt_lock);
816 c->mfc_parent = mfc->mfcc_parent; 1071 c->mfc_parent = mfc->mfcc_parent;
817 ipmr_update_thresholds(c, mfc->mfcc_ttls); 1072 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
818 if (!mrtsock) 1073 if (!mrtsock)
819 c->mfc_flags |= MFC_STATIC; 1074 c->mfc_flags |= MFC_STATIC;
820 write_unlock_bh(&mrt_lock); 1075 write_unlock_bh(&mrt_lock);
@@ -824,43 +1079,42 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
824 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) 1079 if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
825 return -EINVAL; 1080 return -EINVAL;
826 1081
827 c = ipmr_cache_alloc(net); 1082 c = ipmr_cache_alloc();
828 if (c == NULL) 1083 if (c == NULL)
829 return -ENOMEM; 1084 return -ENOMEM;
830 1085
831 c->mfc_origin = mfc->mfcc_origin.s_addr; 1086 c->mfc_origin = mfc->mfcc_origin.s_addr;
832 c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; 1087 c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
833 c->mfc_parent = mfc->mfcc_parent; 1088 c->mfc_parent = mfc->mfcc_parent;
834 ipmr_update_thresholds(c, mfc->mfcc_ttls); 1089 ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
835 if (!mrtsock) 1090 if (!mrtsock)
836 c->mfc_flags |= MFC_STATIC; 1091 c->mfc_flags |= MFC_STATIC;
837 1092
838 write_lock_bh(&mrt_lock); 1093 write_lock_bh(&mrt_lock);
839 c->next = net->ipv4.mfc_cache_array[line]; 1094 list_add(&c->list, &mrt->mfc_cache_array[line]);
840 net->ipv4.mfc_cache_array[line] = c;
841 write_unlock_bh(&mrt_lock); 1095 write_unlock_bh(&mrt_lock);
842 1096
843 /* 1097 /*
844 * Check to see if we resolved a queued list. If so we 1098 * Check to see if we resolved a queued list. If so we
845 * need to send on the frames and tidy up. 1099 * need to send on the frames and tidy up.
846 */ 1100 */
1101 found = false;
847 spin_lock_bh(&mfc_unres_lock); 1102 spin_lock_bh(&mfc_unres_lock);
848 for (cp = &mfc_unres_queue; (uc=*cp) != NULL; 1103 list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
849 cp = &uc->next) { 1104 if (uc->mfc_origin == c->mfc_origin &&
850 if (net_eq(mfc_net(uc), net) &&
851 uc->mfc_origin == c->mfc_origin &&
852 uc->mfc_mcastgrp == c->mfc_mcastgrp) { 1105 uc->mfc_mcastgrp == c->mfc_mcastgrp) {
853 *cp = uc->next; 1106 list_del(&uc->list);
854 atomic_dec(&net->ipv4.cache_resolve_queue_len); 1107 atomic_dec(&mrt->cache_resolve_queue_len);
1108 found = true;
855 break; 1109 break;
856 } 1110 }
857 } 1111 }
858 if (mfc_unres_queue == NULL) 1112 if (list_empty(&mrt->mfc_unres_queue))
859 del_timer(&ipmr_expire_timer); 1113 del_timer(&mrt->ipmr_expire_timer);
860 spin_unlock_bh(&mfc_unres_lock); 1114 spin_unlock_bh(&mfc_unres_lock);
861 1115
862 if (uc) { 1116 if (found) {
863 ipmr_cache_resolve(uc, c); 1117 ipmr_cache_resolve(net, mrt, uc, c);
864 ipmr_cache_free(uc); 1118 ipmr_cache_free(uc);
865 } 1119 }
866 return 0; 1120 return 0;
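[Annotation: both add and delete locate their bucket with MFC_HASH(mcastgrp, origin). The exact macro lives in <linux/mroute.h>; the sketch below only mirrors its shape (fold both addresses, mask into one of MFC_LINES buckets) and is not the in-tree formula:]

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define MFC_LINES 64	/* matches the in-tree bucket count */

/* Shape of MFC_HASH(): mix group and origin, mask to a cache line. */
static unsigned int mfc_hash(uint32_t mcastgrp, uint32_t origin)
{
	return (ntohl(mcastgrp) ^ ntohl(origin)) & (MFC_LINES - 1);
}

int main(void)
{
	uint32_t grp, src;

	inet_pton(AF_INET, "224.1.2.3", &grp);
	inet_pton(AF_INET, "10.0.0.1", &src);
	printf("(S,G) hashes to line %u of %u\n",
	       mfc_hash(grp, src), MFC_LINES);
	return 0;
}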
@@ -870,53 +1124,41 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
870 * Close the multicast socket, and clear the vif tables etc 1124 * Close the multicast socket, and clear the vif tables etc
871 */ 1125 */
872 1126
873static void mroute_clean_tables(struct net *net) 1127static void mroute_clean_tables(struct mr_table *mrt)
874{ 1128{
875 int i; 1129 int i;
876 LIST_HEAD(list); 1130 LIST_HEAD(list);
1131 struct mfc_cache *c, *next;
877 1132
878 /* 1133 /*
879 * Shut down all active vif entries 1134 * Shut down all active vif entries
880 */ 1135 */
881 for (i = 0; i < net->ipv4.maxvif; i++) { 1136 for (i = 0; i < mrt->maxvif; i++) {
882 if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC)) 1137 if (!(mrt->vif_table[i].flags&VIFF_STATIC))
883 vif_delete(net, i, 0, &list); 1138 vif_delete(mrt, i, 0, &list);
884 } 1139 }
885 unregister_netdevice_many(&list); 1140 unregister_netdevice_many(&list);
886 1141
887 /* 1142 /*
888 * Wipe the cache 1143 * Wipe the cache
889 */ 1144 */
890 for (i=0; i<MFC_LINES; i++) { 1145 for (i = 0; i < MFC_LINES; i++) {
891 struct mfc_cache *c, **cp; 1146 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
892 1147 if (c->mfc_flags&MFC_STATIC)
893 cp = &net->ipv4.mfc_cache_array[i];
894 while ((c = *cp) != NULL) {
895 if (c->mfc_flags&MFC_STATIC) {
896 cp = &c->next;
897 continue; 1148 continue;
898 }
899 write_lock_bh(&mrt_lock); 1149 write_lock_bh(&mrt_lock);
900 *cp = c->next; 1150 list_del(&c->list);
901 write_unlock_bh(&mrt_lock); 1151 write_unlock_bh(&mrt_lock);
902 1152
903 ipmr_cache_free(c); 1153 ipmr_cache_free(c);
904 } 1154 }
905 } 1155 }
906 1156
907 if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) { 1157 if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
908 struct mfc_cache *c, **cp;
909
910 spin_lock_bh(&mfc_unres_lock); 1158 spin_lock_bh(&mfc_unres_lock);
911 cp = &mfc_unres_queue; 1159 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
912 while ((c = *cp) != NULL) { 1160 list_del(&c->list);
913 if (!net_eq(mfc_net(c), net)) { 1161 ipmr_destroy_unres(mrt, c);
914 cp = &c->next;
915 continue;
916 }
917 *cp = c->next;
918
919 ipmr_destroy_unres(c);
920 } 1162 }
921 spin_unlock_bh(&mfc_unres_lock); 1163 spin_unlock_bh(&mfc_unres_lock);
922 } 1164 }
@@ -925,16 +1167,19 @@ static void mroute_clean_tables(struct net *net)
925static void mrtsock_destruct(struct sock *sk) 1167static void mrtsock_destruct(struct sock *sk)
926{ 1168{
927 struct net *net = sock_net(sk); 1169 struct net *net = sock_net(sk);
1170 struct mr_table *mrt;
928 1171
929 rtnl_lock(); 1172 rtnl_lock();
930 if (sk == net->ipv4.mroute_sk) { 1173 ipmr_for_each_table(mrt, net) {
931 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1174 if (sk == mrt->mroute_sk) {
1175 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
932 1176
933 write_lock_bh(&mrt_lock); 1177 write_lock_bh(&mrt_lock);
934 net->ipv4.mroute_sk = NULL; 1178 mrt->mroute_sk = NULL;
935 write_unlock_bh(&mrt_lock); 1179 write_unlock_bh(&mrt_lock);
936 1180
937 mroute_clean_tables(net); 1181 mroute_clean_tables(mrt);
1182 }
938 } 1183 }
939 rtnl_unlock(); 1184 rtnl_unlock();
940} 1185}
@@ -952,9 +1197,14 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
952 struct vifctl vif; 1197 struct vifctl vif;
953 struct mfcctl mfc; 1198 struct mfcctl mfc;
954 struct net *net = sock_net(sk); 1199 struct net *net = sock_net(sk);
1200 struct mr_table *mrt;
1201
1202 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1203 if (mrt == NULL)
1204 return -ENOENT;
955 1205
956 if (optname != MRT_INIT) { 1206 if (optname != MRT_INIT) {
957 if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN)) 1207 if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
958 return -EACCES; 1208 return -EACCES;
959 } 1209 }
960 1210
@@ -967,7 +1217,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
967 return -ENOPROTOOPT; 1217 return -ENOPROTOOPT;
968 1218
969 rtnl_lock(); 1219 rtnl_lock();
970 if (net->ipv4.mroute_sk) { 1220 if (mrt->mroute_sk) {
971 rtnl_unlock(); 1221 rtnl_unlock();
972 return -EADDRINUSE; 1222 return -EADDRINUSE;
973 } 1223 }
@@ -975,7 +1225,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
975 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1225 ret = ip_ra_control(sk, 1, mrtsock_destruct);
976 if (ret == 0) { 1226 if (ret == 0) {
977 write_lock_bh(&mrt_lock); 1227 write_lock_bh(&mrt_lock);
978 net->ipv4.mroute_sk = sk; 1228 mrt->mroute_sk = sk;
979 write_unlock_bh(&mrt_lock); 1229 write_unlock_bh(&mrt_lock);
980 1230
981 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1231 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
@@ -983,7 +1233,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
983 rtnl_unlock(); 1233 rtnl_unlock();
984 return ret; 1234 return ret;
985 case MRT_DONE: 1235 case MRT_DONE:
986 if (sk != net->ipv4.mroute_sk) 1236 if (sk != mrt->mroute_sk)
987 return -EACCES; 1237 return -EACCES;
988 return ip_ra_control(sk, 0, NULL); 1238 return ip_ra_control(sk, 0, NULL);
989 case MRT_ADD_VIF: 1239 case MRT_ADD_VIF:
@@ -996,9 +1246,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
996 return -ENFILE; 1246 return -ENFILE;
997 rtnl_lock(); 1247 rtnl_lock();
998 if (optname == MRT_ADD_VIF) { 1248 if (optname == MRT_ADD_VIF) {
999 ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk); 1249 ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
1000 } else { 1250 } else {
1001 ret = vif_delete(net, vif.vifc_vifi, 0, NULL); 1251 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1002 } 1252 }
1003 rtnl_unlock(); 1253 rtnl_unlock();
1004 return ret; 1254 return ret;
@@ -1015,9 +1265,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1015 return -EFAULT; 1265 return -EFAULT;
1016 rtnl_lock(); 1266 rtnl_lock();
1017 if (optname == MRT_DEL_MFC) 1267 if (optname == MRT_DEL_MFC)
1018 ret = ipmr_mfc_delete(net, &mfc); 1268 ret = ipmr_mfc_delete(mrt, &mfc);
1019 else 1269 else
1020 ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk); 1270 ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
1021 rtnl_unlock(); 1271 rtnl_unlock();
1022 return ret; 1272 return ret;
1023 /* 1273 /*
@@ -1028,7 +1278,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1028 int v; 1278 int v;
1029 if (get_user(v,(int __user *)optval)) 1279 if (get_user(v,(int __user *)optval))
1030 return -EFAULT; 1280 return -EFAULT;
1031 net->ipv4.mroute_do_assert = (v) ? 1 : 0; 1281 mrt->mroute_do_assert = (v) ? 1 : 0;
1032 return 0; 1282 return 0;
1033 } 1283 }
1034#ifdef CONFIG_IP_PIMSM 1284#ifdef CONFIG_IP_PIMSM
@@ -1042,14 +1292,35 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1042 1292
1043 rtnl_lock(); 1293 rtnl_lock();
1044 ret = 0; 1294 ret = 0;
1045 if (v != net->ipv4.mroute_do_pim) { 1295 if (v != mrt->mroute_do_pim) {
1046 net->ipv4.mroute_do_pim = v; 1296 mrt->mroute_do_pim = v;
1047 net->ipv4.mroute_do_assert = v; 1297 mrt->mroute_do_assert = v;
1048 } 1298 }
1049 rtnl_unlock(); 1299 rtnl_unlock();
1050 return ret; 1300 return ret;
1051 } 1301 }
1052#endif 1302#endif
1303#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
1304 case MRT_TABLE:
1305 {
1306 u32 v;
1307
1308 if (optlen != sizeof(u32))
1309 return -EINVAL;
1310 if (get_user(v, (u32 __user *)optval))
1311 return -EFAULT;
1312 if (sk == mrt->mroute_sk)
1313 return -EBUSY;
1314
1315 rtnl_lock();
1316 ret = 0;
1317 if (!ipmr_new_table(net, v))
1318 ret = -ENOMEM;
1319 raw_sk(sk)->ipmr_table = v;
1320 rtnl_unlock();
1321 return ret;
1322 }
1323#endif
1053 /* 1324 /*
1054 * Spurious command, or MRT_VERSION which you cannot 1325 * Spurious command, or MRT_VERSION which you cannot
1055 * set. 1326 * set.
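[Annotation: the new MRT_TABLE case lets a daemon claim its own mr_table, and it must do so before MRT_INIT, since the handler returns -EBUSY once the socket is mroute_sk. A hedged userspace sketch; table id 100 is an arbitrary example, and CONFIG_IP_MROUTE_MULTIPLE_TABLES plus CAP_NET_ADMIN are assumed:]

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>

int main(void)
{
	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	uint32_t table = 100;	/* arbitrary example table id */
	int one = 1;

	if (s < 0) {
		perror("socket");
		return 1;
	}
	/* Order matters: select the table first, then initialize. */
	if (setsockopt(s, IPPROTO_IP, MRT_TABLE, &table, sizeof(table)) < 0)
		perror("MRT_TABLE");
	if (setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
		perror("MRT_INIT");
	return 0;
}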
@@ -1068,6 +1339,11 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1068 int olr; 1339 int olr;
1069 int val; 1340 int val;
1070 struct net *net = sock_net(sk); 1341 struct net *net = sock_net(sk);
1342 struct mr_table *mrt;
1343
1344 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1345 if (mrt == NULL)
1346 return -ENOENT;
1071 1347
1072 if (optname != MRT_VERSION && 1348 if (optname != MRT_VERSION &&
1073#ifdef CONFIG_IP_PIMSM 1349#ifdef CONFIG_IP_PIMSM
@@ -1089,10 +1365,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1089 val = 0x0305; 1365 val = 0x0305;
1090#ifdef CONFIG_IP_PIMSM 1366#ifdef CONFIG_IP_PIMSM
1091 else if (optname == MRT_PIM) 1367 else if (optname == MRT_PIM)
1092 val = net->ipv4.mroute_do_pim; 1368 val = mrt->mroute_do_pim;
1093#endif 1369#endif
1094 else 1370 else
1095 val = net->ipv4.mroute_do_assert; 1371 val = mrt->mroute_do_assert;
1096 if (copy_to_user(optval, &val, olr)) 1372 if (copy_to_user(optval, &val, olr))
1097 return -EFAULT; 1373 return -EFAULT;
1098 return 0; 1374 return 0;
@@ -1109,16 +1385,21 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1109 struct vif_device *vif; 1385 struct vif_device *vif;
1110 struct mfc_cache *c; 1386 struct mfc_cache *c;
1111 struct net *net = sock_net(sk); 1387 struct net *net = sock_net(sk);
1388 struct mr_table *mrt;
1389
1390 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1391 if (mrt == NULL)
1392 return -ENOENT;
1112 1393
1113 switch (cmd) { 1394 switch (cmd) {
1114 case SIOCGETVIFCNT: 1395 case SIOCGETVIFCNT:
1115 if (copy_from_user(&vr, arg, sizeof(vr))) 1396 if (copy_from_user(&vr, arg, sizeof(vr)))
1116 return -EFAULT; 1397 return -EFAULT;
1117 if (vr.vifi >= net->ipv4.maxvif) 1398 if (vr.vifi >= mrt->maxvif)
1118 return -EINVAL; 1399 return -EINVAL;
1119 read_lock(&mrt_lock); 1400 read_lock(&mrt_lock);
1120 vif = &net->ipv4.vif_table[vr.vifi]; 1401 vif = &mrt->vif_table[vr.vifi];
1121 if (VIF_EXISTS(net, vr.vifi)) { 1402 if (VIF_EXISTS(mrt, vr.vifi)) {
1122 vr.icount = vif->pkt_in; 1403 vr.icount = vif->pkt_in;
1123 vr.ocount = vif->pkt_out; 1404 vr.ocount = vif->pkt_out;
1124 vr.ibytes = vif->bytes_in; 1405 vr.ibytes = vif->bytes_in;
@@ -1136,7 +1417,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1136 return -EFAULT; 1417 return -EFAULT;
1137 1418
1138 read_lock(&mrt_lock); 1419 read_lock(&mrt_lock);
1139 c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr); 1420 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1140 if (c) { 1421 if (c) {
1141 sr.pktcnt = c->mfc_un.res.pkt; 1422 sr.pktcnt = c->mfc_un.res.pkt;
1142 sr.bytecnt = c->mfc_un.res.bytes; 1423 sr.bytecnt = c->mfc_un.res.bytes;
@@ -1159,19 +1440,20 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1159{ 1440{
1160 struct net_device *dev = ptr; 1441 struct net_device *dev = ptr;
1161 struct net *net = dev_net(dev); 1442 struct net *net = dev_net(dev);
1443 struct mr_table *mrt;
1162 struct vif_device *v; 1444 struct vif_device *v;
1163 int ct; 1445 int ct;
1164 LIST_HEAD(list); 1446 LIST_HEAD(list);
1165 1447
1166 if (!net_eq(dev_net(dev), net))
1167 return NOTIFY_DONE;
1168
1169 if (event != NETDEV_UNREGISTER) 1448 if (event != NETDEV_UNREGISTER)
1170 return NOTIFY_DONE; 1449 return NOTIFY_DONE;
1171 v = &net->ipv4.vif_table[0]; 1450
1172 for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { 1451 ipmr_for_each_table(mrt, net) {
1173 if (v->dev == dev) 1452 v = &mrt->vif_table[0];
1174 vif_delete(net, ct, 1, &list); 1453 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1454 if (v->dev == dev)
1455 vif_delete(mrt, ct, 1, &list);
1456 }
1175 } 1457 }
1176 unregister_netdevice_many(&list); 1458 unregister_netdevice_many(&list);
1177 return NOTIFY_DONE; 1459 return NOTIFY_DONE;
@@ -1230,11 +1512,11 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
1230 * Processing handlers for ipmr_forward 1512 * Processing handlers for ipmr_forward
1231 */ 1513 */
1232 1514
1233static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) 1515static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1516 struct sk_buff *skb, struct mfc_cache *c, int vifi)
1234{ 1517{
1235 struct net *net = mfc_net(c);
1236 const struct iphdr *iph = ip_hdr(skb); 1518 const struct iphdr *iph = ip_hdr(skb);
1237 struct vif_device *vif = &net->ipv4.vif_table[vifi]; 1519 struct vif_device *vif = &mrt->vif_table[vifi];
1238 struct net_device *dev; 1520 struct net_device *dev;
1239 struct rtable *rt; 1521 struct rtable *rt;
1240 int encap = 0; 1522 int encap = 0;
@@ -1248,7 +1530,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1248 vif->bytes_out += skb->len; 1530 vif->bytes_out += skb->len;
1249 vif->dev->stats.tx_bytes += skb->len; 1531 vif->dev->stats.tx_bytes += skb->len;
1250 vif->dev->stats.tx_packets++; 1532 vif->dev->stats.tx_packets++;
1251 ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT); 1533 ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1252 goto out_free; 1534 goto out_free;
1253 } 1535 }
1254#endif 1536#endif
@@ -1273,9 +1555,9 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1273 goto out_free; 1555 goto out_free;
1274 } 1556 }
1275 1557
1276 dev = rt->u.dst.dev; 1558 dev = rt->dst.dev;
1277 1559
1278 if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { 1560 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1279 /* Do not fragment multicasts. Alas, IPv4 does not 1561 /* Do not fragment multicasts. Alas, IPv4 does not
1280 allow to send ICMP, so that packets will disappear 1562 allow to send ICMP, so that packets will disappear
1281 to blackhole. 1563 to blackhole.
@@ -1286,7 +1568,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1286 goto out_free; 1568 goto out_free;
1287 } 1569 }
1288 1570
1289 encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; 1571 encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
1290 1572
1291 if (skb_cow(skb, encap)) { 1573 if (skb_cow(skb, encap)) {
1292 ip_rt_put(rt); 1574 ip_rt_put(rt);
@@ -1297,7 +1579,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1297 vif->bytes_out += skb->len; 1579 vif->bytes_out += skb->len;
1298 1580
1299 skb_dst_drop(skb); 1581 skb_dst_drop(skb);
1300 skb_dst_set(skb, &rt->u.dst); 1582 skb_dst_set(skb, &rt->dst);
1301 ip_decrease_ttl(ip_hdr(skb)); 1583 ip_decrease_ttl(ip_hdr(skb));
1302 1584
1303 /* FIXME: forward and output firewalls used to be called here. 1585 /* FIXME: forward and output firewalls used to be called here.
@@ -1322,21 +1604,20 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1322 * not mrouter) cannot join to more than one interface - it will 1604 * not mrouter) cannot join to more than one interface - it will
1323 * result in receiving multiple packets. 1605 * result in receiving multiple packets.
1324 */ 1606 */
1325 NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev, 1607 NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
1326 ipmr_forward_finish); 1608 ipmr_forward_finish);
1327 return; 1609 return;
1328 1610
1329out_free: 1611out_free:
1330 kfree_skb(skb); 1612 kfree_skb(skb);
1331 return;
1332} 1613}
1333 1614
1334static int ipmr_find_vif(struct net_device *dev) 1615static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1335{ 1616{
1336 struct net *net = dev_net(dev);
1337 int ct; 1617 int ct;
1338 for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) { 1618
1339 if (net->ipv4.vif_table[ct].dev == dev) 1619 for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1620 if (mrt->vif_table[ct].dev == dev)
1340 break; 1621 break;
1341 } 1622 }
1342 return ct; 1623 return ct;
@@ -1344,11 +1625,12 @@ static int ipmr_find_vif(struct net_device *dev)
1344 1625
1345/* "local" means that we should preserve one skb (for local delivery) */ 1626/* "local" means that we should preserve one skb (for local delivery) */
1346 1627
1347static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) 1628static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1629 struct sk_buff *skb, struct mfc_cache *cache,
1630 int local)
1348{ 1631{
1349 int psend = -1; 1632 int psend = -1;
1350 int vif, ct; 1633 int vif, ct;
1351 struct net *net = mfc_net(cache);
1352 1634
1353 vif = cache->mfc_parent; 1635 vif = cache->mfc_parent;
1354 cache->mfc_un.res.pkt++; 1636 cache->mfc_un.res.pkt++;
@@ -1357,7 +1639,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1357 /* 1639 /*
1358 * Wrong interface: drop packet and (maybe) send PIM assert. 1640 * Wrong interface: drop packet and (maybe) send PIM assert.
1359 */ 1641 */
1360 if (net->ipv4.vif_table[vif].dev != skb->dev) { 1642 if (mrt->vif_table[vif].dev != skb->dev) {
1361 int true_vifi; 1643 int true_vifi;
1362 1644
1363 if (skb_rtable(skb)->fl.iif == 0) { 1645 if (skb_rtable(skb)->fl.iif == 0) {
@@ -1376,26 +1658,26 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1376 } 1658 }
1377 1659
1378 cache->mfc_un.res.wrong_if++; 1660 cache->mfc_un.res.wrong_if++;
1379 true_vifi = ipmr_find_vif(skb->dev); 1661 true_vifi = ipmr_find_vif(mrt, skb->dev);
1380 1662
1381 if (true_vifi >= 0 && net->ipv4.mroute_do_assert && 1663 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1382 /* pimsm uses asserts, when switching from RPT to SPT, 1664 /* pimsm uses asserts, when switching from RPT to SPT,
1383 so that we cannot check that packet arrived on an oif. 1665 so that we cannot check that packet arrived on an oif.
1384 It is bad, but otherwise we would need to move pretty 1666 It is bad, but otherwise we would need to move pretty
1385 large chunk of pimd to kernel. Ough... --ANK 1667 large chunk of pimd to kernel. Ough... --ANK
1386 */ 1668 */
1387 (net->ipv4.mroute_do_pim || 1669 (mrt->mroute_do_pim ||
1388 cache->mfc_un.res.ttls[true_vifi] < 255) && 1670 cache->mfc_un.res.ttls[true_vifi] < 255) &&
1389 time_after(jiffies, 1671 time_after(jiffies,
1390 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { 1672 cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1391 cache->mfc_un.res.last_assert = jiffies; 1673 cache->mfc_un.res.last_assert = jiffies;
1392 ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF); 1674 ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1393 } 1675 }
1394 goto dont_forward; 1676 goto dont_forward;
1395 } 1677 }
1396 1678
1397 net->ipv4.vif_table[vif].pkt_in++; 1679 mrt->vif_table[vif].pkt_in++;
1398 net->ipv4.vif_table[vif].bytes_in += skb->len; 1680 mrt->vif_table[vif].bytes_in += skb->len;
1399 1681
1400 /* 1682 /*
1401 * Forward the frame 1683 * Forward the frame
@@ -1405,7 +1687,8 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1405 if (psend != -1) { 1687 if (psend != -1) {
1406 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1688 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1407 if (skb2) 1689 if (skb2)
1408 ipmr_queue_xmit(skb2, cache, psend); 1690 ipmr_queue_xmit(net, mrt, skb2, cache,
1691 psend);
1409 } 1692 }
1410 psend = ct; 1693 psend = ct;
1411 } 1694 }
@@ -1414,9 +1697,9 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
1414 if (local) { 1697 if (local) {
1415 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1698 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1416 if (skb2) 1699 if (skb2)
1417 ipmr_queue_xmit(skb2, cache, psend); 1700 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1418 } else { 1701 } else {
1419 ipmr_queue_xmit(skb, cache, psend); 1702 ipmr_queue_xmit(net, mrt, skb, cache, psend);
1420 return 0; 1703 return 0;
1421 } 1704 }
1422 } 1705 }
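[Annotation: the clone-then-send loop above keeps the previous match in psend so only the final vif receives the original skb, and it is driven by per-vif TTL thresholds: 255 means never forward, anything lower forwards packets whose TTL exceeds it. A toy sketch of that selection, with made-up threshold values:]

#include <stdio.h>
#include <string.h>

#define MAXVIFS 32

int main(void)
{
	unsigned char ttls[MAXVIFS];
	int ip_ttl = 10;	/* TTL of the arriving packet */
	int ct, psend = -1;

	memset(ttls, 255, sizeof(ttls));	/* 255: never forward */
	ttls[0] = 1;
	ttls[2] = 5;

	/* Walk downward as ip_mr_forward() does, cloning for every
	 * match except the last, which gets the original packet. */
	for (ct = MAXVIFS - 1; ct >= 0; ct--) {
		if (ttls[ct] < 255 && ip_ttl > ttls[ct]) {
			if (psend != -1)
				printf("clone to vif %d\n", psend);
			psend = ct;
		}
	}
	if (psend != -1)
		printf("send original to vif %d\n", psend);
	return 0;
}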
@@ -1437,6 +1720,8 @@ int ip_mr_input(struct sk_buff *skb)
1437 struct mfc_cache *cache; 1720 struct mfc_cache *cache;
1438 struct net *net = dev_net(skb->dev); 1721 struct net *net = dev_net(skb->dev);
1439 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 1722 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1723 struct mr_table *mrt;
1724 int err;
1440 1725
1441 /* Packet is looped back after forward, it should not be 1726 /* Packet is looped back after forward, it should not be
1442 forwarded second time, but still can be delivered locally. 1727 forwarded second time, but still can be delivered locally.
@@ -1444,6 +1729,12 @@ int ip_mr_input(struct sk_buff *skb)
1444 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1729 if (IPCB(skb)->flags&IPSKB_FORWARDED)
1445 goto dont_forward; 1730 goto dont_forward;
1446 1731
1732 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1733 if (err < 0) {
1734 kfree_skb(skb);
1735 return err;
1736 }
1737
1447 if (!local) { 1738 if (!local) {
1448 if (IPCB(skb)->opt.router_alert) { 1739 if (IPCB(skb)->opt.router_alert) {
1449 if (ip_call_ra_chain(skb)) 1740 if (ip_call_ra_chain(skb))
@@ -1456,9 +1747,9 @@ int ip_mr_input(struct sk_buff *skb)
1456 that we can forward NO IGMP messages. 1747 that we can forward NO IGMP messages.
1457 */ 1748 */
1458 read_lock(&mrt_lock); 1749 read_lock(&mrt_lock);
1459 if (net->ipv4.mroute_sk) { 1750 if (mrt->mroute_sk) {
1460 nf_reset(skb); 1751 nf_reset(skb);
1461 raw_rcv(net->ipv4.mroute_sk, skb); 1752 raw_rcv(mrt->mroute_sk, skb);
1462 read_unlock(&mrt_lock); 1753 read_unlock(&mrt_lock);
1463 return 0; 1754 return 0;
1464 } 1755 }
@@ -1467,7 +1758,7 @@ int ip_mr_input(struct sk_buff *skb)
1467 } 1758 }
1468 1759
1469 read_lock(&mrt_lock); 1760 read_lock(&mrt_lock);
1470 cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1761 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1471 1762
1472 /* 1763 /*
1473 * No usable cache entry 1764 * No usable cache entry
@@ -1485,19 +1776,19 @@ int ip_mr_input(struct sk_buff *skb)
1485 skb = skb2; 1776 skb = skb2;
1486 } 1777 }
1487 1778
1488 vif = ipmr_find_vif(skb->dev); 1779 vif = ipmr_find_vif(mrt, skb->dev);
1489 if (vif >= 0) { 1780 if (vif >= 0) {
1490 int err = ipmr_cache_unresolved(net, vif, skb); 1781 int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1491 read_unlock(&mrt_lock); 1782 read_unlock(&mrt_lock);
1492 1783
1493 return err; 1784 return err2;
1494 } 1785 }
1495 read_unlock(&mrt_lock); 1786 read_unlock(&mrt_lock);
1496 kfree_skb(skb); 1787 kfree_skb(skb);
1497 return -ENODEV; 1788 return -ENODEV;
1498 } 1789 }
1499 1790
1500 ip_mr_forward(skb, cache, local); 1791 ip_mr_forward(net, mrt, skb, cache, local);
1501 1792
1502 read_unlock(&mrt_lock); 1793 read_unlock(&mrt_lock);
1503 1794
@@ -1514,11 +1805,11 @@ dont_forward:
1514} 1805}
1515 1806
1516#ifdef CONFIG_IP_PIMSM 1807#ifdef CONFIG_IP_PIMSM
1517static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen) 1808static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809 unsigned int pimlen)
1518{ 1810{
1519 struct net_device *reg_dev = NULL; 1811 struct net_device *reg_dev = NULL;
1520 struct iphdr *encap; 1812 struct iphdr *encap;
1521 struct net *net = dev_net(skb->dev);
1522 1813
1523 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1814 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1524 /* 1815 /*
@@ -1533,8 +1824,8 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1533 return 1; 1824 return 1;
1534 1825
1535 read_lock(&mrt_lock); 1826 read_lock(&mrt_lock);
1536 if (net->ipv4.mroute_reg_vif_num >= 0) 1827 if (mrt->mroute_reg_vif_num >= 0)
1537 reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev; 1828 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1538 if (reg_dev) 1829 if (reg_dev)
1539 dev_hold(reg_dev); 1830 dev_hold(reg_dev);
1540 read_unlock(&mrt_lock); 1831 read_unlock(&mrt_lock);
@@ -1545,14 +1836,12 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1545 skb->mac_header = skb->network_header; 1836 skb->mac_header = skb->network_header;
1546 skb_pull(skb, (u8*)encap - skb->data); 1837 skb_pull(skb, (u8*)encap - skb->data);
1547 skb_reset_network_header(skb); 1838 skb_reset_network_header(skb);
1548 skb->dev = reg_dev;
1549 skb->protocol = htons(ETH_P_IP); 1839 skb->protocol = htons(ETH_P_IP);
1550 skb->ip_summed = 0; 1840 skb->ip_summed = 0;
1551 skb->pkt_type = PACKET_HOST; 1841 skb->pkt_type = PACKET_HOST;
1552 skb_dst_drop(skb); 1842
1553 reg_dev->stats.rx_bytes += skb->len; 1843 skb_tunnel_rx(skb, reg_dev);
1554 reg_dev->stats.rx_packets++; 1844
1555 nf_reset(skb);
1556 netif_rx(skb); 1845 netif_rx(skb);
1557 dev_put(reg_dev); 1846 dev_put(reg_dev);
1558 1847
@@ -1569,17 +1858,21 @@ int pim_rcv_v1(struct sk_buff * skb)
1569{ 1858{
1570 struct igmphdr *pim; 1859 struct igmphdr *pim;
1571 struct net *net = dev_net(skb->dev); 1860 struct net *net = dev_net(skb->dev);
1861 struct mr_table *mrt;
1572 1862
1573 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) 1863 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1574 goto drop; 1864 goto drop;
1575 1865
1576 pim = igmp_hdr(skb); 1866 pim = igmp_hdr(skb);
1577 1867
1578 if (!net->ipv4.mroute_do_pim || 1868 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1869 goto drop;
1870
1871 if (!mrt->mroute_do_pim ||
1579 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1872 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1580 goto drop; 1873 goto drop;
1581 1874
1582 if (__pim_rcv(skb, sizeof(*pim))) { 1875 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1583drop: 1876drop:
1584 kfree_skb(skb); 1877 kfree_skb(skb);
1585 } 1878 }
@@ -1591,6 +1884,8 @@ drop:
1591static int pim_rcv(struct sk_buff * skb) 1884static int pim_rcv(struct sk_buff * skb)
1592{ 1885{
1593 struct pimreghdr *pim; 1886 struct pimreghdr *pim;
1887 struct net *net = dev_net(skb->dev);
1888 struct mr_table *mrt;
1594 1889
1595 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr))) 1890 if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1596 goto drop; 1891 goto drop;
@@ -1602,7 +1897,10 @@ static int pim_rcv(struct sk_buff * skb)
1602 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1897 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1603 goto drop; 1898 goto drop;
1604 1899
1605 if (__pim_rcv(skb, sizeof(*pim))) { 1900 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1901 goto drop;
1902
1903 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1606drop: 1904drop:
1607 kfree_skb(skb); 1905 kfree_skb(skb);
1608 } 1906 }
@@ -1610,29 +1908,31 @@ drop:
1610} 1908}
1611#endif 1909#endif
1612 1910
1613static int 1911static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1614ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) 1912 struct mfc_cache *c, struct rtmsg *rtm)
1615{ 1913{
1616 int ct; 1914 int ct;
1617 struct rtnexthop *nhp; 1915 struct rtnexthop *nhp;
1618 struct net *net = mfc_net(c);
1619 struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
1620 u8 *b = skb_tail_pointer(skb); 1916 u8 *b = skb_tail_pointer(skb);
1621 struct rtattr *mp_head; 1917 struct rtattr *mp_head;
1622 1918
1623 if (dev) 1919 /* If cache is unresolved, don't try to parse IIF and OIF */
1624 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); 1920 if (c->mfc_parent >= MAXVIFS)
1921 return -ENOENT;
1922
1923 if (VIF_EXISTS(mrt, c->mfc_parent))
1924 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1625 1925
1626 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); 1926 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1627 1927
1628 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 1928 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1629 if (c->mfc_un.res.ttls[ct] < 255) { 1929 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1630 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) 1930 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1631 goto rtattr_failure; 1931 goto rtattr_failure;
1632 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); 1932 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1633 nhp->rtnh_flags = 0; 1933 nhp->rtnh_flags = 0;
1634 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 1934 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1635 nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex; 1935 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1636 nhp->rtnh_len = sizeof(*nhp); 1936 nhp->rtnh_len = sizeof(*nhp);
1637 } 1937 }
1638 } 1938 }
@@ -1650,11 +1950,16 @@ int ipmr_get_route(struct net *net,
1650 struct sk_buff *skb, struct rtmsg *rtm, int nowait) 1950 struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1651{ 1951{
1652 int err; 1952 int err;
1953 struct mr_table *mrt;
1653 struct mfc_cache *cache; 1954 struct mfc_cache *cache;
1654 struct rtable *rt = skb_rtable(skb); 1955 struct rtable *rt = skb_rtable(skb);
1655 1956
1957 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1958 if (mrt == NULL)
1959 return -ENOENT;
1960
1656 read_lock(&mrt_lock); 1961 read_lock(&mrt_lock);
1657 cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst); 1962 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1658 1963
1659 if (cache == NULL) { 1964 if (cache == NULL) {
1660 struct sk_buff *skb2; 1965 struct sk_buff *skb2;
@@ -1668,7 +1973,7 @@ int ipmr_get_route(struct net *net,
1668 } 1973 }
1669 1974
1670 dev = skb->dev; 1975 dev = skb->dev;
1671 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) { 1976 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1672 read_unlock(&mrt_lock); 1977 read_unlock(&mrt_lock);
1673 return -ENODEV; 1978 return -ENODEV;
1674 } 1979 }
@@ -1685,24 +1990,107 @@ int ipmr_get_route(struct net *net,
1685 iph->saddr = rt->rt_src; 1990 iph->saddr = rt->rt_src;
1686 iph->daddr = rt->rt_dst; 1991 iph->daddr = rt->rt_dst;
1687 iph->version = 0; 1992 iph->version = 0;
1688 err = ipmr_cache_unresolved(net, vif, skb2); 1993 err = ipmr_cache_unresolved(mrt, vif, skb2);
1689 read_unlock(&mrt_lock); 1994 read_unlock(&mrt_lock);
1690 return err; 1995 return err;
1691 } 1996 }
1692 1997
1693 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 1998 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1694 cache->mfc_flags |= MFC_NOTIFY; 1999 cache->mfc_flags |= MFC_NOTIFY;
1695 err = ipmr_fill_mroute(skb, cache, rtm); 2000 err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
1696 read_unlock(&mrt_lock); 2001 read_unlock(&mrt_lock);
1697 return err; 2002 return err;
1698} 2003}
1699 2004
2005static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2006 u32 pid, u32 seq, struct mfc_cache *c)
2007{
2008 struct nlmsghdr *nlh;
2009 struct rtmsg *rtm;
2010
2011 nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2012 if (nlh == NULL)
2013 return -EMSGSIZE;
2014
2015 rtm = nlmsg_data(nlh);
2016 rtm->rtm_family = RTNL_FAMILY_IPMR;
2017 rtm->rtm_dst_len = 32;
2018 rtm->rtm_src_len = 32;
2019 rtm->rtm_tos = 0;
2020 rtm->rtm_table = mrt->id;
2021 NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2022 rtm->rtm_type = RTN_MULTICAST;
2023 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2024 rtm->rtm_protocol = RTPROT_UNSPEC;
2025 rtm->rtm_flags = 0;
2026
2027 NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2028 NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2029
2030 if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2031 goto nla_put_failure;
2032
2033 return nlmsg_end(skb, nlh);
2034
2035nla_put_failure:
2036 nlmsg_cancel(skb, nlh);
2037 return -EMSGSIZE;
2038}
2039
2040static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2041{
2042 struct net *net = sock_net(skb->sk);
2043 struct mr_table *mrt;
2044 struct mfc_cache *mfc;
2045 unsigned int t = 0, s_t;
2046 unsigned int h = 0, s_h;
2047 unsigned int e = 0, s_e;
2048
2049 s_t = cb->args[0];
2050 s_h = cb->args[1];
2051 s_e = cb->args[2];
2052
2053 read_lock(&mrt_lock);
2054 ipmr_for_each_table(mrt, net) {
2055 if (t < s_t)
2056 goto next_table;
2057 if (t > s_t)
2058 s_h = 0;
2059 for (h = s_h; h < MFC_LINES; h++) {
2060 list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2061 if (e < s_e)
2062 goto next_entry;
2063 if (ipmr_fill_mroute(mrt, skb,
2064 NETLINK_CB(cb->skb).pid,
2065 cb->nlh->nlmsg_seq,
2066 mfc) < 0)
2067 goto done;
2068next_entry:
2069 e++;
2070 }
2071 e = s_e = 0;
2072 }
2073 s_h = 0;
2074next_table:
2075 t++;
2076 }
2077done:
2078 read_unlock(&mrt_lock);
2079
2080 cb->args[2] = e;
2081 cb->args[1] = h;
2082 cb->args[0] = t;
2083
2084 return skb->len;
2085}
2086
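[Annotation: ipmr_rtm_dumproute() above serves RTM_GETROUTE dumps (registered via the rtnl_register() call later in this patch), resuming across recv() boundaries through cb->args[]. A hedged userspace sketch of the matching request; RTNL_FAMILY_IPMR is the constant this series introduces, and each reply record is an RTM_NEWROUTE message carrying RTA_TABLE, RTA_SRC and RTA_DST as filled in above:]

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
	} req;
	char buf[8192];
	ssize_t len;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.rtm));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.rtm.rtm_family = RTNL_FAMILY_IPMR;	/* added by this series */

	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0) {
		perror("send");
		return 1;
	}
	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
			if (nlh->nlmsg_type == NLMSG_DONE)
				goto out;
			if (nlh->nlmsg_type == RTM_NEWROUTE)
				printf("mroute entry, %u bytes\n",
				       nlh->nlmsg_len);
		}
	}
out:
	close(fd);
	return 0;
}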
1700#ifdef CONFIG_PROC_FS 2087#ifdef CONFIG_PROC_FS
1701/* 2088/*
1702 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 2089 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
1703 */ 2090 */
1704struct ipmr_vif_iter { 2091struct ipmr_vif_iter {
1705 struct seq_net_private p; 2092 struct seq_net_private p;
2093 struct mr_table *mrt;
1706 int ct; 2094 int ct;
1707}; 2095};
1708 2096
@@ -1710,11 +2098,13 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
1710 struct ipmr_vif_iter *iter, 2098 struct ipmr_vif_iter *iter,
1711 loff_t pos) 2099 loff_t pos)
1712{ 2100{
1713 for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) { 2101 struct mr_table *mrt = iter->mrt;
1714 if (!VIF_EXISTS(net, iter->ct)) 2102
2103 for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2104 if (!VIF_EXISTS(mrt, iter->ct))
1715 continue; 2105 continue;
1716 if (pos-- == 0) 2106 if (pos-- == 0)
1717 return &net->ipv4.vif_table[iter->ct]; 2107 return &mrt->vif_table[iter->ct];
1718 } 2108 }
1719 return NULL; 2109 return NULL;
1720} 2110}
@@ -1722,7 +2112,15 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
1722static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) 2112static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1723 __acquires(mrt_lock) 2113 __acquires(mrt_lock)
1724{ 2114{
2115 struct ipmr_vif_iter *iter = seq->private;
1725 struct net *net = seq_file_net(seq); 2116 struct net *net = seq_file_net(seq);
2117 struct mr_table *mrt;
2118
2119 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2120 if (mrt == NULL)
2121 return ERR_PTR(-ENOENT);
2122
2123 iter->mrt = mrt;
1726 2124
1727 read_lock(&mrt_lock); 2125 read_lock(&mrt_lock);
1728 return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1) 2126 return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
@@ -1733,15 +2131,16 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1733{ 2131{
1734 struct ipmr_vif_iter *iter = seq->private; 2132 struct ipmr_vif_iter *iter = seq->private;
1735 struct net *net = seq_file_net(seq); 2133 struct net *net = seq_file_net(seq);
2134 struct mr_table *mrt = iter->mrt;
1736 2135
1737 ++*pos; 2136 ++*pos;
1738 if (v == SEQ_START_TOKEN) 2137 if (v == SEQ_START_TOKEN)
1739 return ipmr_vif_seq_idx(net, iter, 0); 2138 return ipmr_vif_seq_idx(net, iter, 0);
1740 2139
1741 while (++iter->ct < net->ipv4.maxvif) { 2140 while (++iter->ct < mrt->maxvif) {
1742 if (!VIF_EXISTS(net, iter->ct)) 2141 if (!VIF_EXISTS(mrt, iter->ct))
1743 continue; 2142 continue;
1744 return &net->ipv4.vif_table[iter->ct]; 2143 return &mrt->vif_table[iter->ct];
1745 } 2144 }
1746 return NULL; 2145 return NULL;
1747} 2146}
@@ -1754,7 +2153,8 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1754 2153
1755static int ipmr_vif_seq_show(struct seq_file *seq, void *v) 2154static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1756{ 2155{
1757 struct net *net = seq_file_net(seq); 2156 struct ipmr_vif_iter *iter = seq->private;
2157 struct mr_table *mrt = iter->mrt;
1758 2158
1759 if (v == SEQ_START_TOKEN) { 2159 if (v == SEQ_START_TOKEN) {
1760 seq_puts(seq, 2160 seq_puts(seq,
@@ -1765,7 +2165,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1765 2165
1766 seq_printf(seq, 2166 seq_printf(seq,
1767 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", 2167 "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
1768 vif - net->ipv4.vif_table, 2168 vif - mrt->vif_table,
1769 name, vif->bytes_in, vif->pkt_in, 2169 name, vif->bytes_in, vif->pkt_in,
1770 vif->bytes_out, vif->pkt_out, 2170 vif->bytes_out, vif->pkt_out,
1771 vif->flags, vif->local, vif->remote); 2171 vif->flags, vif->local, vif->remote);
@@ -1796,7 +2196,8 @@ static const struct file_operations ipmr_vif_fops = {
1796 2196
1797struct ipmr_mfc_iter { 2197struct ipmr_mfc_iter {
1798 struct seq_net_private p; 2198 struct seq_net_private p;
1799 struct mfc_cache **cache; 2199 struct mr_table *mrt;
2200 struct list_head *cache;
1800 int ct; 2201 int ct;
1801}; 2202};
1802 2203
@@ -1804,22 +2205,22 @@ struct ipmr_mfc_iter {
1804static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, 2205static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
1805 struct ipmr_mfc_iter *it, loff_t pos) 2206 struct ipmr_mfc_iter *it, loff_t pos)
1806{ 2207{
2208 struct mr_table *mrt = it->mrt;
1807 struct mfc_cache *mfc; 2209 struct mfc_cache *mfc;
1808 2210
1809 it->cache = net->ipv4.mfc_cache_array;
1810 read_lock(&mrt_lock); 2211 read_lock(&mrt_lock);
1811 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 2212 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
1812 for (mfc = net->ipv4.mfc_cache_array[it->ct]; 2213 it->cache = &mrt->mfc_cache_array[it->ct];
1813 mfc; mfc = mfc->next) 2214 list_for_each_entry(mfc, it->cache, list)
1814 if (pos-- == 0) 2215 if (pos-- == 0)
1815 return mfc; 2216 return mfc;
2217 }
1816 read_unlock(&mrt_lock); 2218 read_unlock(&mrt_lock);
1817 2219
1818 it->cache = &mfc_unres_queue;
1819 spin_lock_bh(&mfc_unres_lock); 2220 spin_lock_bh(&mfc_unres_lock);
1820 for (mfc = mfc_unres_queue; mfc; mfc = mfc->next) 2221 it->cache = &mrt->mfc_unres_queue;
1821 if (net_eq(mfc_net(mfc), net) && 2222 list_for_each_entry(mfc, it->cache, list)
1822 pos-- == 0) 2223 if (pos-- == 0)
1823 return mfc; 2224 return mfc;
1824 spin_unlock_bh(&mfc_unres_lock); 2225 spin_unlock_bh(&mfc_unres_lock);
1825 2226
@@ -1832,7 +2233,13 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1832{ 2233{
1833 struct ipmr_mfc_iter *it = seq->private; 2234 struct ipmr_mfc_iter *it = seq->private;
1834 struct net *net = seq_file_net(seq); 2235 struct net *net = seq_file_net(seq);
2236 struct mr_table *mrt;
1835 2237
2238 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2239 if (mrt == NULL)
2240 return ERR_PTR(-ENOENT);
2241
2242 it->mrt = mrt;
1836 it->cache = NULL; 2243 it->cache = NULL;
1837 it->ct = 0; 2244 it->ct = 0;
1838 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) 2245 return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
@@ -1844,37 +2251,36 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1844 struct mfc_cache *mfc = v; 2251 struct mfc_cache *mfc = v;
1845 struct ipmr_mfc_iter *it = seq->private; 2252 struct ipmr_mfc_iter *it = seq->private;
1846 struct net *net = seq_file_net(seq); 2253 struct net *net = seq_file_net(seq);
2254 struct mr_table *mrt = it->mrt;
1847 2255
1848 ++*pos; 2256 ++*pos;
1849 2257
1850 if (v == SEQ_START_TOKEN) 2258 if (v == SEQ_START_TOKEN)
1851 return ipmr_mfc_seq_idx(net, seq->private, 0); 2259 return ipmr_mfc_seq_idx(net, seq->private, 0);
1852 2260
1853 if (mfc->next) 2261 if (mfc->list.next != it->cache)
1854 return mfc->next; 2262 return list_entry(mfc->list.next, struct mfc_cache, list);
1855 2263
1856 if (it->cache == &mfc_unres_queue) 2264 if (it->cache == &mrt->mfc_unres_queue)
1857 goto end_of_list; 2265 goto end_of_list;
1858 2266
1859 BUG_ON(it->cache != net->ipv4.mfc_cache_array); 2267 BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
1860 2268
1861 while (++it->ct < MFC_LINES) { 2269 while (++it->ct < MFC_LINES) {
1862 mfc = net->ipv4.mfc_cache_array[it->ct]; 2270 it->cache = &mrt->mfc_cache_array[it->ct];
1863 if (mfc) 2271 if (list_empty(it->cache))
1864 return mfc; 2272 continue;
2273 return list_first_entry(it->cache, struct mfc_cache, list);
1865 } 2274 }
1866 2275
1867 /* exhausted cache_array, show unresolved */ 2276 /* exhausted cache_array, show unresolved */
1868 read_unlock(&mrt_lock); 2277 read_unlock(&mrt_lock);
1869 it->cache = &mfc_unres_queue; 2278 it->cache = &mrt->mfc_unres_queue;
1870 it->ct = 0; 2279 it->ct = 0;
1871 2280
1872 spin_lock_bh(&mfc_unres_lock); 2281 spin_lock_bh(&mfc_unres_lock);
1873 mfc = mfc_unres_queue; 2282 if (!list_empty(it->cache))
1874 while (mfc && !net_eq(mfc_net(mfc), net)) 2283 return list_first_entry(it->cache, struct mfc_cache, list);
1875 mfc = mfc->next;
1876 if (mfc)
1877 return mfc;
1878 2284
1879 end_of_list: 2285 end_of_list:
1880 spin_unlock_bh(&mfc_unres_lock); 2286 spin_unlock_bh(&mfc_unres_lock);
@@ -1886,18 +2292,17 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1886static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) 2292static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1887{ 2293{
1888 struct ipmr_mfc_iter *it = seq->private; 2294 struct ipmr_mfc_iter *it = seq->private;
1889 struct net *net = seq_file_net(seq); 2295 struct mr_table *mrt = it->mrt;
1890 2296
1891 if (it->cache == &mfc_unres_queue) 2297 if (it->cache == &mrt->mfc_unres_queue)
1892 spin_unlock_bh(&mfc_unres_lock); 2298 spin_unlock_bh(&mfc_unres_lock);
1893 else if (it->cache == net->ipv4.mfc_cache_array) 2299 else if (it->cache == &mrt->mfc_cache_array[it->ct])
1894 read_unlock(&mrt_lock); 2300 read_unlock(&mrt_lock);
1895} 2301}
1896 2302
1897static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 2303static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1898{ 2304{
1899 int n; 2305 int n;
1900 struct net *net = seq_file_net(seq);
1901 2306
1902 if (v == SEQ_START_TOKEN) { 2307 if (v == SEQ_START_TOKEN) {
1903 seq_puts(seq, 2308 seq_puts(seq,
@@ -1905,20 +2310,21 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1905 } else { 2310 } else {
1906 const struct mfc_cache *mfc = v; 2311 const struct mfc_cache *mfc = v;
1907 const struct ipmr_mfc_iter *it = seq->private; 2312 const struct ipmr_mfc_iter *it = seq->private;
2313 const struct mr_table *mrt = it->mrt;
1908 2314
1909 seq_printf(seq, "%08lX %08lX %-3hd", 2315 seq_printf(seq, "%08X %08X %-3hd",
1910 (unsigned long) mfc->mfc_mcastgrp, 2316 (__force u32) mfc->mfc_mcastgrp,
1911 (unsigned long) mfc->mfc_origin, 2317 (__force u32) mfc->mfc_origin,
1912 mfc->mfc_parent); 2318 mfc->mfc_parent);
1913 2319
1914 if (it->cache != &mfc_unres_queue) { 2320 if (it->cache != &mrt->mfc_unres_queue) {
1915 seq_printf(seq, " %8lu %8lu %8lu", 2321 seq_printf(seq, " %8lu %8lu %8lu",
1916 mfc->mfc_un.res.pkt, 2322 mfc->mfc_un.res.pkt,
1917 mfc->mfc_un.res.bytes, 2323 mfc->mfc_un.res.bytes,
1918 mfc->mfc_un.res.wrong_if); 2324 mfc->mfc_un.res.wrong_if);
1919 for (n = mfc->mfc_un.res.minvif; 2325 for (n = mfc->mfc_un.res.minvif;
1920 n < mfc->mfc_un.res.maxvif; n++ ) { 2326 n < mfc->mfc_un.res.maxvif; n++ ) {
1921 if (VIF_EXISTS(net, n) && 2327 if (VIF_EXISTS(mrt, n) &&
1922 mfc->mfc_un.res.ttls[n] < 255) 2328 mfc->mfc_un.res.ttls[n] < 255)
1923 seq_printf(seq, 2329 seq_printf(seq,
1924 " %2d:%-3d", 2330 " %2d:%-3d",
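[Annotation: these seq_file iterators back /proc/net/ip_mr_vif and /proc/net/ip_mr_cache, which after this patch walk the default table only. Reading them needs nothing special; a trivial sketch:]

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/ip_mr_cache", "r");

	if (!f) {
		perror("/proc/net/ip_mr_cache");
		return 1;
	}
	/* First line is the header from ipmr_mfc_seq_show(); the rest
	 * are one entry per line, hex group and origin first. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}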
@@ -1970,27 +2376,11 @@ static const struct net_protocol pim_protocol = {
1970 */ 2376 */
1971static int __net_init ipmr_net_init(struct net *net) 2377static int __net_init ipmr_net_init(struct net *net)
1972{ 2378{
1973 int err = 0; 2379 int err;
1974 2380
1975 net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device), 2381 err = ipmr_rules_init(net);
1976 GFP_KERNEL); 2382 if (err < 0)
1977 if (!net->ipv4.vif_table) {
1978 err = -ENOMEM;
1979 goto fail; 2383 goto fail;
1980 }
1981
1982 /* Forwarding cache */
1983 net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
1984 sizeof(struct mfc_cache *),
1985 GFP_KERNEL);
1986 if (!net->ipv4.mfc_cache_array) {
1987 err = -ENOMEM;
1988 goto fail_mfc_cache;
1989 }
1990
1991#ifdef CONFIG_IP_PIMSM
1992 net->ipv4.mroute_reg_vif_num = -1;
1993#endif
1994 2384
1995#ifdef CONFIG_PROC_FS 2385#ifdef CONFIG_PROC_FS
1996 err = -ENOMEM; 2386 err = -ENOMEM;
@@ -2005,10 +2395,8 @@ static int __net_init ipmr_net_init(struct net *net)
2005proc_cache_fail: 2395proc_cache_fail:
2006 proc_net_remove(net, "ip_mr_vif"); 2396 proc_net_remove(net, "ip_mr_vif");
2007proc_vif_fail: 2397proc_vif_fail:
2008 kfree(net->ipv4.mfc_cache_array); 2398 ipmr_rules_exit(net);
2009#endif 2399#endif
2010fail_mfc_cache:
2011 kfree(net->ipv4.vif_table);
2012fail: 2400fail:
2013 return err; 2401 return err;
2014} 2402}
@@ -2019,8 +2407,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
2019 proc_net_remove(net, "ip_mr_cache"); 2407 proc_net_remove(net, "ip_mr_cache");
2020 proc_net_remove(net, "ip_mr_vif"); 2408 proc_net_remove(net, "ip_mr_vif");
2021#endif 2409#endif
2022 kfree(net->ipv4.mfc_cache_array); 2410 ipmr_rules_exit(net);
2023 kfree(net->ipv4.vif_table);
2024} 2411}
2025 2412
2026static struct pernet_operations ipmr_net_ops = { 2413static struct pernet_operations ipmr_net_ops = {
@@ -2043,7 +2430,6 @@ int __init ip_mr_init(void)
2043 if (err) 2430 if (err)
2044 goto reg_pernet_fail; 2431 goto reg_pernet_fail;
2045 2432
2046 setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
2047 err = register_netdevice_notifier(&ip_mr_notifier); 2433 err = register_netdevice_notifier(&ip_mr_notifier);
2048 if (err) 2434 if (err)
2049 goto reg_notif_fail; 2435 goto reg_notif_fail;
@@ -2054,6 +2440,7 @@ int __init ip_mr_init(void)
2054 goto add_proto_fail; 2440 goto add_proto_fail;
2055 } 2441 }
2056#endif 2442#endif
2443 rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2057 return 0; 2444 return 0;
2058 2445
2059#ifdef CONFIG_IP_PIMSM_V2 2446#ifdef CONFIG_IP_PIMSM_V2
@@ -2061,7 +2448,6 @@ add_proto_fail:
2061 unregister_netdevice_notifier(&ip_mr_notifier); 2448 unregister_netdevice_notifier(&ip_mr_notifier);
2062#endif 2449#endif
2063reg_notif_fail: 2450reg_notif_fail:
2064 del_timer(&ipmr_expire_timer);
2065 unregister_pernet_subsys(&ipmr_net_ops); 2451 unregister_pernet_subsys(&ipmr_net_ops);
2066reg_pernet_fail: 2452reg_pernet_fail:
2067 kmem_cache_destroy(mrt_cachep); 2453 kmem_cache_destroy(mrt_cachep);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c14623fc4d5e..d88a46c54fd1 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -4,6 +4,7 @@
4#include <linux/netfilter_ipv4.h> 4#include <linux/netfilter_ipv4.h>
5#include <linux/ip.h> 5#include <linux/ip.h>
6#include <linux/skbuff.h> 6#include <linux/skbuff.h>
7#include <linux/gfp.h>
7#include <net/route.h> 8#include <net/route.h>
8#include <net/xfrm.h> 9#include <net/xfrm.h>
9#include <net/ip.h> 10#include <net/ip.h>
@@ -16,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
17 struct rtable *rt; 18 struct rtable *rt;
18 struct flowi fl = {}; 19 struct flowi fl = {};
19 struct dst_entry *odst; 20 unsigned long orefdst;
20 unsigned int hh_len; 21 unsigned int hh_len;
21 unsigned int type; 22 unsigned int type;
22 23
@@ -42,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
42 43
43 /* Drop old route. */ 44 /* Drop old route. */
44 skb_dst_drop(skb); 45 skb_dst_drop(skb);
45 skb_dst_set(skb, &rt->u.dst); 46 skb_dst_set(skb, &rt->dst);
46 } else { 47 } else {
47 /* non-local src, find valid iif to satisfy 48 /* non-local src, find valid iif to satisfy
48 * rp-filter when calling ip_route_input. */ 49 * rp-filter when calling ip_route_input. */
@@ -50,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
50 if (ip_route_output_key(net, &rt, &fl) != 0) 51 if (ip_route_output_key(net, &rt, &fl) != 0)
51 return -1; 52 return -1;
52 53
53 odst = skb_dst(skb); 54 orefdst = skb->_skb_refdst;
54 if (ip_route_input(skb, iph->daddr, iph->saddr, 55 if (ip_route_input(skb, iph->daddr, iph->saddr,
55 RT_TOS(iph->tos), rt->u.dst.dev) != 0) { 56 RT_TOS(iph->tos), rt->dst.dev) != 0) {
56 dst_release(&rt->u.dst); 57 dst_release(&rt->dst);
57 return -1; 58 return -1;
58 } 59 }
59 dst_release(&rt->u.dst); 60 dst_release(&rt->dst);
60 dst_release(odst); 61 refdst_drop(orefdst);
61 } 62 }
62 63
63 if (skb_dst(skb)->error) 64 if (skb_dst(skb)->error)
@@ -211,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
211 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, 212 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
212 skb->len - dataoff, 0); 213 skb->len - dataoff, 0);
213 skb->ip_summed = CHECKSUM_NONE; 214 skb->ip_summed = CHECKSUM_NONE;
214 csum = __skb_checksum_complete_head(skb, dataoff + len); 215 return __skb_checksum_complete_head(skb, dataoff + len);
215 if (!csum)
216 skb->ip_summed = CHECKSUM_UNNECESSARY;
217 } 216 }
218 return csum; 217 return csum;
219} 218}
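[Annotation: nf_ip_checksum_partial() above leans on the kernel's ones'-complement helpers; the change merely stops claiming CHECKSUM_UNNECESSARY for a partially verified skb. For reference, a generic RFC 1071-style sum, a simplified stand-in for csum_partial()/csum_fold() rather than the optimized in-tree code:]

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Ones'-complement sum over a byte buffer, folded to 16 bits. */
static uint16_t csum(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(data[i] << 8 | data[i + 1]);
	if (len & 1)			/* odd trailing byte */
		sum += (uint32_t)data[len - 1] << 8;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t pkt[] = { 0x45, 0x00, 0x00, 0x1c };

	printf("csum %04x\n", csum(pkt, sizeof(pkt)));
	return 0;
}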
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 06632762ba5f..e8f4f9a57f12 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -27,6 +27,7 @@
27 27
28#include <linux/netfilter/x_tables.h> 28#include <linux/netfilter/x_tables.h>
29#include <linux/netfilter_arp/arp_tables.h> 29#include <linux/netfilter_arp/arp_tables.h>
30#include "../../netfilter/xt_repldata.h"
30 31
31MODULE_LICENSE("GPL"); 32MODULE_LICENSE("GPL");
32MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); 33MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -48,16 +49,17 @@ MODULE_DESCRIPTION("arptables core");
48#endif 49#endif
49 50
50#ifdef CONFIG_NETFILTER_DEBUG 51#ifdef CONFIG_NETFILTER_DEBUG
51#define ARP_NF_ASSERT(x) \ 52#define ARP_NF_ASSERT(x) WARN_ON(!(x))
52do { \
53 if (!(x)) \
54 printk("ARP_NF_ASSERT: %s:%s:%u\n", \
55 __func__, __FILE__, __LINE__); \
56} while(0)
57#else 53#else
58#define ARP_NF_ASSERT(x) 54#define ARP_NF_ASSERT(x)
59#endif 55#endif
60 56
57void *arpt_alloc_initial_table(const struct xt_table *info)
58{
59 return xt_alloc_initial_table(arpt, ARPT);
60}
61EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
62
61static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, 63static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
62 const char *hdr_addr, int len) 64 const char *hdr_addr, int len)
63{ 65{
@@ -217,16 +219,23 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
217} 219}
218 220
219static unsigned int 221static unsigned int
220arpt_error(struct sk_buff *skb, const struct xt_target_param *par) 222arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
221{ 223{
222 if (net_ratelimit()) 224 if (net_ratelimit())
223 printk("arp_tables: error: '%s'\n", 225 pr_err("arp_tables: error: '%s'\n",
224 (const char *)par->targinfo); 226 (const char *)par->targinfo);
225 227
226 return NF_DROP; 228 return NF_DROP;
227} 229}
228 230
229static inline struct arpt_entry *get_entry(void *base, unsigned int offset) 231static inline const struct arpt_entry_target *
232arpt_get_target_c(const struct arpt_entry *e)
233{
234 return arpt_get_target((struct arpt_entry *)e);
235}
236
237static inline struct arpt_entry *
238get_entry(const void *base, unsigned int offset)
230{ 239{
231 return (struct arpt_entry *)(base + offset); 240 return (struct arpt_entry *)(base + offset);
232} 241}
@@ -246,12 +255,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
246 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); 255 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
247 unsigned int verdict = NF_DROP; 256 unsigned int verdict = NF_DROP;
248 const struct arphdr *arp; 257 const struct arphdr *arp;
249 bool hotdrop = false;
250 struct arpt_entry *e, *back; 258 struct arpt_entry *e, *back;
251 const char *indev, *outdev; 259 const char *indev, *outdev;
252 void *table_base; 260 void *table_base;
253 const struct xt_table_info *private; 261 const struct xt_table_info *private;
254 struct xt_target_param tgpar; 262 struct xt_action_param acpar;
255 263
256 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) 264 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
257 return NF_DROP; 265 return NF_DROP;
@@ -266,26 +274,24 @@ unsigned int arpt_do_table(struct sk_buff *skb,
266 e = get_entry(table_base, private->hook_entry[hook]); 274 e = get_entry(table_base, private->hook_entry[hook]);
267 back = get_entry(table_base, private->underflow[hook]); 275 back = get_entry(table_base, private->underflow[hook]);
268 276
269 tgpar.in = in; 277 acpar.in = in;
270 tgpar.out = out; 278 acpar.out = out;
271 tgpar.hooknum = hook; 279 acpar.hooknum = hook;
272 tgpar.family = NFPROTO_ARP; 280 acpar.family = NFPROTO_ARP;
281 acpar.hotdrop = false;
273 282
274 arp = arp_hdr(skb); 283 arp = arp_hdr(skb);
275 do { 284 do {
276 struct arpt_entry_target *t; 285 const struct arpt_entry_target *t;
277 int hdr_len;
278 286
279 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { 287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
280 e = arpt_next_entry(e); 288 e = arpt_next_entry(e);
281 continue; 289 continue;
282 } 290 }
283 291
284 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + 292 ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
285 (2 * skb->dev->addr_len);
286 ADD_COUNTER(e->counters, hdr_len, 1);
287 293
288 t = arpt_get_target(e); 294 t = arpt_get_target_c(e);
289 295
290 /* Standard target? */ 296 /* Standard target? */
291 if (!t->u.kernel.target->target) { 297 if (!t->u.kernel.target->target) {
@@ -319,9 +325,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
319 /* Targets which reenter must return 325 /* Targets which reenter must return
320 * abs. verdicts 326 * abs. verdicts
321 */ 327 */
322 tgpar.target = t->u.kernel.target; 328 acpar.target = t->u.kernel.target;
323 tgpar.targinfo = t->data; 329 acpar.targinfo = t->data;
324 verdict = t->u.kernel.target->target(skb, &tgpar); 330 verdict = t->u.kernel.target->target(skb, &acpar);
325 331
326 /* Target might have changed stuff. */ 332 /* Target might have changed stuff. */
327 arp = arp_hdr(skb); 333 arp = arp_hdr(skb);
@@ -331,10 +337,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
331 else 337 else
332 /* Verdict */ 338 /* Verdict */
333 break; 339 break;
334 } while (!hotdrop); 340 } while (!acpar.hotdrop);
335 xt_info_rdunlock_bh(); 341 xt_info_rdunlock_bh();
336 342
337 if (hotdrop) 343 if (acpar.hotdrop)
338 return NF_DROP; 344 return NF_DROP;
339 else 345 else
340 return verdict; 346 return verdict;
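The tgpar-to-acpar conversion above is arp_tables' side of merging the separate match and target parameter blocks into one struct xt_action_param; hotdrop moves off arpt_do_table()'s stack into that shared block, so matches and targets signal an immediate NF_DROP through the same cookie the iterator already hands them. A sketch of the merged layout (field order approximate; the real definition is in include/linux/netfilter/x_tables.h):

    struct xt_action_param {
            union {
                    const struct xt_match *match;
                    const struct xt_target *target;
            };
            union {
                    const void *matchinfo, *targinfo;
            };
            const struct net_device *in, *out;
            int fragoff;            /* fragment offset, where relevant */
            unsigned int thoff;     /* transport header offset */
            unsigned int hooknum;
            u_int8_t family;
            bool hotdrop;           /* set to force an immediate NF_DROP */
    };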
@@ -351,7 +357,7 @@ static inline bool unconditional(const struct arpt_arp *arp)
351/* Figures out from what hook each rule can be called: returns 0 if 357/* Figures out from what hook each rule can be called: returns 0 if
352 * there are loops. Puts hook bitmask in comefrom. 358 * there are loops. Puts hook bitmask in comefrom.
353 */ 359 */
354static int mark_source_chains(struct xt_table_info *newinfo, 360static int mark_source_chains(const struct xt_table_info *newinfo,
355 unsigned int valid_hooks, void *entry0) 361 unsigned int valid_hooks, void *entry0)
356{ 362{
357 unsigned int hook; 363 unsigned int hook;
@@ -372,11 +378,11 @@ static int mark_source_chains(struct xt_table_info *newinfo,
372 378
373 for (;;) { 379 for (;;) {
374 const struct arpt_standard_target *t 380 const struct arpt_standard_target *t
375 = (void *)arpt_get_target(e); 381 = (void *)arpt_get_target_c(e);
376 int visited = e->comefrom & (1 << hook); 382 int visited = e->comefrom & (1 << hook);
377 383
378 if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { 384 if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
379 printk("arptables: loop hook %u pos %u %08X.\n", 385 pr_notice("arptables: loop hook %u pos %u %08X.\n",
380 hook, pos, e->comefrom); 386 hook, pos, e->comefrom);
381 return 0; 387 return 0;
382 } 388 }
@@ -456,7 +462,7 @@ static int mark_source_chains(struct xt_table_info *newinfo,
456 return 1; 462 return 1;
457} 463}
458 464
459static inline int check_entry(struct arpt_entry *e, const char *name) 465static inline int check_entry(const struct arpt_entry *e, const char *name)
460{ 466{
461 const struct arpt_entry_target *t; 467 const struct arpt_entry_target *t;
462 468
@@ -468,7 +474,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name)
468 if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) 474 if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
469 return -EINVAL; 475 return -EINVAL;
470 476
471 t = arpt_get_target(e); 477 t = arpt_get_target_c(e);
472 if (e->target_offset + t->u.target_size > e->next_offset) 478 if (e->target_offset + t->u.target_size > e->next_offset)
473 return -EINVAL; 479 return -EINVAL;
474 480
@@ -498,8 +504,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
498} 504}
499 505
500static inline int 506static inline int
501find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, 507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
502 unsigned int *i)
503{ 508{
504 struct arpt_entry_target *t; 509 struct arpt_entry_target *t;
505 struct xt_target *target; 510 struct xt_target *target;
@@ -510,13 +515,11 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
510 return ret; 515 return ret;
511 516
512 t = arpt_get_target(e); 517 t = arpt_get_target(e);
513 target = try_then_request_module(xt_find_target(NFPROTO_ARP, 518 target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
514 t->u.user.name, 519 t->u.user.revision);
515 t->u.user.revision), 520 if (IS_ERR(target)) {
516 "arpt_%s", t->u.user.name);
517 if (IS_ERR(target) || !target) {
518 duprintf("find_check_entry: `%s' not found\n", t->u.user.name); 521 duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
519 ret = target ? PTR_ERR(target) : -ENOENT; 522 ret = PTR_ERR(target);
520 goto out; 523 goto out;
521 } 524 }
522 t->u.kernel.target = target; 525 t->u.kernel.target = target;
@@ -524,8 +527,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
524 ret = check_target(e, name); 527 ret = check_target(e, name);
525 if (ret) 528 if (ret)
526 goto err; 529 goto err;
527
528 (*i)++;
529 return 0; 530 return 0;
530err: 531err:
531 module_put(t->u.kernel.target->me); 532 module_put(t->u.kernel.target->me);
@@ -533,14 +534,14 @@ out:
533 return ret; 534 return ret;
534} 535}
535 536
536static bool check_underflow(struct arpt_entry *e) 537static bool check_underflow(const struct arpt_entry *e)
537{ 538{
538 const struct arpt_entry_target *t; 539 const struct arpt_entry_target *t;
539 unsigned int verdict; 540 unsigned int verdict;
540 541
541 if (!unconditional(&e->arp)) 542 if (!unconditional(&e->arp))
542 return false; 543 return false;
543 t = arpt_get_target(e); 544 t = arpt_get_target_c(e);
544 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
545 return false; 546 return false;
546 verdict = ((struct arpt_standard_target *)t)->verdict; 547 verdict = ((struct arpt_standard_target *)t)->verdict;
@@ -550,12 +551,11 @@ static bool check_underflow(struct arpt_entry *e)
550 551
551static inline int check_entry_size_and_hooks(struct arpt_entry *e, 552static inline int check_entry_size_and_hooks(struct arpt_entry *e,
552 struct xt_table_info *newinfo, 553 struct xt_table_info *newinfo,
553 unsigned char *base, 554 const unsigned char *base,
554 unsigned char *limit, 555 const unsigned char *limit,
555 const unsigned int *hook_entries, 556 const unsigned int *hook_entries,
556 const unsigned int *underflows, 557 const unsigned int *underflows,
557 unsigned int valid_hooks, 558 unsigned int valid_hooks)
558 unsigned int *i)
559{ 559{
560 unsigned int h; 560 unsigned int h;
561 561
@@ -592,19 +592,14 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
592 /* Clear counters and comefrom */ 592 /* Clear counters and comefrom */
593 e->counters = ((struct xt_counters) { 0, 0 }); 593 e->counters = ((struct xt_counters) { 0, 0 });
594 e->comefrom = 0; 594 e->comefrom = 0;
595
596 (*i)++;
597 return 0; 595 return 0;
598} 596}
599 597
600static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) 598static inline void cleanup_entry(struct arpt_entry *e)
601{ 599{
602 struct xt_tgdtor_param par; 600 struct xt_tgdtor_param par;
603 struct arpt_entry_target *t; 601 struct arpt_entry_target *t;
604 602
605 if (i && (*i)-- == 0)
606 return 1;
607
608 t = arpt_get_target(e); 603 t = arpt_get_target(e);
609 par.target = t->u.kernel.target; 604 par.target = t->u.kernel.target;
610 par.targinfo = t->data; 605 par.targinfo = t->data;
@@ -612,26 +607,20 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
612 if (par.target->destroy != NULL) 607 if (par.target->destroy != NULL)
613 par.target->destroy(&par); 608 par.target->destroy(&par);
614 module_put(par.target->me); 609 module_put(par.target->me);
615 return 0;
616} 610}
617 611
618/* Checks and translates the user-supplied table segment (held in 612/* Checks and translates the user-supplied table segment (held in
619 * newinfo). 613 * newinfo).
620 */ 614 */
621static int translate_table(const char *name, 615static int translate_table(struct xt_table_info *newinfo, void *entry0,
622 unsigned int valid_hooks, 616 const struct arpt_replace *repl)
623 struct xt_table_info *newinfo,
624 void *entry0,
625 unsigned int size,
626 unsigned int number,
627 const unsigned int *hook_entries,
628 const unsigned int *underflows)
629{ 617{
618 struct arpt_entry *iter;
630 unsigned int i; 619 unsigned int i;
631 int ret; 620 int ret = 0;
632 621
633 newinfo->size = size; 622 newinfo->size = repl->size;
634 newinfo->number = number; 623 newinfo->number = repl->num_entries;
635 624
636 /* Init all hooks to impossible value. */ 625 /* Init all hooks to impossible value. */
637 for (i = 0; i < NF_ARP_NUMHOOKS; i++) { 626 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
@@ -643,52 +632,66 @@ static int translate_table(const char *name,
643 i = 0; 632 i = 0;
644 633
645 /* Walk through entries, checking offsets. */ 634 /* Walk through entries, checking offsets. */
646 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, 635 xt_entry_foreach(iter, entry0, newinfo->size) {
647 check_entry_size_and_hooks, 636 ret = check_entry_size_and_hooks(iter, newinfo, entry0,
648 newinfo, 637 entry0 + repl->size,
649 entry0, 638 repl->hook_entry,
650 entry0 + size, 639 repl->underflow,
651 hook_entries, underflows, valid_hooks, &i); 640 repl->valid_hooks);
641 if (ret != 0)
642 break;
643 ++i;
644 if (strcmp(arpt_get_target(iter)->u.user.name,
645 XT_ERROR_TARGET) == 0)
646 ++newinfo->stacksize;
647 }
652 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); 648 duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
653 if (ret != 0) 649 if (ret != 0)
654 return ret; 650 return ret;
655 651
656 if (i != number) { 652 if (i != repl->num_entries) {
657 duprintf("translate_table: %u not %u entries\n", 653 duprintf("translate_table: %u not %u entries\n",
658 i, number); 654 i, repl->num_entries);
659 return -EINVAL; 655 return -EINVAL;
660 } 656 }
661 657
662 /* Check hooks all assigned */ 658 /* Check hooks all assigned */
663 for (i = 0; i < NF_ARP_NUMHOOKS; i++) { 659 for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
664 /* Only hooks which are valid */ 660 /* Only hooks which are valid */
665 if (!(valid_hooks & (1 << i))) 661 if (!(repl->valid_hooks & (1 << i)))
666 continue; 662 continue;
667 if (newinfo->hook_entry[i] == 0xFFFFFFFF) { 663 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
668 duprintf("Invalid hook entry %u %u\n", 664 duprintf("Invalid hook entry %u %u\n",
669 i, hook_entries[i]); 665 i, repl->hook_entry[i]);
670 return -EINVAL; 666 return -EINVAL;
671 } 667 }
672 if (newinfo->underflow[i] == 0xFFFFFFFF) { 668 if (newinfo->underflow[i] == 0xFFFFFFFF) {
673 duprintf("Invalid underflow %u %u\n", 669 duprintf("Invalid underflow %u %u\n",
674 i, underflows[i]); 670 i, repl->underflow[i]);
675 return -EINVAL; 671 return -EINVAL;
676 } 672 }
677 } 673 }
678 674
679 if (!mark_source_chains(newinfo, valid_hooks, entry0)) { 675 if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
680 duprintf("Looping hook\n"); 676 duprintf("Looping hook\n");
681 return -ELOOP; 677 return -ELOOP;
682 } 678 }
683 679
684 /* Finally, each sanity check must pass */ 680 /* Finally, each sanity check must pass */
685 i = 0; 681 i = 0;
686 ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, 682 xt_entry_foreach(iter, entry0, newinfo->size) {
687 find_check_entry, name, size, &i); 683 ret = find_check_entry(iter, repl->name, repl->size);
684 if (ret != 0)
685 break;
686 ++i;
687 }
688 688
689 if (ret != 0) { 689 if (ret != 0) {
690 ARPT_ENTRY_ITERATE(entry0, newinfo->size, 690 xt_entry_foreach(iter, entry0, newinfo->size) {
691 cleanup_entry, &i); 691 if (i-- == 0)
692 break;
693 cleanup_entry(iter);
694 }
692 return ret; 695 return ret;
693 } 696 }
694 697
@@ -701,33 +704,13 @@ static int translate_table(const char *name,
701 return ret; 704 return ret;
702} 705}
703 706
704/* Gets counters. */
705static inline int add_entry_to_counter(const struct arpt_entry *e,
706 struct xt_counters total[],
707 unsigned int *i)
708{
709 ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
710
711 (*i)++;
712 return 0;
713}
714
715static inline int set_entry_to_counter(const struct arpt_entry *e,
716 struct xt_counters total[],
717 unsigned int *i)
718{
719 SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
720
721 (*i)++;
722 return 0;
723}
724
725static void get_counters(const struct xt_table_info *t, 707static void get_counters(const struct xt_table_info *t,
726 struct xt_counters counters[]) 708 struct xt_counters counters[])
727{ 709{
710 struct arpt_entry *iter;
728 unsigned int cpu; 711 unsigned int cpu;
729 unsigned int i; 712 unsigned int i;
730 unsigned int curcpu; 713 unsigned int curcpu = get_cpu();
731 714
732 /* Instead of clearing (by a previous call to memset()) 715 /* Instead of clearing (by a previous call to memset())
733 * the counters and using adds, we set the counters 716 * the counters and using adds, we set the counters
@@ -737,42 +720,46 @@ static void get_counters(const struct xt_table_info *t,
737 * if new softirq were to run and call ipt_do_table 720 * if new softirq were to run and call ipt_do_table
738 */ 721 */
739 local_bh_disable(); 722 local_bh_disable();
740 curcpu = smp_processor_id();
741
742 i = 0; 723 i = 0;
743 ARPT_ENTRY_ITERATE(t->entries[curcpu], 724 xt_entry_foreach(iter, t->entries[curcpu], t->size) {
744 t->size, 725 SET_COUNTER(counters[i], iter->counters.bcnt,
745 set_entry_to_counter, 726 iter->counters.pcnt);
746 counters, 727 ++i;
747 &i); 728 }
729 local_bh_enable();
730 /* While processing counters from the other cpus, we can leave
731 * bottom halves enabled (preemption is disabled).
732 */
748 733
749 for_each_possible_cpu(cpu) { 734 for_each_possible_cpu(cpu) {
750 if (cpu == curcpu) 735 if (cpu == curcpu)
751 continue; 736 continue;
752 i = 0; 737 i = 0;
738 local_bh_disable();
753 xt_info_wrlock(cpu); 739 xt_info_wrlock(cpu);
754 ARPT_ENTRY_ITERATE(t->entries[cpu], 740 xt_entry_foreach(iter, t->entries[cpu], t->size) {
755 t->size, 741 ADD_COUNTER(counters[i], iter->counters.bcnt,
756 add_entry_to_counter, 742 iter->counters.pcnt);
757 counters, 743 ++i;
758 &i); 744 }
759 xt_info_wrunlock(cpu); 745 xt_info_wrunlock(cpu);
746 local_bh_enable();
760 } 747 }
761 local_bh_enable(); 748 put_cpu();
762} 749}
763 750
764static struct xt_counters *alloc_counters(struct xt_table *table) 751static struct xt_counters *alloc_counters(const struct xt_table *table)
765{ 752{
766 unsigned int countersize; 753 unsigned int countersize;
767 struct xt_counters *counters; 754 struct xt_counters *counters;
768 struct xt_table_info *private = table->private; 755 const struct xt_table_info *private = table->private;
769 756
770 /* We need atomic snapshot of counters: rest doesn't change 757 /* We need atomic snapshot of counters: rest doesn't change
771 * (other than comefrom, which userspace doesn't care 758 * (other than comefrom, which userspace doesn't care
772 * about). 759 * about).
773 */ 760 */
774 countersize = sizeof(struct xt_counters) * private->number; 761 countersize = sizeof(struct xt_counters) * private->number;
775 counters = vmalloc_node(countersize, numa_node_id()); 762 counters = vmalloc(countersize);
776 763
777 if (counters == NULL) 764 if (counters == NULL)
778 return ERR_PTR(-ENOMEM); 765 return ERR_PTR(-ENOMEM);
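get_counters() now brackets the whole snapshot with get_cpu()/put_cpu() so the local CPU cannot change mid-walk, and disables bottom halves only around each per-CPU pass rather than across the entire loop. The counter helpers it open-codes the old callbacks with are plain byte/packet pair updates; from the table headers of this era, roughly:

    /* xt_counters bookkeeping used by the snapshot above. */
    #define SET_COUNTER(c, b, p) do { (c).bcnt = (b); (c).pcnt = (p); } while (0)
    #define ADD_COUNTER(c, b, p) do { (c).bcnt += (b); (c).pcnt += (p); } while (0)

The local CPU seeds the array with SET_COUNTER(); every other CPU is then accumulated with ADD_COUNTER() under that CPU's xt_info write lock.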
@@ -783,11 +770,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
783} 770}
784 771
785static int copy_entries_to_user(unsigned int total_size, 772static int copy_entries_to_user(unsigned int total_size,
786 struct xt_table *table, 773 const struct xt_table *table,
787 void __user *userptr) 774 void __user *userptr)
788{ 775{
789 unsigned int off, num; 776 unsigned int off, num;
790 struct arpt_entry *e; 777 const struct arpt_entry *e;
791 struct xt_counters *counters; 778 struct xt_counters *counters;
792 struct xt_table_info *private = table->private; 779 struct xt_table_info *private = table->private;
793 int ret = 0; 780 int ret = 0;
@@ -807,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
807 /* FIXME: use iterator macros --RR */ 794 /* FIXME: use iterator macros --RR */
808 /* ... then go back and fix counters and names */ 795 /* ... then go back and fix counters and names */
809 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 796 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
810 struct arpt_entry_target *t; 797 const struct arpt_entry_target *t;
811 798
812 e = (struct arpt_entry *)(loc_cpu_entry + off); 799 e = (struct arpt_entry *)(loc_cpu_entry + off);
813 if (copy_to_user(userptr + off 800 if (copy_to_user(userptr + off
@@ -818,7 +805,7 @@ static int copy_entries_to_user(unsigned int total_size,
818 goto free_counters; 805 goto free_counters;
819 } 806 }
820 807
821 t = arpt_get_target(e); 808 t = arpt_get_target_c(e);
822 if (copy_to_user(userptr + off + e->target_offset 809 if (copy_to_user(userptr + off + e->target_offset
823 + offsetof(struct arpt_entry_target, 810 + offsetof(struct arpt_entry_target,
824 u.user.name), 811 u.user.name),
@@ -835,7 +822,7 @@ static int copy_entries_to_user(unsigned int total_size,
835} 822}
836 823
837#ifdef CONFIG_COMPAT 824#ifdef CONFIG_COMPAT
838static void compat_standard_from_user(void *dst, void *src) 825static void compat_standard_from_user(void *dst, const void *src)
839{ 826{
840 int v = *(compat_int_t *)src; 827 int v = *(compat_int_t *)src;
841 828
@@ -844,7 +831,7 @@ static void compat_standard_from_user(void *dst, void *src)
844 memcpy(dst, &v, sizeof(v)); 831 memcpy(dst, &v, sizeof(v));
845} 832}
846 833
847static int compat_standard_to_user(void __user *dst, void *src) 834static int compat_standard_to_user(void __user *dst, const void *src)
848{ 835{
849 compat_int_t cv = *(int *)src; 836 compat_int_t cv = *(int *)src;
850 837
@@ -853,18 +840,18 @@ static int compat_standard_to_user(void __user *dst, void *src)
853 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; 840 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
854} 841}
855 842
856static int compat_calc_entry(struct arpt_entry *e, 843static int compat_calc_entry(const struct arpt_entry *e,
857 const struct xt_table_info *info, 844 const struct xt_table_info *info,
858 void *base, struct xt_table_info *newinfo) 845 const void *base, struct xt_table_info *newinfo)
859{ 846{
860 struct arpt_entry_target *t; 847 const struct arpt_entry_target *t;
861 unsigned int entry_offset; 848 unsigned int entry_offset;
862 int off, i, ret; 849 int off, i, ret;
863 850
864 off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); 851 off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
865 entry_offset = (void *)e - base; 852 entry_offset = (void *)e - base;
866 853
867 t = arpt_get_target(e); 854 t = arpt_get_target_c(e);
868 off += xt_compat_target_offset(t->u.kernel.target); 855 off += xt_compat_target_offset(t->u.kernel.target);
869 newinfo->size -= off; 856 newinfo->size -= off;
870 ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); 857 ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
@@ -885,7 +872,9 @@ static int compat_calc_entry(struct arpt_entry *e,
885static int compat_table_info(const struct xt_table_info *info, 872static int compat_table_info(const struct xt_table_info *info,
886 struct xt_table_info *newinfo) 873 struct xt_table_info *newinfo)
887{ 874{
875 struct arpt_entry *iter;
888 void *loc_cpu_entry; 876 void *loc_cpu_entry;
877 int ret;
889 878
890 if (!newinfo || !info) 879 if (!newinfo || !info)
891 return -EINVAL; 880 return -EINVAL;
@@ -894,13 +883,17 @@ static int compat_table_info(const struct xt_table_info *info,
894 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 883 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
895 newinfo->initial_entries = 0; 884 newinfo->initial_entries = 0;
896 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 885 loc_cpu_entry = info->entries[raw_smp_processor_id()];
897 return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size, 886 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
898 compat_calc_entry, info, loc_cpu_entry, 887 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
899 newinfo); 888 if (ret != 0)
889 return ret;
890 }
891 return 0;
900} 892}
901#endif 893#endif
902 894
903static int get_info(struct net *net, void __user *user, int *len, int compat) 895static int get_info(struct net *net, void __user *user,
896 const int *len, int compat)
904{ 897{
905 char name[ARPT_TABLE_MAXNAMELEN]; 898 char name[ARPT_TABLE_MAXNAMELEN];
906 struct xt_table *t; 899 struct xt_table *t;
@@ -925,10 +918,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
925 if (t && !IS_ERR(t)) { 918 if (t && !IS_ERR(t)) {
926 struct arpt_getinfo info; 919 struct arpt_getinfo info;
927 const struct xt_table_info *private = t->private; 920 const struct xt_table_info *private = t->private;
928
929#ifdef CONFIG_COMPAT 921#ifdef CONFIG_COMPAT
922 struct xt_table_info tmp;
923
930 if (compat) { 924 if (compat) {
931 struct xt_table_info tmp;
932 ret = compat_table_info(private, &tmp); 925 ret = compat_table_info(private, &tmp);
933 xt_compat_flush_offsets(NFPROTO_ARP); 926 xt_compat_flush_offsets(NFPROTO_ARP);
934 private = &tmp; 927 private = &tmp;
@@ -959,7 +952,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
959} 952}
960 953
961static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, 954static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
962 int *len) 955 const int *len)
963{ 956{
964 int ret; 957 int ret;
965 struct arpt_get_entries get; 958 struct arpt_get_entries get;
@@ -1010,10 +1003,10 @@ static int __do_replace(struct net *net, const char *name,
1010 struct xt_table_info *oldinfo; 1003 struct xt_table_info *oldinfo;
1011 struct xt_counters *counters; 1004 struct xt_counters *counters;
1012 void *loc_cpu_old_entry; 1005 void *loc_cpu_old_entry;
1006 struct arpt_entry *iter;
1013 1007
1014 ret = 0; 1008 ret = 0;
1015 counters = vmalloc_node(num_counters * sizeof(struct xt_counters), 1009 counters = vmalloc(num_counters * sizeof(struct xt_counters));
1016 numa_node_id());
1017 if (!counters) { 1010 if (!counters) {
1018 ret = -ENOMEM; 1011 ret = -ENOMEM;
1019 goto out; 1012 goto out;
@@ -1053,8 +1046,8 @@ static int __do_replace(struct net *net, const char *name,
1053 1046
1054 /* Decrease module usage counts and free resource */ 1047 /* Decrease module usage counts and free resource */
1055 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1048 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1056 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1049 xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
1057 NULL); 1050 cleanup_entry(iter);
1058 1051
1059 xt_free_table_info(oldinfo); 1052 xt_free_table_info(oldinfo);
1060 if (copy_to_user(counters_ptr, counters, 1053 if (copy_to_user(counters_ptr, counters,
@@ -1073,12 +1066,14 @@ static int __do_replace(struct net *net, const char *name,
1073 return ret; 1066 return ret;
1074} 1067}
1075 1068
1076static int do_replace(struct net *net, void __user *user, unsigned int len) 1069static int do_replace(struct net *net, const void __user *user,
1070 unsigned int len)
1077{ 1071{
1078 int ret; 1072 int ret;
1079 struct arpt_replace tmp; 1073 struct arpt_replace tmp;
1080 struct xt_table_info *newinfo; 1074 struct xt_table_info *newinfo;
1081 void *loc_cpu_entry; 1075 void *loc_cpu_entry;
1076 struct arpt_entry *iter;
1082 1077
1083 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1078 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1084 return -EFAULT; 1079 return -EFAULT;
@@ -1099,9 +1094,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1099 goto free_newinfo; 1094 goto free_newinfo;
1100 } 1095 }
1101 1096
1102 ret = translate_table(tmp.name, tmp.valid_hooks, 1097 ret = translate_table(newinfo, loc_cpu_entry, &tmp);
1103 newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
1104 tmp.hook_entry, tmp.underflow);
1105 if (ret != 0) 1098 if (ret != 0)
1106 goto free_newinfo; 1099 goto free_newinfo;
1107 1100
@@ -1114,27 +1107,15 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1114 return 0; 1107 return 0;
1115 1108
1116 free_newinfo_untrans: 1109 free_newinfo_untrans:
1117 ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1110 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1111 cleanup_entry(iter);
1118 free_newinfo: 1112 free_newinfo:
1119 xt_free_table_info(newinfo); 1113 xt_free_table_info(newinfo);
1120 return ret; 1114 return ret;
1121} 1115}
1122 1116
1123/* We're lazy, and add to the first CPU; overflow works its fey magic 1117static int do_add_counters(struct net *net, const void __user *user,
1124 * and everything is OK. */ 1118 unsigned int len, int compat)
1125static int
1126add_counter_to_entry(struct arpt_entry *e,
1127 const struct xt_counters addme[],
1128 unsigned int *i)
1129{
1130 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1131
1132 (*i)++;
1133 return 0;
1134}
1135
1136static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1137 int compat)
1138{ 1119{
1139 unsigned int i, curcpu; 1120 unsigned int i, curcpu;
1140 struct xt_counters_info tmp; 1121 struct xt_counters_info tmp;
@@ -1147,6 +1128,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1147 const struct xt_table_info *private; 1128 const struct xt_table_info *private;
1148 int ret = 0; 1129 int ret = 0;
1149 void *loc_cpu_entry; 1130 void *loc_cpu_entry;
1131 struct arpt_entry *iter;
1150#ifdef CONFIG_COMPAT 1132#ifdef CONFIG_COMPAT
1151 struct compat_xt_counters_info compat_tmp; 1133 struct compat_xt_counters_info compat_tmp;
1152 1134
@@ -1177,7 +1159,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1177 if (len != size + num_counters * sizeof(struct xt_counters)) 1159 if (len != size + num_counters * sizeof(struct xt_counters))
1178 return -EINVAL; 1160 return -EINVAL;
1179 1161
1180 paddc = vmalloc_node(len - size, numa_node_id()); 1162 paddc = vmalloc(len - size);
1181 if (!paddc) 1163 if (!paddc)
1182 return -ENOMEM; 1164 return -ENOMEM;
1183 1165
@@ -1204,11 +1186,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1204 curcpu = smp_processor_id(); 1186 curcpu = smp_processor_id();
1205 loc_cpu_entry = private->entries[curcpu]; 1187 loc_cpu_entry = private->entries[curcpu];
1206 xt_info_wrlock(curcpu); 1188 xt_info_wrlock(curcpu);
1207 ARPT_ENTRY_ITERATE(loc_cpu_entry, 1189 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1208 private->size, 1190 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1209 add_counter_to_entry, 1191 ++i;
1210 paddc, 1192 }
1211 &i);
1212 xt_info_wrunlock(curcpu); 1193 xt_info_wrunlock(curcpu);
1213 unlock_up_free: 1194 unlock_up_free:
1214 local_bh_enable(); 1195 local_bh_enable();
@@ -1221,28 +1202,22 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1221} 1202}
1222 1203
1223#ifdef CONFIG_COMPAT 1204#ifdef CONFIG_COMPAT
1224static inline int 1205static inline void compat_release_entry(struct compat_arpt_entry *e)
1225compat_release_entry(struct compat_arpt_entry *e, unsigned int *i)
1226{ 1206{
1227 struct arpt_entry_target *t; 1207 struct arpt_entry_target *t;
1228 1208
1229 if (i && (*i)-- == 0)
1230 return 1;
1231
1232 t = compat_arpt_get_target(e); 1209 t = compat_arpt_get_target(e);
1233 module_put(t->u.kernel.target->me); 1210 module_put(t->u.kernel.target->me);
1234 return 0;
1235} 1211}
1236 1212
1237static inline int 1213static inline int
1238check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, 1214check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1239 struct xt_table_info *newinfo, 1215 struct xt_table_info *newinfo,
1240 unsigned int *size, 1216 unsigned int *size,
1241 unsigned char *base, 1217 const unsigned char *base,
1242 unsigned char *limit, 1218 const unsigned char *limit,
1243 unsigned int *hook_entries, 1219 const unsigned int *hook_entries,
1244 unsigned int *underflows, 1220 const unsigned int *underflows,
1245 unsigned int *i,
1246 const char *name) 1221 const char *name)
1247{ 1222{
1248 struct arpt_entry_target *t; 1223 struct arpt_entry_target *t;
@@ -1273,14 +1248,12 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1273 entry_offset = (void *)e - (void *)base; 1248 entry_offset = (void *)e - (void *)base;
1274 1249
1275 t = compat_arpt_get_target(e); 1250 t = compat_arpt_get_target(e);
1276 target = try_then_request_module(xt_find_target(NFPROTO_ARP, 1251 target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
1277 t->u.user.name, 1252 t->u.user.revision);
1278 t->u.user.revision), 1253 if (IS_ERR(target)) {
1279 "arpt_%s", t->u.user.name);
1280 if (IS_ERR(target) || !target) {
1281 duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", 1254 duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
1282 t->u.user.name); 1255 t->u.user.name);
1283 ret = target ? PTR_ERR(target) : -ENOENT; 1256 ret = PTR_ERR(target);
1284 goto out; 1257 goto out;
1285 } 1258 }
1286 t->u.kernel.target = target; 1259 t->u.kernel.target = target;
@@ -1302,8 +1275,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1302 /* Clear counters and comefrom */ 1275 /* Clear counters and comefrom */
1303 memset(&e->counters, 0, sizeof(e->counters)); 1276 memset(&e->counters, 0, sizeof(e->counters));
1304 e->comefrom = 0; 1277 e->comefrom = 0;
1305
1306 (*i)++;
1307 return 0; 1278 return 0;
1308 1279
1309release_target: 1280release_target:
@@ -1347,19 +1318,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
1347 return ret; 1318 return ret;
1348} 1319}
1349 1320
1350static inline int compat_check_entry(struct arpt_entry *e, const char *name,
1351 unsigned int *i)
1352{
1353 int ret;
1354
1355 ret = check_target(e, name);
1356 if (ret)
1357 return ret;
1358
1359 (*i)++;
1360 return 0;
1361}
1362
1363static int translate_compat_table(const char *name, 1321static int translate_compat_table(const char *name,
1364 unsigned int valid_hooks, 1322 unsigned int valid_hooks,
1365 struct xt_table_info **pinfo, 1323 struct xt_table_info **pinfo,
@@ -1372,8 +1330,10 @@ static int translate_compat_table(const char *name,
1372 unsigned int i, j; 1330 unsigned int i, j;
1373 struct xt_table_info *newinfo, *info; 1331 struct xt_table_info *newinfo, *info;
1374 void *pos, *entry0, *entry1; 1332 void *pos, *entry0, *entry1;
1333 struct compat_arpt_entry *iter0;
1334 struct arpt_entry *iter1;
1375 unsigned int size; 1335 unsigned int size;
1376 int ret; 1336 int ret = 0;
1377 1337
1378 info = *pinfo; 1338 info = *pinfo;
1379 entry0 = *pentry0; 1339 entry0 = *pentry0;
@@ -1390,13 +1350,17 @@ static int translate_compat_table(const char *name,
1390 j = 0; 1350 j = 0;
1391 xt_compat_lock(NFPROTO_ARP); 1351 xt_compat_lock(NFPROTO_ARP);
1392 /* Walk through entries, checking offsets. */ 1352 /* Walk through entries, checking offsets. */
1393 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, 1353 xt_entry_foreach(iter0, entry0, total_size) {
1394 check_compat_entry_size_and_hooks, 1354 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
1395 info, &size, entry0, 1355 entry0,
1396 entry0 + total_size, 1356 entry0 + total_size,
1397 hook_entries, underflows, &j, name); 1357 hook_entries,
1398 if (ret != 0) 1358 underflows,
1399 goto out_unlock; 1359 name);
1360 if (ret != 0)
1361 goto out_unlock;
1362 ++j;
1363 }
1400 1364
1401 ret = -EINVAL; 1365 ret = -EINVAL;
1402 if (j != number) { 1366 if (j != number) {
@@ -1435,9 +1399,12 @@ static int translate_compat_table(const char *name,
1435 entry1 = newinfo->entries[raw_smp_processor_id()]; 1399 entry1 = newinfo->entries[raw_smp_processor_id()];
1436 pos = entry1; 1400 pos = entry1;
1437 size = total_size; 1401 size = total_size;
1438 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, 1402 xt_entry_foreach(iter0, entry0, total_size) {
1439 compat_copy_entry_from_user, 1403 ret = compat_copy_entry_from_user(iter0, &pos, &size,
1440 &pos, &size, name, newinfo, entry1); 1404 name, newinfo, entry1);
1405 if (ret != 0)
1406 break;
1407 }
1441 xt_compat_flush_offsets(NFPROTO_ARP); 1408 xt_compat_flush_offsets(NFPROTO_ARP);
1442 xt_compat_unlock(NFPROTO_ARP); 1409 xt_compat_unlock(NFPROTO_ARP);
1443 if (ret) 1410 if (ret)
@@ -1448,13 +1415,35 @@ static int translate_compat_table(const char *name,
1448 goto free_newinfo; 1415 goto free_newinfo;
1449 1416
1450 i = 0; 1417 i = 0;
1451 ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, 1418 xt_entry_foreach(iter1, entry1, newinfo->size) {
1452 name, &i); 1419 ret = check_target(iter1, name);
1420 if (ret != 0)
1421 break;
1422 ++i;
1423 if (strcmp(arpt_get_target(iter1)->u.user.name,
1424 XT_ERROR_TARGET) == 0)
1425 ++newinfo->stacksize;
1426 }
1453 if (ret) { 1427 if (ret) {
1428 /*
1429 * The first i matches need cleanup_entry (calls ->destroy)
1430 * because their ->check has already run. The other j-i
1431 * entries need only release.
1432 */
1433 int skip = i;
1454 j -= i; 1434 j -= i;
1455 COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, 1435 xt_entry_foreach(iter0, entry0, newinfo->size) {
1456 compat_release_entry, &j); 1436 if (skip-- > 0)
1457 ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); 1437 continue;
1438 if (j-- == 0)
1439 break;
1440 compat_release_entry(iter0);
1441 }
1442 xt_entry_foreach(iter1, entry1, newinfo->size) {
1443 if (i-- == 0)
1444 break;
1445 cleanup_entry(iter1);
1446 }
1458 xt_free_table_info(newinfo); 1447 xt_free_table_info(newinfo);
1459 return ret; 1448 return ret;
1460 } 1449 }
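The unwind above earns its new comment: after the check loop, i entries of the translated native table passed check_target() and need the full cleanup_entry() teardown (->destroy plus module_put), while the remaining j-i compat entries only hold the module reference taken in the earlier size/hook pass and need just compat_release_entry(). A toy sketch of the same two-phase unwind, with hypothetical undo helpers standing in for the two cleanup routines:

    /* All `tried` items hold a module ref; the first `done` of them were
     * also fully checked and need the full destructor.  (Toy model: the
     * real code walks the native and compat blobs separately.) */
    static void unwind(struct item *v, unsigned int done, unsigned int tried)
    {
            unsigned int k;

            for (k = 0; k < tried; k++) {
                    if (k < done)
                            full_cleanup(&v[k]);   /* like cleanup_entry() */
                    else
                            release_ref(&v[k]);    /* like compat_release_entry() */
            }
    }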
@@ -1472,7 +1461,11 @@ static int translate_compat_table(const char *name,
1472free_newinfo: 1461free_newinfo:
1473 xt_free_table_info(newinfo); 1462 xt_free_table_info(newinfo);
1474out: 1463out:
1475 COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); 1464 xt_entry_foreach(iter0, entry0, total_size) {
1465 if (j-- == 0)
1466 break;
1467 compat_release_entry(iter0);
1468 }
1476 return ret; 1469 return ret;
1477out_unlock: 1470out_unlock:
1478 xt_compat_flush_offsets(NFPROTO_ARP); 1471 xt_compat_flush_offsets(NFPROTO_ARP);
@@ -1499,6 +1492,7 @@ static int compat_do_replace(struct net *net, void __user *user,
1499 struct compat_arpt_replace tmp; 1492 struct compat_arpt_replace tmp;
1500 struct xt_table_info *newinfo; 1493 struct xt_table_info *newinfo;
1501 void *loc_cpu_entry; 1494 void *loc_cpu_entry;
1495 struct arpt_entry *iter;
1502 1496
1503 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1497 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1504 return -EFAULT; 1498 return -EFAULT;
@@ -1536,7 +1530,8 @@ static int compat_do_replace(struct net *net, void __user *user,
1536 return 0; 1530 return 0;
1537 1531
1538 free_newinfo_untrans: 1532 free_newinfo_untrans:
1539 ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1533 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1534 cleanup_entry(iter);
1540 free_newinfo: 1535 free_newinfo:
1541 xt_free_table_info(newinfo); 1536 xt_free_table_info(newinfo);
1542 return ret; 1537 return ret;
@@ -1570,7 +1565,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
1570static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, 1565static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1571 compat_uint_t *size, 1566 compat_uint_t *size,
1572 struct xt_counters *counters, 1567 struct xt_counters *counters,
1573 unsigned int *i) 1568 unsigned int i)
1574{ 1569{
1575 struct arpt_entry_target *t; 1570 struct arpt_entry_target *t;
1576 struct compat_arpt_entry __user *ce; 1571 struct compat_arpt_entry __user *ce;
@@ -1578,14 +1573,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1578 compat_uint_t origsize; 1573 compat_uint_t origsize;
1579 int ret; 1574 int ret;
1580 1575
1581 ret = -EFAULT;
1582 origsize = *size; 1576 origsize = *size;
1583 ce = (struct compat_arpt_entry __user *)*dstptr; 1577 ce = (struct compat_arpt_entry __user *)*dstptr;
1584 if (copy_to_user(ce, e, sizeof(struct arpt_entry))) 1578 if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
1585 goto out; 1579 copy_to_user(&ce->counters, &counters[i],
1586 1580 sizeof(counters[i])) != 0)
1587 if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) 1581 return -EFAULT;
1588 goto out;
1589 1582
1590 *dstptr += sizeof(struct compat_arpt_entry); 1583 *dstptr += sizeof(struct compat_arpt_entry);
1591 *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); 1584 *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
@@ -1595,18 +1588,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1595 t = arpt_get_target(e); 1588 t = arpt_get_target(e);
1596 ret = xt_compat_target_to_user(t, dstptr, size); 1589 ret = xt_compat_target_to_user(t, dstptr, size);
1597 if (ret) 1590 if (ret)
1598 goto out; 1591 return ret;
1599 ret = -EFAULT;
1600 next_offset = e->next_offset - (origsize - *size); 1592 next_offset = e->next_offset - (origsize - *size);
1601 if (put_user(target_offset, &ce->target_offset)) 1593 if (put_user(target_offset, &ce->target_offset) != 0 ||
1602 goto out; 1594 put_user(next_offset, &ce->next_offset) != 0)
1603 if (put_user(next_offset, &ce->next_offset)) 1595 return -EFAULT;
1604 goto out;
1605
1606 (*i)++;
1607 return 0; 1596 return 0;
1608out:
1609 return ret;
1610} 1597}
1611 1598
1612static int compat_copy_entries_to_user(unsigned int total_size, 1599static int compat_copy_entries_to_user(unsigned int total_size,
@@ -1620,6 +1607,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1620 int ret = 0; 1607 int ret = 0;
1621 void *loc_cpu_entry; 1608 void *loc_cpu_entry;
1622 unsigned int i = 0; 1609 unsigned int i = 0;
1610 struct arpt_entry *iter;
1623 1611
1624 counters = alloc_counters(table); 1612 counters = alloc_counters(table);
1625 if (IS_ERR(counters)) 1613 if (IS_ERR(counters))
@@ -1629,9 +1617,12 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1629 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1617 loc_cpu_entry = private->entries[raw_smp_processor_id()];
1630 pos = userptr; 1618 pos = userptr;
1631 size = total_size; 1619 size = total_size;
1632 ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size, 1620 xt_entry_foreach(iter, loc_cpu_entry, total_size) {
1633 compat_copy_entry_to_user, 1621 ret = compat_copy_entry_to_user(iter, &pos,
1634 &pos, &size, counters, &i); 1622 &size, counters, i++);
1623 if (ret != 0)
1624 break;
1625 }
1635 vfree(counters); 1626 vfree(counters);
1636 return ret; 1627 return ret;
1637} 1628}
@@ -1784,8 +1775,7 @@ struct xt_table *arpt_register_table(struct net *net,
1784{ 1775{
1785 int ret; 1776 int ret;
1786 struct xt_table_info *newinfo; 1777 struct xt_table_info *newinfo;
1787 struct xt_table_info bootstrap 1778 struct xt_table_info bootstrap = {0};
1788 = { 0, 0, 0, { 0 }, { 0 }, { } };
1789 void *loc_cpu_entry; 1779 void *loc_cpu_entry;
1790 struct xt_table *new_table; 1780 struct xt_table *new_table;
1791 1781
@@ -1799,12 +1789,7 @@ struct xt_table *arpt_register_table(struct net *net,
1799 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; 1789 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1800 memcpy(loc_cpu_entry, repl->entries, repl->size); 1790 memcpy(loc_cpu_entry, repl->entries, repl->size);
1801 1791
1802 ret = translate_table(table->name, table->valid_hooks, 1792 ret = translate_table(newinfo, loc_cpu_entry, repl);
1803 newinfo, loc_cpu_entry, repl->size,
1804 repl->num_entries,
1805 repl->hook_entry,
1806 repl->underflow);
1807
1808 duprintf("arpt_register_table: translate table gives %d\n", ret); 1793 duprintf("arpt_register_table: translate table gives %d\n", ret);
1809 if (ret != 0) 1794 if (ret != 0)
1810 goto out_free; 1795 goto out_free;
@@ -1827,35 +1812,37 @@ void arpt_unregister_table(struct xt_table *table)
1827 struct xt_table_info *private; 1812 struct xt_table_info *private;
1828 void *loc_cpu_entry; 1813 void *loc_cpu_entry;
1829 struct module *table_owner = table->me; 1814 struct module *table_owner = table->me;
1815 struct arpt_entry *iter;
1830 1816
1831 private = xt_unregister_table(table); 1817 private = xt_unregister_table(table);
1832 1818
1833 /* Decrease module usage counts and free resources */ 1819 /* Decrease module usage counts and free resources */
1834 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1820 loc_cpu_entry = private->entries[raw_smp_processor_id()];
1835 ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, 1821 xt_entry_foreach(iter, loc_cpu_entry, private->size)
1836 cleanup_entry, NULL); 1822 cleanup_entry(iter);
1837 if (private->number > private->initial_entries) 1823 if (private->number > private->initial_entries)
1838 module_put(table_owner); 1824 module_put(table_owner);
1839 xt_free_table_info(private); 1825 xt_free_table_info(private);
1840} 1826}
1841 1827
1842/* The built-in targets: standard (NULL) and error. */ 1828/* The built-in targets: standard (NULL) and error. */
1843static struct xt_target arpt_standard_target __read_mostly = { 1829static struct xt_target arpt_builtin_tg[] __read_mostly = {
1844 .name = ARPT_STANDARD_TARGET, 1830 {
1845 .targetsize = sizeof(int), 1831 .name = ARPT_STANDARD_TARGET,
1846 .family = NFPROTO_ARP, 1832 .targetsize = sizeof(int),
1833 .family = NFPROTO_ARP,
1847#ifdef CONFIG_COMPAT 1834#ifdef CONFIG_COMPAT
1848 .compatsize = sizeof(compat_int_t), 1835 .compatsize = sizeof(compat_int_t),
1849 .compat_from_user = compat_standard_from_user, 1836 .compat_from_user = compat_standard_from_user,
1850 .compat_to_user = compat_standard_to_user, 1837 .compat_to_user = compat_standard_to_user,
1851#endif 1838#endif
1852}; 1839 },
1853 1840 {
1854static struct xt_target arpt_error_target __read_mostly = { 1841 .name = ARPT_ERROR_TARGET,
1855 .name = ARPT_ERROR_TARGET, 1842 .target = arpt_error,
1856 .target = arpt_error, 1843 .targetsize = ARPT_FUNCTION_MAXNAMELEN,
1857 .targetsize = ARPT_FUNCTION_MAXNAMELEN, 1844 .family = NFPROTO_ARP,
1858 .family = NFPROTO_ARP, 1845 },
1859}; 1846};
1860 1847
1861static struct nf_sockopt_ops arpt_sockopts = { 1848static struct nf_sockopt_ops arpt_sockopts = {
@@ -1899,12 +1886,9 @@ static int __init arp_tables_init(void)
1899 goto err1; 1886 goto err1;
1900 1887
1901 /* No one else will be downing sem now, so we won't sleep */ 1888 /* No one else will be downing sem now, so we won't sleep */
1902 ret = xt_register_target(&arpt_standard_target); 1889 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1903 if (ret < 0) 1890 if (ret < 0)
1904 goto err2; 1891 goto err2;
1905 ret = xt_register_target(&arpt_error_target);
1906 if (ret < 0)
1907 goto err3;
1908 1892
1909 /* Register setsockopt */ 1893 /* Register setsockopt */
1910 ret = nf_register_sockopt(&arpt_sockopts); 1894 ret = nf_register_sockopt(&arpt_sockopts);
@@ -1915,9 +1899,7 @@ static int __init arp_tables_init(void)
1915 return 0; 1899 return 0;
1916 1900
1917err4: 1901err4:
1918 xt_unregister_target(&arpt_error_target); 1902 xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1919err3:
1920 xt_unregister_target(&arpt_standard_target);
1921err2: 1903err2:
1922 unregister_pernet_subsys(&arp_tables_net_ops); 1904 unregister_pernet_subsys(&arp_tables_net_ops);
1923err1: 1905err1:
@@ -1927,8 +1909,7 @@ err1:
1927static void __exit arp_tables_fini(void) 1909static void __exit arp_tables_fini(void)
1928{ 1910{
1929 nf_unregister_sockopt(&arpt_sockopts); 1911 nf_unregister_sockopt(&arpt_sockopts);
1930 xt_unregister_target(&arpt_error_target); 1912 xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1931 xt_unregister_target(&arpt_standard_target);
1932 unregister_pernet_subsys(&arp_tables_net_ops); 1913 unregister_pernet_subsys(&arp_tables_net_ops);
1933} 1914}
1934 1915
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index b0d5b1d0a769..e1be7dd1171b 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -9,7 +9,7 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
9MODULE_DESCRIPTION("arptables arp payload mangle target"); 9MODULE_DESCRIPTION("arptables arp payload mangle target");
10 10
11static unsigned int 11static unsigned int
12target(struct sk_buff *skb, const struct xt_target_param *par) 12target(struct sk_buff *skb, const struct xt_action_param *par)
13{ 13{
14 const struct arpt_mangle *mangle = par->targinfo; 14 const struct arpt_mangle *mangle = par->targinfo;
15 const struct arphdr *arp; 15 const struct arphdr *arp;
@@ -54,7 +54,7 @@ target(struct sk_buff *skb, const struct xt_target_param *par)
54 return mangle->target; 54 return mangle->target;
55} 55}
56 56
57static bool checkentry(const struct xt_tgchk_param *par) 57static int checkentry(const struct xt_tgchk_param *par)
58{ 58{
59 const struct arpt_mangle *mangle = par->targinfo; 59 const struct arpt_mangle *mangle = par->targinfo;
60 60
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 97337601827a..79ca5e70d497 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -6,7 +6,9 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/netfilter/x_tables.h>
9#include <linux/netfilter_arp/arp_tables.h> 10#include <linux/netfilter_arp/arp_tables.h>
11#include <linux/slab.h>
10 12
11MODULE_LICENSE("GPL"); 13MODULE_LICENSE("GPL");
12MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); 14MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -15,93 +17,37 @@ MODULE_DESCRIPTION("arptables filter table");
15#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ 17#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
16 (1 << NF_ARP_FORWARD)) 18 (1 << NF_ARP_FORWARD))
17 19
18static const struct
19{
20 struct arpt_replace repl;
21 struct arpt_standard entries[3];
22 struct arpt_error term;
23} initial_table __net_initdata = {
24 .repl = {
25 .name = "filter",
26 .valid_hooks = FILTER_VALID_HOOKS,
27 .num_entries = 4,
28 .size = sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
29 .hook_entry = {
30 [NF_ARP_IN] = 0,
31 [NF_ARP_OUT] = sizeof(struct arpt_standard),
32 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
33 },
34 .underflow = {
35 [NF_ARP_IN] = 0,
36 [NF_ARP_OUT] = sizeof(struct arpt_standard),
37 [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
38 },
39 },
40 .entries = {
41 ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_IN */
42 ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_OUT */
43 ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_FORWARD */
44 },
45 .term = ARPT_ERROR_INIT,
46};
47
48static const struct xt_table packet_filter = { 20static const struct xt_table packet_filter = {
49 .name = "filter", 21 .name = "filter",
50 .valid_hooks = FILTER_VALID_HOOKS, 22 .valid_hooks = FILTER_VALID_HOOKS,
51 .me = THIS_MODULE, 23 .me = THIS_MODULE,
52 .af = NFPROTO_ARP, 24 .af = NFPROTO_ARP,
25 .priority = NF_IP_PRI_FILTER,
53}; 26};
54 27
55/* The work comes in here from netfilter.c */ 28/* The work comes in here from netfilter.c */
56static unsigned int arpt_in_hook(unsigned int hook, 29static unsigned int
57 struct sk_buff *skb, 30arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
58 const struct net_device *in, 31 const struct net_device *in, const struct net_device *out,
59 const struct net_device *out, 32 int (*okfn)(struct sk_buff *))
60 int (*okfn)(struct sk_buff *))
61{ 33{
62 return arpt_do_table(skb, hook, in, out, 34 const struct net *net = dev_net((in != NULL) ? in : out);
63 dev_net(in)->ipv4.arptable_filter);
64}
65 35
66static unsigned int arpt_out_hook(unsigned int hook, 36 return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
67 struct sk_buff *skb,
68 const struct net_device *in,
69 const struct net_device *out,
70 int (*okfn)(struct sk_buff *))
71{
72 return arpt_do_table(skb, hook, in, out,
73 dev_net(out)->ipv4.arptable_filter);
74} 37}
75 38
76static struct nf_hook_ops arpt_ops[] __read_mostly = { 39static struct nf_hook_ops *arpfilter_ops __read_mostly;
77 {
78 .hook = arpt_in_hook,
79 .owner = THIS_MODULE,
80 .pf = NFPROTO_ARP,
81 .hooknum = NF_ARP_IN,
82 .priority = NF_IP_PRI_FILTER,
83 },
84 {
85 .hook = arpt_out_hook,
86 .owner = THIS_MODULE,
87 .pf = NFPROTO_ARP,
88 .hooknum = NF_ARP_OUT,
89 .priority = NF_IP_PRI_FILTER,
90 },
91 {
92 .hook = arpt_in_hook,
93 .owner = THIS_MODULE,
94 .pf = NFPROTO_ARP,
95 .hooknum = NF_ARP_FORWARD,
96 .priority = NF_IP_PRI_FILTER,
97 },
98};
99 40
100static int __net_init arptable_filter_net_init(struct net *net) 41static int __net_init arptable_filter_net_init(struct net *net)
101{ 42{
102 /* Register table */ 43 struct arpt_replace *repl;
44
45 repl = arpt_alloc_initial_table(&packet_filter);
46 if (repl == NULL)
47 return -ENOMEM;
103 net->ipv4.arptable_filter = 48 net->ipv4.arptable_filter =
104 arpt_register_table(net, &packet_filter, &initial_table.repl); 49 arpt_register_table(net, &packet_filter, repl);
50 kfree(repl);
105 if (IS_ERR(net->ipv4.arptable_filter)) 51 if (IS_ERR(net->ipv4.arptable_filter))
106 return PTR_ERR(net->ipv4.arptable_filter); 52 return PTR_ERR(net->ipv4.arptable_filter);
107 return 0; 53 return 0;
@@ -125,9 +71,11 @@ static int __init arptable_filter_init(void)
125 if (ret < 0) 71 if (ret < 0)
126 return ret; 72 return ret;
127 73
128 ret = nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); 74 arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
129 if (ret < 0) 75 if (IS_ERR(arpfilter_ops)) {
76 ret = PTR_ERR(arpfilter_ops);
130 goto cleanup_table; 77 goto cleanup_table;
78 }
131 return ret; 79 return ret;
132 80
133cleanup_table: 81cleanup_table:
@@ -137,7 +85,7 @@ cleanup_table:
137 85
138static void __exit arptable_filter_fini(void) 86static void __exit arptable_filter_fini(void)
139{ 87{
140 nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); 88 xt_hook_unlink(&packet_filter, arpfilter_ops);
141 unregister_pernet_subsys(&arptable_filter_net_ops); 89 unregister_pernet_subsys(&arptable_filter_net_ops);
142} 90}
143 91
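arpt_alloc_initial_table() replaces the static initial_table deleted above with the generic template in net/netfilter/xt_repldata.h: at registration time it allocates a replace header, one standard ACCEPT rule per bit set in the table's valid_hooks, and a terminating error rule, filling hook_entry[] and underflow[] just as the old initializer did by hand. In effect the returned blob has this shape (hypothetical struct name; the layout mirrors the deleted initializer):

    struct arpt_filter_bootstrap {          /* hypothetical */
            struct arpt_replace repl;       /* "filter", FILTER_VALID_HOOKS,
                                             * num_entries = 4, hook_entry[] and
                                             * underflow[] at 0, 1 and 2 times
                                             * sizeof(struct arpt_standard) */
            struct arpt_standard entries[3];/* ARPT_STANDARD_INIT(NF_ACCEPT)
                                             * for ARP_IN, ARP_OUT, ARP_FORWARD */
            struct arpt_error term;         /* ARPT_ERROR_INIT */
    };

The caller frees the blob as soon as arpt_register_table() has copied it, which is why kfree(repl) sits directly after the registration in the hunk above.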
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 2855f1f38cbc..d2c1311cb28d 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -26,6 +26,7 @@
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/net.h> 27#include <linux/net.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29#include <linux/slab.h>
29#include <net/net_namespace.h> 30#include <net/net_namespace.h>
30#include <net/sock.h> 31#include <net/sock.h>
31#include <net/route.h> 32#include <net/route.h>
@@ -41,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
41 42
42static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; 43static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
43static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; 44static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
44static DEFINE_RWLOCK(queue_lock); 45static DEFINE_SPINLOCK(queue_lock);
45static int peer_pid __read_mostly; 46static int peer_pid __read_mostly;
46static unsigned int copy_range __read_mostly; 47static unsigned int copy_range __read_mostly;
47static unsigned int queue_total; 48static unsigned int queue_total;
@@ -71,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
71 break; 72 break;
72 73
73 case IPQ_COPY_PACKET: 74 case IPQ_COPY_PACKET:
74 copy_mode = mode; 75 if (range > 0xFFFF)
76 range = 0xFFFF;
75 copy_range = range; 77 copy_range = range;
76 if (copy_range > 0xFFFF) 78 copy_mode = mode;
77 copy_range = 0xFFFF;
78 break; 79 break;
79 80
80 default: 81 default:
@@ -100,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id)
100{ 101{
101 struct nf_queue_entry *entry = NULL, *i; 102 struct nf_queue_entry *entry = NULL, *i;
102 103
103 write_lock_bh(&queue_lock); 104 spin_lock_bh(&queue_lock);
104 105
105 list_for_each_entry(i, &queue_list, list) { 106 list_for_each_entry(i, &queue_list, list) {
106 if ((unsigned long)i == id) { 107 if ((unsigned long)i == id) {
@@ -114,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id)
114 queue_total--; 115 queue_total--;
115 } 116 }
116 117
117 write_unlock_bh(&queue_lock); 118 spin_unlock_bh(&queue_lock);
118 return entry; 119 return entry;
119} 120}
120 121
@@ -135,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
135static void 136static void
136ipq_flush(ipq_cmpfn cmpfn, unsigned long data) 137ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
137{ 138{
138 write_lock_bh(&queue_lock); 139 spin_lock_bh(&queue_lock);
139 __ipq_flush(cmpfn, data); 140 __ipq_flush(cmpfn, data);
140 write_unlock_bh(&queue_lock); 141 spin_unlock_bh(&queue_lock);
141} 142}
142 143
143static struct sk_buff * 144static struct sk_buff *
@@ -151,37 +152,29 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
 	struct nlmsghdr *nlh;
 	struct timeval tv;
 
-	read_lock_bh(&queue_lock);
-
-	switch (copy_mode) {
+	switch (ACCESS_ONCE(copy_mode)) {
 	case IPQ_COPY_META:
 	case IPQ_COPY_NONE:
 		size = NLMSG_SPACE(sizeof(*pmsg));
 		break;
 
 	case IPQ_COPY_PACKET:
-		if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
-		     entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
-		    (*errp = skb_checksum_help(entry->skb))) {
-			read_unlock_bh(&queue_lock);
+		if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
+		    (*errp = skb_checksum_help(entry->skb)))
 			return NULL;
-		}
-		if (copy_range == 0 || copy_range > entry->skb->len)
+
+		data_len = ACCESS_ONCE(copy_range);
+		if (data_len == 0 || data_len > entry->skb->len)
 			data_len = entry->skb->len;
-		else
-			data_len = copy_range;
 
 		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
 		break;
 
 	default:
 		*errp = -EINVAL;
-		read_unlock_bh(&queue_lock);
 		return NULL;
 	}
 
-	read_unlock_bh(&queue_lock);
-
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb)
 		goto nlmsg_failure;
@@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	if (nskb == NULL)
 		return status;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	if (!peer_pid)
 		goto err_out_free_nskb;
@@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 
 	__ipq_enqueue_entry(entry);
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 
 err_out_free_nskb:
 	kfree_skb(nskb);
 
 err_out_unlock:
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 }
 
@@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range)
 {
 	int status;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 	status = __ipq_set_mode(mode, range);
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 }
 
@@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb)
 	if (security_netlink_recv(skb, CAP_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	if (peer_pid) {
 		if (peer_pid != pid) {
-			write_unlock_bh(&queue_lock);
+			spin_unlock_bh(&queue_lock);
 			RCV_SKB_FAIL(-EBUSY);
 		}
 	} else {
@@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
 		peer_pid = pid;
 	}
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 
 	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
 				  nlmsglen - NLMSG_LENGTH(0));
@@ -461,7 +454,6 @@ __ipq_rcv_skb(struct sk_buff *skb)
 
 	if (flags & NLM_F_ACK)
 		netlink_ack(skb, nlh, 0);
-	return;
 }
 
 static void
@@ -498,10 +490,10 @@ ipq_rcv_nl_event(struct notifier_block *this,
 	struct netlink_notify *n = ptr;
 
 	if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
-		write_lock_bh(&queue_lock);
+		spin_lock_bh(&queue_lock);
 		if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
 			__ipq_reset();
-		write_unlock_bh(&queue_lock);
+		spin_unlock_bh(&queue_lock);
 	}
 	return NOTIFY_DONE;
 }
@@ -528,7 +520,7 @@ static ctl_table ipq_table[] = {
 #ifdef CONFIG_PROC_FS
 static int ip_queue_show(struct seq_file *m, void *v)
 {
-	read_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	seq_printf(m,
 		   "Peer PID : %d\n"
@@ -546,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v)
 		   queue_dropped,
 		   queue_user_dropped);
 
-	read_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return 0;
 }
 
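Note on the ip_queue.c locking change above: queue_lock no longer needs to be an rwlock because the only hot-path "read" was a peek at two small integers, copy_mode and copy_range. Writers still serialize on the (now cheaper) spinlock; the lockless reader only needs a single untorn load, which is what ACCESS_ONCE() provided in kernels of this era. A minimal userspace sketch of the reader side (illustrative only; the variable and the clamp value are lifted from the patch, everything else is invented, and typeof is a GCC extension):

    #include <stdio.h>

    /* as defined in <linux/compiler.h> at the time */
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    static unsigned int copy_range;	/* written by others under queue_lock */

    int main(void)
    {
    	unsigned int data_len;

    	copy_range = 1500;
    	/* read exactly once; later tests reuse data_len, never copy_range */
    	data_len = ACCESS_ONCE(copy_range);
    	if (data_len == 0 || data_len > 0xFFFF)
    		data_len = 0xFFFF;
    	printf("copying %u bytes\n", data_len);
    	return 0;
    }

Reading the control variables once into a local also explains why __ipq_set_mode() now clamps range before publishing it and stores copy_mode last: a lockless reader can no longer observe copy_range between the store and the clamp.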
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 572330a552ef..d163f2e3b2e9 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -28,6 +28,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -38,24 +39,19 @@ MODULE_DESCRIPTION("IPv4 packet filter");
 /*#define DEBUG_IP_FIREWALL_USER*/
 
 #ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) printk(format , ## args)
+#define dprintf(format, args...) pr_info(format , ## args)
 #else
 #define dprintf(format, args...)
 #endif
 
 #ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) printk(format , ## args)
+#define duprintf(format, args...) pr_info(format , ## args)
 #else
 #define duprintf(format, args...)
 #endif
 
 #ifdef CONFIG_NETFILTER_DEBUG
-#define IP_NF_ASSERT(x)						\
-do {								\
-	if (!(x))						\
-		printk("IP_NF_ASSERT: %s:%s:%u\n",		\
-		       __func__, __FILE__, __LINE__);		\
-} while(0)
+#define IP_NF_ASSERT(x)	WARN_ON(!(x))
 #else
 #define IP_NF_ASSERT(x)
 #endif
@@ -66,6 +62,12 @@ do { \
 #define inline
 #endif
 
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{
+	return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
+
 /*
    We keep a set of rules for each CPU, so we can avoid write-locking
    them in the softirq when updating the counters and therefore
@@ -158,33 +160,17 @@ ip_checkentry(const struct ipt_ip *ip)
 }
 
 static unsigned int
-ipt_error(struct sk_buff *skb, const struct xt_target_param *par)
+ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	if (net_ratelimit())
-		printk("ip_tables: error: `%s'\n",
-		       (const char *)par->targinfo);
+		pr_info("error: `%s'\n", (const char *)par->targinfo);
 
 	return NF_DROP;
 }
 
-/* Performance critical - called for every packet */
-static inline bool
-do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
-	 struct xt_match_param *par)
-{
-	par->match     = m->u.kernel.match;
-	par->matchinfo = m->data;
-
-	/* Stop iteration if it doesn't match */
-	if (!m->u.kernel.match->match(skb, par))
-		return true;
-	else
-		return false;
-}
-
 /* Performance critical */
 static inline struct ipt_entry *
-get_entry(void *base, unsigned int offset)
+get_entry(const void *base, unsigned int offset)
 {
 	return (struct ipt_entry *)(base + offset);
 }
@@ -199,6 +185,13 @@ static inline bool unconditional(const struct ipt_ip *ip)
 #undef FWINV
 }
 
+/* for const-correctness */
+static inline const struct ipt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+	return ipt_get_target((struct ipt_entry *)e);
+}
+
 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 static const char *const hooknames[] = {
@@ -233,11 +226,11 @@ static struct nf_loginfo trace_loginfo = {
 
 /* Mildly perf critical (only if packet tracing is on) */
 static inline int
-get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
+get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
 		      const char *hookname, const char **chainname,
 		      const char **comment, unsigned int *rulenum)
 {
-	struct ipt_standard_target *t = (void *)ipt_get_target(s);
+	const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
 
 	if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
 		/* Head of user chain: ERROR target with chainname */
@@ -263,17 +256,18 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
 	return 0;
 }
 
-static void trace_packet(struct sk_buff *skb,
+static void trace_packet(const struct sk_buff *skb,
 			 unsigned int hook,
 			 const struct net_device *in,
 			 const struct net_device *out,
 			 const char *tablename,
-			 struct xt_table_info *private,
-			 struct ipt_entry *e)
+			 const struct xt_table_info *private,
+			 const struct ipt_entry *e)
 {
-	void *table_base;
+	const void *table_base;
 	const struct ipt_entry *root;
 	const char *hookname, *chainname, *comment;
+	const struct ipt_entry *iter;
 	unsigned int rulenum = 0;
 
 	table_base = private->entries[smp_processor_id()];
@@ -282,10 +276,10 @@ static void trace_packet(struct sk_buff *skb,
 	hookname = chainname = hooknames[hook];
 	comment = comments[NF_IP_TRACE_COMMENT_RULE];
 
-	IPT_ENTRY_ITERATE(root,
-			  private->size - private->hook_entry[hook],
-			  get_chainname_rulenum,
-			  e, hookname, &chainname, &comment, &rulenum);
+	xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+		if (get_chainname_rulenum(iter, e, hookname,
+		    &chainname, &comment, &rulenum) != 0)
+			break;
 
 	nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
 		      "TRACE: %s:%s:%s:%u ",
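The conversion from IPT_ENTRY_ITERATE() to xt_entry_foreach() seen here recurs throughout this file: instead of passing a callback plus a bag of extra arguments into a macro, the walk over variable-sized rule blobs is open-coded at each call site. A toy userspace model of such a foreach macro (entry layout and all names invented for the sketch):

    #include <stdio.h>

    struct entry {
    	unsigned short next_offset;	/* total size of this entry */
    	unsigned short chain;
    };

    /* walk a blob of variable-sized records, like xt_entry_foreach() */
    #define entry_foreach(pos, base, size) \
    	for ((pos) = (struct entry *)(base); \
    	     (char *)(pos) < (char *)(base) + (size); \
    	     (pos) = (struct entry *)((char *)(pos) + (pos)->next_offset))

    int main(void)
    {
    	struct entry blob[3] = {
    		{ sizeof(struct entry), 0 },
    		{ sizeof(struct entry), 1 },
    		{ sizeof(struct entry), 2 },
    	};
    	struct entry *e;

    	entry_foreach(e, blob, sizeof(blob))
    		printf("rule in chain %u\n", e->chain);
    	return 0;
    }

The payoff is visible in trace_packet() above: the early-exit condition now sits next to the loop instead of being encoded in a callback's return value.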
@@ -307,19 +301,16 @@ ipt_do_table(struct sk_buff *skb,
 		 const struct net_device *out,
 		 struct xt_table *table)
 {
-#define tb_comefrom ((struct ipt_entry *)table_base)->comefrom
-
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
 	const struct iphdr *ip;
-	bool hotdrop = false;
 	/* Initializing verdict to NF_DROP keeps gcc happy. */
 	unsigned int verdict = NF_DROP;
 	const char *indev, *outdev;
-	void *table_base;
-	struct ipt_entry *e, *back;
-	struct xt_table_info *private;
-	struct xt_match_param mtpar;
-	struct xt_target_param tgpar;
+	const void *table_base;
+	struct ipt_entry *e, **jumpstack;
+	unsigned int *stackptr, origptr, cpu;
+	const struct xt_table_info *private;
+	struct xt_action_param acpar;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -331,37 +322,49 @@ ipt_do_table(struct sk_buff *skb,
 	 * things we don't know, ie. tcp syn flag or ports).  If the
 	 * rule is also a fragment-specific rule, non-fragments won't
 	 * match it. */
-	mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
-	mtpar.thoff   = ip_hdrlen(skb);
-	mtpar.hotdrop = &hotdrop;
-	mtpar.in      = tgpar.in  = in;
-	mtpar.out     = tgpar.out = out;
-	mtpar.family  = tgpar.family = NFPROTO_IPV4;
-	mtpar.hooknum = tgpar.hooknum = hook;
+	acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+	acpar.thoff   = ip_hdrlen(skb);
+	acpar.hotdrop = false;
+	acpar.in      = in;
+	acpar.out     = out;
+	acpar.family  = NFPROTO_IPV4;
+	acpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	xt_info_rdlock_bh();
 	private = table->private;
-	table_base = private->entries[smp_processor_id()];
+	cpu        = smp_processor_id();
+	table_base = private->entries[cpu];
+	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
+	origptr    = *stackptr;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
-	/* For return from builtin chain */
-	back = get_entry(table_base, private->underflow[hook]);
+	pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
+		 table->name, hook, origptr,
+		 get_entry(table_base, private->underflow[hook]));
 
 	do {
-		struct ipt_entry_target *t;
+		const struct ipt_entry_target *t;
+		const struct xt_entry_match *ematch;
 
 		IP_NF_ASSERT(e);
-		IP_NF_ASSERT(back);
 		if (!ip_packet_match(ip, indev, outdev,
-		    &e->ip, mtpar.fragoff) ||
-		    IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) {
+		    &e->ip, acpar.fragoff)) {
+ no_match:
 			e = ipt_next_entry(e);
 			continue;
 		}
 
-		ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+		xt_ematch_foreach(ematch, e) {
+			acpar.match     = ematch->u.kernel.match;
+			acpar.matchinfo = ematch->data;
+			if (!acpar.match->match(skb, &acpar))
+				goto no_match;
+		}
+
+		ADD_COUNTER(e->counters, skb->len, 1);
 
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -384,41 +387,38 @@ ipt_do_table(struct sk_buff *skb,
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
-				e = back;
-				back = get_entry(table_base, back->comefrom);
+				if (*stackptr == 0) {
+					e = get_entry(table_base,
+					    private->underflow[hook]);
+					pr_debug("Underflow (this is normal) "
+						 "to %p\n", e);
+				} else {
+					e = jumpstack[--*stackptr];
+					pr_debug("Pulled %p out from pos %u\n",
+						 e, *stackptr);
+					e = ipt_next_entry(e);
+				}
 				continue;
 			}
 			if (table_base + v != ipt_next_entry(e) &&
 			    !(e->ip.flags & IPT_F_GOTO)) {
-				/* Save old back ptr in next entry */
-				struct ipt_entry *next = ipt_next_entry(e);
-				next->comefrom = (void *)back - table_base;
-				/* set back pointer to next entry */
-				back = next;
+				if (*stackptr >= private->stacksize) {
+					verdict = NF_DROP;
+					break;
+				}
+				jumpstack[(*stackptr)++] = e;
+				pr_debug("Pushed %p into pos %u\n",
+					 e, *stackptr - 1);
 			}
 
 			e = get_entry(table_base, v);
 			continue;
 		}
 
-		/* Targets which reenter must return
-		   abs. verdicts */
-		tgpar.target   = t->u.kernel.target;
-		tgpar.targinfo = t->data;
-
+		acpar.target   = t->u.kernel.target;
+		acpar.targinfo = t->data;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-		tb_comefrom = 0xeeeeeeec;
-#endif
-		verdict = t->u.kernel.target->target(skb, &tgpar);
-#ifdef CONFIG_NETFILTER_DEBUG
-		if (tb_comefrom != 0xeeeeeeec && verdict == IPT_CONTINUE) {
-			printk("Target %s reentered!\n",
-			       t->u.kernel.target->name);
-			verdict = NF_DROP;
-		}
-		tb_comefrom = 0x57acc001;
-#endif
+		verdict = t->u.kernel.target->target(skb, &acpar);
 		/* Target might have changed stuff. */
 		ip = ip_hdr(skb);
 		if (verdict == IPT_CONTINUE)
@@ -426,24 +426,24 @@ ipt_do_table(struct sk_buff *skb,
 		else
 			/* Verdict */
 			break;
-	} while (!hotdrop);
+	} while (!acpar.hotdrop);
 	xt_info_rdunlock_bh();
-
+	pr_debug("Exiting %s; resetting sp from %u to %u\n",
+		 __func__, *stackptr, origptr);
+	*stackptr = origptr;
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
 #else
-	if (hotdrop)
+	if (acpar.hotdrop)
 		return NF_DROP;
 	else return verdict;
 #endif
-
-#undef tb_comefrom
 }
 
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
-mark_source_chains(struct xt_table_info *newinfo,
+mark_source_chains(const struct xt_table_info *newinfo,
 		   unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
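The ipt_do_table() rework above replaces back-pointer chaining with an explicit stack: previously each jump stashed a return position in the next entry's comefrom field inside the rule blob itself, which is why a misbehaving target could corrupt it (the 0xeeeeeeec canary that just got deleted guarded against exactly that). The new code keeps a bounded per-cpu array of return positions instead. A self-contained userspace sketch of the mechanism (sizes and names invented):

    #include <stdio.h>

    #define STACKSIZE 16		/* private->stacksize analogue */

    static int jumpstack[STACKSIZE];
    static unsigned int stackptr;

    /* jump to a user chain: remember where to resume, refuse on overflow */
    static int push_return(int resume_pos)
    {
    	if (stackptr >= STACKSIZE)
    		return -1;	/* ipt_do_table answers this with NF_DROP */
    	jumpstack[stackptr++] = resume_pos;
    	return 0;
    }

    /* RETURN target: pop, or fall back to the chain's underflow rule */
    static int pop_return(int underflow_pos)
    {
    	if (stackptr == 0)
    		return underflow_pos;
    	return jumpstack[--stackptr];
    }

    int main(void)
    {
    	push_return(11);	/* rule 10 jumped; resume at 11 */
    	push_return(21);	/* rule 20 jumped; resume at 21 */
    	printf("%d\n", pop_return(99));	/* 21 */
    	printf("%d\n", pop_return(99));	/* 11 */
    	printf("%d\n", pop_return(99));	/* 99: empty stack -> underflow */
    	return 0;
    }

This is also why translate_table() below starts counting ERROR targets into newinfo->stacksize: one stack slot per user-defined chain bounds the depth.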
@@ -461,12 +461,12 @@ mark_source_chains(struct xt_table_info *newinfo,
 		e->counters.pcnt = pos;
 
 		for (;;) {
-			struct ipt_standard_target *t
-				= (void *)ipt_get_target(e);
+			const struct ipt_standard_target *t
+				= (void *)ipt_get_target_c(e);
 			int visited = e->comefrom & (1 << hook);
 
 			if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
-				printk("iptables: loop hook %u pos %u %08X.\n",
+				pr_err("iptables: loop hook %u pos %u %08X.\n",
 				       hook, pos, e->comefrom);
 				return 0;
 			}
@@ -552,30 +552,26 @@ mark_source_chains(struct xt_table_info *newinfo,
 	return 1;
 }
 
-static int
-cleanup_match(struct ipt_entry_match *m, unsigned int *i)
+static void cleanup_match(struct ipt_entry_match *m, struct net *net)
 {
 	struct xt_mtdtor_param par;
 
-	if (i && (*i)-- == 0)
-		return 1;
-
+	par.net       = net;
 	par.match     = m->u.kernel.match;
 	par.matchinfo = m->data;
 	par.family    = NFPROTO_IPV4;
 	if (par.match->destroy != NULL)
 		par.match->destroy(&par);
 	module_put(par.match->me);
-	return 0;
 }
 
 static int
-check_entry(struct ipt_entry *e, const char *name)
+check_entry(const struct ipt_entry *e, const char *name)
 {
-	struct ipt_entry_target *t;
+	const struct ipt_entry_target *t;
 
 	if (!ip_checkentry(&e->ip)) {
-		duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+		duprintf("ip check failed %p %s.\n", e, name);
 		return -EINVAL;
 	}
 
@@ -583,7 +579,7 @@ check_entry(struct ipt_entry *e, const char *name)
 	    e->next_offset)
 		return -EINVAL;
 
-	t = ipt_get_target(e);
+	t = ipt_get_target_c(e);
 	if (e->target_offset + t->u.target_size > e->next_offset)
 		return -EINVAL;
 
@@ -591,8 +587,7 @@ check_entry(struct ipt_entry *e, const char *name)
 }
 
 static int
-check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
-	    unsigned int *i)
+check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
 {
 	const struct ipt_ip *ip = par->entryinfo;
 	int ret;
@@ -603,31 +598,27 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
 	ret = xt_check_match(par, m->u.match_size - sizeof(*m),
 			     ip->proto, ip->invflags & IPT_INV_PROTO);
 	if (ret < 0) {
-		duprintf("ip_tables: check failed for `%s'.\n",
-			 par.match->name);
+		duprintf("check failed for `%s'.\n", par->match->name);
 		return ret;
 	}
-	++*i;
 	return 0;
 }
 
 static int
-find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
-		 unsigned int *i)
+find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
 {
 	struct xt_match *match;
 	int ret;
 
-	match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
-						      m->u.user.revision),
-					"ipt_%s", m->u.user.name);
-	if (IS_ERR(match) || !match) {
+	match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+				      m->u.user.revision);
+	if (IS_ERR(match)) {
 		duprintf("find_check_match: `%s' not found\n", m->u.user.name);
-		return match ? PTR_ERR(match) : -ENOENT;
+		return PTR_ERR(match);
 	}
 	m->u.kernel.match = match;
 
-	ret = check_match(m, par, i);
+	ret = check_match(m, par);
 	if (ret)
 		goto err;
 
@@ -637,10 +628,11 @@ err:
 	return ret;
 }
 
-static int check_target(struct ipt_entry *e, const char *name)
+static int check_target(struct ipt_entry *e, struct net *net, const char *name)
 {
 	struct ipt_entry_target *t = ipt_get_target(e);
 	struct xt_tgchk_param par = {
+		.net       = net,
 		.table     = name,
 		.entryinfo = e,
 		.target    = t->u.kernel.target,
@@ -653,7 +645,7 @@ static int check_target(struct ipt_entry *e, const char *name)
 	ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
 			      e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
 	if (ret < 0) {
-		duprintf("ip_tables: check failed for `%s'.\n",
+		duprintf("check failed for `%s'.\n",
 			 t->u.kernel.target->name);
 		return ret;
 	}
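Several hunks in this file replace the try_then_request_module(xt_find_match(...), "ipt_%s", ...) dance with xt_request_find_match()/xt_request_find_target(), which fold the module autoload in and report every failure through ERR_PTR(), so the awkward "IS_ERR(x) || !x" double test disappears. A compact userspace sketch of that pointer-encoded-errno convention (macros follow the kernel's <linux/err.h>; the lookup function is a made-up stand-in):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO	4095
    #define ERR_PTR(err)	((void *)(long)(err))
    #define PTR_ERR(ptr)	((long)(ptr))
    #define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

    /* stand-in for xt_request_find_match(): never returns NULL */
    static void *request_find_match(const char *name)
    {
    	return ERR_PTR(-ENOENT);	/* lookup and autoload both failed */
    }

    int main(void)
    {
    	void *match = request_find_match("frobnicate");

    	if (IS_ERR(match)) {		/* one test covers all failures */
    		printf("error %ld\n", -PTR_ERR(match));
    		return 1;
    	}
    	return 0;
    }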
@@ -661,61 +653,66 @@ static int check_target(struct ipt_entry *e, const char *name)
 }
 
 static int
-find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
-		 unsigned int *i)
+find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
+		 unsigned int size)
 {
 	struct ipt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
+	struct xt_entry_match *ematch;
 
 	ret = check_entry(e, name);
 	if (ret)
 		return ret;
 
 	j = 0;
+	mtpar.net       = net;
 	mtpar.table     = name;
 	mtpar.entryinfo = &e->ip;
 	mtpar.hook_mask = e->comefrom;
 	mtpar.family    = NFPROTO_IPV4;
-	ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j);
-	if (ret != 0)
-		goto cleanup_matches;
+	xt_ematch_foreach(ematch, e) {
+		ret = find_check_match(ematch, &mtpar);
+		if (ret != 0)
+			goto cleanup_matches;
+		++j;
+	}
 
 	t = ipt_get_target(e);
-	target = try_then_request_module(xt_find_target(AF_INET,
-							t->u.user.name,
-							t->u.user.revision),
-					 "ipt_%s", t->u.user.name);
-	if (IS_ERR(target) || !target) {
+	target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
 		duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
-		ret = target ? PTR_ERR(target) : -ENOENT;
+		ret = PTR_ERR(target);
 		goto cleanup_matches;
 	}
 	t->u.kernel.target = target;
 
-	ret = check_target(e, name);
+	ret = check_target(e, net, name);
 	if (ret)
 		goto err;
-
-	(*i)++;
 	return 0;
  err:
 	module_put(t->u.kernel.target->me);
  cleanup_matches:
-	IPT_MATCH_ITERATE(e, cleanup_match, &j);
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		cleanup_match(ematch, net);
+	}
 	return ret;
 }
 
-static bool check_underflow(struct ipt_entry *e)
+static bool check_underflow(const struct ipt_entry *e)
 {
 	const struct ipt_entry_target *t;
 	unsigned int verdict;
 
 	if (!unconditional(&e->ip))
 		return false;
-	t = ipt_get_target(e);
+	t = ipt_get_target_c(e);
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
 		return false;
 	verdict = ((struct ipt_standard_target *)t)->verdict;
@@ -726,12 +723,11 @@ static bool check_underflow(struct ipt_entry *e)
 static int
 check_entry_size_and_hooks(struct ipt_entry *e,
 			   struct xt_table_info *newinfo,
-			   unsigned char *base,
-			   unsigned char *limit,
+			   const unsigned char *base,
+			   const unsigned char *limit,
 			   const unsigned int *hook_entries,
 			   const unsigned int *underflows,
-			   unsigned int valid_hooks,
-			   unsigned int *i)
+			   unsigned int valid_hooks)
 {
 	unsigned int h;
 
@@ -768,50 +764,42 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	/* Clear counters and comefrom */
 	e->counters = ((struct xt_counters) { 0, 0 });
 	e->comefrom = 0;
-
-	(*i)++;
 	return 0;
 }
 
-static int
-cleanup_entry(struct ipt_entry *e, unsigned int *i)
+static void
+cleanup_entry(struct ipt_entry *e, struct net *net)
 {
 	struct xt_tgdtor_param par;
 	struct ipt_entry_target *t;
-
-	if (i && (*i)-- == 0)
-		return 1;
+	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
-	IPT_MATCH_ITERATE(e, cleanup_match, NULL);
+	xt_ematch_foreach(ematch, e)
+		cleanup_match(ematch, net);
 	t = ipt_get_target(e);
 
+	par.net      = net;
 	par.target   = t->u.kernel.target;
 	par.targinfo = t->data;
 	par.family   = NFPROTO_IPV4;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
-	return 0;
 }
 
 /* Checks and translates the user-supplied table segment (held in
    newinfo) */
 static int
-translate_table(const char *name,
-		unsigned int valid_hooks,
-		struct xt_table_info *newinfo,
-		void *entry0,
-		unsigned int size,
-		unsigned int number,
-		const unsigned int *hook_entries,
-		const unsigned int *underflows)
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+		const struct ipt_replace *repl)
 {
+	struct ipt_entry *iter;
 	unsigned int i;
-	int ret;
+	int ret = 0;
 
-	newinfo->size = size;
-	newinfo->number = number;
+	newinfo->size = repl->size;
+	newinfo->number = repl->num_entries;
 
 	/* Init all hooks to impossible value. */
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -822,49 +810,61 @@ translate_table(const char *name,
 	duprintf("translate_table: size %u\n", newinfo->size);
 	i = 0;
 	/* Walk through entries, checking offsets. */
-	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
-				check_entry_size_and_hooks,
-				newinfo,
-				entry0,
-				entry0 + size,
-				hook_entries, underflows, valid_hooks, &i);
-	if (ret != 0)
-		return ret;
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+						 entry0 + repl->size,
+						 repl->hook_entry,
+						 repl->underflow,
+						 repl->valid_hooks);
+		if (ret != 0)
+			return ret;
+		++i;
+		if (strcmp(ipt_get_target(iter)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
+	}
 
-	if (i != number) {
+	if (i != repl->num_entries) {
 		duprintf("translate_table: %u not %u entries\n",
-			 i, number);
+			 i, repl->num_entries);
 		return -EINVAL;
 	}
 
 	/* Check hooks all assigned */
 	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
 		/* Only hooks which are valid */
-		if (!(valid_hooks & (1 << i)))
+		if (!(repl->valid_hooks & (1 << i)))
 			continue;
 		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
 			duprintf("Invalid hook entry %u %u\n",
-				 i, hook_entries[i]);
+				 i, repl->hook_entry[i]);
 			return -EINVAL;
 		}
 		if (newinfo->underflow[i] == 0xFFFFFFFF) {
 			duprintf("Invalid underflow %u %u\n",
-				 i, underflows[i]);
+				 i, repl->underflow[i]);
 			return -EINVAL;
 		}
 	}
 
-	if (!mark_source_chains(newinfo, valid_hooks, entry0))
+	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
 		return -ELOOP;
 
 	/* Finally, each sanity check must pass */
 	i = 0;
-	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
-				find_check_entry, name, size, &i);
+	xt_entry_foreach(iter, entry0, newinfo->size) {
+		ret = find_check_entry(iter, net, repl->name, repl->size);
+		if (ret != 0)
+			break;
+		++i;
+	}
 
 	if (ret != 0) {
-		IPT_ENTRY_ITERATE(entry0, newinfo->size,
-				  cleanup_entry, &i);
+		xt_entry_foreach(iter, entry0, newinfo->size) {
+			if (i-- == 0)
+				break;
+			cleanup_entry(iter, net);
+		}
 		return ret;
 	}
 
@@ -877,36 +877,14 @@ translate_table(const char *name,
 	return ret;
 }
 
-/* Gets counters. */
-static inline int
-add_entry_to_counter(const struct ipt_entry *e,
-		     struct xt_counters total[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-static inline int
-set_entry_to_counter(const struct ipt_entry *e,
-		     struct ipt_counters total[],
-		     unsigned int *i)
-{
-	SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static void
 get_counters(const struct xt_table_info *t,
 	     struct xt_counters counters[])
 {
+	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu;
+	unsigned int curcpu = get_cpu();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
@@ -916,41 +894,45 @@ get_counters(const struct xt_table_info *t,
 	 * if new softirq were to run and call ipt_do_table
 	 */
 	local_bh_disable();
-	curcpu = smp_processor_id();
-
 	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[curcpu],
-			  t->size,
-			  set_entry_to_counter,
-			  counters,
-			  &i);
+	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
+		SET_COUNTER(counters[i], iter->counters.bcnt,
+			    iter->counters.pcnt);
+		++i;
+	}
+	local_bh_enable();
+	/* Processing counters from other cpus, we can let bottom half enabled,
+	 * (preemption is disabled)
+	 */
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		local_bh_disable();
 		xt_info_wrlock(cpu);
-		IPT_ENTRY_ITERATE(t->entries[cpu],
-				  t->size,
-				  add_entry_to_counter,
-				  counters,
-				  &i);
+		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			ADD_COUNTER(counters[i], iter->counters.bcnt,
+				    iter->counters.pcnt);
+			++i; /* macro does multi eval of i */
+		}
 		xt_info_wrunlock(cpu);
+		local_bh_enable();
 	}
-	local_bh_enable();
+	put_cpu();
 }
 
-static struct xt_counters * alloc_counters(struct xt_table *table)
+static struct xt_counters *alloc_counters(const struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	struct xt_table_info *private = table->private;
+	const struct xt_table_info *private = table->private;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = vmalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
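get_counters() keeps its two-phase shape after the conversion: the local cpu's counters are snapshotted with SET_COUNTER(), then every other cpu's copy is folded in with ADD_COUNTER(), now with bottom halves re-enabled between cpus and get_cpu()/put_cpu() pinning the task. A runnable userspace model of just the arithmetic (the two macros mimic the kernel's; cpu and rule counts and all data are invented):

    #include <stdio.h>

    #define NR_CPUS  4
    #define NR_RULES 2

    struct xt_counters { unsigned long long pcnt, bcnt; };

    #define SET_COUNTER(c, b, p) do { (c).bcnt = (b); (c).pcnt = (p); } while (0)
    #define ADD_COUNTER(c, b, p) do { (c).bcnt += (b); (c).pcnt += (p); } while (0)

    /* per-cpu copies of the rule counters, as in xt_table_info->entries[] */
    static struct xt_counters percpu[NR_CPUS][NR_RULES] = {
    	{ {1, 100}, {2, 200} },
    	{ {3, 300}, {4, 400} },
    	{ {5, 500}, {6, 600} },
    	{ {7, 700}, {8, 800} },
    };

    int main(void)
    {
    	struct xt_counters total[NR_RULES];
    	unsigned int cpu, i, curcpu = 0;

    	for (i = 0; i < NR_RULES; i++)		/* phase 1: set from curcpu */
    		SET_COUNTER(total[i], percpu[curcpu][i].bcnt,
    			    percpu[curcpu][i].pcnt);
    	for (cpu = 0; cpu < NR_CPUS; cpu++) {	/* phase 2: add the rest */
    		if (cpu == curcpu)
    			continue;
    		for (i = 0; i < NR_RULES; i++)
    			ADD_COUNTER(total[i], percpu[cpu][i].bcnt,
    				    percpu[cpu][i].pcnt);
    	}
    	for (i = 0; i < NR_RULES; i++)
    		printf("rule %u: %llu pkts, %llu bytes\n",
    		       i, total[i].pcnt, total[i].bcnt);
    	return 0;
    }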
@@ -962,11 +944,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 
 static int
 copy_entries_to_user(unsigned int total_size,
-		     struct xt_table *table,
+		     const struct xt_table *table,
 		     void __user *userptr)
 {
 	unsigned int off, num;
-	struct ipt_entry *e;
+	const struct ipt_entry *e;
 	struct xt_counters *counters;
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
@@ -1018,7 +1000,7 @@ copy_entries_to_user(unsigned int total_size,
 		}
 	}
 
-	t = ipt_get_target(e);
+	t = ipt_get_target_c(e);
 	if (copy_to_user(userptr + off + e->target_offset
 			 + offsetof(struct ipt_entry_target,
 				    u.user.name),
@@ -1035,7 +1017,7 @@ copy_entries_to_user(unsigned int total_size,
 }
 
 #ifdef CONFIG_COMPAT
-static void compat_standard_from_user(void *dst, void *src)
+static void compat_standard_from_user(void *dst, const void *src)
 {
 	int v = *(compat_int_t *)src;
 
@@ -1044,7 +1026,7 @@ static void compat_standard_from_user(void *dst, void *src)
 	memcpy(dst, &v, sizeof(v));
 }
 
-static int compat_standard_to_user(void __user *dst, void *src)
+static int compat_standard_to_user(void __user *dst, const void *src)
 {
 	compat_int_t cv = *(int *)src;
 
@@ -1053,25 +1035,20 @@ static int compat_standard_to_user(void __user *dst, void *src)
 	return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
 }
 
-static inline int
-compat_calc_match(struct ipt_entry_match *m, int *size)
-{
-	*size += xt_compat_match_offset(m->u.kernel.match);
-	return 0;
-}
-
-static int compat_calc_entry(struct ipt_entry *e,
+static int compat_calc_entry(const struct ipt_entry *e,
 			     const struct xt_table_info *info,
-			     void *base, struct xt_table_info *newinfo)
+			     const void *base, struct xt_table_info *newinfo)
 {
-	struct ipt_entry_target *t;
+	const struct xt_entry_match *ematch;
+	const struct ipt_entry_target *t;
 	unsigned int entry_offset;
 	int off, i, ret;
 
 	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 	entry_offset = (void *)e - base;
-	IPT_MATCH_ITERATE(e, compat_calc_match, &off);
-	t = ipt_get_target(e);
+	xt_ematch_foreach(ematch, e)
+		off += xt_compat_match_offset(ematch->u.kernel.match);
+	t = ipt_get_target_c(e);
 	off += xt_compat_target_offset(t->u.kernel.target);
 	newinfo->size -= off;
 	ret = xt_compat_add_offset(AF_INET, entry_offset, off);
@@ -1092,7 +1069,9 @@ static int compat_calc_entry(struct ipt_entry *e,
 static int compat_table_info(const struct xt_table_info *info,
 			     struct xt_table_info *newinfo)
 {
+	struct ipt_entry *iter;
 	void *loc_cpu_entry;
+	int ret;
 
 	if (!newinfo || !info)
 		return -EINVAL;
@@ -1101,13 +1080,17 @@ static int compat_table_info(const struct xt_table_info *info,
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
 	loc_cpu_entry = info->entries[raw_smp_processor_id()];
-	return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
-				 compat_calc_entry, info, loc_cpu_entry,
-				 newinfo);
+	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+		if (ret != 0)
+			return ret;
+	}
+	return 0;
 }
 #endif
 
-static int get_info(struct net *net, void __user *user, int *len, int compat)
+static int get_info(struct net *net, void __user *user,
+		    const int *len, int compat)
 {
 	char name[IPT_TABLE_MAXNAMELEN];
 	struct xt_table *t;
@@ -1132,10 +1115,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
 	if (t && !IS_ERR(t)) {
 		struct ipt_getinfo info;
 		const struct xt_table_info *private = t->private;
-
 #ifdef CONFIG_COMPAT
+		struct xt_table_info tmp;
+
 		if (compat) {
-			struct xt_table_info tmp;
 			ret = compat_table_info(private, &tmp);
 			xt_compat_flush_offsets(AF_INET);
 			private = &tmp;
@@ -1167,7 +1150,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
 }
 
 static int
-get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len)
+get_entries(struct net *net, struct ipt_get_entries __user *uptr,
+	    const int *len)
 {
 	int ret;
 	struct ipt_get_entries get;
@@ -1215,6 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct xt_table_info *oldinfo;
 	struct xt_counters *counters;
 	void *loc_cpu_old_entry;
+	struct ipt_entry *iter;
 
 	ret = 0;
 	counters = vmalloc(num_counters * sizeof(struct xt_counters));
@@ -1257,8 +1242,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
-	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
-			  NULL);
+	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+		cleanup_entry(iter, net);
+
 	xt_free_table_info(oldinfo);
 	if (copy_to_user(counters_ptr, counters,
 			 sizeof(struct xt_counters) * num_counters) != 0)
@@ -1277,12 +1263,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 }
 
 static int
-do_replace(struct net *net, void __user *user, unsigned int len)
+do_replace(struct net *net, const void __user *user, unsigned int len)
 {
 	int ret;
 	struct ipt_replace tmp;
 	struct xt_table_info *newinfo;
 	void *loc_cpu_entry;
+	struct ipt_entry *iter;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1303,13 +1290,11 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 		goto free_newinfo;
 	}
 
-	ret = translate_table(tmp.name, tmp.valid_hooks,
-			      newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
-			      tmp.hook_entry, tmp.underflow);
+	ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
 	if (ret != 0)
 		goto free_newinfo;
 
-	duprintf("ip_tables: Translated table\n");
+	duprintf("Translated table\n");
 
 	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
 			   tmp.num_counters, tmp.counters);
@@ -1318,27 +1303,16 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return 0;
 
 free_newinfo_untrans:
-	IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		cleanup_entry(iter, net);
 free_newinfo:
 	xt_free_table_info(newinfo);
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static int
-do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
+do_add_counters(struct net *net, const void __user *user,
+		unsigned int len, int compat)
 {
 	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
@@ -1351,6 +1325,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	const struct xt_table_info *private;
 	int ret = 0;
 	void *loc_cpu_entry;
+	struct ipt_entry *iter;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
 
@@ -1381,7 +1356,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = vmalloc(len - size);
 	if (!paddc)
 		return -ENOMEM;
 
@@ -1408,11 +1383,10 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	curcpu = smp_processor_id();
 	loc_cpu_entry = private->entries[curcpu];
 	xt_info_wrlock(curcpu);
-	IPT_ENTRY_ITERATE(loc_cpu_entry,
-			  private->size,
-			  add_counter_to_entry,
-			  paddc,
-			  &i);
+	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		++i;
+	}
 	xt_info_wrunlock(curcpu);
 unlock_up_free:
 	local_bh_enable();
@@ -1440,45 +1414,40 @@ struct compat_ipt_replace {
 static int
 compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
 			  unsigned int *size, struct xt_counters *counters,
-			  unsigned int *i)
+			  unsigned int i)
 {
 	struct ipt_entry_target *t;
 	struct compat_ipt_entry __user *ce;
 	u_int16_t target_offset, next_offset;
 	compat_uint_t origsize;
-	int ret;
+	const struct xt_entry_match *ematch;
+	int ret = 0;
 
-	ret = -EFAULT;
 	origsize = *size;
 	ce = (struct compat_ipt_entry __user *)*dstptr;
-	if (copy_to_user(ce, e, sizeof(struct ipt_entry)))
-		goto out;
-
-	if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i])))
-		goto out;
+	if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
+	    copy_to_user(&ce->counters, &counters[i],
+			 sizeof(counters[i])) != 0)
+		return -EFAULT;
 
 	*dstptr += sizeof(struct compat_ipt_entry);
 	*size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 
-	ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size);
+	xt_ematch_foreach(ematch, e) {
+		ret = xt_compat_match_to_user(ematch, dstptr, size);
+		if (ret != 0)
+			return ret;
+	}
 	target_offset = e->target_offset - (origsize - *size);
-	if (ret)
-		goto out;
 	t = ipt_get_target(e);
 	ret = xt_compat_target_to_user(t, dstptr, size);
 	if (ret)
-		goto out;
-	ret = -EFAULT;
+		return ret;
 	next_offset = e->next_offset - (origsize - *size);
-	if (put_user(target_offset, &ce->target_offset))
-		goto out;
-	if (put_user(next_offset, &ce->next_offset))
-		goto out;
-
-	(*i)++;
+	if (put_user(target_offset, &ce->target_offset) != 0 ||
+	    put_user(next_offset, &ce->next_offset) != 0)
+		return -EFAULT;
 	return 0;
-out:
-	return ret;
 }
 
 static int
@@ -1486,61 +1455,45 @@ compat_find_calc_match(struct ipt_entry_match *m,
 		       const char *name,
 		       const struct ipt_ip *ip,
 		       unsigned int hookmask,
-		       int *size, unsigned int *i)
+		       int *size)
 {
 	struct xt_match *match;
 
-	match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
-						      m->u.user.revision),
-					"ipt_%s", m->u.user.name);
-	if (IS_ERR(match) || !match) {
+	match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+				      m->u.user.revision);
+	if (IS_ERR(match)) {
 		duprintf("compat_check_calc_match: `%s' not found\n",
 			 m->u.user.name);
-		return match ? PTR_ERR(match) : -ENOENT;
+		return PTR_ERR(match);
 	}
 	m->u.kernel.match = match;
 	*size += xt_compat_match_offset(match);
-
-	(*i)++;
 	return 0;
 }
 
-static int
-compat_release_match(struct ipt_entry_match *m, unsigned int *i)
-{
-	if (i && (*i)-- == 0)
-		return 1;
-
-	module_put(m->u.kernel.match->me);
-	return 0;
-}
-
-static int
-compat_release_entry(struct compat_ipt_entry *e, unsigned int *i)
+static void compat_release_entry(struct compat_ipt_entry *e)
 {
 	struct ipt_entry_target *t;
-
-	if (i && (*i)-- == 0)
-		return 1;
+	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
-	COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL);
+	xt_ematch_foreach(ematch, e)
+		module_put(ematch->u.kernel.match->me);
 	t = compat_ipt_get_target(e);
 	module_put(t->u.kernel.target->me);
-	return 0;
 }
 
 static int
 check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 				  struct xt_table_info *newinfo,
 				  unsigned int *size,
-				  unsigned char *base,
-				  unsigned char *limit,
-				  unsigned int *hook_entries,
-				  unsigned int *underflows,
-				  unsigned int *i,
+				  const unsigned char *base,
+				  const unsigned char *limit,
+				  const unsigned int *hook_entries,
+				  const unsigned int *underflows,
 				  const char *name)
 {
+	struct xt_entry_match *ematch;
 	struct ipt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
@@ -1569,20 +1522,21 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 	entry_offset = (void *)e - (void *)base;
 	j = 0;
-	ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name,
-				       &e->ip, e->comefrom, &off, &j);
-	if (ret != 0)
-		goto release_matches;
+	xt_ematch_foreach(ematch, e) {
+		ret = compat_find_calc_match(ematch, name,
+					     &e->ip, e->comefrom, &off);
+		if (ret != 0)
+			goto release_matches;
+		++j;
+	}
 
 	t = compat_ipt_get_target(e);
-	target = try_then_request_module(xt_find_target(AF_INET,
-							t->u.user.name,
-							t->u.user.revision),
-					 "ipt_%s", t->u.user.name);
-	if (IS_ERR(target) || !target) {
+	target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+					t->u.user.revision);
+	if (IS_ERR(target)) {
 		duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
 			 t->u.user.name);
-		ret = target ? PTR_ERR(target) : -ENOENT;
+		ret = PTR_ERR(target);
 		goto release_matches;
 	}
 	t->u.kernel.target = target;
@@ -1604,14 +1558,16 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	/* Clear counters and comefrom */
 	memset(&e->counters, 0, sizeof(e->counters));
 	e->comefrom = 0;
-
-	(*i)++;
 	return 0;
 
 out:
 	module_put(t->u.kernel.target->me);
 release_matches:
-	IPT_MATCH_ITERATE(e, compat_release_match, &j);
+	xt_ematch_foreach(ematch, e) {
+		if (j-- == 0)
+			break;
+		module_put(ematch->u.kernel.match->me);
+	}
 	return ret;
 }
 
@@ -1625,6 +1581,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 	struct ipt_entry *de;
 	unsigned int origsize;
 	int ret, h;
+	struct xt_entry_match *ematch;
 
 	ret = 0;
 	origsize = *size;
@@ -1635,10 +1592,11 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 	*dstptr += sizeof(struct ipt_entry);
 	*size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 
-	ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user,
-				       dstptr, size);
-	if (ret)
-		return ret;
+	xt_ematch_foreach(ematch, e) {
+		ret = xt_compat_match_from_user(ematch, dstptr, size);
+		if (ret != 0)
+			return ret;
+	}
 	de->target_offset = e->target_offset - (origsize - *size);
 	t = compat_ipt_get_target(e);
 	target = t->u.kernel.target;
@@ -1655,36 +1613,43 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1655} 1613}
1656 1614
1657static int 1615static int
1658compat_check_entry(struct ipt_entry *e, const char *name, 1616compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
1659 unsigned int *i)
1660{ 1617{
1618 struct xt_entry_match *ematch;
1661 struct xt_mtchk_param mtpar; 1619 struct xt_mtchk_param mtpar;
1662 unsigned int j; 1620 unsigned int j;
1663 int ret; 1621 int ret = 0;
1664 1622
1665 j = 0; 1623 j = 0;
1624 mtpar.net = net;
1666 mtpar.table = name; 1625 mtpar.table = name;
1667 mtpar.entryinfo = &e->ip; 1626 mtpar.entryinfo = &e->ip;
1668 mtpar.hook_mask = e->comefrom; 1627 mtpar.hook_mask = e->comefrom;
1669 mtpar.family = NFPROTO_IPV4; 1628 mtpar.family = NFPROTO_IPV4;
1670 ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); 1629 xt_ematch_foreach(ematch, e) {
1671 if (ret) 1630 ret = check_match(ematch, &mtpar);
1672 goto cleanup_matches; 1631 if (ret != 0)
1632 goto cleanup_matches;
1633 ++j;
1634 }
1673 1635
1674 ret = check_target(e, name); 1636 ret = check_target(e, net, name);
1675 if (ret) 1637 if (ret)
1676 goto cleanup_matches; 1638 goto cleanup_matches;
1677
1678 (*i)++;
1679 return 0; 1639 return 0;
1680 1640
1681 cleanup_matches: 1641 cleanup_matches:
1682 IPT_MATCH_ITERATE(e, cleanup_match, &j); 1642 xt_ematch_foreach(ematch, e) {
1643 if (j-- == 0)
1644 break;
1645 cleanup_match(ematch, net);
1646 }
1683 return ret; 1647 return ret;
1684} 1648}
1685 1649
1686static int 1650static int
1687translate_compat_table(const char *name, 1651translate_compat_table(struct net *net,
1652 const char *name,
1688 unsigned int valid_hooks, 1653 unsigned int valid_hooks,
1689 struct xt_table_info **pinfo, 1654 struct xt_table_info **pinfo,
1690 void **pentry0, 1655 void **pentry0,
@@ -1696,6 +1661,8 @@ translate_compat_table(const char *name,
1696 unsigned int i, j; 1661 unsigned int i, j;
1697 struct xt_table_info *newinfo, *info; 1662 struct xt_table_info *newinfo, *info;
1698 void *pos, *entry0, *entry1; 1663 void *pos, *entry0, *entry1;
1664 struct compat_ipt_entry *iter0;
1665 struct ipt_entry *iter1;
1699 unsigned int size; 1666 unsigned int size;
1700 int ret; 1667 int ret;
1701 1668
@@ -1714,13 +1681,17 @@ translate_compat_table(const char *name,
1714 j = 0; 1681 j = 0;
1715 xt_compat_lock(AF_INET); 1682 xt_compat_lock(AF_INET);
1716 /* Walk through entries, checking offsets. */ 1683 /* Walk through entries, checking offsets. */
1717 ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, 1684 xt_entry_foreach(iter0, entry0, total_size) {
1718 check_compat_entry_size_and_hooks, 1685 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
1719 info, &size, entry0, 1686 entry0,
1720 entry0 + total_size, 1687 entry0 + total_size,
1721 hook_entries, underflows, &j, name); 1688 hook_entries,
1722 if (ret != 0) 1689 underflows,
1723 goto out_unlock; 1690 name);
1691 if (ret != 0)
1692 goto out_unlock;
1693 ++j;
1694 }
1724 1695
1725 ret = -EINVAL; 1696 ret = -EINVAL;
1726 if (j != number) { 1697 if (j != number) {
@@ -1759,9 +1730,12 @@ translate_compat_table(const char *name,
1759 entry1 = newinfo->entries[raw_smp_processor_id()]; 1730 entry1 = newinfo->entries[raw_smp_processor_id()];
1760 pos = entry1; 1731 pos = entry1;
1761 size = total_size; 1732 size = total_size;
1762 ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, 1733 xt_entry_foreach(iter0, entry0, total_size) {
1763 compat_copy_entry_from_user, 1734 ret = compat_copy_entry_from_user(iter0, &pos, &size,
1764 &pos, &size, name, newinfo, entry1); 1735 name, newinfo, entry1);
1736 if (ret != 0)
1737 break;
1738 }
1765 xt_compat_flush_offsets(AF_INET); 1739 xt_compat_flush_offsets(AF_INET);
1766 xt_compat_unlock(AF_INET); 1740 xt_compat_unlock(AF_INET);
1767 if (ret) 1741 if (ret)
@@ -1772,13 +1746,35 @@ translate_compat_table(const char *name,
1772 goto free_newinfo; 1746 goto free_newinfo;
1773 1747
1774 i = 0; 1748 i = 0;
1775 ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, 1749 xt_entry_foreach(iter1, entry1, newinfo->size) {
1776 name, &i); 1750 ret = compat_check_entry(iter1, net, name);
1751 if (ret != 0)
1752 break;
1753 ++i;
1754 if (strcmp(ipt_get_target(iter1)->u.user.name,
1755 XT_ERROR_TARGET) == 0)
1756 ++newinfo->stacksize;
1757 }
1777 if (ret) { 1758 if (ret) {
1759 /*
1760 * The first i matches need cleanup_entry (calls ->destroy)
1761 * because they had called ->check already. The other j-i
1762 * entries need only release.
1763 */
1764 int skip = i;
1778 j -= i; 1765 j -= i;
1779 COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, 1766 xt_entry_foreach(iter0, entry0, newinfo->size) {
1780 compat_release_entry, &j); 1767 if (skip-- > 0)
1781 IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); 1768 continue;
1769 if (j-- == 0)
1770 break;
1771 compat_release_entry(iter0);
1772 }
1773 xt_entry_foreach(iter1, entry1, newinfo->size) {
1774 if (i-- == 0)
1775 break;
1776 cleanup_entry(iter1, net);
1777 }
1782 xt_free_table_info(newinfo); 1778 xt_free_table_info(newinfo);
1783 return ret; 1779 return ret;
1784 } 1780 }
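
The error path above makes the cleanup asymmetry explicit, as the new comment says: the first i translated entries already ran their ->check hooks and need full cleanup_entry(), while the remaining j-i compat entries only hold module references and need compat_release_entry(). A minimal model of the skip/count bookkeeping, with full_cleanup() and release_only() as invented placeholders for those two kernel functions:

/* unwind2.c -- model of the two-pass error cleanup above */
#include <stdio.h>

#define NENTRIES 5

static void full_cleanup(int idx)   /* stand-in for cleanup_entry() */
{
    printf("cleanup_entry(%d)\n", idx);
}

static void release_only(int idx)   /* stand-in for compat_release_entry() */
{
    printf("compat_release_entry(%d)\n", idx);
}

int main(void)
{
    unsigned int j = NENTRIES;      /* entries that passed the size checks */
    unsigned int i = 2;             /* of those, entries whose ->check ran */
    int skip = i, idx;

    j -= i;                         /* only j-i still need a bare release */
    for (idx = 0; idx < NENTRIES; idx++) {
        if (skip-- > 0)             /* first i: handled by the next pass */
            continue;
        if (j-- == 0)
            break;
        release_only(idx);
    }
    for (idx = 0; idx < NENTRIES; idx++) {
        if (i-- == 0)               /* full cleanup for the first i only */
            break;
        full_cleanup(idx);
    }
    return 0;
}

With i = 2 and five entries this releases entries 2..4 and fully cleans entries 0..1, matching the split the comment describes.
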
@@ -1796,7 +1792,11 @@ translate_compat_table(const char *name,
1796free_newinfo: 1792free_newinfo:
1797 xt_free_table_info(newinfo); 1793 xt_free_table_info(newinfo);
1798out: 1794out:
1799 COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); 1795 xt_entry_foreach(iter0, entry0, total_size) {
1796 if (j-- == 0)
1797 break;
1798 compat_release_entry(iter0);
1799 }
1800 return ret; 1800 return ret;
1801out_unlock: 1801out_unlock:
1802 xt_compat_flush_offsets(AF_INET); 1802 xt_compat_flush_offsets(AF_INET);
@@ -1811,6 +1811,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1811 struct compat_ipt_replace tmp; 1811 struct compat_ipt_replace tmp;
1812 struct xt_table_info *newinfo; 1812 struct xt_table_info *newinfo;
1813 void *loc_cpu_entry; 1813 void *loc_cpu_entry;
1814 struct ipt_entry *iter;
1814 1815
1815 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) 1816 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1816 return -EFAULT; 1817 return -EFAULT;
@@ -1833,7 +1834,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1833 goto free_newinfo; 1834 goto free_newinfo;
1834 } 1835 }
1835 1836
1836 ret = translate_compat_table(tmp.name, tmp.valid_hooks, 1837 ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
1837 &newinfo, &loc_cpu_entry, tmp.size, 1838 &newinfo, &loc_cpu_entry, tmp.size,
1838 tmp.num_entries, tmp.hook_entry, 1839 tmp.num_entries, tmp.hook_entry,
1839 tmp.underflow); 1840 tmp.underflow);
@@ -1849,7 +1850,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1849 return 0; 1850 return 0;
1850 1851
1851 free_newinfo_untrans: 1852 free_newinfo_untrans:
1852 IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); 1853 xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
1854 cleanup_entry(iter, net);
1853 free_newinfo: 1855 free_newinfo:
1854 xt_free_table_info(newinfo); 1856 xt_free_table_info(newinfo);
1855 return ret; 1857 return ret;
@@ -1898,6 +1900,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
1898 int ret = 0; 1900 int ret = 0;
1899 const void *loc_cpu_entry; 1901 const void *loc_cpu_entry;
1900 unsigned int i = 0; 1902 unsigned int i = 0;
1903 struct ipt_entry *iter;
1901 1904
1902 counters = alloc_counters(table); 1905 counters = alloc_counters(table);
1903 if (IS_ERR(counters)) 1906 if (IS_ERR(counters))
@@ -1910,9 +1913,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
1910 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1913 loc_cpu_entry = private->entries[raw_smp_processor_id()];
1911 pos = userptr; 1914 pos = userptr;
1912 size = total_size; 1915 size = total_size;
1913 ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size, 1916 xt_entry_foreach(iter, loc_cpu_entry, total_size) {
1914 compat_copy_entry_to_user, 1917 ret = compat_copy_entry_to_user(iter, &pos,
1915 &pos, &size, counters, &i); 1918 &size, counters, i++);
1919 if (ret != 0)
1920 break;
1921 }
1916 1922
1917 vfree(counters); 1923 vfree(counters);
1918 return ret; 1924 return ret;
@@ -2071,8 +2077,7 @@ struct xt_table *ipt_register_table(struct net *net,
2071{ 2077{
2072 int ret; 2078 int ret;
2073 struct xt_table_info *newinfo; 2079 struct xt_table_info *newinfo;
2074 struct xt_table_info bootstrap 2080 struct xt_table_info bootstrap = {0};
2075 = { 0, 0, 0, { 0 }, { 0 }, { } };
2076 void *loc_cpu_entry; 2081 void *loc_cpu_entry;
2077 struct xt_table *new_table; 2082 struct xt_table *new_table;
2078 2083
@@ -2086,11 +2091,7 @@ struct xt_table *ipt_register_table(struct net *net,
2086 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; 2091 loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
2087 memcpy(loc_cpu_entry, repl->entries, repl->size); 2092 memcpy(loc_cpu_entry, repl->entries, repl->size);
2088 2093
2089 ret = translate_table(table->name, table->valid_hooks, 2094 ret = translate_table(net, newinfo, loc_cpu_entry, repl);
2090 newinfo, loc_cpu_entry, repl->size,
2091 repl->num_entries,
2092 repl->hook_entry,
2093 repl->underflow);
2094 if (ret != 0) 2095 if (ret != 0)
2095 goto out_free; 2096 goto out_free;
2096 2097
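
The bootstrap initializer in the ipt_register_table() hunk above collapses to = {0}, relying on the C guarantee that a partially bracketed initializer zero-fills every remaining member, nested arrays included, so the old { 0, 0, 0, { 0 }, { 0 }, { } } spelling was redundant. A small standalone check of that assumption (the struct layout here is illustrative, not xt_table_info's real one):

/* zeroinit.c -- why "= {0}" is enough for the bootstrap table info */
#include <stdio.h>
#include <string.h>

struct info {                       /* illustrative layout only */
    unsigned int size;
    unsigned int number;
    unsigned int initial_entries;
    unsigned int hook_entry[5];
    unsigned int underflow[5];
};

int main(void)
{
    struct info bootstrap = {0};    /* remaining members zero-filled by C */
    struct info zeroed;

    memset(&zeroed, 0, sizeof(zeroed));
    printf("identical: %d\n",
           memcmp(&bootstrap, &zeroed, sizeof(zeroed)) == 0);
    return 0;
}
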
@@ -2108,17 +2109,19 @@ out:
2108 return ERR_PTR(ret); 2109 return ERR_PTR(ret);
2109} 2110}
2110 2111
2111void ipt_unregister_table(struct xt_table *table) 2112void ipt_unregister_table(struct net *net, struct xt_table *table)
2112{ 2113{
2113 struct xt_table_info *private; 2114 struct xt_table_info *private;
2114 void *loc_cpu_entry; 2115 void *loc_cpu_entry;
2115 struct module *table_owner = table->me; 2116 struct module *table_owner = table->me;
2117 struct ipt_entry *iter;
2116 2118
2117 private = xt_unregister_table(table); 2119 private = xt_unregister_table(table);
2118 2120
2119 /* Decrease module usage counts and free resources */ 2121 /* Decrease module usage counts and free resources */
2120 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 2122 loc_cpu_entry = private->entries[raw_smp_processor_id()];
2121 IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); 2123 xt_entry_foreach(iter, loc_cpu_entry, private->size)
2124 cleanup_entry(iter, net);
2122 if (private->number > private->initial_entries) 2125 if (private->number > private->initial_entries)
2123 module_put(table_owner); 2126 module_put(table_owner);
2124 xt_free_table_info(private); 2127 xt_free_table_info(private);
@@ -2136,7 +2139,7 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
2136} 2139}
2137 2140
2138static bool 2141static bool
2139icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) 2142icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
2140{ 2143{
2141 const struct icmphdr *ic; 2144 const struct icmphdr *ic;
2142 struct icmphdr _icmph; 2145 struct icmphdr _icmph;
@@ -2152,7 +2155,7 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
2152 * can't. Hence, no choice but to drop. 2155 * can't. Hence, no choice but to drop.
2153 */ 2156 */
2154 duprintf("Dropping evil ICMP tinygram.\n"); 2157 duprintf("Dropping evil ICMP tinygram.\n");
2155 *par->hotdrop = true; 2158 par->hotdrop = true;
2156 return false; 2159 return false;
2157 } 2160 }
2158 2161
@@ -2163,31 +2166,31 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
2163 !!(icmpinfo->invflags&IPT_ICMP_INV)); 2166 !!(icmpinfo->invflags&IPT_ICMP_INV));
2164} 2167}
2165 2168
2166static bool icmp_checkentry(const struct xt_mtchk_param *par) 2169static int icmp_checkentry(const struct xt_mtchk_param *par)
2167{ 2170{
2168 const struct ipt_icmp *icmpinfo = par->matchinfo; 2171 const struct ipt_icmp *icmpinfo = par->matchinfo;
2169 2172
2170 /* Must specify no unknown invflags */ 2173 /* Must specify no unknown invflags */
2171 return !(icmpinfo->invflags & ~IPT_ICMP_INV); 2174 return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
2172} 2175}
2173 2176
2174/* The built-in targets: standard (NULL) and error. */ 2177static struct xt_target ipt_builtin_tg[] __read_mostly = {
2175static struct xt_target ipt_standard_target __read_mostly = { 2178 {
2176 .name = IPT_STANDARD_TARGET, 2179 .name = IPT_STANDARD_TARGET,
2177 .targetsize = sizeof(int), 2180 .targetsize = sizeof(int),
2178 .family = NFPROTO_IPV4, 2181 .family = NFPROTO_IPV4,
2179#ifdef CONFIG_COMPAT 2182#ifdef CONFIG_COMPAT
2180 .compatsize = sizeof(compat_int_t), 2183 .compatsize = sizeof(compat_int_t),
2181 .compat_from_user = compat_standard_from_user, 2184 .compat_from_user = compat_standard_from_user,
2182 .compat_to_user = compat_standard_to_user, 2185 .compat_to_user = compat_standard_to_user,
2183#endif 2186#endif
2184}; 2187 },
2185 2188 {
2186static struct xt_target ipt_error_target __read_mostly = { 2189 .name = IPT_ERROR_TARGET,
2187 .name = IPT_ERROR_TARGET, 2190 .target = ipt_error,
2188 .target = ipt_error, 2191 .targetsize = IPT_FUNCTION_MAXNAMELEN,
2189 .targetsize = IPT_FUNCTION_MAXNAMELEN, 2192 .family = NFPROTO_IPV4,
2190 .family = NFPROTO_IPV4, 2193 },
2191}; 2194};
2192 2195
2193static struct nf_sockopt_ops ipt_sockopts = { 2196static struct nf_sockopt_ops ipt_sockopts = {
@@ -2207,13 +2210,15 @@ static struct nf_sockopt_ops ipt_sockopts = {
2207 .owner = THIS_MODULE, 2210 .owner = THIS_MODULE,
2208}; 2211};
2209 2212
2210static struct xt_match icmp_matchstruct __read_mostly = { 2213static struct xt_match ipt_builtin_mt[] __read_mostly = {
2211 .name = "icmp", 2214 {
2212 .match = icmp_match, 2215 .name = "icmp",
2213 .matchsize = sizeof(struct ipt_icmp), 2216 .match = icmp_match,
2214 .checkentry = icmp_checkentry, 2217 .matchsize = sizeof(struct ipt_icmp),
2215 .proto = IPPROTO_ICMP, 2218 .checkentry = icmp_checkentry,
2216 .family = NFPROTO_IPV4, 2219 .proto = IPPROTO_ICMP,
2220 .family = NFPROTO_IPV4,
2221 },
2217}; 2222};
2218 2223
2219static int __net_init ip_tables_net_init(struct net *net) 2224static int __net_init ip_tables_net_init(struct net *net)
@@ -2240,13 +2245,10 @@ static int __init ip_tables_init(void)
2240 goto err1; 2245 goto err1;
2241 2246
2242 /* Noone else will be downing sem now, so we won't sleep */ 2247 /* Noone else will be downing sem now, so we won't sleep */
2243 ret = xt_register_target(&ipt_standard_target); 2248 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2244 if (ret < 0) 2249 if (ret < 0)
2245 goto err2; 2250 goto err2;
2246 ret = xt_register_target(&ipt_error_target); 2251 ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
2247 if (ret < 0)
2248 goto err3;
2249 ret = xt_register_match(&icmp_matchstruct);
2250 if (ret < 0) 2252 if (ret < 0)
2251 goto err4; 2253 goto err4;
2252 2254
@@ -2255,15 +2257,13 @@ static int __init ip_tables_init(void)
2255 if (ret < 0) 2257 if (ret < 0)
2256 goto err5; 2258 goto err5;
2257 2259
2258 printk(KERN_INFO "ip_tables: (C) 2000-2006 Netfilter Core Team\n"); 2260 pr_info("(C) 2000-2006 Netfilter Core Team\n");
2259 return 0; 2261 return 0;
2260 2262
2261err5: 2263err5:
2262 xt_unregister_match(&icmp_matchstruct); 2264 xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
2263err4: 2265err4:
2264 xt_unregister_target(&ipt_error_target); 2266 xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2265err3:
2266 xt_unregister_target(&ipt_standard_target);
2267err2: 2267err2:
2268 unregister_pernet_subsys(&ip_tables_net_ops); 2268 unregister_pernet_subsys(&ip_tables_net_ops);
2269err1: 2269err1:
@@ -2274,10 +2274,8 @@ static void __exit ip_tables_fini(void)
2274{ 2274{
2275 nf_unregister_sockopt(&ipt_sockopts); 2275 nf_unregister_sockopt(&ipt_sockopts);
2276 2276
2277 xt_unregister_match(&icmp_matchstruct); 2277 xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
2278 xt_unregister_target(&ipt_error_target); 2278 xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2279 xt_unregister_target(&ipt_standard_target);
2280
2281 unregister_pernet_subsys(&ip_tables_net_ops); 2279 unregister_pernet_subsys(&ip_tables_net_ops);
2282} 2280}
2283 2281
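
With the built-in standard/error targets and the icmp match folded into the ipt_builtin_tg[] and ipt_builtin_mt[] arrays above, registration shrinks to one xt_register_targets()/xt_register_matches() call each, and the err2..err5 label ladder in ip_tables_init() loses two rungs. Assuming the array helpers unwind any partially registered entries on failure, which is what makes a single error label per array sufficient, the pattern looks roughly like this standalone sketch:

/* regmany.c -- array registration with rollback, as assumed of
 * xt_register_targets()/xt_register_matches() */
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct target { const char *name; };

static struct target builtin_tg[] = {
    { .name = "standard" },
    { .name = "error" },
};

static int register_one(const struct target *t)
{
    printf("registered %s\n", t->name);
    return 0;
}

static void unregister_one(const struct target *t)
{
    printf("unregistered %s\n", t->name);
}

static int register_many(const struct target *tg, unsigned int n)
{
    unsigned int i;
    int ret;

    for (i = 0; i < n; i++) {
        ret = register_one(&tg[i]);
        if (ret < 0) {
            /* unwind the ones already registered */
            while (i-- > 0)
                unregister_one(&tg[i]);
            return ret;
        }
    }
    return 0;
}

int main(void)
{
    return register_many(builtin_tg, ARRAY_SIZE(builtin_tg));
}
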
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 40ca2d240abb..3a43cf36db87 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -9,11 +9,13 @@
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 * 10 *
11 */ 11 */
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
14#include <linux/jhash.h> 15#include <linux/jhash.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
18#include <linux/slab.h>
17#include <linux/ip.h> 19#include <linux/ip.h>
18#include <linux/tcp.h> 20#include <linux/tcp.h>
19#include <linux/udp.h> 21#include <linux/udp.h>
@@ -51,12 +53,13 @@ struct clusterip_config {
51#endif 53#endif
52 enum clusterip_hashmode hash_mode; /* which hashing mode */ 54 enum clusterip_hashmode hash_mode; /* which hashing mode */
53 u_int32_t hash_initval; /* hash initialization */ 55 u_int32_t hash_initval; /* hash initialization */
56 struct rcu_head rcu;
54}; 57};
55 58
56static LIST_HEAD(clusterip_configs); 59static LIST_HEAD(clusterip_configs);
57 60
58/* clusterip_lock protects the clusterip_configs list */ 61/* clusterip_lock protects the clusterip_configs list */
59static DEFINE_RWLOCK(clusterip_lock); 62static DEFINE_SPINLOCK(clusterip_lock);
60 63
61#ifdef CONFIG_PROC_FS 64#ifdef CONFIG_PROC_FS
62static const struct file_operations clusterip_proc_fops; 65static const struct file_operations clusterip_proc_fops;
@@ -69,11 +72,17 @@ clusterip_config_get(struct clusterip_config *c)
69 atomic_inc(&c->refcount); 72 atomic_inc(&c->refcount);
70} 73}
71 74
75
76static void clusterip_config_rcu_free(struct rcu_head *head)
77{
78 kfree(container_of(head, struct clusterip_config, rcu));
79}
80
72static inline void 81static inline void
73clusterip_config_put(struct clusterip_config *c) 82clusterip_config_put(struct clusterip_config *c)
74{ 83{
75 if (atomic_dec_and_test(&c->refcount)) 84 if (atomic_dec_and_test(&c->refcount))
76 kfree(c); 85 call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
77} 86}
78 87
79/* decrease the count of entries using/referencing this config. If last 88/* decrease the count of entries using/referencing this config. If last
@@ -82,12 +91,13 @@ clusterip_config_put(struct clusterip_config *c)
82static inline void 91static inline void
83clusterip_config_entry_put(struct clusterip_config *c) 92clusterip_config_entry_put(struct clusterip_config *c)
84{ 93{
85 write_lock_bh(&clusterip_lock); 94 local_bh_disable();
86 if (atomic_dec_and_test(&c->entries)) { 95 if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
87 list_del(&c->list); 96 list_del_rcu(&c->list);
88 write_unlock_bh(&clusterip_lock); 97 spin_unlock(&clusterip_lock);
98 local_bh_enable();
89 99
90 dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); 100 dev_mc_del(c->dev, c->clustermac);
91 dev_put(c->dev); 101 dev_put(c->dev);
92 102
93 /* In case anyone still accesses the file, the open/close 103 /* In case anyone still accesses the file, the open/close
@@ -98,7 +108,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
98#endif 108#endif
99 return; 109 return;
100 } 110 }
101 write_unlock_bh(&clusterip_lock); 111 local_bh_enable();
102} 112}
103 113
104static struct clusterip_config * 114static struct clusterip_config *
@@ -106,7 +116,7 @@ __clusterip_config_find(__be32 clusterip)
106{ 116{
107 struct clusterip_config *c; 117 struct clusterip_config *c;
108 118
109 list_for_each_entry(c, &clusterip_configs, list) { 119 list_for_each_entry_rcu(c, &clusterip_configs, list) {
110 if (c->clusterip == clusterip) 120 if (c->clusterip == clusterip)
111 return c; 121 return c;
112 } 122 }
@@ -119,16 +129,15 @@ clusterip_config_find_get(__be32 clusterip, int entry)
119{ 129{
120 struct clusterip_config *c; 130 struct clusterip_config *c;
121 131
122 read_lock_bh(&clusterip_lock); 132 rcu_read_lock_bh();
123 c = __clusterip_config_find(clusterip); 133 c = __clusterip_config_find(clusterip);
124 if (!c) { 134 if (c) {
125 read_unlock_bh(&clusterip_lock); 135 if (unlikely(!atomic_inc_not_zero(&c->refcount)))
126 return NULL; 136 c = NULL;
137 else if (entry)
138 atomic_inc(&c->entries);
127 } 139 }
128 atomic_inc(&c->refcount); 140 rcu_read_unlock_bh();
129 if (entry)
130 atomic_inc(&c->entries);
131 read_unlock_bh(&clusterip_lock);
132 141
133 return c; 142 return c;
134} 143}
@@ -179,9 +188,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
179 } 188 }
180#endif 189#endif
181 190
182 write_lock_bh(&clusterip_lock); 191 spin_lock_bh(&clusterip_lock);
183 list_add(&c->list, &clusterip_configs); 192 list_add_rcu(&c->list, &clusterip_configs);
184 write_unlock_bh(&clusterip_lock); 193 spin_unlock_bh(&clusterip_lock);
185 194
186 return c; 195 return c;
187} 196}
@@ -238,8 +247,7 @@ clusterip_hashfn(const struct sk_buff *skb,
238 break; 247 break;
239 default: 248 default:
240 if (net_ratelimit()) 249 if (net_ratelimit())
241 printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n", 250 pr_info("unknown protocol %u\n", iph->protocol);
242 iph->protocol);
243 sport = dport = 0; 251 sport = dport = 0;
244 } 252 }
245 253
@@ -261,7 +269,7 @@ clusterip_hashfn(const struct sk_buff *skb,
261 hashval = 0; 269 hashval = 0;
262 /* This cannot happen, unless the check function wasn't called 270 /* This cannot happen, unless the check function wasn't called
263 * at rule load time */ 271 * at rule load time */
264 printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode); 272 pr_info("unknown mode %u\n", config->hash_mode);
265 BUG(); 273 BUG();
266 break; 274 break;
267 } 275 }
@@ -281,7 +289,7 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
281 ***********************************************************************/ 289 ***********************************************************************/
282 290
283static unsigned int 291static unsigned int
284clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) 292clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
285{ 293{
286 const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; 294 const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
287 struct nf_conn *ct; 295 struct nf_conn *ct;
@@ -294,7 +302,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
294 302
295 ct = nf_ct_get(skb, &ctinfo); 303 ct = nf_ct_get(skb, &ctinfo);
296 if (ct == NULL) { 304 if (ct == NULL) {
297 printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); 305 pr_info("no conntrack!\n");
298 /* FIXME: need to drop invalid ones, since replies 306 /* FIXME: need to drop invalid ones, since replies
299 * to outgoing connections of other nodes will be 307 * to outgoing connections of other nodes will be
300 * marked as INVALID */ 308 * marked as INVALID */
@@ -347,25 +355,24 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
347 return XT_CONTINUE; 355 return XT_CONTINUE;
348} 356}
349 357
350static bool clusterip_tg_check(const struct xt_tgchk_param *par) 358static int clusterip_tg_check(const struct xt_tgchk_param *par)
351{ 359{
352 struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; 360 struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
353 const struct ipt_entry *e = par->entryinfo; 361 const struct ipt_entry *e = par->entryinfo;
354
355 struct clusterip_config *config; 362 struct clusterip_config *config;
363 int ret;
356 364
357 if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && 365 if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
358 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && 366 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
359 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { 367 cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
360 printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n", 368 pr_info("unknown mode %u\n", cipinfo->hash_mode);
361 cipinfo->hash_mode); 369 return -EINVAL;
362 return false;
363 370
364 } 371 }
365 if (e->ip.dmsk.s_addr != htonl(0xffffffff) || 372 if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
366 e->ip.dst.s_addr == 0) { 373 e->ip.dst.s_addr == 0) {
367 printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); 374 pr_info("Please specify destination IP\n");
368 return false; 375 return -EINVAL;
369 } 376 }
370 377
371 /* FIXME: further sanity checks */ 378 /* FIXME: further sanity checks */
@@ -373,41 +380,41 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
373 config = clusterip_config_find_get(e->ip.dst.s_addr, 1); 380 config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
374 if (!config) { 381 if (!config) {
375 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { 382 if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
376 printk(KERN_WARNING "CLUSTERIP: no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr); 383 pr_info("no config found for %pI4, need 'new'\n",
377 return false; 384 &e->ip.dst.s_addr);
385 return -EINVAL;
378 } else { 386 } else {
379 struct net_device *dev; 387 struct net_device *dev;
380 388
381 if (e->ip.iniface[0] == '\0') { 389 if (e->ip.iniface[0] == '\0') {
382 printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n"); 390 pr_info("Please specify an interface name\n");
383 return false; 391 return -EINVAL;
384 } 392 }
385 393
386 dev = dev_get_by_name(&init_net, e->ip.iniface); 394 dev = dev_get_by_name(&init_net, e->ip.iniface);
387 if (!dev) { 395 if (!dev) {
388 printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); 396 pr_info("no such interface %s\n",
389 return false; 397 e->ip.iniface);
398 return -ENOENT;
390 } 399 }
391 400
392 config = clusterip_config_init(cipinfo, 401 config = clusterip_config_init(cipinfo,
393 e->ip.dst.s_addr, dev); 402 e->ip.dst.s_addr, dev);
394 if (!config) { 403 if (!config) {
395 printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n"); 404 pr_info("cannot allocate config\n");
396 dev_put(dev); 405 dev_put(dev);
397 return false; 406 return -ENOMEM;
398 } 407 }
399 dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0); 408 dev_mc_add(config->dev, config->clustermac);
400 } 409 }
401 } 410 }
402 cipinfo->config = config; 411 cipinfo->config = config;
403 412
404 if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { 413 ret = nf_ct_l3proto_try_module_get(par->family);
405 printk(KERN_WARNING "can't load conntrack support for " 414 if (ret < 0)
406 "proto=%u\n", par->target->family); 415 pr_info("cannot load conntrack support for proto=%u\n",
407 return false; 416 par->family);
408 } 417 return ret;
409
410 return true;
411} 418}
412 419
413/* drop reference count of cluster config when rule is deleted */ 420/* drop reference count of cluster config when rule is deleted */
@@ -421,7 +428,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
421 428
422 clusterip_config_put(cipinfo->config); 429 clusterip_config_put(cipinfo->config);
423 430
424 nf_ct_l3proto_module_put(par->target->family); 431 nf_ct_l3proto_module_put(par->family);
425} 432}
426 433
427#ifdef CONFIG_COMPAT 434#ifdef CONFIG_COMPAT
@@ -462,7 +469,7 @@ struct arp_payload {
462 __be32 src_ip; 469 __be32 src_ip;
463 u_int8_t dst_hw[ETH_ALEN]; 470 u_int8_t dst_hw[ETH_ALEN];
464 __be32 dst_ip; 471 __be32 dst_ip;
465} __attribute__ ((packed)); 472} __packed;
466 473
467#ifdef DEBUG 474#ifdef DEBUG
468static void arp_print(struct arp_payload *payload) 475static void arp_print(struct arp_payload *payload)
@@ -478,8 +485,8 @@ static void arp_print(struct arp_payload *payload)
478 } 485 }
479 hbuffer[--k]='\0'; 486 hbuffer[--k]='\0';
480 487
481 printk("src %pI4@%s, dst %pI4\n", 488 pr_debug("src %pI4@%s, dst %pI4\n",
482 &payload->src_ip, hbuffer, &payload->dst_ip); 489 &payload->src_ip, hbuffer, &payload->dst_ip);
483} 490}
484#endif 491#endif
485 492
@@ -518,7 +525,7 @@ arp_mangle(unsigned int hook,
518 * this wouldn't work, since we didn't subscribe the mcast group on 525 * this wouldn't work, since we didn't subscribe the mcast group on
519 * other interfaces */ 526 * other interfaces */
520 if (c->dev != out) { 527 if (c->dev != out) {
521 pr_debug("CLUSTERIP: not mangling arp reply on different " 528 pr_debug("not mangling arp reply on different "
522 "interface: cip'%s'-skb'%s'\n", 529 "interface: cip'%s'-skb'%s'\n",
523 c->dev->name, out->name); 530 c->dev->name, out->name);
524 clusterip_config_put(c); 531 clusterip_config_put(c);
@@ -529,7 +536,7 @@ arp_mangle(unsigned int hook,
529 memcpy(payload->src_hw, c->clustermac, arp->ar_hln); 536 memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
530 537
531#ifdef DEBUG 538#ifdef DEBUG
532 pr_debug(KERN_DEBUG "CLUSTERIP mangled arp reply: "); 539 pr_debug("mangled arp reply: ");
533 arp_print(payload); 540 arp_print(payload);
534#endif 541#endif
535 542
@@ -560,8 +567,7 @@ struct clusterip_seq_position {
560 567
561static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) 568static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
562{ 569{
563 const struct proc_dir_entry *pde = s->private; 570 struct clusterip_config *c = s->private;
564 struct clusterip_config *c = pde->data;
565 unsigned int weight; 571 unsigned int weight;
566 u_int32_t local_nodes; 572 u_int32_t local_nodes;
567 struct clusterip_seq_position *idx; 573 struct clusterip_seq_position *idx;
@@ -601,7 +607,8 @@ static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
601 607
602static void clusterip_seq_stop(struct seq_file *s, void *v) 608static void clusterip_seq_stop(struct seq_file *s, void *v)
603{ 609{
604 kfree(v); 610 if (!IS_ERR(v))
611 kfree(v);
605} 612}
606 613
607static int clusterip_seq_show(struct seq_file *s, void *v) 614static int clusterip_seq_show(struct seq_file *s, void *v)
@@ -632,10 +639,9 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
632 639
633 if (!ret) { 640 if (!ret) {
634 struct seq_file *sf = file->private_data; 641 struct seq_file *sf = file->private_data;
635 struct proc_dir_entry *pde = PDE(inode); 642 struct clusterip_config *c = PDE(inode)->data;
636 struct clusterip_config *c = pde->data;
637 643
638 sf->private = pde; 644 sf->private = c;
639 645
640 clusterip_config_get(c); 646 clusterip_config_get(c);
641 } 647 }
@@ -645,8 +651,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
645 651
646static int clusterip_proc_release(struct inode *inode, struct file *file) 652static int clusterip_proc_release(struct inode *inode, struct file *file)
647{ 653{
648 struct proc_dir_entry *pde = PDE(inode); 654 struct clusterip_config *c = PDE(inode)->data;
649 struct clusterip_config *c = pde->data;
650 int ret; 655 int ret;
651 656
652 ret = seq_release(inode, file); 657 ret = seq_release(inode, file);
@@ -660,10 +665,9 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
660static ssize_t clusterip_proc_write(struct file *file, const char __user *input, 665static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
661 size_t size, loff_t *ofs) 666 size_t size, loff_t *ofs)
662{ 667{
668 struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
663#define PROC_WRITELEN 10 669#define PROC_WRITELEN 10
664 char buffer[PROC_WRITELEN+1]; 670 char buffer[PROC_WRITELEN+1];
665 const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
666 struct clusterip_config *c = pde->data;
667 unsigned long nodenum; 671 unsigned long nodenum;
668 672
669 if (copy_from_user(buffer, input, PROC_WRITELEN)) 673 if (copy_from_user(buffer, input, PROC_WRITELEN))
@@ -709,13 +713,13 @@ static int __init clusterip_tg_init(void)
709#ifdef CONFIG_PROC_FS 713#ifdef CONFIG_PROC_FS
710 clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); 714 clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
711 if (!clusterip_procdir) { 715 if (!clusterip_procdir) {
712 printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); 716 pr_err("Unable to proc dir entry\n");
713 ret = -ENOMEM; 717 ret = -ENOMEM;
714 goto cleanup_hook; 718 goto cleanup_hook;
715 } 719 }
716#endif /* CONFIG_PROC_FS */ 720#endif /* CONFIG_PROC_FS */
717 721
718 printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n", 722 pr_info("ClusterIP Version %s loaded successfully\n",
719 CLUSTERIP_VERSION); 723 CLUSTERIP_VERSION);
720 return 0; 724 return 0;
721 725
@@ -730,13 +734,15 @@ cleanup_target:
730 734
731static void __exit clusterip_tg_exit(void) 735static void __exit clusterip_tg_exit(void)
732{ 736{
733 printk(KERN_NOTICE "ClusterIP Version %s unloading\n", 737 pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
734 CLUSTERIP_VERSION);
735#ifdef CONFIG_PROC_FS 738#ifdef CONFIG_PROC_FS
736 remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); 739 remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
737#endif 740#endif
738 nf_unregister_hook(&cip_arp_ops); 741 nf_unregister_hook(&cip_arp_ops);
739 xt_unregister_target(&clusterip_tg_reg); 742 xt_unregister_target(&clusterip_tg_reg);
743
744 /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
745 rcu_barrier_bh();
740} 746}
741 747
742module_init(clusterip_tg_init); 748module_init(clusterip_tg_init);
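
The CLUSTERIP conversion above swaps the rwlock-protected config list for RCU: lookups run under rcu_read_lock_bh() over a list_for_each_entry_rcu() walk, freeing is deferred through call_rcu_bh(), and module exit waits for pending callbacks with rcu_barrier_bh(). The key subtlety is atomic_inc_not_zero() in clusterip_config_find_get(): an RCU reader can still see a config whose refcount has already dropped to zero, so it must refuse to resurrect it and treat the entry as not found. A userspace model of that check using C11 atomics:

/* incnz.c -- C11 model of the atomic_inc_not_zero() lookup above */
#include <stdatomic.h>
#include <stdio.h>

/* Under RCU a reader can find an object whose last reference is already
 * gone but whose memory has not been reclaimed yet.  Taking a new
 * reference is only legal while the count is still non-zero. */
static int inc_not_zero(atomic_int *ref)
{
    int old = atomic_load(ref);

    while (old != 0) {
        /* on failure, 'old' is reloaded and the test repeats */
        if (atomic_compare_exchange_weak(ref, &old, old + 1))
            return 1;               /* got a reference */
    }
    return 0;                       /* object is dying; treat as not found */
}

int main(void)
{
    atomic_int live = 2;            /* still referenced */
    atomic_int dying = 0;           /* waiting for call_rcu_bh() to free it */

    printf("live:  %s\n", inc_not_zero(&live)  ? "got ref" : "miss");
    printf("dying: %s\n", inc_not_zero(&dying) ? "got ref" : "miss");
    return 0;
}
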
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ea5cea2415c1..4bf3dc49ad1e 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -6,7 +6,7 @@
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8*/ 8*/
9 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/in.h> 10#include <linux/in.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/skbuff.h> 12#include <linux/skbuff.h>
@@ -77,7 +77,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
77} 77}
78 78
79static unsigned int 79static unsigned int
80ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) 80ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
81{ 81{
82 const struct ipt_ECN_info *einfo = par->targinfo; 82 const struct ipt_ECN_info *einfo = par->targinfo;
83 83
@@ -93,28 +93,25 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
93 return XT_CONTINUE; 93 return XT_CONTINUE;
94} 94}
95 95
96static bool ecn_tg_check(const struct xt_tgchk_param *par) 96static int ecn_tg_check(const struct xt_tgchk_param *par)
97{ 97{
98 const struct ipt_ECN_info *einfo = par->targinfo; 98 const struct ipt_ECN_info *einfo = par->targinfo;
99 const struct ipt_entry *e = par->entryinfo; 99 const struct ipt_entry *e = par->entryinfo;
100 100
101 if (einfo->operation & IPT_ECN_OP_MASK) { 101 if (einfo->operation & IPT_ECN_OP_MASK) {
102 printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", 102 pr_info("unsupported ECN operation %x\n", einfo->operation);
103 einfo->operation); 103 return -EINVAL;
104 return false;
105 } 104 }
106 if (einfo->ip_ect & ~IPT_ECN_IP_MASK) { 105 if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
107 printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n", 106 pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
108 einfo->ip_ect); 107 return -EINVAL;
109 return false;
110 } 108 }
111 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) && 109 if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
112 (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { 110 (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
113 printk(KERN_WARNING "ECN: cannot use TCP operations on a " 111 pr_info("cannot use TCP operations on a non-tcp rule\n");
114 "non-tcp rule\n"); 112 return -EINVAL;
115 return false;
116 } 113 }
117 return true; 114 return 0;
118} 115}
119 116
120static struct xt_target ecn_tg_reg __read_mostly = { 117static struct xt_target ecn_tg_reg __read_mostly = {
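
ecn_tg_check() above follows the convention applied across this whole series: checkentry hooks move from bool to int, returning 0 on success or a negative errno so callers can distinguish bad parameters (-EINVAL) from, say, a missing interface (-ENOENT) or allocation failure (-ENOMEM), both visible in the CLUSTERIP hunks earlier. A toy illustration of the convention, where struct params and SUPPORTED_OPS are made up for the example:

/* chk.c -- the bool -> int checkentry convention */
#include <errno.h>
#include <stdio.h>
#include <string.h>

struct params { unsigned int operation; };   /* invented for illustration */

#define SUPPORTED_OPS 0x3

static int ecn_style_check(const struct params *p)
{
    if (p->operation & ~SUPPORTED_OPS)
        return -EINVAL;     /* was: return false */
    return 0;               /* was: return true  */
}

int main(void)
{
    struct params bad = { .operation = 0x8 };
    int ret = ecn_style_check(&bad);

    if (ret < 0)
        printf("rule rejected: %s\n", strerror(-ret));
    return 0;
}
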
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ee128efa1c8d..915fc17d7ce2 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -9,10 +9,11 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/skbuff.h> 15#include <linux/skbuff.h>
16#include <linux/if_arp.h>
16#include <linux/ip.h> 17#include <linux/ip.h>
17#include <net/icmp.h> 18#include <net/icmp.h>
18#include <net/udp.h> 19#include <net/udp.h>
@@ -363,11 +364,47 @@ static void dump_packet(const struct nf_loginfo *info,
363 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 364 /* maxlen = 230+ 91 + 230 + 252 = 803 */
364} 365}
365 366
367static void dump_mac_header(const struct nf_loginfo *info,
368 const struct sk_buff *skb)
369{
370 struct net_device *dev = skb->dev;
371 unsigned int logflags = 0;
372
373 if (info->type == NF_LOG_TYPE_LOG)
374 logflags = info->u.log.logflags;
375
376 if (!(logflags & IPT_LOG_MACDECODE))
377 goto fallback;
378
379 switch (dev->type) {
380 case ARPHRD_ETHER:
381 printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
383 ntohs(eth_hdr(skb)->h_proto));
384 return;
385 default:
386 break;
387 }
388
389fallback:
390 printk("MAC=");
391 if (dev->hard_header_len &&
392 skb->mac_header != skb->network_header) {
393 const unsigned char *p = skb_mac_header(skb);
394 unsigned int i;
395
396 printk("%02x", *p++);
397 for (i = 1; i < dev->hard_header_len; i++, p++)
398 printk(":%02x", *p);
399 }
400 printk(" ");
401}
402
366static struct nf_loginfo default_loginfo = { 403static struct nf_loginfo default_loginfo = {
367 .type = NF_LOG_TYPE_LOG, 404 .type = NF_LOG_TYPE_LOG,
368 .u = { 405 .u = {
369 .log = { 406 .log = {
370 .level = 0, 407 .level = 5,
371 .logflags = NF_LOG_MASK, 408 .logflags = NF_LOG_MASK,
372 }, 409 },
373 }, 410 },
@@ -404,20 +441,9 @@ ipt_log_packet(u_int8_t pf,
404 } 441 }
405#endif 442#endif
406 443
407 if (in && !out) { 444 /* MAC logging for input path only. */
408 /* MAC logging for input chain only. */ 445 if (in && !out)
409 printk("MAC="); 446 dump_mac_header(loginfo, skb);
410 if (skb->dev && skb->dev->hard_header_len &&
411 skb->mac_header != skb->network_header) {
412 int i;
413 const unsigned char *p = skb_mac_header(skb);
414 for (i = 0; i < skb->dev->hard_header_len; i++,p++)
415 printk("%02x%c", *p,
416 i==skb->dev->hard_header_len - 1
417 ? ' ':':');
418 } else
419 printk(" ");
420 }
421 447
422 dump_packet(loginfo, skb, 0); 448 dump_packet(loginfo, skb, 0);
423 printk("\n"); 449 printk("\n");
@@ -425,7 +451,7 @@ ipt_log_packet(u_int8_t pf,
425} 451}
426 452
427static unsigned int 453static unsigned int
428log_tg(struct sk_buff *skb, const struct xt_target_param *par) 454log_tg(struct sk_buff *skb, const struct xt_action_param *par)
429{ 455{
430 const struct ipt_log_info *loginfo = par->targinfo; 456 const struct ipt_log_info *loginfo = par->targinfo;
431 struct nf_loginfo li; 457 struct nf_loginfo li;
@@ -439,20 +465,19 @@ log_tg(struct sk_buff *skb, const struct xt_target_param *par)
439 return XT_CONTINUE; 465 return XT_CONTINUE;
440} 466}
441 467
442static bool log_tg_check(const struct xt_tgchk_param *par) 468static int log_tg_check(const struct xt_tgchk_param *par)
443{ 469{
444 const struct ipt_log_info *loginfo = par->targinfo; 470 const struct ipt_log_info *loginfo = par->targinfo;
445 471
446 if (loginfo->level >= 8) { 472 if (loginfo->level >= 8) {
447 pr_debug("LOG: level %u >= 8\n", loginfo->level); 473 pr_debug("level %u >= 8\n", loginfo->level);
448 return false; 474 return -EINVAL;
449 } 475 }
450 if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { 476 if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
451 pr_debug("LOG: prefix term %i\n", 477 pr_debug("prefix is not null-terminated\n");
452 loginfo->prefix[sizeof(loginfo->prefix)-1]); 478 return -EINVAL;
453 return false;
454 } 479 }
455 return true; 480 return 0;
456} 481}
457 482
458static struct xt_target log_tg_reg __read_mostly = { 483static struct xt_target log_tg_reg __read_mostly = {
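
The new dump_mac_header() above either decodes an Ethernet header into MACSRC=/MACDST=/MACPROTO= fields (when IPT_LOG_MACDECODE is set and the device is ARPHRD_ETHER) or falls back to the old raw colon-separated hex dump. The same hunk also raises default_loginfo's level from 0 to 5, i.e. KERN_NOTICE rather than KERN_EMERG. A userspace approximation of the two output styles, with arbitrary sample bytes:

/* macdump.c -- the two output styles of dump_mac_header(), approximated */
#include <stdio.h>

#define ETH_ALEN 6

static void print_decoded(const unsigned char *src,
                          const unsigned char *dst, unsigned int proto)
{
    printf("MACSRC=%02x:%02x:%02x:%02x:%02x:%02x "
           "MACDST=%02x:%02x:%02x:%02x:%02x:%02x MACPROTO=%04x\n",
           src[0], src[1], src[2], src[3], src[4], src[5],
           dst[0], dst[1], dst[2], dst[3], dst[4], dst[5], proto);
}

static void print_raw(const unsigned char *hdr, unsigned int len)
{
    unsigned int i;

    printf("MAC=%02x", hdr[0]);
    for (i = 1; i < len; i++)
        printf(":%02x", hdr[i]);
    printf("\n");
}

int main(void)
{
    /* an Ethernet header is dst(6), src(6), ethertype(2) */
    unsigned char hdr[2 * ETH_ALEN + 2] = {
        0x00, 0x11, 0x22, 0x33, 0x44, 0x55,   /* dst */
        0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb,   /* src */
        0x08, 0x00,                           /* ETH_P_IP */
    };

    print_decoded(hdr + ETH_ALEN, hdr, 0x0800);  /* IPT_LOG_MACDECODE set */
    print_raw(hdr, sizeof(hdr));                 /* fallback path */
    return 0;
}
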
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 650b54042b01..d2ed9dc74ebc 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -8,7 +8,7 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/inetdevice.h> 13#include <linux/inetdevice.h>
14#include <linux/ip.h> 14#include <linux/ip.h>
@@ -28,23 +28,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
28MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); 28MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
29 29
30/* FIXME: Multiple targets. --RR */ 30/* FIXME: Multiple targets. --RR */
31static bool masquerade_tg_check(const struct xt_tgchk_param *par) 31static int masquerade_tg_check(const struct xt_tgchk_param *par)
32{ 32{
33 const struct nf_nat_multi_range_compat *mr = par->targinfo; 33 const struct nf_nat_multi_range_compat *mr = par->targinfo;
34 34
35 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 35 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
36 pr_debug("masquerade_check: bad MAP_IPS.\n"); 36 pr_debug("bad MAP_IPS.\n");
37 return false; 37 return -EINVAL;
38 } 38 }
39 if (mr->rangesize != 1) { 39 if (mr->rangesize != 1) {
40 pr_debug("masquerade_check: bad rangesize %u\n", mr->rangesize); 40 pr_debug("bad rangesize %u\n", mr->rangesize);
41 return false; 41 return -EINVAL;
42 } 42 }
43 return true; 43 return 0;
44} 44}
45 45
46static unsigned int 46static unsigned int
47masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) 47masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
48{ 48{
49 struct nf_conn *ct; 49 struct nf_conn *ct;
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
@@ -72,7 +72,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
72 rt = skb_rtable(skb); 72 rt = skb_rtable(skb);
73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); 73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
74 if (!newsrc) { 74 if (!newsrc) {
75 printk("MASQUERADE: %s ate my IP address\n", par->out->name); 75 pr_info("%s ate my IP address\n", par->out->name);
76 return NF_DROP; 76 return NF_DROP;
77 } 77 }
78 78
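
This file's changes are mostly mechanical: the pr_fmt() define added at the top makes every pr_* call prepend "modname: " automatically, which is why the hand-written "MASQUERADE: " and "masquerade_check: " prefixes disappear from the message strings. A userspace sketch of the mechanism, with printf standing in for printk:

/* prfmt.c -- userspace model of the kernel's pr_fmt() prefixing */
#include <stdio.h>

#define KBUILD_MODNAME "ipt_MASQUERADE"
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* the kernel's pr_info() expands its format through pr_fmt() like this: */
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
    pr_info("%s ate my IP address\n", "eth0");
    /* prints: ipt_MASQUERADE: eth0 ate my IP address */
    return 0;
}

Because pr_fmt() works by string-literal concatenation at compile time, it costs nothing at runtime and keeps the prefix consistent across every message in the module.
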
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 7c29582d4ec8..6cdb298f1035 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -9,7 +9,7 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/ip.h> 13#include <linux/ip.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
@@ -22,23 +22,23 @@ MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); 22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); 23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
24 24
25static bool netmap_tg_check(const struct xt_tgchk_param *par) 25static int netmap_tg_check(const struct xt_tgchk_param *par)
26{ 26{
27 const struct nf_nat_multi_range_compat *mr = par->targinfo; 27 const struct nf_nat_multi_range_compat *mr = par->targinfo;
28 28
29 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { 29 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
30 pr_debug("NETMAP:check: bad MAP_IPS.\n"); 30 pr_debug("bad MAP_IPS.\n");
31 return false; 31 return -EINVAL;
32 } 32 }
33 if (mr->rangesize != 1) { 33 if (mr->rangesize != 1) {
34 pr_debug("NETMAP:check: bad rangesize %u.\n", mr->rangesize); 34 pr_debug("bad rangesize %u.\n", mr->rangesize);
35 return false; 35 return -EINVAL;
36 } 36 }
37 return true; 37 return 0;
38} 38}
39 39
40static unsigned int 40static unsigned int
41netmap_tg(struct sk_buff *skb, const struct xt_target_param *par) 41netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
42{ 42{
43 struct nf_conn *ct; 43 struct nf_conn *ct;
44 enum ip_conntrack_info ctinfo; 44 enum ip_conntrack_info ctinfo;
@@ -48,7 +48,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_target_param *par)
48 48
49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || 49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
50 par->hooknum == NF_INET_POST_ROUTING || 50 par->hooknum == NF_INET_POST_ROUTING ||
51 par->hooknum == NF_INET_LOCAL_OUT); 51 par->hooknum == NF_INET_LOCAL_OUT ||
52 par->hooknum == NF_INET_LOCAL_IN);
52 ct = nf_ct_get(skb, &ctinfo); 53 ct = nf_ct_get(skb, &ctinfo);
53 54
54 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 55 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
@@ -77,7 +78,8 @@ static struct xt_target netmap_tg_reg __read_mostly = {
77 .table = "nat", 78 .table = "nat",
78 .hooks = (1 << NF_INET_PRE_ROUTING) | 79 .hooks = (1 << NF_INET_PRE_ROUTING) |
79 (1 << NF_INET_POST_ROUTING) | 80 (1 << NF_INET_POST_ROUTING) |
80 (1 << NF_INET_LOCAL_OUT), 81 (1 << NF_INET_LOCAL_OUT) |
82 (1 << NF_INET_LOCAL_IN),
81 .checkentry = netmap_tg_check, 83 .checkentry = netmap_tg_check,
82 .me = THIS_MODULE 84 .me = THIS_MODULE
83}; 85};
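
NETMAP's mask handling, visible in the context of the hunk above, derives the subnet mask directly from the configured range: bits where min_ip and max_ip agree are network bits, so ~(min ^ max) is the mask. A standalone sketch, including the mapping step that this hunk does not show (the substitution of network bits is reconstructed from how 1:1 subnet NAT works, and all addresses are examples):

/* netmap.c -- deriving NETMAP's mask from the configured range */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t min_ip  = 0x0a000100;  /* 10.0.1.0   (example range) */
    uint32_t max_ip  = 0x0a0001ff;  /* 10.0.1.255                 */

    /* bits where min and max agree are network bits (as in the hunk) */
    uint32_t netmask = ~(min_ip ^ max_ip);          /* 0xffffff00 */

    /* keep the host part, substitute the configured network part */
    uint32_t orig   = 0xc0a80042;   /* 192.168.0.66 */
    uint32_t mapped = (orig & ~netmask) | (min_ip & netmask);

    printf("mask=%#010x mapped=%#010x\n", netmask, mapped);  /* ...0142 */
    return 0;
}

The other change in this file, adding NF_INET_LOCAL_IN to both the NF_CT_ASSERT and the hook mask, lets the same 1:1 mapping apply to locally terminated traffic.
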
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 698e5e78685b..18a0656505a0 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -6,7 +6,7 @@
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 */ 8 */
9 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/ip.h> 11#include <linux/ip.h>
12#include <linux/timer.h> 12#include <linux/timer.h>
@@ -26,23 +26,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); 26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
27 27
28/* FIXME: Take multiple ranges --RR */ 28/* FIXME: Take multiple ranges --RR */
29static bool redirect_tg_check(const struct xt_tgchk_param *par) 29static int redirect_tg_check(const struct xt_tgchk_param *par)
30{ 30{
31 const struct nf_nat_multi_range_compat *mr = par->targinfo; 31 const struct nf_nat_multi_range_compat *mr = par->targinfo;
32 32
33 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 33 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
34 pr_debug("redirect_check: bad MAP_IPS.\n"); 34 pr_debug("bad MAP_IPS.\n");
35 return false; 35 return -EINVAL;
36 } 36 }
37 if (mr->rangesize != 1) { 37 if (mr->rangesize != 1) {
38 pr_debug("redirect_check: bad rangesize %u.\n", mr->rangesize); 38 pr_debug("bad rangesize %u.\n", mr->rangesize);
39 return false; 39 return -EINVAL;
40 } 40 }
41 return true; 41 return 0;
42} 42}
43 43
44static unsigned int 44static unsigned int
45redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) 45redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
46{ 46{
47 struct nf_conn *ct; 47 struct nf_conn *ct;
48 enum ip_conntrack_info ctinfo; 48 enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 5113b8f1a379..43eec80c0e7c 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -9,9 +9,10 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/slab.h>
15#include <linux/ip.h> 16#include <linux/ip.h>
16#include <linux/udp.h> 17#include <linux/udp.h>
17#include <linux/icmp.h> 18#include <linux/icmp.h>
@@ -94,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
94 } 95 }
95 96
96 tcph->rst = 1; 97 tcph->rst = 1;
97 tcph->check = tcp_v4_check(sizeof(struct tcphdr), 98 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
98 niph->saddr, niph->daddr, 99 niph->daddr, 0);
99 csum_partial(tcph, 100 nskb->ip_summed = CHECKSUM_PARTIAL;
100 sizeof(struct tcphdr), 0)); 101 nskb->csum_start = (unsigned char *)tcph - nskb->head;
102 nskb->csum_offset = offsetof(struct tcphdr, check);
101 103
102 addr_type = RTN_UNSPEC; 104 addr_type = RTN_UNSPEC;
103 if (hook != NF_INET_FORWARD 105 if (hook != NF_INET_FORWARD
@@ -108,13 +110,13 @@ static void send_reset(struct sk_buff *oldskb, int hook)
108 addr_type = RTN_LOCAL; 110 addr_type = RTN_LOCAL;
109 111
110 /* ip_route_me_harder expects skb->dst to be set */ 112 /* ip_route_me_harder expects skb->dst to be set */
111 skb_dst_set(nskb, dst_clone(skb_dst(oldskb))); 113 skb_dst_set_noref(nskb, skb_dst(oldskb));
112 114
115 nskb->protocol = htons(ETH_P_IP);
113 if (ip_route_me_harder(nskb, addr_type)) 116 if (ip_route_me_harder(nskb, addr_type))
114 goto free_nskb; 117 goto free_nskb;
115 118
116 niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); 119 niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
117 nskb->ip_summed = CHECKSUM_NONE;
118 120
119 /* "Never happens" */ 121 /* "Never happens" */
120 if (nskb->len > dst_mtu(skb_dst(nskb))) 122 if (nskb->len > dst_mtu(skb_dst(nskb)))
@@ -135,13 +137,10 @@ static inline void send_unreach(struct sk_buff *skb_in, int code)
135} 137}
136 138
137static unsigned int 139static unsigned int
138reject_tg(struct sk_buff *skb, const struct xt_target_param *par) 140reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
139{ 141{
140 const struct ipt_reject_info *reject = par->targinfo; 142 const struct ipt_reject_info *reject = par->targinfo;
141 143
142 /* WARNING: This code causes reentry within iptables.
143 This means that the iptables jump stack is now crap. We
144 must return an absolute verdict. --RR */
145 switch (reject->with) { 144 switch (reject->with) {
146 case IPT_ICMP_NET_UNREACHABLE: 145 case IPT_ICMP_NET_UNREACHABLE:
147 send_unreach(skb, ICMP_NET_UNREACH); 146 send_unreach(skb, ICMP_NET_UNREACH);
@@ -174,23 +173,23 @@ reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
174 return NF_DROP; 173 return NF_DROP;
175} 174}
176 175
177static bool reject_tg_check(const struct xt_tgchk_param *par) 176static int reject_tg_check(const struct xt_tgchk_param *par)
178{ 177{
179 const struct ipt_reject_info *rejinfo = par->targinfo; 178 const struct ipt_reject_info *rejinfo = par->targinfo;
180 const struct ipt_entry *e = par->entryinfo; 179 const struct ipt_entry *e = par->entryinfo;
181 180
182 if (rejinfo->with == IPT_ICMP_ECHOREPLY) { 181 if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
183 printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); 182 pr_info("ECHOREPLY no longer supported.\n");
184 return false; 183 return -EINVAL;
185 } else if (rejinfo->with == IPT_TCP_RESET) { 184 } else if (rejinfo->with == IPT_TCP_RESET) {
186 /* Must specify that it's a TCP packet */ 185 /* Must specify that it's a TCP packet */
187 if (e->ip.proto != IPPROTO_TCP || 186 if (e->ip.proto != IPPROTO_TCP ||
188 (e->ip.invflags & XT_INV_PROTO)) { 187 (e->ip.invflags & XT_INV_PROTO)) {
189 printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); 188 pr_info("TCP_RESET invalid for non-tcp\n");
190 return false; 189 return -EINVAL;
191 } 190 }
192 } 191 }
193 return true; 192 return 0;
194} 193}
195 194
196static struct xt_target reject_tg_reg __read_mostly = { 195static struct xt_target reject_tg_reg __read_mostly = {
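
The RST generation above stops computing the full TCP checksum in software: it stores only the un-complemented pseudo-header sum in tcph->check (note the ~ in front of tcp_v4_check) and marks the skb CHECKSUM_PARTIAL with csum_start/csum_offset, letting the NIC, or the stack's software fallback, fold in the header bytes later. That split works because the one's-complement sum can be built incrementally from any seed. A userspace sketch of the property, with all field values made up:

/* csum.c -- incremental one's-complement checksum, userspace model */
#include <stdint.h>
#include <stdio.h>

static uint32_t sum_words(const uint16_t *p, int n, uint32_t sum)
{
    while (n--)
        sum += *p++;                /* carries accumulate in the high bits */
    return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
    while (sum >> 16)               /* fold carries back in */
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;          /* final complement */
}

int main(void)
{
    /* made-up pseudo-header words: saddr, daddr, proto=6, tcp len=20 */
    uint16_t pseudo[6] = { 0xc0a8, 0x0001, 0xc0a8, 0x0002, 0x0006, 0x0014 };
    /* made-up TCP header words; checksum field left at zero */
    uint16_t tcph[10] = { 0x04d2, 0x0050 };

    /* phase 1 (the analogue of what the hunk stores): pseudo-header only */
    uint32_t partial = sum_words(pseudo, 6, 0);

    /* phase 2 (what the NIC or software fallback finishes later) */
    uint16_t split = csum_fold(sum_words(tcph, 10, partial));

    /* summing everything in one pass gives the same folded result */
    uint32_t once = sum_words(tcph, 10, sum_words(pseudo, 6, 0));
    printf("split %#06x == one-pass %#06x\n", split, csum_fold(once));
    return 0;
}
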
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 399061c3fd7d..446e0f467a17 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -29,10 +29,11 @@
29 * Specify, after how many hundredths of a second the queue should be 29 * Specify, after how many hundredths of a second the queue should be
30 * flushed even if it is not full yet. 30 * flushed even if it is not full yet.
31 */ 31 */
32 32#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/socket.h> 35#include <linux/socket.h>
36#include <linux/slab.h>
36#include <linux/skbuff.h> 37#include <linux/skbuff.h>
37#include <linux/kernel.h> 38#include <linux/kernel.h>
38#include <linux/timer.h> 39#include <linux/timer.h>
@@ -56,8 +57,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
56#define ULOG_NL_EVENT 111 /* Harald's favorite number */ 57#define ULOG_NL_EVENT 111 /* Harald's favorite number */
57#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ 58#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */
58 59
59#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
60
61static unsigned int nlbufsiz = NLMSG_GOODSIZE; 60static unsigned int nlbufsiz = NLMSG_GOODSIZE;
62module_param(nlbufsiz, uint, 0400); 61module_param(nlbufsiz, uint, 0400);
63MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); 62MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
@@ -90,12 +89,12 @@ static void ulog_send(unsigned int nlgroupnum)
90 ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; 89 ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
91 90
92 if (timer_pending(&ub->timer)) { 91 if (timer_pending(&ub->timer)) {
93 pr_debug("ipt_ULOG: ulog_send: timer was pending, deleting\n"); 92 pr_debug("ulog_send: timer was pending, deleting\n");
94 del_timer(&ub->timer); 93 del_timer(&ub->timer);
95 } 94 }
96 95
97 if (!ub->skb) { 96 if (!ub->skb) {
98 pr_debug("ipt_ULOG: ulog_send: nothing to send\n"); 97 pr_debug("ulog_send: nothing to send\n");
99 return; 98 return;
100 } 99 }
101 100
@@ -104,7 +103,7 @@ static void ulog_send(unsigned int nlgroupnum)
104 ub->lastnlh->nlmsg_type = NLMSG_DONE; 103 ub->lastnlh->nlmsg_type = NLMSG_DONE;
105 104
106 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; 105 NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
107 pr_debug("ipt_ULOG: throwing %d packets to netlink group %u\n", 106 pr_debug("throwing %d packets to netlink group %u\n",
108 ub->qlen, nlgroupnum + 1); 107 ub->qlen, nlgroupnum + 1);
109 netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); 108 netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
110 109
@@ -117,7 +116,7 @@ static void ulog_send(unsigned int nlgroupnum)
117/* timer function to flush queue in flushtimeout time */ 116/* timer function to flush queue in flushtimeout time */
118static void ulog_timer(unsigned long data) 117static void ulog_timer(unsigned long data)
119{ 118{
120 pr_debug("ipt_ULOG: timer function called, calling ulog_send\n"); 119 pr_debug("timer function called, calling ulog_send\n");
121 120
122 /* lock to protect against somebody modifying our structure 121 /* lock to protect against somebody modifying our structure
123 * from ipt_ulog_target at the same time */ 122 * from ipt_ulog_target at the same time */
@@ -138,7 +137,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
138 n = max(size, nlbufsiz); 137 n = max(size, nlbufsiz);
139 skb = alloc_skb(n, GFP_ATOMIC); 138 skb = alloc_skb(n, GFP_ATOMIC);
140 if (!skb) { 139 if (!skb) {
141 PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n", n); 140 pr_debug("cannot alloc whole buffer %ub!\n", n);
142 141
143 if (n > size) { 142 if (n > size) {
144 /* try to allocate only as much as we need for 143 /* try to allocate only as much as we need for
@@ -146,8 +145,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
146 145
147 skb = alloc_skb(size, GFP_ATOMIC); 146 skb = alloc_skb(size, GFP_ATOMIC);
148 if (!skb) 147 if (!skb)
149 PRINTR("ipt_ULOG: can't even allocate %ub\n", 148 pr_debug("cannot even allocate %ub\n", size);
150 size);
151 } 149 }
152 } 150 }
153 151
@@ -198,8 +196,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
198 goto alloc_failure; 196 goto alloc_failure;
199 } 197 }
200 198
201 pr_debug("ipt_ULOG: qlen %d, qthreshold %Zu\n", ub->qlen, 199 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
202 loginfo->qthreshold);
203 200
204 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ 201 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
205 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 202 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
@@ -272,16 +269,14 @@ static void ipt_ulog_packet(unsigned int hooknum,
272 return; 269 return;
273 270
274nlmsg_failure: 271nlmsg_failure:
275 PRINTR("ipt_ULOG: error during NLMSG_PUT\n"); 272 pr_debug("error during NLMSG_PUT\n");
276
277alloc_failure: 273alloc_failure:
278 PRINTR("ipt_ULOG: Error building netlink message\n"); 274 pr_debug("Error building netlink message\n");
279
280 spin_unlock_bh(&ulog_lock); 275 spin_unlock_bh(&ulog_lock);
281} 276}
282 277
283static unsigned int 278static unsigned int
284ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) 279ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
285{ 280{
286 ipt_ulog_packet(par->hooknum, skb, par->in, par->out, 281 ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
287 par->targinfo, NULL); 282 par->targinfo, NULL);
@@ -313,21 +308,20 @@ static void ipt_logfn(u_int8_t pf,
313 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); 308 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
314} 309}
315 310
316static bool ulog_tg_check(const struct xt_tgchk_param *par) 311static int ulog_tg_check(const struct xt_tgchk_param *par)
317{ 312{
318 const struct ipt_ulog_info *loginfo = par->targinfo; 313 const struct ipt_ulog_info *loginfo = par->targinfo;
319 314
320 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { 315 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
321 pr_debug("ipt_ULOG: prefix term %i\n", 316 pr_debug("prefix not null-terminated\n");
322 loginfo->prefix[sizeof(loginfo->prefix) - 1]); 317 return -EINVAL;
323 return false;
324 } 318 }
325 if (loginfo->qthreshold > ULOG_MAX_QLEN) { 319 if (loginfo->qthreshold > ULOG_MAX_QLEN) {
326 pr_debug("ipt_ULOG: queue threshold %Zu > MAX_QLEN\n", 320 pr_debug("queue threshold %Zu > MAX_QLEN\n",
327 loginfo->qthreshold); 321 loginfo->qthreshold);
328 return false; 322 return -EINVAL;
329 } 323 }
330 return true; 324 return 0;
331} 325}
332 326
333#ifdef CONFIG_COMPAT 327#ifdef CONFIG_COMPAT
@@ -338,7 +332,7 @@ struct compat_ipt_ulog_info {
338 char prefix[ULOG_PREFIX_LEN]; 332 char prefix[ULOG_PREFIX_LEN];
339}; 333};
340 334
341static void ulog_tg_compat_from_user(void *dst, void *src) 335static void ulog_tg_compat_from_user(void *dst, const void *src)
342{ 336{
343 const struct compat_ipt_ulog_info *cl = src; 337 const struct compat_ipt_ulog_info *cl = src;
344 struct ipt_ulog_info l = { 338 struct ipt_ulog_info l = {
@@ -351,7 +345,7 @@ static void ulog_tg_compat_from_user(void *dst, void *src)
351 memcpy(dst, &l, sizeof(l)); 345 memcpy(dst, &l, sizeof(l));
352} 346}
353 347
354static int ulog_tg_compat_to_user(void __user *dst, void *src) 348static int ulog_tg_compat_to_user(void __user *dst, const void *src)
355{ 349{
356 const struct ipt_ulog_info *l = src; 350 const struct ipt_ulog_info *l = src;
357 struct compat_ipt_ulog_info cl = { 351 struct compat_ipt_ulog_info cl = {
@@ -389,10 +383,10 @@ static int __init ulog_tg_init(void)
389{ 383{
390 int ret, i; 384 int ret, i;
391 385
392 pr_debug("ipt_ULOG: init module\n"); 386 pr_debug("init module\n");
393 387
394 if (nlbufsiz > 128*1024) { 388 if (nlbufsiz > 128*1024) {
395 printk("Netlink buffer has to be <= 128kB\n"); 389 pr_warning("Netlink buffer has to be <= 128kB\n");
396 return -EINVAL; 390 return -EINVAL;
397 } 391 }
398 392
@@ -422,7 +416,7 @@ static void __exit ulog_tg_exit(void)
422 ulog_buff_t *ub; 416 ulog_buff_t *ub;
423 int i; 417 int i;
424 418
425 pr_debug("ipt_ULOG: cleanup_module\n"); 419 pr_debug("cleanup_module\n");
426 420
427 if (nflog) 421 if (nflog)
428 nf_log_unregister(&ipt_ulog_logger); 422 nf_log_unregister(&ipt_ulog_logger);
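The ipt_ULOG conversion above replaces the driver-private PRINTR() macro (a net_ratelimit()-gated printk) with pr_debug()/pr_warning() plus a pr_fmt() prefix; note the allocation-failure messages thereby move from always-on-but-ratelimited to debug-only. A minimal sketch of the pattern, with a hypothetical module name — the one subtlety is that pr_fmt() must be defined before the first include so the pr_*() wrappers expand with the prefix:

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>

static int __init prfmt_demo_init(void)
{
        /* prints "prfmt_demo: hello" when DEBUG or dynamic debug is on;
         * unlike the old PRINTR(), pr_debug() compiles to nothing in a
         * non-debug build */
        pr_debug("hello\n");

        /* always printed, with the same automatic prefix */
        pr_warning("buffer has to be <= 128kB\n");
        return 0;
}

static void __exit prfmt_demo_exit(void)
{
        pr_debug("goodbye\n");
}

module_init(prfmt_demo_init);
module_exit(prfmt_demo_exit);
MODULE_LICENSE("GPL");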
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 3b216be3bc9f..db8bff0fb86d 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -8,7 +8,7 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
@@ -30,7 +30,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
30} 30}
31 31
32static bool 32static bool
33addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) 33addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
34{ 34{
35 struct net *net = dev_net(par->in ? par->in : par->out); 35 struct net *net = dev_net(par->in ? par->in : par->out);
36 const struct ipt_addrtype_info *info = par->matchinfo; 36 const struct ipt_addrtype_info *info = par->matchinfo;
@@ -48,7 +48,7 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
48} 48}
49 49
50static bool 50static bool
51addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) 51addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
52{ 52{
53 struct net *net = dev_net(par->in ? par->in : par->out); 53 struct net *net = dev_net(par->in ? par->in : par->out);
54 const struct ipt_addrtype_info_v1 *info = par->matchinfo; 54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
@@ -70,34 +70,34 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
70 return ret; 70 return ret;
71} 71}
72 72
73static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) 73static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
74{ 74{
75 struct ipt_addrtype_info_v1 *info = par->matchinfo; 75 struct ipt_addrtype_info_v1 *info = par->matchinfo;
76 76
77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && 77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { 78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
79 printk(KERN_ERR "ipt_addrtype: both incoming and outgoing " 79 pr_info("both incoming and outgoing "
80 "interface limitation cannot be selected\n"); 80 "interface limitation cannot be selected\n");
81 return false; 81 return -EINVAL;
82 } 82 }
83 83
84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | 84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
85 (1 << NF_INET_LOCAL_IN)) && 85 (1 << NF_INET_LOCAL_IN)) &&
86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { 86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
87 printk(KERN_ERR "ipt_addrtype: output interface limitation " 87 pr_info("output interface limitation "
88 "not valid in PRE_ROUTING and INPUT\n"); 88 "not valid in PREROUTING and INPUT\n");
89 return false; 89 return -EINVAL;
90 } 90 }
91 91
92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | 92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
93 (1 << NF_INET_LOCAL_OUT)) && 93 (1 << NF_INET_LOCAL_OUT)) &&
94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { 94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
95 printk(KERN_ERR "ipt_addrtype: input interface limitation " 95 pr_info("input interface limitation "
96 "not valid in POST_ROUTING and OUTPUT\n"); 96 "not valid in POSTROUTING and OUTPUT\n");
97 return false; 97 return -EINVAL;
98 } 98 }
99 99
100 return true; 100 return 0;
101} 101}
102 102
103static struct xt_match addrtype_mt_reg[] __read_mostly = { 103static struct xt_match addrtype_mt_reg[] __read_mostly = {
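Two independent API shifts are visible in the addrtype hunks: match functions now take a non-const struct xt_action_param, and checkentry hooks return a negative errno instead of a bool, so userspace can learn why a rule was rejected. A sketch of the new checkentry shape under those interfaces, with a hypothetical demo_* name:

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ipt_addrtype.h>

static int demo_mt_check(const struct xt_mtchk_param *par)
{
        const struct ipt_addrtype_info_v1 *info = par->matchinfo;

        if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
            info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
                pr_info("both interface limits cannot be selected\n");
                return -EINVAL;         /* old API: return false */
        }
        return 0;                       /* old API: return true */
}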
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 0104c0b399de..14a2aa8b8a14 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -5,7 +5,7 @@
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 */ 7 */
8 8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9#include <linux/in.h> 9#include <linux/in.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
@@ -18,25 +18,19 @@ MODULE_LICENSE("GPL");
18MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); 18MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
19MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match"); 19MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
20 20
21#ifdef DEBUG_CONNTRACK
22#define duprintf(format, args...) printk(format , ## args)
23#else
24#define duprintf(format, args...)
25#endif
26
27/* Returns 1 if the spi is matched by the range, 0 otherwise */ 21/* Returns 1 if the spi is matched by the range, 0 otherwise */
28static inline bool 22static inline bool
29spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) 23spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
30{ 24{
31 bool r; 25 bool r;
32 duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', 26 pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
33 min,spi,max); 27 invert ? '!' : ' ', min, spi, max);
34 r=(spi >= min && spi <= max) ^ invert; 28 r=(spi >= min && spi <= max) ^ invert;
35 duprintf(" result %s\n",r? "PASS" : "FAILED"); 29 pr_debug(" result %s\n", r ? "PASS" : "FAILED");
36 return r; 30 return r;
37} 31}
38 32
39static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) 33static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
40{ 34{
41 struct ip_auth_hdr _ahdr; 35 struct ip_auth_hdr _ahdr;
42 const struct ip_auth_hdr *ah; 36 const struct ip_auth_hdr *ah;
@@ -51,8 +45,8 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
51 /* We've been asked to examine this packet, and we 45 /* We've been asked to examine this packet, and we
52 * can't. Hence, no choice but to drop. 46 * can't. Hence, no choice but to drop.
53 */ 47 */
54 duprintf("Dropping evil AH tinygram.\n"); 48 pr_debug("Dropping evil AH tinygram.\n");
55 *par->hotdrop = true; 49 par->hotdrop = true;
56 return 0; 50 return 0;
57 } 51 }
58 52
@@ -61,16 +55,16 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
61 !!(ahinfo->invflags & IPT_AH_INV_SPI)); 55 !!(ahinfo->invflags & IPT_AH_INV_SPI));
62} 56}
63 57
64static bool ah_mt_check(const struct xt_mtchk_param *par) 58static int ah_mt_check(const struct xt_mtchk_param *par)
65{ 59{
66 const struct ipt_ah *ahinfo = par->matchinfo; 60 const struct ipt_ah *ahinfo = par->matchinfo;
67 61
68 /* Must specify no unknown invflags */ 62 /* Must specify no unknown invflags */
69 if (ahinfo->invflags & ~IPT_AH_INV_MASK) { 63 if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
70 duprintf("ipt_ah: unknown flags %X\n", ahinfo->invflags); 64 pr_debug("unknown flags %X\n", ahinfo->invflags);
71 return false; 65 return -EINVAL;
72 } 66 }
73 return true; 67 return 0;
74} 68}
75 69
76static struct xt_match ah_mt_reg __read_mostly = { 70static struct xt_match ah_mt_reg __read_mostly = {
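The spi_match() predicate itself is untouched by this cleanup and is small enough to verify in isolation; a standalone userspace rendering with a few sanity checks of the invert semantics:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* the SPI range test from ipt_ah, lifted into plain C */
static bool spi_match(uint32_t min, uint32_t max, uint32_t spi, bool invert)
{
        return (spi >= min && spi <= max) ^ invert;
}

int main(void)
{
        assert(spi_match(100, 200, 150, false));   /* in range           */
        assert(!spi_match(100, 200, 150, true));   /* in range, inverted */
        assert(!spi_match(100, 200, 50, false));   /* below range        */
        assert(spi_match(100, 200, 50, true));     /* below, inverted    */
        return 0;
}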
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 2a1e56b71908..af6e9c778345 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -6,7 +6,7 @@
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 */ 8 */
9 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/in.h> 10#include <linux/in.h>
11#include <linux/ip.h> 11#include <linux/ip.h>
12#include <net/ip.h> 12#include <net/ip.h>
@@ -67,7 +67,7 @@ static inline bool match_tcp(const struct sk_buff *skb,
67 return true; 67 return true;
68} 68}
69 69
70static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) 70static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
71{ 71{
72 const struct ipt_ecn_info *info = par->matchinfo; 72 const struct ipt_ecn_info *info = par->matchinfo;
73 73
@@ -78,32 +78,31 @@ static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par)
78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { 78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
79 if (ip_hdr(skb)->protocol != IPPROTO_TCP) 79 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
80 return false; 80 return false;
81 if (!match_tcp(skb, info, par->hotdrop)) 81 if (!match_tcp(skb, info, &par->hotdrop))
82 return false; 82 return false;
83 } 83 }
84 84
85 return true; 85 return true;
86} 86}
87 87
88static bool ecn_mt_check(const struct xt_mtchk_param *par) 88static int ecn_mt_check(const struct xt_mtchk_param *par)
89{ 89{
90 const struct ipt_ecn_info *info = par->matchinfo; 90 const struct ipt_ecn_info *info = par->matchinfo;
91 const struct ipt_ip *ip = par->entryinfo; 91 const struct ipt_ip *ip = par->entryinfo;
92 92
93 if (info->operation & IPT_ECN_OP_MATCH_MASK) 93 if (info->operation & IPT_ECN_OP_MATCH_MASK)
94 return false; 94 return -EINVAL;
95 95
96 if (info->invert & IPT_ECN_OP_MATCH_MASK) 96 if (info->invert & IPT_ECN_OP_MATCH_MASK)
97 return false; 97 return -EINVAL;
98 98
99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && 99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
100 ip->proto != IPPROTO_TCP) { 100 ip->proto != IPPROTO_TCP) {
101 printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for" 101 pr_info("cannot match TCP bits in rule for non-tcp packets\n");
102 " non-tcp packets\n"); 102 return -EINVAL;
103 return false;
104 } 103 }
105 104
106 return true; 105 return 0;
107} 106}
108 107
109static struct xt_match ecn_mt_reg __read_mostly = { 108static struct xt_match ecn_mt_reg __read_mostly = {
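As in the AH hunk, hotdrop moves from a bool pointer inside a const parameter block to a plain member of the now-writable xt_action_param. A sketch of how a match signals an unconditional drop under the new signature (demo_mt is hypothetical):

#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

static bool demo_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
        if (skb->len < sizeof(struct iphdr)) {
                /* old API: *par->hotdrop = true; */
                par->hotdrop = true;    /* drop, rather than just no-match */
                return false;
        }
        return true;
}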
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index df566cbd68e5..c37641e819f2 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/netfilter_ipv4/ip_tables.h> 15#include <linux/netfilter_ipv4/ip_tables.h>
16#include <linux/slab.h>
16#include <net/ip.h> 17#include <net/ip.h>
17 18
18MODULE_LICENSE("GPL"); 19MODULE_LICENSE("GPL");
@@ -23,104 +24,32 @@ MODULE_DESCRIPTION("iptables filter table");
23 (1 << NF_INET_FORWARD) | \ 24 (1 << NF_INET_FORWARD) | \
24 (1 << NF_INET_LOCAL_OUT)) 25 (1 << NF_INET_LOCAL_OUT))
25 26
26static struct
27{
28 struct ipt_replace repl;
29 struct ipt_standard entries[3];
30 struct ipt_error term;
31} initial_table __net_initdata = {
32 .repl = {
33 .name = "filter",
34 .valid_hooks = FILTER_VALID_HOOKS,
35 .num_entries = 4,
36 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
37 .hook_entry = {
38 [NF_INET_LOCAL_IN] = 0,
39 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
40 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
41 },
42 .underflow = {
43 [NF_INET_LOCAL_IN] = 0,
44 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
45 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
46 },
47 },
48 .entries = {
49 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
50 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
51 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
52 },
53 .term = IPT_ERROR_INIT, /* ERROR */
54};
55
56static const struct xt_table packet_filter = { 27static const struct xt_table packet_filter = {
57 .name = "filter", 28 .name = "filter",
58 .valid_hooks = FILTER_VALID_HOOKS, 29 .valid_hooks = FILTER_VALID_HOOKS,
59 .me = THIS_MODULE, 30 .me = THIS_MODULE,
60 .af = NFPROTO_IPV4, 31 .af = NFPROTO_IPV4,
32 .priority = NF_IP_PRI_FILTER,
61}; 33};
62 34
63/* The work comes in here from netfilter.c. */
64static unsigned int
65ipt_local_in_hook(unsigned int hook,
66 struct sk_buff *skb,
67 const struct net_device *in,
68 const struct net_device *out,
69 int (*okfn)(struct sk_buff *))
70{
71 return ipt_do_table(skb, hook, in, out,
72 dev_net(in)->ipv4.iptable_filter);
73}
74
75static unsigned int 35static unsigned int
76ipt_hook(unsigned int hook, 36iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
77 struct sk_buff *skb, 37 const struct net_device *in, const struct net_device *out,
78 const struct net_device *in, 38 int (*okfn)(struct sk_buff *))
79 const struct net_device *out,
80 int (*okfn)(struct sk_buff *))
81{ 39{
82 return ipt_do_table(skb, hook, in, out, 40 const struct net *net;
83 dev_net(in)->ipv4.iptable_filter);
84}
85 41
86static unsigned int 42 if (hook == NF_INET_LOCAL_OUT &&
87ipt_local_out_hook(unsigned int hook, 43 (skb->len < sizeof(struct iphdr) ||
88 struct sk_buff *skb, 44 ip_hdrlen(skb) < sizeof(struct iphdr)))
89 const struct net_device *in, 45 /* root is playing with raw sockets. */
90 const struct net_device *out,
91 int (*okfn)(struct sk_buff *))
92{
93 /* root is playing with raw sockets. */
94 if (skb->len < sizeof(struct iphdr) ||
95 ip_hdrlen(skb) < sizeof(struct iphdr))
96 return NF_ACCEPT; 46 return NF_ACCEPT;
97 return ipt_do_table(skb, hook, in, out, 47
98 dev_net(out)->ipv4.iptable_filter); 48 net = dev_net((in != NULL) ? in : out);
49 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
99} 50}
100 51
101static struct nf_hook_ops ipt_ops[] __read_mostly = { 52static struct nf_hook_ops *filter_ops __read_mostly;
102 {
103 .hook = ipt_local_in_hook,
104 .owner = THIS_MODULE,
105 .pf = NFPROTO_IPV4,
106 .hooknum = NF_INET_LOCAL_IN,
107 .priority = NF_IP_PRI_FILTER,
108 },
109 {
110 .hook = ipt_hook,
111 .owner = THIS_MODULE,
112 .pf = NFPROTO_IPV4,
113 .hooknum = NF_INET_FORWARD,
114 .priority = NF_IP_PRI_FILTER,
115 },
116 {
117 .hook = ipt_local_out_hook,
118 .owner = THIS_MODULE,
119 .pf = NFPROTO_IPV4,
120 .hooknum = NF_INET_LOCAL_OUT,
121 .priority = NF_IP_PRI_FILTER,
122 },
123};
124 53
125/* Default to forward because I got too much mail already. */ 54/* Default to forward because I got too much mail already. */
126static int forward = NF_ACCEPT; 55static int forward = NF_ACCEPT;
@@ -128,9 +57,18 @@ module_param(forward, bool, 0000);
128 57
129static int __net_init iptable_filter_net_init(struct net *net) 58static int __net_init iptable_filter_net_init(struct net *net)
130{ 59{
131 /* Register table */ 60 struct ipt_replace *repl;
61
62 repl = ipt_alloc_initial_table(&packet_filter);
63 if (repl == NULL)
64 return -ENOMEM;
65 /* Entry 1 is the FORWARD hook */
66 ((struct ipt_standard *)repl->entries)[1].target.verdict =
67 -forward - 1;
68
132 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
133 ipt_register_table(net, &packet_filter, &initial_table.repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl);
134 if (IS_ERR(net->ipv4.iptable_filter)) 72 if (IS_ERR(net->ipv4.iptable_filter))
135 return PTR_ERR(net->ipv4.iptable_filter); 73 return PTR_ERR(net->ipv4.iptable_filter);
136 return 0; 74 return 0;
@@ -138,7 +76,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
138 76
139static void __net_exit iptable_filter_net_exit(struct net *net) 77static void __net_exit iptable_filter_net_exit(struct net *net)
140{ 78{
141 ipt_unregister_table(net->ipv4.iptable_filter); 79 ipt_unregister_table(net, net->ipv4.iptable_filter);
142} 80}
143 81
144static struct pernet_operations iptable_filter_net_ops = { 82static struct pernet_operations iptable_filter_net_ops = {
@@ -151,21 +89,20 @@ static int __init iptable_filter_init(void)
151 int ret; 89 int ret;
152 90
153 if (forward < 0 || forward > NF_MAX_VERDICT) { 91 if (forward < 0 || forward > NF_MAX_VERDICT) {
154 printk("iptables forward must be 0 or 1\n"); 92 pr_err("iptables forward must be 0 or 1\n");
155 return -EINVAL; 93 return -EINVAL;
156 } 94 }
157 95
158 /* Entry 1 is the FORWARD hook */
159 initial_table.entries[1].target.verdict = -forward - 1;
160
161 ret = register_pernet_subsys(&iptable_filter_net_ops); 96 ret = register_pernet_subsys(&iptable_filter_net_ops);
162 if (ret < 0) 97 if (ret < 0)
163 return ret; 98 return ret;
164 99
165 /* Register hooks */ 100 /* Register hooks */
166 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 101 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
167 if (ret < 0) 102 if (IS_ERR(filter_ops)) {
103 ret = PTR_ERR(filter_ops);
168 goto cleanup_table; 104 goto cleanup_table;
105 }
169 106
170 return ret; 107 return ret;
171 108
@@ -176,7 +113,7 @@ static int __init iptable_filter_init(void)
176 113
177static void __exit iptable_filter_fini(void) 114static void __exit iptable_filter_fini(void)
178{ 115{
179 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 116 xt_hook_unlink(&packet_filter, filter_ops);
180 unregister_pernet_subsys(&iptable_filter_net_ops); 117 unregister_pernet_subsys(&iptable_filter_net_ops);
181} 118}
182 119
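The filter-table rewrite above is the template for the mangle, raw and security conversions that follow: the hand-rolled initial_table blob is generated at runtime by ipt_alloc_initial_table(), the per-hook nf_hook_ops array collapses into one dispatcher registered through xt_hook_link(), and the hook priority moves into struct xt_table. A condensed sketch of the whole pattern, assuming those 2.6.33/34-era interfaces and hypothetical demo_* names (a real module would keep the table pointer per network namespace, as the diff does):

#include <linux/err.h>
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/net_namespace.h>

static const struct xt_table demo_table = {
        .name        = "demo",
        .valid_hooks = 1 << NF_INET_LOCAL_IN,
        .me          = THIS_MODULE,
        .af          = NFPROTO_IPV4,
        .priority    = NF_IP_PRI_FILTER, /* formerly per nf_hook_ops entry */
};

static struct nf_hook_ops *demo_ops __read_mostly;
static struct xt_table *demo_net_table; /* per-net in a real module */

/* one dispatcher serves every hook the table is valid in */
static unsigned int
demo_hook(unsigned int hook, struct sk_buff *skb,
          const struct net_device *in, const struct net_device *out,
          int (*okfn)(struct sk_buff *))
{
        return ipt_do_table(skb, hook, in, out, demo_net_table);
}

static int __init demo_init(void)
{
        struct ipt_replace *repl;

        /* builds one ACCEPT entry per valid hook plus the error entry,
         * replacing the static initial_table structures deleted above */
        repl = ipt_alloc_initial_table(&demo_table);
        if (repl == NULL)
                return -ENOMEM;
        demo_net_table = ipt_register_table(&init_net, &demo_table, repl);
        kfree(repl);
        if (IS_ERR(demo_net_table))
                return PTR_ERR(demo_net_table);

        demo_ops = xt_hook_link(&demo_table, demo_hook);
        if (IS_ERR(demo_ops)) {
                ipt_unregister_table(&init_net, demo_net_table);
                return PTR_ERR(demo_ops);
        }
        return 0;
}

static void __exit demo_exit(void)
{
        xt_hook_unlink(&demo_table, demo_ops);
        ipt_unregister_table(&init_net, demo_net_table);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");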
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index fae78c3076c4..294a2a32f293 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -12,6 +12,7 @@
12#include <linux/netfilter_ipv4/ip_tables.h> 12#include <linux/netfilter_ipv4/ip_tables.h>
13#include <linux/netdevice.h> 13#include <linux/netdevice.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/slab.h>
15#include <net/sock.h> 16#include <net/sock.h>
16#include <net/route.h> 17#include <net/route.h>
17#include <linux/ip.h> 18#include <linux/ip.h>
@@ -27,101 +28,16 @@ MODULE_DESCRIPTION("iptables mangle table");
27 (1 << NF_INET_LOCAL_OUT) | \ 28 (1 << NF_INET_LOCAL_OUT) | \
28 (1 << NF_INET_POST_ROUTING)) 29 (1 << NF_INET_POST_ROUTING))
29 30
30/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
31static const struct
32{
33 struct ipt_replace repl;
34 struct ipt_standard entries[5];
35 struct ipt_error term;
36} initial_table __net_initdata = {
37 .repl = {
38 .name = "mangle",
39 .valid_hooks = MANGLE_VALID_HOOKS,
40 .num_entries = 6,
41 .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
42 .hook_entry = {
43 [NF_INET_PRE_ROUTING] = 0,
44 [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
45 [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
46 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
47 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
48 },
49 .underflow = {
50 [NF_INET_PRE_ROUTING] = 0,
51 [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
52 [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
53 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
54 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
55 },
56 },
57 .entries = {
58 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
59 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
60 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
61 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
62 IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */
63 },
64 .term = IPT_ERROR_INIT, /* ERROR */
65};
66
67static const struct xt_table packet_mangler = { 31static const struct xt_table packet_mangler = {
68 .name = "mangle", 32 .name = "mangle",
69 .valid_hooks = MANGLE_VALID_HOOKS, 33 .valid_hooks = MANGLE_VALID_HOOKS,
70 .me = THIS_MODULE, 34 .me = THIS_MODULE,
71 .af = NFPROTO_IPV4, 35 .af = NFPROTO_IPV4,
36 .priority = NF_IP_PRI_MANGLE,
72}; 37};
73 38
74/* The work comes in here from netfilter.c. */
75static unsigned int
76ipt_pre_routing_hook(unsigned int hook,
77 struct sk_buff *skb,
78 const struct net_device *in,
79 const struct net_device *out,
80 int (*okfn)(struct sk_buff *))
81{
82 return ipt_do_table(skb, hook, in, out,
83 dev_net(in)->ipv4.iptable_mangle);
84}
85
86static unsigned int
87ipt_post_routing_hook(unsigned int hook,
88 struct sk_buff *skb,
89 const struct net_device *in,
90 const struct net_device *out,
91 int (*okfn)(struct sk_buff *))
92{
93 return ipt_do_table(skb, hook, in, out,
94 dev_net(out)->ipv4.iptable_mangle);
95}
96
97static unsigned int
98ipt_local_in_hook(unsigned int hook,
99 struct sk_buff *skb,
100 const struct net_device *in,
101 const struct net_device *out,
102 int (*okfn)(struct sk_buff *))
103{
104 return ipt_do_table(skb, hook, in, out,
105 dev_net(in)->ipv4.iptable_mangle);
106}
107
108static unsigned int
109ipt_forward_hook(unsigned int hook,
110 struct sk_buff *skb,
111 const struct net_device *in,
112 const struct net_device *out,
113 int (*okfn)(struct sk_buff *))
114{
115 return ipt_do_table(skb, hook, in, out,
116 dev_net(in)->ipv4.iptable_mangle);
117}
118
119static unsigned int 39static unsigned int
120ipt_local_hook(unsigned int hook, 40ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
121 struct sk_buff *skb,
122 const struct net_device *in,
123 const struct net_device *out,
124 int (*okfn)(struct sk_buff *))
125{ 41{
126 unsigned int ret; 42 unsigned int ret;
127 const struct iphdr *iph; 43 const struct iphdr *iph;
@@ -141,7 +57,7 @@ ipt_local_hook(unsigned int hook,
141 daddr = iph->daddr; 57 daddr = iph->daddr;
142 tos = iph->tos; 58 tos = iph->tos;
143 59
144 ret = ipt_do_table(skb, hook, in, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
145 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
146 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
147 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
@@ -158,49 +74,36 @@ ipt_local_hook(unsigned int hook,
158 return ret; 74 return ret;
159} 75}
160 76
161static struct nf_hook_ops ipt_ops[] __read_mostly = { 77/* The work comes in here from netfilter.c. */
162 { 78static unsigned int
163 .hook = ipt_pre_routing_hook, 79iptable_mangle_hook(unsigned int hook,
164 .owner = THIS_MODULE, 80 struct sk_buff *skb,
165 .pf = NFPROTO_IPV4, 81 const struct net_device *in,
166 .hooknum = NF_INET_PRE_ROUTING, 82 const struct net_device *out,
167 .priority = NF_IP_PRI_MANGLE, 83 int (*okfn)(struct sk_buff *))
168 }, 84{
169 { 85 if (hook == NF_INET_LOCAL_OUT)
170 .hook = ipt_local_in_hook, 86 return ipt_mangle_out(skb, out);
171 .owner = THIS_MODULE, 87 if (hook == NF_INET_POST_ROUTING)
172 .pf = NFPROTO_IPV4, 88 return ipt_do_table(skb, hook, in, out,
173 .hooknum = NF_INET_LOCAL_IN, 89 dev_net(out)->ipv4.iptable_mangle);
174 .priority = NF_IP_PRI_MANGLE, 90 /* PREROUTING/INPUT/FORWARD: */
175 }, 91 return ipt_do_table(skb, hook, in, out,
176 { 92 dev_net(in)->ipv4.iptable_mangle);
177 .hook = ipt_forward_hook, 93}
178 .owner = THIS_MODULE, 94
179 .pf = NFPROTO_IPV4, 95static struct nf_hook_ops *mangle_ops __read_mostly;
180 .hooknum = NF_INET_FORWARD,
181 .priority = NF_IP_PRI_MANGLE,
182 },
183 {
184 .hook = ipt_local_hook,
185 .owner = THIS_MODULE,
186 .pf = NFPROTO_IPV4,
187 .hooknum = NF_INET_LOCAL_OUT,
188 .priority = NF_IP_PRI_MANGLE,
189 },
190 {
191 .hook = ipt_post_routing_hook,
192 .owner = THIS_MODULE,
193 .pf = NFPROTO_IPV4,
194 .hooknum = NF_INET_POST_ROUTING,
195 .priority = NF_IP_PRI_MANGLE,
196 },
197};
198 96
199static int __net_init iptable_mangle_net_init(struct net *net) 97static int __net_init iptable_mangle_net_init(struct net *net)
200{ 98{
201 /* Register table */ 99 struct ipt_replace *repl;
100
101 repl = ipt_alloc_initial_table(&packet_mangler);
102 if (repl == NULL)
103 return -ENOMEM;
202 net->ipv4.iptable_mangle = 104 net->ipv4.iptable_mangle =
203 ipt_register_table(net, &packet_mangler, &initial_table.repl); 105 ipt_register_table(net, &packet_mangler, repl);
106 kfree(repl);
204 if (IS_ERR(net->ipv4.iptable_mangle)) 107 if (IS_ERR(net->ipv4.iptable_mangle))
205 return PTR_ERR(net->ipv4.iptable_mangle); 108 return PTR_ERR(net->ipv4.iptable_mangle);
206 return 0; 109 return 0;
@@ -208,7 +111,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
208 111
209static void __net_exit iptable_mangle_net_exit(struct net *net) 112static void __net_exit iptable_mangle_net_exit(struct net *net)
210{ 113{
211 ipt_unregister_table(net->ipv4.iptable_mangle); 114 ipt_unregister_table(net, net->ipv4.iptable_mangle);
212} 115}
213 116
214static struct pernet_operations iptable_mangle_net_ops = { 117static struct pernet_operations iptable_mangle_net_ops = {
@@ -225,9 +128,11 @@ static int __init iptable_mangle_init(void)
225 return ret; 128 return ret;
226 129
227 /* Register hooks */ 130 /* Register hooks */
228 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 131 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
229 if (ret < 0) 132 if (IS_ERR(mangle_ops)) {
133 ret = PTR_ERR(mangle_ops);
230 goto cleanup_table; 134 goto cleanup_table;
135 }
231 136
232 return ret; 137 return ret;
233 138
@@ -238,7 +143,7 @@ static int __init iptable_mangle_init(void)
238 143
239static void __exit iptable_mangle_fini(void) 144static void __exit iptable_mangle_fini(void)
240{ 145{
241 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 146 xt_hook_unlink(&packet_mangler, mangle_ops);
242 unregister_pernet_subsys(&iptable_mangle_net_ops); 147 unregister_pernet_subsys(&iptable_mangle_net_ops);
243} 148}
244 149
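The mangle conversion keeps one special case: on LOCAL_OUT the routing-relevant header fields are snapshotted before the table runs, and the packet is re-routed if any rule rewrote them. The logic, extracted as a sketch based on the ipt_mangle_out() body this diff creates (error handling abbreviated):

#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv4/ip_tables.h>

static unsigned int mangle_out_sketch(struct sk_buff *skb,
                                      const struct net_device *out)
{
        const struct iphdr *iph = ip_hdr(skb);
        __be32 saddr = iph->saddr, daddr = iph->daddr;
        u_int8_t tos = iph->tos;
        u_int32_t mark = skb->mark;
        unsigned int ret;

        ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
                           dev_net(out)->ipv4.iptable_mangle);
        /* reroute for ANY change */
        if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
                iph = ip_hdr(skb);
                if (iph->saddr != saddr || iph->daddr != daddr ||
                    skb->mark != mark || iph->tos != tos)
                        if (ip_route_me_harder(skb, RTN_UNSPEC))
                                ret = NF_DROP;  /* reroute failed */
        }
        return ret;
}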
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 993edc23be09..07fb710cd722 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -5,94 +5,49 @@
5 */ 5 */
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/netfilter_ipv4/ip_tables.h> 7#include <linux/netfilter_ipv4/ip_tables.h>
8#include <linux/slab.h>
8#include <net/ip.h> 9#include <net/ip.h>
9 10
10#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) 11#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
11 12
12static const struct
13{
14 struct ipt_replace repl;
15 struct ipt_standard entries[2];
16 struct ipt_error term;
17} initial_table __net_initdata = {
18 .repl = {
19 .name = "raw",
20 .valid_hooks = RAW_VALID_HOOKS,
21 .num_entries = 3,
22 .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
23 .hook_entry = {
24 [NF_INET_PRE_ROUTING] = 0,
25 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
26 },
27 .underflow = {
28 [NF_INET_PRE_ROUTING] = 0,
29 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
30 },
31 },
32 .entries = {
33 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
34 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
35 },
36 .term = IPT_ERROR_INIT, /* ERROR */
37};
38
39static const struct xt_table packet_raw = { 13static const struct xt_table packet_raw = {
40 .name = "raw", 14 .name = "raw",
41 .valid_hooks = RAW_VALID_HOOKS, 15 .valid_hooks = RAW_VALID_HOOKS,
42 .me = THIS_MODULE, 16 .me = THIS_MODULE,
43 .af = NFPROTO_IPV4, 17 .af = NFPROTO_IPV4,
18 .priority = NF_IP_PRI_RAW,
44}; 19};
45 20
46/* The work comes in here from netfilter.c. */ 21/* The work comes in here from netfilter.c. */
47static unsigned int 22static unsigned int
48ipt_hook(unsigned int hook, 23iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
49 struct sk_buff *skb, 24 const struct net_device *in, const struct net_device *out,
50 const struct net_device *in, 25 int (*okfn)(struct sk_buff *))
51 const struct net_device *out,
52 int (*okfn)(struct sk_buff *))
53{ 26{
54 return ipt_do_table(skb, hook, in, out, 27 const struct net *net;
55 dev_net(in)->ipv4.iptable_raw);
56}
57 28
58static unsigned int 29 if (hook == NF_INET_LOCAL_OUT &&
59ipt_local_hook(unsigned int hook, 30 (skb->len < sizeof(struct iphdr) ||
60 struct sk_buff *skb, 31 ip_hdrlen(skb) < sizeof(struct iphdr)))
61 const struct net_device *in, 32 /* root is playing with raw sockets. */
62 const struct net_device *out,
63 int (*okfn)(struct sk_buff *))
64{
65 /* root is playing with raw sockets. */
66 if (skb->len < sizeof(struct iphdr) ||
67 ip_hdrlen(skb) < sizeof(struct iphdr))
68 return NF_ACCEPT; 33 return NF_ACCEPT;
69 return ipt_do_table(skb, hook, in, out, 34
70 dev_net(out)->ipv4.iptable_raw); 35 net = dev_net((in != NULL) ? in : out);
36 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
71} 37}
72 38
73/* 'raw' is the very first table. */ 39static struct nf_hook_ops *rawtable_ops __read_mostly;
74static struct nf_hook_ops ipt_ops[] __read_mostly = {
75 {
76 .hook = ipt_hook,
77 .pf = NFPROTO_IPV4,
78 .hooknum = NF_INET_PRE_ROUTING,
79 .priority = NF_IP_PRI_RAW,
80 .owner = THIS_MODULE,
81 },
82 {
83 .hook = ipt_local_hook,
84 .pf = NFPROTO_IPV4,
85 .hooknum = NF_INET_LOCAL_OUT,
86 .priority = NF_IP_PRI_RAW,
87 .owner = THIS_MODULE,
88 },
89};
90 40
91static int __net_init iptable_raw_net_init(struct net *net) 41static int __net_init iptable_raw_net_init(struct net *net)
92{ 42{
93 /* Register table */ 43 struct ipt_replace *repl;
44
45 repl = ipt_alloc_initial_table(&packet_raw);
46 if (repl == NULL)
47 return -ENOMEM;
94 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
95 ipt_register_table(net, &packet_raw, &initial_table.repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl);
96 if (IS_ERR(net->ipv4.iptable_raw)) 51 if (IS_ERR(net->ipv4.iptable_raw))
97 return PTR_ERR(net->ipv4.iptable_raw); 52 return PTR_ERR(net->ipv4.iptable_raw);
98 return 0; 53 return 0;
@@ -100,7 +55,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
100 55
101static void __net_exit iptable_raw_net_exit(struct net *net) 56static void __net_exit iptable_raw_net_exit(struct net *net)
102{ 57{
103 ipt_unregister_table(net->ipv4.iptable_raw); 58 ipt_unregister_table(net, net->ipv4.iptable_raw);
104} 59}
105 60
106static struct pernet_operations iptable_raw_net_ops = { 61static struct pernet_operations iptable_raw_net_ops = {
@@ -117,9 +72,11 @@ static int __init iptable_raw_init(void)
117 return ret; 72 return ret;
118 73
119 /* Register hooks */ 74 /* Register hooks */
120 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 75 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
121 if (ret < 0) 76 if (IS_ERR(rawtable_ops)) {
77 ret = PTR_ERR(rawtable_ops);
122 goto cleanup_table; 78 goto cleanup_table;
79 }
123 80
124 return ret; 81 return ret;
125 82
@@ -130,7 +87,7 @@ static int __init iptable_raw_init(void)
130 87
131static void __exit iptable_raw_fini(void) 88static void __exit iptable_raw_fini(void)
132{ 89{
133 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 90 xt_hook_unlink(&packet_raw, rawtable_ops);
134 unregister_pernet_subsys(&iptable_raw_net_ops); 91 unregister_pernet_subsys(&iptable_raw_net_ops);
135} 92}
136 93
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 3bd3d6388da5..be45bdc4c602 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/netfilter_ipv4/ip_tables.h> 19#include <linux/netfilter_ipv4/ip_tables.h>
20#include <linux/slab.h>
20#include <net/ip.h> 21#include <net/ip.h>
21 22
22MODULE_LICENSE("GPL"); 23MODULE_LICENSE("GPL");
@@ -27,109 +28,44 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
27 (1 << NF_INET_FORWARD) | \ 28 (1 << NF_INET_FORWARD) | \
28 (1 << NF_INET_LOCAL_OUT) 29 (1 << NF_INET_LOCAL_OUT)
29 30
30static const struct
31{
32 struct ipt_replace repl;
33 struct ipt_standard entries[3];
34 struct ipt_error term;
35} initial_table __net_initdata = {
36 .repl = {
37 .name = "security",
38 .valid_hooks = SECURITY_VALID_HOOKS,
39 .num_entries = 4,
40 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
41 .hook_entry = {
42 [NF_INET_LOCAL_IN] = 0,
43 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
44 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
45 },
46 .underflow = {
47 [NF_INET_LOCAL_IN] = 0,
48 [NF_INET_FORWARD] = sizeof(struct ipt_standard),
49 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
50 },
51 },
52 .entries = {
53 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
54 IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
55 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
56 },
57 .term = IPT_ERROR_INIT, /* ERROR */
58};
59
60static const struct xt_table security_table = { 31static const struct xt_table security_table = {
61 .name = "security", 32 .name = "security",
62 .valid_hooks = SECURITY_VALID_HOOKS, 33 .valid_hooks = SECURITY_VALID_HOOKS,
63 .me = THIS_MODULE, 34 .me = THIS_MODULE,
64 .af = NFPROTO_IPV4, 35 .af = NFPROTO_IPV4,
36 .priority = NF_IP_PRI_SECURITY,
65}; 37};
66 38
67static unsigned int 39static unsigned int
68ipt_local_in_hook(unsigned int hook, 40iptable_security_hook(unsigned int hook, struct sk_buff *skb,
69 struct sk_buff *skb, 41 const struct net_device *in,
70 const struct net_device *in, 42 const struct net_device *out,
71 const struct net_device *out, 43 int (*okfn)(struct sk_buff *))
72 int (*okfn)(struct sk_buff *))
73{
74 return ipt_do_table(skb, hook, in, out,
75 dev_net(in)->ipv4.iptable_security);
76}
77
78static unsigned int
79ipt_forward_hook(unsigned int hook,
80 struct sk_buff *skb,
81 const struct net_device *in,
82 const struct net_device *out,
83 int (*okfn)(struct sk_buff *))
84{ 44{
85 return ipt_do_table(skb, hook, in, out, 45 const struct net *net;
86 dev_net(in)->ipv4.iptable_security);
87}
88 46
89static unsigned int 47 if (hook == NF_INET_LOCAL_OUT &&
90ipt_local_out_hook(unsigned int hook, 48 (skb->len < sizeof(struct iphdr) ||
91 struct sk_buff *skb, 49 ip_hdrlen(skb) < sizeof(struct iphdr)))
92 const struct net_device *in, 50 /* Somebody is playing with raw sockets. */
93 const struct net_device *out,
94 int (*okfn)(struct sk_buff *))
95{
96 /* Somebody is playing with raw sockets. */
97 if (skb->len < sizeof(struct iphdr) ||
98 ip_hdrlen(skb) < sizeof(struct iphdr))
99 return NF_ACCEPT; 51 return NF_ACCEPT;
100 return ipt_do_table(skb, hook, in, out, 52
101 dev_net(out)->ipv4.iptable_security); 53 net = dev_net((in != NULL) ? in : out);
54 return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
102} 55}
103 56
104static struct nf_hook_ops ipt_ops[] __read_mostly = { 57static struct nf_hook_ops *sectbl_ops __read_mostly;
105 {
106 .hook = ipt_local_in_hook,
107 .owner = THIS_MODULE,
108 .pf = NFPROTO_IPV4,
109 .hooknum = NF_INET_LOCAL_IN,
110 .priority = NF_IP_PRI_SECURITY,
111 },
112 {
113 .hook = ipt_forward_hook,
114 .owner = THIS_MODULE,
115 .pf = NFPROTO_IPV4,
116 .hooknum = NF_INET_FORWARD,
117 .priority = NF_IP_PRI_SECURITY,
118 },
119 {
120 .hook = ipt_local_out_hook,
121 .owner = THIS_MODULE,
122 .pf = NFPROTO_IPV4,
123 .hooknum = NF_INET_LOCAL_OUT,
124 .priority = NF_IP_PRI_SECURITY,
125 },
126};
127 58
128static int __net_init iptable_security_net_init(struct net *net) 59static int __net_init iptable_security_net_init(struct net *net)
129{ 60{
130 net->ipv4.iptable_security = 61 struct ipt_replace *repl;
131 ipt_register_table(net, &security_table, &initial_table.repl);
132 62
63 repl = ipt_alloc_initial_table(&security_table);
64 if (repl == NULL)
65 return -ENOMEM;
66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl);
68 kfree(repl);
133 if (IS_ERR(net->ipv4.iptable_security)) 69 if (IS_ERR(net->ipv4.iptable_security))
134 return PTR_ERR(net->ipv4.iptable_security); 70 return PTR_ERR(net->ipv4.iptable_security);
135 71
@@ -138,7 +74,7 @@ static int __net_init iptable_security_net_init(struct net *net)
138 74
139static void __net_exit iptable_security_net_exit(struct net *net) 75static void __net_exit iptable_security_net_exit(struct net *net)
140{ 76{
141 ipt_unregister_table(net->ipv4.iptable_security); 77 ipt_unregister_table(net, net->ipv4.iptable_security);
142} 78}
143 79
144static struct pernet_operations iptable_security_net_ops = { 80static struct pernet_operations iptable_security_net_ops = {
@@ -154,9 +90,11 @@ static int __init iptable_security_init(void)
154 if (ret < 0) 90 if (ret < 0)
155 return ret; 91 return ret;
156 92
157 ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 93 sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
158 if (ret < 0) 94 if (IS_ERR(sectbl_ops)) {
95 ret = PTR_ERR(sectbl_ops);
159 goto cleanup_table; 96 goto cleanup_table;
97 }
160 98
161 return ret; 99 return ret;
162 100
@@ -167,7 +105,7 @@ cleanup_table:
167 105
168static void __exit iptable_security_fini(void) 106static void __exit iptable_security_fini(void)
169{ 107{
170 nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); 108 xt_hook_unlink(&security_table, sectbl_ops);
171 unregister_pernet_subsys(&iptable_security_net_ops); 109 unregister_pernet_subsys(&iptable_security_net_ops);
172} 110}
173 111
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index d171b123a656..5a03c02af999 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -22,6 +22,7 @@
22#include <net/netfilter/nf_conntrack_helper.h> 22#include <net/netfilter/nf_conntrack_helper.h>
23#include <net/netfilter/nf_conntrack_l4proto.h> 23#include <net/netfilter/nf_conntrack_l4proto.h>
24#include <net/netfilter/nf_conntrack_l3proto.h> 24#include <net/netfilter/nf_conntrack_l3proto.h>
25#include <net/netfilter/nf_conntrack_zones.h>
25#include <net/netfilter/nf_conntrack_core.h> 26#include <net/netfilter/nf_conntrack_core.h>
26#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 27#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
27#include <net/netfilter/nf_nat_helper.h> 28#include <net/netfilter/nf_nat_helper.h>
@@ -210,7 +211,7 @@ static ctl_table ip_ct_sysctl_table[] = {
210 }, 211 },
211 { 212 {
212 .procname = "ip_conntrack_buckets", 213 .procname = "ip_conntrack_buckets",
213 .data = &nf_conntrack_htable_size, 214 .data = &init_net.ct.htable_size,
214 .maxlen = sizeof(unsigned int), 215 .maxlen = sizeof(unsigned int),
215 .mode = 0444, 216 .mode = 0444,
216 .proc_handler = proc_dointvec, 217 .proc_handler = proc_dointvec,
@@ -266,7 +267,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
266 return -EINVAL; 267 return -EINVAL;
267 } 268 }
268 269
269 h = nf_conntrack_find_get(sock_net(sk), &tuple); 270 h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
270 if (h) { 271 if (h) {
271 struct sockaddr_in sin; 272 struct sockaddr_in sin;
272 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 273 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -381,32 +382,32 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
381 382
382 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); 383 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
383 if (ret < 0) { 384 if (ret < 0) {
384 printk("nf_conntrack_ipv4: can't register tcp.\n"); 385 pr_err("nf_conntrack_ipv4: can't register tcp.\n");
385 goto cleanup_sockopt; 386 goto cleanup_sockopt;
386 } 387 }
387 388
388 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); 389 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
389 if (ret < 0) { 390 if (ret < 0) {
390 printk("nf_conntrack_ipv4: can't register udp.\n"); 391 pr_err("nf_conntrack_ipv4: can't register udp.\n");
391 goto cleanup_tcp; 392 goto cleanup_tcp;
392 } 393 }
393 394
394 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); 395 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
395 if (ret < 0) { 396 if (ret < 0) {
396 printk("nf_conntrack_ipv4: can't register icmp.\n"); 397 pr_err("nf_conntrack_ipv4: can't register icmp.\n");
397 goto cleanup_udp; 398 goto cleanup_udp;
398 } 399 }
399 400
400 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); 401 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
401 if (ret < 0) { 402 if (ret < 0) {
402 printk("nf_conntrack_ipv4: can't register ipv4\n"); 403 pr_err("nf_conntrack_ipv4: can't register ipv4\n");
403 goto cleanup_icmp; 404 goto cleanup_icmp;
404 } 405 }
405 406
406 ret = nf_register_hooks(ipv4_conntrack_ops, 407 ret = nf_register_hooks(ipv4_conntrack_ops,
407 ARRAY_SIZE(ipv4_conntrack_ops)); 408 ARRAY_SIZE(ipv4_conntrack_ops));
408 if (ret < 0) { 409 if (ret < 0) {
409 printk("nf_conntrack_ipv4: can't register hooks.\n"); 410 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
410 goto cleanup_ipv4; 411 goto cleanup_ipv4;
411 } 412 }
412#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 413#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
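From here on the patch threads a conntrack zone through the lookup paths. Callers with no zone context, like the SO_ORIGINAL_DST handler above, pass NF_CT_DEFAULT_ZONE explicitly. A sketch of the zone-aware lookup under the new signature:

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>

static struct nf_conn *
lookup_sketch(struct net *net, const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_tuple_hash *h;

        /* the zone argument is new; NF_CT_DEFAULT_ZONE (0) preserves
         * the old single-namespace behaviour */
        h = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, tuple);
        return h ? nf_ct_tuplehash_to_ctrack(h) : NULL;
}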
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 8668a3defda6..244f7cb08d68 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
32 struct hlist_nulls_node *n; 32 struct hlist_nulls_node *n;
33 33
34 for (st->bucket = 0; 34 for (st->bucket = 0;
35 st->bucket < nf_conntrack_htable_size; 35 st->bucket < net->ct.htable_size;
36 st->bucket++) { 36 st->bucket++) {
37 n = rcu_dereference(net->ct.hash[st->bucket].first); 37 n = rcu_dereference(net->ct.hash[st->bucket].first);
38 if (!is_a_nulls(n)) 38 if (!is_a_nulls(n))
@@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
50 head = rcu_dereference(head->next); 50 head = rcu_dereference(head->next);
51 while (is_a_nulls(head)) { 51 while (is_a_nulls(head)) {
52 if (likely(get_nulls_value(head) == st->bucket)) { 52 if (likely(get_nulls_value(head) == st->bucket)) {
53 if (++st->bucket >= nf_conntrack_htable_size) 53 if (++st->bucket >= net->ct.htable_size)
54 return NULL; 54 return NULL;
55 } 55 }
56 head = rcu_dereference(net->ct.hash[st->bucket].first); 56 head = rcu_dereference(net->ct.hash[st->bucket].first);
@@ -336,12 +336,12 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
336 const struct ip_conntrack_stat *st = v; 336 const struct ip_conntrack_stat *st = v;
337 337
338 if (v == SEQ_START_TOKEN) { 338 if (v == SEQ_START_TOKEN) {
339 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); 339 seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
340 return 0; 340 return 0;
341 } 341 }
342 342
343 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " 343 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
344 "%08x %08x %08x %08x %08x %08x %08x %08x \n", 344 "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
345 nr_conntracks, 345 nr_conntracks,
346 st->searched, 346 st->searched,
347 st->found, 347 st->found,
@@ -358,7 +358,8 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
358 358
359 st->expect_new, 359 st->expect_new,
360 st->expect_create, 360 st->expect_create,
361 st->expect_delete 361 st->expect_delete,
362 st->search_restart
362 ); 363 );
363 return 0; 364 return 0;
364} 365}
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 7afd39b5b781..7404bde95994 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -18,6 +18,7 @@
18#include <net/netfilter/nf_conntrack_tuple.h> 18#include <net/netfilter/nf_conntrack_tuple.h>
19#include <net/netfilter/nf_conntrack_l4proto.h> 19#include <net/netfilter/nf_conntrack_l4proto.h>
20#include <net/netfilter/nf_conntrack_core.h> 20#include <net/netfilter/nf_conntrack_core.h>
21#include <net/netfilter/nf_conntrack_zones.h>
21#include <net/netfilter/nf_log.h> 22#include <net/netfilter/nf_log.h>
22 23
23static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
@@ -114,13 +115,14 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
114 115
115/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ 116/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
116static int 117static int
117icmp_error_message(struct net *net, struct sk_buff *skb, 118icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
118 enum ip_conntrack_info *ctinfo, 119 enum ip_conntrack_info *ctinfo,
119 unsigned int hooknum) 120 unsigned int hooknum)
120{ 121{
121 struct nf_conntrack_tuple innertuple, origtuple; 122 struct nf_conntrack_tuple innertuple, origtuple;
122 const struct nf_conntrack_l4proto *innerproto; 123 const struct nf_conntrack_l4proto *innerproto;
123 const struct nf_conntrack_tuple_hash *h; 124 const struct nf_conntrack_tuple_hash *h;
125 u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
124 126
125 NF_CT_ASSERT(skb->nfct == NULL); 127 NF_CT_ASSERT(skb->nfct == NULL);
126 128
@@ -146,7 +148,7 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
146 148
147 *ctinfo = IP_CT_RELATED; 149 *ctinfo = IP_CT_RELATED;
148 150
149 h = nf_conntrack_find_get(net, &innertuple); 151 h = nf_conntrack_find_get(net, zone, &innertuple);
150 if (!h) { 152 if (!h) {
151 pr_debug("icmp_error_message: no match\n"); 153 pr_debug("icmp_error_message: no match\n");
152 return -NF_ACCEPT; 154 return -NF_ACCEPT;
@@ -163,7 +165,8 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
163 165
164/* Small and modified version of icmp_rcv */ 166/* Small and modified version of icmp_rcv */
165static int 167static int
166icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, 168icmp_error(struct net *net, struct nf_conn *tmpl,
169 struct sk_buff *skb, unsigned int dataoff,
167 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) 170 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
168{ 171{
169 const struct icmphdr *icmph; 172 const struct icmphdr *icmph;
@@ -208,7 +211,7 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
208 icmph->type != ICMP_REDIRECT) 211 icmph->type != ICMP_REDIRECT)
209 return NF_ACCEPT; 212 return NF_ACCEPT;
210 213
211 return icmp_error_message(net, skb, ctinfo, hooknum); 214 return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
212} 215}
213 216
214#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 217#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
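The ICMP error path now receives the template conntrack (tmpl), so the embedded tuple is looked up in the zone the original flow belongs to; the derivation the diff adds reduces to a one-liner:

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_zones.h>

/* zone used for matching an ICMP error against its original flow */
static inline u16 icmp_error_zone(const struct nf_conn *tmpl)
{
        return tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
}

/* usage, as in icmp_error_message() above:
 *   h = nf_conntrack_find_get(net, icmp_error_zone(tmpl), &innertuple);
 */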
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 331ead3ebd1b..f3a9b42b16c6 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -17,6 +17,10 @@
17#include <linux/netfilter_bridge.h> 17#include <linux/netfilter_bridge.h>
18#include <linux/netfilter_ipv4.h> 18#include <linux/netfilter_ipv4.h>
19#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 19#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
20#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
21#include <net/netfilter/nf_conntrack.h>
22#endif
23#include <net/netfilter/nf_conntrack_zones.h>
20 24
21/* Returns new sk_buff, or NULL */ 25/* Returns new sk_buff, or NULL */
22static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) 26static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -38,15 +42,22 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
38static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, 42static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
39 struct sk_buff *skb) 43 struct sk_buff *skb)
40{ 44{
45 u16 zone = NF_CT_DEFAULT_ZONE;
46
47#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
48 if (skb->nfct)
49 zone = nf_ct_zone((struct nf_conn *)skb->nfct);
50#endif
51
41#ifdef CONFIG_BRIDGE_NETFILTER 52#ifdef CONFIG_BRIDGE_NETFILTER
42 if (skb->nf_bridge && 53 if (skb->nf_bridge &&
43 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) 54 skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
44 return IP_DEFRAG_CONNTRACK_BRIDGE_IN; 55 return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
45#endif 56#endif
46 if (hooknum == NF_INET_PRE_ROUTING) 57 if (hooknum == NF_INET_PRE_ROUTING)
47 return IP_DEFRAG_CONNTRACK_IN; 58 return IP_DEFRAG_CONNTRACK_IN + zone;
48 else 59 else
49 return IP_DEFRAG_CONNTRACK_OUT; 60 return IP_DEFRAG_CONNTRACK_OUT + zone;
50} 61}
51 62
52static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, 63static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
@@ -55,11 +66,18 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
55 const struct net_device *out, 66 const struct net_device *out,
56 int (*okfn)(struct sk_buff *)) 67 int (*okfn)(struct sk_buff *))
57{ 68{
69 struct sock *sk = skb->sk;
70 struct inet_sock *inet = inet_sk(skb->sk);
71
72 if (sk && (sk->sk_family == PF_INET) &&
73 inet->nodefrag)
74 return NF_ACCEPT;
75
58#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 76#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
59#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) 77#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
60 /* Previously seen (loopback)? Ignore. Do this before 78 /* Previously seen (loopback)? Ignore. Do this before
61 fragment check. */ 79 fragment check. */
62 if (skb->nfct) 80 if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
63 return NF_ACCEPT; 81 return NF_ACCEPT;
64#endif 82#endif
65#endif 83#endif
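Two behavioural points in the defrag hunks deserve spelling out: a PF_INET socket can now opt out of defragmentation via the nodefrag flag, and the zone is folded into the defrag "user" id, so fragments from different zones can never be reassembled into one datagram (this assumes the ip_defrag_users enum reserves a range of values per user, which this series arranges elsewhere). The user derivation in isolation:

#include <linux/netfilter.h>
#include <linux/types.h>
#include <net/ip.h>

static enum ip_defrag_users
defrag_user_sketch(unsigned int hooknum, u16 zone, bool bridged)
{
#ifdef CONFIG_BRIDGE_NETFILTER
        if (bridged)
                return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
#endif
        if (hooknum == NF_INET_PRE_ROUTING)
                return IP_DEFRAG_CONNTRACK_IN + zone;
        return IP_DEFRAG_CONNTRACK_OUT + zone;
}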
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index fe1a64479dd0..8c8632d9b93c 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -12,6 +12,7 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/timer.h> 13#include <linux/timer.h>
14#include <linux/skbuff.h> 14#include <linux/skbuff.h>
15#include <linux/gfp.h>
15#include <net/checksum.h> 16#include <net/checksum.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -30,14 +31,12 @@
30#include <net/netfilter/nf_conntrack_helper.h> 31#include <net/netfilter/nf_conntrack_helper.h>
31#include <net/netfilter/nf_conntrack_l3proto.h> 32#include <net/netfilter/nf_conntrack_l3proto.h>
32#include <net/netfilter/nf_conntrack_l4proto.h> 33#include <net/netfilter/nf_conntrack_l4proto.h>
34#include <net/netfilter/nf_conntrack_zones.h>
33 35
34static DEFINE_SPINLOCK(nf_nat_lock); 36static DEFINE_SPINLOCK(nf_nat_lock);
35 37
36static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
37 39
38/* Calculated at init based on memory size */
39static unsigned int nf_nat_htable_size __read_mostly;
40
41#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
42static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
43 __read_mostly; 42 __read_mostly;
@@ -72,15 +71,16 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put);
72 71
73/* We keep an extra hash for each conntrack, for fast searching. */ 72/* We keep an extra hash for each conntrack, for fast searching. */
74static inline unsigned int 73static inline unsigned int
75hash_by_src(const struct nf_conntrack_tuple *tuple) 74hash_by_src(const struct net *net, u16 zone,
75 const struct nf_conntrack_tuple *tuple)
76{ 76{
77 unsigned int hash; 77 unsigned int hash;
78 78
79 /* Original src, to ensure we map it consistently if poss. */ 79 /* Original src, to ensure we map it consistently if poss. */
80 hash = jhash_3words((__force u32)tuple->src.u3.ip, 80 hash = jhash_3words((__force u32)tuple->src.u3.ip,
81 (__force u32)tuple->src.u.all, 81 (__force u32)tuple->src.u.all ^ zone,
82 tuple->dst.protonum, 0); 82 tuple->dst.protonum, 0);
83 return ((u64)hash * nf_nat_htable_size) >> 32; 83 return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
84} 84}
85 85
86/* Is this tuple already taken? (not by us) */ 86/* Is this tuple already taken? (not by us) */
@@ -142,12 +142,12 @@ same_src(const struct nf_conn *ct,
142 142
143/* Only called for SRC manip */ 143/* Only called for SRC manip */
144static int 144static int
145find_appropriate_src(struct net *net, 145find_appropriate_src(struct net *net, u16 zone,
146 const struct nf_conntrack_tuple *tuple, 146 const struct nf_conntrack_tuple *tuple,
147 struct nf_conntrack_tuple *result, 147 struct nf_conntrack_tuple *result,
148 const struct nf_nat_range *range) 148 const struct nf_nat_range *range)
149{ 149{
150 unsigned int h = hash_by_src(tuple); 150 unsigned int h = hash_by_src(net, zone, tuple);
151 const struct nf_conn_nat *nat; 151 const struct nf_conn_nat *nat;
152 const struct nf_conn *ct; 152 const struct nf_conn *ct;
153 const struct hlist_node *n; 153 const struct hlist_node *n;
@@ -155,7 +155,7 @@ find_appropriate_src(struct net *net,
155 rcu_read_lock(); 155 rcu_read_lock();
156 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { 156 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
157 ct = nat->ct; 157 ct = nat->ct;
158 if (same_src(ct, tuple)) { 158 if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
159 /* Copy source part from reply tuple. */ 159 /* Copy source part from reply tuple. */
160 nf_ct_invert_tuplepr(result, 160 nf_ct_invert_tuplepr(result,
161 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 161 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -178,7 +178,7 @@ find_appropriate_src(struct net *net,
178 the ip with the lowest src-ip/dst-ip/proto usage. 178 the ip with the lowest src-ip/dst-ip/proto usage.
179*/ 179*/
180static void 180static void
181find_best_ips_proto(struct nf_conntrack_tuple *tuple, 181find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
182 const struct nf_nat_range *range, 182 const struct nf_nat_range *range,
183 const struct nf_conn *ct, 183 const struct nf_conn *ct,
184 enum nf_nat_manip_type maniptype) 184 enum nf_nat_manip_type maniptype)
@@ -212,7 +212,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple,
212 maxip = ntohl(range->max_ip); 212 maxip = ntohl(range->max_ip);
213 j = jhash_2words((__force u32)tuple->src.u3.ip, 213 j = jhash_2words((__force u32)tuple->src.u3.ip,
214 range->flags & IP_NAT_RANGE_PERSISTENT ? 214 range->flags & IP_NAT_RANGE_PERSISTENT ?
215 0 : (__force u32)tuple->dst.u3.ip, 0); 215 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
216 j = ((u64)j * (maxip - minip + 1)) >> 32; 216 j = ((u64)j * (maxip - minip + 1)) >> 32;
217 *var_ipp = htonl(minip + j); 217 *var_ipp = htonl(minip + j);
218} 218}
@@ -232,6 +232,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
232{ 232{
233 struct net *net = nf_ct_net(ct); 233 struct net *net = nf_ct_net(ct);
234 const struct nf_nat_protocol *proto; 234 const struct nf_nat_protocol *proto;
235 u16 zone = nf_ct_zone(ct);
235 236
236 /* 1) If this srcip/proto/src-proto-part is currently mapped, 237 /* 1) If this srcip/proto/src-proto-part is currently mapped,
237 and that same mapping gives a unique tuple within the given 238 and that same mapping gives a unique tuple within the given
@@ -242,7 +243,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
242 manips not an issue. */ 243 manips not an issue. */
243 if (maniptype == IP_NAT_MANIP_SRC && 244 if (maniptype == IP_NAT_MANIP_SRC &&
244 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 245 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
245 if (find_appropriate_src(net, orig_tuple, tuple, range)) { 246 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
246 pr_debug("get_unique_tuple: Found current src map\n"); 247 pr_debug("get_unique_tuple: Found current src map\n");
247 if (!nf_nat_used_tuple(tuple, ct)) 248 if (!nf_nat_used_tuple(tuple, ct))
248 return; 249 return;
@@ -252,7 +253,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
252 /* 2) Select the least-used IP/proto combination in the given 253 /* 2) Select the least-used IP/proto combination in the given
253 range. */ 254 range. */
254 *tuple = *orig_tuple; 255 *tuple = *orig_tuple;
255 find_best_ips_proto(tuple, range, ct, maniptype); 256 find_best_ips_proto(zone, tuple, range, ct, maniptype);
256 257
257 /* 3) The per-protocol part of the manip is made to map into 258 /* 3) The per-protocol part of the manip is made to map into
258 the range to make a unique tuple. */ 259 the range to make a unique tuple. */
@@ -260,14 +261,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
260 rcu_read_lock(); 261 rcu_read_lock();
261 proto = __nf_nat_proto_find(orig_tuple->dst.protonum); 262 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
262 263
263 /* Change protocol info to have some randomization */
264 if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
265 proto->unique_tuple(tuple, range, maniptype, ct);
266 goto out;
267 }
268
269 /* Only bother mapping if it's not already in range and unique */ 264 /* Only bother mapping if it's not already in range and unique */
270 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || 265 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
266 (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
271 proto->in_range(tuple, maniptype, &range->min, &range->max)) && 267 proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
272 !nf_nat_used_tuple(tuple, ct)) 268 !nf_nat_used_tuple(tuple, ct))
273 goto out; 269 goto out;
@@ -330,7 +326,8 @@ nf_nat_setup_info(struct nf_conn *ct,
330 if (have_to_hash) { 326 if (have_to_hash) {
331 unsigned int srchash; 327 unsigned int srchash;
332 328
333 srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 329 srchash = hash_by_src(net, nf_ct_zone(ct),
330 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
334 spin_lock_bh(&nf_nat_lock); 331 spin_lock_bh(&nf_nat_lock);
335 /* nf_conntrack_alter_reply might re-allocate extension area */ 332 /* nf_conntrack_alter_reply might re-allocate extension area */
336 nat = nfct_nat(ct); 333 nat = nfct_nat(ct);
@@ -438,7 +435,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
438 if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) 435 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
439 return 0; 436 return 0;
440 437
441 inside = (void *)skb->data + ip_hdrlen(skb); 438 inside = (void *)skb->data + hdrlen;
442 439
443 /* We're actually going to mangle it beyond trivial checksum 440 /* We're actually going to mangle it beyond trivial checksum
444 adjustment, so make sure the current checksum is correct. */ 441 adjustment, so make sure the current checksum is correct. */
@@ -468,12 +465,10 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
468 /* rcu_read_lock()ed by nf_hook_slow */ 465 /* rcu_read_lock()ed by nf_hook_slow */
469 l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); 466 l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
470 467
471 if (!nf_ct_get_tuple(skb, 468 if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
472 ip_hdrlen(skb) + sizeof(struct icmphdr), 469 (hdrlen +
473 (ip_hdrlen(skb) +
474 sizeof(struct icmphdr) + inside->ip.ihl * 4), 470 sizeof(struct icmphdr) + inside->ip.ihl * 4),
475 (u_int16_t)AF_INET, 471 (u_int16_t)AF_INET, inside->ip.protocol,
476 inside->ip.protocol,
477 &inner, l3proto, l4proto)) 472 &inner, l3proto, l4proto))
478 return 0; 473 return 0;
479 474
@@ -482,15 +477,13 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
482 pass all hooks (locally-generated ICMP). Consider incoming 477 pass all hooks (locally-generated ICMP). Consider incoming
483 packet: PREROUTING (DST manip), routing produces ICMP, goes 478 packet: PREROUTING (DST manip), routing produces ICMP, goes
484 through POSTROUTING (which must correct the DST manip). */ 479 through POSTROUTING (which must correct the DST manip). */
485 if (!manip_pkt(inside->ip.protocol, skb, 480 if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
486 ip_hdrlen(skb) + sizeof(inside->icmp), 481 &ct->tuplehash[!dir].tuple, !manip))
487 &ct->tuplehash[!dir].tuple,
488 !manip))
489 return 0; 482 return 0;
490 483
491 if (skb->ip_summed != CHECKSUM_PARTIAL) { 484 if (skb->ip_summed != CHECKSUM_PARTIAL) {
492 /* Reloading "inside" here since manip_pkt inner. */ 485 /* Reloading "inside" here since manip_pkt inner. */
493 inside = (void *)skb->data + ip_hdrlen(skb); 486 inside = (void *)skb->data + hdrlen;
494 inside->icmp.checksum = 0; 487 inside->icmp.checksum = 0;
495 inside->icmp.checksum = 488 inside->icmp.checksum =
496 csum_fold(skb_checksum(skb, hdrlen, 489 csum_fold(skb_checksum(skb, hdrlen,
@@ -679,8 +672,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
679 672
680static int __net_init nf_nat_net_init(struct net *net) 673static int __net_init nf_nat_net_init(struct net *net)
681{ 674{
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 675 /* Leave them the same for the moment. */
683 &net->ipv4.nat_vmalloced, 0); 676 net->ipv4.nat_htable_size = net->ct.htable_size;
677 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
678 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 679 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 680 return -ENOMEM;
686 return 0; 681 return 0;
@@ -703,7 +698,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 698 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 699 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 700 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
706 nf_nat_htable_size); 701 net->ipv4.nat_htable_size);
707} 702}
708 703
709static struct pernet_operations nf_nat_net_ops = { 704static struct pernet_operations nf_nat_net_ops = {
@@ -724,9 +719,6 @@ static int __init nf_nat_init(void)
724 return ret; 719 return ret;
725 } 720 }
726 721
727 /* Leave them the same for the moment. */
728 nf_nat_htable_size = nf_conntrack_htable_size;
729
730 ret = register_pernet_subsys(&nf_nat_net_ops); 722 ret = register_pernet_subsys(&nf_nat_net_ops);
731 if (ret < 0) 723 if (ret < 0)
732 goto cleanup_extend; 724 goto cleanup_extend;
@@ -741,7 +733,7 @@ static int __init nf_nat_init(void)
741 spin_unlock_bh(&nf_nat_lock); 733 spin_unlock_bh(&nf_nat_lock);
742 734
743 /* Initialize fake conntrack so that NAT will skip it */ 735 /* Initialize fake conntrack so that NAT will skip it */
744 nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; 736 nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
745 737
746 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); 738 l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
747 739
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index a1d5d58a58bf..86e0e84ff0a0 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -27,76 +27,29 @@ MODULE_ALIAS("ip_nat_ftp");
27 27
28/* FIXME: Time out? --RR */ 28/* FIXME: Time out? --RR */
29 29
30static int 30static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
31mangle_rfc959_packet(struct sk_buff *skb, 31 char *buffer, size_t buflen,
32 __be32 newip, 32 __be32 addr, u16 port)
33 u_int16_t port,
34 unsigned int matchoff,
35 unsigned int matchlen,
36 struct nf_conn *ct,
37 enum ip_conntrack_info ctinfo)
38{ 33{
39 char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; 34 switch (type) {
40 35 case NF_CT_FTP_PORT:
41 sprintf(buffer, "%u,%u,%u,%u,%u,%u", 36 case NF_CT_FTP_PASV:
42 NIPQUAD(newip), port>>8, port&0xFF); 37 return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
43 38 ((unsigned char *)&addr)[0],
44 pr_debug("calling nf_nat_mangle_tcp_packet\n"); 39 ((unsigned char *)&addr)[1],
45 40 ((unsigned char *)&addr)[2],
46 return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, 41 ((unsigned char *)&addr)[3],
47 matchlen, buffer, strlen(buffer)); 42 port >> 8,
48} 43 port & 0xFF);
49 44 case NF_CT_FTP_EPRT:
50/* |1|132.235.1.2|6275| */ 45 return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
51static int 46 case NF_CT_FTP_EPSV:
52mangle_eprt_packet(struct sk_buff *skb, 47 return snprintf(buffer, buflen, "|||%u|", port);
53 __be32 newip, 48 }
54 u_int16_t port,
55 unsigned int matchoff,
56 unsigned int matchlen,
57 struct nf_conn *ct,
58 enum ip_conntrack_info ctinfo)
59{
60 char buffer[sizeof("|1|255.255.255.255|65535|")];
61
62 sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
63
64 pr_debug("calling nf_nat_mangle_tcp_packet\n");
65
66 return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
67 matchlen, buffer, strlen(buffer));
68}
69
70/* |1|132.235.1.2|6275| */
71static int
72mangle_epsv_packet(struct sk_buff *skb,
73 __be32 newip,
74 u_int16_t port,
75 unsigned int matchoff,
76 unsigned int matchlen,
77 struct nf_conn *ct,
78 enum ip_conntrack_info ctinfo)
79{
80 char buffer[sizeof("|||65535|")];
81
82 sprintf(buffer, "|||%u|", port);
83
84 pr_debug("calling nf_nat_mangle_tcp_packet\n");
85 49
86 return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, 50 return 0;
87 matchlen, buffer, strlen(buffer));
88} 51}
89 52
90static int (*mangle[])(struct sk_buff *, __be32, u_int16_t,
91 unsigned int, unsigned int, struct nf_conn *,
92 enum ip_conntrack_info)
93= {
94 [NF_CT_FTP_PORT] = mangle_rfc959_packet,
95 [NF_CT_FTP_PASV] = mangle_rfc959_packet,
96 [NF_CT_FTP_EPRT] = mangle_eprt_packet,
97 [NF_CT_FTP_EPSV] = mangle_epsv_packet
98};
99
100/* So, this packet has hit the connection tracking matching code. 53/* So, this packet has hit the connection tracking matching code.
101 Mangle it, and change the expectation to match the new version. */ 54 Mangle it, and change the expectation to match the new version. */
102static unsigned int nf_nat_ftp(struct sk_buff *skb, 55static unsigned int nf_nat_ftp(struct sk_buff *skb,
@@ -110,6 +63,8 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
110 u_int16_t port; 63 u_int16_t port;
111 int dir = CTINFO2DIR(ctinfo); 64 int dir = CTINFO2DIR(ctinfo);
112 struct nf_conn *ct = exp->master; 65 struct nf_conn *ct = exp->master;
66 char buffer[sizeof("|1|255.255.255.255|65535|")];
67 unsigned int buflen;
113 68
114 pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); 69 pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
115 70
@@ -132,11 +87,21 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
132 if (port == 0) 87 if (port == 0)
133 return NF_DROP; 88 return NF_DROP;
134 89
135 if (!mangle[type](skb, newip, port, matchoff, matchlen, ct, ctinfo)) { 90 buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
136 nf_ct_unexpect_related(exp); 91 if (!buflen)
137 return NF_DROP; 92 goto out;
138 } 93
94 pr_debug("calling nf_nat_mangle_tcp_packet\n");
95
96 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
97 matchlen, buffer, buflen))
98 goto out;
99
139 return NF_ACCEPT; 100 return NF_ACCEPT;
101
102out:
103 nf_ct_unexpect_related(exp);
104 return NF_DROP;
140} 105}
141 106
142static void __exit nf_nat_ftp_fini(void) 107static void __exit nf_nat_ftp_fini(void)
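
The three near-identical mangle_*_packet() helpers collapse into one formatter, nf_nat_ftp_fmt_cmd(), plus a single nf_nat_mangle_tcp_packet() call; snprintf also replaces sprintf, bounding writes to the caller's buffer. A userspace model of the formatter (enum and function names are illustrative; addr is IPv4 in network byte order, so indexing its bytes yields dotted-quad order on either endianness):

	#include <stdint.h>
	#include <stdio.h>

	enum ftp_cmd { CMD_PORT, CMD_PASV, CMD_EPRT, CMD_EPSV };

	static int fmt_cmd(enum ftp_cmd type, char *buf, size_t len,
			   uint32_t addr, uint16_t port)
	{
		const unsigned char *a = (const unsigned char *)&addr;

		switch (type) {
		case CMD_PORT:
		case CMD_PASV:	/* "h1,h2,h3,h4,p/256,p%256" per RFC 959 */
			return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
					a[0], a[1], a[2], a[3],
					port >> 8, port & 0xFF);
		case CMD_EPRT:	/* "|1|a.b.c.d|port|" per RFC 2428 */
			return snprintf(buf, len, "|1|%u.%u.%u.%u|%u|",
					a[0], a[1], a[2], a[3], port);
		case CMD_EPSV:	/* "|||port|" */
			return snprintf(buf, len, "|||%u|", port);
		}
		return 0;	/* unknown type: caller drops the packet */
	}

	int main(void)
	{
		char buf[sizeof("|1|255.255.255.255|65535|")];

		fmt_cmd(CMD_EPRT, buf, sizeof(buf), 0x0201eb84u, 6275);
		puts(buf);	/* "|1|132.235.1.2|6275|" on little endian */
		return 0;
	}
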
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 7e8e6fc75413..5045196d853c 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/tcp.h> 13#include <linux/tcp.h>
15#include <net/tcp.h> 14#include <net/tcp.h>
16 15
@@ -44,7 +43,7 @@ static int set_addr(struct sk_buff *skb,
44 addroff, sizeof(buf), 43 addroff, sizeof(buf),
45 (char *) &buf, sizeof(buf))) { 44 (char *) &buf, sizeof(buf))) {
46 if (net_ratelimit()) 45 if (net_ratelimit())
47 printk("nf_nat_h323: nf_nat_mangle_tcp_packet" 46 pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
48 " error\n"); 47 " error\n");
49 return -1; 48 return -1;
50 } 49 }
@@ -60,7 +59,7 @@ static int set_addr(struct sk_buff *skb,
60 addroff, sizeof(buf), 59 addroff, sizeof(buf),
61 (char *) &buf, sizeof(buf))) { 60 (char *) &buf, sizeof(buf))) {
62 if (net_ratelimit()) 61 if (net_ratelimit())
63 printk("nf_nat_h323: nf_nat_mangle_udp_packet" 62 pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
64 " error\n"); 63 " error\n");
65 return -1; 64 return -1;
66 } 65 }
@@ -216,7 +215,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
216 /* Run out of expectations */ 215 /* Run out of expectations */
217 if (i >= H323_RTP_CHANNEL_MAX) { 216 if (i >= H323_RTP_CHANNEL_MAX) {
218 if (net_ratelimit()) 217 if (net_ratelimit())
219 printk("nf_nat_h323: out of expectations\n"); 218 pr_notice("nf_nat_h323: out of expectations\n");
220 return 0; 219 return 0;
221 } 220 }
222 221
@@ -235,7 +234,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
235 234
236 if (nated_port == 0) { /* No port available */ 235 if (nated_port == 0) { /* No port available */
237 if (net_ratelimit()) 236 if (net_ratelimit())
238 printk("nf_nat_h323: out of RTP ports\n"); 237 pr_notice("nf_nat_h323: out of RTP ports\n");
239 return 0; 238 return 0;
240 } 239 }
241 240
@@ -292,7 +291,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
292 291
293 if (nated_port == 0) { /* No port available */ 292 if (nated_port == 0) { /* No port available */
294 if (net_ratelimit()) 293 if (net_ratelimit())
295 printk("nf_nat_h323: out of TCP ports\n"); 294 pr_notice("nf_nat_h323: out of TCP ports\n");
296 return 0; 295 return 0;
297 } 296 }
298 297
@@ -342,7 +341,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
342 341
343 if (nated_port == 0) { /* No port available */ 342 if (nated_port == 0) { /* No port available */
344 if (net_ratelimit()) 343 if (net_ratelimit())
345 printk("nf_nat_q931: out of TCP ports\n"); 344 pr_notice("nf_nat_q931: out of TCP ports\n");
346 return 0; 345 return 0;
347 } 346 }
348 347
@@ -426,7 +425,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
426 425
427 if (nated_port == 0) { /* No port available */ 426 if (nated_port == 0) { /* No port available */
428 if (net_ratelimit()) 427 if (net_ratelimit())
429 printk("nf_nat_ras: out of TCP ports\n"); 428 pr_notice("nf_nat_ras: out of TCP ports\n");
430 return 0; 429 return 0;
431 } 430 }
432 431
@@ -508,7 +507,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
508 507
509 if (nated_port == 0) { /* No port available */ 508 if (nated_port == 0) { /* No port available */
510 if (net_ratelimit()) 509 if (net_ratelimit())
511 printk("nf_nat_q931: out of TCP ports\n"); 510 pr_notice("nf_nat_q931: out of TCP ports\n");
512 return 0; 511 return 0;
513 } 512 }
514 513
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 7f10a6be0191..4a0c6b548eee 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -8,6 +8,7 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/gfp.h>
11#include <linux/kmod.h> 12#include <linux/kmod.h>
12#include <linux/types.h> 13#include <linux/types.h>
13#include <linux/timer.h> 14#include <linux/timer.h>
@@ -141,6 +142,17 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
141 return 1; 142 return 1;
142} 143}
143 144
145void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
146 __be32 seq, s16 off)
147{
148 if (!off)
149 return;
150 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
151 adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
152 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155
144/* Generic function for mangling variable-length address changes inside 156/* Generic function for mangling variable-length address changes inside
145 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX 157 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
146 * command in FTP). 158 * command in FTP).
@@ -149,14 +161,13 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
149 * skb enlargement, ... 161 * skb enlargement, ...
150 * 162 *
151 * */ 163 * */
152int 164int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
153nf_nat_mangle_tcp_packet(struct sk_buff *skb, 165 struct nf_conn *ct,
154 struct nf_conn *ct, 166 enum ip_conntrack_info ctinfo,
155 enum ip_conntrack_info ctinfo, 167 unsigned int match_offset,
156 unsigned int match_offset, 168 unsigned int match_len,
157 unsigned int match_len, 169 const char *rep_buffer,
158 const char *rep_buffer, 170 unsigned int rep_len, bool adjust)
159 unsigned int rep_len)
160{ 171{
161 struct rtable *rt = skb_rtable(skb); 172 struct rtable *rt = skb_rtable(skb);
162 struct iphdr *iph; 173 struct iphdr *iph;
@@ -202,16 +213,13 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
202 inet_proto_csum_replace2(&tcph->check, skb, 213 inet_proto_csum_replace2(&tcph->check, skb,
203 htons(oldlen), htons(datalen), 1); 214 htons(oldlen), htons(datalen), 1);
204 215
205 if (rep_len != match_len) { 216 if (adjust && rep_len != match_len)
206 set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); 217 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
207 adjust_tcp_sequence(ntohl(tcph->seq), 218 (int)rep_len - (int)match_len);
208 (int)rep_len - (int)match_len, 219
209 ct, ctinfo);
210 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
211 }
212 return 1; 220 return 1;
213} 221}
214EXPORT_SYMBOL(nf_nat_mangle_tcp_packet); 222EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
215 223
216/* Generic function for mangling variable-length address changes inside 224/* Generic function for mangling variable-length address changes inside
217 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX 225 * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
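
The helper split above separates payload mangling from TCP sequence bookkeeping: __nf_nat_mangle_tcp_packet() gains an adjust flag, and the bookkeeping itself moves into nf_nat_set_seq_adjust(), which is a no-op for a zero offset. A caller that rewrites a TCP payload in several steps can now pass adjust=false each time and record one net offset at the end, which is what the SIP changes further down rely on. A hedged sketch of such a caller (tcph and total_off are hypothetical surrounding state; the function names are from this patch):

	if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, matchlen,
					buffer, buflen, false /* defer */))
		return NF_DROP;
	/* ... further rewrites may grow or shrink the payload ... */
	nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, total_off);
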
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 9eb171056c63..4c060038d29f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -25,6 +25,7 @@
25#include <net/netfilter/nf_nat_rule.h> 25#include <net/netfilter/nf_nat_rule.h>
26#include <net/netfilter/nf_conntrack_helper.h> 26#include <net/netfilter/nf_conntrack_helper.h>
27#include <net/netfilter/nf_conntrack_expect.h> 27#include <net/netfilter/nf_conntrack_expect.h>
28#include <net/netfilter/nf_conntrack_zones.h>
28#include <linux/netfilter/nf_conntrack_proto_gre.h> 29#include <linux/netfilter/nf_conntrack_proto_gre.h>
29#include <linux/netfilter/nf_conntrack_pptp.h> 30#include <linux/netfilter/nf_conntrack_pptp.h>
30 31
@@ -74,7 +75,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
74 75
75 pr_debug("trying to unexpect other dir: "); 76 pr_debug("trying to unexpect other dir: ");
76 nf_ct_dump_tuple_ip(&t); 77 nf_ct_dump_tuple_ip(&t);
77 other_exp = nf_ct_expect_find_get(net, &t); 78 other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
78 if (other_exp) { 79 if (other_exp) {
79 nf_ct_unexpect_related(other_exp); 80 nf_ct_unexpect_related(other_exp);
80 nf_ct_expect_put(other_exp); 81 nf_ct_expect_put(other_exp);
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 6c4f11f51446..3e61faf23a9a 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
34} 34}
35EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); 35EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
36 36
37bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, 37void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
38 const struct nf_nat_range *range, 38 const struct nf_nat_range *range,
39 enum nf_nat_manip_type maniptype, 39 enum nf_nat_manip_type maniptype,
40 const struct nf_conn *ct, 40 const struct nf_conn *ct,
@@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
53 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { 53 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
54 /* If it's dst rewrite, can't change port */ 54 /* If it's dst rewrite, can't change port */
55 if (maniptype == IP_NAT_MANIP_DST) 55 if (maniptype == IP_NAT_MANIP_DST)
56 return false; 56 return;
57 57
58 if (ntohs(*portptr) < 1024) { 58 if (ntohs(*portptr) < 1024) {
59 /* Loose convention: >> 512 is credential passing */ 59 /* Loose convention: >> 512 is credential passing */
@@ -81,15 +81,15 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
81 else 81 else
82 off = *rover; 82 off = *rover;
83 83
84 for (i = 0; i < range_size; i++, off++) { 84 for (i = 0; ; ++off) {
85 *portptr = htons(min + off % range_size); 85 *portptr = htons(min + off % range_size);
86 if (nf_nat_used_tuple(tuple, ct)) 86 if (++i != range_size && nf_nat_used_tuple(tuple, ct))
87 continue; 87 continue;
88 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) 88 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
89 *rover = off; 89 *rover = off;
90 return true; 90 return;
91 } 91 }
92 return false; 92 return;
93} 93}
94EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); 94EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
95 95
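
The port-search loop above changes shape: instead of returning false when every candidate in the range is busy, the final candidate is kept anyway and the function becomes void; a clashing tuple is now considered preferable to no mapping at all. The GRE and ICMP searches below get the same rewrite. A standalone C model of the new loop (function and callback names are illustrative):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Walk the range starting at *rover; if every value is busy,
	 * keep the last candidate tried regardless. */
	static uint16_t pick(uint16_t min, uint16_t range_size,
			     uint16_t *rover, bool (*used)(uint16_t))
	{
		uint16_t off = *rover;
		unsigned int i;

		for (i = 0; ; ++off) {
			uint16_t cand = min + off % range_size;

			if (++i != range_size && used(cand))
				continue;	/* busy, range not exhausted */
			*rover = off;		/* remember where we stopped */
			return cand;
		}
	}

	static bool always_busy(uint16_t p) { (void)p; return true; }

	int main(void)
	{
		uint16_t rover = 0;

		/* All 16 candidates busy: still yields one (prints 10015). */
		printf("%u\n", pick(10000, 16, &rover, always_busy));
		return 0;
	}
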
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 22485ce306d4..570faf2667b2 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -22,14 +22,14 @@
22 22
23static u_int16_t dccp_port_rover; 23static u_int16_t dccp_port_rover;
24 24
25static bool 25static void
26dccp_unique_tuple(struct nf_conntrack_tuple *tuple, 26dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
27 const struct nf_nat_range *range, 27 const struct nf_nat_range *range,
28 enum nf_nat_manip_type maniptype, 28 enum nf_nat_manip_type maniptype,
29 const struct nf_conn *ct) 29 const struct nf_conn *ct)
30{ 30{
31 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 31 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
32 &dccp_port_rover); 32 &dccp_port_rover);
33} 33}
34 34
35static bool 35static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index d7e89201351e..bc8d83a31c73 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
37MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); 37MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
38 38
39/* generate unique tuple ... */ 39/* generate unique tuple ... */
40static bool 40static void
41gre_unique_tuple(struct nf_conntrack_tuple *tuple, 41gre_unique_tuple(struct nf_conntrack_tuple *tuple,
42 const struct nf_nat_range *range, 42 const struct nf_nat_range *range,
43 enum nf_nat_manip_type maniptype, 43 enum nf_nat_manip_type maniptype,
@@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
50 /* If there is no master conntrack we are not PPTP, 50 /* If there is no master conntrack we are not PPTP,
51 do not change tuples */ 51 do not change tuples */
52 if (!ct->master) 52 if (!ct->master)
53 return false; 53 return;
54 54
55 if (maniptype == IP_NAT_MANIP_SRC) 55 if (maniptype == IP_NAT_MANIP_SRC)
56 keyptr = &tuple->src.u.gre.key; 56 keyptr = &tuple->src.u.gre.key;
@@ -68,14 +68,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
68 68
69 pr_debug("min = %u, range_size = %u\n", min, range_size); 69 pr_debug("min = %u, range_size = %u\n", min, range_size);
70 70
71 for (i = 0; i < range_size; i++, key++) { 71 for (i = 0; ; ++key) {
72 *keyptr = htons(min + key % range_size); 72 *keyptr = htons(min + key % range_size);
73 if (!nf_nat_used_tuple(tuple, ct)) 73 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
74 return true; 74 return;
75 } 75 }
76 76
77 pr_debug("%p: no NAT mapping\n", ct); 77 pr_debug("%p: no NAT mapping\n", ct);
78 return false; 78 return;
79} 79}
80 80
81/* manipulate a GRE packet according to maniptype */ 81/* manipulate a GRE packet according to maniptype */
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 19a8b0b07d8e..5744c3ec847c 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
27 ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); 27 ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
28} 28}
29 29
30static bool 30static void
31icmp_unique_tuple(struct nf_conntrack_tuple *tuple, 31icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
32 const struct nf_nat_range *range, 32 const struct nf_nat_range *range,
33 enum nf_nat_manip_type maniptype, 33 enum nf_nat_manip_type maniptype,
@@ -42,13 +42,13 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
42 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) 42 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
43 range_size = 0xFFFF; 43 range_size = 0xFFFF;
44 44
45 for (i = 0; i < range_size; i++, id++) { 45 for (i = 0; ; ++id) {
46 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + 46 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
47 (id % range_size)); 47 (id % range_size));
48 if (!nf_nat_used_tuple(tuple, ct)) 48 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
49 return true; 49 return;
50 } 50 }
51 return false; 51 return;
52} 52}
53 53
54static bool 54static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 3fc598eeeb1a..756331d42661 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -16,14 +16,14 @@
16 16
17static u_int16_t nf_sctp_port_rover; 17static u_int16_t nf_sctp_port_rover;
18 18
19static bool 19static void
20sctp_unique_tuple(struct nf_conntrack_tuple *tuple, 20sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
21 const struct nf_nat_range *range, 21 const struct nf_nat_range *range,
22 enum nf_nat_manip_type maniptype, 22 enum nf_nat_manip_type maniptype,
23 const struct nf_conn *ct) 23 const struct nf_conn *ct)
24{ 24{
25 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 25 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
26 &nf_sctp_port_rover); 26 &nf_sctp_port_rover);
27} 27}
28 28
29static bool 29static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 399e2cfa263b..aa460a595d5d 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -20,14 +20,13 @@
20 20
21static u_int16_t tcp_port_rover; 21static u_int16_t tcp_port_rover;
22 22
23static bool 23static void
24tcp_unique_tuple(struct nf_conntrack_tuple *tuple, 24tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
25 const struct nf_nat_range *range, 25 const struct nf_nat_range *range,
26 enum nf_nat_manip_type maniptype, 26 enum nf_nat_manip_type maniptype,
27 const struct nf_conn *ct) 27 const struct nf_conn *ct)
28{ 28{
29 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 29 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
30 &tcp_port_rover);
31} 30}
32 31
33static bool 32static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 9e61c79492e4..dfe65c7e2925 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -19,14 +19,13 @@
19 19
20static u_int16_t udp_port_rover; 20static u_int16_t udp_port_rover;
21 21
22static bool 22static void
23udp_unique_tuple(struct nf_conntrack_tuple *tuple, 23udp_unique_tuple(struct nf_conntrack_tuple *tuple,
24 const struct nf_nat_range *range, 24 const struct nf_nat_range *range,
25 enum nf_nat_manip_type maniptype, 25 enum nf_nat_manip_type maniptype,
26 const struct nf_conn *ct) 26 const struct nf_conn *ct)
27{ 27{
28 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 28 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
29 &udp_port_rover);
30} 29}
31 30
32static bool 31static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index 440a229bbd87..3cc8c8af39ef 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -18,14 +18,14 @@
18 18
19static u_int16_t udplite_port_rover; 19static u_int16_t udplite_port_rover;
20 20
21static bool 21static void
22udplite_unique_tuple(struct nf_conntrack_tuple *tuple, 22udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
23 const struct nf_nat_range *range, 23 const struct nf_nat_range *range,
24 enum nf_nat_manip_type maniptype, 24 enum nf_nat_manip_type maniptype,
25 const struct nf_conn *ct) 25 const struct nf_conn *ct)
26{ 26{
27 return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, 27 nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
28 &udplite_port_rover); 28 &udplite_port_rover);
29} 29}
30 30
31static bool 31static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index 14381c62acea..a50f2bc1c732 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
26 return true; 26 return true;
27} 27}
28 28
29static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple, 29static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
30 const struct nf_nat_range *range, 30 const struct nf_nat_range *range,
31 enum nf_nat_manip_type maniptype, 31 enum nf_nat_manip_type maniptype,
32 const struct nf_conn *ct) 32 const struct nf_conn *ct)
33{ 33{
34 /* Sorry: we can't help you; if it's not unique, we can't frob 34 /* Sorry: we can't help you; if it's not unique, we can't frob
35 anything. */ 35 anything. */
36 return false; 36 return;
37} 37}
38 38
39static bool 39static bool
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 9e81e0dfb4ec..ebbd319f62f5 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9/* Everything about the rules for NAT. */ 9/* Everything about the rules for NAT. */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10#include <linux/types.h> 11#include <linux/types.h>
11#include <linux/ip.h> 12#include <linux/ip.h>
12#include <linux/netfilter.h> 13#include <linux/netfilter.h>
@@ -15,6 +16,7 @@
15#include <linux/kmod.h> 16#include <linux/kmod.h>
16#include <linux/skbuff.h> 17#include <linux/skbuff.h>
17#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/slab.h>
18#include <net/checksum.h> 20#include <net/checksum.h>
19#include <net/route.h> 21#include <net/route.h>
20#include <linux/bitops.h> 22#include <linux/bitops.h>
@@ -26,37 +28,8 @@
26 28
27#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ 29#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
28 (1 << NF_INET_POST_ROUTING) | \ 30 (1 << NF_INET_POST_ROUTING) | \
29 (1 << NF_INET_LOCAL_OUT)) 31 (1 << NF_INET_LOCAL_OUT) | \
30 32 (1 << NF_INET_LOCAL_IN))
31static const struct
32{
33 struct ipt_replace repl;
34 struct ipt_standard entries[3];
35 struct ipt_error term;
36} nat_initial_table __net_initdata = {
37 .repl = {
38 .name = "nat",
39 .valid_hooks = NAT_VALID_HOOKS,
40 .num_entries = 4,
41 .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
42 .hook_entry = {
43 [NF_INET_PRE_ROUTING] = 0,
44 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
45 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
46 },
47 .underflow = {
48 [NF_INET_PRE_ROUTING] = 0,
49 [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
50 [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
51 },
52 },
53 .entries = {
54 IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
55 IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */
56 IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
57 },
58 .term = IPT_ERROR_INIT, /* ERROR */
59};
60 33
61static const struct xt_table nat_table = { 34static const struct xt_table nat_table = {
62 .name = "nat", 35 .name = "nat",
@@ -67,13 +40,14 @@ static const struct xt_table nat_table = {
67 40
68/* Source NAT */ 41/* Source NAT */
69static unsigned int 42static unsigned int
70ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par) 43ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
71{ 44{
72 struct nf_conn *ct; 45 struct nf_conn *ct;
73 enum ip_conntrack_info ctinfo; 46 enum ip_conntrack_info ctinfo;
74 const struct nf_nat_multi_range_compat *mr = par->targinfo; 47 const struct nf_nat_multi_range_compat *mr = par->targinfo;
75 48
76 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 49 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
50 par->hooknum == NF_INET_LOCAL_IN);
77 51
78 ct = nf_ct_get(skb, &ctinfo); 52 ct = nf_ct_get(skb, &ctinfo);
79 53
@@ -86,7 +60,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par)
86} 60}
87 61
88static unsigned int 62static unsigned int
89ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) 63ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
90{ 64{
91 struct nf_conn *ct; 65 struct nf_conn *ct;
92 enum ip_conntrack_info ctinfo; 66 enum ip_conntrack_info ctinfo;
@@ -103,31 +77,31 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
103 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); 77 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
104} 78}
105 79
106static bool ipt_snat_checkentry(const struct xt_tgchk_param *par) 80static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
107{ 81{
108 const struct nf_nat_multi_range_compat *mr = par->targinfo; 82 const struct nf_nat_multi_range_compat *mr = par->targinfo;
109 83
110 /* Must be a valid range */ 84 /* Must be a valid range */
111 if (mr->rangesize != 1) { 85 if (mr->rangesize != 1) {
112 printk("SNAT: multiple ranges no longer supported\n"); 86 pr_info("SNAT: multiple ranges no longer supported\n");
113 return false; 87 return -EINVAL;
114 } 88 }
115 return true; 89 return 0;
116} 90}
117 91
118static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par) 92static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
119{ 93{
120 const struct nf_nat_multi_range_compat *mr = par->targinfo; 94 const struct nf_nat_multi_range_compat *mr = par->targinfo;
121 95
122 /* Must be a valid range */ 96 /* Must be a valid range */
123 if (mr->rangesize != 1) { 97 if (mr->rangesize != 1) {
124 printk("DNAT: multiple ranges no longer supported\n"); 98 pr_info("DNAT: multiple ranges no longer supported\n");
125 return false; 99 return -EINVAL;
126 } 100 }
127 return true; 101 return 0;
128} 102}
129 103
130unsigned int 104static unsigned int
131alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 105alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
132{ 106{
133 /* Force range to this IP; let proto decide mapping for 107 /* Force range to this IP; let proto decide mapping for
@@ -169,7 +143,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
169 .target = ipt_snat_target, 143 .target = ipt_snat_target,
170 .targetsize = sizeof(struct nf_nat_multi_range_compat), 144 .targetsize = sizeof(struct nf_nat_multi_range_compat),
171 .table = "nat", 145 .table = "nat",
172 .hooks = 1 << NF_INET_POST_ROUTING, 146 .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
173 .checkentry = ipt_snat_checkentry, 147 .checkentry = ipt_snat_checkentry,
174 .family = AF_INET, 148 .family = AF_INET,
175}; 149};
@@ -186,8 +160,13 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
186 160
187static int __net_init nf_nat_rule_net_init(struct net *net) 161static int __net_init nf_nat_rule_net_init(struct net *net)
188{ 162{
189 net->ipv4.nat_table = ipt_register_table(net, &nat_table, 163 struct ipt_replace *repl;
190 &nat_initial_table.repl); 164
165 repl = ipt_alloc_initial_table(&nat_table);
166 if (repl == NULL)
167 return -ENOMEM;
168 net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
169 kfree(repl);
191 if (IS_ERR(net->ipv4.nat_table)) 170 if (IS_ERR(net->ipv4.nat_table))
192 return PTR_ERR(net->ipv4.nat_table); 171 return PTR_ERR(net->ipv4.nat_table);
193 return 0; 172 return 0;
@@ -195,7 +174,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net)
195 174
196static void __net_exit nf_nat_rule_net_exit(struct net *net) 175static void __net_exit nf_nat_rule_net_exit(struct net *net)
197{ 176{
198 ipt_unregister_table(net->ipv4.nat_table); 177 ipt_unregister_table(net, net->ipv4.nat_table);
199} 178}
200 179
201static struct pernet_operations nf_nat_rule_net_ops = { 180static struct pernet_operations nf_nat_rule_net_ops = {
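
Two related changes in nf_nat_rule.c: NAT_VALID_HOOKS grows NF_INET_LOCAL_IN (so SNAT can now run on input as well as post-routing), and the hand-rolled three-entry nat_initial_table disappears in favor of ipt_alloc_initial_table(), which builds the bootstrap ruleset at runtime from the table definition instead of duplicating the static boilerplate per table. Condensed from the new per-net init (all names appear in the hunk above):

	struct ipt_replace *repl = ipt_alloc_initial_table(&nat_table);

	if (repl == NULL)
		return -ENOMEM;
	net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
	kfree(repl);	/* ipt_register_table() copies what it needs */
	if (IS_ERR(net->ipv4.nat_table))
		return PTR_ERR(net->ipv4.nat_table);
	return 0;
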
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 07d61a57613c..11b538deaaec 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -1,4 +1,4 @@
1/* SIP extension for UDP NAT alteration. 1/* SIP extension for NAT alteration.
2 * 2 *
3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> 3 * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
4 * based on RR's ip_nat_ftp.c and other modules. 4 * based on RR's ip_nat_ftp.c and other modules.
@@ -15,6 +15,7 @@
15#include <linux/ip.h> 15#include <linux/ip.h>
16#include <net/ip.h> 16#include <net/ip.h>
17#include <linux/udp.h> 17#include <linux/udp.h>
18#include <linux/tcp.h>
18 19
19#include <net/netfilter/nf_nat.h> 20#include <net/netfilter/nf_nat.h>
20#include <net/netfilter/nf_nat_helper.h> 21#include <net/netfilter/nf_nat_helper.h>
@@ -29,25 +30,42 @@ MODULE_DESCRIPTION("SIP NAT helper");
29MODULE_ALIAS("ip_nat_sip"); 30MODULE_ALIAS("ip_nat_sip");
30 31
31 32
32static unsigned int mangle_packet(struct sk_buff *skb, 33static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
33 const char **dptr, unsigned int *datalen, 34 const char **dptr, unsigned int *datalen,
34 unsigned int matchoff, unsigned int matchlen, 35 unsigned int matchoff, unsigned int matchlen,
35 const char *buffer, unsigned int buflen) 36 const char *buffer, unsigned int buflen)
36{ 37{
37 enum ip_conntrack_info ctinfo; 38 enum ip_conntrack_info ctinfo;
38 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 39 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
39 40 struct tcphdr *th;
40 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, matchoff, matchlen, 41 unsigned int baseoff;
41 buffer, buflen)) 42
42 return 0; 43 if (nf_ct_protonum(ct) == IPPROTO_TCP) {
44 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
45 baseoff = ip_hdrlen(skb) + th->doff * 4;
46 matchoff += dataoff - baseoff;
47
48 if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
49 matchoff, matchlen,
50 buffer, buflen, false))
51 return 0;
52 } else {
53 baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
54 matchoff += dataoff - baseoff;
55
56 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
57 matchoff, matchlen,
58 buffer, buflen))
59 return 0;
60 }
43 61
44 /* Reload data pointer and adjust datalen value */ 62 /* Reload data pointer and adjust datalen value */
45 *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); 63 *dptr = skb->data + dataoff;
46 *datalen += buflen - matchlen; 64 *datalen += buflen - matchlen;
47 return 1; 65 return 1;
48} 66}
49 67
50static int map_addr(struct sk_buff *skb, 68static int map_addr(struct sk_buff *skb, unsigned int dataoff,
51 const char **dptr, unsigned int *datalen, 69 const char **dptr, unsigned int *datalen,
52 unsigned int matchoff, unsigned int matchlen, 70 unsigned int matchoff, unsigned int matchlen,
53 union nf_inet_addr *addr, __be16 port) 71 union nf_inet_addr *addr, __be16 port)
@@ -76,11 +94,11 @@ static int map_addr(struct sk_buff *skb,
76 94
77 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport)); 95 buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
78 96
79 return mangle_packet(skb, dptr, datalen, matchoff, matchlen, 97 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
80 buffer, buflen); 98 buffer, buflen);
81} 99}
82 100
83static int map_sip_addr(struct sk_buff *skb, 101static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
84 const char **dptr, unsigned int *datalen, 102 const char **dptr, unsigned int *datalen,
85 enum sip_header_types type) 103 enum sip_header_types type)
86{ 104{
@@ -93,16 +111,18 @@ static int map_sip_addr(struct sk_buff *skb,
93 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, 111 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
94 &matchoff, &matchlen, &addr, &port) <= 0) 112 &matchoff, &matchlen, &addr, &port) <= 0)
95 return 1; 113 return 1;
96 return map_addr(skb, dptr, datalen, matchoff, matchlen, &addr, port); 114 return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
115 &addr, port);
97} 116}
98 117
99static unsigned int ip_nat_sip(struct sk_buff *skb, 118static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
100 const char **dptr, unsigned int *datalen) 119 const char **dptr, unsigned int *datalen)
101{ 120{
102 enum ip_conntrack_info ctinfo; 121 enum ip_conntrack_info ctinfo;
103 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 122 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
104 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 123 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
105 unsigned int dataoff, matchoff, matchlen; 124 unsigned int coff, matchoff, matchlen;
125 enum sip_header_types hdr;
106 union nf_inet_addr addr; 126 union nf_inet_addr addr;
107 __be16 port; 127 __be16 port;
108 int request, in_header; 128 int request, in_header;
@@ -112,16 +132,21 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
112 if (ct_sip_parse_request(ct, *dptr, *datalen, 132 if (ct_sip_parse_request(ct, *dptr, *datalen,
113 &matchoff, &matchlen, 133 &matchoff, &matchlen,
114 &addr, &port) > 0 && 134 &addr, &port) > 0 &&
115 !map_addr(skb, dptr, datalen, matchoff, matchlen, 135 !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
116 &addr, port)) 136 &addr, port))
117 return NF_DROP; 137 return NF_DROP;
118 request = 1; 138 request = 1;
119 } else 139 } else
120 request = 0; 140 request = 0;
121 141
142 if (nf_ct_protonum(ct) == IPPROTO_TCP)
143 hdr = SIP_HDR_VIA_TCP;
144 else
145 hdr = SIP_HDR_VIA_UDP;
146
122 /* Translate topmost Via header and parameters */ 147 /* Translate topmost Via header and parameters */
123 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, 148 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
124 SIP_HDR_VIA, NULL, &matchoff, &matchlen, 149 hdr, NULL, &matchoff, &matchlen,
125 &addr, &port) > 0) { 150 &addr, &port) > 0) {
126 unsigned int matchend, poff, plen, buflen, n; 151 unsigned int matchend, poff, plen, buflen, n;
127 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; 152 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
@@ -138,7 +163,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
138 goto next; 163 goto next;
139 } 164 }
140 165
141 if (!map_addr(skb, dptr, datalen, matchoff, matchlen, 166 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
142 &addr, port)) 167 &addr, port))
143 return NF_DROP; 168 return NF_DROP;
144 169
@@ -153,8 +178,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
153 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { 178 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
154 buflen = sprintf(buffer, "%pI4", 179 buflen = sprintf(buffer, "%pI4",
155 &ct->tuplehash[!dir].tuple.dst.u3.ip); 180 &ct->tuplehash[!dir].tuple.dst.u3.ip);
156 if (!mangle_packet(skb, dptr, datalen, poff, plen, 181 if (!mangle_packet(skb, dataoff, dptr, datalen,
157 buffer, buflen)) 182 poff, plen, buffer, buflen))
158 return NF_DROP; 183 return NF_DROP;
159 } 184 }
160 185
@@ -167,8 +192,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
167 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { 192 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
168 buflen = sprintf(buffer, "%pI4", 193 buflen = sprintf(buffer, "%pI4",
169 &ct->tuplehash[!dir].tuple.src.u3.ip); 194 &ct->tuplehash[!dir].tuple.src.u3.ip);
170 if (!mangle_packet(skb, dptr, datalen, poff, plen, 195 if (!mangle_packet(skb, dataoff, dptr, datalen,
171 buffer, buflen)) 196 poff, plen, buffer, buflen))
172 return NF_DROP; 197 return NF_DROP;
173 } 198 }
174 199
@@ -181,31 +206,45 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
181 htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { 206 htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
182 __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; 207 __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
183 buflen = sprintf(buffer, "%u", ntohs(p)); 208 buflen = sprintf(buffer, "%u", ntohs(p));
184 if (!mangle_packet(skb, dptr, datalen, poff, plen, 209 if (!mangle_packet(skb, dataoff, dptr, datalen,
185 buffer, buflen)) 210 poff, plen, buffer, buflen))
186 return NF_DROP; 211 return NF_DROP;
187 } 212 }
188 } 213 }
189 214
190next: 215next:
191 /* Translate Contact headers */ 216 /* Translate Contact headers */
192 dataoff = 0; 217 coff = 0;
193 in_header = 0; 218 in_header = 0;
194 while (ct_sip_parse_header_uri(ct, *dptr, &dataoff, *datalen, 219 while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
195 SIP_HDR_CONTACT, &in_header, 220 SIP_HDR_CONTACT, &in_header,
196 &matchoff, &matchlen, 221 &matchoff, &matchlen,
197 &addr, &port) > 0) { 222 &addr, &port) > 0) {
198 if (!map_addr(skb, dptr, datalen, matchoff, matchlen, 223 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
199 &addr, port)) 224 &addr, port))
200 return NF_DROP; 225 return NF_DROP;
201 } 226 }
202 227
203 if (!map_sip_addr(skb, dptr, datalen, SIP_HDR_FROM) || 228 if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
204 !map_sip_addr(skb, dptr, datalen, SIP_HDR_TO)) 229 !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
205 return NF_DROP; 230 return NF_DROP;
231
206 return NF_ACCEPT; 232 return NF_ACCEPT;
207} 233}
208 234
235static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
236{
237 enum ip_conntrack_info ctinfo;
238 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
239 const struct tcphdr *th;
240
241 if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
242 return;
243
244 th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
245 nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
246}
247
209/* Handles expected signalling connections and media streams */ 248/* Handles expected signalling connections and media streams */
210static void ip_nat_sip_expected(struct nf_conn *ct, 249static void ip_nat_sip_expected(struct nf_conn *ct,
211 struct nf_conntrack_expect *exp) 250 struct nf_conntrack_expect *exp)
@@ -232,7 +271,7 @@ static void ip_nat_sip_expected(struct nf_conn *ct,
232 } 271 }
233} 272}
234 273
235static unsigned int ip_nat_sip_expect(struct sk_buff *skb, 274static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
236 const char **dptr, unsigned int *datalen, 275 const char **dptr, unsigned int *datalen,
237 struct nf_conntrack_expect *exp, 276 struct nf_conntrack_expect *exp,
238 unsigned int matchoff, 277 unsigned int matchoff,
@@ -279,8 +318,8 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
279 if (exp->tuple.dst.u3.ip != exp->saved_ip || 318 if (exp->tuple.dst.u3.ip != exp->saved_ip ||
280 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { 319 exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
281 buflen = sprintf(buffer, "%pI4:%u", &newip, port); 320 buflen = sprintf(buffer, "%pI4:%u", &newip, port);
282 if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, 321 if (!mangle_packet(skb, dataoff, dptr, datalen,
283 buffer, buflen)) 322 matchoff, matchlen, buffer, buflen))
284 goto err; 323 goto err;
285 } 324 }
286 return NF_ACCEPT; 325 return NF_ACCEPT;
@@ -290,7 +329,7 @@ err:
290 return NF_DROP; 329 return NF_DROP;
291} 330}
292 331
293static int mangle_content_len(struct sk_buff *skb, 332static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
294 const char **dptr, unsigned int *datalen) 333 const char **dptr, unsigned int *datalen)
295{ 334{
296 enum ip_conntrack_info ctinfo; 335 enum ip_conntrack_info ctinfo;
@@ -312,12 +351,13 @@ static int mangle_content_len(struct sk_buff *skb,
312 return 0; 351 return 0;
313 352
314 buflen = sprintf(buffer, "%u", c_len); 353 buflen = sprintf(buffer, "%u", c_len);
315 return mangle_packet(skb, dptr, datalen, matchoff, matchlen, 354 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
316 buffer, buflen); 355 buffer, buflen);
317} 356}
318 357
319static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, 358static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
320 unsigned int dataoff, unsigned int *datalen, 359 const char **dptr, unsigned int *datalen,
360 unsigned int sdpoff,
321 enum sdp_header_types type, 361 enum sdp_header_types type,
322 enum sdp_header_types term, 362 enum sdp_header_types term,
323 char *buffer, int buflen) 363 char *buffer, int buflen)
@@ -326,16 +366,16 @@ static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr,
326 struct nf_conn *ct = nf_ct_get(skb, &ctinfo); 366 struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
327 unsigned int matchlen, matchoff; 367 unsigned int matchlen, matchoff;
328 368
329 if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term, 369 if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
330 &matchoff, &matchlen) <= 0) 370 &matchoff, &matchlen) <= 0)
331 return -ENOENT; 371 return -ENOENT;
332 return mangle_packet(skb, dptr, datalen, matchoff, matchlen, 372 return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
333 buffer, buflen) ? 0 : -EINVAL; 373 buffer, buflen) ? 0 : -EINVAL;
334} 374}
335 375
336static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, 376static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
337 unsigned int dataoff, 377 const char **dptr, unsigned int *datalen,
338 unsigned int *datalen, 378 unsigned int sdpoff,
339 enum sdp_header_types type, 379 enum sdp_header_types type,
340 enum sdp_header_types term, 380 enum sdp_header_types term,
341 const union nf_inet_addr *addr) 381 const union nf_inet_addr *addr)
@@ -344,16 +384,15 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr,
344 unsigned int buflen; 384 unsigned int buflen;
345 385
346 buflen = sprintf(buffer, "%pI4", &addr->ip); 386 buflen = sprintf(buffer, "%pI4", &addr->ip);
347 if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, 387 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
348 buffer, buflen)) 388 buffer, buflen))
349 return 0; 389 return 0;
350 390
351 return mangle_content_len(skb, dptr, datalen); 391 return mangle_content_len(skb, dataoff, dptr, datalen);
352} 392}
353 393
354static unsigned int ip_nat_sdp_port(struct sk_buff *skb, 394static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
355 const char **dptr, 395 const char **dptr, unsigned int *datalen,
356 unsigned int *datalen,
357 unsigned int matchoff, 396 unsigned int matchoff,
358 unsigned int matchlen, 397 unsigned int matchlen,
359 u_int16_t port) 398 u_int16_t port)
@@ -362,16 +401,16 @@ static unsigned int ip_nat_sdp_port(struct sk_buff *skb,
362 unsigned int buflen; 401 unsigned int buflen;
363 402
364 buflen = sprintf(buffer, "%u", port); 403 buflen = sprintf(buffer, "%u", port);
365 if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, 404 if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
366 buffer, buflen)) 405 buffer, buflen))
367 return 0; 406 return 0;
368 407
369 return mangle_content_len(skb, dptr, datalen); 408 return mangle_content_len(skb, dataoff, dptr, datalen);
370} 409}
371 410
372static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, 411static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
373 unsigned int dataoff, 412 const char **dptr, unsigned int *datalen,
374 unsigned int *datalen, 413 unsigned int sdpoff,
375 const union nf_inet_addr *addr) 414 const union nf_inet_addr *addr)
376{ 415{
377 char buffer[sizeof("nnn.nnn.nnn.nnn")]; 416 char buffer[sizeof("nnn.nnn.nnn.nnn")];
@@ -379,12 +418,12 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
379 418
380 /* Mangle session description owner and contact addresses */ 419 /* Mangle session description owner and contact addresses */
381 buflen = sprintf(buffer, "%pI4", &addr->ip); 420 buflen = sprintf(buffer, "%pI4", &addr->ip);
382 if (mangle_sdp_packet(skb, dptr, dataoff, datalen, 421 if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
383 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, 422 SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
384 buffer, buflen)) 423 buffer, buflen))
385 return 0; 424 return 0;
386 425
387 switch (mangle_sdp_packet(skb, dptr, dataoff, datalen, 426 switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
388 SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, 427 SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
389 buffer, buflen)) { 428 buffer, buflen)) {
390 case 0: 429 case 0:
@@ -401,14 +440,13 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
401 return 0; 440 return 0;
402 } 441 }
403 442
404 return mangle_content_len(skb, dptr, datalen); 443 return mangle_content_len(skb, dataoff, dptr, datalen);
405} 444}
406 445
407/* So, this packet has hit the connection tracking matching code. 446/* So, this packet has hit the connection tracking matching code.
408 Mangle it, and change the expectation to match the new version. */ 447 Mangle it, and change the expectation to match the new version. */
409static unsigned int ip_nat_sdp_media(struct sk_buff *skb, 448static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
410 const char **dptr, 449 const char **dptr, unsigned int *datalen,
411 unsigned int *datalen,
412 struct nf_conntrack_expect *rtp_exp, 450 struct nf_conntrack_expect *rtp_exp,
413 struct nf_conntrack_expect *rtcp_exp, 451 struct nf_conntrack_expect *rtcp_exp,
414 unsigned int mediaoff, 452 unsigned int mediaoff,
@@ -456,7 +494,8 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb,
456 494
457 /* Update media port. */ 495 /* Update media port. */
458 if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && 496 if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
459 !ip_nat_sdp_port(skb, dptr, datalen, mediaoff, medialen, port)) 497 !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
498 mediaoff, medialen, port))
460 goto err2; 499 goto err2;
461 500
462 return NF_ACCEPT; 501 return NF_ACCEPT;
@@ -471,6 +510,7 @@ err1:
471static void __exit nf_nat_sip_fini(void) 510static void __exit nf_nat_sip_fini(void)
472{ 511{
473 rcu_assign_pointer(nf_nat_sip_hook, NULL); 512 rcu_assign_pointer(nf_nat_sip_hook, NULL);
513 rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL);
474 rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); 514 rcu_assign_pointer(nf_nat_sip_expect_hook, NULL);
475 rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); 515 rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL);
476 rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); 516 rcu_assign_pointer(nf_nat_sdp_port_hook, NULL);
@@ -482,12 +522,14 @@ static void __exit nf_nat_sip_fini(void)
482static int __init nf_nat_sip_init(void) 522static int __init nf_nat_sip_init(void)
483{ 523{
484 BUG_ON(nf_nat_sip_hook != NULL); 524 BUG_ON(nf_nat_sip_hook != NULL);
525 BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
485 BUG_ON(nf_nat_sip_expect_hook != NULL); 526 BUG_ON(nf_nat_sip_expect_hook != NULL);
486 BUG_ON(nf_nat_sdp_addr_hook != NULL); 527 BUG_ON(nf_nat_sdp_addr_hook != NULL);
487 BUG_ON(nf_nat_sdp_port_hook != NULL); 528 BUG_ON(nf_nat_sdp_port_hook != NULL);
488 BUG_ON(nf_nat_sdp_session_hook != NULL); 529 BUG_ON(nf_nat_sdp_session_hook != NULL);
489 BUG_ON(nf_nat_sdp_media_hook != NULL); 530 BUG_ON(nf_nat_sdp_media_hook != NULL);
490 rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); 531 rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip);
532 rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
491 rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); 533 rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect);
492 rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); 534 rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
493 rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); 535 rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port);
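The common thread in the nf_nat_sip.c hunks above: every mangling helper now receives the payload offset (dataoff) explicitly, the SDP helpers additionally take sdpoff, and a sequence-adjust hook is registered and unregistered alongside the existing ones. This is the plumbing needed once SIP can arrive over TCP, where the payload no longer starts at a fixed offset behind the IP header and in-place rewrites must be followed by TCP sequence adjustment. A minimal sketch of how the offsets compose, with a hypothetical mangle_at() standing in for the real helper:

    #include <string.h>

    /* matchoff is relative to the payload; dataoff locates the payload
     * inside the packet, so the rewritten bytes live at dataoff + matchoff.
     * The real helper also grows or shrinks the packet when buflen differs
     * from matchlen; this sketch only handles equal lengths. */
    static int mangle_at(char *pkt, size_t pktlen,
                         size_t dataoff, size_t matchoff, size_t matchlen,
                         const char *buf, size_t buflen)
    {
            if (buflen != matchlen || dataoff + matchoff + matchlen > pktlen)
                    return 0;
            memcpy(pkt + dataoff + matchoff, buf, buflen);
            return 1;
    }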
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index d9521f6f9ed0..ee5f419d0a56 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -43,6 +43,7 @@
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/types.h> 44#include <linux/types.h>
45#include <linux/kernel.h> 45#include <linux/kernel.h>
46#include <linux/slab.h>
46#include <linux/in.h> 47#include <linux/in.h>
47#include <linux/ip.h> 48#include <linux/ip.h>
48#include <linux/udp.h> 49#include <linux/udp.h>
@@ -400,7 +401,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
400 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); 401 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
401 if (*octets == NULL) { 402 if (*octets == NULL) {
402 if (net_ratelimit()) 403 if (net_ratelimit())
403 printk("OOM in bsalg (%d)\n", __LINE__); 404 pr_notice("OOM in bsalg (%d)\n", __LINE__);
404 return 0; 405 return 0;
405 } 406 }
406 407
@@ -451,7 +452,7 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
451 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); 452 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
452 if (*oid == NULL) { 453 if (*oid == NULL) {
453 if (net_ratelimit()) 454 if (net_ratelimit())
454 printk("OOM in bsalg (%d)\n", __LINE__); 455 pr_notice("OOM in bsalg (%d)\n", __LINE__);
455 return 0; 456 return 0;
456 } 457 }
457 458
@@ -728,7 +729,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
728 if (*obj == NULL) { 729 if (*obj == NULL) {
729 kfree(id); 730 kfree(id);
730 if (net_ratelimit()) 731 if (net_ratelimit())
731 printk("OOM in bsalg (%d)\n", __LINE__); 732 pr_notice("OOM in bsalg (%d)\n", __LINE__);
732 return 0; 733 return 0;
733 } 734 }
734 (*obj)->syntax.l[0] = l; 735 (*obj)->syntax.l[0] = l;
@@ -745,7 +746,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
745 kfree(p); 746 kfree(p);
746 kfree(id); 747 kfree(id);
747 if (net_ratelimit()) 748 if (net_ratelimit())
748 printk("OOM in bsalg (%d)\n", __LINE__); 749 pr_notice("OOM in bsalg (%d)\n", __LINE__);
749 return 0; 750 return 0;
750 } 751 }
751 memcpy((*obj)->syntax.c, p, len); 752 memcpy((*obj)->syntax.c, p, len);
@@ -760,7 +761,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
760 if (*obj == NULL) { 761 if (*obj == NULL) {
761 kfree(id); 762 kfree(id);
762 if (net_ratelimit()) 763 if (net_ratelimit())
763 printk("OOM in bsalg (%d)\n", __LINE__); 764 pr_notice("OOM in bsalg (%d)\n", __LINE__);
764 return 0; 765 return 0;
765 } 766 }
766 if (!asn1_null_decode(ctx, end)) { 767 if (!asn1_null_decode(ctx, end)) {
@@ -781,7 +782,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
781 kfree(lp); 782 kfree(lp);
782 kfree(id); 783 kfree(id);
783 if (net_ratelimit()) 784 if (net_ratelimit())
784 printk("OOM in bsalg (%d)\n", __LINE__); 785 pr_notice("OOM in bsalg (%d)\n", __LINE__);
785 return 0; 786 return 0;
786 } 787 }
787 memcpy((*obj)->syntax.ul, lp, len); 788 memcpy((*obj)->syntax.ul, lp, len);
@@ -802,7 +803,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
802 kfree(p); 803 kfree(p);
803 kfree(id); 804 kfree(id);
804 if (net_ratelimit()) 805 if (net_ratelimit())
805 printk("OOM in bsalg (%d)\n", __LINE__); 806 pr_notice("OOM in bsalg (%d)\n", __LINE__);
806 return 0; 807 return 0;
807 } 808 }
808 memcpy((*obj)->syntax.uc, p, len); 809 memcpy((*obj)->syntax.uc, p, len);
@@ -820,7 +821,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
820 if (*obj == NULL) { 821 if (*obj == NULL) {
821 kfree(id); 822 kfree(id);
822 if (net_ratelimit()) 823 if (net_ratelimit())
823 printk("OOM in bsalg (%d)\n", __LINE__); 824 pr_notice("OOM in bsalg (%d)\n", __LINE__);
824 return 0; 825 return 0;
825 } 826 }
826 (*obj)->syntax.ul[0] = ul; 827 (*obj)->syntax.ul[0] = ul;
@@ -892,13 +893,15 @@ static void fast_csum(__sum16 *csum,
892 unsigned char s[4]; 893 unsigned char s[4];
893 894
894 if (offset & 1) { 895 if (offset & 1) {
895 s[0] = s[2] = 0; 896 s[0] = ~0;
896 s[1] = ~*optr; 897 s[1] = ~*optr;
898 s[2] = 0;
897 s[3] = *nptr; 899 s[3] = *nptr;
898 } else { 900 } else {
899 s[1] = s[3] = 0;
900 s[0] = ~*optr; 901 s[0] = ~*optr;
902 s[1] = ~0;
901 s[2] = *nptr; 903 s[2] = *nptr;
904 s[3] = 0;
902 } 905 }
903 906
904 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum))); 907 *csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
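The fast_csum() hunk fixes the single-byte incremental checksum update (RFC 1624: HC' = ~(~HC + ~m + m'), with m the 16-bit word holding the byte). In one's-complement arithmetic the complement of a word whose other half is untouched carries 0xff, not 0x00, in that half, which is exactly what the new s[0] = ~0 and s[1] = ~0 assignments supply. A self-contained demonstration of the corrected arithmetic for an even offset:

    #include <stdint.h>
    #include <stdio.h>

    /* 16-bit one's-complement sum over a buffer of even length. */
    static uint32_t ocsum(const uint8_t *p, size_t len, uint32_t sum)
    {
            for (; len >= 2; p += 2, len -= 2)
                    sum += (uint32_t)p[0] << 8 | p[1];
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return sum;
    }

    int main(void)
    {
            uint8_t pkt[8] = { 0x45, 0x00, 0x00, 0x54, 0xbe, 0xef, 0x40, 0x11 };
            uint16_t hc = ~ocsum(pkt, sizeof(pkt), 0) & 0xffff;

            /* Replace the byte at even offset 4: feed ~old and the new byte
             * as two 16-bit words, with 0xff (not 0x00) as ~old's partner,
             * which is what the fixed fast_csum() builds in s[]. */
            uint8_t oldb = pkt[4], newb = 0xca;
            uint8_t s[4] = { (uint8_t)~oldb, 0xff, newb, 0x00 };

            pkt[4] = newb;
            uint16_t inc = ~ocsum(s, sizeof(s), (uint16_t)~hc) & 0xffff;
            uint16_t ref = ~ocsum(pkt, sizeof(pkt), 0) & 0xffff;
            printf("incremental %04x, recomputed %04x\n", inc, ref); /* equal */
            return inc != ref;
    }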
@@ -1038,7 +1041,7 @@ static int snmp_parse_mangle(unsigned char *msg,
1038 unsigned int cls, con, tag, vers, pdutype; 1041 unsigned int cls, con, tag, vers, pdutype;
1039 struct asn1_ctx ctx; 1042 struct asn1_ctx ctx;
1040 struct asn1_octstr comm; 1043 struct asn1_octstr comm;
1041 struct snmp_object **obj; 1044 struct snmp_object *obj;
1042 1045
1043 if (debug > 1) 1046 if (debug > 1)
1044 hex_dump(msg, len); 1047 hex_dump(msg, len);
@@ -1148,43 +1151,34 @@ static int snmp_parse_mangle(unsigned char *msg,
1148 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) 1151 if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
1149 return 0; 1152 return 0;
1150 1153
1151 obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
1152 if (obj == NULL) {
1153 if (net_ratelimit())
1154 printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
1155 return 0;
1156 }
1157
1158 while (!asn1_eoc_decode(&ctx, eoc)) { 1154 while (!asn1_eoc_decode(&ctx, eoc)) {
1159 unsigned int i; 1155 unsigned int i;
1160 1156
1161 if (!snmp_object_decode(&ctx, obj)) { 1157 if (!snmp_object_decode(&ctx, &obj)) {
1162 if (*obj) { 1158 if (obj) {
1163 kfree((*obj)->id); 1159 kfree(obj->id);
1164 kfree(*obj); 1160 kfree(obj);
1165 } 1161 }
1166 kfree(obj);
1167 return 0; 1162 return 0;
1168 } 1163 }
1169 1164
1170 if (debug > 1) { 1165 if (debug > 1) {
1171 printk(KERN_DEBUG "bsalg: object: "); 1166 printk(KERN_DEBUG "bsalg: object: ");
1172 for (i = 0; i < (*obj)->id_len; i++) { 1167 for (i = 0; i < obj->id_len; i++) {
1173 if (i > 0) 1168 if (i > 0)
1174 printk("."); 1169 printk(".");
1175 printk("%lu", (*obj)->id[i]); 1170 printk("%lu", obj->id[i]);
1176 } 1171 }
1177 printk(": type=%u\n", (*obj)->type); 1172 printk(": type=%u\n", obj->type);
1178 1173
1179 } 1174 }
1180 1175
1181 if ((*obj)->type == SNMP_IPADDR) 1176 if (obj->type == SNMP_IPADDR)
1182 mangle_address(ctx.begin, ctx.pointer - 4 , map, check); 1177 mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
1183 1178
1184 kfree((*obj)->id); 1179 kfree(obj->id);
1185 kfree(*obj); 1180 kfree(obj);
1186 } 1181 }
1187 kfree(obj);
1188 1182
1189 if (!asn1_eoc_decode(&ctx, eoc)) 1183 if (!asn1_eoc_decode(&ctx, eoc))
1190 return 0; 1184 return 0;
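The snmp_parse_mangle() hunk replaces a kmalloc'ed pointer slot (struct snmp_object **obj) with a plain stack pointer passed by address to snmp_object_decode(). That drops one GFP_ATOMIC allocation per parsed PDU and removes the error paths that had to free the pointer holder itself. A stripped-down, standalone illustration of the resulting ownership pattern (names are illustrative, not the kernel API):

    #include <stdlib.h>

    struct obj { unsigned long *id; };

    /* decode() fills *out on success, sketching snmp_object_decode(). */
    static int decode(struct obj **out, int fail)
    {
            *out = fail ? NULL : calloc(1, sizeof(struct obj));
            return *out != NULL;
    }

    static int walk(int nobjs, int fail_at)
    {
            struct obj *obj;   /* stack slot; no heap-allocated struct obj ** */
            int i;

            for (i = 0; i < nobjs; i++) {
                    if (!decode(&obj, i == fail_at)) {
                            if (obj) {          /* partially decoded object */
                                    free(obj->id);
                                    free(obj);
                            }
                            return 0;           /* nothing else to free */
                    }
                    free(obj->id);
                    free(obj);
            }
            return 1;
    }

    int main(void) { return !(walk(3, -1) && !walk(3, 1)); }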
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 5678e9562c15..95481fee8bdb 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -7,6 +7,7 @@
7 */ 7 */
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/icmp.h> 9#include <linux/icmp.h>
10#include <linux/gfp.h>
10#include <linux/ip.h> 11#include <linux/ip.h>
11#include <linux/netfilter.h> 12#include <linux/netfilter.h>
12#include <linux/netfilter_ipv4.h> 13#include <linux/netfilter_ipv4.h>
@@ -97,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
97 return NF_ACCEPT; 98 return NF_ACCEPT;
98 99
99 /* Don't try to NAT if this packet is not conntracked */ 100 /* Don't try to NAT if this packet is not conntracked */
100 if (ct == &nf_conntrack_untracked) 101 if (nf_ct_is_untracked(ct))
101 return NF_ACCEPT; 102 return NF_ACCEPT;
102 103
103 nat = nfct_nat(ct); 104 nat = nfct_nat(ct);
@@ -130,16 +131,9 @@ nf_nat_fn(unsigned int hooknum,
130 if (!nf_nat_initialized(ct, maniptype)) { 131 if (!nf_nat_initialized(ct, maniptype)) {
131 unsigned int ret; 132 unsigned int ret;
132 133
133 if (hooknum == NF_INET_LOCAL_IN) 134 ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
134 /* LOCAL_IN hook doesn't have a chain! */ 135 if (ret != NF_ACCEPT)
135 ret = alloc_null_binding(ct, hooknum);
136 else
137 ret = nf_nat_rule_find(skb, hooknum, in, out,
138 ct);
139
140 if (ret != NF_ACCEPT) {
141 return ret; 136 return ret;
142 }
143 } else 137 } else
144 pr_debug("Already setup manip %s for ct %p\n", 138 pr_debug("Already setup manip %s for ct %p\n",
145 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", 139 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
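Two simplifications land in nf_nat_fn(): the untracked check no longer compares against a single global nf_conntrack_untracked instance but asks the conntrack entry itself, and the LOCAL_IN special case disappears, presumably because the NAT rule table now covers that hook so nf_nat_rule_find() can be called unconditionally. A sketch of what the new predicate amounts to, assuming the IPS_UNTRACKED status bit that this interface change implies:

    /* Sketch only; the real definition lives in the conntrack headers. */
    static inline bool nf_ct_is_untracked(const struct nf_conn *ct)
    {
            return test_bit(IPS_UNTRACKED_BIT, &ct->status);
    }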
@@ -293,12 +287,12 @@ static int __init nf_nat_standalone_init(void)
293#endif 287#endif
294 ret = nf_nat_rule_init(); 288 ret = nf_nat_rule_init();
295 if (ret < 0) { 289 if (ret < 0) {
296 printk("nf_nat_init: can't setup rules.\n"); 290 pr_err("nf_nat_init: can't setup rules.\n");
297 goto cleanup_decode_session; 291 goto cleanup_decode_session;
298 } 292 }
299 ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); 293 ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
300 if (ret < 0) { 294 if (ret < 0) {
301 printk("nf_nat_init: can't register hooks.\n"); 295 pr_err("nf_nat_init: can't register hooks.\n");
302 goto cleanup_rule_init; 296 goto cleanup_rule_init;
303 } 297 }
304 return ret; 298 return ret;
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index b096e81500ae..7274a43c7a12 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/moduleparam.h>
10#include <linux/udp.h> 9#include <linux/udp.h>
11 10
12#include <net/netfilter/nf_nat_helper.h> 11#include <net/netfilter/nf_nat_helper.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f25542c48b7d..4ae1f203f7cb 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -127,8 +127,8 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
127 SNMP_MIB_SENTINEL 127 SNMP_MIB_SENTINEL
128}; 128};
129 129
130static struct { 130static const struct {
131 char *name; 131 const char *name;
132 int index; 132 int index;
133} icmpmibmap[] = { 133} icmpmibmap[] = {
134 { "DestUnreachs", ICMP_DEST_UNREACH }, 134 { "DestUnreachs", ICMP_DEST_UNREACH },
@@ -249,6 +249,10 @@ static const struct snmp_mib snmp4_net_list[] = {
249 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), 249 SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
250 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), 250 SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
251 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), 251 SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
252 SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
252 SNMP_MIB_SENTINEL 256 SNMP_MIB_SENTINEL
253}; 257};
254 258
@@ -280,7 +284,7 @@ static void icmpmsg_put(struct seq_file *seq)
280 284
281 count = 0; 285 count = 0;
282 for (i = 0; i < ICMPMSG_MIB_MAX; i++) { 286 for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
283 val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i); 287 val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
284 if (val) { 288 if (val) {
285 type[count] = i; 289 type[count] = i;
286 vals[count++] = val; 290 vals[count++] = val;
@@ -307,18 +311,18 @@ static void icmp_put(struct seq_file *seq)
307 for (i=0; icmpmibmap[i].name != NULL; i++) 311 for (i=0; icmpmibmap[i].name != NULL; i++)
308 seq_printf(seq, " Out%s", icmpmibmap[i].name); 312 seq_printf(seq, " Out%s", icmpmibmap[i].name);
309 seq_printf(seq, "\nIcmp: %lu %lu", 313 seq_printf(seq, "\nIcmp: %lu %lu",
310 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), 314 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
311 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); 315 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
312 for (i=0; icmpmibmap[i].name != NULL; i++) 316 for (i=0; icmpmibmap[i].name != NULL; i++)
313 seq_printf(seq, " %lu", 317 seq_printf(seq, " %lu",
314 snmp_fold_field((void **) net->mib.icmpmsg_statistics, 318 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
315 icmpmibmap[i].index)); 319 icmpmibmap[i].index));
316 seq_printf(seq, " %lu %lu", 320 seq_printf(seq, " %lu %lu",
317 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), 321 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
318 snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); 322 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
319 for (i=0; icmpmibmap[i].name != NULL; i++) 323 for (i=0; icmpmibmap[i].name != NULL; i++)
320 seq_printf(seq, " %lu", 324 seq_printf(seq, " %lu",
321 snmp_fold_field((void **) net->mib.icmpmsg_statistics, 325 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
322 icmpmibmap[i].index | 0x100)); 326 icmpmibmap[i].index | 0x100));
323} 327}
324 328
@@ -339,10 +343,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
339 IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, 343 IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
340 sysctl_ip_default_ttl); 344 sysctl_ip_default_ttl);
341 345
346 BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
342 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) 347 for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
343 seq_printf(seq, " %lu", 348 seq_printf(seq, " %llu",
344 snmp_fold_field((void **)net->mib.ip_statistics, 349 snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
345 snmp4_ipstats_list[i].entry)); 350 snmp4_ipstats_list[i].entry,
351 offsetof(struct ipstats_mib, syncp)));
346 352
347 icmp_put(seq); /* RFC 2011 compatibility */ 353 icmp_put(seq); /* RFC 2011 compatibility */
348 icmpmsg_put(seq); 354 icmpmsg_put(seq);
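The Ip: counters switch from snmp_fold_field()/%lu to snmp_fold_field64()/%llu: ipstats_mib entries become u64, and a 32-bit reader cannot load a u64 atomically, so each per-cpu block carries a u64_stats_sync seqcount, hence the extra offsetof(struct ipstats_mib, syncp) argument (and the BUILD_BUG_ON pinning the counters at offset 0). A sketch of the retry loop this implies, assuming the u64_stats_fetch_begin()/u64_stats_fetch_retry() pair from <linux/u64_stats_sync.h>:

    u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
    {
            u64 res = 0;
            int cpu;

            for_each_possible_cpu(cpu) {
                    void *bhptr = per_cpu_ptr(mib[0], cpu);
                    struct u64_stats_sync *syncp;
                    unsigned int start;
                    u64 v;

                    syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
                    do {    /* retry if a writer updated the block meanwhile */
                            start = u64_stats_fetch_begin(syncp);
                            v = *(((u64 *)bhptr) + offt);
                    } while (u64_stats_fetch_retry(syncp, start));
                    res += v;
            }
            return res;
    }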
@@ -356,11 +362,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
356 /* MaxConn field is signed, RFC 2012 */ 362 /* MaxConn field is signed, RFC 2012 */
357 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) 363 if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
358 seq_printf(seq, " %ld", 364 seq_printf(seq, " %ld",
359 snmp_fold_field((void **)net->mib.tcp_statistics, 365 snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
360 snmp4_tcp_list[i].entry)); 366 snmp4_tcp_list[i].entry));
361 else 367 else
362 seq_printf(seq, " %lu", 368 seq_printf(seq, " %lu",
363 snmp_fold_field((void **)net->mib.tcp_statistics, 369 snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
364 snmp4_tcp_list[i].entry)); 370 snmp4_tcp_list[i].entry));
365 } 371 }
366 372
@@ -371,7 +377,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
371 seq_puts(seq, "\nUdp:"); 377 seq_puts(seq, "\nUdp:");
372 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 378 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
373 seq_printf(seq, " %lu", 379 seq_printf(seq, " %lu",
374 snmp_fold_field((void **)net->mib.udp_statistics, 380 snmp_fold_field((void __percpu **)net->mib.udp_statistics,
375 snmp4_udp_list[i].entry)); 381 snmp4_udp_list[i].entry));
376 382
377 /* the UDP and UDP-Lite MIBs are the same */ 383 /* the UDP and UDP-Lite MIBs are the same */
@@ -382,7 +388,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
382 seq_puts(seq, "\nUdpLite:"); 388 seq_puts(seq, "\nUdpLite:");
383 for (i = 0; snmp4_udp_list[i].name != NULL; i++) 389 for (i = 0; snmp4_udp_list[i].name != NULL; i++)
384 seq_printf(seq, " %lu", 390 seq_printf(seq, " %lu",
385 snmp_fold_field((void **)net->mib.udplite_statistics, 391 snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
386 snmp4_udp_list[i].entry)); 392 snmp4_udp_list[i].entry));
387 393
388 seq_putc(seq, '\n'); 394 seq_putc(seq, '\n');
@@ -419,7 +425,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
419 seq_puts(seq, "\nTcpExt:"); 425 seq_puts(seq, "\nTcpExt:");
420 for (i = 0; snmp4_net_list[i].name != NULL; i++) 426 for (i = 0; snmp4_net_list[i].name != NULL; i++)
421 seq_printf(seq, " %lu", 427 seq_printf(seq, " %lu",
422 snmp_fold_field((void **)net->mib.net_statistics, 428 snmp_fold_field((void __percpu **)net->mib.net_statistics,
423 snmp4_net_list[i].entry)); 429 snmp4_net_list[i].entry));
424 430
425 seq_puts(seq, "\nIpExt:"); 431 seq_puts(seq, "\nIpExt:");
@@ -428,9 +434,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
428 434
429 seq_puts(seq, "\nIpExt:"); 435 seq_puts(seq, "\nIpExt:");
430 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) 436 for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
431 seq_printf(seq, " %lu", 437 seq_printf(seq, " %llu",
432 snmp_fold_field((void **)net->mib.ip_statistics, 438 snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
433 snmp4_ipextstats_list[i].entry)); 439 snmp4_ipextstats_list[i].entry,
440 offsetof(struct ipstats_mib, syncp)));
434 441
435 seq_putc(seq, '\n'); 442 seq_putc(seq, '\n');
436 return 0; 443 return 0;
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 542f22fc98b3..f2d297351405 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -52,6 +52,7 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
52 52
53 return ret; 53 return ret;
54} 54}
55EXPORT_SYMBOL(inet_add_protocol);
55 56
56/* 57/*
57 * Remove a protocol from the hash tables. 58 * Remove a protocol from the hash tables.
@@ -76,6 +77,4 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
76 77
77 return ret; 78 return ret;
78} 79}
79
80EXPORT_SYMBOL(inet_add_protocol);
81EXPORT_SYMBOL(inet_del_protocol); 80EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ce154b47f1da..009a7b2aa1ef 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -60,7 +60,6 @@
60#include <net/net_namespace.h> 60#include <net/net_namespace.h>
61#include <net/dst.h> 61#include <net/dst.h>
62#include <net/sock.h> 62#include <net/sock.h>
63#include <linux/gfp.h>
64#include <linux/ip.h> 63#include <linux/ip.h>
65#include <linux/net.h> 64#include <linux/net.h>
66#include <net/ip.h> 65#include <net/ip.h>
@@ -291,7 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
291{ 290{
292 /* Charge it to the socket. */ 291 /* Charge it to the socket. */
293 292
294 if (sock_queue_rcv_skb(sk, skb) < 0) { 293 if (ip_queue_rcv_skb(sk, skb) < 0) {
295 kfree_skb(skb); 294 kfree_skb(skb);
296 return NET_RX_DROP; 295 return NET_RX_DROP;
297 } 296 }
@@ -315,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
315} 314}
316 315
317static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, 316static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
318 struct rtable *rt, 317 struct rtable **rtp,
319 unsigned int flags) 318 unsigned int flags)
320{ 319{
321 struct inet_sock *inet = inet_sk(sk); 320 struct inet_sock *inet = inet_sk(sk);
@@ -324,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
324 struct sk_buff *skb; 323 struct sk_buff *skb;
325 unsigned int iphlen; 324 unsigned int iphlen;
326 int err; 325 int err;
326 struct rtable *rt = *rtp;
327 327
328 if (length > rt->u.dst.dev->mtu) { 328 if (length > rt->dst.dev->mtu) {
329 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 329 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
330 rt->u.dst.dev->mtu); 330 rt->dst.dev->mtu);
331 return -EMSGSIZE; 331 return -EMSGSIZE;
332 } 332 }
333 if (flags&MSG_PROBE) 333 if (flags&MSG_PROBE)
334 goto out; 334 goto out;
335 335
336 skb = sock_alloc_send_skb(sk, 336 skb = sock_alloc_send_skb(sk,
337 length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15, 337 length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
338 flags & MSG_DONTWAIT, &err); 338 flags & MSG_DONTWAIT, &err);
339 if (skb == NULL) 339 if (skb == NULL)
340 goto error; 340 goto error;
341 skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev)); 341 skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
342 342
343 skb->priority = sk->sk_priority; 343 skb->priority = sk->sk_priority;
344 skb->mark = sk->sk_mark; 344 skb->mark = sk->sk_mark;
345 skb_dst_set(skb, dst_clone(&rt->u.dst)); 345 skb_dst_set(skb, &rt->dst);
346 *rtp = NULL;
346 347
347 skb_reset_network_header(skb); 348 skb_reset_network_header(skb);
348 iph = ip_hdr(skb); 349 iph = ip_hdr(skb);
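raw_send_hdrinc() now takes struct rtable **rtp: instead of dst_clone() bumping the refcount so both the caller and the skb hold the route, skb_dst_set(skb, &rt->dst) consumes the caller's reference and *rtp = NULL records that ownership moved, saving a pair of atomic operations per packet. The pattern in miniature (toy types, not the kernel API):

    #include <assert.h>
    #include <stddef.h>

    struct dst { int refcnt; };

    static void send_one(struct dst **rtp)
    {
            struct dst *rt = *rtp;
            struct dst *skb_dst;

            skb_dst = rt;   /* skb_dst_set(): transfer, no refcnt++ */
            *rtp = NULL;    /* caller's reference is gone */
            (void)skb_dst;  /* would be dropped when the skb is freed */
    }

    int main(void)
    {
            struct dst d = { .refcnt = 1 }, *rt = &d;

            send_one(&rt);
            assert(rt == NULL);  /* caller must not ip_rt_put() it again */
            return 0;
    }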
@@ -374,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
374 iph->check = 0; 375 iph->check = 0;
375 iph->tot_len = htons(length); 376 iph->tot_len = htons(length);
376 if (!iph->id) 377 if (!iph->id)
377 ip_select_ident(iph, &rt->u.dst, NULL); 378 ip_select_ident(iph, &rt->dst, NULL);
378 379
379 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 380 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
380 } 381 }
@@ -382,8 +383,8 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
382 icmp_out_count(net, ((struct icmphdr *) 383 icmp_out_count(net, ((struct icmphdr *)
383 skb_transport_header(skb))->type); 384 skb_transport_header(skb))->type);
384 385
385 err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, 386 err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
386 dst_output); 387 rt->dst.dev, dst_output);
387 if (err > 0) 388 if (err > 0)
388 err = net_xmit_errno(err); 389 err = net_xmit_errno(err);
389 if (err) 390 if (err)
@@ -577,7 +578,7 @@ back_from_confirm:
577 578
578 if (inet->hdrincl) 579 if (inet->hdrincl)
579 err = raw_send_hdrinc(sk, msg->msg_iov, len, 580 err = raw_send_hdrinc(sk, msg->msg_iov, len,
580 rt, msg->msg_flags); 581 &rt, msg->msg_flags);
581 582
582 else { 583 else {
583 if (!ipc.addr) 584 if (!ipc.addr)
@@ -605,7 +606,7 @@ out:
605 return len; 606 return len;
606 607
607do_confirm: 608do_confirm:
608 dst_confirm(&rt->u.dst); 609 dst_confirm(&rt->dst);
609 if (!(msg->msg_flags & MSG_PROBE) || len) 610 if (!(msg->msg_flags & MSG_PROBE) || len)
610 goto back_from_confirm; 611 goto back_from_confirm;
611 err = 0; 612 err = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d62b05d33384..ac6559cb54f9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -90,6 +90,7 @@
90#include <linux/jhash.h> 90#include <linux/jhash.h>
91#include <linux/rcupdate.h> 91#include <linux/rcupdate.h>
92#include <linux/times.h> 92#include <linux/times.h>
93#include <linux/slab.h>
93#include <net/dst.h> 94#include <net/dst.h>
94#include <net/net_namespace.h> 95#include <net/net_namespace.h>
95#include <net/protocol.h> 96#include <net/protocol.h>
@@ -128,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work; 134static struct delayed_work expires_work;
@@ -146,7 +146,6 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb); 146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops); 148static int rt_garbage_collect(struct dst_ops *ops);
149static void rt_emergency_hash_rebuild(struct net *net);
150 149
151 150
152static struct dst_ops ipv4_dst_ops = { 151static struct dst_ops ipv4_dst_ops = {
@@ -254,14 +253,12 @@ static unsigned rt_hash_mask __read_mostly;
254static unsigned int rt_hash_log __read_mostly; 253static unsigned int rt_hash_log __read_mostly;
255 254
256static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257#define RT_CACHE_STAT_INC(field) \ 256#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
258 (__raw_get_cpu_var(rt_cache_stat).field++)
259 257
260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, 258static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid) 259 int genid)
262{ 260{
263 return jhash_3words((__force u32)(__be32)(daddr), 261 return jhash_3words((__force u32)daddr, (__force u32)saddr,
264 (__force u32)(__be32)(saddr),
265 idx, genid) 262 idx, genid)
266 & rt_hash_mask; 263 & rt_hash_mask;
267} 264}
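Two micro-cleanups in this hunk: RT_CACHE_STAT_INC() now expands to __this_cpu_inc(), which on x86 compiles to a single segment-relative increment instead of an address computation followed by a ++, and rt_hash() drops the redundant (__be32) casts since jhash_3words() takes plain u32 arguments anyway. Usage is unchanged, e.g.:

    /* illustrative call sites; field and variable names are examples */
    RT_CACHE_STAT_INC(in_slow_tot);       /* one preempt-safe per-cpu add */
    hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(net));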
@@ -287,12 +284,12 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 if (!rt_hash_table[st->bucket].chain) 284 if (!rt_hash_table[st->bucket].chain)
288 continue; 285 continue;
289 rcu_read_lock_bh(); 286 rcu_read_lock_bh();
290 r = rcu_dereference(rt_hash_table[st->bucket].chain); 287 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
291 while (r) { 288 while (r) {
292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) && 289 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
293 r->rt_genid == st->genid) 290 r->rt_genid == st->genid)
294 return r; 291 return r;
295 r = rcu_dereference(r->u.dst.rt_next); 292 r = rcu_dereference_bh(r->dst.rt_next);
296 } 293 }
297 rcu_read_unlock_bh(); 294 rcu_read_unlock_bh();
298 } 295 }
@@ -304,7 +301,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
304{ 301{
305 struct rt_cache_iter_state *st = seq->private; 302 struct rt_cache_iter_state *st = seq->private;
306 303
307 r = r->u.dst.rt_next; 304 r = r->dst.rt_next;
308 while (!r) { 305 while (!r) {
309 rcu_read_unlock_bh(); 306 rcu_read_unlock_bh();
310 do { 307 do {
@@ -314,7 +311,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
314 rcu_read_lock_bh(); 311 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain; 312 r = rt_hash_table[st->bucket].chain;
316 } 313 }
317 return rcu_dereference(r); 314 return rcu_dereference_bh(r);
318} 315}
319 316
320static struct rtable *rt_cache_get_next(struct seq_file *seq, 317static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -322,7 +319,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq,
322{ 319{
323 struct rt_cache_iter_state *st = seq->private; 320 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) { 321 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq)) 322 if (dev_net(r->dst.dev) != seq_file_net(seq))
326 continue; 323 continue;
327 if (r->rt_genid == st->genid) 324 if (r->rt_genid == st->genid)
328 break; 325 break;
@@ -378,20 +375,21 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
378 struct rtable *r = v; 375 struct rtable *r = v;
379 int len; 376 int len;
380 377
381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" 378 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 379 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383 r->u.dst.dev ? r->u.dst.dev->name : "*", 380 r->dst.dev ? r->dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, 381 (__force u32)r->rt_dst,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt), 382 (__force u32)r->rt_gateway,
386 r->u.dst.__use, 0, (unsigned long)r->rt_src, 383 r->rt_flags, atomic_read(&r->dst.__refcnt),
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ? 384 r->dst.__use, 0, (__force u32)r->rt_src,
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), 385 (dst_metric(&r->dst, RTAX_ADVMSS) ?
389 dst_metric(&r->u.dst, RTAX_WINDOW), 386 (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + 387 dst_metric(&r->dst, RTAX_WINDOW),
391 dst_metric(&r->u.dst, RTAX_RTTVAR)), 388 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
389 dst_metric(&r->dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos, 390 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, 391 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output == 392 r->dst.hh ? (r->dst.hh->hh_output ==
395 dev_queue_xmit) : 0, 393 dev_queue_xmit) : 0,
396 r->rt_spec_dst, &len); 394 r->rt_spec_dst, &len);
397 395
@@ -610,13 +608,13 @@ static inline int ip_rt_proc_init(void)
610 608
611static inline void rt_free(struct rtable *rt) 609static inline void rt_free(struct rtable *rt)
612{ 610{
613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 611 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
614} 612}
615 613
616static inline void rt_drop(struct rtable *rt) 614static inline void rt_drop(struct rtable *rt)
617{ 615{
618 ip_rt_put(rt); 616 ip_rt_put(rt);
619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 617 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
620} 618}
621 619
622static inline int rt_fast_clean(struct rtable *rth) 620static inline int rt_fast_clean(struct rtable *rth)
@@ -624,13 +622,13 @@ static inline int rt_fast_clean(struct rtable *rth)
624 /* Kill broadcast/multicast entries very aggressively, if they 622 /* Kill broadcast/multicast entries very aggressively, if they
625 collide in hash table with more useful entries */ 623 collide in hash table with more useful entries */
626 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 624 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627 rth->fl.iif && rth->u.dst.rt_next; 625 rth->fl.iif && rth->dst.rt_next;
628} 626}
629 627
630static inline int rt_valuable(struct rtable *rth) 628static inline int rt_valuable(struct rtable *rth)
631{ 629{
632 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 630 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633 rth->u.dst.expires; 631 rth->dst.expires;
634} 632}
635 633
636static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 634static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -638,15 +636,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
638 unsigned long age; 636 unsigned long age;
639 int ret = 0; 637 int ret = 0;
640 638
641 if (atomic_read(&rth->u.dst.__refcnt)) 639 if (atomic_read(&rth->dst.__refcnt))
642 goto out; 640 goto out;
643 641
644 ret = 1; 642 ret = 1;
645 if (rth->u.dst.expires && 643 if (rth->dst.expires &&
646 time_after_eq(jiffies, rth->u.dst.expires)) 644 time_after_eq(jiffies, rth->dst.expires))
647 goto out; 645 goto out;
648 646
649 age = jiffies - rth->u.dst.lastuse; 647 age = jiffies - rth->dst.lastuse;
650 ret = 0; 648 ret = 0;
651 if ((age <= tmo1 && !rt_fast_clean(rth)) || 649 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652 (age <= tmo2 && rt_valuable(rth))) 650 (age <= tmo2 && rt_valuable(rth)))
@@ -662,7 +660,7 @@ out: return ret;
662 */ 660 */
663static inline u32 rt_score(struct rtable *rt) 661static inline u32 rt_score(struct rtable *rt)
664{ 662{
665 u32 score = jiffies - rt->u.dst.lastuse; 663 u32 score = jiffies - rt->dst.lastuse;
666 664
667 score = ~score & ~(3<<30); 665 score = ~score & ~(3<<30);
668 666
@@ -685,30 +683,29 @@ static inline bool rt_caching(const struct net *net)
685static inline bool compare_hash_inputs(const struct flowi *fl1, 683static inline bool compare_hash_inputs(const struct flowi *fl1,
686 const struct flowi *fl2) 684 const struct flowi *fl2)
687{ 685{
688 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | 686 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
689 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | 687 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
690 (fl1->iif ^ fl2->iif)) == 0); 688 (fl1->iif ^ fl2->iif)) == 0);
691} 689}
692 690
693static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 691static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694{ 692{
695 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | 693 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
696 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) | 694 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
697 (fl1->mark ^ fl2->mark) | 695 (fl1->mark ^ fl2->mark) |
698 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ 696 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
699 *(u16 *)&fl2->nl_u.ip4_u.tos) |
700 (fl1->oif ^ fl2->oif) | 697 (fl1->oif ^ fl2->oif) |
701 (fl1->iif ^ fl2->iif)) == 0; 698 (fl1->iif ^ fl2->iif)) == 0;
702} 699}
703 700
704static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 701static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705{ 702{
706 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); 703 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
707} 704}
708 705
709static inline int rt_is_expired(struct rtable *rth) 706static inline int rt_is_expired(struct rtable *rth)
710{ 707{
711 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); 708 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
712} 709}
713 710
714/* 711/*
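compare_hash_inputs() and compare_keys() keep their branch-free xor-and-or comparison; the rewrite only moves the __force u32 casts onto the individual __be32 fields so sparse stops warning about mixed-endianness arithmetic, without changing the generated code. The idiom, reduced to essentials:

    /* Zero iff every field pair matches; one branch for the whole test. */
    static inline int keys_equal(u32 d1, u32 d2, u32 s1, u32 s2,
                                 int oif1, int oif2)
    {
            return ((d1 ^ d2) | (s1 ^ s2) | (u32)(oif1 ^ oif2)) == 0;
    }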
@@ -737,7 +734,7 @@ static void rt_do_flush(int process_context)
737 rth = rt_hash_table[i].chain; 734 rth = rt_hash_table[i].chain;
738 735
739 /* defer releasing the head of the list after spin_unlock */ 736 /* defer releasing the head of the list after spin_unlock */
740 for (tail = rth; tail; tail = tail->u.dst.rt_next) 737 for (tail = rth; tail; tail = tail->dst.rt_next)
741 if (!rt_is_expired(tail)) 738 if (!rt_is_expired(tail))
742 break; 739 break;
743 if (rth != tail) 740 if (rth != tail)
@@ -746,9 +743,9 @@ static void rt_do_flush(int process_context)
746 /* call rt_free on entries after the tail requiring flush */ 743 /* call rt_free on entries after the tail requiring flush */
747 prev = &rt_hash_table[i].chain; 744 prev = &rt_hash_table[i].chain;
748 for (p = *prev; p; p = next) { 745 for (p = *prev; p; p = next) {
749 next = p->u.dst.rt_next; 746 next = p->dst.rt_next;
750 if (!rt_is_expired(p)) { 747 if (!rt_is_expired(p)) {
751 prev = &p->u.dst.rt_next; 748 prev = &p->dst.rt_next;
752 } else { 749 } else {
753 *prev = next; 750 *prev = next;
754 rt_free(p); 751 rt_free(p);
@@ -763,7 +760,7 @@ static void rt_do_flush(int process_context)
763 spin_unlock_bh(rt_hash_lock_addr(i)); 760 spin_unlock_bh(rt_hash_lock_addr(i));
764 761
765 for (; rth != tail; rth = next) { 762 for (; rth != tail; rth = next) {
766 next = rth->u.dst.rt_next; 763 next = rth->dst.rt_next;
767 rt_free(rth); 764 rt_free(rth);
768 } 765 }
769 } 766 }
@@ -780,11 +777,30 @@ static void rt_do_flush(int process_context)
780#define FRACT_BITS 3 777#define FRACT_BITS 3
781#define ONE (1UL << FRACT_BITS) 778#define ONE (1UL << FRACT_BITS)
782 779
780/*
781 * Given a hash chain and an item in this hash chain,
782 * find if a previous entry has the same hash_inputs
783 * (but differs on tos, mark or oif)
784 * Returns 0 if an alias is found.
785 * Returns ONE if rth has no alias before itself.
786 */
787static int has_noalias(const struct rtable *head, const struct rtable *rth)
788{
789 const struct rtable *aux = head;
790
791 while (aux != rth) {
792 if (compare_hash_inputs(&aux->fl, &rth->fl))
793 return 0;
794 aux = aux->dst.rt_next;
795 }
796 return ONE;
797}
798
783static void rt_check_expire(void) 799static void rt_check_expire(void)
784{ 800{
785 static unsigned int rover; 801 static unsigned int rover;
786 unsigned int i = rover, goal; 802 unsigned int i = rover, goal;
787 struct rtable *rth, *aux, **rthp; 803 struct rtable *rth, **rthp;
788 unsigned long samples = 0; 804 unsigned long samples = 0;
789 unsigned long sum = 0, sum2 = 0; 805 unsigned long sum = 0, sum2 = 0;
790 unsigned long delta; 806 unsigned long delta;
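The new has_noalias() factors out the alias scan that rt_check_expire() used to open-code (and that rt_intern_hash() reuses further down): entries whose flow keys hash identically but differ only in tos, mark or oif form one alias group, and only the group's first member contributes ONE (1.0 in the FRACT_BITS fixed-point scale) to the measured chain length. A standalone toy model, with an integer key standing in for compare_hash_inputs():

    #include <stdio.h>

    #define FRACT_BITS 3
    #define ONE (1UL << FRACT_BITS)

    struct rt { unsigned key; const struct rt *next; };

    /* ONE if no earlier chain entry shares rth's hash inputs, else 0,
     * so each alias group is counted exactly once. */
    static unsigned long has_noalias(const struct rt *head, const struct rt *rth)
    {
            for (const struct rt *aux = head; aux != rth; aux = aux->next)
                    if (aux->key == rth->key)
                            return 0;
            return ONE;
    }

    int main(void)
    {
            struct rt c = { 7, NULL }, b = { 5, &c }, a = { 5, &b };
            unsigned long len = 0;

            for (const struct rt *r = &a; r; r = r->next)
                    len += has_noalias(&a, r);
            printf("%lu distinct-by-inputs entries\n", len >> FRACT_BITS); /* 2 */
            return 0;
    }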
@@ -815,18 +831,18 @@ static void rt_check_expire(void)
815 length = 0; 831 length = 0;
816 spin_lock_bh(rt_hash_lock_addr(i)); 832 spin_lock_bh(rt_hash_lock_addr(i));
817 while ((rth = *rthp) != NULL) { 833 while ((rth = *rthp) != NULL) {
818 prefetch(rth->u.dst.rt_next); 834 prefetch(rth->dst.rt_next);
819 if (rt_is_expired(rth)) { 835 if (rt_is_expired(rth)) {
820 *rthp = rth->u.dst.rt_next; 836 *rthp = rth->dst.rt_next;
821 rt_free(rth); 837 rt_free(rth);
822 continue; 838 continue;
823 } 839 }
824 if (rth->u.dst.expires) { 840 if (rth->dst.expires) {
825 /* Entry is expired even if it is in use */ 841 /* Entry is expired even if it is in use */
826 if (time_before_eq(jiffies, rth->u.dst.expires)) { 842 if (time_before_eq(jiffies, rth->dst.expires)) {
827nofree: 843nofree:
828 tmo >>= 1; 844 tmo >>= 1;
829 rthp = &rth->u.dst.rt_next; 845 rthp = &rth->dst.rt_next;
830 /* 846 /*
831 * We only count entries on 847 * We only count entries on
832 * a chain with equal hash inputs once 848 * a chain with equal hash inputs once
@@ -835,22 +851,14 @@ nofree:
835 * attributes don't unfairly skew 851 * attributes don't unfairly skew
836 * the length computation 852 * the length computation
837 */ 853 */
838 for (aux = rt_hash_table[i].chain;;) { 854 length += has_noalias(rt_hash_table[i].chain, rth);
839 if (aux == rth) {
840 length += ONE;
841 break;
842 }
843 if (compare_hash_inputs(&aux->fl, &rth->fl))
844 break;
845 aux = aux->u.dst.rt_next;
846 }
847 continue; 855 continue;
848 } 856 }
849 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) 857 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850 goto nofree; 858 goto nofree;
851 859
852 /* Cleanup aged off entries. */ 860 /* Cleanup aged off entries. */
853 *rthp = rth->u.dst.rt_next; 861 *rthp = rth->dst.rt_next;
854 rt_free(rth); 862 rt_free(rth);
855 } 863 }
856 spin_unlock_bh(rt_hash_lock_addr(i)); 864 spin_unlock_bh(rt_hash_lock_addr(i));
@@ -908,34 +916,11 @@ void rt_cache_flush_batch(void)
908 rt_do_flush(!in_softirq()); 916 rt_do_flush(!in_softirq());
909} 917}
910 918
911/*
912 * We change rt_genid and let gc do the cleanup
913 */
914static void rt_secret_rebuild(unsigned long __net)
915{
916 struct net *net = (struct net *)__net;
917 rt_cache_invalidate(net);
918 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
919}
920
921static void rt_secret_rebuild_oneshot(struct net *net)
922{
923 del_timer_sync(&net->ipv4.rt_secret_timer);
924 rt_cache_invalidate(net);
925 if (ip_rt_secret_interval) {
926 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
927 add_timer(&net->ipv4.rt_secret_timer);
928 }
929}
930
931static void rt_emergency_hash_rebuild(struct net *net) 919static void rt_emergency_hash_rebuild(struct net *net)
932{ 920{
933 if (net_ratelimit()) { 921 if (net_ratelimit())
934 printk(KERN_WARNING "Route hash chain too long!\n"); 922 printk(KERN_WARNING "Route hash chain too long!\n");
935 printk(KERN_WARNING "Adjust your secret_interval!\n"); 923 rt_cache_invalidate(net);
936 }
937
938 rt_secret_rebuild_oneshot(net);
939} 924}
940 925
941/* 926/*
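With the periodic secret-interval timer gone, an overlong chain now simply invalidates the cache: rt_emergency_hash_rebuild() bumps the per-netns generation id, rt_is_expired() then fails for every existing entry, and gc reaps them lazily instead of a timer forcing wholesale rebuilds. A sketch of the invalidation, assuming the atomic genid that rt_genid() and rt_is_expired() already read:

    static void rt_cache_invalidate(struct net *net)
    {
            unsigned char shuffle;

            /* advance by a random step so genids are hard to predict */
            get_random_bytes(&shuffle, sizeof(shuffle));
            atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
    }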
@@ -1014,10 +999,10 @@ static int rt_garbage_collect(struct dst_ops *ops)
1014 if (!rt_is_expired(rth) && 999 if (!rt_is_expired(rth) &&
1015 !rt_may_expire(rth, tmo, expire)) { 1000 !rt_may_expire(rth, tmo, expire)) {
1016 tmo >>= 1; 1001 tmo >>= 1;
1017 rthp = &rth->u.dst.rt_next; 1002 rthp = &rth->dst.rt_next;
1018 continue; 1003 continue;
1019 } 1004 }
1020 *rthp = rth->u.dst.rt_next; 1005 *rthp = rth->dst.rt_next;
1021 rt_free(rth); 1006 rt_free(rth);
1022 goal--; 1007 goal--;
1023 } 1008 }
@@ -1073,8 +1058,23 @@ work_done:
1073out: return 0; 1058out: return 0;
1074} 1059}
1075 1060
1061/*
1062 * Returns number of entries in a hash chain that have different hash_inputs
1063 */
1064static int slow_chain_length(const struct rtable *head)
1065{
1066 int length = 0;
1067 const struct rtable *rth = head;
1068
1069 while (rth) {
1070 length += has_noalias(head, rth);
1071 rth = rth->dst.rt_next;
1072 }
1073 return length >> FRACT_BITS;
1074}
1075
1076static int rt_intern_hash(unsigned hash, struct rtable *rt, 1076static int rt_intern_hash(unsigned hash, struct rtable *rt,
1077 struct rtable **rp, struct sk_buff *skb) 1077 struct rtable **rp, struct sk_buff *skb, int ifindex)
1078{ 1078{
1079 struct rtable *rth, **rthp; 1079 struct rtable *rth, **rthp;
1080 unsigned long now; 1080 unsigned long now;
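slow_chain_length() is the alias-aware counterpart of the cheap per-insert chain counter: built on has_noalias(), it counts alias groups rather than raw entries. It backs the two-stage test added below, where an emergency rebuild is only declared when the chain is long even after discounting aliases; rt_intern_hash() also gains the ifindex argument needed to rehash and retry after such a rebuild. In sketch form:

    /* chain_length is cheap but counts aliases; only pay for the exact
     * count when the cheap one already looks alarming. */
    if (chain_length > rt_chain_length_max &&
        slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
            rt_emergency_hash_rebuild(net);
            /* rehash with the new genid and retry the insert */
            hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
                           ifindex, rt_genid(net));
    }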
@@ -1090,7 +1090,7 @@ restart:
1090 candp = NULL; 1090 candp = NULL;
1091 now = jiffies; 1091 now = jiffies;
1092 1092
1093 if (!rt_caching(dev_net(rt->u.dst.dev))) { 1093 if (!rt_caching(dev_net(rt->dst.dev))) {
1094 /* 1094 /*
1095 * If we're not caching, just tell the caller we 1095 * If we're not caching, just tell the caller we
1096 * were successful and don't touch the route. The 1096 * were successful and don't touch the route. The
@@ -1108,7 +1108,7 @@ restart:
1108 */ 1108 */
1109 1109
1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111 int err = arp_bind_neighbour(&rt->u.dst); 1111 int err = arp_bind_neighbour(&rt->dst);
1112 if (err) { 1112 if (err) {
1113 if (net_ratelimit()) 1113 if (net_ratelimit())
1114 printk(KERN_WARNING 1114 printk(KERN_WARNING
@@ -1127,19 +1127,19 @@ restart:
1127 spin_lock_bh(rt_hash_lock_addr(hash)); 1127 spin_lock_bh(rt_hash_lock_addr(hash));
1128 while ((rth = *rthp) != NULL) { 1128 while ((rth = *rthp) != NULL) {
1129 if (rt_is_expired(rth)) { 1129 if (rt_is_expired(rth)) {
1130 *rthp = rth->u.dst.rt_next; 1130 *rthp = rth->dst.rt_next;
1131 rt_free(rth); 1131 rt_free(rth);
1132 continue; 1132 continue;
1133 } 1133 }
1134 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1134 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1135 /* Put it first */ 1135 /* Put it first */
1136 *rthp = rth->u.dst.rt_next; 1136 *rthp = rth->dst.rt_next;
1137 /* 1137 /*
1138 * Since lookup is lockfree, the deletion 1138 * Since lookup is lockfree, the deletion
1139 * must be visible to another weakly ordered CPU before 1139 * must be visible to another weakly ordered CPU before
1140 * the insertion at the start of the hash chain. 1140 * the insertion at the start of the hash chain.
1141 */ 1141 */
1142 rcu_assign_pointer(rth->u.dst.rt_next, 1142 rcu_assign_pointer(rth->dst.rt_next,
1143 rt_hash_table[hash].chain); 1143 rt_hash_table[hash].chain);
1144 /* 1144 /*
1145 * Since lookup is lockfree, the update writes 1145 * Since lookup is lockfree, the update writes
@@ -1147,18 +1147,18 @@ restart:
1147 */ 1147 */
1148 rcu_assign_pointer(rt_hash_table[hash].chain, rth); 1148 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149 1149
1150 dst_use(&rth->u.dst, now); 1150 dst_use(&rth->dst, now);
1151 spin_unlock_bh(rt_hash_lock_addr(hash)); 1151 spin_unlock_bh(rt_hash_lock_addr(hash));
1152 1152
1153 rt_drop(rt); 1153 rt_drop(rt);
1154 if (rp) 1154 if (rp)
1155 *rp = rth; 1155 *rp = rth;
1156 else 1156 else
1157 skb_dst_set(skb, &rth->u.dst); 1157 skb_dst_set(skb, &rth->dst);
1158 return 0; 1158 return 0;
1159 } 1159 }
1160 1160
1161 if (!atomic_read(&rth->u.dst.__refcnt)) { 1161 if (!atomic_read(&rth->dst.__refcnt)) {
1162 u32 score = rt_score(rth); 1162 u32 score = rt_score(rth);
1163 1163
1164 if (score <= min_score) { 1164 if (score <= min_score) {
@@ -1170,7 +1170,7 @@ restart:
1170 1170
1171 chain_length++; 1171 chain_length++;
1172 1172
1173 rthp = &rth->u.dst.rt_next; 1173 rthp = &rth->dst.rt_next;
1174 } 1174 }
1175 1175
1176 if (cand) { 1176 if (cand) {
@@ -1181,18 +1181,24 @@ restart:
1181 * only 2 entries per bucket. We will see. 1181 * only 2 entries per bucket. We will see.
1182 */ 1182 */
1183 if (chain_length > ip_rt_gc_elasticity) { 1183 if (chain_length > ip_rt_gc_elasticity) {
1184 *candp = cand->u.dst.rt_next; 1184 *candp = cand->dst.rt_next;
1185 rt_free(cand); 1185 rt_free(cand);
1186 } 1186 }
1187 } else { 1187 } else {
1188 if (chain_length > rt_chain_length_max) { 1188 if (chain_length > rt_chain_length_max &&
1189 struct net *net = dev_net(rt->u.dst.dev); 1189 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190 struct net *net = dev_net(rt->dst.dev);
1190 int num = ++net->ipv4.current_rt_cache_rebuild_count; 1191 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1191 if (!rt_caching(dev_net(rt->u.dst.dev))) { 1192 if (!rt_caching(net)) {
1192 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", 1193 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1193 rt->u.dst.dev->name, num); 1194 rt->dst.dev->name, num);
1194 } 1195 }
1195 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); 1196 rt_emergency_hash_rebuild(net);
1197 spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1200 ifindex, rt_genid(net));
1201 goto restart;
1196 } 1202 }
1197 } 1203 }
1198 1204
@@ -1200,7 +1206,7 @@ restart:
1200 route or unicast forwarding path. 1206 route or unicast forwarding path.
1201 */ 1207 */
1202 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1208 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1203 int err = arp_bind_neighbour(&rt->u.dst); 1209 int err = arp_bind_neighbour(&rt->dst);
1204 if (err) { 1210 if (err) {
1205 spin_unlock_bh(rt_hash_lock_addr(hash)); 1211 spin_unlock_bh(rt_hash_lock_addr(hash));
1206 1212
@@ -1225,20 +1231,20 @@ restart:
1225 } 1231 }
1226 1232
1227 if (net_ratelimit()) 1233 if (net_ratelimit())
1228 printk(KERN_WARNING "Neighbour table overflow.\n"); 1234 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1229 rt_drop(rt); 1235 rt_drop(rt);
1230 return -ENOBUFS; 1236 return -ENOBUFS;
1231 } 1237 }
1232 } 1238 }
1233 1239
1234 rt->u.dst.rt_next = rt_hash_table[hash].chain; 1240 rt->dst.rt_next = rt_hash_table[hash].chain;
1235 1241
1236#if RT_CACHE_DEBUG >= 2 1242#if RT_CACHE_DEBUG >= 2
1237 if (rt->u.dst.rt_next) { 1243 if (rt->dst.rt_next) {
1238 struct rtable *trt; 1244 struct rtable *trt;
1239 printk(KERN_DEBUG "rt_cache @%02x: %pI4", 1245 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1240 hash, &rt->rt_dst); 1246 hash, &rt->rt_dst);
1241 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) 1247 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1242 printk(" . %pI4", &trt->rt_dst); 1248 printk(" . %pI4", &trt->rt_dst);
1243 printk("\n"); 1249 printk("\n");
1244 } 1250 }
@@ -1256,7 +1262,7 @@ skip_hashing:
1256 if (rp) 1262 if (rp)
1257 *rp = rt; 1263 *rp = rt;
1258 else 1264 else
1259 skb_dst_set(skb, &rt->u.dst); 1265 skb_dst_set(skb, &rt->dst);
1260 return 0; 1266 return 0;
1261} 1267}
1262 1268
@@ -1318,6 +1324,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1318 1324
1319 ip_select_fb_ident(iph); 1325 ip_select_fb_ident(iph);
1320} 1326}
1327EXPORT_SYMBOL(__ip_select_ident);
1321 1328
1322static void rt_del(unsigned hash, struct rtable *rt) 1329static void rt_del(unsigned hash, struct rtable *rt)
1323{ 1330{
@@ -1328,20 +1335,21 @@ static void rt_del(unsigned hash, struct rtable *rt)
1328 ip_rt_put(rt); 1335 ip_rt_put(rt);
1329 while ((aux = *rthp) != NULL) { 1336 while ((aux = *rthp) != NULL) {
1330 if (aux == rt || rt_is_expired(aux)) { 1337 if (aux == rt || rt_is_expired(aux)) {
1331 *rthp = aux->u.dst.rt_next; 1338 *rthp = aux->dst.rt_next;
1332 rt_free(aux); 1339 rt_free(aux);
1333 continue; 1340 continue;
1334 } 1341 }
1335 rthp = &aux->u.dst.rt_next; 1342 rthp = &aux->dst.rt_next;
1336 } 1343 }
1337 spin_unlock_bh(rt_hash_lock_addr(hash)); 1344 spin_unlock_bh(rt_hash_lock_addr(hash));
1338} 1345}
1339 1346
1347/* called in rcu_read_lock() section */
1340void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1348void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1341 __be32 saddr, struct net_device *dev) 1349 __be32 saddr, struct net_device *dev)
1342{ 1350{
1343 int i, k; 1351 int i, k;
1344 struct in_device *in_dev = in_dev_get(dev); 1352 struct in_device *in_dev = __in_dev_get_rcu(dev);
1345 struct rtable *rth, **rthp; 1353 struct rtable *rth, **rthp;
1346 __be32 skeys[2] = { saddr, 0 }; 1354 __be32 skeys[2] = { saddr, 0 };
1347 int ikeys[2] = { dev->ifindex, 0 }; 1355 int ikeys[2] = { dev->ifindex, 0 };
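ip_rt_redirect() is re-documented as running inside the caller's rcu_read_lock() section: the refcounted in_dev_get()/in_dev_put() pair becomes a plain __in_dev_get_rcu() load, and the function's internal rcu_read_lock()/unlock pairs and the put on every exit path go away (visible in the following hunks). The contract this implies for callers, with illustrative variable names:

    /* hypothetical caller, e.g. the ICMP redirect input path */
    rcu_read_lock();
    ip_rt_redirect(old_gw, iph->daddr, new_gw, iph->saddr, skb->dev);
    rcu_read_unlock();  /* in_dev is only valid inside this section */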
@@ -1377,7 +1385,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1377 1385
1378 rthp=&rt_hash_table[hash].chain; 1386 rthp=&rt_hash_table[hash].chain;
1379 1387
1380 rcu_read_lock();
1381 while ((rth = rcu_dereference(*rthp)) != NULL) { 1388 while ((rth = rcu_dereference(*rthp)) != NULL) {
1382 struct rtable *rt; 1389 struct rtable *rt;
1383 1390
@@ -1386,44 +1393,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1386 rth->fl.oif != ikeys[k] || 1393 rth->fl.oif != ikeys[k] ||
1387 rth->fl.iif != 0 || 1394 rth->fl.iif != 0 ||
1388 rt_is_expired(rth) || 1395 rt_is_expired(rth) ||
1389 !net_eq(dev_net(rth->u.dst.dev), net)) { 1396 !net_eq(dev_net(rth->dst.dev), net)) {
1390 rthp = &rth->u.dst.rt_next; 1397 rthp = &rth->dst.rt_next;
1391 continue; 1398 continue;
1392 } 1399 }
1393 1400
1394 if (rth->rt_dst != daddr || 1401 if (rth->rt_dst != daddr ||
1395 rth->rt_src != saddr || 1402 rth->rt_src != saddr ||
1396 rth->u.dst.error || 1403 rth->dst.error ||
1397 rth->rt_gateway != old_gw || 1404 rth->rt_gateway != old_gw ||
1398 rth->u.dst.dev != dev) 1405 rth->dst.dev != dev)
1399 break; 1406 break;
1400 1407
1401 dst_hold(&rth->u.dst); 1408 dst_hold(&rth->dst);
1402 rcu_read_unlock();
1403 1409
 				rt = dst_alloc(&ipv4_dst_ops);
 				if (rt == NULL) {
 					ip_rt_put(rth);
-					in_dev_put(in_dev);
 					return;
 				}
 
 				/* Copy all the information. */
 				*rt = *rth;
-				rt->u.dst.__use = 1;
-				atomic_set(&rt->u.dst.__refcnt, 1);
-				rt->u.dst.child = NULL;
-				if (rt->u.dst.dev)
-					dev_hold(rt->u.dst.dev);
+				rt->dst.__use = 1;
+				atomic_set(&rt->dst.__refcnt, 1);
+				rt->dst.child = NULL;
+				if (rt->dst.dev)
+					dev_hold(rt->dst.dev);
 				if (rt->idev)
 					in_dev_hold(rt->idev);
-				rt->u.dst.obsolete = 0;
-				rt->u.dst.lastuse = jiffies;
-				rt->u.dst.path = &rt->u.dst;
-				rt->u.dst.neighbour = NULL;
-				rt->u.dst.hh = NULL;
+				rt->dst.obsolete = -1;
+				rt->dst.lastuse = jiffies;
+				rt->dst.path = &rt->dst;
+				rt->dst.neighbour = NULL;
+				rt->dst.hh = NULL;
 #ifdef CONFIG_XFRM
-				rt->u.dst.xfrm = NULL;
+				rt->dst.xfrm = NULL;
 #endif
 				rt->rt_genid = rt_genid(net);
 				rt->rt_flags |= RTCF_REDIRECTED;
@@ -1432,37 +1437,35 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				rt->rt_gateway = new_gw;
 
 				/* Redirect received -> path was valid */
-				dst_confirm(&rth->u.dst);
+				dst_confirm(&rth->dst);
 
 				if (rt->peer)
 					atomic_inc(&rt->peer->refcnt);
 
-				if (arp_bind_neighbour(&rt->u.dst) ||
-				    !(rt->u.dst.neighbour->nud_state &
+				if (arp_bind_neighbour(&rt->dst) ||
+				    !(rt->dst.neighbour->nud_state &
 					    NUD_VALID)) {
-					if (rt->u.dst.neighbour)
-						neigh_event_send(rt->u.dst.neighbour, NULL);
+					if (rt->dst.neighbour)
+						neigh_event_send(rt->dst.neighbour, NULL);
 					ip_rt_put(rth);
 					rt_drop(rt);
 					goto do_next;
 				}
 
-				netevent.old = &rth->u.dst;
-				netevent.new = &rt->u.dst;
+				netevent.old = &rth->dst;
+				netevent.new = &rt->dst;
 				call_netevent_notifiers(NETEVENT_REDIRECT,
 							&netevent);
 
 				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt, NULL))
+				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
 					ip_rt_put(rt);
 				goto do_next;
 			}
-			rcu_read_unlock();
 		do_next:
 			;
 		}
 	}
-	in_dev_put(in_dev);
 	return;
 
 reject_redirect:
@@ -1473,7 +1476,7 @@ reject_redirect:
 		       &old_gw, dev->name, &new_gw,
 		       &saddr, &daddr);
 #endif
-	in_dev_put(in_dev);
+	;
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1482,11 +1485,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 	struct dst_entry *ret = dst;
 
 	if (rt) {
-		if (dst->obsolete) {
+		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   rt->u.dst.expires) {
+			   (rt->dst.expires &&
+			    time_after_eq(jiffies, rt->dst.expires))) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
@@ -1524,7 +1528,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	int log_martians;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 		rcu_read_unlock();
 		return;
@@ -1535,30 +1539,30 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	/* No redirected packets during ip_rt_redirect_silence;
 	 * reset the algorithm.
 	 */
-	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
-		rt->u.dst.rate_tokens = 0;
+	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
+		rt->dst.rate_tokens = 0;
 
 	/* Too many ignored redirects; do not send anything
-	 * set u.dst.rate_last to the last seen redirected packet.
+	 * set dst.rate_last to the last seen redirected packet.
 	 */
-	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
-		rt->u.dst.rate_last = jiffies;
+	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
+		rt->dst.rate_last = jiffies;
 		return;
 	}
 
 	/* Check for load limit; set rate_last to the latest sent
 	 * redirect.
 	 */
-	if (rt->u.dst.rate_tokens == 0 ||
+	if (rt->dst.rate_tokens == 0 ||
 	    time_after(jiffies,
-		       (rt->u.dst.rate_last +
-			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
+		       (rt->dst.rate_last +
+			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
-		rt->u.dst.rate_last = jiffies;
-		++rt->u.dst.rate_tokens;
+		rt->dst.rate_last = jiffies;
+		++rt->dst.rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
 		if (log_martians &&
-		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
+		    rt->dst.rate_tokens == ip_rt_redirect_number &&
 		    net_ratelimit())
 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
 			       &rt->rt_src, rt->rt_iif,
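The hunk above only renames the fields; the rate-limiting logic in ip_rt_send_redirect() is unchanged: a redirect goes out when rate_tokens is zero, or when the gap since rate_last exceeds ip_rt_redirect_load << rate_tokens, so the allowed interval doubles with every redirect already sent, and after ip_rt_redirect_number redirects the peer is treated as deaf. A minimal standalone C sketch of that rule; the constants are assumptions (both are runtime-tunable sysctls in the kernel):

    #include <stdbool.h>

    #define HZ                     1000        /* assumed for the sketch */
    #define IP_RT_REDIRECT_LOAD    (HZ / 50)   /* assumed default */
    #define IP_RT_REDIRECT_NUMBER  9           /* assumed default */

    struct redirect_rate {
        unsigned long rate_last;    /* time the last redirect was sent */
        unsigned long rate_tokens;  /* redirects sent since last reset */
    };

    /* Mirrors the check above: the allowed gap doubles with every
     * redirect already sent (load << tokens); once tokens reaches
     * IP_RT_REDIRECT_NUMBER we stay silent entirely. */
    static bool may_send_redirect(struct redirect_rate *st, unsigned long now)
    {
        if (st->rate_tokens >= IP_RT_REDIRECT_NUMBER)
            return false;
        if (st->rate_tokens == 0 ||
            now > st->rate_last + (IP_RT_REDIRECT_LOAD << st->rate_tokens)) {
            st->rate_last = now;
            st->rate_tokens++;
            return true;
        }
        return false;
    }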
@@ -1573,7 +1577,7 @@ static int ip_error(struct sk_buff *skb)
 	unsigned long now;
 	int code;
 
-	switch (rt->u.dst.error) {
+	switch (rt->dst.error) {
 	case EINVAL:
 	default:
 		goto out;
@@ -1582,7 +1586,7 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	case ENETUNREACH:
 		code = ICMP_NET_UNREACH;
-		IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+		IP_INC_STATS_BH(dev_net(rt->dst.dev),
 				IPSTATS_MIB_INNOROUTES);
 		break;
 	case EACCES:
@@ -1591,12 +1595,12 @@ static int ip_error(struct sk_buff *skb)
 	}
 
 	now = jiffies;
-	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
-	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
-		rt->u.dst.rate_tokens = ip_rt_error_burst;
-	rt->u.dst.rate_last = now;
-	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
-		rt->u.dst.rate_tokens -= ip_rt_error_cost;
+	rt->dst.rate_tokens += now - rt->dst.rate_last;
+	if (rt->dst.rate_tokens > ip_rt_error_burst)
+		rt->dst.rate_tokens = ip_rt_error_burst;
+	rt->dst.rate_last = now;
+	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
+		rt->dst.rate_tokens -= ip_rt_error_cost;
 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 	}
 
@@ -1641,7 +1645,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 
 	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->u.dst.rt_next)) {
+	     rth = rcu_dereference(rth->dst.rt_next)) {
 		unsigned short mtu = new_mtu;
 
 		if (rth->fl.fl4_dst != daddr ||
@@ -1650,8 +1654,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 		    rth->rt_src != iph->saddr ||
 		    rth->fl.oif != ikeys[k] ||
 		    rth->fl.iif != 0 ||
-		    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
-		    !net_eq(dev_net(rth->u.dst.dev), net) ||
+		    dst_metric_locked(&rth->dst, RTAX_MTU) ||
+		    !net_eq(dev_net(rth->dst.dev), net) ||
 		    rt_is_expired(rth))
 			continue;
 
@@ -1659,22 +1663,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 
 			/* BSD 4.2 compatibility hack :-( */
 			if (mtu == 0 &&
-			    old_mtu >= dst_mtu(&rth->u.dst) &&
+			    old_mtu >= dst_mtu(&rth->dst) &&
 			    old_mtu >= 68 + (iph->ihl << 2))
 				old_mtu -= iph->ihl << 2;
 
 			mtu = guess_mtu(old_mtu);
 		}
-		if (mtu <= dst_mtu(&rth->u.dst)) {
-			if (mtu < dst_mtu(&rth->u.dst)) {
-				dst_confirm(&rth->u.dst);
+		if (mtu <= dst_mtu(&rth->dst)) {
+			if (mtu < dst_mtu(&rth->dst)) {
+				dst_confirm(&rth->dst);
 				if (mtu < ip_rt_min_pmtu) {
 					mtu = ip_rt_min_pmtu;
-					rth->u.dst.metrics[RTAX_LOCK-1] |=
+					rth->dst.metrics[RTAX_LOCK-1] |=
 						(1 << RTAX_MTU);
 				}
-				rth->u.dst.metrics[RTAX_MTU-1] = mtu;
-				dst_set_expires(&rth->u.dst,
+				rth->dst.metrics[RTAX_MTU-1] = mtu;
+				dst_set_expires(&rth->dst,
 						ip_rt_mtu_expires);
 			}
 			est_mtu = mtu;
@@ -1702,7 +1706,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
-	return NULL;
+	if (rt_is_expired((struct rtable *)dst))
+		return NULL;
+	return dst;
 }
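Setting dst.obsolete to -1 on every new cache entry (here and in the mkroute hunks below) is what gives the new ipv4_dst_check() body effect: a non-zero obsolete value steers cached users through the ->check() hook, which now drops entries whose generation id no longer matches. A self-contained sketch of that contract with stub types; the real dst_check() lives in include/net/dst.h and may differ in detail:

    typedef unsigned int u32;

    struct dst_entry;
    struct dst_ops {
        struct dst_entry *(*check)(struct dst_entry *, u32 cookie);
    };
    struct dst_entry {
        int             obsolete;
        struct dst_ops  *ops;
    };

    /* simplified sketch: a non-zero 'obsolete' routes every cached user
     * through ->check(), which may return NULL to force a fresh lookup */
    static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
    {
        if (dst->obsolete)
            dst = dst->ops->check(dst, cookie);
        return dst;
    }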
 
 static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1745,7 +1751,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
 
 	rt = skb_rtable(skb);
 	if (rt)
-		dst_set_expires(&rt->u.dst, 0);
+		dst_set_expires(&rt->dst, 0);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1773,11 +1779,11 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 
 	if (rt->fl.iif == 0)
 		src = rt->rt_src;
-	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
+	else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
 		src = FIB_RES_PREFSRC(res);
 		fib_res_put(&res);
 	} else
-		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+		src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
 					RT_SCOPE_UNIVERSE);
 	memcpy(addr, &src, 4);
 }
@@ -1785,10 +1791,10 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 #ifdef CONFIG_NET_CLS_ROUTE
 static void set_class_tag(struct rtable *rt, u32 tag)
 {
-	if (!(rt->u.dst.tclassid & 0xFFFF))
-		rt->u.dst.tclassid |= tag & 0xFFFF;
-	if (!(rt->u.dst.tclassid & 0xFFFF0000))
-		rt->u.dst.tclassid |= tag & 0xFFFF0000;
+	if (!(rt->dst.tclassid & 0xFFFF))
+		rt->dst.tclassid |= tag & 0xFFFF;
+	if (!(rt->dst.tclassid & 0xFFFF0000))
+		rt->dst.tclassid |= tag & 0xFFFF0000;
 }
 #endif
 
@@ -1800,30 +1806,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 		if (FIB_RES_GW(*res) &&
 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = FIB_RES_GW(*res);
-		memcpy(rt->u.dst.metrics, fi->fib_metrics,
-		       sizeof(rt->u.dst.metrics));
+		memcpy(rt->dst.metrics, fi->fib_metrics,
+		       sizeof(rt->dst.metrics));
 		if (fi->fib_mtu == 0) {
-			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
-			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
+			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
+			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
 			    rt->rt_gateway != rt->rt_dst &&
-			    rt->u.dst.dev->mtu > 576)
-				rt->u.dst.metrics[RTAX_MTU-1] = 576;
+			    rt->dst.dev->mtu > 576)
+				rt->dst.metrics[RTAX_MTU-1] = 576;
 		}
 #ifdef CONFIG_NET_CLS_ROUTE
-		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
 	} else
-		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
+		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
 
-	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
-		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
-		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
-	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
+		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
+		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
+		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
 						       ip_rt_min_advmss);
-	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
+		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
 
 #ifdef CONFIG_NET_CLS_ROUTE
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1834,14 +1840,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 	rt->rt_type = res->type;
 }
 
+/* called in rcu_read_lock() section */
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			     u8 tos, struct net_device *dev, int our)
 {
-	unsigned hash;
+	unsigned int hash;
 	struct rtable *rth;
 	__be32 spec_dst;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
+	int err;
 
 	/* Primary sanity checks. */
 
@@ -1856,20 +1864,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-	} else if (fib_validate_source(saddr, 0, tos, 0,
-					dev, &spec_dst, &itag, 0) < 0)
-		goto e_inval;
-
+	} else {
+		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag, 0);
+		if (err < 0)
+			goto e_err;
+	}
 	rth = dst_alloc(&ipv4_dst_ops);
 	if (!rth)
 		goto e_nobufs;
 
-	rth->u.dst.output= ip_rt_bug;
+	rth->dst.output = ip_rt_bug;
+	rth->dst.obsolete = -1;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -1877,13 +1888,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
-	rth->u.dst.tclassid = itag;
+	rth->dst.tclassid = itag;
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= init_net.loopback_dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= init_net.loopback_dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->fl.oif	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
@@ -1891,27 +1902,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	if (our) {
-		rth->u.dst.input= ip_local_deliver;
+		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
 	}
 
#ifdef CONFIG_IP_MROUTE
 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
-		rth->u.dst.input = ip_mr_input;
+		rth->dst.input = ip_mr_input;
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	in_dev_put(in_dev);
 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	return rt_intern_hash(hash, rth, NULL, skb);
+	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
 
 e_nobufs:
-	in_dev_put(in_dev);
 	return -ENOBUFS;
-
 e_inval:
-	in_dev_put(in_dev);
 	return -EINVAL;
+e_err:
+	return err;
 }
 
 
@@ -1945,22 +1954,22 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
+/* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
 			   struct fib_result *res,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos,
 			   struct rtable **result)
 {
-
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
-	unsigned flags = 0;
+	unsigned int flags = 0;
 	__be32 spec_dst;
 	u32 itag;
 
 	/* get a working reference to the output device */
-	out_dev = in_dev_get(FIB_RES_DEV(*res));
+	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
 	if (out_dev == NULL) {
 		if (net_ratelimit())
 			printk(KERN_CRIT "Bug in ip_route_input" \
@@ -1975,7 +1984,6 @@ static int __mkroute_input(struct sk_buff *skb,
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
 
-		err = -EINVAL;
 		goto cleanup;
 	}
 
@@ -1990,8 +1998,13 @@ static int __mkroute_input(struct sk_buff *skb,
 	if (skb->protocol != htons(ETH_P_IP)) {
 		/* Not IP (i.e. ARP). Do not create route, if it is
 		 * invalid for proxy arp. DNAT routes are always valid.
+		 *
+		 * Proxy arp feature have been extended to allow, ARP
+		 * replies back to the same interface, to support
+		 * Private VLAN switch technologies. See arp.c.
 		 */
-		if (out_dev == in_dev) {
+		if (out_dev == in_dev &&
+		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
 			err = -EINVAL;
 			goto cleanup;
 		}
@@ -2004,12 +2017,12 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
-		rth->u.dst.flags |= DST_NOXFRM;
+		rth->dst.flags |= DST_NOXFRM;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -2019,15 +2032,16 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= daddr;
 	rth->rt_iif 	=
 		rth->fl.iif	= in_dev->dev->ifindex;
-	rth->u.dst.dev	= (out_dev)->dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= (out_dev)->dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->fl.oif 	= 0;
 	rth->rt_spec_dst= spec_dst;
 
-	rth->u.dst.input = ip_forward;
-	rth->u.dst.output = ip_output;
-	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
+	rth->dst.obsolete = -1;
+	rth->dst.input = ip_forward;
+	rth->dst.output = ip_output;
+	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 
 	rt_set_nexthop(rth, res, itag);
 
@@ -2036,8 +2050,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	*result = rth;
 	err = 0;
  cleanup:
-	/* release the working reference to the output device */
-	in_dev_put(out_dev);
 	return err;
 }
 
@@ -2063,8 +2075,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
 
 	/* put it into the cache */
 	hash = rt_hash(daddr, saddr, fl->iif,
-		       rt_genid(dev_net(rth->u.dst.dev)));
-	return rt_intern_hash(hash, rth, NULL, skb);
+		       rt_genid(dev_net(rth->dst.dev)));
+	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
 }
 
 /*
@@ -2081,7 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			       u8 tos, struct net_device *dev)
 {
 	struct fib_result res;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	struct flowi fl = { .nl_u = { .ip4_u =
 				      { .daddr = daddr,
 					.saddr = saddr,
@@ -2141,13 +2153,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto brd_input;
 
 	if (res.type == RTN_LOCAL) {
-		int result;
-		result = fib_validate_source(saddr, daddr, tos,
+		err = fib_validate_source(saddr, daddr, tos,
 					     net->loopback_dev->ifindex,
 					     dev, &spec_dst, &itag, skb->mark);
-		if (result < 0)
-			goto martian_source;
-		if (result)
+		if (err < 0)
+			goto martian_source_keep_err;
+		if (err)
 			flags |= RTCF_DIRECTSRC;
 		spec_dst = daddr;
 		goto local_input;
@@ -2160,7 +2171,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
 done:
-	in_dev_put(in_dev);
 	if (free_res)
 		fib_res_put(&res);
 out:	return err;
@@ -2175,7 +2185,7 @@ brd_input:
 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
 					  &itag, skb->mark);
 		if (err < 0)
-			goto martian_source;
+			goto martian_source_keep_err;
 		if (err)
 			flags |= RTCF_DIRECTSRC;
 	}
@@ -2188,13 +2198,14 @@ local_input:
 	if (!rth)
 		goto e_nobufs;
 
-	rth->u.dst.output= ip_rt_bug;
+	rth->dst.output= ip_rt_bug;
+	rth->dst.obsolete = -1;
 	rth->rt_genid = rt_genid(net);
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -2202,25 +2213,25 @@ local_input:
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
-	rth->u.dst.tclassid = itag;
+	rth->dst.tclassid = itag;
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= net->loopback_dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= net->loopback_dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
-	rth->u.dst.input= ip_local_deliver;
+	rth->dst.input= ip_local_deliver;
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	if (res.type == RTN_UNREACHABLE) {
-		rth->u.dst.input= ip_error;
-		rth->u.dst.error= -err;
+		rth->dst.input= ip_error;
+		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
-	err = rt_intern_hash(hash, rth, NULL, skb);
+	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
 	goto done;
 
 no_route:
@@ -2255,46 +2266,54 @@ e_nobufs:
 	goto done;
 
 martian_source:
+	err = -EINVAL;
+martian_source_keep_err:
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
-	goto e_inval;
+	goto done;
 }
 
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		   u8 tos, struct net_device *dev)
+int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
 {
 	struct rtable * rth;
 	unsigned	hash;
 	int iif = dev->ifindex;
 	struct net *net;
+	int res;
 
 	net = dev_net(dev);
 
+	rcu_read_lock();
+
 	if (!rt_caching(net))
 		goto skip_cache;
 
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
-	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->u.dst.rt_next)) {
-		if (((rth->fl.fl4_dst ^ daddr) |
-		     (rth->fl.fl4_src ^ saddr) |
+	     rth = rcu_dereference(rth->dst.rt_next)) {
+		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
+		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
 		     (rth->fl.iif ^ iif) |
 		     rth->fl.oif |
 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
 		    rth->fl.mark == skb->mark &&
-		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			dst_use(&rth->u.dst, jiffies);
+			if (noref) {
+				dst_use_noref(&rth->dst, jiffies);
+				skb_dst_set_noref(skb, &rth->dst);
+			} else {
+				dst_use(&rth->dst, jiffies);
+				skb_dst_set(skb, &rth->dst);
+			}
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
 	}
-	rcu_read_unlock();
 
skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
@@ -2309,12 +2328,11 @@ skip_cache:
 	   route cache entry is created eventually.
 	 */
 	if (ipv4_is_multicast(daddr)) {
-		struct in_device *in_dev;
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
 
-		rcu_read_lock();
-		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
+		if (in_dev) {
 			int our = ip_check_mc(in_dev, daddr, saddr,
 					      ip_hdr(skb)->protocol);
 			if (our
#ifdef CONFIG_IP_MROUTE
 				||
@@ -2322,16 +2340,20 @@ skip_cache:
 				IN_DEV_MFORWARD(in_dev))
#endif
 			   ) {
+				int res = ip_route_input_mc(skb, daddr, saddr,
+							    tos, dev, our);
 				rcu_read_unlock();
-				return ip_route_input_mc(skb, daddr, saddr,
-							 tos, dev, our);
+				return res;
 			}
 		}
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	rcu_read_unlock();
+	return res;
 }
+EXPORT_SYMBOL(ip_route_input_common);
 
 static int __mkroute_output(struct rtable **result,
 			    struct fib_result *res,
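With the cache lookup folded into ip_route_input_common() and a noref flag choosing between refcounted and RCU-protected skb dst attachment, the old entry point can survive as a trivial wrapper. A sketch of what such wrappers would look like; this is an assumption modelled on include/net/route.h of this era, not shown in this diff, and the kernel types are taken as in scope:

    /* hypothetical wrappers; __be32, u8, struct sk_buff and
     * struct net_device come from the usual kernel headers */
    static inline int ip_route_input(struct sk_buff *skb, __be32 dst,
                                     __be32 src, u8 tos,
                                     struct net_device *devin)
    {
        return ip_route_input_common(skb, dst, src, tos, devin, false);
    }

    static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst,
                                           __be32 src, u8 tos,
                                           struct net_device *devin)
    {
        return ip_route_input_common(skb, dst, src, tos, devin, true);
    }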
@@ -2391,12 +2413,12 @@ static int __mkroute_output(struct rtable **result,
 		goto cleanup;
 	}
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
-		rth->u.dst.flags |= DST_NOXFRM;
+		rth->dst.flags |= DST_NOXFRM;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 
 	rth->fl.fl4_dst	= oldflp->fl4_dst;
 	rth->fl.fl4_tos	= tos;
@@ -2408,34 +2430,35 @@ static int __mkroute_output(struct rtable **result,
 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
 	/* get references to the devices that are to be hold by the routing
 	   cache entry */
-	rth->u.dst.dev	= dev_out;
+	rth->dst.dev	= dev_out;
 	dev_hold(dev_out);
 	rth->idev	= in_dev_get(dev_out);
 	rth->rt_gateway = fl->fl4_dst;
 	rth->rt_spec_dst= fl->fl4_src;
 
-	rth->u.dst.output=ip_output;
+	rth->dst.output=ip_output;
+	rth->dst.obsolete = -1;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
 	if (flags & RTCF_LOCAL) {
-		rth->u.dst.input = ip_local_deliver;
+		rth->dst.input = ip_local_deliver;
 		rth->rt_spec_dst = fl->fl4_dst;
 	}
 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
 		rth->rt_spec_dst = fl->fl4_src;
 		if (flags & RTCF_LOCAL &&
 		    !(dev_out->flags & IFF_LOOPBACK)) {
-			rth->u.dst.output = ip_mc_output;
+			rth->dst.output = ip_mc_output;
 			RT_CACHE_STAT_INC(out_slow_mc);
 		}
#ifdef CONFIG_IP_MROUTE
 		if (res->type == RTN_MULTICAST) {
 			if (IN_DEV_MFORWARD(in_dev) &&
 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
-				rth->u.dst.input = ip_mr_input;
-				rth->u.dst.output = ip_mc_output;
+				rth->dst.input = ip_mr_input;
+				rth->dst.output = ip_mc_output;
 			}
 		}
#endif
@@ -2466,7 +2489,7 @@ static int ip_mkroute_output(struct rtable **rp,
 	if (err == 0) {
 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
 			       rt_genid(dev_net(dev_out)));
-		err = rt_intern_hash(hash, rth, rp, NULL);
+		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
 	}
 
 	return err;
@@ -2689,8 +2712,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
 
 	rcu_read_lock_bh();
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->u.dst.rt_next)) {
+	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
+	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
 		    rth->fl.fl4_src == flp->fl4_src &&
 		    rth->fl.iif == 0 &&
@@ -2698,9 +2721,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			dst_use(&rth->u.dst, jiffies);
+			dst_use(&rth->dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
 			*rp = rth;
@@ -2713,9 +2736,13 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 slow_output:
 	return ip_route_output_slow(net, rp, flp);
 }
-
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	return NULL;
+}
+
 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 }
@@ -2724,7 +2751,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.family			=	AF_INET,
 	.protocol		=	cpu_to_be16(ETH_P_IP),
 	.destroy		=	ipv4_dst_destroy,
-	.check			=	ipv4_dst_check,
+	.check			=	ipv4_blackhole_dst_check,
 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
 	.entries		=	ATOMIC_INIT(0),
 };
@@ -2737,15 +2764,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 		dst_alloc(&ipv4_dst_blackhole_ops);
 
 	if (rt) {
-		struct dst_entry *new = &rt->u.dst;
+		struct dst_entry *new = &rt->dst;
 
 		atomic_set(&new->__refcnt, 1);
 		new->__use = 1;
 		new->input = dst_discard;
 		new->output = dst_discard;
-		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
 
-		new->dev = ort->u.dst.dev;
+		new->dev = ort->dst.dev;
 		if (new->dev)
 			dev_hold(new->dev);
 
@@ -2769,7 +2796,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 		dst_free(new);
 	}
 
-	dst_release(&(*rp)->u.dst);
+	dst_release(&(*rp)->dst);
 	*rp = rt;
 	return (rt ? 0 : -ENOMEM);
 }
@@ -2797,13 +2824,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
 
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
 {
 	return ip_route_output_flow(net, rp, flp, NULL, 0);
 }
+EXPORT_SYMBOL(ip_route_output_key);
 
 static int rt_fill_info(struct net *net,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2839,11 +2866,11 @@ static int rt_fill_info(struct net *net,
 		r->rtm_src_len = 32;
 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
 	}
-	if (rt->u.dst.dev)
-		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
+	if (rt->dst.dev)
+		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
-	if (rt->u.dst.tclassid)
-		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
+	if (rt->dst.tclassid)
+		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
 	if (rt->fl.iif)
 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
@@ -2853,12 +2880,16 @@ static int rt_fill_info(struct net *net,
 	if (rt->rt_dst != rt->rt_gateway)
 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
 
-	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
 		goto nla_put_failure;
 
-	error = rt->u.dst.error;
-	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
+	if (rt->fl.mark)
+		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+
+	error = rt->dst.error;
+	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
 	if (rt->peer) {
+		inet_peer_refcheck(rt->peer);
 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
 		if (rt->peer->tcp_ts_stamp) {
 			ts = rt->peer->tcp_ts;
@@ -2889,7 +2920,7 @@ static int rt_fill_info(struct net *net,
 		NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
 			       expires, error) < 0)
 		goto nla_put_failure;
 
@@ -2910,6 +2941,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	__be32 src = 0;
 	u32 iif;
 	int err;
+	int mark;
 	struct sk_buff *skb;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
@@ -2937,6 +2969,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
 	if (iif) {
 		struct net_device *dev;
@@ -2949,13 +2982,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 
 		skb->protocol	= htons(ETH_P_IP);
 		skb->dev	= dev;
+		skb->mark	= mark;
 		local_bh_disable();
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
 
 		rt = skb_rtable(skb);
-		if (err == 0 && rt->u.dst.error)
-			err = -rt->u.dst.error;
+		if (err == 0 && rt->dst.error)
+			err = -rt->dst.error;
 	} else {
 		struct flowi fl = {
 			.nl_u = {
@@ -2966,6 +3000,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 				},
 			},
 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+			.mark = mark,
 		};
 		err = ip_route_output_key(net, &rt, &fl);
 	}
@@ -2973,7 +3008,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (err)
 		goto errout_free;
 
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
@@ -3008,13 +3043,13 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (!rt_hash_table[h].chain)
 			continue;
 		rcu_read_lock_bh();
-		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
+		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
+		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
+			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb_dst_set(skb, dst_clone(&rt->u.dst));
+			skb_dst_set_noref(skb, &rt->dst);
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
@@ -3060,50 +3095,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 	return -EINVAL;
 }
 
-static void rt_secret_reschedule(int old)
-{
-	struct net *net;
-	int new = ip_rt_secret_interval;
-	int diff = new - old;
-
-	if (!diff)
-		return;
-
-	rtnl_lock();
-	for_each_net(net) {
-		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
-
-		if (!new)
-			continue;
-
-		if (deleted) {
-			long time = net->ipv4.rt_secret_timer.expires - jiffies;
-
-			if (time <= 0 || (time += diff) <= 0)
-				time = 0;
-
-			net->ipv4.rt_secret_timer.expires = time;
-		} else
-			net->ipv4.rt_secret_timer.expires = new;
-
-		net->ipv4.rt_secret_timer.expires += jiffies;
-		add_timer(&net->ipv4.rt_secret_timer);
-	}
-	rtnl_unlock();
-}
-
-static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
-					  void __user *buffer, size_t *lenp,
-					  loff_t *ppos)
-{
-	int old = ip_rt_secret_interval;
-	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
-
-	rt_secret_reschedule(old);
-
-	return ret;
-}
-
 static ctl_table ipv4_route_table[] = {
 	{
 		.procname	= "gc_thresh",
@@ -3212,13 +3203,6 @@ static ctl_table ipv4_route_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "secret_interval",
-		.data		= &ip_rt_secret_interval,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= ipv4_sysctl_rt_secret_interval,
-	},
 	{ }
 };
 
@@ -3297,39 +3281,20 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 };
#endif
 
-
-static __net_init int rt_secret_timer_init(struct net *net)
+static __net_init int rt_genid_init(struct net *net)
 {
-	atomic_set(&net->ipv4.rt_genid,
-		   (int) ((num_physpages ^ (num_physpages>>8)) ^
-		   (jiffies ^ (jiffies >> 7))));
-
-	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
-	net->ipv4.rt_secret_timer.data = (unsigned long)net;
-	init_timer_deferrable(&net->ipv4.rt_secret_timer);
-
-	if (ip_rt_secret_interval) {
-		net->ipv4.rt_secret_timer.expires =
-			jiffies + net_random() % ip_rt_secret_interval +
-			ip_rt_secret_interval;
-		add_timer(&net->ipv4.rt_secret_timer);
-	}
+	get_random_bytes(&net->ipv4.rt_genid,
+			 sizeof(net->ipv4.rt_genid));
 	return 0;
 }
 
-static __net_exit void rt_secret_timer_exit(struct net *net)
-{
-	del_timer_sync(&net->ipv4.rt_secret_timer);
-}
-
-static __net_initdata struct pernet_operations rt_secret_timer_ops = {
-	.init = rt_secret_timer_init,
-	.exit = rt_secret_timer_exit,
+static __net_initdata struct pernet_operations rt_genid_ops = {
+	.init = rt_genid_init,
 };
 
 
#ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct __read_mostly;
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
 
 static __initdata unsigned long rhash_entries;
@@ -3385,9 +3350,6 @@ int __init ip_rt_init(void)
 	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
 
-	if (register_pernet_subsys(&rt_secret_timer_ops))
-		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
-
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
@@ -3399,6 +3361,7 @@ int __init ip_rt_init(void)
#ifdef CONFIG_SYSCTL
 	register_pernet_subsys(&sysctl_route_ops);
#endif
+	register_pernet_subsys(&rt_genid_ops);
 	return rc;
 }
 
@@ -3412,7 +3375,3 @@ void __init ip_static_sysctl_init(void)
 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
 }
#endif
-
-EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_input);
-EXPORT_SYMBOL(ip_route_output_key);
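Most of the route.c churn above is one mechanical rename: struct rtable used to embed its struct dst_entry inside a union named u, and with the union gone every rt->u.dst.X becomes rt->dst.X. A simplified before/after sketch; field lists are abbreviated and struct dst_entry is stubbed out, the real one lives in include/net/dst.h:

    struct dst_entry { int stub; };     /* placeholder for illustration only */

    /* before: the dst_entry hid behind a one-member union */
    struct rtable_old {
        union {
            struct dst_entry dst;
        } u;
        /* ... flow key, rt_src, rt_dst, rt_gateway, ... */
    };

    /* after: the dst_entry is embedded directly and stays the first
     * member, so casts between struct rtable * and struct dst_entry *
     * keep working (see the (struct rtable *)dst cast in the new
     * ipv4_dst_check() above) */
    struct rtable_new {
        struct dst_entry dst;
        /* ... flow key, rt_src, rt_dst, rt_gateway, ... */
    };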
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 66fd80ef2473..650cace2180d 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -18,8 +18,8 @@
 #include <net/tcp.h>
 #include <net/route.h>
 
-/* Timestamps: lowest 9 bits store TCP options */
-#define TSBITS 9
+/* Timestamps: lowest bits store TCP options */
+#define TSBITS 6
 #define TSMASK (((__u32)1 << TSBITS) - 1)
 
 extern int sysctl_tcp_syncookies;
@@ -58,7 +58,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 
 /*
  * when syncookies are in effect and tcp timestamps are enabled we encode
- * tcp options in the lowest 9 bits of the timestamp value that will be
+ * tcp options in the lower bits of the timestamp value that will be
  * sent in the syn-ack.
  * Since subsequent timestamps use the normal tcp_time_stamp value, we
  * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
@@ -70,11 +70,10 @@ __u32 cookie_init_timestamp(struct request_sock *req)
 	u32 options = 0;
 
 	ireq = inet_rsk(req);
-	if (ireq->wscale_ok) {
-		options = ireq->snd_wscale;
-		options |= ireq->rcv_wscale << 4;
-	}
-	options |= ireq->sack_ok << 8;
+
+	options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
+	options |= ireq->sack_ok << 4;
+	options |= ireq->ecn_ok << 5;
 
 	ts = ts_now & ~TSMASK;
 	ts |= options;
@@ -138,23 +137,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
 }
 
 /*
- * This table has to be sorted and terminated with (__u16)-1.
- * XXX generate a better table.
- * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ * MSS Values are taken from the 2009 paper
+ * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
+ *  - values 1440 to 1460 accounted for 80% of observed mss values
+ *  - values outside the 536-1460 range are rare (<0.2%).
+ *
+ * Table must be sorted.
  */
 static __u16 const msstab[] = {
-	64 - 1,
-	256 - 1,
-	512 - 1,
-	536 - 1,
-	1024 - 1,
-	1440 - 1,
-	1460 - 1,
-	4312 - 1,
-	(__u16)-1
+	64,
+	512,
+	536,
+	1024,
+	1440,
+	1460,
+	4312,
+	8960,
 };
-/* The number doesn't include the -1 terminator */
-#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
 
 /*
  * Generate a syncookie. mssp points to the mss, which is returned
@@ -169,10 +168,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
 
 	tcp_synq_overflow(sk);
 
-	/* XXX sort msstab[] by probability? Binary search? */
-	for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
-		;
-	*mssp = msstab[mssind] + 1;
+	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+		if (mss >= msstab[mssind])
+			break;
+	*mssp = msstab[mssind];
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
 
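The replaced loop scanned msstab upward and stored value+1; the new one walks the table downward and picks the largest entry not above the peer's advertised MSS, so the encoded index always decodes back to a value the peer can accept. A standalone sketch with a worked example:

    #include <stdio.h>

    static const unsigned short msstab[] = {
        64, 512, 536, 1024, 1440, 1460, 4312, 8960,
    };
    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    /* largest table value <= mss; falls back to msstab[0] for tiny mss */
    static unsigned int pick_mssind(unsigned short mss)
    {
        unsigned int mssind;

        for (mssind = ARRAY_SIZE(msstab) - 1; mssind; mssind--)
            if (mss >= msstab[mssind])
                break;
        return mssind;
    }

    int main(void)
    {
        /* a peer advertising MSS 1400 is rounded down to 1024 */
        printf("%u\n", msstab[pick_mssind(1400)]);  /* prints 1024 */
        return 0;
    }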
@@ -202,7 +201,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
202 jiffies / (HZ * 60), 201 jiffies / (HZ * 60),
203 COUNTER_TRIES); 202 COUNTER_TRIES);
204 203
205 return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; 204 return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
206} 205}
207 206
208static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, 207static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
@@ -227,26 +226,38 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
227 * additional tcp options in the timestamp. 226 * additional tcp options in the timestamp.
228 * This extracts these options from the timestamp echo. 227 * This extracts these options from the timestamp echo.
229 * 228 *
230 * The lowest 4 bits are for snd_wscale 229 * The lowest 4 bits store snd_wscale.
231 * The next 4 lsb are for rcv_wscale 230 * next 2 bits indicate SACK and ECN support.
232 * The next lsb is for sack_ok 231 *
232 * return false if we decode an option that should not be.
233 */ 233 */
234void cookie_check_timestamp(struct tcp_options_received *tcp_opt) 234bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
235{ 235{
236 /* echoed timestamp, 9 lowest bits contain options */ 236 /* echoed timestamp, lowest bits contain options */
237 u32 options = tcp_opt->rcv_tsecr & TSMASK; 237 u32 options = tcp_opt->rcv_tsecr & TSMASK;
238 238
239 tcp_opt->snd_wscale = options & 0xf; 239 if (!tcp_opt->saw_tstamp) {
240 options >>= 4; 240 tcp_clear_options(tcp_opt);
241 tcp_opt->rcv_wscale = options & 0xf; 241 return true;
242 }
243
244 if (!sysctl_tcp_timestamps)
245 return false;
242 246
243 tcp_opt->sack_ok = (options >> 4) & 0x1; 247 tcp_opt->sack_ok = (options >> 4) & 0x1;
248 *ecn_ok = (options >> 5) & 1;
249 if (*ecn_ok && !sysctl_tcp_ecn)
250 return false;
251
252 if (tcp_opt->sack_ok && !sysctl_tcp_sack)
253 return false;
244 254
245 if (tcp_opt->sack_ok) 255 if ((options & 0xf) == 0xf)
246 tcp_sack_reset(tcp_opt); 256 return true; /* no window scaling */
247 257
248 if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale) 258 tcp_opt->wscale_ok = 1;
249 tcp_opt->wscale_ok = 1; 259 tcp_opt->snd_wscale = options & 0xf;
260 return sysctl_tcp_window_scaling != 0;
250} 261}
251EXPORT_SYMBOL(cookie_check_timestamp); 262EXPORT_SYMBOL(cookie_check_timestamp);
252 263
@@ -265,8 +276,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
265 int mss; 276 int mss;
266 struct rtable *rt; 277 struct rtable *rt;
267 __u8 rcv_wscale; 278 __u8 rcv_wscale;
279 bool ecn_ok;
268 280
269 if (!sysctl_tcp_syncookies || !th->ack) 281 if (!sysctl_tcp_syncookies || !th->ack || th->rst)
270 goto out; 282 goto out;
271 283
272 if (tcp_synq_no_recent_overflow(sk) || 284 if (tcp_synq_no_recent_overflow(sk) ||
@@ -281,8 +293,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
281 memset(&tcp_opt, 0, sizeof(tcp_opt)); 293 memset(&tcp_opt, 0, sizeof(tcp_opt));
282 tcp_parse_options(skb, &tcp_opt, &hash_location, 0); 294 tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
283 295
284 if (tcp_opt.saw_tstamp) 296 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
285 cookie_check_timestamp(&tcp_opt); 297 goto out;
286 298
287 ret = NULL; 299 ret = NULL;
288 req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ 300 req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
@@ -298,9 +310,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
298 ireq->rmt_port = th->source; 310 ireq->rmt_port = th->source;
299 ireq->loc_addr = ip_hdr(skb)->daddr; 311 ireq->loc_addr = ip_hdr(skb)->daddr;
300 ireq->rmt_addr = ip_hdr(skb)->saddr; 312 ireq->rmt_addr = ip_hdr(skb)->saddr;
301 ireq->ecn_ok = 0; 313 ireq->ecn_ok = ecn_ok;
302 ireq->snd_wscale = tcp_opt.snd_wscale; 314 ireq->snd_wscale = tcp_opt.snd_wscale;
303 ireq->rcv_wscale = tcp_opt.rcv_wscale;
304 ireq->sack_ok = tcp_opt.sack_ok; 315 ireq->sack_ok = tcp_opt.sack_ok;
305 ireq->wscale_ok = tcp_opt.wscale_ok; 316 ireq->wscale_ok = tcp_opt.wscale_ok;
306 ireq->tstamp_ok = tcp_opt.saw_tstamp; 317 ireq->tstamp_ok = tcp_opt.saw_tstamp;
@@ -347,21 +358,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
347 { .sport = th->dest, 358 { .sport = th->dest,
348 .dport = th->source } } }; 359 .dport = th->source } } };
349 security_req_classify_flow(req, &fl); 360 security_req_classify_flow(req, &fl);
350 if (ip_route_output_key(&init_net, &rt, &fl)) { 361 if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
351 reqsk_free(req); 362 reqsk_free(req);
352 goto out; 363 goto out;
353 } 364 }
354 } 365 }
355 366
356 /* Try to redo what tcp_v4_send_synack did. */ 367 /* Try to redo what tcp_v4_send_synack did. */
357 req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW); 368 req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
358 369
359 tcp_select_initial_window(tcp_full_space(sk), req->mss, 370 tcp_select_initial_window(tcp_full_space(sk), req->mss,
360 &req->rcv_wnd, &req->window_clamp, 371 &req->rcv_wnd, &req->window_clamp,
361 ireq->wscale_ok, &rcv_wscale); 372 ireq->wscale_ok, &rcv_wscale,
373 dst_metric(&rt->dst, RTAX_INITRWND));
362 374
363 ireq->rcv_wscale = rcv_wscale; 375 ireq->rcv_wscale = rcv_wscale;
364 376
365 ret = get_cookie_sock(sk, skb, req, &rt->u.dst); 377 ret = get_cookie_sock(sk, skb, req, &rt->dst);
366out: return ret; 378out: return ret;
367} 379}
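
The reworked cookie_check_timestamp() above recovers the peer's TCP options from the low bits of the echoed timestamp rather than trusting a SYN that was never stored: bits 0-3 carry the send window scale (0xf meaning "no window scaling"), bit 4 carries SACK permission, and bit 5 carries ECN, with each recovered option additionally gated on the corresponding sysctl. A minimal stand-alone sketch of that decode step, assuming the same six-bit layout as the hunk (the struct and function names here are illustrative, not the kernel's):

    #include <stdbool.h>
    #include <stdint.h>

    struct cookie_opts {
            uint8_t snd_wscale;     /* valid only when wscale_ok is set */
            bool    wscale_ok;
            bool    sack_ok;
            bool    ecn_ok;
    };

    /* Decode the option bits hidden in the low six bits of the
     * timestamp value echoed back by a syncookie client.
     */
    static void cookie_decode_options(uint32_t options, struct cookie_opts *opt)
    {
            opt->sack_ok = (options >> 4) & 0x1;
            opt->ecn_ok  = (options >> 5) & 0x1;

            if ((options & 0xf) == 0xf)
                    return;                 /* no window scaling */

            opt->wscale_ok  = true;
            opt->snd_wscale = options & 0xf;
    }

Callers are expected to zero the struct first, mirroring the memset() of tcp_opt in cookie_v4_check().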
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712ce3994..d96c1da4b17c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
12#include <linux/inetdevice.h> 12#include <linux/inetdevice.h>
13#include <linux/seqlock.h> 13#include <linux/seqlock.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h>
15#include <net/snmp.h> 16#include <net/snmp.h>
16#include <net/icmp.h> 17#include <net/icmp.h>
17#include <net/ip.h> 18#include <net/ip.h>
@@ -298,6 +299,13 @@ static struct ctl_table ipv4_table[] = {
298 .mode = 0644, 299 .mode = 0644,
299 .proc_handler = ipv4_local_port_range, 300 .proc_handler = ipv4_local_port_range,
300 }, 301 },
302 {
303 .procname = "ip_local_reserved_ports",
304 .data = NULL, /* initialized in sysctl_ipv4_init */
305 .maxlen = 65536,
306 .mode = 0644,
307 .proc_handler = proc_do_large_bitmap,
308 },
301#ifdef CONFIG_IP_MULTICAST 309#ifdef CONFIG_IP_MULTICAST
302 { 310 {
303 .procname = "igmp_max_memberships", 311 .procname = "igmp_max_memberships",
@@ -576,6 +584,20 @@ static struct ctl_table ipv4_table[] = {
576 .proc_handler = proc_dointvec 584 .proc_handler = proc_dointvec
577 }, 585 },
578 { 586 {
587 .procname = "tcp_thin_linear_timeouts",
588 .data = &sysctl_tcp_thin_linear_timeouts,
589 .maxlen = sizeof(int),
590 .mode = 0644,
591 .proc_handler = proc_dointvec
592 },
593 {
594 .procname = "tcp_thin_dupack",
595 .data = &sysctl_tcp_thin_dupack,
596 .maxlen = sizeof(int),
597 .mode = 0644,
598 .proc_handler = proc_dointvec
599 },
600 {
579 .procname = "udp_mem", 601 .procname = "udp_mem",
580 .data = &sysctl_udp_mem, 602 .data = &sysctl_udp_mem,
581 .maxlen = sizeof(sysctl_udp_mem), 603 .maxlen = sizeof(sysctl_udp_mem),
@@ -721,6 +743,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
721static __init int sysctl_ipv4_init(void) 743static __init int sysctl_ipv4_init(void)
722{ 744{
723 struct ctl_table_header *hdr; 745 struct ctl_table_header *hdr;
746 struct ctl_table *i;
747
748 for (i = ipv4_table; i->procname; i++) {
749 if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
750 i->data = sysctl_local_reserved_ports;
751 break;
752 }
753 }
754 if (!i->procname)
755 return -EINVAL;
724 756
725 hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table); 757 hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
726 if (hdr == NULL) 758 if (hdr == NULL)
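
The new ip_local_reserved_ports entry publishes a full 65536-bit bitmap through proc_do_large_bitmap, and sysctl_ipv4_init() patches the table's .data pointer at boot because the backing storage (sysctl_local_reserved_ports) is not available at table-definition time. The net effect is that ephemeral-port allocation can skip administrator-reserved ports with a single bit test, while explicit bind() to those ports still works; the sysctl itself accepts a comma-separated list of ports and ranges. A self-contained sketch of the bitmap side, with illustrative names (the kernel uses a DECLARE_BITMAP and test_bit() instead):

    #include <stdbool.h>
    #include <stdint.h>

    #define PORT_SPACE 65536

    /* one bit per TCP/UDP port, like sysctl_local_reserved_ports */
    static uint64_t reserved_ports[PORT_SPACE / 64];

    static void reserve_local_port(uint16_t port)
    {
            reserved_ports[port / 64] |= (uint64_t)1 << (port % 64);
    }

    static bool local_port_is_reserved(uint16_t port)
    {
            return (reserved_ports[port / 64] >> (port % 64)) & 1;
    }

For example, "echo 8080,50000-50100 > /proc/sys/net/ipv4/ip_local_reserved_ports" would keep the autobind logic away from those ports without preventing an explicit bind() to them.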
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b0a26bb25e2e..f115ea68a4ef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -265,6 +265,7 @@
265#include <linux/err.h> 265#include <linux/err.h>
266#include <linux/crypto.h> 266#include <linux/crypto.h>
267#include <linux/time.h> 267#include <linux/time.h>
268#include <linux/slab.h>
268 269
269#include <net/icmp.h> 270#include <net/icmp.h>
270#include <net/tcp.h> 271#include <net/tcp.h>
@@ -314,7 +315,6 @@ struct tcp_splice_state {
314 * is strict, actions are advisory and have some latency. 315 * is strict, actions are advisory and have some latency.
315 */ 316 */
316int tcp_memory_pressure __read_mostly; 317int tcp_memory_pressure __read_mostly;
317
318EXPORT_SYMBOL(tcp_memory_pressure); 318EXPORT_SYMBOL(tcp_memory_pressure);
319 319
320void tcp_enter_memory_pressure(struct sock *sk) 320void tcp_enter_memory_pressure(struct sock *sk)
@@ -324,7 +324,6 @@ void tcp_enter_memory_pressure(struct sock *sk)
324 tcp_memory_pressure = 1; 324 tcp_memory_pressure = 1;
325 } 325 }
326} 326}
327
328EXPORT_SYMBOL(tcp_enter_memory_pressure); 327EXPORT_SYMBOL(tcp_enter_memory_pressure);
329 328
330/* Convert seconds to retransmits based on initial and max timeout */ 329/* Convert seconds to retransmits based on initial and max timeout */
@@ -377,7 +376,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
377 struct sock *sk = sock->sk; 376 struct sock *sk = sock->sk;
378 struct tcp_sock *tp = tcp_sk(sk); 377 struct tcp_sock *tp = tcp_sk(sk);
379 378
380 sock_poll_wait(file, sk->sk_sleep, wait); 379 sock_poll_wait(file, sk_sleep(sk), wait);
381 if (sk->sk_state == TCP_LISTEN) 380 if (sk->sk_state == TCP_LISTEN)
382 return inet_csk_listen_poll(sk); 381 return inet_csk_listen_poll(sk);
383 382
@@ -387,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
387 */ 386 */
388 387
389 mask = 0; 388 mask = 0;
390 if (sk->sk_err)
391 mask = POLLERR;
392 389
393 /* 390 /*
394 * POLLHUP is certainly not done right. But poll() doesn't 391 * POLLHUP is certainly not done right. But poll() doesn't
@@ -429,7 +426,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
429 if (tp->urg_seq == tp->copied_seq && 426 if (tp->urg_seq == tp->copied_seq &&
430 !sock_flag(sk, SOCK_URGINLINE) && 427 !sock_flag(sk, SOCK_URGINLINE) &&
431 tp->urg_data) 428 tp->urg_data)
432 target--; 429 target++;
433 430
434 /* Potential race condition. If read of tp below will 431 /* Potential race condition. If read of tp below will
435 * escape above sk->sk_state, we can be illegally awaken 432 * escape above sk->sk_state, we can be illegally awaken
@@ -452,13 +449,20 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
452 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) 449 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
453 mask |= POLLOUT | POLLWRNORM; 450 mask |= POLLOUT | POLLWRNORM;
454 } 451 }
455 } 452 } else
453 mask |= POLLOUT | POLLWRNORM;
456 454
457 if (tp->urg_data & TCP_URG_VALID) 455 if (tp->urg_data & TCP_URG_VALID)
458 mask |= POLLPRI; 456 mask |= POLLPRI;
459 } 457 }
458 /* This barrier is coupled with smp_wmb() in tcp_reset() */
459 smp_rmb();
460 if (sk->sk_err)
461 mask |= POLLERR;
462
460 return mask; 463 return mask;
461} 464}
465EXPORT_SYMBOL(tcp_poll);
462 466
463int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) 467int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
464{ 468{
@@ -507,10 +511,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
507 511
508 return put_user(answ, (int __user *)arg); 512 return put_user(answ, (int __user *)arg);
509} 513}
514EXPORT_SYMBOL(tcp_ioctl);
510 515
511static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 516static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
512{ 517{
513 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 518 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
514 tp->pushed_seq = tp->write_seq; 519 tp->pushed_seq = tp->write_seq;
515} 520}
516 521
@@ -526,7 +531,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
526 531
527 skb->csum = 0; 532 skb->csum = 0;
528 tcb->seq = tcb->end_seq = tp->write_seq; 533 tcb->seq = tcb->end_seq = tp->write_seq;
529 tcb->flags = TCPCB_FLAG_ACK; 534 tcb->flags = TCPHDR_ACK;
530 tcb->sacked = 0; 535 tcb->sacked = 0;
531 skb_header_release(skb); 536 skb_header_release(skb);
532 tcp_add_write_queue_tail(sk, skb); 537 tcp_add_write_queue_tail(sk, skb);
@@ -536,8 +541,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
536 tp->nonagle &= ~TCP_NAGLE_PUSH; 541 tp->nonagle &= ~TCP_NAGLE_PUSH;
537} 542}
538 543
539static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, 544static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
540 struct sk_buff *skb)
541{ 545{
542 if (flags & MSG_OOB) 546 if (flags & MSG_OOB)
543 tp->snd_up = tp->write_seq; 547 tp->snd_up = tp->write_seq;
@@ -546,13 +550,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
546static inline void tcp_push(struct sock *sk, int flags, int mss_now, 550static inline void tcp_push(struct sock *sk, int flags, int mss_now,
547 int nonagle) 551 int nonagle)
548{ 552{
549 struct tcp_sock *tp = tcp_sk(sk);
550
551 if (tcp_send_head(sk)) { 553 if (tcp_send_head(sk)) {
552 struct sk_buff *skb = tcp_write_queue_tail(sk); 554 struct tcp_sock *tp = tcp_sk(sk);
555
553 if (!(flags & MSG_MORE) || forced_push(tp)) 556 if (!(flags & MSG_MORE) || forced_push(tp))
554 tcp_mark_push(tp, skb); 557 tcp_mark_push(tp, tcp_write_queue_tail(sk));
555 tcp_mark_urg(tp, flags, skb); 558
559 tcp_mark_urg(tp, flags);
556 __tcp_push_pending_frames(sk, mss_now, 560 __tcp_push_pending_frames(sk, mss_now,
557 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); 561 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
558 } 562 }
@@ -608,6 +612,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
608 ssize_t spliced; 612 ssize_t spliced;
609 int ret; 613 int ret;
610 614
615 sock_rps_record_flow(sk);
611 /* 616 /*
612 * We can't seek on a socket input 617 * We can't seek on a socket input
613 */ 618 */
@@ -675,6 +680,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
675 680
676 return ret; 681 return ret;
677} 682}
683EXPORT_SYMBOL(tcp_splice_read);
678 684
679struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) 685struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
680{ 686{
@@ -815,7 +821,7 @@ new_segment:
815 skb_shinfo(skb)->gso_segs = 0; 821 skb_shinfo(skb)->gso_segs = 0;
816 822
817 if (!copied) 823 if (!copied)
818 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 824 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
819 825
820 copied += copy; 826 copied += copy;
821 poffset += copy; 827 poffset += copy;
@@ -856,15 +862,15 @@ out_err:
856 return sk_stream_error(sk, flags, err); 862 return sk_stream_error(sk, flags, err);
857} 863}
858 864
859ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, 865int tcp_sendpage(struct sock *sk, struct page *page, int offset,
860 size_t size, int flags) 866 size_t size, int flags)
861{ 867{
862 ssize_t res; 868 ssize_t res;
863 struct sock *sk = sock->sk;
864 869
865 if (!(sk->sk_route_caps & NETIF_F_SG) || 870 if (!(sk->sk_route_caps & NETIF_F_SG) ||
866 !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) 871 !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
867 return sock_no_sendpage(sock, page, offset, size, flags); 872 return sock_no_sendpage(sk->sk_socket, page, offset, size,
873 flags);
868 874
869 lock_sock(sk); 875 lock_sock(sk);
870 TCP_CHECK_TIMER(sk); 876 TCP_CHECK_TIMER(sk);
@@ -873,16 +879,17 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
873 release_sock(sk); 879 release_sock(sk);
874 return res; 880 return res;
875} 881}
882EXPORT_SYMBOL(tcp_sendpage);
876 883
877#define TCP_PAGE(sk) (sk->sk_sndmsg_page) 884#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
878#define TCP_OFF(sk) (sk->sk_sndmsg_off) 885#define TCP_OFF(sk) (sk->sk_sndmsg_off)
879 886
880static inline int select_size(struct sock *sk) 887static inline int select_size(struct sock *sk, int sg)
881{ 888{
882 struct tcp_sock *tp = tcp_sk(sk); 889 struct tcp_sock *tp = tcp_sk(sk);
883 int tmp = tp->mss_cache; 890 int tmp = tp->mss_cache;
884 891
885 if (sk->sk_route_caps & NETIF_F_SG) { 892 if (sg) {
886 if (sk_can_gso(sk)) 893 if (sk_can_gso(sk))
887 tmp = 0; 894 tmp = 0;
888 else { 895 else {
@@ -897,16 +904,15 @@ static inline int select_size(struct sock *sk)
897 return tmp; 904 return tmp;
898} 905}
899 906
900int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, 907int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
901 size_t size) 908 size_t size)
902{ 909{
903 struct sock *sk = sock->sk;
904 struct iovec *iov; 910 struct iovec *iov;
905 struct tcp_sock *tp = tcp_sk(sk); 911 struct tcp_sock *tp = tcp_sk(sk);
906 struct sk_buff *skb; 912 struct sk_buff *skb;
907 int iovlen, flags; 913 int iovlen, flags;
908 int mss_now, size_goal; 914 int mss_now, size_goal;
909 int err, copied; 915 int sg, err, copied;
910 long timeo; 916 long timeo;
911 917
912 lock_sock(sk); 918 lock_sock(sk);
@@ -934,8 +940,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
934 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 940 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
935 goto out_err; 941 goto out_err;
936 942
943 sg = sk->sk_route_caps & NETIF_F_SG;
944
937 while (--iovlen >= 0) { 945 while (--iovlen >= 0) {
938 int seglen = iov->iov_len; 946 size_t seglen = iov->iov_len;
939 unsigned char __user *from = iov->iov_base; 947 unsigned char __user *from = iov->iov_base;
940 948
941 iov++; 949 iov++;
@@ -959,8 +967,9 @@ new_segment:
959 if (!sk_stream_memory_free(sk)) 967 if (!sk_stream_memory_free(sk))
960 goto wait_for_sndbuf; 968 goto wait_for_sndbuf;
961 969
962 skb = sk_stream_alloc_skb(sk, select_size(sk), 970 skb = sk_stream_alloc_skb(sk,
963 sk->sk_allocation); 971 select_size(sk, sg),
972 sk->sk_allocation);
964 if (!skb) 973 if (!skb)
965 goto wait_for_memory; 974 goto wait_for_memory;
966 975
@@ -997,9 +1006,7 @@ new_segment:
997 /* We can extend the last page 1006 /* We can extend the last page
998 * fragment. */ 1007 * fragment. */
999 merge = 1; 1008 merge = 1;
1000 } else if (i == MAX_SKB_FRAGS || 1009 } else if (i == MAX_SKB_FRAGS || !sg) {
1001 (!i &&
1002 !(sk->sk_route_caps & NETIF_F_SG))) {
1003 /* Need to add new fragment and cannot 1010 /* Need to add new fragment and cannot
1004 * do this because interface is non-SG, 1011 * do this because interface is non-SG,
1005 * or because all the page slots are 1012 * or because all the page slots are
@@ -1060,7 +1067,7 @@ new_segment:
1060 } 1067 }
1061 1068
1062 if (!copied) 1069 if (!copied)
1063 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 1070 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
1064 1071
1065 tp->write_seq += copy; 1072 tp->write_seq += copy;
1066 TCP_SKB_CB(skb)->end_seq += copy; 1073 TCP_SKB_CB(skb)->end_seq += copy;
@@ -1120,6 +1127,7 @@ out_err:
1120 release_sock(sk); 1127 release_sock(sk);
1121 return err; 1128 return err;
1122} 1129}
1130EXPORT_SYMBOL(tcp_sendmsg);
1123 1131
1124/* 1132/*
1125 * Handle reading urgent data. BSD has very simple semantics for 1133 * Handle reading urgent data. BSD has very simple semantics for
@@ -1254,6 +1262,39 @@ static void tcp_prequeue_process(struct sock *sk)
1254 tp->ucopy.memory = 0; 1262 tp->ucopy.memory = 0;
1255} 1263}
1256 1264
1265#ifdef CONFIG_NET_DMA
1266static void tcp_service_net_dma(struct sock *sk, bool wait)
1267{
1268 dma_cookie_t done, used;
1269 dma_cookie_t last_issued;
1270 struct tcp_sock *tp = tcp_sk(sk);
1271
1272 if (!tp->ucopy.dma_chan)
1273 return;
1274
1275 last_issued = tp->ucopy.dma_cookie;
1276 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1277
1278 do {
1279 if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1280 last_issued, &done,
1281 &used) == DMA_SUCCESS) {
1282 /* Safe to free early-copied skbs now */
1283 __skb_queue_purge(&sk->sk_async_wait_queue);
1284 break;
1285 } else {
1286 struct sk_buff *skb;
1287 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1288 (dma_async_is_complete(skb->dma_cookie, done,
1289 used) == DMA_SUCCESS)) {
1290 __skb_dequeue(&sk->sk_async_wait_queue);
1291 kfree_skb(skb);
1292 }
1293 }
1294 } while (wait);
1295}
1296#endif
1297
1257static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 1298static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1258{ 1299{
1259 struct sk_buff *skb; 1300 struct sk_buff *skb;
@@ -1335,6 +1376,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1335 sk_eat_skb(sk, skb, 0); 1376 sk_eat_skb(sk, skb, 0);
1336 if (!desc->count) 1377 if (!desc->count)
1337 break; 1378 break;
1379 tp->copied_seq = seq;
1338 } 1380 }
1339 tp->copied_seq = seq; 1381 tp->copied_seq = seq;
1340 1382
@@ -1345,6 +1387,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1345 tcp_cleanup_rbuf(sk, copied); 1387 tcp_cleanup_rbuf(sk, copied);
1346 return copied; 1388 return copied;
1347} 1389}
1390EXPORT_SYMBOL(tcp_read_sock);
1348 1391
1349/* 1392/*
1350 * This routine copies from a sock struct into the user buffer. 1393 * This routine copies from a sock struct into the user buffer.
@@ -1546,6 +1589,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1546 /* __ Set realtime policy in scheduler __ */ 1589 /* __ Set realtime policy in scheduler __ */
1547 } 1590 }
1548 1591
1592#ifdef CONFIG_NET_DMA
1593 if (tp->ucopy.dma_chan)
1594 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1595#endif
1549 if (copied >= target) { 1596 if (copied >= target) {
1550 /* Do not sleep, just process backlog. */ 1597 /* Do not sleep, just process backlog. */
1551 release_sock(sk); 1598 release_sock(sk);
@@ -1554,6 +1601,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1554 sk_wait_data(sk, &timeo); 1601 sk_wait_data(sk, &timeo);
1555 1602
1556#ifdef CONFIG_NET_DMA 1603#ifdef CONFIG_NET_DMA
1604 tcp_service_net_dma(sk, false); /* Don't block */
1557 tp->ucopy.wakeup = 0; 1605 tp->ucopy.wakeup = 0;
1558#endif 1606#endif
1559 1607
@@ -1633,6 +1681,9 @@ do_prequeue:
1633 copied = -EFAULT; 1681 copied = -EFAULT;
1634 break; 1682 break;
1635 } 1683 }
1684
1685 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1686
1636 if ((offset + used) == skb->len) 1687 if ((offset + used) == skb->len)
1637 copied_early = 1; 1688 copied_early = 1;
1638 1689
@@ -1702,27 +1753,9 @@ skip_copy:
1702 } 1753 }
1703 1754
1704#ifdef CONFIG_NET_DMA 1755#ifdef CONFIG_NET_DMA
1705 if (tp->ucopy.dma_chan) { 1756 tcp_service_net_dma(sk, true); /* Wait for queue to drain */
1706 dma_cookie_t done, used; 1757 tp->ucopy.dma_chan = NULL;
1707
1708 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1709
1710 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1711 tp->ucopy.dma_cookie, &done,
1712 &used) == DMA_IN_PROGRESS) {
1713 /* do partial cleanup of sk_async_wait_queue */
1714 while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1715 (dma_async_is_complete(skb->dma_cookie, done,
1716 used) == DMA_SUCCESS)) {
1717 __skb_dequeue(&sk->sk_async_wait_queue);
1718 kfree_skb(skb);
1719 }
1720 }
1721 1758
1722 /* Safe to free early-copied skbs now */
1723 __skb_queue_purge(&sk->sk_async_wait_queue);
1724 tp->ucopy.dma_chan = NULL;
1725 }
1726 if (tp->ucopy.pinned_list) { 1759 if (tp->ucopy.pinned_list) {
1727 dma_unpin_iovec_pages(tp->ucopy.pinned_list); 1760 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1728 tp->ucopy.pinned_list = NULL; 1761 tp->ucopy.pinned_list = NULL;
@@ -1749,6 +1782,7 @@ recv_urg:
1749 err = tcp_recv_urg(sk, msg, len, flags); 1782 err = tcp_recv_urg(sk, msg, len, flags);
1750 goto out; 1783 goto out;
1751} 1784}
1785EXPORT_SYMBOL(tcp_recvmsg);
1752 1786
1753void tcp_set_state(struct sock *sk, int state) 1787void tcp_set_state(struct sock *sk, int state)
1754{ 1788{
@@ -1841,6 +1875,7 @@ void tcp_shutdown(struct sock *sk, int how)
1841 tcp_send_fin(sk); 1875 tcp_send_fin(sk);
1842 } 1876 }
1843} 1877}
1878EXPORT_SYMBOL(tcp_shutdown);
1844 1879
1845void tcp_close(struct sock *sk, long timeout) 1880void tcp_close(struct sock *sk, long timeout)
1846{ 1881{
@@ -1873,6 +1908,10 @@ void tcp_close(struct sock *sk, long timeout)
1873 1908
1874 sk_mem_reclaim(sk); 1909 sk_mem_reclaim(sk);
1875 1910
1911 /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
1912 if (sk->sk_state == TCP_CLOSE)
1913 goto adjudge_to_death;
1914
1876 /* As outlined in RFC 2525, section 2.17, we send a RST here because 1915 /* As outlined in RFC 2525, section 2.17, we send a RST here because
1877 * data was lost. To witness the awful effects of the old behavior of 1916 * data was lost. To witness the awful effects of the old behavior of
1878 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk 1917 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
@@ -1976,11 +2015,8 @@ adjudge_to_death:
1976 } 2015 }
1977 } 2016 }
1978 if (sk->sk_state != TCP_CLOSE) { 2017 if (sk->sk_state != TCP_CLOSE) {
1979 int orphan_count = percpu_counter_read_positive(
1980 sk->sk_prot->orphan_count);
1981
1982 sk_mem_reclaim(sk); 2018 sk_mem_reclaim(sk);
1983 if (tcp_too_many_orphans(sk, orphan_count)) { 2019 if (tcp_too_many_orphans(sk, 0)) {
1984 if (net_ratelimit()) 2020 if (net_ratelimit())
1985 printk(KERN_INFO "TCP: too many of orphaned " 2021 printk(KERN_INFO "TCP: too many of orphaned "
1986 "sockets\n"); 2022 "sockets\n");
@@ -2000,6 +2036,7 @@ out:
2000 local_bh_enable(); 2036 local_bh_enable();
2001 sock_put(sk); 2037 sock_put(sk);
2002} 2038}
2039EXPORT_SYMBOL(tcp_close);
2003 2040
2004/* These states need RST on ABORT according to RFC793 */ 2041/* These states need RST on ABORT according to RFC793 */
2005 2042
@@ -2073,6 +2110,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2073 sk->sk_error_report(sk); 2110 sk->sk_error_report(sk);
2074 return err; 2111 return err;
2075} 2112}
2113EXPORT_SYMBOL(tcp_disconnect);
2076 2114
2077/* 2115/*
2078 * Socket option code for TCP. 2116 * Socket option code for TCP.
@@ -2150,6 +2188,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2150 GFP_KERNEL); 2188 GFP_KERNEL);
2151 if (cvp == NULL) 2189 if (cvp == NULL)
2152 return -ENOMEM; 2190 return -ENOMEM;
2191
2192 kref_init(&cvp->kref);
2153 } 2193 }
2154 lock_sock(sk); 2194 lock_sock(sk);
2155 tp->rx_opt.cookie_in_always = 2195 tp->rx_opt.cookie_in_always =
@@ -2164,12 +2204,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2164 */ 2204 */
2165 kref_put(&tp->cookie_values->kref, 2205 kref_put(&tp->cookie_values->kref,
2166 tcp_cookie_values_release); 2206 tcp_cookie_values_release);
2167 kref_init(&cvp->kref);
2168 tp->cookie_values = cvp;
2169 } else { 2207 } else {
2170 cvp = tp->cookie_values; 2208 cvp = tp->cookie_values;
2171 } 2209 }
2172 } 2210 }
2211
2173 if (cvp != NULL) { 2212 if (cvp != NULL) {
2174 cvp->cookie_desired = ctd.tcpct_cookie_desired; 2213 cvp->cookie_desired = ctd.tcpct_cookie_desired;
2175 2214
@@ -2183,6 +2222,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2183 cvp->s_data_desired = ctd.tcpct_s_data_desired; 2222 cvp->s_data_desired = ctd.tcpct_s_data_desired;
2184 cvp->s_data_constant = 0; /* false */ 2223 cvp->s_data_constant = 0; /* false */
2185 } 2224 }
2225
2226 tp->cookie_values = cvp;
2186 } 2227 }
2187 release_sock(sk); 2228 release_sock(sk);
2188 return err; 2229 return err;
@@ -2190,7 +2231,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2190 default: 2231 default:
2191 /* fallthru */ 2232 /* fallthru */
2192 break; 2233 break;
2193 }; 2234 }
2194 2235
2195 if (optlen < sizeof(int)) 2236 if (optlen < sizeof(int))
2196 return -EINVAL; 2237 return -EINVAL;
@@ -2229,6 +2270,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2229 } 2270 }
2230 break; 2271 break;
2231 2272
2273 case TCP_THIN_LINEAR_TIMEOUTS:
2274 if (val < 0 || val > 1)
2275 err = -EINVAL;
2276 else
2277 tp->thin_lto = val;
2278 break;
2279
2280 case TCP_THIN_DUPACK:
2281 if (val < 0 || val > 1)
2282 err = -EINVAL;
2283 else
2284 tp->thin_dupack = val;
2285 break;
2286
2232 case TCP_CORK: 2287 case TCP_CORK:
2233 /* When set indicates to always queue non-full frames. 2288 /* When set indicates to always queue non-full frames.
2234 * Later the user clears this option and we transmit 2289 * Later the user clears this option and we transmit
@@ -2259,7 +2314,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2259 if (sock_flag(sk, SOCK_KEEPOPEN) && 2314 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2260 !((1 << sk->sk_state) & 2315 !((1 << sk->sk_state) &
2261 (TCPF_CLOSE | TCPF_LISTEN))) { 2316 (TCPF_CLOSE | TCPF_LISTEN))) {
2262 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; 2317 u32 elapsed = keepalive_time_elapsed(tp);
2263 if (tp->keepalive_time > elapsed) 2318 if (tp->keepalive_time > elapsed)
2264 elapsed = tp->keepalive_time - elapsed; 2319 elapsed = tp->keepalive_time - elapsed;
2265 else 2320 else
@@ -2357,6 +2412,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2357 optval, optlen); 2412 optval, optlen);
2358 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2413 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2359} 2414}
2415EXPORT_SYMBOL(tcp_setsockopt);
2360 2416
2361#ifdef CONFIG_COMPAT 2417#ifdef CONFIG_COMPAT
2362int compat_tcp_setsockopt(struct sock *sk, int level, int optname, 2418int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
@@ -2367,7 +2423,6 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2367 optval, optlen); 2423 optval, optlen);
2368 return do_tcp_setsockopt(sk, level, optname, optval, optlen); 2424 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2369} 2425}
2370
2371EXPORT_SYMBOL(compat_tcp_setsockopt); 2426EXPORT_SYMBOL(compat_tcp_setsockopt);
2372#endif 2427#endif
2373 2428
@@ -2433,7 +2488,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2433 2488
2434 info->tcpi_total_retrans = tp->total_retrans; 2489 info->tcpi_total_retrans = tp->total_retrans;
2435} 2490}
2436
2437EXPORT_SYMBOL_GPL(tcp_get_info); 2491EXPORT_SYMBOL_GPL(tcp_get_info);
2438 2492
2439static int do_tcp_getsockopt(struct sock *sk, int level, 2493static int do_tcp_getsockopt(struct sock *sk, int level,
@@ -2551,6 +2605,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2551 return -EFAULT; 2605 return -EFAULT;
2552 return 0; 2606 return 0;
2553 } 2607 }
2608 case TCP_THIN_LINEAR_TIMEOUTS:
2609 val = tp->thin_lto;
2610 break;
2611 case TCP_THIN_DUPACK:
2612 val = tp->thin_dupack;
2613 break;
2554 default: 2614 default:
2555 return -ENOPROTOOPT; 2615 return -ENOPROTOOPT;
2556 } 2616 }
@@ -2572,6 +2632,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2572 optval, optlen); 2632 optval, optlen);
2573 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2633 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2574} 2634}
2635EXPORT_SYMBOL(tcp_getsockopt);
2575 2636
2576#ifdef CONFIG_COMPAT 2637#ifdef CONFIG_COMPAT
2577int compat_tcp_getsockopt(struct sock *sk, int level, int optname, 2638int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -2582,7 +2643,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2582 optval, optlen); 2643 optval, optlen);
2583 return do_tcp_getsockopt(sk, level, optname, optval, optlen); 2644 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2584} 2645}
2585
2586EXPORT_SYMBOL(compat_tcp_getsockopt); 2646EXPORT_SYMBOL(compat_tcp_getsockopt);
2587#endif 2647#endif
2588 2648
@@ -2682,7 +2742,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2682 struct tcphdr *th2; 2742 struct tcphdr *th2;
2683 unsigned int len; 2743 unsigned int len;
2684 unsigned int thlen; 2744 unsigned int thlen;
2685 unsigned int flags; 2745 __be32 flags;
2686 unsigned int mss = 1; 2746 unsigned int mss = 1;
2687 unsigned int hlen; 2747 unsigned int hlen;
2688 unsigned int off; 2748 unsigned int off;
@@ -2732,10 +2792,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2732 2792
2733found: 2793found:
2734 flush = NAPI_GRO_CB(p)->flush; 2794 flush = NAPI_GRO_CB(p)->flush;
2735 flush |= flags & TCP_FLAG_CWR; 2795 flush |= (__force int)(flags & TCP_FLAG_CWR);
2736 flush |= (flags ^ tcp_flag_word(th2)) & 2796 flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
2737 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); 2797 ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
2738 flush |= th->ack_seq ^ th2->ack_seq; 2798 flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
2739 for (i = sizeof(*th); i < thlen; i += 4) 2799 for (i = sizeof(*th); i < thlen; i += 4)
2740 flush |= *(u32 *)((u8 *)th + i) ^ 2800 flush |= *(u32 *)((u8 *)th + i) ^
2741 *(u32 *)((u8 *)th2 + i); 2801 *(u32 *)((u8 *)th2 + i);
@@ -2756,8 +2816,9 @@ found:
2756 2816
2757out_check_final: 2817out_check_final:
2758 flush = len < mss; 2818 flush = len < mss;
2759 flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | 2819 flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
2760 TCP_FLAG_SYN | TCP_FLAG_FIN); 2820 TCP_FLAG_RST | TCP_FLAG_SYN |
2821 TCP_FLAG_FIN));
2761 2822
2762 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) 2823 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
2763 pp = head; 2824 pp = head;
@@ -2788,10 +2849,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
2788 2849
2789#ifdef CONFIG_TCP_MD5SIG 2850#ifdef CONFIG_TCP_MD5SIG
2790static unsigned long tcp_md5sig_users; 2851static unsigned long tcp_md5sig_users;
2791static struct tcp_md5sig_pool **tcp_md5sig_pool; 2852static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
2792static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); 2853static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2793 2854
2794static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) 2855static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
2795{ 2856{
2796 int cpu; 2857 int cpu;
2797 for_each_possible_cpu(cpu) { 2858 for_each_possible_cpu(cpu) {
@@ -2800,7 +2861,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2800 if (p->md5_desc.tfm) 2861 if (p->md5_desc.tfm)
2801 crypto_free_hash(p->md5_desc.tfm); 2862 crypto_free_hash(p->md5_desc.tfm);
2802 kfree(p); 2863 kfree(p);
2803 p = NULL;
2804 } 2864 }
2805 } 2865 }
2806 free_percpu(pool); 2866 free_percpu(pool);
@@ -2808,7 +2868,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2808 2868
2809void tcp_free_md5sig_pool(void) 2869void tcp_free_md5sig_pool(void)
2810{ 2870{
2811 struct tcp_md5sig_pool **pool = NULL; 2871 struct tcp_md5sig_pool * __percpu *pool = NULL;
2812 2872
2813 spin_lock_bh(&tcp_md5sig_pool_lock); 2873 spin_lock_bh(&tcp_md5sig_pool_lock);
2814 if (--tcp_md5sig_users == 0) { 2874 if (--tcp_md5sig_users == 0) {
@@ -2819,13 +2879,13 @@ void tcp_free_md5sig_pool(void)
2819 if (pool) 2879 if (pool)
2820 __tcp_free_md5sig_pool(pool); 2880 __tcp_free_md5sig_pool(pool);
2821} 2881}
2822
2823EXPORT_SYMBOL(tcp_free_md5sig_pool); 2882EXPORT_SYMBOL(tcp_free_md5sig_pool);
2824 2883
2825static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk) 2884static struct tcp_md5sig_pool * __percpu *
2885__tcp_alloc_md5sig_pool(struct sock *sk)
2826{ 2886{
2827 int cpu; 2887 int cpu;
2828 struct tcp_md5sig_pool **pool; 2888 struct tcp_md5sig_pool * __percpu *pool;
2829 2889
2830 pool = alloc_percpu(struct tcp_md5sig_pool *); 2890 pool = alloc_percpu(struct tcp_md5sig_pool *);
2831 if (!pool) 2891 if (!pool)
@@ -2852,9 +2912,9 @@ out_free:
2852 return NULL; 2912 return NULL;
2853} 2913}
2854 2914
2855struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk) 2915struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
2856{ 2916{
2857 struct tcp_md5sig_pool **pool; 2917 struct tcp_md5sig_pool * __percpu *pool;
2858 int alloc = 0; 2918 int alloc = 0;
2859 2919
2860retry: 2920retry:
@@ -2873,7 +2933,9 @@ retry:
2873 2933
2874 if (alloc) { 2934 if (alloc) {
2875 /* we cannot hold spinlock here because this may sleep. */ 2935 /* we cannot hold spinlock here because this may sleep. */
2876 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk); 2936 struct tcp_md5sig_pool * __percpu *p;
2937
2938 p = __tcp_alloc_md5sig_pool(sk);
2877 spin_lock_bh(&tcp_md5sig_pool_lock); 2939 spin_lock_bh(&tcp_md5sig_pool_lock);
2878 if (!p) { 2940 if (!p) {
2879 tcp_md5sig_users--; 2941 tcp_md5sig_users--;
@@ -2892,28 +2954,42 @@ retry:
2892 } 2954 }
2893 return pool; 2955 return pool;
2894} 2956}
2895
2896EXPORT_SYMBOL(tcp_alloc_md5sig_pool); 2957EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2897 2958
2898struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) 2959
2960/**
2961 * tcp_get_md5sig_pool - get md5sig_pool for this user
2962 *
 2963 * We use a percpu structure, so if we succeed, we exit with preemption
 2964 * and BH disabled, to make sure another thread or softirq handler
 2965 * won't try to get the same context.
2966 */
2967struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
2899{ 2968{
2900 struct tcp_md5sig_pool **p; 2969 struct tcp_md5sig_pool * __percpu *p;
2901 spin_lock_bh(&tcp_md5sig_pool_lock); 2970
2971 local_bh_disable();
2972
2973 spin_lock(&tcp_md5sig_pool_lock);
2902 p = tcp_md5sig_pool; 2974 p = tcp_md5sig_pool;
2903 if (p) 2975 if (p)
2904 tcp_md5sig_users++; 2976 tcp_md5sig_users++;
2905 spin_unlock_bh(&tcp_md5sig_pool_lock); 2977 spin_unlock(&tcp_md5sig_pool_lock);
2906 return (p ? *per_cpu_ptr(p, cpu) : NULL); 2978
2907} 2979 if (p)
2980 return *this_cpu_ptr(p);
2908 2981
2909EXPORT_SYMBOL(__tcp_get_md5sig_pool); 2982 local_bh_enable();
2983 return NULL;
2984}
2985EXPORT_SYMBOL(tcp_get_md5sig_pool);
2910 2986
2911void __tcp_put_md5sig_pool(void) 2987void tcp_put_md5sig_pool(void)
2912{ 2988{
2989 local_bh_enable();
2913 tcp_free_md5sig_pool(); 2990 tcp_free_md5sig_pool();
2914} 2991}
2915 2992EXPORT_SYMBOL(tcp_put_md5sig_pool);
2916EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2917 2993
2918int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, 2994int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2919 struct tcphdr *th) 2995 struct tcphdr *th)
@@ -2929,7 +3005,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
2929 th->check = old_checksum; 3005 th->check = old_checksum;
2930 return err; 3006 return err;
2931} 3007}
2932
2933EXPORT_SYMBOL(tcp_md5_hash_header); 3008EXPORT_SYMBOL(tcp_md5_hash_header);
2934 3009
2935int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 3010int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
@@ -2942,6 +3017,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2942 const unsigned head_data_len = skb_headlen(skb) > header_len ? 3017 const unsigned head_data_len = skb_headlen(skb) > header_len ?
2943 skb_headlen(skb) - header_len : 0; 3018 skb_headlen(skb) - header_len : 0;
2944 const struct skb_shared_info *shi = skb_shinfo(skb); 3019 const struct skb_shared_info *shi = skb_shinfo(skb);
3020 struct sk_buff *frag_iter;
2945 3021
2946 sg_init_table(&sg, 1); 3022 sg_init_table(&sg, 1);
2947 3023
@@ -2956,9 +3032,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
2956 return 1; 3032 return 1;
2957 } 3033 }
2958 3034
3035 skb_walk_frags(skb, frag_iter)
3036 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3037 return 1;
3038
2959 return 0; 3039 return 0;
2960} 3040}
2961
2962EXPORT_SYMBOL(tcp_md5_hash_skb_data); 3041EXPORT_SYMBOL(tcp_md5_hash_skb_data);
2963 3042
2964int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) 3043int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
@@ -2968,7 +3047,6 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
2968 sg_init_one(&sg, key->key, key->keylen); 3047 sg_init_one(&sg, key->key, key->keylen);
2969 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); 3048 return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
2970} 3049}
2971
2972EXPORT_SYMBOL(tcp_md5_hash_key); 3050EXPORT_SYMBOL(tcp_md5_hash_key);
2973 3051
2974#endif 3052#endif
@@ -3135,7 +3213,7 @@ void __init tcp_init(void)
3135{ 3213{
3136 struct sk_buff *skb = NULL; 3214 struct sk_buff *skb = NULL;
3137 unsigned long nr_pages, limit; 3215 unsigned long nr_pages, limit;
3138 int order, i, max_share; 3216 int i, max_share, cnt;
3139 unsigned long jiffy = jiffies; 3217 unsigned long jiffy = jiffies;
3140 3218
3141 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3219 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3184,22 +3262,12 @@ void __init tcp_init(void)
3184 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3262 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
3185 } 3263 }
3186 3264
3187 /* Try to be a bit smarter and adjust defaults depending 3265
3188 * on available memory. 3266 cnt = tcp_hashinfo.ehash_mask + 1;
3189 */ 3267
3190 for (order = 0; ((1 << order) << PAGE_SHIFT) < 3268 tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
3191 (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); 3269 sysctl_tcp_max_orphans = cnt / 2;
3192 order++) 3270 sysctl_max_syn_backlog = max(128, cnt / 256);
3193 ;
3194 if (order >= 4) {
3195 tcp_death_row.sysctl_max_tw_buckets = 180000;
3196 sysctl_tcp_max_orphans = 4096 << (order - 4);
3197 sysctl_max_syn_backlog = 1024;
3198 } else if (order < 3) {
3199 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
3200 sysctl_tcp_max_orphans >>= (3 - order);
3201 sysctl_max_syn_backlog = 128;
3202 }
3203 3271
3204 /* Set the pressure threshold to be a fraction of global memory that 3272 /* Set the pressure threshold to be a fraction of global memory that
3205 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of 3273 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
@@ -3240,16 +3308,3 @@ void __init tcp_init(void)
3240 tcp_secret_retiring = &tcp_secret_two; 3308 tcp_secret_retiring = &tcp_secret_two;
3241 tcp_secret_secondary = &tcp_secret_two; 3309 tcp_secret_secondary = &tcp_secret_two;
3242} 3310}
3243
3244EXPORT_SYMBOL(tcp_close);
3245EXPORT_SYMBOL(tcp_disconnect);
3246EXPORT_SYMBOL(tcp_getsockopt);
3247EXPORT_SYMBOL(tcp_ioctl);
3248EXPORT_SYMBOL(tcp_poll);
3249EXPORT_SYMBOL(tcp_read_sock);
3250EXPORT_SYMBOL(tcp_recvmsg);
3251EXPORT_SYMBOL(tcp_sendmsg);
3252EXPORT_SYMBOL(tcp_splice_read);
3253EXPORT_SYMBOL(tcp_sendpage);
3254EXPORT_SYMBOL(tcp_setsockopt);
3255EXPORT_SYMBOL(tcp_shutdown);
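
Besides the export cleanups, the tcp.c changes above wire up the two per-socket knobs for thin-stream TCP: TCP_THIN_LINEAR_TIMEOUTS (retransmission timeouts stay linear instead of backing off exponentially while the stream is thin) and TCP_THIN_DUPACK (fast retransmit after the first duplicate ACK), each a 0/1 value with a matching getsockopt case, alongside the system-wide tcp_thin_linear_timeouts and tcp_thin_dupack sysctls added earlier in this diff. A minimal userspace usage sketch, assuming the TCP_THIN_* constants are visible from this tree's headers (<linux/tcp.h>):

    #include <stdio.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <linux/tcp.h>  /* TCP_THIN_LINEAR_TIMEOUTS, TCP_THIN_DUPACK */

    static int enable_thin_stream_handling(int fd)
    {
            int on = 1;

            if (setsockopt(fd, IPPROTO_TCP, TCP_THIN_LINEAR_TIMEOUTS,
                           &on, sizeof(on)) < 0) {
                    perror("TCP_THIN_LINEAR_TIMEOUTS");
                    return -1;
            }
            if (setsockopt(fd, IPPROTO_TCP, TCP_THIN_DUPACK,
                           &on, sizeof(on)) < 0) {
                    perror("TCP_THIN_DUPACK");
                    return -1;
            }
            return 0;
    }

Both options only change behaviour while tcp_stream_is_thin() holds, i.e. for connections with too few packets in flight for ordinary fast retransmit to trigger reliably.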
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6428b342b164..850c737e08e2 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/gfp.h>
13#include <net/tcp.h> 14#include <net/tcp.h>
14 15
15int sysctl_tcp_max_ssthresh = 0; 16int sysctl_tcp_max_ssthresh = 0;
@@ -195,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
195int tcp_set_allowed_congestion_control(char *val) 196int tcp_set_allowed_congestion_control(char *val)
196{ 197{
197 struct tcp_congestion_ops *ca; 198 struct tcp_congestion_ops *ca;
198 char *clone, *name; 199 char *saved_clone, *clone, *name;
199 int ret = 0; 200 int ret = 0;
200 201
201 clone = kstrdup(val, GFP_USER); 202 saved_clone = clone = kstrdup(val, GFP_USER);
202 if (!clone) 203 if (!clone)
203 return -ENOMEM; 204 return -ENOMEM;
204 205
@@ -225,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val)
225 } 226 }
226out: 227out:
227 spin_unlock(&tcp_cong_list_lock); 228 spin_unlock(&tcp_cong_list_lock);
229 kfree(saved_clone);
228 230
229 return ret; 231 return ret;
230} 232}
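
The tcp_cong.c fix above is the classic strsep() ownership pattern: the parsing loop advances the clone cursor token by token (presumably via strsep(), as elsewhere in this function), so by the time the function returns, clone no longer points at the kstrdup() allocation and the buffer was simply leaked; saved_clone keeps the original pointer alive for kfree(). The same pattern in plain C, as a sketch:

    #include <stdlib.h>
    #include <string.h>

    /* Walk a space-separated list while keeping the pointer that
     * must eventually go back to free(): strsep() advances its
     * first argument, so 'clone' cannot be freed afterwards.
     */
    static int walk_list(const char *val)
    {
            char *saved_clone, *clone, *name;

            saved_clone = clone = strdup(val);
            if (!clone)
                    return -1;

            while ((name = strsep(&clone, " ")) != NULL) {
                    if (*name == '\0')
                            continue;
                    /* ... look the token up ... */
            }

            free(saved_clone);      /* not 'clone': it is NULL by now */
            return 0;
    }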
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index c209e054a634..377bc9349371 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -126,8 +126,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
126 * calculate 2^fract in a <<7 value. 126 * calculate 2^fract in a <<7 value.
127 */ 127 */
128 is_slowstart = 1; 128 is_slowstart = 1;
129 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) 129 increment = ((1 << min(ca->rho, 16U)) *
130 - 128; 130 hybla_fraction(rho_fractions)) - 128;
131 } else { 131 } else {
132 /* 132 /*
133 * congestion avoidance 133 * congestion avoidance
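
The Hybla change caps the shift exponent: rho is the connection's round-trip-time ratio and is unbounded, and shifting a 32-bit value by 32 or more is undefined behaviour in C (on x86 the shift count simply wraps), so `1 << ca->rho` could yield garbage increments on very long paths. Clamping at 16 also keeps the subsequent multiplication by the <<7 fixed-point fraction comfortably inside 32 bits. The guard in isolation, as a sketch:

    #include <stdint.h>

    /* Never shift a 32-bit value by an unbounded exponent. */
    static uint32_t pow2_capped(uint32_t rho)
    {
            return 1U << (rho < 16 ? rho : 16);
    }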
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28e029632493..b55f60f6fcbe 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
62 */ 62 */
63 63
64#include <linux/mm.h> 64#include <linux/mm.h>
65#include <linux/slab.h>
65#include <linux/module.h> 66#include <linux/module.h>
66#include <linux/sysctl.h> 67#include <linux/sysctl.h>
67#include <linux/kernel.h> 68#include <linux/kernel.h>
@@ -77,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
77int sysctl_tcp_sack __read_mostly = 1; 78int sysctl_tcp_sack __read_mostly = 1;
78int sysctl_tcp_fack __read_mostly = 1; 79int sysctl_tcp_fack __read_mostly = 1;
79int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; 80int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
81EXPORT_SYMBOL(sysctl_tcp_reordering);
80int sysctl_tcp_ecn __read_mostly = 2; 82int sysctl_tcp_ecn __read_mostly = 2;
83EXPORT_SYMBOL(sysctl_tcp_ecn);
81int sysctl_tcp_dsack __read_mostly = 1; 84int sysctl_tcp_dsack __read_mostly = 1;
82int sysctl_tcp_app_win __read_mostly = 31; 85int sysctl_tcp_app_win __read_mostly = 31;
83int sysctl_tcp_adv_win_scale __read_mostly = 2; 86int sysctl_tcp_adv_win_scale __read_mostly = 2;
87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
84 88
85int sysctl_tcp_stdurg __read_mostly; 89int sysctl_tcp_stdurg __read_mostly;
86int sysctl_tcp_rfc1337 __read_mostly; 90int sysctl_tcp_rfc1337 __read_mostly;
@@ -89,6 +93,8 @@ int sysctl_tcp_frto __read_mostly = 2;
89int sysctl_tcp_frto_response __read_mostly; 93int sysctl_tcp_frto_response __read_mostly;
90int sysctl_tcp_nometrics_save __read_mostly; 94int sysctl_tcp_nometrics_save __read_mostly;
91 95
96int sysctl_tcp_thin_dupack __read_mostly;
97
92int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 98int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
93int sysctl_tcp_abc __read_mostly; 99int sysctl_tcp_abc __read_mostly;
94 100
@@ -416,6 +422,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
416 422
417 inet_csk(sk)->icsk_ack.rcv_mss = hint; 423 inet_csk(sk)->icsk_ack.rcv_mss = hint;
418} 424}
425EXPORT_SYMBOL(tcp_initialize_rcv_mss);
419 426
420/* Receiver "autotuning" code. 427/* Receiver "autotuning" code.
421 * 428 *
@@ -2447,6 +2454,16 @@ static int tcp_time_to_recover(struct sock *sk)
2447 return 1; 2454 return 1;
2448 } 2455 }
2449 2456
 2457 /* If a thin stream is detected, retransmit after the first
 2458 * received dupack. Employ only if SACK is supported, in order
 2459 * to avoid a possible corner-case series of spurious retransmissions.
 2460 * Use only if there is no unsent data.
2461 */
2462 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2463 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2464 tcp_is_sack(tp) && !tcp_send_head(sk))
2465 return 1;
2466
2450 return 0; 2467 return 0;
2451} 2468}
2452 2469
@@ -2499,6 +2516,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2499 int err; 2516 int err;
2500 unsigned int mss; 2517 unsigned int mss;
2501 2518
2519 if (packets == 0)
2520 return;
2521
2502 WARN_ON(packets > tp->packets_out); 2522 WARN_ON(packets > tp->packets_out);
2503 if (tp->lost_skb_hint) { 2523 if (tp->lost_skb_hint) {
2504 skb = tp->lost_skb_hint; 2524 skb = tp->lost_skb_hint;
@@ -2525,7 +2545,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2525 cnt += tcp_skb_pcount(skb); 2545 cnt += tcp_skb_pcount(skb);
2526 2546
2527 if (cnt > packets) { 2547 if (cnt > packets) {
2528 if (tcp_is_sack(tp) || (oldcnt >= packets)) 2548 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2549 (oldcnt >= packets))
2529 break; 2550 break;
2530 2551
2531 mss = skb_shinfo(skb)->gso_size; 2552 mss = skb_shinfo(skb)->gso_size;
@@ -2623,7 +2644,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2623 if (sk->sk_family == AF_INET) { 2644 if (sk->sk_family == AF_INET) {
2624 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", 2645 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2625 msg, 2646 msg,
2626 &inet->daddr, ntohs(inet->dport), 2647 &inet->inet_daddr, ntohs(inet->inet_dport),
2627 tp->snd_cwnd, tcp_left_out(tp), 2648 tp->snd_cwnd, tcp_left_out(tp),
2628 tp->snd_ssthresh, tp->prior_ssthresh, 2649 tp->snd_ssthresh, tp->prior_ssthresh,
2629 tp->packets_out); 2650 tp->packets_out);
@@ -2633,7 +2654,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2633 struct ipv6_pinfo *np = inet6_sk(sk); 2654 struct ipv6_pinfo *np = inet6_sk(sk);
2634 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2655 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2635 msg, 2656 msg,
2636 &np->daddr, ntohs(inet->dport), 2657 &np->daddr, ntohs(inet->inet_dport),
2637 tp->snd_cwnd, tcp_left_out(tp), 2658 tp->snd_cwnd, tcp_left_out(tp),
2638 tp->snd_ssthresh, tp->prior_ssthresh, 2659 tp->snd_ssthresh, tp->prior_ssthresh,
2639 tp->packets_out); 2660 tp->packets_out);
@@ -2922,6 +2943,7 @@ void tcp_simple_retransmit(struct sock *sk)
2922 } 2943 }
2923 tcp_xmit_retransmit_queue(sk); 2944 tcp_xmit_retransmit_queue(sk);
2924} 2945}
2946EXPORT_SYMBOL(tcp_simple_retransmit);
2925 2947
2926/* Process an event, which can update packets-in-flight not trivially. 2948/* Process an event, which can update packets-in-flight not trivially.
2927 * Main goal of this function is to calculate new estimate for left_out, 2949 * Main goal of this function is to calculate new estimate for left_out,
@@ -3270,7 +3292,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3270 * connection startup slow start one packet too 3292 * connection startup slow start one packet too
3271 * quickly. This is severely frowned upon behavior. 3293 * quickly. This is severely frowned upon behavior.
3272 */ 3294 */
3273 if (!(scb->flags & TCPCB_FLAG_SYN)) { 3295 if (!(scb->flags & TCPHDR_SYN)) {
3274 flag |= FLAG_DATA_ACKED; 3296 flag |= FLAG_DATA_ACKED;
3275 } else { 3297 } else {
3276 flag |= FLAG_SYN_ACKED; 3298 flag |= FLAG_SYN_ACKED;
@@ -3694,7 +3716,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3694 } 3716 }
3695 3717
3696 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3718 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3697 dst_confirm(sk->sk_dst_cache); 3719 dst_confirm(__sk_dst_get(sk));
3698 3720
3699 return 1; 3721 return 1;
3700 3722
@@ -3829,18 +3851,20 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3829 /* 16-bit multiple */ 3851 /* 16-bit multiple */
3830 opt_rx->cookie_plus = opsize; 3852 opt_rx->cookie_plus = opsize;
3831 *hvpp = ptr; 3853 *hvpp = ptr;
3854 break;
3832 default: 3855 default:
3833 /* ignore option */ 3856 /* ignore option */
3834 break; 3857 break;
3835 }; 3858 }
3836 break; 3859 break;
3837 }; 3860 }
3838 3861
3839 ptr += opsize-2; 3862 ptr += opsize-2;
3840 length -= opsize; 3863 length -= opsize;
3841 } 3864 }
3842 } 3865 }
3843} 3866}
3867EXPORT_SYMBOL(tcp_parse_options);
3844 3868
3845static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) 3869static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3846{ 3870{
@@ -3907,13 +3931,14 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
3907 if (opsize < 2 || opsize > length) 3931 if (opsize < 2 || opsize > length)
3908 return NULL; 3932 return NULL;
3909 if (opcode == TCPOPT_MD5SIG) 3933 if (opcode == TCPOPT_MD5SIG)
3910 return ptr; 3934 return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
3911 } 3935 }
3912 ptr += opsize - 2; 3936 ptr += opsize - 2;
3913 length -= opsize; 3937 length -= opsize;
3914 } 3938 }
3915 return NULL; 3939 return NULL;
3916} 3940}
3941EXPORT_SYMBOL(tcp_parse_md5sig_option);
3917#endif 3942#endif
3918 3943
3919static inline void tcp_store_ts_recent(struct tcp_sock *tp) 3944static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -4024,6 +4049,8 @@ static void tcp_reset(struct sock *sk)
4024 default: 4049 default:
4025 sk->sk_err = ECONNRESET; 4050 sk->sk_err = ECONNRESET;
4026 } 4051 }
4052 /* This barrier is coupled with smp_rmb() in tcp_poll() */
4053 smp_wmb();
4027 4054
4028 if (!sock_flag(sk, SOCK_DEAD)) 4055 if (!sock_flag(sk, SOCK_DEAD))
4029 sk->sk_error_report(sk); 4056 sk->sk_error_report(sk);
@@ -4303,7 +4330,7 @@ static void tcp_ofo_queue(struct sock *sk)
4303 } 4330 }
4304 4331
4305 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4332 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4306 SOCK_DEBUG(sk, "ofo packet was already received \n"); 4333 SOCK_DEBUG(sk, "ofo packet was already received\n");
4307 __skb_unlink(skb, &tp->out_of_order_queue); 4334 __skb_unlink(skb, &tp->out_of_order_queue);
4308 __kfree_skb(skb); 4335 __kfree_skb(skb);
4309 continue; 4336 continue;
@@ -4351,6 +4378,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4351 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) 4378 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
4352 goto drop; 4379 goto drop;
4353 4380
4381 skb_dst_drop(skb);
4354 __skb_pull(skb, th->doff * 4); 4382 __skb_pull(skb, th->doff * 4);
4355 4383
4356 TCP_ECN_accept_cwr(tp, skb); 4384 TCP_ECN_accept_cwr(tp, skb);
@@ -5414,6 +5442,7 @@ discard:
5414 __kfree_skb(skb); 5442 __kfree_skb(skb);
5415 return 0; 5443 return 0;
5416} 5444}
5445EXPORT_SYMBOL(tcp_rcv_established);
5417 5446
5418static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5447static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5419 struct tcphdr *th, unsigned len) 5448 struct tcphdr *th, unsigned len)
@@ -5783,11 +5812,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5783 5812
5784 /* tcp_ack considers this ACK as duplicate 5813 /* tcp_ack considers this ACK as duplicate
5785 * and does not calculate rtt. 5814 * and does not calculate rtt.
5786 * Fix it at least with timestamps. 5815 * Force it here.
5787 */ 5816 */
5788 if (tp->rx_opt.saw_tstamp && 5817 tcp_ack_update_rtt(sk, 0, 0);
5789 tp->rx_opt.rcv_tsecr && !tp->srtt)
5790 tcp_ack_saw_tstamp(sk, 0);
5791 5818
5792 if (tp->rx_opt.tstamp_ok) 5819 if (tp->rx_opt.tstamp_ok)
5793 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5820 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5819,7 +5846,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5819 if (tp->snd_una == tp->write_seq) { 5846 if (tp->snd_una == tp->write_seq) {
5820 tcp_set_state(sk, TCP_FIN_WAIT2); 5847 tcp_set_state(sk, TCP_FIN_WAIT2);
5821 sk->sk_shutdown |= SEND_SHUTDOWN; 5848 sk->sk_shutdown |= SEND_SHUTDOWN;
5822 dst_confirm(sk->sk_dst_cache); 5849 dst_confirm(__sk_dst_get(sk));
5823 5850
5824 if (!sock_flag(sk, SOCK_DEAD)) 5851 if (!sock_flag(sk, SOCK_DEAD))
5825 /* Wake up lingering close() */ 5852 /* Wake up lingering close() */
@@ -5915,14 +5942,4 @@ discard:
5915 } 5942 }
5916 return 0; 5943 return 0;
5917} 5944}
5918
5919EXPORT_SYMBOL(sysctl_tcp_ecn);
5920EXPORT_SYMBOL(sysctl_tcp_reordering);
5921EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
5922EXPORT_SYMBOL(tcp_parse_options);
5923#ifdef CONFIG_TCP_MD5SIG
5924EXPORT_SYMBOL(tcp_parse_md5sig_option);
5925#endif
5926EXPORT_SYMBOL(tcp_rcv_established);
5927EXPORT_SYMBOL(tcp_rcv_state_process); 5945EXPORT_SYMBOL(tcp_rcv_state_process);
5928EXPORT_SYMBOL(tcp_initialize_rcv_mss);
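
The smp_wmb() added to tcp_reset() pairs with the smp_rmb() added to tcp_poll() in the tcp.c hunks: the writer makes sk->sk_err visible no later than the wakeup that follows it, and the poller samples sk_err only after its other socket-state checks, so a reset racing with poll() cannot produce a mask that reflects the wakeup yet misses POLLERR. The ordering contract in a self-contained C11 sketch (plain fences stand in for the kernel barriers; names and values are illustrative):

    #include <stdatomic.h>

    struct fake_sock {
            int state;      /* stands in for sk->sk_state etc. */
            int err;        /* stands in for sk->sk_err */
    };

    /* writer, like tcp_reset(): publish err before waking pollers */
    static void reset_sock(struct fake_sock *sk)
    {
            sk->state = 0;                          /* TCP_CLOSE */
            sk->err = 104;                          /* ECONNRESET */
            atomic_thread_fence(memory_order_release);  /* smp_wmb() */
            /* ... sk->sk_error_report(sk) / wake up pollers ... */
    }

    /* reader, like tcp_poll(): check state first, then err */
    static int poll_sock(const struct fake_sock *sk)
    {
            int mask = sk->state == 0 ? 0x010 /* POLLHUP */ : 0;

            atomic_thread_fence(memory_order_acquire);  /* smp_rmb() */
            if (sk->err)
                    mask |= 0x008;                  /* POLLERR */
            return mask;
    }

Under the C11 memory model the plain accesses here would still need to be atomic to be fully race-free; the sketch only illustrates the barrier placement the two hunks rely on.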
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 65b8ebfd078a..020766292bb0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -60,6 +60,7 @@
60#include <linux/jhash.h> 60#include <linux/jhash.h>
61#include <linux/init.h> 61#include <linux/init.h>
62#include <linux/times.h> 62#include <linux/times.h>
63#include <linux/slab.h>
63 64
64#include <net/net_namespace.h> 65#include <net/net_namespace.h>
65#include <net/icmp.h> 66#include <net/icmp.h>
@@ -83,6 +84,7 @@
83 84
84int sysctl_tcp_tw_reuse __read_mostly; 85int sysctl_tcp_tw_reuse __read_mostly;
85int sysctl_tcp_low_latency __read_mostly; 86int sysctl_tcp_low_latency __read_mostly;
87EXPORT_SYMBOL(sysctl_tcp_low_latency);
86 88
87 89
88#ifdef CONFIG_TCP_MD5SIG 90#ifdef CONFIG_TCP_MD5SIG
@@ -99,6 +101,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
99#endif 101#endif
100 102
101struct inet_hashinfo tcp_hashinfo; 103struct inet_hashinfo tcp_hashinfo;
104EXPORT_SYMBOL(tcp_hashinfo);
102 105
103static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104{ 107{
@@ -138,7 +141,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
138 141
139 return 0; 142 return 0;
140} 143}
141
142EXPORT_SYMBOL_GPL(tcp_twsk_unique); 144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143 145
144/* This will initiate an outgoing connection. */ 146/* This will initiate an outgoing connection. */
@@ -203,10 +205,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
203 * TIME-WAIT * and initialize rx_opt.ts_recent from it, 205 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
204 * when trying new connection. 206 * when trying new connection.
205 */ 207 */
206 if (peer != NULL && 208 if (peer) {
207 (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { 209 inet_peer_refcheck(peer);
208 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; 210 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
209 tp->rx_opt.ts_recent = peer->tcp_ts; 211 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212 tp->rx_opt.ts_recent = peer->tcp_ts;
213 }
210 } 214 }
211 } 215 }
212 216
@@ -236,7 +240,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
236 240
237 /* OK, now commit destination to socket. */ 241 /* OK, now commit destination to socket. */
238 sk->sk_gso_type = SKB_GSO_TCPV4; 242 sk->sk_gso_type = SKB_GSO_TCPV4;
239 sk_setup_caps(sk, &rt->u.dst); 243 sk_setup_caps(sk, &rt->dst);
240 244
241 if (!tp->write_seq) 245 if (!tp->write_seq)
242 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 246 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -264,6 +268,7 @@ failure:
264 inet->inet_dport = 0; 268 inet->inet_dport = 0;
265 return err; 269 return err;
266} 270}
271EXPORT_SYMBOL(tcp_v4_connect);
267 272
268/* 273/*
269 * This routine does path mtu discovery as defined in RFC1191. 274 * This routine does path mtu discovery as defined in RFC1191.
@@ -370,6 +375,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
370 if (sk->sk_state == TCP_CLOSE) 375 if (sk->sk_state == TCP_CLOSE)
371 goto out; 376 goto out;
372 377
378 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
379 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
380 goto out;
381 }
382
373 icsk = inet_csk(sk); 383 icsk = inet_csk(sk);
374 tp = tcp_sk(sk); 384 tp = tcp_sk(sk);
375 seq = ntohl(th->seq); 385 seq = ntohl(th->seq);
@@ -513,26 +523,32 @@ out:
513 sock_put(sk); 523 sock_put(sk);
514} 524}
515 525
516/* This routine computes an IPv4 TCP checksum. */ 526static void __tcp_v4_send_check(struct sk_buff *skb,
517void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) 527 __be32 saddr, __be32 daddr)
518{ 528{
519 struct inet_sock *inet = inet_sk(sk);
520 struct tcphdr *th = tcp_hdr(skb); 529 struct tcphdr *th = tcp_hdr(skb);
521 530
522 if (skb->ip_summed == CHECKSUM_PARTIAL) { 531 if (skb->ip_summed == CHECKSUM_PARTIAL) {
523 th->check = ~tcp_v4_check(len, inet->inet_saddr, 532 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
524 inet->inet_daddr, 0);
525 skb->csum_start = skb_transport_header(skb) - skb->head; 533 skb->csum_start = skb_transport_header(skb) - skb->head;
526 skb->csum_offset = offsetof(struct tcphdr, check); 534 skb->csum_offset = offsetof(struct tcphdr, check);
527 } else { 535 } else {
528 th->check = tcp_v4_check(len, inet->inet_saddr, 536 th->check = tcp_v4_check(skb->len, saddr, daddr,
529 inet->inet_daddr,
530 csum_partial(th, 537 csum_partial(th,
531 th->doff << 2, 538 th->doff << 2,
532 skb->csum)); 539 skb->csum));
533 } 540 }
534} 541}
535 542
543/* This routine computes an IPv4 TCP checksum. */
544void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
545{
546 struct inet_sock *inet = inet_sk(sk);
547
548 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
549}
550EXPORT_SYMBOL(tcp_v4_send_check);
551
536int tcp_v4_gso_send_check(struct sk_buff *skb) 552int tcp_v4_gso_send_check(struct sk_buff *skb)
537{ 553{
538 const struct iphdr *iph; 554 const struct iphdr *iph;
@@ -545,10 +561,8 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
545 th = tcp_hdr(skb); 561 th = tcp_hdr(skb);
546 562
547 th->check = 0; 563 th->check = 0;
548 th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
549 skb->csum_start = skb_transport_header(skb) - skb->head;
550 skb->csum_offset = offsetof(struct tcphdr, check);
551 skb->ip_summed = CHECKSUM_PARTIAL; 564 skb->ip_summed = CHECKSUM_PARTIAL;
565 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
552 return 0; 566 return 0;
553} 567}
554 568
@@ -742,9 +756,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  * This still operates on a request_sock only, not on a big
  * socket.
  */
-static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
-				struct request_sock *req,
-				struct request_values *rvp)
+static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+			      struct request_sock *req,
+			      struct request_values *rvp)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	int err = -1;
@@ -757,13 +771,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	skb = tcp_make_synack(sk, dst, req, rvp);
 
 	if (skb) {
-		struct tcphdr *th = tcp_hdr(skb);
-
-		th->check = tcp_v4_check(skb->len,
-					 ireq->loc_addr,
-					 ireq->rmt_addr,
-					 csum_partial(th, skb->len,
-						      skb->csum));
+		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
@@ -775,10 +783,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	return err;
 }
 
-static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
-			      struct request_values *rvp)
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
+			     struct request_values *rvp)
 {
-	return __tcp_v4_send_synack(sk, NULL, req, rvp);
+	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+	return tcp_v4_send_synack(sk, NULL, req, rvp);
 }
 
 /*
@@ -789,19 +798,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-#ifdef CONFIG_SYN_COOKIES
-static void syn_flood_warning(struct sk_buff *skb)
+static void syn_flood_warning(const struct sk_buff *skb)
 {
-	static unsigned long warntime;
+	const char *msg;
 
-	if (time_after(jiffies, (warntime + HZ * 60))) {
-		warntime = jiffies;
-		printk(KERN_INFO
-		       "possible SYN flooding on port %d. Sending cookies.\n",
-		       ntohs(tcp_hdr(skb)->dest));
-	}
-}
+#ifdef CONFIG_SYN_COOKIES
+	if (sysctl_tcp_syncookies)
+		msg = "Sending cookies";
+	else
 #endif
+		msg = "Dropping request";
+
+	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
+		ntohs(tcp_hdr(skb)->dest), msg);
+}
 
 /*
  * Save and compile IPv4 options into the request_sock if needed.
@@ -853,7 +863,6 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 {
 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 
 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
@@ -887,7 +896,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 			kfree(newkey);
 			return -ENOMEM;
 		}
-		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 	}
 	if (tcp_alloc_md5sig_pool(sk) == NULL) {
 		kfree(newkey);
@@ -920,7 +929,6 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 	}
 	return 0;
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 
 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
@@ -958,7 +966,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 	}
 	return -ENOENT;
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 
 static void tcp_v4_clear_md5_list(struct sock *sk)
@@ -1017,7 +1024,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 		return -EINVAL;
 
 	tp->md5sig_info = p;
-	sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+	sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 	}
 
 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
@@ -1131,7 +1138,6 @@ clear_hash_noput:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
 
 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
@@ -1192,10 +1198,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
-	.rtx_syn_ack	=	tcp_v4_send_synack,
+	.rtx_syn_ack	=	tcp_v4_rtx_synack,
 	.send_ack	=	tcp_v4_reqsk_send_ack,
 	.destructor	=	tcp_v4_reqsk_destructor,
 	.send_reset	=	tcp_v4_send_reset,
+	.syn_ack_timeout =	tcp_syn_ack_timeout,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1238,6 +1245,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * evidently real one.
 	 */
 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+		if (net_ratelimit())
+			syn_flood_warning(skb);
 #ifdef CONFIG_SYN_COOKIES
 		if (sysctl_tcp_syncookies) {
 			want_cookie = 1;
@@ -1281,8 +1290,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop_and_release;
 
 		/* Secret recipe starts with IP addresses */
-		*mess++ ^= daddr;
-		*mess++ ^= saddr;
+		*mess++ ^= (__force u32)daddr;
+		*mess++ ^= (__force u32)saddr;
 
 		/* plus variable length Initiator Cookie */
 		c = (u8 *)mess;
@@ -1318,15 +1327,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
-	if (!want_cookie)
+	if (!want_cookie || tmp_opt.tstamp_ok)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
 
 	if (want_cookie) {
-#ifdef CONFIG_SYN_COOKIES
-		syn_flood_warning(skb);
-		req->cookie_ts = tmp_opt.tstamp_ok;
-#endif
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
 		struct inet_peer *peer = NULL;
 
@@ -1344,6 +1350,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 		    peer->v4daddr == saddr) {
+			inet_peer_refcheck(peer);
 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
 			    (s32)(peer->tcp_ts - req->ts_recent) >
 							TCP_PAWS_WINDOW) {
@@ -1373,8 +1380,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	}
 	tcp_rsk(req)->snt_isn = isn;
 
-	if (__tcp_v4_send_synack(sk, dst, req,
-				 (struct request_values *)&tmp_ext) ||
+	if (tcp_v4_send_synack(sk, dst, req,
+			       (struct request_values *)&tmp_ext) ||
 	    want_cookie)
 		goto drop_and_free;
 
@@ -1388,6 +1395,7 @@ drop_and_free:
 drop:
 	return 0;
 }
+EXPORT_SYMBOL(tcp_v4_conn_request);
 
 
 /*
@@ -1457,7 +1465,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		if (newkey != NULL)
 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
 					  newkey, key->keylen);
-		newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
 	}
 #endif
 
@@ -1473,6 +1481,7 @@ exit:
 	dst_release(dst);
 	return NULL;
 }
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
 
 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
@@ -1499,7 +1508,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	}
 
 #ifdef CONFIG_SYN_COOKIES
-	if (!th->rst && !th->syn && th->ack)
+	if (!th->syn)
 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
 #endif
 	return sk;
@@ -1550,6 +1559,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 #endif
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		sock_rps_save_rxhash(sk, skb->rxhash);
 		TCP_CHECK_TIMER(sk);
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
@@ -1574,7 +1584,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		}
 		return 0;
 	}
-	}
+	} else
+		sock_rps_save_rxhash(sk, skb->rxhash);
+
 
 	TCP_CHECK_TIMER(sk);
 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
@@ -1599,6 +1611,7 @@ csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 	goto discard;
 }
+EXPORT_SYMBOL(tcp_v4_do_rcv);
 
 /*
  *	From tcp_input.c
@@ -1653,6 +1666,11 @@ process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;
 
+	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+		goto discard_and_relse;
+	}
+
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
 	nf_reset(skb);
@@ -1677,8 +1695,11 @@ process:
 		if (!tcp_prequeue(sk, skb))
 			ret = tcp_v4_do_rcv(sk, skb);
 		}
-	} else
-		sk_add_backlog(sk, skb);
+	} else if (unlikely(sk_add_backlog(sk, skb))) {
+		bh_unlock_sock(sk);
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+		goto discard_and_relse;
+	}
 	bh_unlock_sock(sk);
 
 	sock_put(sk);
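
The hunk above depends on sk_add_backlog() having become fallible: segments arriving while the socket is owned by a user-context thread are queued only up to a limit, and overflow is now dropped and counted as LINUX_MIB_TCPBACKLOGDROP rather than growing the queue without bound. A toy sketch of the bounded-queue idea; illustrative only, not the kernel's exact accounting:

	/* Track queued bytes against a limit derived from the receive buffer. */
	struct backlog {
		unsigned int len;	/* bytes currently queued */
		unsigned int limit;	/* e.g. taken from sk_rcvbuf */
	};

	static int backlog_add(struct backlog *bl, unsigned int truesize)
	{
		if (bl->len + truesize > bl->limit)
			return -1;	/* caller drops and bumps the counter */
		bl->len += truesize;
		return 0;
	}
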
@@ -1777,6 +1798,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
 
 	return 0;
 }
+EXPORT_SYMBOL(tcp_v4_remember_stamp);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
@@ -1816,6 +1838,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.compat_getsockopt = compat_ip_getsockopt,
 #endif
 };
+EXPORT_SYMBOL(ipv4_specific);
 
 #ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
@@ -1944,7 +1967,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	percpu_counter_dec(&tcp_sockets_allocated);
 }
-
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
 #ifdef CONFIG_PROC_FS
@@ -1962,6 +1984,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
 }
 
+/*
+ * Get next listener socket follow cur.  If cur is NULL, get first socket
+ * starting from bucket given in st->bucket; when st->bucket is zero the
+ * very first socket in the hash table is returned.
+ */
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
 	struct inet_connection_sock *icsk;
@@ -1972,14 +1999,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	struct net *net = seq_file_net(seq);
 
 	if (!sk) {
-		st->bucket = 0;
-		ilb = &tcp_hashinfo.listening_hash[0];
+		ilb = &tcp_hashinfo.listening_hash[st->bucket];
 		spin_lock_bh(&ilb->lock);
 		sk = sk_nulls_head(&ilb->head);
+		st->offset = 0;
 		goto get_sk;
 	}
 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
 	++st->num;
+	++st->offset;
 
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
 		struct request_sock *req = cur;
@@ -1994,6 +2022,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 				}
 				req = req->dl_next;
 			}
+			st->offset = 0;
 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
 				break;
 get_req:
@@ -2029,6 +2058,7 @@ start_req:
 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	}
 	spin_unlock_bh(&ilb->lock);
+	st->offset = 0;
 	if (++st->bucket < INET_LHTABLE_SIZE) {
 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
 		spin_lock_bh(&ilb->lock);
@@ -2042,7 +2072,12 @@ out:
 
 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 {
-	void *rc = listening_get_next(seq, NULL);
+	struct tcp_iter_state *st = seq->private;
+	void *rc;
+
+	st->bucket = 0;
+	st->offset = 0;
+	rc = listening_get_next(seq, NULL);
 
 	while (rc && *pos) {
 		rc = listening_get_next(seq, rc);
@@ -2057,13 +2092,18 @@ static inline int empty_bucket(struct tcp_iter_state *st)
 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
+/*
+ * Get first established socket starting from bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ */
 static void *established_get_first(struct seq_file *seq)
 {
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
 
-	for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+	st->offset = 0;
+	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
 		struct inet_timewait_sock *tw;
@@ -2108,6 +2148,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	struct net *net = seq_file_net(seq);
 
 	++st->num;
+	++st->offset;
 
 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
 		tw = cur;
@@ -2124,6 +2165,7 @@ get_tw:
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
 		/* Look for next non empty bucket */
+		st->offset = 0;
 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
 				empty_bucket(st))
 			;
@@ -2151,7 +2193,11 @@ out:
 
 static void *established_get_idx(struct seq_file *seq, loff_t pos)
 {
-	void *rc = established_get_first(seq);
+	struct tcp_iter_state *st = seq->private;
+	void *rc;
+
+	st->bucket = 0;
+	rc = established_get_first(seq);
 
 	while (rc && pos) {
 		rc = established_get_next(seq, rc);
@@ -2176,24 +2222,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 	return rc;
 }
 
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+	struct tcp_iter_state *st = seq->private;
+	int offset = st->offset;
+	int orig_num = st->num;
+	void *rc = NULL;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+	case TCP_SEQ_STATE_LISTENING:
+		if (st->bucket >= INET_LHTABLE_SIZE)
+			break;
+		st->state = TCP_SEQ_STATE_LISTENING;
+		rc = listening_get_next(seq, NULL);
+		while (offset-- && rc)
+			rc = listening_get_next(seq, rc);
+		if (rc)
+			break;
+		st->bucket = 0;
+		/* Fallthrough */
+	case TCP_SEQ_STATE_ESTABLISHED:
+	case TCP_SEQ_STATE_TIME_WAIT:
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		if (st->bucket > tcp_hashinfo.ehash_mask)
+			break;
+		rc = established_get_first(seq);
+		while (offset-- && rc)
+			rc = established_get_next(seq, rc);
+	}
+
+	st->num = orig_num;
+
+	return rc;
+}
+
 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
+	void *rc;
+
+	if (*pos && *pos == st->last_pos) {
+		rc = tcp_seek_last_pos(seq);
+		if (rc)
+			goto out;
+	}
+
 	st->state = TCP_SEQ_STATE_LISTENING;
 	st->num = 0;
-	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+	st->bucket = 0;
+	st->offset = 0;
+	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+	st->last_pos = *pos;
+	return rc;
 }
 
 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct tcp_iter_state *st = seq->private;
 	void *rc = NULL;
-	struct tcp_iter_state *st;
 
 	if (v == SEQ_START_TOKEN) {
 		rc = tcp_get_idx(seq, 0);
 		goto out;
 	}
-	st = seq->private;
 
 	switch (st->state) {
 	case TCP_SEQ_STATE_OPENREQ:
@@ -2201,6 +2295,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		rc = listening_get_next(seq, v);
 		if (!rc) {
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
+			st->bucket = 0;
+			st->offset = 0;
 			rc	  = established_get_first(seq);
 		}
 		break;
@@ -2211,6 +2307,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	}
 out:
 	++*pos;
+	st->last_pos = *pos;
 	return rc;
 }
 
@@ -2249,6 +2346,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file)
 
 	s = ((struct seq_file *)file->private_data)->private;
 	s->family		= afinfo->family;
+	s->last_pos 		= 0;
 	return 0;
 }
 
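
The bucket/offset/last_pos bookkeeping threaded through the /proc iterator above exists for one reason: a reader that consumes /proc/net/tcp in small chunks re-enters tcp_seq_start() once per read(), and previously each re-entry rescanned the listening and established hash tables from the beginning, which is quadratic on hosts with many sockets. With last_pos recorded, a sequential read resumes where it left off via tcp_seek_last_pos(). The access pattern that benefits, as a small userspace sketch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Each short read() re-enters the kernel-side iteration; with
	 * last_pos it resumes instead of rescanning every bucket. */
	int main(void)
	{
		char buf[1024];
		ssize_t n;
		int fd = open("/proc/net/tcp", O_RDONLY);

		if (fd < 0)
			return 1;
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, (size_t)n, stdout);
		close(fd);
		return n < 0;
	}
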
@@ -2272,11 +2370,13 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
 	rc = -ENOMEM;
 	return rc;
 }
+EXPORT_SYMBOL(tcp_proc_register);
 
 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 {
 	proc_net_remove(net, afinfo->name);
 }
+EXPORT_SYMBOL(tcp_proc_unregister);
 
 static void get_openreq4(struct sock *sk, struct request_sock *req,
 			 struct seq_file *f, int i, int uid, int *len)
@@ -2425,12 +2525,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = {
 	},
 };
 
-static int tcp4_proc_init_net(struct net *net)
+static int __net_init tcp4_proc_init_net(struct net *net)
 {
 	return tcp_proc_register(net, &tcp4_seq_afinfo);
 }
 
-static void tcp4_proc_exit_net(struct net *net)
+static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
 }
@@ -2500,6 +2600,8 @@ struct proto tcp_prot = {
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
 	.recvmsg		= tcp_recvmsg,
+	.sendmsg		= tcp_sendmsg,
+	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
@@ -2518,11 +2620,13 @@ struct proto tcp_prot = {
 	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,
+	.no_autobind		= true,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
 };
+EXPORT_SYMBOL(tcp_prot);
 
 
 static int __net_init tcp_sk_init(struct net *net)
@@ -2553,20 +2657,3 @@ void __init tcp_v4_init(void)
 	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
 }
-
-EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_v4_conn_request);
-EXPORT_SYMBOL(tcp_v4_connect);
-EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_remember_stamp);
-EXPORT_SYMBOL(tcp_v4_send_check);
-EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(tcp_proc_register);
-EXPORT_SYMBOL(tcp_proc_unregister);
-#endif
-EXPORT_SYMBOL(sysctl_tcp_low_latency);
-
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f206ee5dda80..f25b56cb85cb 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,6 +20,7 @@
 
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/workqueue.h>
 #include <net/tcp.h>
@@ -46,7 +47,6 @@ struct inet_timewait_death_row tcp_death_row = {
 	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
 					    (unsigned long)&tcp_death_row),
 };
-
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
@@ -261,6 +261,7 @@ kill:
 	inet_twsk_put(tw);
 	return TCP_TW_SUCCESS;
 }
+EXPORT_SYMBOL(tcp_timewait_state_process);
 
 /*
  * Move a socket to time-wait or dead fin-wait-2 state.
@@ -361,7 +362,6 @@ void tcp_twsk_destructor(struct sock *sk)
 		tcp_free_md5sig_pool();
 #endif
 }
-
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
 
 static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
@@ -509,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 	}
 	return newsk;
 }
+EXPORT_SYMBOL(tcp_create_openreq_child);
 
 /*
  * Process an incoming packet for SYN_RECV sockets represented
@@ -671,6 +672,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
 	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
 		inet_rsk(req)->acked = 1;
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
 		return NULL;
 	}
 
@@ -704,6 +706,7 @@ embryonic_reset:
 	inet_csk_reqsk_queue_drop(sk, req, prev);
 	return NULL;
 }
+EXPORT_SYMBOL(tcp_check_req);
 
 /*
  * Queue segment on the new socket if the new socket is active,
@@ -728,15 +731,11 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 		 * in main socket hash table and lock on listening
 		 * socket does not protect us more.
 		 */
-		sk_add_backlog(child, skb);
+		__sk_add_backlog(child, skb);
 	}
 
 	bh_unlock_sock(child);
 	sock_put(child);
 	return ret;
 }
-
-EXPORT_SYMBOL(tcp_check_req);
 EXPORT_SYMBOL(tcp_child_process);
-EXPORT_SYMBOL(tcp_create_openreq_child);
-EXPORT_SYMBOL(tcp_timewait_state_process);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 383ce237640f..de3bd8458588 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,7 @@
 #include <net/tcp.h>
 
 #include <linux/compiler.h>
+#include <linux/gfp.h>
 #include <linux/module.h>
 
 /* People can turn this off for buggy TCP's found in printers etc. */
@@ -183,7 +184,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
  */
 void tcp_select_initial_window(int __space, __u32 mss,
 			       __u32 *rcv_wnd, __u32 *window_clamp,
-			       int wscale_ok, __u8 *rcv_wscale)
+			       int wscale_ok, __u8 *rcv_wscale,
+			       __u32 init_rcv_wnd)
 {
 	unsigned int space = (__space < 0 ? 0 : __space);
 
@@ -232,13 +234,20 @@ void tcp_select_initial_window(int __space, __u32 mss,
 			init_cwnd = 2;
 		else if (mss > 1460)
 			init_cwnd = 3;
-		if (*rcv_wnd > init_cwnd * mss)
+		/* when initializing use the value from init_rcv_wnd
+		 * rather than the default from above
+		 */
+		if (init_rcv_wnd &&
+		    (*rcv_wnd > init_rcv_wnd * mss))
+			*rcv_wnd = init_rcv_wnd * mss;
+		else if (*rcv_wnd > init_cwnd * mss)
 			*rcv_wnd = init_cwnd * mss;
 	}
 
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
 }
+EXPORT_SYMBOL(tcp_select_initial_window);
 
 /* Chose a new window to advertise, update state in tcp_sock for the
  * socket, and return result with RFC1323 scaling applied.  The return
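
To make the new clamp concrete, a standalone sketch of the logic above (editorial, not kernel code, and assuming the default initial cwnd of 4 used by this function for small MSS values): with an mss of 1460 the advertised window is capped at 4 * 1460 = 5840 bytes, while a route carrying an initrwnd metric of, say, 10 (the value read via RTAX_INITRWND) raises that cap to 14600 bytes.

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t clamp_initial_rwnd(uint32_t rcv_wnd, uint32_t mss,
					   uint32_t init_rcv_wnd)
	{
		uint32_t init_cwnd = 4;		/* default for mss <= 1460 */

		if (mss > 1460 * 3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		if (init_rcv_wnd && rcv_wnd > init_rcv_wnd * mss)
			rcv_wnd = init_rcv_wnd * mss;	/* route metric wins */
		else if (rcv_wnd > init_cwnd * mss)
			rcv_wnd = init_cwnd * mss;
		return rcv_wnd;
	}

	int main(void)
	{
		printf("%u\n", clamp_initial_rwnd(65535, 1460, 0));	/* 5840 */
		printf("%u\n", clamp_initial_rwnd(65535, 1460, 10));	/* 14600 */
		return 0;
	}
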
@@ -286,9 +295,9 @@ static u16 tcp_select_window(struct sock *sk)
 /* Packet ECN state for a SYN-ACK */
 static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+	TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
-		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+		TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
 }
 
 /* Packet ECN state for a SYN. */
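
The TCPCB_FLAG_* to TCPHDR_* rename running through the rest of this file needs no value changes because the control-block flag bits have always mirrored the flag byte of the on-wire TCP header. For reference, the bit assignments as defined alongside this series in include/net/tcp.h (indicative rather than authoritative):

	#define TCPHDR_FIN 0x01
	#define TCPHDR_SYN 0x02
	#define TCPHDR_RST 0x04
	#define TCPHDR_PSH 0x08
	#define TCPHDR_ACK 0x10
	#define TCPHDR_URG 0x20
	#define TCPHDR_ECE 0x40
	#define TCPHDR_CWR 0x80
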
@@ -298,7 +307,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
 
 	tp->ecn_flags = 0;
 	if (sysctl_tcp_ecn == 1) {
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
 	}
 }
@@ -342,6 +351,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
+	skb->ip_summed = CHECKSUM_PARTIAL;
 	skb->csum = 0;
 
 	TCP_SKB_CB(skb)->flags = flags;
@@ -352,7 +362,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	skb_shinfo(skb)->gso_type = 0;
 
 	TCP_SKB_CB(skb)->seq = seq;
-	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
+	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
 		seq++;
 	TCP_SKB_CB(skb)->end_seq = seq;
 }
@@ -659,7 +669,6 @@ static unsigned tcp_synack_options(struct sock *sk,
 	u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
 			 xvp->cookie_plus :
 			 0;
-	bool doing_ts = ireq->tstamp_ok;
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
@@ -672,7 +681,7 @@ static unsigned tcp_synack_options(struct sock *sk,
 	 * rather than TS in order to fit in better with old,
 	 * buggy kernels, but that was deemed to be unnecessary.
 	 */
-	doing_ts &= !ireq->sack_ok;
+	ireq->tstamp_ok &= !ireq->sack_ok;
 	}
 #else
 	*md5 = NULL;
@@ -687,7 +696,7 @@ static unsigned tcp_synack_options(struct sock *sk,
 		opts->options |= OPTION_WSCALE;
 		remaining -= TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(doing_ts)) {
+	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = TCP_SKB_CB(skb)->when;
 		opts->tsecr = req->ts_recent;
@@ -695,7 +704,7 @@ static unsigned tcp_synack_options(struct sock *sk,
 	}
 	if (likely(ireq->sack_ok)) {
 		opts->options |= OPTION_SACK_ADVERTISE;
-		if (unlikely(!doing_ts))
+		if (unlikely(!ireq->tstamp_ok))
 			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
 
@@ -703,7 +712,7 @@ static unsigned tcp_synack_options(struct sock *sk,
 	 * If the <SYN> options fit, the same options should fit now!
 	 */
 	if (*md5 == NULL &&
-	    doing_ts &&
+	    ireq->tstamp_ok &&
 	    cookie_plus > TCPOLEN_COOKIE_BASE) {
 		int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
 
@@ -812,7 +821,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-	if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
+	if (unlikely(tcb->flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
 	else
 		tcp_options_size = tcp_established_options(sk, skb, &opts,
@@ -835,7 +844,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->flags);
 
-	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+	if (unlikely(tcb->flags & TCPHDR_SYN)) {
 		/* RFC1323: The window in SYN & SYN/ACK segments
 		 * is never scaled.
 		 */
@@ -852,36 +861,37 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 			th->urg_ptr = htons(tp->snd_up - tcb->seq);
 			th->urg = 1;
 		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
-			th->urg_ptr = 0xFFFF;
+			th->urg_ptr = htons(0xFFFF);
 			th->urg = 1;
 		}
 	}
 
 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
-	if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
+	if (likely((tcb->flags & TCPHDR_SYN) == 0))
 		TCP_ECN_send(sk, skb, tcp_header_size);
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
-		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 		tp->af_specific->calc_md5_hash(opts.hash_location,
 					       md5, sk, NULL, skb);
 	}
 #endif
 
-	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
+	icsk->icsk_af_ops->send_check(sk, skb);
 
-	if (likely(tcb->flags & TCPCB_FLAG_ACK))
+	if (likely(tcb->flags & TCPHDR_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
 	if (skb->len != tcp_header_size)
 		tcp_event_data_sent(tp, skb, sk);
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+			      tcp_skb_pcount(skb));
 
-	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
+	err = icsk->icsk_af_ops->queue_xmit(skb);
 	if (likely(err <= 0))
 		return err;
 
@@ -1014,7 +1024,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
 	/* PSH and FIN should only be set in the second packet. */
 	flags = TCP_SKB_CB(skb)->flags;
-	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
 
@@ -1180,6 +1190,7 @@ void tcp_mtup_init(struct sock *sk)
 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
 	icsk->icsk_mtup.probe_size = 0;
 }
+EXPORT_SYMBOL(tcp_mtup_init);
 
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
@@ -1223,6 +1234,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	return mss_now;
 }
+EXPORT_SYMBOL(tcp_sync_mss);
 
 /* Compute the current effective MSS, taking SACKs and IP options,
  * and even PMTU discovery events into account.
@@ -1319,8 +1331,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
 	u32 in_flight, cwnd;
 
 	/* Don't be strict about the congestion window for the final FIN.  */
-	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
-	    tcp_skb_pcount(skb) == 1)
+	if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
 		return 1;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -1389,7 +1400,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
 	 * Nagle can be ignored during F-RTO too (see RFC4138).
 	 */
 	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
 		return 1;
 
 	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1452,7 +1463,7 @@ int tcp_may_send_now(struct sock *sk)
  * packet has never been sent out before (and thus is not cloned).
  */
 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
-			unsigned int mss_now)
+			unsigned int mss_now, gfp_t gfp)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
@@ -1462,7 +1473,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	if (skb->len != skb->data_len)
 		return tcp_fragment(sk, skb, len, mss_now);
 
-	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
 		return -ENOMEM;
 
@@ -1478,7 +1489,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* PSH and FIN should only be set in the second packet. */
 	flags = TCP_SKB_CB(skb)->flags;
-	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->flags = flags;
 
 	/* This packet was never sent out yet, so no SACK bits. */
@@ -1509,7 +1520,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
 		goto send_now;
 
 	if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1635,7 +1646,7 @@ static int tcp_mtu_probe(struct sock *sk)
 
 	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
 	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
-	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
 	TCP_SKB_CB(nskb)->sacked = 0;
 	nskb->csum = 0;
 	nskb->ip_summed = skb->ip_summed;
@@ -1660,7 +1671,7 @@ static int tcp_mtu_probe(struct sock *sk)
 			sk_wmem_free_skb(sk, skb);
 		} else {
 			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
-						   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+						   ~(TCPHDR_FIN|TCPHDR_PSH);
 			if (!skb_shinfo(skb)->nr_frags) {
 				skb_pull(skb, copy);
 				if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -1760,7 +1771,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						      cwnd_quota);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -1794,11 +1805,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 			       int nonagle)
 {
-	struct sk_buff *skb = tcp_send_head(sk);
-
-	if (!skb)
-		return;
-
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and
 	 * all will be happy.
@@ -2016,7 +2022,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 
 	if (!sysctl_tcp_retrans_collapse)
 		return;
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
 		return;
 
 	tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2108,7 +2114,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * since it is cheap to do so and saves bytes on the network.
 	 */
 	if (skb->len > 0 &&
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
 	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
 		if (!pskb_trim(skb, 0)) {
 			/* Reuse, even though it does some unnecessary work */
@@ -2204,6 +2210,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	int mib_idx;
 	int fwd_rexmitting = 0;
 
+	if (!tp->packets_out)
+		return;
+
 	if (!tp->lost_out)
 		tp->retransmit_high = tp->snd_una;
 
@@ -2297,7 +2306,7 @@ void tcp_send_fin(struct sock *sk)
 	mss_now = tcp_current_mss(sk);
 
 	if (tcp_send_head(sk) != NULL) {
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
 		TCP_SKB_CB(skb)->end_seq++;
 		tp->write_seq++;
 	} else {
@@ -2314,7 +2323,7 @@ void tcp_send_fin(struct sock *sk)
 		skb_reserve(skb, MAX_TCP_HEADER);
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
 		tcp_init_nondata_skb(skb, tp->write_seq,
-				     TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+				     TCPHDR_ACK | TCPHDR_FIN);
 		tcp_queue_skb(sk, skb);
 	}
 	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2339,7 +2348,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	/* Reserve space for headers and prepare control bits. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
-			     TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+			     TCPHDR_ACK | TCPHDR_RST);
 	/* Send it off. */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2359,11 +2368,11 @@ int tcp_send_synack(struct sock *sk)
 	struct sk_buff *skb;
 
 	skb = tcp_write_queue_head(sk);
-	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
+	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
 		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
 		return -EFAULT;
 	}
-	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
+	if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
 		if (skb_cloned(skb)) {
 			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 			if (nskb == NULL)
@@ -2377,7 +2386,7 @@ int tcp_send_synack(struct sock *sk)
 			skb = nskb;
 		}
 
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
 		TCP_ECN_send_synack(tcp_sk(sk), skb);
 	}
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2393,13 +2402,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	struct tcp_extend_values *xvp = tcp_xv(rvp);
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_cookie_values *cvp = tp->cookie_values;
 	struct tcphdr *th;
 	struct sk_buff *skb;
 	struct tcp_md5sig_key *md5;
 	int tcp_header_size;
 	int mss;
+	int s_data_desired = 0;
 
-	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
+		s_data_desired = cvp->s_data_desired;
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
 	if (skb == NULL)
 		return NULL;
 
@@ -2422,7 +2435,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 					       &req->rcv_wnd,
 					       &req->window_clamp,
 					       ireq->wscale_ok,
-					       &rcv_wscale);
+					       &rcv_wscale,
+					       dst_metric(dst, RTAX_INITRWND));
 		ireq->rcv_wscale = rcv_wscale;
 	}
 
@@ -2451,19 +2465,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	 * not even correctly set)
 	 */
 	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
-			     TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+			     TCPHDR_SYN | TCPHDR_ACK);
 
 	if (OPTION_COOKIE_EXTENSION & opts.options) {
-		const struct tcp_cookie_values *cvp = tp->cookie_values;
-
-		if (cvp != NULL &&
-		    cvp->s_data_constant &&
-		    cvp->s_data_desired > 0) {
-			u8 *buf = skb_put(skb, cvp->s_data_desired);
+		if (s_data_desired) {
+			u8 *buf = skb_put(skb, s_data_desired);
 
 			/* copy data directly from the listening socket. */
-			memcpy(buf, cvp->s_data_payload, cvp->s_data_desired);
-			TCP_SKB_CB(skb)->end_seq += cvp->s_data_desired;
+			memcpy(buf, cvp->s_data_payload, s_data_desired);
+			TCP_SKB_CB(skb)->end_seq += s_data_desired;
 		}
 
 		if (opts.hash_size > 0) {
@@ -2480,7 +2490,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 		*tail-- ^= TCP_SKB_CB(skb)->seq + 1;
 
 		/* recommended */
-		*tail-- ^= ((th->dest << 16) | th->source);
+		*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
 		*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
 
 		sha_transform((__u32 *)&xvp->cookie_bakery[0],
@@ -2498,7 +2508,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	th->window = htons(min(req->rcv_wnd, 65535U));
 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 	th->doff = (tcp_header_size >> 2);
-	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
@@ -2510,6 +2520,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	return skb;
 }
+EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
 static void tcp_connect_init(struct sock *sk)
@@ -2549,7 +2560,8 @@ static void tcp_connect_init(struct sock *sk)
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
 				  sysctl_tcp_window_scaling,
-				  &rcv_wscale);
+				  &rcv_wscale,
+				  dst_metric(dst, RTAX_INITRWND));
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
 	tp->rcv_ssthresh = tp->rcv_wnd;
@@ -2586,7 +2598,7 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tp->snd_nxt = tp->write_seq;
-	tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
+	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	TCP_ECN_send_syn(sk, buff);
 
 	/* Send it off. */
@@ -2611,6 +2623,7 @@ int tcp_connect(struct sock *sk)
 				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	return 0;
 }
+EXPORT_SYMBOL(tcp_connect);
 
 /* Send out a delayed ack, the caller does the policy checking
  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
@@ -2692,7 +2705,7 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Reserve space for headers and prepare control bits. */
 	skb_reserve(buff, MAX_TCP_HEADER);
-	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
+	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2726,7 +2739,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
2726 * end to send an ack. Don't queue or clone SKB, just 2739 * end to send an ack. Don't queue or clone SKB, just
2727 * send it. 2740 * send it.
2728 */ 2741 */
2729 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); 2742 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
2730 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2743 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2731 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 2744 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
2732} 2745}
@@ -2756,13 +2769,13 @@ int tcp_write_wakeup(struct sock *sk)
2756 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || 2769 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
2757 skb->len > mss) { 2770 skb->len > mss) {
2758 seg_size = min(seg_size, mss); 2771 seg_size = min(seg_size, mss);
2759 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2772 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
2760 if (tcp_fragment(sk, skb, seg_size, mss)) 2773 if (tcp_fragment(sk, skb, seg_size, mss))
2761 return -1; 2774 return -1;
2762 } else if (!tcp_skb_pcount(skb)) 2775 } else if (!tcp_skb_pcount(skb))
2763 tcp_set_skb_tso_segs(sk, skb, mss); 2776 tcp_set_skb_tso_segs(sk, skb, mss);
2764 2777
2765 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2778 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
2766 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2779 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2767 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2780 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2768 if (!err) 2781 if (!err)
@@ -2815,10 +2828,3 @@ void tcp_send_probe0(struct sock *sk)
2815 TCP_RTO_MAX); 2828 TCP_RTO_MAX);
2816 } 2829 }
2817} 2830}
2818
2819EXPORT_SYMBOL(tcp_select_initial_window);
2820EXPORT_SYMBOL(tcp_connect);
2821EXPORT_SYMBOL(tcp_make_synack);
2822EXPORT_SYMBOL(tcp_simple_retransmit);
2823EXPORT_SYMBOL(tcp_sync_mss);
2824EXPORT_SYMBOL(tcp_mtup_init);
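Several hunks above share one mechanical change: the EXPORT_SYMBOL() markers that used to be collected at the bottom of tcp_output.c are dropped and reissued directly beneath the functions they export (tcp_make_synack, tcp_connect). Keeping the marker adjacent to its function is the layout checkpatch.pl asks for. A compilable userspace sketch of the convention, with EXPORT_SYMBOL stubbed out because it only has meaning inside the kernel:

/* Userspace stand-in: EXPORT_SYMBOL() is a no-op outside the kernel. */
#define EXPORT_SYMBOL(sym)

/* Preferred layout: the export marker sits directly below the function
 * it exports, rather than in a list at the end of the file.
 */
int example_export(void)
{
	return 0;
}
EXPORT_SYMBOL(example_export);

int main(void)
{
	return example_export();
}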
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 9bc805df95d2..f8efada580e8 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -22,6 +22,7 @@
22#include <linux/kprobes.h> 22#include <linux/kprobes.h>
23#include <linux/socket.h> 23#include <linux/socket.h>
24#include <linux/tcp.h> 24#include <linux/tcp.h>
25#include <linux/slab.h>
25#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/ktime.h> 28#include <linux/ktime.h>
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8816a20c2597..74c54b30600f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -19,6 +19,7 @@
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/gfp.h>
22#include <net/tcp.h> 23#include <net/tcp.h>
23 24
24int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; 25int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
@@ -29,6 +30,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
29int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; 30int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
30int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; 31int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
31int sysctl_tcp_orphan_retries __read_mostly; 32int sysctl_tcp_orphan_retries __read_mostly;
33int sysctl_tcp_thin_linear_timeouts __read_mostly;
32 34
33static void tcp_write_timer(unsigned long); 35static void tcp_write_timer(unsigned long);
34static void tcp_delack_timer(unsigned long); 36static void tcp_delack_timer(unsigned long);
@@ -39,7 +41,6 @@ void tcp_init_xmit_timers(struct sock *sk)
39 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, 41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
40 &tcp_keepalive_timer); 42 &tcp_keepalive_timer);
41} 43}
42
43EXPORT_SYMBOL(tcp_init_xmit_timers); 44EXPORT_SYMBOL(tcp_init_xmit_timers);
44 45
45static void tcp_write_err(struct sock *sk) 46static void tcp_write_err(struct sock *sk)
@@ -65,18 +66,18 @@ static void tcp_write_err(struct sock *sk)
65static int tcp_out_of_resources(struct sock *sk, int do_reset) 66static int tcp_out_of_resources(struct sock *sk, int do_reset)
66{ 67{
67 struct tcp_sock *tp = tcp_sk(sk); 68 struct tcp_sock *tp = tcp_sk(sk);
68 int orphans = percpu_counter_read_positive(&tcp_orphan_count); 69 int shift = 0;
69 70
70 /* If the peer does not open its window for a long time, or has not 71 /* If the peer does not open its window for a long time, or has not
71 * transmitted anything for a long time, penalize it. */ 72 * transmitted anything for a long time, penalize it. */
72 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) 73 if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
73 orphans <<= 1; 74 shift++;
74 75
75 /* If some dubious ICMP arrived, penalize even more. */ 76 /* If some dubious ICMP arrived, penalize even more. */
76 if (sk->sk_err_soft) 77 if (sk->sk_err_soft)
77 orphans <<= 1; 78 shift++;
78 79
79 if (tcp_too_many_orphans(sk, orphans)) { 80 if (tcp_too_many_orphans(sk, shift)) {
80 if (net_ratelimit()) 81 if (net_ratelimit())
81 printk(KERN_INFO "Out of socket memory\n"); 82 printk(KERN_INFO "Out of socket memory\n");
82 83
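The tcp_out_of_resources() hunk above stops doubling a snapshot of the global orphan counter and instead accumulates a shift that tcp_too_many_orphans() applies when comparing against the limit, so the counter is read in one place. A minimal userspace model of that comparison; the counter value, ceiling, and helper names here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's per-protocol state. */
static int orphan_count = 70000;	/* sockets with no user reference */
static int max_orphans = 65536;		/* tunable ceiling */

/* Rough model of tcp_too_many_orphans(sk, shift): each penalty shifts
 * the observed count up instead of mutating a copy of the counter.
 */
static bool too_many_orphans(int shift)
{
	return (orphan_count << shift) > max_orphans;
}

int main(void)
{
	int shift = 0;

	shift++;	/* peer idle or window closed for too long */
	shift++;	/* dubious soft ICMP error seen */

	printf("too many orphans: %s\n",
	       too_many_orphans(shift) ? "yes" : "no");
	return 0;
}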
@@ -133,14 +134,17 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
133} 134}
134 135
135/* This function calculates a "timeout" which is equivalent to the timeout of a 136/* This function calculates a "timeout" which is equivalent to the timeout of a
136 * TCP connection after "boundary" unsucessful, exponentially backed-off 137 * TCP connection after "boundary" unsuccessful, exponentially backed-off
137 * retransmissions with an initial RTO of TCP_RTO_MIN. 138 * retransmissions with an initial RTO of TCP_RTO_MIN, or TCP_TIMEOUT_INIT if
139 * the syn_set flag is set.
138 */ 140 */
139static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
140 unsigned int boundary) 142 unsigned int boundary,
143 bool syn_set)
141{ 144{
142 unsigned int timeout, linear_backoff_thresh; 145 unsigned int timeout, linear_backoff_thresh;
143 unsigned int start_ts; 146 unsigned int start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
144 148
145 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
146 return false; 150 return false;
@@ -150,12 +154,12 @@ static bool retransmits_timed_out(struct sock *sk,
150 else 154 else
151 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
152 156
153 linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); 157 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
154 158
155 if (boundary <= linear_backoff_thresh) 159 if (boundary <= linear_backoff_thresh)
156 timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; 160 timeout = ((2 << boundary) - 1) * rto_base;
157 else 161 else
158 timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + 162 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
159 (boundary - linear_backoff_thresh) * TCP_RTO_MAX; 163 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
160 164
161 return (tcp_time_stamp - start_ts) >= timeout; 165 return (tcp_time_stamp - start_ts) >= timeout;
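The closed form above sums an exponential backoff series: each retransmission doubles the RTO starting from rto_base until it saturates at TCP_RTO_MAX, so the first linear_backoff_thresh attempts contribute ((2 << boundary) - 1) * rto_base and every attempt past the threshold adds a full TCP_RTO_MAX. A standalone sketch reproducing the arithmetic; HZ and the constants are stand-ins chosen to match this kernel era, where TCP_TIMEOUT_INIT was still 3 s:

#include <stdio.h>

#define HZ			1000U
#define TCP_RTO_MIN		(HZ / 5)	/* 200 ms */
#define TCP_RTO_MAX		(120 * HZ)	/* 120 s */
#define TCP_TIMEOUT_INIT	(3 * HZ)	/* initial SYN RTO, this era */

static unsigned int ilog2_u(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* Mirrors retransmits_timed_out(): total backoff time after 'boundary'
 * retransmissions with an initial RTO of 'rto_base'.
 */
static unsigned int backoff_timeout(unsigned int boundary, unsigned int rto_base)
{
	unsigned int thresh = ilog2_u(TCP_RTO_MAX / rto_base);

	if (boundary <= thresh)
		return ((2 << boundary) - 1) * rto_base;
	return ((2 << thresh) - 1) * rto_base +
	       (boundary - thresh) * TCP_RTO_MAX;
}

int main(void)
{
	printf("retries1=3:   %u ms\n", backoff_timeout(3, TCP_RTO_MIN));
	printf("retries2=15:  %u ms\n", backoff_timeout(15, TCP_RTO_MIN));
	printf("syn, 5 tries: %u ms\n", backoff_timeout(5, TCP_TIMEOUT_INIT));
	return 0;
}

This is why the syn_set flag matters: with the 3 s SYN base, five SYN retransmissions give up after 189 s, whereas computing the same boundary against TCP_RTO_MIN would declare the timeout after only 12.6 s.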
@@ -166,18 +170,19 @@ static int tcp_write_timeout(struct sock *sk)
166{ 170{
167 struct inet_connection_sock *icsk = inet_csk(sk); 171 struct inet_connection_sock *icsk = inet_csk(sk);
168 int retry_until; 172 int retry_until;
169 bool do_reset; 173 bool do_reset, syn_set = 0;
170 174
171 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 175 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
172 if (icsk->icsk_retransmits) 176 if (icsk->icsk_retransmits)
173 dst_negative_advice(&sk->sk_dst_cache, sk); 177 dst_negative_advice(sk);
174 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 178 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
179 syn_set = 1;
175 } else { 180 } else {
176 if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { 181 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) {
177 /* Black hole detection */ 182 /* Black hole detection */
178 tcp_mtu_probing(icsk, sk); 183 tcp_mtu_probing(icsk, sk);
179 184
180 dst_negative_advice(&sk->sk_dst_cache, sk); 185 dst_negative_advice(sk);
181 } 186 }
182 187
183 retry_until = sysctl_tcp_retries2; 188 retry_until = sysctl_tcp_retries2;
@@ -186,14 +191,14 @@ static int tcp_write_timeout(struct sock *sk)
186 191
187 retry_until = tcp_orphan_retries(sk, alive); 192 retry_until = tcp_orphan_retries(sk, alive);
188 do_reset = alive || 193 do_reset = alive ||
189 !retransmits_timed_out(sk, retry_until); 194 !retransmits_timed_out(sk, retry_until, 0);
190 195
191 if (tcp_out_of_resources(sk, do_reset)) 196 if (tcp_out_of_resources(sk, do_reset))
192 return 1; 197 return 1;
193 } 198 }
194 } 199 }
195 200
196 if (retransmits_timed_out(sk, retry_until)) { 201 if (retransmits_timed_out(sk, retry_until, syn_set)) {
197 /* Has it gone just too far? */ 202 /* Has it gone just too far? */
198 tcp_write_err(sk); 203 tcp_write_err(sk);
199 return 1; 204 return 1;
@@ -415,9 +420,27 @@ void tcp_retransmit_timer(struct sock *sk)
415 icsk->icsk_retransmits++; 420 icsk->icsk_retransmits++;
416 421
417out_reset_timer: 422out_reset_timer:
418 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 423 /* If the stream is thin, use linear timeouts. Since 'icsk_backoff'
424 * is used to reset the timer, set it to 0. Recalculate 'icsk_rto' as
425 * well: it may have been inflated while the stream oscillated between
426 * thin and thick, leaving it higher than the value 'tcp_set_rto' in
427 * tcp_input.c computes, which resets the rto without backoff. Limit
428 * to TCP_THIN_LINEAR_RETRIES before initiating exponential backoff,
429 * to avoid continuing to hammer linear-timeout retransmissions into
430 * a black hole.
431 */
432 if (sk->sk_state == TCP_ESTABLISHED &&
433 (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
434 tcp_stream_is_thin(tp) &&
435 icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
436 icsk->icsk_backoff = 0;
437 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
438 } else {
439 /* Use normal (exponential) backoff */
440 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
441 }
419 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 442 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
420 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) 443 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0))
421 __sk_dst_reset(sk); 444 __sk_dst_reset(sk);
422 445
423out:; 446out:;
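To see what the new branch does to the retransmission schedule: a thin established stream retries at a freshly computed RTO for its first TCP_THIN_LINEAR_RETRIES attempts, and only then falls back to doubling. A small model of that schedule; the limit of 6 matches the TCP_THIN_LINEAR_RETRIES definition that accompanied this change, and the fixed base RTO stands in for __tcp_set_rto():

#include <stdio.h>

#define HZ			1000U
#define TCP_RTO_MAX		(120 * HZ)
#define TCP_THIN_LINEAR_RETRIES	6	/* kernel's value for this feature */

int main(void)
{
	unsigned int base_rto = 300;	/* stand-in for __tcp_set_rto() */
	unsigned int rto = base_rto;
	int retransmits;

	for (retransmits = 1; retransmits <= 10; retransmits++) {
		if (retransmits <= TCP_THIN_LINEAR_RETRIES) {
			/* thin stream: retry at the fresh RTO, no backoff */
			rto = base_rto;
		} else {
			/* normal exponential backoff, capped at the max */
			rto <<= 1;
			if (rto > TCP_RTO_MAX)
				rto = TCP_RTO_MAX;
		}
		printf("retransmit %2d: rto=%u ms\n", retransmits, rto);
	}
	return 0;
}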
@@ -474,6 +497,12 @@ static void tcp_synack_timer(struct sock *sk)
474 TCP_TIMEOUT_INIT, TCP_RTO_MAX); 497 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
475} 498}
476 499
500void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
501{
502 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
503}
504EXPORT_SYMBOL(tcp_syn_ack_timeout);
505
477void tcp_set_keepalive(struct sock *sk, int val) 506void tcp_set_keepalive(struct sock *sk, int val)
478{ 507{
479 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) 508 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
@@ -491,7 +520,7 @@ static void tcp_keepalive_timer (unsigned long data)
491 struct sock *sk = (struct sock *) data; 520 struct sock *sk = (struct sock *) data;
492 struct inet_connection_sock *icsk = inet_csk(sk); 521 struct inet_connection_sock *icsk = inet_csk(sk);
493 struct tcp_sock *tp = tcp_sk(sk); 522 struct tcp_sock *tp = tcp_sk(sk);
494 __u32 elapsed; 523 u32 elapsed;
495 524
496 /* Only process if socket is not in use. */ 525 /* Only process if socket is not in use. */
497 bh_lock_sock(sk); 526 bh_lock_sock(sk);
@@ -528,7 +557,7 @@ static void tcp_keepalive_timer (unsigned long data)
528 if (tp->packets_out || tcp_send_head(sk)) 557 if (tp->packets_out || tcp_send_head(sk))
529 goto resched; 558 goto resched;
530 559
531 elapsed = tcp_time_stamp - tp->rcv_tstamp; 560 elapsed = keepalive_time_elapsed(tp);
532 561
533 if (elapsed >= keepalive_time_when(tp)) { 562 if (elapsed >= keepalive_time_when(tp)) {
534 if (icsk->icsk_probes_out >= keepalive_probes(tp)) { 563 if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 3959e0ca456a..59186ca7808a 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -8,6 +8,7 @@
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/netdevice.h> 9#include <linux/netdevice.h>
10#include <linux/skbuff.h> 10#include <linux/skbuff.h>
11#include <linux/slab.h>
11#include <net/icmp.h> 12#include <net/icmp.h>
12#include <net/ip.h> 13#include <net/ip.h>
13#include <net/protocol.h> 14#include <net/protocol.h>
@@ -47,7 +48,6 @@ err:
47 48
48 return ret; 49 return ret;
49} 50}
50
51EXPORT_SYMBOL(xfrm4_tunnel_register); 51EXPORT_SYMBOL(xfrm4_tunnel_register);
52 52
53int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) 53int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
@@ -71,7 +71,6 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
71 71
72 return ret; 72 return ret;
73} 73}
74
75EXPORT_SYMBOL(xfrm4_tunnel_deregister); 74EXPORT_SYMBOL(xfrm4_tunnel_deregister);
76 75
77static int tunnel4_rcv(struct sk_buff *skb) 76static int tunnel4_rcv(struct sk_buff *skb)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f0126fdd7e04..fb23c2e63b52 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,6 +95,7 @@
95#include <linux/mm.h> 95#include <linux/mm.h>
96#include <linux/inet.h> 96#include <linux/inet.h>
97#include <linux/netdevice.h> 97#include <linux/netdevice.h>
98#include <linux/slab.h>
98#include <net/tcp_states.h> 99#include <net/tcp_states.h>
99#include <linux/skbuff.h> 100#include <linux/skbuff.h>
100#include <linux/proc_fs.h> 101#include <linux/proc_fs.h>
@@ -232,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
232 */ 233 */
233 do { 234 do {
234 if (low <= snum && snum <= high && 235 if (low <= snum && snum <= high &&
235 !test_bit(snum >> udptable->log, bitmap)) 236 !test_bit(snum >> udptable->log, bitmap) &&
237 !inet_is_reserved_local_port(snum))
236 goto found; 238 goto found;
237 snum += rand; 239 snum += rand;
238 } while (snum != first); 240 } while (snum != first);
@@ -306,13 +308,13 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
306static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, 308static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
307 unsigned int port) 309 unsigned int port)
308{ 310{
309 return jhash_1word(saddr, net_hash_mix(net)) ^ port; 311 return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
310} 312}
311 313
312int udp_v4_get_port(struct sock *sk, unsigned short snum) 314int udp_v4_get_port(struct sock *sk, unsigned short snum)
313{ 315{
314 unsigned int hash2_nulladdr = 316 unsigned int hash2_nulladdr =
315 udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum); 317 udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
316 unsigned int hash2_partial = 318 unsigned int hash2_partial =
317 udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); 319 udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
318 320
@@ -465,14 +467,14 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
465 daddr, hnum, dif, 467 daddr, hnum, dif,
466 hslot2, slot2); 468 hslot2, slot2);
467 if (!result) { 469 if (!result) {
468 hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum); 470 hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
469 slot2 = hash2 & udptable->mask; 471 slot2 = hash2 & udptable->mask;
470 hslot2 = &udptable->hash2[slot2]; 472 hslot2 = &udptable->hash2[slot2];
471 if (hslot->count < hslot2->count) 473 if (hslot->count < hslot2->count)
472 goto begin; 474 goto begin;
473 475
474 result = udp4_lib_lookup2(net, INADDR_ANY, sport, 476 result = udp4_lib_lookup2(net, saddr, sport,
475 daddr, hnum, dif, 477 htonl(INADDR_ANY), hnum, dif,
476 hslot2, slot2); 478 hslot2, slot2);
477 } 479 }
478 rcu_read_unlock(); 480 rcu_read_unlock();
@@ -631,9 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
631 if (!inet->recverr) { 633 if (!inet->recverr) {
632 if (!harderr || sk->sk_state != TCP_ESTABLISHED) 634 if (!harderr || sk->sk_state != TCP_ESTABLISHED)
633 goto out; 635 goto out;
634 } else { 636 } else
635 ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); 637 ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
636 } 638
637 sk->sk_err = err; 639 sk->sk_err = err;
638 sk->sk_error_report(sk); 640 sk->sk_error_report(sk);
639out: 641out:
@@ -912,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
912 !sock_flag(sk, SOCK_BROADCAST)) 914 !sock_flag(sk, SOCK_BROADCAST))
913 goto out; 915 goto out;
914 if (connected) 916 if (connected)
915 sk_dst_set(sk, dst_clone(&rt->u.dst)); 917 sk_dst_set(sk, dst_clone(&rt->dst));
916 } 918 }
917 919
918 if (msg->msg_flags&MSG_CONFIRM) 920 if (msg->msg_flags&MSG_CONFIRM)
@@ -976,7 +978,7 @@ out:
976 return err; 978 return err;
977 979
978do_confirm: 980do_confirm:
979 dst_confirm(&rt->u.dst); 981 dst_confirm(&rt->dst);
980 if (!(msg->msg_flags&MSG_PROBE) || len) 982 if (!(msg->msg_flags&MSG_PROBE) || len)
981 goto back_from_confirm; 983 goto back_from_confirm;
982 err = 0; 984 err = 0;
@@ -1061,10 +1063,11 @@ static unsigned int first_packet_length(struct sock *sk)
1061 spin_unlock_bh(&rcvq->lock); 1063 spin_unlock_bh(&rcvq->lock);
1062 1064
1063 if (!skb_queue_empty(&list_kill)) { 1065 if (!skb_queue_empty(&list_kill)) {
1064 lock_sock(sk); 1066 bool slow = lock_sock_fast(sk);
1067
1065 __skb_queue_purge(&list_kill); 1068 __skb_queue_purge(&list_kill);
1066 sk_mem_reclaim_partial(sk); 1069 sk_mem_reclaim_partial(sk);
1067 release_sock(sk); 1070 unlock_sock_fast(sk, slow);
1068 } 1071 }
1069 return res; 1072 return res;
1070} 1073}
@@ -1117,10 +1120,11 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1117 struct inet_sock *inet = inet_sk(sk); 1120 struct inet_sock *inet = inet_sk(sk);
1118 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 1121 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
1119 struct sk_buff *skb; 1122 struct sk_buff *skb;
1120 unsigned int ulen, copied; 1123 unsigned int ulen;
1121 int peeked; 1124 int peeked;
1122 int err; 1125 int err;
1123 int is_udplite = IS_UDPLITE(sk); 1126 int is_udplite = IS_UDPLITE(sk);
1127 bool slow;
1124 1128
1125 /* 1129 /*
1126 * Check any passed addresses 1130 * Check any passed addresses
@@ -1138,10 +1142,9 @@ try_again:
1138 goto out; 1142 goto out;
1139 1143
1140 ulen = skb->len - sizeof(struct udphdr); 1144 ulen = skb->len - sizeof(struct udphdr);
1141 copied = len; 1145 if (len > ulen)
1142 if (copied > ulen) 1146 len = ulen;
1143 copied = ulen; 1147 else if (len < ulen)
1144 else if (copied < ulen)
1145 msg->msg_flags |= MSG_TRUNC; 1148 msg->msg_flags |= MSG_TRUNC;
1146 1149
1147 /* 1150 /*
@@ -1150,14 +1153,14 @@ try_again:
1150 * coverage checksum (UDP-Lite), do it before the copy. 1153 * coverage checksum (UDP-Lite), do it before the copy.
1151 */ 1154 */
1152 1155
1153 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { 1156 if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
1154 if (udp_lib_checksum_complete(skb)) 1157 if (udp_lib_checksum_complete(skb))
1155 goto csum_copy_err; 1158 goto csum_copy_err;
1156 } 1159 }
1157 1160
1158 if (skb_csum_unnecessary(skb)) 1161 if (skb_csum_unnecessary(skb))
1159 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 1162 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
1160 msg->msg_iov, copied); 1163 msg->msg_iov, len);
1161 else { 1164 else {
1162 err = skb_copy_and_csum_datagram_iovec(skb, 1165 err = skb_copy_and_csum_datagram_iovec(skb,
1163 sizeof(struct udphdr), 1166 sizeof(struct udphdr),
@@ -1186,7 +1189,7 @@ try_again:
1186 if (inet->cmsg_flags) 1189 if (inet->cmsg_flags)
1187 ip_cmsg_recv(msg, skb); 1190 ip_cmsg_recv(msg, skb);
1188 1191
1189 err = copied; 1192 err = len;
1190 if (flags & MSG_TRUNC) 1193 if (flags & MSG_TRUNC)
1191 err = ulen; 1194 err = ulen;
1192 1195
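The recvmsg changes above keep the user-visible contract intact: the kernel copies at most the caller's buffer length, raises MSG_TRUNC in msg_flags when the datagram was longer, and returns the truncated length unless the caller passed MSG_TRUNC in flags, in which case it returns the full length. That contract is observable from userspace; a short self-contained demo against a loopback UDP socket (expected output: "returned 4 bytes, truncated: yes"):

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>

int main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET };
	socklen_t alen = sizeof(addr);
	char small[4];
	struct iovec iov = { .iov_base = small, .iov_len = sizeof(small) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	bind(s, (struct sockaddr *)&addr, sizeof(addr));
	getsockname(s, (struct sockaddr *)&addr, &alen);

	/* 10-byte datagram into a 4-byte buffer */
	sendto(s, "0123456789", 10, 0, (struct sockaddr *)&addr, sizeof(addr));
	n = recvmsg(s, &msg, 0);

	printf("returned %zd bytes, truncated: %s\n",
	       n, (msg.msg_flags & MSG_TRUNC) ? "yes" : "no");
	close(s);
	return 0;
}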
@@ -1196,10 +1199,10 @@ out:
1196 return err; 1199 return err;
1197 1200
1198csum_copy_err: 1201csum_copy_err:
1199 lock_sock(sk); 1202 slow = lock_sock_fast(sk);
1200 if (!skb_kill_datagram(sk, skb, flags)) 1203 if (!skb_kill_datagram(sk, skb, flags))
1201 UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 1204 UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1202 release_sock(sk); 1205 unlock_sock_fast(sk, slow);
1203 1206
1204 if (noblock) 1207 if (noblock)
1205 return -EAGAIN; 1208 return -EAGAIN;
@@ -1217,6 +1220,7 @@ int udp_disconnect(struct sock *sk, int flags)
1217 sk->sk_state = TCP_CLOSE; 1220 sk->sk_state = TCP_CLOSE;
1218 inet->inet_daddr = 0; 1221 inet->inet_daddr = 0;
1219 inet->inet_dport = 0; 1222 inet->inet_dport = 0;
1223 sock_rps_save_rxhash(sk, 0);
1220 sk->sk_bound_dev_if = 0; 1224 sk->sk_bound_dev_if = 0;
1221 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1225 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1222 inet_reset_saddr(sk); 1226 inet_reset_saddr(sk);
@@ -1256,10 +1260,57 @@ void udp_lib_unhash(struct sock *sk)
1256} 1260}
1257EXPORT_SYMBOL(udp_lib_unhash); 1261EXPORT_SYMBOL(udp_lib_unhash);
1258 1262
1263/*
1264 * inet_rcv_saddr was changed; we must rehash the secondary hash
1265 */
1266void udp_lib_rehash(struct sock *sk, u16 newhash)
1267{
1268 if (sk_hashed(sk)) {
1269 struct udp_table *udptable = sk->sk_prot->h.udp_table;
1270 struct udp_hslot *hslot, *hslot2, *nhslot2;
1271
1272 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
1273 nhslot2 = udp_hashslot2(udptable, newhash);
1274 udp_sk(sk)->udp_portaddr_hash = newhash;
1275 if (hslot2 != nhslot2) {
1276 hslot = udp_hashslot(udptable, sock_net(sk),
1277 udp_sk(sk)->udp_port_hash);
1278 /* we must lock primary chain too */
1279 spin_lock_bh(&hslot->lock);
1280
1281 spin_lock(&hslot2->lock);
1282 hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
1283 hslot2->count--;
1284 spin_unlock(&hslot2->lock);
1285
1286 spin_lock(&nhslot2->lock);
1287 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
1288 &nhslot2->head);
1289 nhslot2->count++;
1290 spin_unlock(&nhslot2->lock);
1291
1292 spin_unlock_bh(&hslot->lock);
1293 }
1294 }
1295}
1296EXPORT_SYMBOL(udp_lib_rehash);
1297
1298static void udp_v4_rehash(struct sock *sk)
1299{
1300 u16 new_hash = udp4_portaddr_hash(sock_net(sk),
1301 inet_sk(sk)->inet_rcv_saddr,
1302 inet_sk(sk)->inet_num);
1303 udp_lib_rehash(sk, new_hash);
1304}
1305
1259static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 1306static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1260{ 1307{
1261 int rc = sock_queue_rcv_skb(sk, skb); 1308 int rc;
1262 1309
1310 if (inet_sk(sk)->inet_daddr)
1311 sock_rps_save_rxhash(sk, skb->rxhash);
1312
1313 rc = ip_queue_rcv_skb(sk, skb);
1263 if (rc < 0) { 1314 if (rc < 0) {
1264 int is_udplite = IS_UDPLITE(sk); 1315 int is_udplite = IS_UDPLITE(sk);
1265 1316
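udp_lib_rehash() above migrates a socket between secondary (address+port) hash slots while holding the primary-chain lock for the entire move, taking each secondary slot lock in turn underneath it, so writers on either chain stay serialized. A toy pthread analogue of that lock ordering; the two-bucket structure and names are purely illustrative:

#include <pthread.h>
#include <stdio.h>

/* Toy analogue of the two-level UDP hash: one primary chain lock that
 * serializes rehashing, plus per-slot secondary locks.
 */
struct bucket {
	pthread_mutex_t lock;
	int count;
};

static pthread_mutex_t primary = PTHREAD_MUTEX_INITIALIZER;
static struct bucket old_slot = { PTHREAD_MUTEX_INITIALIZER, 1 };
static struct bucket new_slot = { PTHREAD_MUTEX_INITIALIZER, 0 };

static void rehash_move(struct bucket *from, struct bucket *to)
{
	if (from == to)
		return;			/* same slot: nothing to migrate */

	pthread_mutex_lock(&primary);	/* "we must lock primary chain too" */

	pthread_mutex_lock(&from->lock);
	from->count--;			/* unlink from the old slot */
	pthread_mutex_unlock(&from->lock);

	pthread_mutex_lock(&to->lock);
	to->count++;			/* link into the new slot */
	pthread_mutex_unlock(&to->lock);

	pthread_mutex_unlock(&primary);
}

int main(void)
{
	rehash_move(&old_slot, &new_slot);
	printf("old=%d new=%d\n", old_slot.count, new_slot.count);
	return 0;
}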
@@ -1367,13 +1418,19 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1367 goto drop; 1418 goto drop;
1368 } 1419 }
1369 1420
1421
1422 if (sk_rcvqueues_full(sk, skb))
1423 goto drop;
1424
1370 rc = 0; 1425 rc = 0;
1371 1426
1372 bh_lock_sock(sk); 1427 bh_lock_sock(sk);
1373 if (!sock_owned_by_user(sk)) 1428 if (!sock_owned_by_user(sk))
1374 rc = __udp_queue_rcv_skb(sk, skb); 1429 rc = __udp_queue_rcv_skb(sk, skb);
1375 else 1430 else if (sk_add_backlog(sk, skb)) {
1376 sk_add_backlog(sk, skb); 1431 bh_unlock_sock(sk);
1432 goto drop;
1433 }
1377 bh_unlock_sock(sk); 1434 bh_unlock_sock(sk);
1378 1435
1379 return rc; 1436 return rc;
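The receive path above gains two drop points: an early sk_rcvqueues_full() check before the socket lock is even taken, and a failure return from sk_add_backlog() once the backlog charge would blow past the receive buffer. A simplified userspace model of that admission logic, with the kernel's rmem/backlog accounting reduced to plain byte counters:

#include <stdbool.h>
#include <stdio.h>

struct sk {
	unsigned int rcvbuf;	/* receive buffer limit */
	unsigned int rmem;	/* bytes in the receive queue */
	unsigned int backlog;	/* bytes parked while user holds the lock */
	bool owned_by_user;
};

static bool rcvqueues_full(const struct sk *sk, unsigned int len)
{
	return sk->rmem + sk->backlog + len > sk->rcvbuf;
}

/* Returns 0 if the datagram is accepted, -1 if it must be dropped. */
static int queue_rcv(struct sk *sk, unsigned int len)
{
	if (rcvqueues_full(sk, len))
		return -1;		/* early drop, lock never taken */

	if (!sk->owned_by_user)
		sk->rmem += len;	/* deliver directly */
	else if (!rcvqueues_full(sk, len))
		sk->backlog += len;	/* park on the backlog */
	else
		return -1;		/* backlog refused: drop */
	return 0;
}

int main(void)
{
	struct sk sk = { .rcvbuf = 4096, .rmem = 4000, .owned_by_user = true };

	printf("%d\n", queue_rcv(&sk, 512));	/* -1: over the limit */
	printf("%d\n", queue_rcv(&sk, 64));	/* 0: fits in the backlog */
	return 0;
}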
@@ -1525,6 +1582,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1525 1582
1526 uh = udp_hdr(skb); 1583 uh = udp_hdr(skb);
1527 ulen = ntohs(uh->len); 1584 ulen = ntohs(uh->len);
1585 saddr = ip_hdr(skb)->saddr;
1586 daddr = ip_hdr(skb)->daddr;
1587
1528 if (ulen > skb->len) 1588 if (ulen > skb->len)
1529 goto short_packet; 1589 goto short_packet;
1530 1590
@@ -1538,9 +1598,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1538 if (udp4_csum_init(skb, uh, proto)) 1598 if (udp4_csum_init(skb, uh, proto))
1539 goto csum_error; 1599 goto csum_error;
1540 1600
1541 saddr = ip_hdr(skb)->saddr;
1542 daddr = ip_hdr(skb)->daddr;
1543
1544 if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) 1601 if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1545 return __udp4_lib_mcast_deliver(net, skb, uh, 1602 return __udp4_lib_mcast_deliver(net, skb, uh,
1546 saddr, daddr, udptable); 1603 saddr, daddr, udptable);
@@ -1613,9 +1670,9 @@ int udp_rcv(struct sk_buff *skb)
1613 1670
1614void udp_destroy_sock(struct sock *sk) 1671void udp_destroy_sock(struct sock *sk)
1615{ 1672{
1616 lock_sock(sk); 1673 bool slow = lock_sock_fast(sk);
1617 udp_flush_pending_frames(sk); 1674 udp_flush_pending_frames(sk);
1618 release_sock(sk); 1675 unlock_sock_fast(sk, slow);
1619} 1676}
1620 1677
1621/* 1678/*
@@ -1674,8 +1731,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1674 return -ENOPROTOOPT; 1731 return -ENOPROTOOPT;
1675 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ 1732 if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
1676 val = 8; 1733 val = 8;
1677 else if (val > USHORT_MAX) 1734 else if (val > USHRT_MAX)
1678 val = USHORT_MAX; 1735 val = USHRT_MAX;
1679 up->pcslen = val; 1736 up->pcslen = val;
1680 up->pcflag |= UDPLITE_SEND_CC; 1737 up->pcflag |= UDPLITE_SEND_CC;
1681 break; 1738 break;
@@ -1688,8 +1745,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1688 return -ENOPROTOOPT; 1745 return -ENOPROTOOPT;
1689 if (val != 0 && val < 8) /* Avoid silly minimal values. */ 1746 if (val != 0 && val < 8) /* Avoid silly minimal values. */
1690 val = 8; 1747 val = 8;
1691 else if (val > USHORT_MAX) 1748 else if (val > USHRT_MAX)
1692 val = USHORT_MAX; 1749 val = USHRT_MAX;
1693 up->pcrlen = val; 1750 up->pcrlen = val;
1694 up->pcflag |= UDPLITE_RECV_CC; 1751 up->pcflag |= UDPLITE_RECV_CC;
1695 break; 1752 break;
@@ -1829,6 +1886,7 @@ struct proto udp_prot = {
1829 .backlog_rcv = __udp_queue_rcv_skb, 1886 .backlog_rcv = __udp_queue_rcv_skb,
1830 .hash = udp_lib_hash, 1887 .hash = udp_lib_hash,
1831 .unhash = udp_lib_unhash, 1888 .unhash = udp_lib_unhash,
1889 .rehash = udp_v4_rehash,
1832 .get_port = udp_v4_get_port, 1890 .get_port = udp_v4_get_port,
1833 .memory_allocated = &udp_memory_allocated, 1891 .memory_allocated = &udp_memory_allocated,
1834 .sysctl_mem = sysctl_udp_mem, 1892 .sysctl_mem = sysctl_udp_mem,
@@ -2027,12 +2085,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = {
2027 }, 2085 },
2028}; 2086};
2029 2087
2030static int udp4_proc_init_net(struct net *net) 2088static int __net_init udp4_proc_init_net(struct net *net)
2031{ 2089{
2032 return udp_proc_register(net, &udp4_seq_afinfo); 2090 return udp_proc_register(net, &udp4_seq_afinfo);
2033} 2091}
2034 2092
2035static void udp4_proc_exit_net(struct net *net) 2093static void __net_exit udp4_proc_exit_net(struct net *net)
2036{ 2094{
2037 udp_proc_unregister(net, &udp4_seq_afinfo); 2095 udp_proc_unregister(net, &udp4_seq_afinfo);
2038} 2096}
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 66f79513f4a5..ab76aa928fa9 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -58,6 +58,7 @@ struct proto udplite_prot = {
58 .compat_getsockopt = compat_udp_getsockopt, 58 .compat_getsockopt = compat_udp_getsockopt,
59#endif 59#endif
60}; 60};
61EXPORT_SYMBOL(udplite_prot);
61 62
62static struct inet_protosw udplite4_protosw = { 63static struct inet_protosw udplite4_protosw = {
63 .type = SOCK_DGRAM, 64 .type = SOCK_DGRAM,
@@ -81,12 +82,12 @@ static struct udp_seq_afinfo udplite4_seq_afinfo = {
81 }, 82 },
82}; 83};
83 84
84static int udplite4_proc_init_net(struct net *net) 85static int __net_init udplite4_proc_init_net(struct net *net)
85{ 86{
86 return udp_proc_register(net, &udplite4_seq_afinfo); 87 return udp_proc_register(net, &udplite4_seq_afinfo);
87} 88}
88 89
89static void udplite4_proc_exit_net(struct net *net) 90static void __net_exit udplite4_proc_exit_net(struct net *net)
90{ 91{
91 udp_proc_unregister(net, &udplite4_seq_afinfo); 92 udp_proc_unregister(net, &udplite4_seq_afinfo);
92} 93}
@@ -127,5 +128,3 @@ out_unregister_proto:
127out_register_err: 128out_register_err:
128 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); 129 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
129} 130}
130
131EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index f9f922a0ba88..06814b6216dc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -9,6 +9,7 @@
9 * 9 *
10 */ 10 */
11 11
12#include <linux/slab.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/string.h> 14#include <linux/string.h>
14#include <linux/netfilter.h> 15#include <linux/netfilter.h>
@@ -26,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
26 if (skb_dst(skb) == NULL) { 27 if (skb_dst(skb) == NULL) {
27 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
28 29
29 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, 30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
30 skb->dev)) 31 iph->tos, skb->dev))
31 goto drop; 32 goto drop;
32 } 33 }
33 return dst_input(skb); 34 return dst_input(skb);
@@ -60,7 +61,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
60 iph->tot_len = htons(skb->len); 61 iph->tot_len = htons(skb->len);
61 ip_send_check(iph); 62 ip_send_check(iph);
62 63
63 NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, 64 NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
64 xfrm4_rcv_encap_finish); 65 xfrm4_rcv_encap_finish);
65 return 0; 66 return 0;
66} 67}
@@ -162,5 +163,4 @@ int xfrm4_rcv(struct sk_buff *skb)
162{ 163{
163 return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); 164 return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
164} 165}
165
166EXPORT_SYMBOL(xfrm4_rcv); 166EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 3444f3b34eca..6f368413eb0e 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -4,6 +4,7 @@
4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> 4 * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
5 */ 5 */
6 6
7#include <linux/gfp.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index c908bd99bcba..571aa96a175c 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -86,7 +86,7 @@ static int xfrm4_output_finish(struct sk_buff *skb)
86 86
87int xfrm4_output(struct sk_buff *skb) 87int xfrm4_output(struct sk_buff *skb)
88{ 88{
89 return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, 89 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
90 NULL, skb_dst(skb)->dev, xfrm4_output_finish, 90 NULL, skb_dst(skb)->dev, xfrm4_output_finish,
91 !(IPCB(skb)->flags & IPSKB_REROUTED)); 91 !(IPCB(skb)->flags & IPSKB_REROUTED));
92} 92}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 67107d63c1cd..a580349f0b8a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
37 fl.fl4_src = saddr->a4; 37 fl.fl4_src = saddr->a4;
38 38
39 err = __ip_route_output_key(net, &rt, &fl); 39 err = __ip_route_output_key(net, &rt, &fl);
40 dst = &rt->u.dst; 40 dst = &rt->dst;
41 if (err) 41 if (err)
42 dst = ERR_PTR(err); 42 dst = ERR_PTR(err);
43 return dst; 43 return dst;
@@ -59,30 +59,9 @@ static int xfrm4_get_saddr(struct net *net,
59 return 0; 59 return 0;
60} 60}
61 61
62static struct dst_entry *
63__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
64{
65 struct dst_entry *dst;
66
67 read_lock_bh(&policy->lock);
68 for (dst = policy->bundles; dst; dst = dst->next) {
69 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
70 if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
71 xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
72 xdst->u.rt.fl.fl4_src == fl->fl4_src &&
73 xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
74 xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
75 dst_clone(dst);
76 break;
77 }
78 }
79 read_unlock_bh(&policy->lock);
80 return dst;
81}
82
83static int xfrm4_get_tos(struct flowi *fl) 62static int xfrm4_get_tos(struct flowi *fl)
84{ 63{
85 return fl->fl4_tos; 64 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
86} 65}
87 66
88static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
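Masking with IPTOS_RT_MASK keeps only the TOS bits that route lookup actually uses and clears the two low ECN bits, so an ECN-marked packet no longer selects a different xfrm bundle than an unmarked one from the same flow. A tiny demonstration; the mask values are copied from the kernel's linux/ip.h and net/route.h definitions:

#include <stdio.h>

#define IPTOS_TOS_MASK	0x1E			/* linux/ip.h */
#define IPTOS_RT_MASK	(IPTOS_TOS_MASK & ~3)	/* net/route.h */

int main(void)
{
	/* TOS byte with the two low ECN bits set */
	unsigned char tos = 0x2E | 0x03;

	printf("raw tos = 0x%02x\n", tos);			/* 0x2f */
	printf("rt tos  = 0x%02x\n", tos & IPTOS_RT_MASK);	/* 0x0c */
	return 0;
}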
@@ -91,11 +70,12 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
91 return 0; 70 return 0;
92} 71}
93 72
94static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) 73static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
74 struct flowi *fl)
95{ 75{
96 struct rtable *rt = (struct rtable *)xdst->route; 76 struct rtable *rt = (struct rtable *)xdst->route;
97 77
98 xdst->u.rt.fl = rt->fl; 78 xdst->u.rt.fl = *fl;
99 79
100 xdst->u.dst.dev = dev; 80 xdst->u.dst.dev = dev;
101 dev_hold(dev); 81 dev_hold(dev);
@@ -128,6 +108,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
128 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 108 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
129 109
130 memset(fl, 0, sizeof(struct flowi)); 110 memset(fl, 0, sizeof(struct flowi));
111 fl->mark = skb->mark;
112
131 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 113 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
132 switch (iph->protocol) { 114 switch (iph->protocol) {
133 case IPPROTO_UDP: 115 case IPPROTO_UDP:
@@ -258,7 +240,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
258 .dst_ops = &xfrm4_dst_ops, 240 .dst_ops = &xfrm4_dst_ops,
259 .dst_lookup = xfrm4_dst_lookup, 241 .dst_lookup = xfrm4_dst_lookup,
260 .get_saddr = xfrm4_get_saddr, 242 .get_saddr = xfrm4_get_saddr,
261 .find_bundle = __xfrm4_find_bundle,
262 .decode_session = _decode_session4, 243 .decode_session = _decode_session4,
263 .get_tos = xfrm4_get_tos, 244 .get_tos = xfrm4_get_tos,
264 .init_path = xfrm4_init_path, 245 .init_path = xfrm4_init_path,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a03..47947624eccc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, 24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
25 struct xfrm_tmpl *tmpl, 25{
26 xfrm_address_t *daddr, xfrm_address_t *saddr) 26 sel->daddr.a4 = fl->fl4_dst;
27 sel->saddr.a4 = fl->fl4_src;
28 sel->dport = xfrm_flowi_dport(fl);
29 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl);
31 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET;
33 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32;
35 sel->proto = fl->proto;
36 sel->ifindex = fl->oif;
37}
38
39static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr)
27{ 42{
28 x->sel.daddr.a4 = fl->fl4_dst;
29 x->sel.saddr.a4 = fl->fl4_src;
30 x->sel.dport = xfrm_flowi_dport(fl);
31 x->sel.dport_mask = htons(0xffff);
32 x->sel.sport = xfrm_flowi_sport(fl);
33 x->sel.sport_mask = htons(0xffff);
34 x->sel.family = AF_INET;
35 x->sel.prefixlen_d = 32;
36 x->sel.prefixlen_s = 32;
37 x->sel.proto = fl->proto;
38 x->sel.ifindex = fl->oif;
39 x->id = tmpl->id; 43 x->id = tmpl->id;
40 if (x->id.daddr.a4 == 0) 44 if (x->id.daddr.a4 == 0)
41 x->id.daddr.a4 = daddr->a4; 45 x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
70 .owner = THIS_MODULE, 74 .owner = THIS_MODULE,
71 .init_flags = xfrm4_init_flags, 75 .init_flags = xfrm4_init_flags,
72 .init_tempsel = __xfrm4_init_tempsel, 76 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop,
73 .output = xfrm4_output, 78 .output = xfrm4_output,
74 .extract_input = xfrm4_extract_input, 79 .extract_input = xfrm4_extract_input,
75 .extract_output = xfrm4_extract_output, 80 .extract_output = xfrm4_extract_output,