path: root/net/ipv4
author    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8
parent    6a00f206debf8a5c8899055726ad127dbeeed098
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 56
-rw-r--r--  net/ipv4/Makefile | 7
-rw-r--r--  net/ipv4/af_inet.c | 133
-rw-r--r--  net/ipv4/ah4.c | 41
-rw-r--r--  net/ipv4/arp.c | 286
-rw-r--r--  net/ipv4/cipso_ipv4.c | 123
-rw-r--r--  net/ipv4/datagram.c | 31
-rw-r--r--  net/ipv4/devinet.c | 250
-rw-r--r--  net/ipv4/esp4.c | 143
-rw-r--r--  net/ipv4/fib_frontend.c | 329
-rw-r--r--  net/ipv4/fib_hash.c | 1070
-rw-r--r--  net/ipv4/fib_lookup.h | 27
-rw-r--r--  net/ipv4/fib_rules.c | 36
-rw-r--r--  net/ipv4/fib_semantics.c | 531
-rw-r--r--  net/ipv4/fib_trie.c | 458
-rw-r--r--  net/ipv4/gre.c | 152
-rw-r--r--  net/ipv4/icmp.c | 322
-rw-r--r--  net/ipv4/igmp.c | 357
-rw-r--r--  net/ipv4/inet_connection_sock.c | 70
-rw-r--r--  net/ipv4/inet_diag.c | 45
-rw-r--r--  net/ipv4/inet_hashtables.c | 31
-rw-r--r--  net/ipv4/inet_lro.c | 4
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 2
-rw-r--r--  net/ipv4/inetpeer.c | 364
-rw-r--r--  net/ipv4/ip_forward.c | 2
-rw-r--r--  net/ipv4/ip_fragment.c | 87
-rw-r--r--  net/ipv4/ip_gre.c | 283
-rw-r--r--  net/ipv4/ip_input.c | 6
-rw-r--r--  net/ipv4/ip_options.c | 85
-rw-r--r--  net/ipv4/ip_output.c | 474
-rw-r--r--  net/ipv4/ip_sockglue.c | 47
-rw-r--r--  net/ipv4/ipcomp.c | 4
-rw-r--r--  net/ipv4/ipconfig.c | 69
-rw-r--r--  net/ipv4/ipip.c | 229
-rw-r--r--  net/ipv4/ipmr.c | 566
-rw-r--r--  net/ipv4/netfilter.c | 77
-rw-r--r--  net/ipv4/netfilter/Kconfig | 19
-rw-r--r--  net/ipv4/netfilter/Makefile | 7
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 131
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c | 8
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 6
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 163
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 49
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 148
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 134
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 7
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 45
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 128
-rw-r--r--  net/ipv4/netfilter/nf_nat_ftp.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 53
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 76
-rw-r--r--  net/ipv4/netfilter/nf_nat_irc.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c | 19
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 27
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 13
-rw-r--r--  net/ipv4/ping.c | 931
-rw-r--r--  net/ipv4/proc.c | 9
-rw-r--r--  net/ipv4/protocol.c | 33
-rw-r--r--  net/ipv4/raw.c | 120
-rw-r--r--  net/ipv4/route.c | 1697
-rw-r--r--  net/ipv4/syncookies.c | 28
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 89
-rw-r--r--  net/ipv4/tcp.c | 70
-rw-r--r--  net/ipv4/tcp_bic.c | 2
-rw-r--r--  net/ipv4/tcp_cubic.c | 56
-rw-r--r--  net/ipv4/tcp_highspeed.c | 2
-rw-r--r--  net/ipv4/tcp_htcp.c | 2
-rw-r--r--  net/ipv4/tcp_hybla.c | 2
-rw-r--r--  net/ipv4/tcp_illinois.c | 4
-rw-r--r--  net/ipv4/tcp_input.c | 152
-rw-r--r--  net/ipv4/tcp_ipv4.c | 231
-rw-r--r--  net/ipv4/tcp_lp.c | 4
-rw-r--r--  net/ipv4/tcp_minisocks.c | 67
-rw-r--r--  net/ipv4/tcp_output.c | 113
-rw-r--r--  net/ipv4/tcp_probe.c | 5
-rw-r--r--  net/ipv4/tcp_scalable.c | 2
-rw-r--r--  net/ipv4/tcp_timer.c | 53
-rw-r--r--  net/ipv4/tcp_vegas.c | 2
-rw-r--r--  net/ipv4/tcp_veno.c | 4
-rw-r--r--  net/ipv4/tcp_westwood.c | 4
-rw-r--r--  net/ipv4/tcp_yeah.c | 4
-rw-r--r--  net/ipv4/tunnel4.c | 44
-rw-r--r--  net/ipv4/udp.c | 207
-rw-r--r--  net/ipv4/udplite.c | 1
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 2
-rw-r--r--  net/ipv4/xfrm4_output.c | 15
-rw-r--r--  net/ipv4/xfrm4_policy.c | 142
-rw-r--r--  net/ipv4/xfrm4_state.c | 23
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 4
96 files changed, 6506 insertions, 5512 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7cd7760144f7..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
-choice
-	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
-	depends on IP_ADVANCED_ROUTER
-	default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algorithm.
-	  This improves lookup performance if you have a large
-	  number of routes.
-
-	  LC-trie is a longest matching prefix lookup algorithm which
-	  performs better than FIB_HASH for large routing tables.
-	  But, it consumes more memory and is more complex.
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
-	  June 1999
-
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
-endchoice
-
-config IP_FIB_HASH
-	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
 config IP_FIB_TRIE_STATS
 	bool "FIB TRIE statistics"
-	depends on IP_FIB_TRIE
+	depends on IP_ADVANCED_ROUTER
 	---help---
 	  Keep track of statistics on structure of FIB TRIE table.
 	  Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
 	  handled by the klogd daemon which is responsible for kernel messages
 	  ("man klogd").
 
+config IP_ROUTE_CLASSID
+	bool
+
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
 	help
@@ -215,9 +182,15 @@ config NET_IPIP
 	  be inserted in and removed from the running kernel whenever you
 	  want). Most people won't need this and can say N.
 
+config NET_IPGRE_DEMUX
+	tristate "IP: GRE demultiplexer"
+	help
+	 This is helper module to demultiplex GRE packets on GRE version field criteria.
+	 Required by ip_gre and pptp modules.
+
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
-	depends on IPV6 || IPV6=n
+	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
 	help
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -426,7 +399,9 @@ config INET_DIAG
 	---help---
 	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
 	  native Linux tools such as ss. ss is included in iproute2, currently
-	  downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
+	  downloadable at:
+
+	  http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
 
 	  If unsure, say Y.
 
@@ -556,7 +531,7 @@ config TCP_CONG_VENO
 	  distinguishing to circumvent the difficult judgment of the packet loss
 	  type. TCP Veno cuts down less congestion window in response to random
 	  loss packets.
-	  See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+	  See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
 
 config TCP_CONG_YEAH
 	tristate "YeAH TCP"
@@ -649,4 +624,3 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..f2dc69cffb57 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,16 +10,15 @@ obj-y := route.o inetpeer.o protocol.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     fib_frontend.o fib_semantics.o \
-	     inet_fragment.o
+	     fib_frontend.o fib_semantics.o fib_trie.o \
+	     inet_fragment.o ping.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6a1100c25a9f..ef1528af7abf 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -105,6 +105,7 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/udplite.h>
+#include <net/ping.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/raw.h>
@@ -153,7 +154,7 @@ void inet_sock_destruct(struct sock *sk)
 	WARN_ON(sk->sk_wmem_queued);
 	WARN_ON(sk->sk_forward_alloc);
 
-	kfree(inet->opt);
+	kfree(rcu_dereference_protected(inet->inet_opt, 1));
 	dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
 	sk_refcnt_debug_dec(sk);
 }
@@ -227,18 +228,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
 
 /*
  * inet_ehash_secret must be set exactly once
- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
  */
 void build_ehash_secret(void)
 {
 	u32 rnd;
+
 	do {
 		get_random_bytes(&rnd, sizeof(rnd));
 	} while (rnd == 0);
-	spin_lock_bh(&inetsw_lock);
-	if (!inet_ehash_secret)
-		inet_ehash_secret = rnd;
-	spin_unlock_bh(&inetsw_lock);
+
+	cmpxchg(&inet_ehash_secret, 0, rnd);
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
@@ -466,6 +465,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		goto out;
 
+	if (addr->sin_family != AF_INET) {
+		err = -EAFNOSUPPORT;
+		goto out;
+	}
+
 	chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
 
 	/* Not specified by any standard per-se, however it breaks too
@@ -674,6 +678,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	lock_sock(sk2);
 
+	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
 		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
@@ -882,6 +887,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL(inet_ioctl);
 
+#ifdef CONFIG_COMPAT
+int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	int err = -ENOIOCTLCMD;
+
+	if (sk->sk_prot->compat_ioctl)
+		err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+	return err;
+}
+#endif
+
 const struct proto_ops inet_stream_ops = {
 	.family = PF_INET,
 	.owner = THIS_MODULE,
@@ -905,6 +923,7 @@ const struct proto_ops inet_stream_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl = inet_compat_ioctl,
 #endif
 };
 EXPORT_SYMBOL(inet_stream_ops);
@@ -931,6 +950,7 @@ const struct proto_ops inet_dgram_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl = inet_compat_ioctl,
 #endif
 };
 EXPORT_SYMBOL(inet_dgram_ops);
@@ -961,6 +981,7 @@ static const struct proto_ops inet_sockraw_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl = inet_compat_ioctl,
 #endif
 };
 
@@ -994,6 +1015,14 @@ static struct inet_protosw inetsw_array[] =
 		.flags = INET_PROTOSW_PERMANENT,
 	},
 
+	{
+		.type = SOCK_DGRAM,
+		.protocol = IPPROTO_ICMP,
+		.prot = &ping_prot,
+		.ops = &inet_dgram_ops,
+		.no_check = UDP_CSUM_DEFAULT,
+		.flags = INET_PROTOSW_REUSE,
+	},
 
 	{
 		.type = SOCK_RAW,
@@ -1087,27 +1116,29 @@ int sysctl_ip_dynaddr __read_mostly;
 static int inet_sk_reselect_saddr(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int err;
-	struct rtable *rt;
 	__be32 old_saddr = inet->inet_saddr;
-	__be32 new_saddr;
 	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 new_saddr;
+	struct ip_options_rcu *inet_opt;
 
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
+	inet_opt = rcu_dereference_protected(inet->inet_opt,
+					     sock_owned_by_user(sk));
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
 
 	/* Query new route. */
-	err = ip_route_connect(&rt, daddr, 0,
-			       RT_CONN_FLAGS(sk),
-			       sk->sk_bound_dev_if,
-			       sk->sk_protocol,
-			       inet->inet_sport, inet->inet_dport, sk, 0);
-	if (err)
-		return err;
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+			      sk->sk_bound_dev_if, sk->sk_protocol,
+			      inet->inet_sport, inet->inet_dport, sk, false);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
 
 	sk_setup_caps(sk, &rt->dst);
 
-	new_saddr = rt->rt_src;
+	new_saddr = fl4->saddr;
 
 	if (new_saddr == old_saddr)
 		return 0;
@@ -1136,6 +1167,8 @@ int inet_sk_rebuild_header(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
 	__be32 daddr;
+	struct ip_options_rcu *inet_opt;
+	struct flowi4 *fl4;
 	int err;
 
 	/* Route is OK, nothing to do. */
@@ -1143,36 +1176,23 @@ int inet_sk_rebuild_header(struct sock *sk)
 		return 0;
 
 	/* Reroute. */
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
 	daddr = inet->inet_daddr;
-	if (inet->opt && inet->opt->srr)
-		daddr = inet->opt->faddr;
-{
-	struct flowi fl = {
-		.oif = sk->sk_bound_dev_if,
-		.mark = sk->sk_mark,
-		.nl_u = {
-			.ip4_u = {
-				.daddr = daddr,
-				.saddr = inet->inet_saddr,
-				.tos = RT_CONN_FLAGS(sk),
-			},
-		},
-		.proto = sk->sk_protocol,
-		.flags = inet_sk_flowi_flags(sk),
-		.uli_u = {
-			.ports = {
-				.sport = inet->inet_sport,
-				.dport = inet->inet_dport,
-			},
-		},
-	};
-
-	security_sk_classify_flow(sk, &fl);
-	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
-	if (!err)
-		sk_setup_caps(sk, &rt->dst);
-	else {
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	rcu_read_unlock();
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+				   inet->inet_dport, inet->inet_sport,
+				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_bound_dev_if);
+	if (!IS_ERR(rt)) {
+		err = 0;
+		sk_setup_caps(sk, &rt->dst);
+	} else {
+		err = PTR_ERR(rt);
+
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
 		/*
@@ -1192,7 +1212,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 
 static int inet_gso_send_check(struct sk_buff *skb)
 {
-	struct iphdr *iph;
+	const struct iphdr *iph;
 	const struct net_protocol *ops;
 	int proto;
 	int ihl;
@@ -1225,7 +1245,7 @@ out:
 	return err;
 }
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct iphdr *iph;
@@ -1299,7 +1319,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
 	const struct net_protocol *ops;
 	struct sk_buff **pp = NULL;
 	struct sk_buff *p;
-	struct iphdr *iph;
+	const struct iphdr *iph;
 	unsigned int hlen;
 	unsigned int off;
 	unsigned int id;
@@ -1522,6 +1542,7 @@ static const struct net_protocol udp_protocol = {
 
 static const struct net_protocol icmp_protocol = {
 	.handler = icmp_rcv,
+	.err_handler = ping_err,
 	.no_policy = 1,
 	.netns_ok = 1,
 };
@@ -1637,6 +1658,10 @@ static int __init inet_init(void)
 	if (rc)
 		goto out_unregister_udp_proto;
 
+	rc = proto_register(&ping_prot, 1);
+	if (rc)
+		goto out_unregister_raw_proto;
+
 	/*
 	 * Tell SOCKET that we are alive...
 	 */
@@ -1692,6 +1717,8 @@ static int __init inet_init(void)
 	/* Add UDP-Lite (RFC 3828) */
 	udplite4_register();
 
+	ping_init();
+
 	/*
 	 * Set the ICMP layer up
 	 */
@@ -1722,6 +1749,8 @@ static int __init inet_init(void)
 	rc = 0;
 out:
 	return rc;
+out_unregister_raw_proto:
+	proto_unregister(&raw_prot);
 out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
@@ -1746,11 +1775,15 @@ static int __init ipv4_proc_init(void)
 		goto out_tcp;
 	if (udp4_proc_init())
 		goto out_udp;
+	if (ping_proc_init())
+		goto out_ping;
 	if (ip_misc_proc_init())
 		goto out_misc;
out:
 	return rc;
out_misc:
+	ping_proc_exit();
+out_ping:
 	udp4_proc_exit();
out_udp:
 	tcp4_proc_exit();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 880a5ec6dce0..c1f4154552fc 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -73,7 +73,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
  * into IP header for icv calculation. Options are already checked
  * for validity, so paranoia is not required. */
 
-static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
 {
 	unsigned char * optptr = (unsigned char*)(iph+1);
 	int  l = iph->ihl*4 - sizeof(struct iphdr);
@@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
-	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
-	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
-	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
-		goto out;
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
 
 	if (!pskb_may_pull(skb, ah_hlen))
 		goto out;
@@ -314,14 +323,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb->ip_summed = CHECKSUM_NONE;
 
-	ah = (struct ip_auth_hdr *)skb->data;
-	iph = ip_hdr(skb);
-	ihl = ip_hdrlen(skb);
 
 	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
 		goto out;
 	nfrags = err;
 
+	ah = (struct ip_auth_hdr *)skb->data;
+	iph = ip_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
 	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
 	if (!work_iph)
 		goto out;
@@ -386,7 +396,7 @@ out:
 static void ah4_err(struct sk_buff *skb, u32 info)
 {
 	struct net *net = dev_net(skb->dev);
-	struct iphdr *iph = (struct iphdr *)skb->data;
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
 	struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
 	struct xfrm_state *x;
 
@@ -394,7 +404,8 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
 		return;
 
-	x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      ah->spi, IPPROTO_AH, AF_INET);
 	if (!x)
 		return;
 	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -449,8 +460,12 @@ static int ah_init_state(struct xfrm_state *x)
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
-					  ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	x->data = ahp;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96c1955b3e2f..1b74d3b64371 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
  *		Stuart Cheshire	:	Metricom and grat arp fixes
  *					*** FOR 2.1 clean this up ***
  *		Lawrence V. Stefani: (08/12/96) Added FDDI support.
  *		Alan Cox	:	Took the AP1000 nasty FDDI hack and
  *					folded into the mainstream FDDI code.
  *					Ack spit, Linus how did you allow that
  *					one in...
@@ -120,14 +120,14 @@ EXPORT_SYMBOL(clip_tbl_hook);
 #endif
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include <linux/netfilter_arp.h>
 
 /*
  *	Interface to generic neighbour cache.
  */
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
 static int arp_constructor(struct neighbour *neigh);
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
 	.queue_xmit = dev_queue_xmit,
 };
 
-const struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
 	.family = AF_INET,
 	.solicit = arp_solicit,
 	.error_report = arp_error_report,
@@ -170,35 +170,34 @@ const struct neigh_ops arp_broken_ops = {
 	.hh_output = dev_queue_xmit,
 	.queue_xmit = dev_queue_xmit,
 };
-EXPORT_SYMBOL(arp_broken_ops);
 
 struct neigh_table arp_tbl = {
 	.family = AF_INET,
 	.entry_size = sizeof(struct neighbour) + 4,
 	.key_len = 4,
 	.hash = arp_hash,
 	.constructor = arp_constructor,
 	.proxy_redo = parp_redo,
 	.id = "arp_cache",
 	.parms = {
 		.tbl = &arp_tbl,
 		.base_reachable_time = 30 * HZ,
 		.retrans_time = 1 * HZ,
 		.gc_staletime = 60 * HZ,
 		.reachable_time = 30 * HZ,
 		.delay_probe_time = 5 * HZ,
 		.queue_len = 3,
 		.ucast_probes = 3,
 		.mcast_probes = 3,
 		.anycast_delay = 1 * HZ,
 		.proxy_delay = (8 * HZ) / 10,
 		.proxy_qlen = 64,
 		.locktime = 1 * HZ,
 	},
 	.gc_interval = 30 * HZ,
 	.gc_thresh1 = 128,
 	.gc_thresh2 = 512,
 	.gc_thresh3 = 1024,
 };
 EXPORT_SYMBOL(arp_tbl);
 
@@ -216,6 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 	case ARPHRD_INFINIBAND:
 		ip_ib_mc_map(addr, dev->broadcast, haddr);
 		return 0;
+	case ARPHRD_IPGRE:
+		ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+		return 0;
 	default:
 		if (dir) {
 			memcpy(haddr, dev->broadcast, dev->addr_len);
@@ -226,14 +228,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 }
 
 
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+		    const struct net_device *dev,
+		    __u32 hash_rnd)
 {
-	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
 }
 
 static int arp_constructor(struct neighbour *neigh)
 {
-	__be32 addr = *(__be32*)neigh->primary_key;
+	__be32 addr = *(__be32 *)neigh->primary_key;
 	struct net_device *dev = neigh->dev;
 	struct in_device *in_dev;
 	struct neigh_parms *parms;
@@ -296,16 +300,19 @@ static int arp_constructor(struct neighbour *neigh)
 			neigh->ops = &arp_broken_ops;
 			neigh->output = neigh->ops->output;
 			return 0;
+#else
+			break;
 #endif
-		;}
+		}
 #endif
 	if (neigh->type == RTN_MULTICAST) {
 		neigh->nud_state = NUD_NOARP;
 		arp_mc_map(addr, neigh->ha, dev, 1);
-	} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+	} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
 		neigh->nud_state = NUD_NOARP;
 		memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
-	} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+	} else if (neigh->type == RTN_BROADCAST ||
+		   (dev->flags & IFF_POINTOPOINT)) {
 		neigh->nud_state = NUD_NOARP;
 		memcpy(neigh->ha, dev->broadcast, dev->addr_len);
 	}
@@ -315,7 +322,7 @@ static int arp_constructor(struct neighbour *neigh)
 	else
 		neigh->ops = &arp_generic_ops;
 
-	if (neigh->nud_state&NUD_VALID)
+	if (neigh->nud_state & NUD_VALID)
 		neigh->output = neigh->ops->connected_output;
 	else
 		neigh->output = neigh->ops->output;
@@ -334,7 +341,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	__be32 saddr = 0;
 	u8 *dst_ha = NULL;
 	struct net_device *dev = neigh->dev;
-	__be32 target = *(__be32*)neigh->primary_key;
+	__be32 target = *(__be32 *)neigh->primary_key;
 	int probes = atomic_read(&neigh->probes);
 	struct in_device *in_dev;
 
@@ -347,7 +354,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
 	default:
 	case 0:		/* By default announce any local IP */
-		if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL)
+		if (skb && inet_addr_type(dev_net(dev),
+					  ip_hdr(skb)->saddr) == RTN_LOCAL)
 			saddr = ip_hdr(skb)->saddr;
 		break;
 	case 1:		/* Restrict announcements of saddr in same subnet */
@@ -369,16 +377,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	if (!saddr)
 		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
 
-	if ((probes -= neigh->parms->ucast_probes) < 0) {
-		if (!(neigh->nud_state&NUD_VALID))
-			printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+	probes -= neigh->parms->ucast_probes;
+	if (probes < 0) {
+		if (!(neigh->nud_state & NUD_VALID))
+			printk(KERN_DEBUG
+			       "trying to ucast probe in NUD_INVALID\n");
 		dst_ha = neigh->ha;
 		read_lock_bh(&neigh->lock);
-	} else if ((probes -= neigh->parms->app_probes) < 0) {
+	} else {
+		probes -= neigh->parms->app_probes;
+		if (probes < 0) {
 #ifdef CONFIG_ARPD
 			neigh_app_ns(neigh);
 #endif
 			return;
+		}
 	}
 
 	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -423,14 +436,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
 
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
-	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
-						 .saddr = tip } } };
 	struct rtable *rt;
 	int flag = 0;
 	/*unsigned long now; */
 	struct net *net = dev_net(dev);
 
-	if (ip_route_output_key(net, &rt, &fl) < 0)
+	rt = ip_route_output(net, sip, tip, 0, 0);
+	if (IS_ERR(rt))
 		return 1;
 	if (rt->dst.dev != dev) {
 		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
@@ -451,7 +463,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
  * is allowed to use this function, it is scheduled to be removed. --ANK
  */
 
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
			      __be32 paddr, struct net_device *dev)
 {
 	switch (addr_hint) {
 	case RTN_LOCAL:
@@ -483,17 +496,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 
 	paddr = skb_rtable(skb)->rt_gateway;
 
-	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev))
+	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+			       paddr, dev))
 		return 0;
 
 	n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
 
 	if (n) {
 		n->used = jiffies;
-		if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
-			read_lock_bh(&n->lock);
-			memcpy(haddr, n->ha, dev->addr_len);
-			read_unlock_bh(&n->lock);
+		if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+			neigh_ha_snapshot(haddr, n, dev);
 			neigh_release(n);
 			return 0;
 		}
@@ -515,13 +527,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
 		return -EINVAL;
 	if (n == NULL) {
 		__be32 nexthop = ((struct rtable *)dst)->rt_gateway;
-		if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+		if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
 			nexthop = 0;
 		n = __neigh_lookup_errno(
 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-		    dev->type == ARPHRD_ATM ? clip_tbl_hook :
+					 dev->type == ARPHRD_ATM ?
+					 clip_tbl_hook :
 #endif
					 &arp_tbl, &nexthop, dev);
 		if (IS_ERR(n))
 			return PTR_ERR(n);
 		dst->neighbour = n;
@@ -543,8 +556,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
 
 	if (!IN_DEV_PROXY_ARP(in_dev))
 		return 0;
-
-	if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+	imi = IN_DEV_MEDIUM_ID(in_dev);
+	if (imi == 0)
 		return 1;
 	if (imi == -1)
 		return 0;
@@ -555,7 +568,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
 	if (out_dev)
 		omi = IN_DEV_MEDIUM_ID(out_dev);
 
-	return (omi != imi && omi != -1);
+	return omi != imi && omi != -1;
 }
 
 /*
@@ -685,7 +698,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	arp->ar_pln = 4;
 	arp->ar_op = htons(type);
 
-	arp_ptr=(unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 
 	memcpy(arp_ptr, src_hw, dev->addr_len);
 	arp_ptr += dev->addr_len;
@@ -735,9 +748,8 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 
 	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
			 dest_hw, src_hw, target_hw);
-	if (skb == NULL) {
+	if (skb == NULL)
 		return;
-	}
 
 	arp_xmit(skb);
 }
@@ -815,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
 /*
  *	Extract fields
  */
-	arp_ptr= (unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 	sha	= arp_ptr;
 	arp_ptr += dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
@@ -869,16 +881,17 @@ static int arp_process(struct sk_buff *skb)
 		addr_type = rt->rt_type;
 
 		if (addr_type == RTN_LOCAL) {
-			int dont_send = 0;
+			int dont_send;
 
-			if (!dont_send)
-				dont_send |= arp_ignore(in_dev,sip,tip);
+			dont_send = arp_ignore(in_dev, sip, tip);
 			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-				dont_send |= arp_filter(sip,tip,dev);
+				dont_send = arp_filter(sip, tip, dev);
 			if (!dont_send) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n) {
-					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
						 dev, tip, sha, dev->dev_addr,
						 sha);
 					neigh_release(n);
 				}
 			}
@@ -887,8 +900,7 @@ static int arp_process(struct sk_buff *skb)
 		if (addr_type == RTN_UNICAST &&
 		    (arp_fwd_proxy(in_dev, dev, rt) ||
 		     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
-		     pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
-		{
+		     pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
 			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 			if (n)
 				neigh_release(n);
@@ -896,9 +908,12 @@ static int arp_process(struct sk_buff *skb)
 			if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
 			    skb->pkt_type == PACKET_HOST ||
 			    in_dev->arp_parms->proxy_delay == 0) {
-				arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+				arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
					 dev, tip, sha, dev->dev_addr,
					 sha);
 			} else {
-				pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+				pneigh_enqueue(&arp_tbl,
					       in_dev->arp_parms, skb);
 				return 0;
 			}
 			goto out;
@@ -939,7 +954,8 @@ static int arp_process(struct sk_buff *skb)
 		if (arp->ar_op != htons(ARPOP_REPLY) ||
 		    skb->pkt_type != PACKET_HOST)
 			state = NUD_STALE;
-		neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+		neigh_update(n, sha, state,
			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
 		neigh_release(n);
 	}
 
@@ -975,7 +991,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 	    arp->ar_pln != 4)
 		goto freeskb;
 
-	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (skb == NULL)
 		goto out_of_mem;
 
 	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1018,8 +1035,8 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 	if (mask && mask != htonl(0xFFFFFFFF))
 		return -EINVAL;
 	if (!dev && (r->arp_flags & ATF_COM)) {
-		dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
+		dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
				      r->arp_ha.sa_data);
 		if (!dev)
 			return -ENODEV;
 	}
@@ -1033,7 +1050,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_set(struct net *net, struct arpreq *r,
-		       struct net_device * dev)
+		       struct net_device *dev)
 {
 	__be32 ip;
 	struct neighbour *neigh;
@@ -1046,11 +1063,10 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
@@ -1083,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 		unsigned state = NUD_STALE;
 		if (r->arp_flags & ATF_PERM)
 			state = NUD_PERMANENT;
-		err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+		err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
				   r->arp_ha.sa_data : NULL, state,
-				   NEIGH_UPDATE_F_OVERRIDE|
+				   NEIGH_UPDATE_F_OVERRIDE |
				   NEIGH_UPDATE_F_ADMIN);
 		neigh_release(neigh);
 	}
@@ -1094,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 
 static unsigned arp_state_to_flags(struct neighbour *neigh)
 {
-	unsigned flags = 0;
 	if (neigh->nud_state&NUD_PERMANENT)
-		flags = ATF_PERM|ATF_COM;
+		return ATF_PERM | ATF_COM;
 	else if (neigh->nud_state&NUD_VALID)
-		flags = ATF_COM;
-	return flags;
+		return ATF_COM;
+	else
+		return 0;
 }
 
 /*
@@ -1126,6 +1142,23 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
 	return err;
 }
 
+int arp_invalidate(struct net_device *dev, __be32 ip)
+{
+	struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	int err = -ENXIO;
+
+	if (neigh) {
+		if (neigh->nud_state & ~NUD_NOARP)
+			err = neigh_update(neigh, NULL, NUD_FAILED,
+					   NEIGH_UPDATE_F_OVERRIDE|
+					   NEIGH_UPDATE_F_ADMIN);
+		neigh_release(neigh);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(arp_invalidate);
+
 static int arp_req_delete_public(struct net *net, struct arpreq *r,
				 struct net_device *dev)
 {
@@ -1142,37 +1175,24 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_delete(struct net *net, struct arpreq *r,
-			  struct net_device * dev)
+			  struct net_device *dev)
 {
-	int err;
 	__be32 ip;
-	struct neighbour *neigh;
 
 	if (r->arp_flags & ATF_PUBL)
 		return arp_req_delete_public(net, r, dev);
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
 			return -EINVAL;
 	}
-	err = -ENXIO;
-	neigh = neigh_lookup(&arp_tbl, &ip, dev);
-	if (neigh) {
-		if (neigh->nud_state&~NUD_NOARP)
-			err = neigh_update(neigh, NULL, NUD_FAILED,
-					   NEIGH_UPDATE_F_OVERRIDE|
-					   NEIGH_UPDATE_F_ADMIN);
-		neigh_release(neigh);
-	}
-	return err;
+	return arp_invalidate(dev, ip);
 }
 
 /*
@@ -1186,24 +1206,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	struct net_device *dev = NULL;
 
 	switch (cmd) {
 	case SIOCDARP:
 	case SIOCSARP:
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
 	case SIOCGARP:
 		err = copy_from_user(&r, arg, sizeof(struct arpreq));
 		if (err)
 			return -EFAULT;
 		break;
 	default:
 		return -EINVAL;
 	}
 
 	if (r.arp_pa.sa_family != AF_INET)
 		return -EPFNOSUPPORT;
 
 	if (!(r.arp_flags & ATF_PUBL) &&
-	    (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+	    (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
 		return -EINVAL;
 	if (!(r.arp_flags & ATF_NETMASK))
 		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1211,7 +1231,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	rtnl_lock();
 	if (r.arp_dev[0]) {
 		err = -ENODEV;
-		if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL)
+		dev = __dev_get_by_name(net, r.arp_dev);
+		if (dev == NULL)
 			goto out;
 
 		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1234,16 +1255,17 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		break;
 	case SIOCGARP:
 		err = arp_req_get(&r, dev);
-		if (!err && copy_to_user(arg, &r, sizeof(r)))
-			err = -EFAULT;
 		break;
 	}
out:
 	rtnl_unlock();
+	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+		err = -EFAULT;
 	return err;
 }
 
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
			    void *ptr)
 {
 	struct net_device *dev = ptr;
 
@@ -1311,12 +1333,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	for (n = 0, s = buf; n < 6; n++) {
 		c = (a->ax25_call[n] >> 1) & 0x7F;
 
-		if (c != ' ') *s++ = c;
+		if (c != ' ')
+			*s++ = c;
 	}
 
 	*s++ = '-';
-
-	if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+	n = (a->ax25_call[6] >> 1) & 0x0F;
+	if (n > 9) {
 		*s++ = '1';
 		n -= 10;
 	}
@@ -1325,10 +1348,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	*s++ = '\0';
 
 	if (*buf == '\0' || *buf == '-')
 		return "*";
 
 	return buf;
-
 }
 #endif /* CONFIG_AX25 */
 
@@ -1408,10 +1430,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
 /* ------------------------------------------------------------------------ */
 
 static const struct seq_operations arp_seq_ops = {
 	.start = arp_seq_start,
 	.next = neigh_seq_next,
 	.stop = neigh_seq_stop,
 	.show = arp_seq_show,
 };
 
 static int arp_seq_open(struct inode *inode, struct file *file)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 3a92a76ae41d..2b3c23c287cd 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -9,7 +9,7 @@
  *
  * The CIPSO draft specification can be found in the kernel's Documentation
  * directory as well as the following URL:
- *   http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ *   http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
  * The FIPS-188 specification can be found at the following URL:
  *   http://www.itl.nist.gov/fipspubs/fip188.htm
  *
@@ -112,7 +112,7 @@ int cipso_v4_rbm_strictvalid = 1;
 /* The maximum number of category ranges permitted in the ranged category tag
  * (tag #5).  You may note that the IETF draft states that the maximum number
  * of category ranges is 7, but if the low end of the last category range is
- * zero then it is possibile to fit 8 category ranges because the zero should
+ * zero then it is possible to fit 8 category ranges because the zero should
  * be omitted. */
 #define CIPSO_V4_TAG_RNG_CAT_MAX	8
 
@@ -438,7 +438,7 @@ cache_add_failure:
  *
  * Description:
  * Search the DOI definition list for a DOI definition with a DOI value that
- * matches @doi.  The caller is responsibile for calling rcu_read_[un]lock().
+ * matches @doi.  The caller is responsible for calling rcu_read_[un]lock().
 * Returns a pointer to the DOI definition on success and NULL on failure.
 */
 static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
@@ -1293,7 +1293,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
 		return ret_val;
 
 	/* This will send packets using the "optimized" format when
-	 * possibile as specified in section 3.4.2.6 of the
+	 * possible as specified in section 3.4.2.6 of the
	 * CIPSO draft. */
 	if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
 		tag_len = 14;
@@ -1752,7 +1752,7 @@ validate_return:
 }
 
 /**
- * cipso_v4_error - Send the correct reponse for a bad packet
+ * cipso_v4_error - Send the correct response for a bad packet
 * @skb: the packet
 * @error: the error code
 * @gateway: CIPSO gateway flag
@@ -1857,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
 	return CIPSO_V4_HDR_LEN + ret_val;
 }
 
+static void opt_kfree_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct ip_options_rcu, rcu));
+}
+
 /**
 * cipso_v4_sock_setattr - Add a CIPSO option to a socket
 * @sk: the socket
@@ -1879,7 +1884,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 	unsigned char *buf = NULL;
 	u32 buf_len;
 	u32 opt_len;
-	struct ip_options *opt = NULL;
+	struct ip_options_rcu *old, *opt = NULL;
 	struct inet_sock *sk_inet;
 	struct inet_connection_sock *sk_conn;
 
@@ -1915,22 +1920,25 @@ int cipso_v4_sock_setattr(struct sock *sk,
 		ret_val = -ENOMEM;
 		goto socket_setattr_failure;
 	}
-	memcpy(opt->__data, buf, buf_len);
-	opt->optlen = opt_len;
-	opt->cipso = sizeof(struct iphdr);
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
+	opt->opt.cipso = sizeof(struct iphdr);
 	kfree(buf);
 	buf = NULL;
 
 	sk_inet = inet_sk(sk);
+
+	old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
 	if (sk_inet->is_icsk) {
 		sk_conn = inet_csk(sk);
-		if (sk_inet->opt)
-			sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen;
-		sk_conn->icsk_ext_hdr_len += opt->optlen;
+		if (old)
+			sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+		sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
 		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
 	}
-	opt = xchg(&sk_inet->opt, opt);
-	kfree(opt);
+	rcu_assign_pointer(sk_inet->inet_opt, opt);
+	if (old)
+		call_rcu(&old->rcu, opt_kfree_rcu);
 
 	return 0;
 
@@ -1960,7 +1968,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
 	unsigned char *buf = NULL;
 	u32 buf_len;
 	u32 opt_len;
-	struct ip_options *opt = NULL;
+	struct ip_options_rcu *opt = NULL;
 	struct inet_request_sock *req_inet;
 
 	/* We allocate the maximum CIPSO option size here so we are probably
@@ -1988,15 +1996,16 @@ int cipso_v4_req_setattr(struct request_sock *req,
 		ret_val = -ENOMEM;
 		goto req_setattr_failure;
 	}
-	memcpy(opt->__data, buf, buf_len);
-	opt->optlen = opt_len;
+	memcpy(opt->opt.__data, buf, buf_len);
+	opt->opt.optlen = opt_len;
1993 opt->cipso = sizeof(struct iphdr); 2001 opt->opt.cipso = sizeof(struct iphdr);
1994 kfree(buf); 2002 kfree(buf);
1995 buf = NULL; 2003 buf = NULL;
1996 2004
1997 req_inet = inet_rsk(req); 2005 req_inet = inet_rsk(req);
1998 opt = xchg(&req_inet->opt, opt); 2006 opt = xchg(&req_inet->opt, opt);
1999 kfree(opt); 2007 if (opt)
2008 call_rcu(&opt->rcu, opt_kfree_rcu);
2000 2009
2001 return 0; 2010 return 0;
2002 2011
@@ -2016,34 +2025,34 @@ req_setattr_failure:
2016 * values on failure. 2025 * values on failure.
2017 * 2026 *
2018 */ 2027 */
2019static int cipso_v4_delopt(struct ip_options **opt_ptr) 2028static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
2020{ 2029{
2021 int hdr_delta = 0; 2030 int hdr_delta = 0;
2022 struct ip_options *opt = *opt_ptr; 2031 struct ip_options_rcu *opt = *opt_ptr;
2023 2032
2024 if (opt->srr || opt->rr || opt->ts || opt->router_alert) { 2033 if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
2025 u8 cipso_len; 2034 u8 cipso_len;
2026 u8 cipso_off; 2035 u8 cipso_off;
2027 unsigned char *cipso_ptr; 2036 unsigned char *cipso_ptr;
2028 int iter; 2037 int iter;
2029 int optlen_new; 2038 int optlen_new;
2030 2039
2031 cipso_off = opt->cipso - sizeof(struct iphdr); 2040 cipso_off = opt->opt.cipso - sizeof(struct iphdr);
2032 cipso_ptr = &opt->__data[cipso_off]; 2041 cipso_ptr = &opt->opt.__data[cipso_off];
2033 cipso_len = cipso_ptr[1]; 2042 cipso_len = cipso_ptr[1];
2034 2043
2035 if (opt->srr > opt->cipso) 2044 if (opt->opt.srr > opt->opt.cipso)
2036 opt->srr -= cipso_len; 2045 opt->opt.srr -= cipso_len;
2037 if (opt->rr > opt->cipso) 2046 if (opt->opt.rr > opt->opt.cipso)
2038 opt->rr -= cipso_len; 2047 opt->opt.rr -= cipso_len;
2039 if (opt->ts > opt->cipso) 2048 if (opt->opt.ts > opt->opt.cipso)
2040 opt->ts -= cipso_len; 2049 opt->opt.ts -= cipso_len;
2041 if (opt->router_alert > opt->cipso) 2050 if (opt->opt.router_alert > opt->opt.cipso)
2042 opt->router_alert -= cipso_len; 2051 opt->opt.router_alert -= cipso_len;
2043 opt->cipso = 0; 2052 opt->opt.cipso = 0;
2044 2053
2045 memmove(cipso_ptr, cipso_ptr + cipso_len, 2054 memmove(cipso_ptr, cipso_ptr + cipso_len,
2046 opt->optlen - cipso_off - cipso_len); 2055 opt->opt.optlen - cipso_off - cipso_len);
2047 2056
2048 /* determining the new total option length is tricky because of 2057 /* determining the new total option length is tricky because of
2049 * the padding necessary, the only thing i can think to do at 2058 * the padding necessary, the only thing i can think to do at
@@ -2052,21 +2061,21 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
2052 * from there we can determine the new total option length */ 2061 * from there we can determine the new total option length */
2053 iter = 0; 2062 iter = 0;
2054 optlen_new = 0; 2063 optlen_new = 0;
2055 while (iter < opt->optlen) 2064 while (iter < opt->opt.optlen)
2056 if (opt->__data[iter] != IPOPT_NOP) { 2065 if (opt->opt.__data[iter] != IPOPT_NOP) {
2057 iter += opt->__data[iter + 1]; 2066 iter += opt->opt.__data[iter + 1];
2058 optlen_new = iter; 2067 optlen_new = iter;
2059 } else 2068 } else
2060 iter++; 2069 iter++;
2061 hdr_delta = opt->optlen; 2070 hdr_delta = opt->opt.optlen;
2062 opt->optlen = (optlen_new + 3) & ~3; 2071 opt->opt.optlen = (optlen_new + 3) & ~3;
2063 hdr_delta -= opt->optlen; 2072 hdr_delta -= opt->opt.optlen;
2064 } else { 2073 } else {
2065 /* only the cipso option was present on the socket so we can 2074 /* only the cipso option was present on the socket so we can
2066 * remove the entire option struct */ 2075 * remove the entire option struct */
2067 *opt_ptr = NULL; 2076 *opt_ptr = NULL;
2068 hdr_delta = opt->optlen; 2077 hdr_delta = opt->opt.optlen;
2069 kfree(opt); 2078 call_rcu(&opt->rcu, opt_kfree_rcu);
2070 } 2079 }
2071 2080
2072 return hdr_delta; 2081 return hdr_delta;
@@ -2083,15 +2092,15 @@ static int cipso_v4_delopt(struct ip_options **opt_ptr)
2083void cipso_v4_sock_delattr(struct sock *sk) 2092void cipso_v4_sock_delattr(struct sock *sk)
2084{ 2093{
2085 int hdr_delta; 2094 int hdr_delta;
2086 struct ip_options *opt; 2095 struct ip_options_rcu *opt;
2087 struct inet_sock *sk_inet; 2096 struct inet_sock *sk_inet;
2088 2097
2089 sk_inet = inet_sk(sk); 2098 sk_inet = inet_sk(sk);
2090 opt = sk_inet->opt; 2099 opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
2091 if (opt == NULL || opt->cipso == 0) 2100 if (opt == NULL || opt->opt.cipso == 0)
2092 return; 2101 return;
2093 2102
2094 hdr_delta = cipso_v4_delopt(&sk_inet->opt); 2103 hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
2095 if (sk_inet->is_icsk && hdr_delta > 0) { 2104 if (sk_inet->is_icsk && hdr_delta > 0) {
2096 struct inet_connection_sock *sk_conn = inet_csk(sk); 2105 struct inet_connection_sock *sk_conn = inet_csk(sk);
2097 sk_conn->icsk_ext_hdr_len -= hdr_delta; 2106 sk_conn->icsk_ext_hdr_len -= hdr_delta;
@@ -2109,12 +2118,12 @@ void cipso_v4_sock_delattr(struct sock *sk)
2109 */ 2118 */
2110void cipso_v4_req_delattr(struct request_sock *req) 2119void cipso_v4_req_delattr(struct request_sock *req)
2111{ 2120{
2112 struct ip_options *opt; 2121 struct ip_options_rcu *opt;
2113 struct inet_request_sock *req_inet; 2122 struct inet_request_sock *req_inet;
2114 2123
2115 req_inet = inet_rsk(req); 2124 req_inet = inet_rsk(req);
2116 opt = req_inet->opt; 2125 opt = req_inet->opt;
2117 if (opt == NULL || opt->cipso == 0) 2126 if (opt == NULL || opt->opt.cipso == 0)
2118 return; 2127 return;
2119 2128
2120 cipso_v4_delopt(&req_inet->opt); 2129 cipso_v4_delopt(&req_inet->opt);
@@ -2184,14 +2193,18 @@ getattr_return:
2184 */ 2193 */
2185int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) 2194int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
2186{ 2195{
2187 struct ip_options *opt; 2196 struct ip_options_rcu *opt;
2197 int res = -ENOMSG;
2188 2198
2189 opt = inet_sk(sk)->opt; 2199 rcu_read_lock();
2190 if (opt == NULL || opt->cipso == 0) 2200 opt = rcu_dereference(inet_sk(sk)->inet_opt);
2191 return -ENOMSG; 2201 if (opt && opt->opt.cipso)
2192 2202 res = cipso_v4_getattr(opt->opt.__data +
2193 return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), 2203 opt->opt.cipso -
2194 secattr); 2204 sizeof(struct iphdr),
2205 secattr);
2206 rcu_read_unlock();
2207 return res;
2195} 2208}
2196 2209
2197/** 2210/**
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 721a8a37b45c..424fafbc8cb0 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -24,6 +24,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
24{ 24{
25 struct inet_sock *inet = inet_sk(sk); 25 struct inet_sock *inet = inet_sk(sk);
26 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; 26 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
27 struct flowi4 *fl4;
27 struct rtable *rt; 28 struct rtable *rt;
28 __be32 saddr; 29 __be32 saddr;
29 int oif; 30 int oif;
@@ -38,6 +39,8 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
38 39
39 sk_dst_reset(sk); 40 sk_dst_reset(sk);
40 41
42 lock_sock(sk);
43
41 oif = sk->sk_bound_dev_if; 44 oif = sk->sk_bound_dev_if;
42 saddr = inet->inet_saddr; 45 saddr = inet->inet_saddr;
43 if (ipv4_is_multicast(usin->sin_addr.s_addr)) { 46 if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
@@ -46,33 +49,39 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
46 if (!saddr) 49 if (!saddr)
47 saddr = inet->mc_addr; 50 saddr = inet->mc_addr;
48 } 51 }
49 err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, 52 fl4 = &inet->cork.fl.u.ip4;
50 RT_CONN_FLAGS(sk), oif, 53 rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
51 sk->sk_protocol, 54 RT_CONN_FLAGS(sk), oif,
52 inet->inet_sport, usin->sin_port, sk, 1); 55 sk->sk_protocol,
53 if (err) { 56 inet->inet_sport, usin->sin_port, sk, true);
57 if (IS_ERR(rt)) {
58 err = PTR_ERR(rt);
54 if (err == -ENETUNREACH) 59 if (err == -ENETUNREACH)
55 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 60 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
56 return err; 61 goto out;
57 } 62 }
58 63
59 if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { 64 if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
60 ip_rt_put(rt); 65 ip_rt_put(rt);
61 return -EACCES; 66 err = -EACCES;
67 goto out;
62 } 68 }
63 if (!inet->inet_saddr) 69 if (!inet->inet_saddr)
64 inet->inet_saddr = rt->rt_src; /* Update source address */ 70 inet->inet_saddr = fl4->saddr; /* Update source address */
65 if (!inet->inet_rcv_saddr) { 71 if (!inet->inet_rcv_saddr) {
66 inet->inet_rcv_saddr = rt->rt_src; 72 inet->inet_rcv_saddr = fl4->saddr;
67 if (sk->sk_prot->rehash) 73 if (sk->sk_prot->rehash)
68 sk->sk_prot->rehash(sk); 74 sk->sk_prot->rehash(sk);
69 } 75 }
70 inet->inet_daddr = rt->rt_dst; 76 inet->inet_daddr = fl4->daddr;
71 inet->inet_dport = usin->sin_port; 77 inet->inet_dport = usin->sin_port;
72 sk->sk_state = TCP_ESTABLISHED; 78 sk->sk_state = TCP_ESTABLISHED;
73 inet->inet_id = jiffies; 79 inet->inet_id = jiffies;
74 80
75 sk_dst_set(sk, &rt->dst); 81 sk_dst_set(sk, &rt->dst);
76 return(0); 82 err = 0;
83out:
84 release_sock(sk);
85 return err;
77} 86}
78EXPORT_SYMBOL(ip4_datagram_connect); 87EXPORT_SYMBOL(ip4_datagram_connect);
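
ip4_datagram_connect() now receives the route directly from ip_route_connect() as an ERR_PTR-encoded pointer rather than through an output parameter, so errors are detected with IS_ERR()/PTR_ERR() before the single release_sock() exit path. The sketch below reproduces that pointer-encoding convention in userspace; route_connect_demo() and struct rtable_demo are invented, and the macros follow the same MAX_ERRNO convention as include/linux/err.h.

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)	{ return IS_ERR_VALUE((unsigned long)ptr); }

struct rtable_demo { unsigned int flags; };

/* Pretend route lookup: one reachable destination, errors for the rest. */
static struct rtable_demo *route_connect_demo(unsigned int daddr)
{
	static struct rtable_demo rt = { .flags = 0x1 };

	if (daddr == 0x7f000001)	/* 127.0.0.1 */
		return &rt;
	return ERR_PTR(-ENETUNREACH);	/* error code hidden inside the pointer */
}

int main(void)
{
	struct rtable_demo *rt = route_connect_demo(0x0a000001);

	if (IS_ERR(rt))
		printf("lookup failed: err=%ld\n", PTR_ERR(rt));
	else
		printf("lookup ok, flags=%#x\n", rt->flags);
	return 0;
}
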
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f4..0d4a184af16f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
51#include <linux/inetdevice.h> 51#include <linux/inetdevice.h>
52#include <linux/igmp.h> 52#include <linux/igmp.h>
53#include <linux/slab.h> 53#include <linux/slab.h>
54#include <linux/hash.h>
54#ifdef CONFIG_SYSCTL 55#ifdef CONFIG_SYSCTL
55#include <linux/sysctl.h> 56#include <linux/sysctl.h>
56#endif 57#endif
@@ -63,6 +64,8 @@
63#include <net/rtnetlink.h> 64#include <net/rtnetlink.h>
64#include <net/net_namespace.h> 65#include <net/net_namespace.h>
65 66
67#include "fib_lookup.h"
68
66static struct ipv4_devconf ipv4_devconf = { 69static struct ipv4_devconf ipv4_devconf = {
67 .data = { 70 .data = {
68 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, 71 [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
@@ -92,6 +95,85 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
92 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, 95 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
93}; 96};
94 97
98/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
99 * value. So if you change this define, make appropriate changes to
100 * inet_addr_hash as well.
101 */
102#define IN4_ADDR_HSIZE 256
103static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
104static DEFINE_SPINLOCK(inet_addr_hash_lock);
105
106static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
107{
108 u32 val = (__force u32) addr ^ hash_ptr(net, 8);
109
110 return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
111 (IN4_ADDR_HSIZE - 1));
112}
113
114static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
115{
116 unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
117
118 spin_lock(&inet_addr_hash_lock);
119 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
120 spin_unlock(&inet_addr_hash_lock);
121}
122
123static void inet_hash_remove(struct in_ifaddr *ifa)
124{
125 spin_lock(&inet_addr_hash_lock);
126 hlist_del_init_rcu(&ifa->hash);
127 spin_unlock(&inet_addr_hash_lock);
128}
129
130/**
131 * __ip_dev_find - find the first device with a given source address.
132 * @net: the net namespace
133 * @addr: the source address
134 * @devref: if true, take a reference on the found device
135 *
136 * If a caller uses devref=false, it should be protected by RCU, or RTNL
137 */
138struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
139{
140 unsigned int hash = inet_addr_hash(net, addr);
141 struct net_device *result = NULL;
142 struct in_ifaddr *ifa;
143 struct hlist_node *node;
144
145 rcu_read_lock();
146 hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
147 struct net_device *dev = ifa->ifa_dev->dev;
148
149 if (!net_eq(dev_net(dev), net))
150 continue;
151 if (ifa->ifa_local == addr) {
152 result = dev;
153 break;
154 }
155 }
156 if (!result) {
157 struct flowi4 fl4 = { .daddr = addr };
158 struct fib_result res = { 0 };
159 struct fib_table *local;
160
161 /* Fallback to FIB local table so that communication
162 * over loopback subnets work.
163 */
164 local = fib_get_table(net, RT_TABLE_LOCAL);
165 if (local &&
166 !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
167 res.type == RTN_LOCAL)
168 result = FIB_RES_DEV(res);
169 }
170 if (result && devref)
171 dev_hold(result);
172 rcu_read_unlock();
173 return result;
174}
175EXPORT_SYMBOL(__ip_dev_find);
176
95static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); 177static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
96 178
97static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); 179static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
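
The new inet_addr_lst[] table above lets __ip_dev_find() resolve a local address to its device with a single hash-bucket walk instead of a full FIB lookup. Below is a small userspace sketch of the same idea: fold the four address bytes into one of 256 buckets and walk the chain. The namespace mix-in (hash_ptr(net, 8)), the RCU protection and the FIB local-table fallback are omitted, and struct demo_ifa plus the helper names are invented for the demo.

#include <stdio.h>
#include <stdint.h>

#define IN4_ADDR_HSIZE 256

struct demo_ifa {
	uint32_t local;			/* address, host byte order for simplicity */
	const char *dev;		/* owning device name */
	struct demo_ifa *next;		/* hash-chain link */
};

static struct demo_ifa *addr_hash[IN4_ADDR_HSIZE];

static unsigned int demo_addr_hash(uint32_t addr)
{
	uint32_t val = addr;

	/* fold all four bytes into one bucket index, as inet_addr_hash() does */
	return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & (IN4_ADDR_HSIZE - 1);
}

static void demo_hash_insert(struct demo_ifa *ifa)
{
	unsigned int h = demo_addr_hash(ifa->local);

	ifa->next = addr_hash[h];
	addr_hash[h] = ifa;
}

static const char *demo_dev_find(uint32_t addr)
{
	struct demo_ifa *ifa;

	for (ifa = addr_hash[demo_addr_hash(addr)]; ifa; ifa = ifa->next)
		if (ifa->local == addr)
			return ifa->dev;
	return NULL;	/* __ip_dev_find() would now fall back to the FIB local table */
}

int main(void)
{
	struct demo_ifa lo  = { .local = 0x7f000001, .dev = "lo" };
	struct demo_ifa eth = { .local = 0xc0a80001, .dev = "eth0" };

	demo_hash_insert(&lo);
	demo_hash_insert(&eth);
	printf("192.168.0.1 -> %s\n", demo_dev_find(0xc0a80001));
	return 0;
}
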
@@ -209,7 +291,7 @@ static void inetdev_destroy(struct in_device *in_dev)
209 inet_free_ifa(ifa); 291 inet_free_ifa(ifa);
210 } 292 }
211 293
212 dev->ip_ptr = NULL; 294 rcu_assign_pointer(dev->ip_ptr, NULL);
213 295
214 devinet_sysctl_unregister(in_dev); 296 devinet_sysctl_unregister(in_dev);
215 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 297 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -265,6 +347,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
265 } 347 }
266 348
267 if (!do_promote) { 349 if (!do_promote) {
350 inet_hash_remove(ifa);
268 *ifap1 = ifa->ifa_next; 351 *ifap1 = ifa->ifa_next;
269 352
270 rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); 353 rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
278 } 361 }
279 } 362 }
280 363
364 /* On promotion all secondaries from subnet are changing
365 * the primary IP, we must remove all their routes silently
366 * and later to add them back with new prefsrc. Do this
367 * while all addresses are on the device list.
368 */
369 for (ifa = promote; ifa; ifa = ifa->ifa_next) {
370 if (ifa1->ifa_mask == ifa->ifa_mask &&
371 inet_ifa_match(ifa1->ifa_address, ifa))
372 fib_del_ifaddr(ifa, ifa1);
373 }
374
281 /* 2. Unlink it */ 375 /* 2. Unlink it */
282 376
283 *ifap = ifa1->ifa_next; 377 *ifap = ifa1->ifa_next;
378 inet_hash_remove(ifa1);
284 379
285 /* 3. Announce address deletion */ 380 /* 3. Announce address deletion */
286 381
@@ -296,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
296 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); 391 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
297 392
298 if (promote) { 393 if (promote) {
394 struct in_ifaddr *next_sec = promote->ifa_next;
299 395
300 if (prev_prom) { 396 if (prev_prom) {
301 prev_prom->ifa_next = promote->ifa_next; 397 prev_prom->ifa_next = promote->ifa_next;
@@ -307,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
307 rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid); 403 rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
308 blocking_notifier_call_chain(&inetaddr_chain, 404 blocking_notifier_call_chain(&inetaddr_chain,
309 NETDEV_UP, promote); 405 NETDEV_UP, promote);
310 for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { 406 for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
311 if (ifa1->ifa_mask != ifa->ifa_mask || 407 if (ifa1->ifa_mask != ifa->ifa_mask ||
312 !inet_ifa_match(ifa1->ifa_address, ifa)) 408 !inet_ifa_match(ifa1->ifa_address, ifa))
313 continue; 409 continue;
@@ -368,6 +464,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
368 ifa->ifa_next = *ifap; 464 ifa->ifa_next = *ifap;
369 *ifap = ifa; 465 *ifap = ifa;
370 466
467 inet_hash_insert(dev_net(in_dev->dev), ifa);
468
371 /* Send message first, then call notifier. 469 /* Send message first, then call notifier.
372 Notifier will trigger FIB update, so that 470 Notifier will trigger FIB update, so that
373 listeners of netlink will know about new ifaddr */ 471 listeners of netlink will know about new ifaddr */
@@ -403,6 +501,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
403 return inet_insert_ifa(ifa); 501 return inet_insert_ifa(ifa);
404} 502}
405 503
504/* Caller must hold RCU or RTNL :
505 * We dont take a reference on found in_device
506 */
406struct in_device *inetdev_by_index(struct net *net, int ifindex) 507struct in_device *inetdev_by_index(struct net *net, int ifindex)
407{ 508{
408 struct net_device *dev; 509 struct net_device *dev;
@@ -411,7 +512,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
411 rcu_read_lock(); 512 rcu_read_lock();
412 dev = dev_get_by_index_rcu(net, ifindex); 513 dev = dev_get_by_index_rcu(net, ifindex);
413 if (dev) 514 if (dev)
414 in_dev = in_dev_get(dev); 515 in_dev = rcu_dereference_rtnl(dev->ip_ptr);
415 rcu_read_unlock(); 516 rcu_read_unlock();
416 return in_dev; 517 return in_dev;
417} 518}
@@ -453,8 +554,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
453 goto errout; 554 goto errout;
454 } 555 }
455 556
456 __in_dev_put(in_dev);
457
458 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; 557 for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
459 ifap = &ifa->ifa_next) { 558 ifap = &ifa->ifa_next) {
460 if (tb[IFA_LOCAL] && 559 if (tb[IFA_LOCAL] &&
@@ -520,6 +619,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
520 if (tb[IFA_ADDRESS] == NULL) 619 if (tb[IFA_ADDRESS] == NULL)
521 tb[IFA_ADDRESS] = tb[IFA_LOCAL]; 620 tb[IFA_ADDRESS] = tb[IFA_LOCAL];
522 621
622 INIT_HLIST_NODE(&ifa->hash);
523 ifa->ifa_prefixlen = ifm->ifa_prefixlen; 623 ifa->ifa_prefixlen = ifm->ifa_prefixlen;
524 ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); 624 ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
525 ifa->ifa_flags = ifm->ifa_flags; 625 ifa->ifa_flags = ifm->ifa_flags;
@@ -669,7 +769,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
669 ifap = &ifa->ifa_next) { 769 ifap = &ifa->ifa_next) {
670 if (!strcmp(ifr.ifr_name, ifa->ifa_label) && 770 if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
671 sin_orig.sin_addr.s_addr == 771 sin_orig.sin_addr.s_addr ==
672 ifa->ifa_address) { 772 ifa->ifa_local) {
673 break; /* found */ 773 break; /* found */
674 } 774 }
675 } 775 }
@@ -727,6 +827,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
727 if (!ifa) { 827 if (!ifa) {
728 ret = -ENOBUFS; 828 ret = -ENOBUFS;
729 ifa = inet_alloc_ifa(); 829 ifa = inet_alloc_ifa();
830 INIT_HLIST_NODE(&ifa->hash);
730 if (!ifa) 831 if (!ifa)
731 break; 832 break;
732 if (colon) 833 if (colon)
@@ -1029,6 +1130,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu)
1029 return mtu >= 68; 1130 return mtu >= 68;
1030} 1131}
1031 1132
1133static void inetdev_send_gratuitous_arp(struct net_device *dev,
1134 struct in_device *in_dev)
1135
1136{
1137 struct in_ifaddr *ifa = in_dev->ifa_list;
1138
1139 if (!ifa)
1140 return;
1141
1142 arp_send(ARPOP_REQUEST, ETH_P_ARP,
1143 ifa->ifa_local, dev,
1144 ifa->ifa_local, NULL,
1145 dev->dev_addr, NULL);
1146}
1147
1032/* Called only under RTNL semaphore */ 1148/* Called only under RTNL semaphore */
1033 1149
1034static int inetdev_event(struct notifier_block *this, unsigned long event, 1150static int inetdev_event(struct notifier_block *this, unsigned long event,
@@ -1059,7 +1175,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1059 switch (event) { 1175 switch (event) {
1060 case NETDEV_REGISTER: 1176 case NETDEV_REGISTER:
1061 printk(KERN_DEBUG "inetdev_event: bug\n"); 1177 printk(KERN_DEBUG "inetdev_event: bug\n");
1062 dev->ip_ptr = NULL; 1178 rcu_assign_pointer(dev->ip_ptr, NULL);
1063 break; 1179 break;
1064 case NETDEV_UP: 1180 case NETDEV_UP:
1065 if (!inetdev_valid_mtu(dev->mtu)) 1181 if (!inetdev_valid_mtu(dev->mtu))
@@ -1068,6 +1184,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1068 struct in_ifaddr *ifa = inet_alloc_ifa(); 1184 struct in_ifaddr *ifa = inet_alloc_ifa();
1069 1185
1070 if (ifa) { 1186 if (ifa) {
1187 INIT_HLIST_NODE(&ifa->hash);
1071 ifa->ifa_local = 1188 ifa->ifa_local =
1072 ifa->ifa_address = htonl(INADDR_LOOPBACK); 1189 ifa->ifa_address = htonl(INADDR_LOOPBACK);
1073 ifa->ifa_prefixlen = 8; 1190 ifa->ifa_prefixlen = 8;
@@ -1081,18 +1198,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1081 } 1198 }
1082 ip_mc_up(in_dev); 1199 ip_mc_up(in_dev);
1083 /* fall through */ 1200 /* fall through */
1084 case NETDEV_NOTIFY_PEERS:
1085 case NETDEV_CHANGEADDR: 1201 case NETDEV_CHANGEADDR:
1202 if (!IN_DEV_ARP_NOTIFY(in_dev))
1203 break;
1204 /* fall through */
1205 case NETDEV_NOTIFY_PEERS:
1086 /* Send gratuitous ARP to notify of link change */ 1206 /* Send gratuitous ARP to notify of link change */
1087 if (IN_DEV_ARP_NOTIFY(in_dev)) { 1207 inetdev_send_gratuitous_arp(dev, in_dev);
1088 struct in_ifaddr *ifa = in_dev->ifa_list;
1089
1090 if (ifa)
1091 arp_send(ARPOP_REQUEST, ETH_P_ARP,
1092 ifa->ifa_address, dev,
1093 ifa->ifa_address, NULL,
1094 dev->dev_addr, NULL);
1095 }
1096 break; 1208 break;
1097 case NETDEV_DOWN: 1209 case NETDEV_DOWN:
1098 ip_mc_down(in_dev); 1210 ip_mc_down(in_dev);
@@ -1255,6 +1367,87 @@ errout:
1255 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); 1367 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
1256} 1368}
1257 1369
1370static size_t inet_get_link_af_size(const struct net_device *dev)
1371{
1372 struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
1373
1374 if (!in_dev)
1375 return 0;
1376
1377 return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
1378}
1379
1380static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
1381{
1382 struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
1383 struct nlattr *nla;
1384 int i;
1385
1386 if (!in_dev)
1387 return -ENODATA;
1388
1389 nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
1390 if (nla == NULL)
1391 return -EMSGSIZE;
1392
1393 for (i = 0; i < IPV4_DEVCONF_MAX; i++)
1394 ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
1395
1396 return 0;
1397}
1398
1399static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
1400 [IFLA_INET_CONF] = { .type = NLA_NESTED },
1401};
1402
1403static int inet_validate_link_af(const struct net_device *dev,
1404 const struct nlattr *nla)
1405{
1406 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1407 int err, rem;
1408
1409 if (dev && !__in_dev_get_rtnl(dev))
1410 return -EAFNOSUPPORT;
1411
1412 err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
1413 if (err < 0)
1414 return err;
1415
1416 if (tb[IFLA_INET_CONF]) {
1417 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) {
1418 int cfgid = nla_type(a);
1419
1420 if (nla_len(a) < 4)
1421 return -EINVAL;
1422
1423 if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
1424 return -EINVAL;
1425 }
1426 }
1427
1428 return 0;
1429}
1430
1431static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
1432{
1433 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1434 struct nlattr *a, *tb[IFLA_INET_MAX+1];
1435 int rem;
1436
1437 if (!in_dev)
1438 return -EAFNOSUPPORT;
1439
1440 if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0)
1441 BUG();
1442
1443 if (tb[IFLA_INET_CONF]) {
1444 nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
1445 ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a));
1446 }
1447
1448 return 0;
1449}
1450
1258#ifdef CONFIG_SYSCTL 1451#ifdef CONFIG_SYSCTL
1259 1452
1260static void devinet_copy_dflt_conf(struct net *net, int i) 1453static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1348,9 +1541,9 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
1348 return ret; 1541 return ret;
1349} 1542}
1350 1543
1351int ipv4_doint_and_flush(ctl_table *ctl, int write, 1544static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1352 void __user *buffer, 1545 void __user *buffer,
1353 size_t *lenp, loff_t *ppos) 1546 size_t *lenp, loff_t *ppos)
1354{ 1547{
1355 int *valp = ctl->data; 1548 int *valp = ctl->data;
1356 int val = *valp; 1549 int val = *valp;
@@ -1487,7 +1680,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
1487 return; 1680 return;
1488 1681
1489 cnf->sysctl = NULL; 1682 cnf->sysctl = NULL;
1490 unregister_sysctl_table(t->sysctl_header); 1683 unregister_net_sysctl_table(t->sysctl_header);
1491 kfree(t->dev_name); 1684 kfree(t->dev_name);
1492 kfree(t); 1685 kfree(t);
1493} 1686}
@@ -1618,13 +1811,28 @@ static __net_initdata struct pernet_operations devinet_ops = {
1618 .exit = devinet_exit_net, 1811 .exit = devinet_exit_net,
1619}; 1812};
1620 1813
1814static struct rtnl_af_ops inet_af_ops = {
1815 .family = AF_INET,
1816 .fill_link_af = inet_fill_link_af,
1817 .get_link_af_size = inet_get_link_af_size,
1818 .validate_link_af = inet_validate_link_af,
1819 .set_link_af = inet_set_link_af,
1820};
1821
1621void __init devinet_init(void) 1822void __init devinet_init(void)
1622{ 1823{
1824 int i;
1825
1826 for (i = 0; i < IN4_ADDR_HSIZE; i++)
1827 INIT_HLIST_HEAD(&inet_addr_lst[i]);
1828
1623 register_pernet_subsys(&devinet_ops); 1829 register_pernet_subsys(&devinet_ops);
1624 1830
1625 register_gifconf(PF_INET, inet_gifconf); 1831 register_gifconf(PF_INET, inet_gifconf);
1626 register_netdevice_notifier(&ip_netdev_notifier); 1832 register_netdevice_notifier(&ip_netdev_notifier);
1627 1833
1834 rtnl_af_register(&inet_af_ops);
1835
1628 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL); 1836 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL);
1629 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL); 1837 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL);
1630 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); 1838 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
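
The rtnl_af_ops hooks registered above dump and apply per-device IPv4 settings as a flat array of u32 values inside IFLA_INET_CONF, indexed by their 1-based devconf id. The userspace sketch below shows just that packing and bounds check; DEMO_DEVCONF_MAX and the helper names are stand-ins, and all netlink attribute handling is left out.

#include <stdio.h>
#include <stdint.h>

#define DEMO_DEVCONF_MAX 3		/* stand-in for IPV4_DEVCONF_MAX */

static uint32_t devconf[DEMO_DEVCONF_MAX];	/* cnf.data[] equivalent */

/* inet_validate_link_af() rejects ids outside 1..IPV4_DEVCONF_MAX;
 * inet_set_link_af() then stores by the 1-based attribute type. */
static int demo_devconf_set(int cfgid, uint32_t val)
{
	if (cfgid <= 0 || cfgid > DEMO_DEVCONF_MAX)
		return -1;
	devconf[cfgid - 1] = val;
	return 0;
}

/* inet_fill_link_af() equivalent: dump every slot into one flat buffer. */
static void demo_devconf_fill(uint32_t *out)
{
	int i;

	for (i = 0; i < DEMO_DEVCONF_MAX; i++)
		out[i] = devconf[i];
}

int main(void)
{
	uint32_t dump[DEMO_DEVCONF_MAX];
	int i;

	demo_devconf_set(1, 1);		/* e.g. turn a boolean option on */
	demo_devconf_set(4, 1);		/* out of range: rejected, nothing stored */
	demo_devconf_fill(dump);
	for (i = 0; i < DEMO_DEVCONF_MAX; i++)
		printf("slot %d = %u\n", i + 1, (unsigned)dump[i]);
	return 0;
}
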
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 14ca1f1c3fb0..a5b413416da3 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -23,6 +23,8 @@ struct esp_skb_cb {
23 23
24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) 24#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
25 25
26static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
27
26/* 28/*
27 * Allocate an AEAD request structure with extra space for SG and IV. 29 * Allocate an AEAD request structure with extra space for SG and IV.
28 * 30 *
@@ -31,11 +33,14 @@ struct esp_skb_cb {
31 * 33 *
32 * TODO: Use spare space in skb for this where possible. 34 * TODO: Use spare space in skb for this where possible.
33 */ 35 */
34static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) 36static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
35{ 37{
36 unsigned int len; 38 unsigned int len;
37 39
38 len = crypto_aead_ivsize(aead); 40 len = seqhilen;
41
42 len += crypto_aead_ivsize(aead);
43
39 if (len) { 44 if (len) {
40 len += crypto_aead_alignmask(aead) & 45 len += crypto_aead_alignmask(aead) &
41 ~(crypto_tfm_ctx_alignment() - 1); 46 ~(crypto_tfm_ctx_alignment() - 1);
@@ -50,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
50 return kmalloc(len, GFP_ATOMIC); 55 return kmalloc(len, GFP_ATOMIC);
51} 56}
52 57
53static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) 58static inline __be32 *esp_tmp_seqhi(void *tmp)
59{
60 return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
61}
62static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
54{ 63{
55 return crypto_aead_ivsize(aead) ? 64 return crypto_aead_ivsize(aead) ?
56 PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; 65 PTR_ALIGN((u8 *)tmp + seqhilen,
66 crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
57} 67}
58 68
59static inline struct aead_givcrypt_request *esp_tmp_givreq( 69static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -117,46 +127,75 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
117 int blksize; 127 int blksize;
118 int clen; 128 int clen;
119 int alen; 129 int alen;
130 int plen;
131 int tfclen;
120 int nfrags; 132 int nfrags;
133 int assoclen;
134 int sglists;
135 int seqhilen;
136 __be32 *seqhi;
121 137
122 /* skb is pure payload to encrypt */ 138 /* skb is pure payload to encrypt */
123 139
124 err = -ENOMEM; 140 err = -ENOMEM;
125 141
126 /* Round to block size */
127 clen = skb->len;
128
129 esp = x->data; 142 esp = x->data;
130 aead = esp->aead; 143 aead = esp->aead;
131 alen = crypto_aead_authsize(aead); 144 alen = crypto_aead_authsize(aead);
132 145
146 tfclen = 0;
147 if (x->tfcpad) {
148 struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
149 u32 padto;
150
151 padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
152 if (skb->len < padto)
153 tfclen = padto - skb->len;
154 }
133 blksize = ALIGN(crypto_aead_blocksize(aead), 4); 155 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
134 clen = ALIGN(clen + 2, blksize); 156 clen = ALIGN(skb->len + 2 + tfclen, blksize);
135 if (esp->padlen) 157 if (esp->padlen)
136 clen = ALIGN(clen, esp->padlen); 158 clen = ALIGN(clen, esp->padlen);
159 plen = clen - skb->len - tfclen;
137 160
138 if ((err = skb_cow_data(skb, clen - skb->len + alen, &trailer)) < 0) 161 err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
162 if (err < 0)
139 goto error; 163 goto error;
140 nfrags = err; 164 nfrags = err;
141 165
142 tmp = esp_alloc_tmp(aead, nfrags + 1); 166 assoclen = sizeof(*esph);
167 sglists = 1;
168 seqhilen = 0;
169
170 if (x->props.flags & XFRM_STATE_ESN) {
171 sglists += 2;
172 seqhilen += sizeof(__be32);
173 assoclen += seqhilen;
174 }
175
176 tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
143 if (!tmp) 177 if (!tmp)
144 goto error; 178 goto error;
145 179
146 iv = esp_tmp_iv(aead, tmp); 180 seqhi = esp_tmp_seqhi(tmp);
181 iv = esp_tmp_iv(aead, tmp, seqhilen);
147 req = esp_tmp_givreq(aead, iv); 182 req = esp_tmp_givreq(aead, iv);
148 asg = esp_givreq_sg(aead, req); 183 asg = esp_givreq_sg(aead, req);
149 sg = asg + 1; 184 sg = asg + sglists;
150 185
151 /* Fill padding... */ 186 /* Fill padding... */
152 tail = skb_tail_pointer(trailer); 187 tail = skb_tail_pointer(trailer);
188 if (tfclen) {
189 memset(tail, 0, tfclen);
190 tail += tfclen;
191 }
153 do { 192 do {
154 int i; 193 int i;
155 for (i=0; i<clen-skb->len - 2; i++) 194 for (i = 0; i < plen - 2; i++)
156 tail[i] = i + 1; 195 tail[i] = i + 1;
157 } while (0); 196 } while (0);
158 tail[clen - skb->len - 2] = (clen - skb->len) - 2; 197 tail[plen - 2] = plen - 2;
159 tail[clen - skb->len - 1] = *skb_mac_header(skb); 198 tail[plen - 1] = *skb_mac_header(skb);
160 pskb_put(skb, trailer, clen - skb->len + alen); 199 pskb_put(skb, trailer, clen - skb->len + alen);
161 200
162 skb_push(skb, -skb_network_offset(skb)); 201 skb_push(skb, -skb_network_offset(skb));
@@ -199,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
199 } 238 }
200 239
201 esph->spi = x->id.spi; 240 esph->spi = x->id.spi;
202 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); 241 esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
203 242
204 sg_init_table(sg, nfrags); 243 sg_init_table(sg, nfrags);
205 skb_to_sgvec(skb, sg, 244 skb_to_sgvec(skb, sg,
206 esph->enc_data + crypto_aead_ivsize(aead) - skb->data, 245 esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
207 clen + alen); 246 clen + alen);
208 sg_init_one(asg, esph, sizeof(*esph)); 247
248 if ((x->props.flags & XFRM_STATE_ESN)) {
249 sg_init_table(asg, 3);
250 sg_set_buf(asg, &esph->spi, sizeof(__be32));
251 *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
252 sg_set_buf(asg + 1, seqhi, seqhilen);
253 sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
254 } else
255 sg_init_one(asg, esph, sizeof(*esph));
209 256
210 aead_givcrypt_set_callback(req, 0, esp_output_done, skb); 257 aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
211 aead_givcrypt_set_crypt(req, sg, sg, clen, iv); 258 aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
212 aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); 259 aead_givcrypt_set_assoc(req, asg, assoclen);
213 aead_givcrypt_set_giv(req, esph->enc_data, 260 aead_givcrypt_set_giv(req, esph->enc_data,
214 XFRM_SKB_CB(skb)->seq.output); 261 XFRM_SKB_CB(skb)->seq.output.low);
215 262
216 ESP_SKB_CB(skb)->tmp = tmp; 263 ESP_SKB_CB(skb)->tmp = tmp;
217 err = crypto_aead_givencrypt(req); 264 err = crypto_aead_givencrypt(req);
@@ -229,7 +276,7 @@ error:
229 276
230static int esp_input_done2(struct sk_buff *skb, int err) 277static int esp_input_done2(struct sk_buff *skb, int err)
231{ 278{
232 struct iphdr *iph; 279 const struct iphdr *iph;
233 struct xfrm_state *x = xfrm_input_state(skb); 280 struct xfrm_state *x = xfrm_input_state(skb);
234 struct esp_data *esp = x->data; 281 struct esp_data *esp = x->data;
235 struct crypto_aead *aead = esp->aead; 282 struct crypto_aead *aead = esp->aead;
@@ -330,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
330 struct sk_buff *trailer; 377 struct sk_buff *trailer;
331 int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); 378 int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
332 int nfrags; 379 int nfrags;
380 int assoclen;
381 int sglists;
382 int seqhilen;
383 __be32 *seqhi;
333 void *tmp; 384 void *tmp;
334 u8 *iv; 385 u8 *iv;
335 struct scatterlist *sg; 386 struct scatterlist *sg;
@@ -346,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
346 goto out; 397 goto out;
347 nfrags = err; 398 nfrags = err;
348 399
400 assoclen = sizeof(*esph);
401 sglists = 1;
402 seqhilen = 0;
403
404 if (x->props.flags & XFRM_STATE_ESN) {
405 sglists += 2;
406 seqhilen += sizeof(__be32);
407 assoclen += seqhilen;
408 }
409
349 err = -ENOMEM; 410 err = -ENOMEM;
350 tmp = esp_alloc_tmp(aead, nfrags + 1); 411 tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
351 if (!tmp) 412 if (!tmp)
352 goto out; 413 goto out;
353 414
354 ESP_SKB_CB(skb)->tmp = tmp; 415 ESP_SKB_CB(skb)->tmp = tmp;
355 iv = esp_tmp_iv(aead, tmp); 416 seqhi = esp_tmp_seqhi(tmp);
417 iv = esp_tmp_iv(aead, tmp, seqhilen);
356 req = esp_tmp_req(aead, iv); 418 req = esp_tmp_req(aead, iv);
357 asg = esp_req_sg(aead, req); 419 asg = esp_req_sg(aead, req);
358 sg = asg + 1; 420 sg = asg + sglists;
359 421
360 skb->ip_summed = CHECKSUM_NONE; 422 skb->ip_summed = CHECKSUM_NONE;
361 423
@@ -366,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
366 428
367 sg_init_table(sg, nfrags); 429 sg_init_table(sg, nfrags);
368 skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); 430 skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
369 sg_init_one(asg, esph, sizeof(*esph)); 431
432 if ((x->props.flags & XFRM_STATE_ESN)) {
433 sg_init_table(asg, 3);
434 sg_set_buf(asg, &esph->spi, sizeof(__be32));
435 *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
436 sg_set_buf(asg + 1, seqhi, seqhilen);
437 sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
438 } else
439 sg_init_one(asg, esph, sizeof(*esph));
370 440
371 aead_request_set_callback(req, 0, esp_input_done, skb); 441 aead_request_set_callback(req, 0, esp_input_done, skb);
372 aead_request_set_crypt(req, sg, sg, elen, iv); 442 aead_request_set_crypt(req, sg, sg, elen, iv);
373 aead_request_set_assoc(req, asg, sizeof(*esph)); 443 aead_request_set_assoc(req, asg, assoclen);
374 444
375 err = crypto_aead_decrypt(req); 445 err = crypto_aead_decrypt(req);
376 if (err == -EINPROGRESS) 446 if (err == -EINPROGRESS)
@@ -414,7 +484,7 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
414static void esp4_err(struct sk_buff *skb, u32 info) 484static void esp4_err(struct sk_buff *skb, u32 info)
415{ 485{
416 struct net *net = dev_net(skb->dev); 486 struct net *net = dev_net(skb->dev);
417 struct iphdr *iph = (struct iphdr *)skb->data; 487 const struct iphdr *iph = (const struct iphdr *)skb->data;
418 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); 488 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
419 struct xfrm_state *x; 489 struct xfrm_state *x;
420 490
@@ -422,7 +492,8 @@ static void esp4_err(struct sk_buff *skb, u32 info)
422 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) 492 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
423 return; 493 return;
424 494
425 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); 495 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
496 esph->spi, IPPROTO_ESP, AF_INET);
426 if (!x) 497 if (!x)
427 return; 498 return;
428 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", 499 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -484,10 +555,20 @@ static int esp_init_authenc(struct xfrm_state *x)
484 goto error; 555 goto error;
485 556
486 err = -ENAMETOOLONG; 557 err = -ENAMETOOLONG;
487 if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", 558
488 x->aalg ? x->aalg->alg_name : "digest_null", 559 if ((x->props.flags & XFRM_STATE_ESN)) {
489 x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) 560 if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
490 goto error; 561 "authencesn(%s,%s)",
562 x->aalg ? x->aalg->alg_name : "digest_null",
563 x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
564 goto error;
565 } else {
566 if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
567 "authenc(%s,%s)",
568 x->aalg ? x->aalg->alg_name : "digest_null",
569 x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
570 goto error;
571 }
491 572
492 aead = crypto_alloc_aead(authenc_name, 0, 0); 573 aead = crypto_alloc_aead(authenc_name, 0, 0);
493 err = PTR_ERR(aead); 574 err = PTR_ERR(aead);
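
With XFRM_STATE_ESN set, the esp4.c changes above authenticate the 32-bit high sequence number as extra associated data (three scatterlist pieces instead of one) and request an authencesn() transform instead of authenc(). The sketch below just recomputes those sizes and the template name in userspace; DEMO_STATE_ESN and the sample algorithm names are illustrative.

#include <stdio.h>
#include <errno.h>

#define DEMO_STATE_ESN		0x1
#define CRYPTO_MAX_ALG_NAME	64

struct demo_assoc {
	int assoclen;	/* bytes of associated (authenticated-only) data */
	int sglists;	/* scatterlist entries reserved for it */
	int seqhilen;	/* extra bytes holding the high sequence number */
};

static struct demo_assoc demo_esp_assoc(unsigned int flags)
{
	/* 8 bytes = SPI + low 32-bit sequence number, as in sizeof(*esph) */
	struct demo_assoc a = { .assoclen = 8, .sglists = 1, .seqhilen = 0 };

	if (flags & DEMO_STATE_ESN) {
		a.sglists += 2;		/* spi / seq_hi / seq_no become separate pieces */
		a.seqhilen += 4;	/* sizeof(__be32) */
		a.assoclen += a.seqhilen;
	}
	return a;
}

static int demo_authenc_name(char *buf, unsigned int flags,
			     const char *auth, const char *enc)
{
	int n;

	if (flags & DEMO_STATE_ESN)
		n = snprintf(buf, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth, enc);
	else
		n = snprintf(buf, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", auth, enc);
	return n >= CRYPTO_MAX_ALG_NAME ? -ENAMETOOLONG : 0;
}

int main(void)
{
	char name[CRYPTO_MAX_ALG_NAME];
	struct demo_assoc a = demo_esp_assoc(DEMO_STATE_ESN);

	demo_authenc_name(name, DEMO_STATE_ESN, "hmac(sha1)", "cbc(aes)");
	printf("%s: assoclen=%d sglists=%d\n", name, a.assoclen, a.sglists);
	return 0;
}
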
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7d02a9f999fa..22524716fe70 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -44,6 +44,7 @@
44#include <net/arp.h> 44#include <net/arp.h>
45#include <net/ip_fib.h> 45#include <net/ip_fib.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/xfrm.h>
47 48
48#ifndef CONFIG_IP_MULTIPLE_TABLES 49#ifndef CONFIG_IP_MULTIPLE_TABLES
49 50
@@ -51,11 +52,11 @@ static int __net_init fib4_rules_init(struct net *net)
51{ 52{
52 struct fib_table *local_table, *main_table; 53 struct fib_table *local_table, *main_table;
53 54
54 local_table = fib_hash_table(RT_TABLE_LOCAL); 55 local_table = fib_trie_table(RT_TABLE_LOCAL);
55 if (local_table == NULL) 56 if (local_table == NULL)
56 return -ENOMEM; 57 return -ENOMEM;
57 58
58 main_table = fib_hash_table(RT_TABLE_MAIN); 59 main_table = fib_trie_table(RT_TABLE_MAIN);
59 if (main_table == NULL) 60 if (main_table == NULL)
60 goto fail; 61 goto fail;
61 62
@@ -82,7 +83,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
82 if (tb) 83 if (tb)
83 return tb; 84 return tb;
84 85
85 tb = fib_hash_table(id); 86 tb = fib_trie_table(id);
86 if (!tb) 87 if (!tb)
87 return NULL; 88 return NULL;
88 h = id & (FIB_TABLE_HASHSZ - 1); 89 h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +115,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
114} 115}
115#endif /* CONFIG_IP_MULTIPLE_TABLES */ 116#endif /* CONFIG_IP_MULTIPLE_TABLES */
116 117
117void fib_select_default(struct net *net,
118 const struct flowi *flp, struct fib_result *res)
119{
120 struct fib_table *tb;
121 int table = RT_TABLE_MAIN;
122#ifdef CONFIG_IP_MULTIPLE_TABLES
123 if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
124 return;
125 table = res->r->table;
126#endif
127 tb = fib_get_table(net, table);
128 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
129 fib_table_select_default(tb, flp, res);
130}
131
132static void fib_flush(struct net *net) 118static void fib_flush(struct net *net)
133{ 119{
134 int flushed = 0; 120 int flushed = 0;
@@ -148,36 +134,6 @@ static void fib_flush(struct net *net)
148} 134}
149 135
150/* 136/*
151 * Find the first device with a given source address.
152 */
153
154struct net_device * ip_dev_find(struct net *net, __be32 addr)
155{
156 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
157 struct fib_result res;
158 struct net_device *dev = NULL;
159 struct fib_table *local_table;
160
161#ifdef CONFIG_IP_MULTIPLE_TABLES
162 res.r = NULL;
163#endif
164
165 local_table = fib_get_table(net, RT_TABLE_LOCAL);
166 if (!local_table || fib_table_lookup(local_table, &fl, &res))
167 return NULL;
168 if (res.type != RTN_LOCAL)
169 goto out;
170 dev = FIB_RES_DEV(res);
171
172 if (dev)
173 dev_hold(dev);
174out:
175 fib_res_put(&res);
176 return dev;
177}
178EXPORT_SYMBOL(ip_dev_find);
179
180/*
181 * Find address type as if only "dev" was present in the system. If 137 * Find address type as if only "dev" was present in the system. If
182 * on_dev is NULL then all interfaces are taken into consideration. 138 * on_dev is NULL then all interfaces are taken into consideration.
183 */ 139 */
@@ -185,7 +141,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
185 const struct net_device *dev, 141 const struct net_device *dev,
186 __be32 addr) 142 __be32 addr)
187{ 143{
188 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; 144 struct flowi4 fl4 = { .daddr = addr };
189 struct fib_result res; 145 struct fib_result res;
190 unsigned ret = RTN_BROADCAST; 146 unsigned ret = RTN_BROADCAST;
191 struct fib_table *local_table; 147 struct fib_table *local_table;
@@ -202,11 +158,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
202 local_table = fib_get_table(net, RT_TABLE_LOCAL); 158 local_table = fib_get_table(net, RT_TABLE_LOCAL);
203 if (local_table) { 159 if (local_table) {
204 ret = RTN_UNICAST; 160 ret = RTN_UNICAST;
205 if (!fib_table_lookup(local_table, &fl, &res)) { 161 rcu_read_lock();
162 if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
206 if (!dev || dev == res.fi->fib_dev) 163 if (!dev || dev == res.fi->fib_dev)
207 ret = res.type; 164 ret = res.type;
208 fib_res_put(&res);
209 } 165 }
166 rcu_read_unlock();
210 } 167 }
211 return ret; 168 return ret;
212} 169}
@@ -220,59 +177,60 @@ EXPORT_SYMBOL(inet_addr_type);
220unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, 177unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
221 __be32 addr) 178 __be32 addr)
222{ 179{
223 return __inet_dev_addr_type(net, dev, addr); 180 return __inet_dev_addr_type(net, dev, addr);
224} 181}
225EXPORT_SYMBOL(inet_dev_addr_type); 182EXPORT_SYMBOL(inet_dev_addr_type);
226 183
227/* Given (packet source, input interface) and optional (dst, oif, tos): 184/* Given (packet source, input interface) and optional (dst, oif, tos):
228 - (main) check, that source is valid i.e. not broadcast or our local 185 * - (main) check, that source is valid i.e. not broadcast or our local
229 address. 186 * address.
230 - figure out what "logical" interface this packet arrived 187 * - figure out what "logical" interface this packet arrived
231 and calculate "specific destination" address. 188 * and calculate "specific destination" address.
232 - check, that packet arrived from expected physical interface. 189 * - check, that packet arrived from expected physical interface.
190 * called with rcu_read_lock()
233 */ 191 */
234 192int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
235int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, 193 int oif, struct net_device *dev, __be32 *spec_dst,
236 struct net_device *dev, __be32 *spec_dst, 194 u32 *itag)
237 u32 *itag, u32 mark)
238{ 195{
239 struct in_device *in_dev; 196 struct in_device *in_dev;
240 struct flowi fl = { .nl_u = { .ip4_u = 197 struct flowi4 fl4;
241 { .daddr = src,
242 .saddr = dst,
243 .tos = tos } },
244 .mark = mark,
245 .iif = oif };
246
247 struct fib_result res; 198 struct fib_result res;
248 int no_addr, rpf, accept_local; 199 int no_addr, rpf, accept_local;
249 bool dev_match; 200 bool dev_match;
250 int ret; 201 int ret;
251 struct net *net; 202 struct net *net;
252 203
204 fl4.flowi4_oif = 0;
205 fl4.flowi4_iif = oif;
206 fl4.daddr = src;
207 fl4.saddr = dst;
208 fl4.flowi4_tos = tos;
209 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
210
253 no_addr = rpf = accept_local = 0; 211 no_addr = rpf = accept_local = 0;
254 rcu_read_lock();
255 in_dev = __in_dev_get_rcu(dev); 212 in_dev = __in_dev_get_rcu(dev);
256 if (in_dev) { 213 if (in_dev) {
257 no_addr = in_dev->ifa_list == NULL; 214 no_addr = in_dev->ifa_list == NULL;
258 rpf = IN_DEV_RPFILTER(in_dev); 215
216 /* Ignore rp_filter for packets protected by IPsec. */
217 rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
218
259 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); 219 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
260 if (mark && !IN_DEV_SRC_VMARK(in_dev)) 220 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
261 fl.mark = 0;
262 } 221 }
263 rcu_read_unlock();
264 222
265 if (in_dev == NULL) 223 if (in_dev == NULL)
266 goto e_inval; 224 goto e_inval;
267 225
268 net = dev_net(dev); 226 net = dev_net(dev);
269 if (fib_lookup(net, &fl, &res)) 227 if (fib_lookup(net, &fl4, &res))
270 goto last_resort; 228 goto last_resort;
271 if (res.type != RTN_UNICAST) { 229 if (res.type != RTN_UNICAST) {
272 if (res.type != RTN_LOCAL || !accept_local) 230 if (res.type != RTN_LOCAL || !accept_local)
273 goto e_inval_res; 231 goto e_inval;
274 } 232 }
275 *spec_dst = FIB_RES_PREFSRC(res); 233 *spec_dst = FIB_RES_PREFSRC(net, res);
276 fib_combine_itag(itag, &res); 234 fib_combine_itag(itag, &res);
277 dev_match = false; 235 dev_match = false;
278 236
@@ -291,23 +249,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
291#endif 249#endif
292 if (dev_match) { 250 if (dev_match) {
293 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 251 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
294 fib_res_put(&res);
295 return ret; 252 return ret;
296 } 253 }
297 fib_res_put(&res);
298 if (no_addr) 254 if (no_addr)
299 goto last_resort; 255 goto last_resort;
300 if (rpf == 1) 256 if (rpf == 1)
301 goto e_rpf; 257 goto e_rpf;
302 fl.oif = dev->ifindex; 258 fl4.flowi4_oif = dev->ifindex;
303 259
304 ret = 0; 260 ret = 0;
305 if (fib_lookup(net, &fl, &res) == 0) { 261 if (fib_lookup(net, &fl4, &res) == 0) {
306 if (res.type == RTN_UNICAST) { 262 if (res.type == RTN_UNICAST) {
307 *spec_dst = FIB_RES_PREFSRC(res); 263 *spec_dst = FIB_RES_PREFSRC(net, res);
308 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 264 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
309 } 265 }
310 fib_res_put(&res);
311 } 266 }
312 return ret; 267 return ret;
313 268
@@ -318,8 +273,6 @@ last_resort:
318 *itag = 0; 273 *itag = 0;
319 return 0; 274 return 0;
320 275
321e_inval_res:
322 fib_res_put(&res);
323e_inval: 276e_inval:
324 return -EINVAL; 277 return -EINVAL;
325e_rpf: 278e_rpf:
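
fib_validate_source() above now builds a struct flowi4 with the packet's source as the lookup destination and skips rp_filter entirely when the packet carries an IPsec secpath. The toy program below models that reverse-path decision over a two-entry routing table; the table, the device names and demo_validate_source() are invented, and it deliberately ignores the second oif-restricted lookup the kernel performs.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

struct demo_route { uint32_t prefix; uint32_t mask; const char *dev; };

static const struct demo_route table[] = {
	{ 0xc0a80100, 0xffffff00, "eth0" },	/* 192.168.1.0/24 via eth0 */
	{ 0x0a000000, 0xff000000, "eth1" },	/* 10.0.0.0/8     via eth1 */
};

static const char *demo_route_dev(uint32_t addr)
{
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if ((addr & table[i].mask) == table[i].prefix)
			return table[i].dev;
	return NULL;
}

/* Return true if the packet's source passes validation on in_dev. */
static bool demo_validate_source(uint32_t src, const char *in_dev,
				 bool strict_rpf, bool has_secpath)
{
	const char *out_dev = demo_route_dev(src);

	if (has_secpath)		/* IPsec-protected: rp_filter is ignored */
		strict_rpf = false;
	if (!out_dev)
		return !strict_rpf;	/* unroutable source fails strict mode */
	if (strcmp(out_dev, in_dev) == 0)
		return true;		/* best route points back out the arrival device */
	return !strict_rpf;		/* asymmetric path: only loose mode accepts it */
}

int main(void)
{
	printf("10.0.0.5 arriving on eth0, strict rp_filter: %d\n",
	       demo_validate_source(0x0a000005, "eth0", true, false));
	printf("10.0.0.5 arriving on eth0, strict + IPsec:   %d\n",
	       demo_validate_source(0x0a000005, "eth0", true, true));
	return 0;
}
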
@@ -472,9 +425,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
472} 425}
473 426
474/* 427/*
475 * Handle IP routing ioctl calls. These are used to manipulate the routing tables 428 * Handle IP routing ioctl calls.
429 * These are used to manipulate the routing tables
476 */ 430 */
477
478int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) 431int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
479{ 432{
480 struct fib_config cfg; 433 struct fib_config cfg;
@@ -518,7 +471,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
518 return -EINVAL; 471 return -EINVAL;
519} 472}
520 473
521const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { 474const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
522 [RTA_DST] = { .type = NLA_U32 }, 475 [RTA_DST] = { .type = NLA_U32 },
523 [RTA_SRC] = { .type = NLA_U32 }, 476 [RTA_SRC] = { .type = NLA_U32 },
524 [RTA_IIF] = { .type = NLA_U32 }, 477 [RTA_IIF] = { .type = NLA_U32 },
@@ -532,7 +485,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
532}; 485};
533 486
534static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 487static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
535 struct nlmsghdr *nlh, struct fib_config *cfg) 488 struct nlmsghdr *nlh, struct fib_config *cfg)
536{ 489{
537 struct nlattr *attr; 490 struct nlattr *attr;
538 int err, remaining; 491 int err, remaining;
@@ -687,12 +640,11 @@ out:
687} 640}
688 641
689/* Prepare and feed intra-kernel routing request. 642/* Prepare and feed intra-kernel routing request.
690 Really, it should be netlink message, but :-( netlink 643 * Really, it should be netlink message, but :-( netlink
691 can be not configured, so that we feed it directly 644 * can be not configured, so that we feed it directly
692 to fib engine. It is legal, because all events occur 645 * to fib engine. It is legal, because all events occur
693 only when netlink is already locked. 646 * only when netlink is already locked.
694 */ 647 */
695
696static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) 648static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
697{ 649{
698 struct net *net = dev_net(ifa->ifa_dev->dev); 650 struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -738,9 +690,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
738 struct in_ifaddr *prim = ifa; 690 struct in_ifaddr *prim = ifa;
739 __be32 mask = ifa->ifa_mask; 691 __be32 mask = ifa->ifa_mask;
740 __be32 addr = ifa->ifa_local; 692 __be32 addr = ifa->ifa_local;
741 __be32 prefix = ifa->ifa_address&mask; 693 __be32 prefix = ifa->ifa_address & mask;
742 694
743 if (ifa->ifa_flags&IFA_F_SECONDARY) { 695 if (ifa->ifa_flags & IFA_F_SECONDARY) {
744 prim = inet_ifa_byprefix(in_dev, prefix, mask); 696 prim = inet_ifa_byprefix(in_dev, prefix, mask);
745 if (prim == NULL) { 697 if (prim == NULL) {
746 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); 698 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -750,58 +702,118 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
750 702
751 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); 703 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
752 704
753 if (!(dev->flags&IFF_UP)) 705 if (!(dev->flags & IFF_UP))
754 return; 706 return;
755 707
756 /* Add broadcast address, if it is explicitly assigned. */ 708 /* Add broadcast address, if it is explicitly assigned. */
757 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) 709 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
758 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 710 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
759 711
760 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && 712 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
761 (prefix != addr || ifa->ifa_prefixlen < 32)) { 713 (prefix != addr || ifa->ifa_prefixlen < 32)) {
762 fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : 714 fib_magic(RTM_NEWROUTE,
763 RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); 715 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
716 prefix, ifa->ifa_prefixlen, prim);
764 717
765 /* Add network specific broadcasts, when it takes a sense */ 718 /* Add network specific broadcasts, when it takes a sense */
766 if (ifa->ifa_prefixlen < 31) { 719 if (ifa->ifa_prefixlen < 31) {
767 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); 720 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
768 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); 721 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
722 32, prim);
769 } 723 }
770 } 724 }
771} 725}
772 726
773static void fib_del_ifaddr(struct in_ifaddr *ifa) 727/* Delete primary or secondary address.
728 * Optionally, on secondary address promotion consider the addresses
729 * from subnet iprim as deleted, even if they are in device list.
730 * In this case the secondary ifa can be in device list.
731 */
732void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
774{ 733{
775 struct in_device *in_dev = ifa->ifa_dev; 734 struct in_device *in_dev = ifa->ifa_dev;
776 struct net_device *dev = in_dev->dev; 735 struct net_device *dev = in_dev->dev;
777 struct in_ifaddr *ifa1; 736 struct in_ifaddr *ifa1;
778 struct in_ifaddr *prim = ifa; 737 struct in_ifaddr *prim = ifa, *prim1 = NULL;
779 __be32 brd = ifa->ifa_address|~ifa->ifa_mask; 738 __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
780 __be32 any = ifa->ifa_address&ifa->ifa_mask; 739 __be32 any = ifa->ifa_address & ifa->ifa_mask;
781#define LOCAL_OK 1 740#define LOCAL_OK 1
782#define BRD_OK 2 741#define BRD_OK 2
783#define BRD0_OK 4 742#define BRD0_OK 4
784#define BRD1_OK 8 743#define BRD1_OK 8
785 unsigned ok = 0; 744 unsigned ok = 0;
745 int subnet = 0; /* Primary network */
746 int gone = 1; /* Address is missing */
747 int same_prefsrc = 0; /* Another primary with same IP */
786 748
787 if (!(ifa->ifa_flags&IFA_F_SECONDARY)) 749 if (ifa->ifa_flags & IFA_F_SECONDARY) {
788 fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
789 RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
790 else {
791 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 750 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
792 if (prim == NULL) { 751 if (prim == NULL) {
793 printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); 752 printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
794 return; 753 return;
795 } 754 }
755 if (iprim && iprim != prim) {
756 printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
757 return;
758 }
759 } else if (!ipv4_is_zeronet(any) &&
760 (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
761 fib_magic(RTM_DELROUTE,
762 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
763 any, ifa->ifa_prefixlen, prim);
764 subnet = 1;
796 } 765 }
797 766
798 /* Deletion is more complicated than add. 767 /* Deletion is more complicated than add.
799 We should take care of not to delete too much :-) 768 * We should take care of not to delete too much :-)
800 769 *
801 Scan address list to be sure that addresses are really gone. 770 * Scan address list to be sure that addresses are really gone.
802 */ 771 */
803 772
804 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { 773 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
774 if (ifa1 == ifa) {
775 /* promotion, keep the IP */
776 gone = 0;
777 continue;
778 }
779 /* Ignore IFAs from our subnet */
780 if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
781 inet_ifa_match(ifa1->ifa_address, iprim))
782 continue;
783
784 /* Ignore ifa1 if it uses different primary IP (prefsrc) */
785 if (ifa1->ifa_flags & IFA_F_SECONDARY) {
786 /* Another address from our subnet? */
787 if (ifa1->ifa_mask == prim->ifa_mask &&
788 inet_ifa_match(ifa1->ifa_address, prim))
789 prim1 = prim;
790 else {
791 /* We reached the secondaries, so
792 * same_prefsrc should be determined.
793 */
794 if (!same_prefsrc)
795 continue;
796 /* Search new prim1 if ifa1 is not
797 * using the current prim1
798 */
799 if (!prim1 ||
800 ifa1->ifa_mask != prim1->ifa_mask ||
801 !inet_ifa_match(ifa1->ifa_address, prim1))
802 prim1 = inet_ifa_byprefix(in_dev,
803 ifa1->ifa_address,
804 ifa1->ifa_mask);
805 if (!prim1)
806 continue;
807 if (prim1->ifa_local != prim->ifa_local)
808 continue;
809 }
810 } else {
811 if (prim->ifa_local != ifa1->ifa_local)
812 continue;
813 prim1 = ifa1;
814 if (prim != prim1)
815 same_prefsrc = 1;
816 }
805 if (ifa->ifa_local == ifa1->ifa_local) 817 if (ifa->ifa_local == ifa1->ifa_local)
806 ok |= LOCAL_OK; 818 ok |= LOCAL_OK;
807 if (ifa->ifa_broadcast == ifa1->ifa_broadcast) 819 if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
@@ -810,25 +822,43 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
810 ok |= BRD1_OK; 822 ok |= BRD1_OK;
811 if (any == ifa1->ifa_broadcast) 823 if (any == ifa1->ifa_broadcast)
812 ok |= BRD0_OK; 824 ok |= BRD0_OK;
825 /* primary has network specific broadcasts */
826 if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
827 __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
828 __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
829
830 if (!ipv4_is_zeronet(any1)) {
831 if (ifa->ifa_broadcast == brd1 ||
832 ifa->ifa_broadcast == any1)
833 ok |= BRD_OK;
834 if (brd == brd1 || brd == any1)
835 ok |= BRD1_OK;
836 if (any == brd1 || any == any1)
837 ok |= BRD0_OK;
838 }
839 }
813 } 840 }
814 841
815 if (!(ok&BRD_OK)) 842 if (!(ok & BRD_OK))
816 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 843 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
817 if (!(ok&BRD1_OK)) 844 if (subnet && ifa->ifa_prefixlen < 31) {
818 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); 845 if (!(ok & BRD1_OK))
819 if (!(ok&BRD0_OK)) 846 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
820 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); 847 if (!(ok & BRD0_OK))
821 if (!(ok&LOCAL_OK)) { 848 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
849 }
850 if (!(ok & LOCAL_OK)) {
822 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); 851 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
823 852
824 /* Check, that this local address finally disappeared. */ 853 /* Check, that this local address finally disappeared. */
825 if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { 854 if (gone &&
855 inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
826 /* And the last, but not the least thing. 856 /* And the last, but not the least thing.
827 We must flush stray FIB entries. 857 * We must flush stray FIB entries.
828 858 *
829 First of all, we scan fib_info list searching 859 * First of all, we scan fib_info list searching
830 for stray nexthop entries, then ignite fib_flush. 860 * for stray nexthop entries, then ignite fib_flush.
831 */ 861 */
832 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) 862 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
833 fib_flush(dev_net(dev)); 863 fib_flush(dev_net(dev));
834 } 864 }
@@ -839,14 +869,16 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
839#undef BRD1_OK 869#undef BRD1_OK
840} 870}
841 871
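The LOCAL_OK/BRD_OK bookkeeping in fib_del_ifaddr() above is a scan-then-delete pattern: walk the addresses that remain, OR in a bit for every route that is still justified, and only remove what ended up unreferenced. A hedged, generic sketch of the same idea (names and data are illustrative, not kernel API):

    #include <stdio.h>

    #define LOCAL_OK 1
    #define BRD_OK   2

    struct entry { unsigned int local, brd; };

    int main(void)
    {
            struct entry remaining[] = { { 10, 20 }, { 11, 21 } };
            struct entry going = { 10, 99 };       /* entry being removed */
            unsigned int ok = 0;
            size_t i;

            /* Scan what is left and mark every route that is still needed. */
            for (i = 0; i < sizeof(remaining) / sizeof(remaining[0]); i++) {
                    if (remaining[i].local == going.local)
                            ok |= LOCAL_OK;
                    if (remaining[i].brd == going.brd)
                            ok |= BRD_OK;
            }
            if (!(ok & LOCAL_OK))
                    printf("drop local route %u\n", going.local);
            if (!(ok & BRD_OK))
                    printf("drop broadcast route %u\n", going.brd);
            return 0;
    }
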
842static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) 872static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
843{ 873{
844 874
845 struct fib_result res; 875 struct fib_result res;
846 struct flowi fl = { .mark = frn->fl_mark, 876 struct flowi4 fl4 = {
847 .nl_u = { .ip4_u = { .daddr = frn->fl_addr, 877 .flowi4_mark = frn->fl_mark,
848 .tos = frn->fl_tos, 878 .daddr = frn->fl_addr,
849 .scope = frn->fl_scope } } }; 879 .flowi4_tos = frn->fl_tos,
880 .flowi4_scope = frn->fl_scope,
881 };
850 882
851#ifdef CONFIG_IP_MULTIPLE_TABLES 883#ifdef CONFIG_IP_MULTIPLE_TABLES
852 res.r = NULL; 884 res.r = NULL;
@@ -857,15 +889,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
857 local_bh_disable(); 889 local_bh_disable();
858 890
859 frn->tb_id = tb->tb_id; 891 frn->tb_id = tb->tb_id;
860 frn->err = fib_table_lookup(tb, &fl, &res); 892 rcu_read_lock();
893 frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
861 894
862 if (!frn->err) { 895 if (!frn->err) {
863 frn->prefixlen = res.prefixlen; 896 frn->prefixlen = res.prefixlen;
864 frn->nh_sel = res.nh_sel; 897 frn->nh_sel = res.nh_sel;
865 frn->type = res.type; 898 frn->type = res.type;
866 frn->scope = res.scope; 899 frn->scope = res.scope;
867 fib_res_put(&res);
868 } 900 }
901 rcu_read_unlock();
869 local_bh_enable(); 902 local_bh_enable();
870 } 903 }
871} 904}
@@ -894,8 +927,8 @@ static void nl_fib_input(struct sk_buff *skb)
894 927
895 nl_fib_lookup(frn, tb); 928 nl_fib_lookup(frn, tb);
896 929
897 pid = NETLINK_CB(skb).pid; /* pid of sending process */ 930 pid = NETLINK_CB(skb).pid; /* pid of sending process */
898 NETLINK_CB(skb).pid = 0; /* from kernel */ 931 NETLINK_CB(skb).pid = 0; /* from kernel */
899 NETLINK_CB(skb).dst_group = 0; /* unicast */ 932 NETLINK_CB(skb).dst_group = 0; /* unicast */
900 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); 933 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
901} 934}
@@ -929,6 +962,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
929{ 962{
930 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; 963 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
931 struct net_device *dev = ifa->ifa_dev->dev; 964 struct net_device *dev = ifa->ifa_dev->dev;
965 struct net *net = dev_net(dev);
932 966
933 switch (event) { 967 switch (event) {
934 case NETDEV_UP: 968 case NETDEV_UP:
@@ -936,13 +970,15 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
936#ifdef CONFIG_IP_ROUTE_MULTIPATH 970#ifdef CONFIG_IP_ROUTE_MULTIPATH
937 fib_sync_up(dev); 971 fib_sync_up(dev);
938#endif 972#endif
973 atomic_inc(&net->ipv4.dev_addr_genid);
939 rt_cache_flush(dev_net(dev), -1); 974 rt_cache_flush(dev_net(dev), -1);
940 break; 975 break;
941 case NETDEV_DOWN: 976 case NETDEV_DOWN:
942 fib_del_ifaddr(ifa); 977 fib_del_ifaddr(ifa, NULL);
978 atomic_inc(&net->ipv4.dev_addr_genid);
943 if (ifa->ifa_dev->ifa_list == NULL) { 979 if (ifa->ifa_dev->ifa_list == NULL) {
944 /* Last address was deleted from this interface. 980 /* Last address was deleted from this interface.
945 Disable IP. 981 * Disable IP.
946 */ 982 */
947 fib_disable_ip(dev, 1, 0); 983 fib_disable_ip(dev, 1, 0);
948 } else { 984 } else {
@@ -957,6 +993,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
957{ 993{
958 struct net_device *dev = ptr; 994 struct net_device *dev = ptr;
959 struct in_device *in_dev = __in_dev_get_rtnl(dev); 995 struct in_device *in_dev = __in_dev_get_rtnl(dev);
996 struct net *net = dev_net(dev);
960 997
961 if (event == NETDEV_UNREGISTER) { 998 if (event == NETDEV_UNREGISTER) {
962 fib_disable_ip(dev, 2, -1); 999 fib_disable_ip(dev, 2, -1);
@@ -974,6 +1011,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
974#ifdef CONFIG_IP_ROUTE_MULTIPATH 1011#ifdef CONFIG_IP_ROUTE_MULTIPATH
975 fib_sync_up(dev); 1012 fib_sync_up(dev);
976#endif 1013#endif
1014 atomic_inc(&net->ipv4.dev_addr_genid);
977 rt_cache_flush(dev_net(dev), -1); 1015 rt_cache_flush(dev_net(dev), -1);
978 break; 1016 break;
979 case NETDEV_DOWN: 1017 case NETDEV_DOWN:
@@ -984,7 +1022,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
984 rt_cache_flush(dev_net(dev), 0); 1022 rt_cache_flush(dev_net(dev), 0);
985 break; 1023 break;
986 case NETDEV_UNREGISTER_BATCH: 1024 case NETDEV_UNREGISTER_BATCH:
987 rt_cache_flush_batch(); 1025 /* The batch unregister is only called on the first
1026 * device in the list of devices being unregistered.
1027 * Therefore we should not pass dev_net(dev) in here.
1028 */
1029 rt_cache_flush_batch(NULL);
988 break; 1030 break;
989 } 1031 }
990 return NOTIFY_DONE; 1032 return NOTIFY_DONE;
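The new dev_addr_genid counter added in both notifiers above is a generation number: every address event bumps it, and consumers that cached a derived result compare generations instead of recomputing on every use. A small sketch of the idea with C11 atomics (the structure and names are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_uint genid;

    struct cached { unsigned int gen; int value; };

    static int slow_compute(void) { return 42; }

    static int lookup(struct cached *c)
    {
            unsigned int now = atomic_load(&genid);

            if (c->gen != now) {            /* cache is stale, recompute */
                    c->value = slow_compute();
                    c->gen = now;
            }
            return c->value;
    }

    int main(void)
    {
            struct cached c = { .gen = ~0u };

            printf("%d\n", lookup(&c));
            atomic_fetch_add(&genid, 1);    /* e.g. an address was added */
            printf("%d\n", lookup(&c));     /* forces a recompute        */
            return 0;
    }
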
@@ -1001,16 +1043,15 @@ static struct notifier_block fib_netdev_notifier = {
1001static int __net_init ip_fib_net_init(struct net *net) 1043static int __net_init ip_fib_net_init(struct net *net)
1002{ 1044{
1003 int err; 1045 int err;
1004 unsigned int i; 1046 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1047
1048 /* Avoid false sharing : Use at least a full cache line */
1049 size = max_t(size_t, size, L1_CACHE_BYTES);
1005 1050
1006 net->ipv4.fib_table_hash = kzalloc( 1051 net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1007 sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
1008 if (net->ipv4.fib_table_hash == NULL) 1052 if (net->ipv4.fib_table_hash == NULL)
1009 return -ENOMEM; 1053 return -ENOMEM;
1010 1054
1011 for (i = 0; i < FIB_TABLE_HASHSZ; i++)
1012 INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
1013
1014 err = fib4_rules_init(net); 1055 err = fib4_rules_init(net);
1015 if (err < 0) 1056 if (err < 0)
1016 goto fail; 1057 goto fail;
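Rounding the hash-head allocation up to at least one cache line, as the hunk above does with max_t(), keeps unrelated per-namespace tables from sharing a line and bouncing it between CPUs. A userspace sketch of the same sizing rule (64 is an assumed line size for illustration, not something queried from the kernel):

    #include <stdio.h>
    #include <stdlib.h>

    #define ASSUMED_L1_CACHE_BYTES 64   /* assumption for illustration */

    static size_t round_to_cache_line(size_t size)
    {
            return size < ASSUMED_L1_CACHE_BYTES ? ASSUMED_L1_CACHE_BYTES : size;
    }

    int main(void)
    {
            size_t want = 16 * sizeof(void *);        /* small hash table     */
            size_t size = round_to_cache_line(want);
            void *tbl = calloc(1, size);              /* zeroed, like kzalloc */

            printf("requested %zu, allocated %zu bytes\n", want, size);
            free(tbl);
            return 0;
    }
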
@@ -1029,6 +1070,7 @@ static void ip_fib_net_exit(struct net *net)
1029 fib4_rules_exit(net); 1070 fib4_rules_exit(net);
1030#endif 1071#endif
1031 1072
1073 rtnl_lock();
1032 for (i = 0; i < FIB_TABLE_HASHSZ; i++) { 1074 for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1033 struct fib_table *tb; 1075 struct fib_table *tb;
1034 struct hlist_head *head; 1076 struct hlist_head *head;
@@ -1038,9 +1080,10 @@ static void ip_fib_net_exit(struct net *net)
1038 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { 1080 hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1039 hlist_del(node); 1081 hlist_del(node);
1040 fib_table_flush(tb); 1082 fib_table_flush(tb);
1041 kfree(tb); 1083 fib_free_table(tb);
1042 } 1084 }
1043 } 1085 }
1086 rtnl_unlock();
1044 kfree(net->ipv4.fib_table_hash); 1087 kfree(net->ipv4.fib_table_hash);
1045} 1088}
1046 1089
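The teardown loop above walks each chain with hlist_for_each_entry_safe() precisely so entries can be unlinked and freed mid-walk. The same idea in plain C: save the next pointer before freeing the current node (the list layout here is illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int id; struct node *next; };

    int main(void)
    {
            struct node *head = NULL, *n, *next;
            int i;

            for (i = 0; i < 3; i++) {        /* build a small list */
                    n = malloc(sizeof(*n));
                    n->id = i;
                    n->next = head;
                    head = n;
            }

            for (n = head; n; n = next) {    /* "safe" walk: free as we go */
                    next = n->next;
                    printf("flushing %d\n", n->id);
                    free(n);
            }
            return 0;
    }
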
@@ -1089,5 +1132,5 @@ void __init ip_fib_init(void)
1089 register_netdevice_notifier(&fib_netdev_notifier); 1132 register_netdevice_notifier(&fib_netdev_notifier);
1090 register_inetaddr_notifier(&fib_inetaddr_notifier); 1133 register_inetaddr_notifier(&fib_inetaddr_notifier);
1091 1134
1092 fib_hash_init(); 1135 fib_trie_init();
1093} 1136}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index 4ed7e0dea1bc..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1070 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/string.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/errno.h>
26#include <linux/in.h>
27#include <linux/inet.h>
28#include <linux/inetdevice.h>
29#include <linux/netdevice.h>
30#include <linux/if_arp.h>
31#include <linux/proc_fs.h>
32#include <linux/skbuff.h>
33#include <linux/netlink.h>
34#include <linux/init.h>
35#include <linux/slab.h>
36
37#include <net/net_namespace.h>
38#include <net/ip.h>
39#include <net/protocol.h>
40#include <net/route.h>
41#include <net/tcp.h>
42#include <net/sock.h>
43#include <net/ip_fib.h>
44
45#include "fib_lookup.h"
46
47static struct kmem_cache *fn_hash_kmem __read_mostly;
48static struct kmem_cache *fn_alias_kmem __read_mostly;
49
50struct fib_node {
51 struct hlist_node fn_hash;
52 struct list_head fn_alias;
53 __be32 fn_key;
54 struct fib_alias fn_embedded_alias;
55};
56
57struct fn_zone {
58 struct fn_zone *fz_next; /* Next not empty zone */
59 struct hlist_head *fz_hash; /* Hash table pointer */
60 int fz_nent; /* Number of entries */
61
62 int fz_divisor; /* Hash divisor */
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64#define FZ_HASHMASK(fz) ((fz)->fz_hashmask)
65
66 int fz_order; /* Zone order */
67 __be32 fz_mask;
68#define FZ_MASK(fz) ((fz)->fz_mask)
69};
70
71/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
72 * can be cheaper than memory lookup, so that FZ_* macros are used.
73 */
74
75struct fn_hash {
76 struct fn_zone *fn_zones[33];
77 struct fn_zone *fn_zone_list;
78};
79
80static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
81{
82 u32 h = ntohl(key)>>(32 - fz->fz_order);
83 h ^= (h>>20);
84 h ^= (h>>10);
85 h ^= (h>>5);
86 h &= FZ_HASHMASK(fz);
87 return h;
88}
89
90static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
91{
92 return dst & FZ_MASK(fz);
93}
94
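fn_hash() above keys each zone by the top fz_order bits of the destination and xor-folds them down to a bucket index, while fz_key() masks the destination to the zone's prefix. A standalone sketch of both steps (values are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t zone_hash(uint32_t key_host_order, int order, uint32_t hashmask)
    {
            uint32_t h = key_host_order >> (32 - order);  /* keep prefix bits */

            h ^= h >> 20;                                 /* xor-fold         */
            h ^= h >> 10;
            h ^= h >> 5;
            return h & hashmask;
    }

    int main(void)
    {
            uint32_t dst = 0x0a010203;          /* 10.1.2.3, host byte order */
            int order = 24;                     /* /24 zone                  */
            uint32_t mask = 0xffffff00;
            uint32_t key = dst & mask;          /* like fz_key()             */

            printf("bucket=%u\n", (unsigned)zone_hash(key, order, 15));
            return 0;
    }
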
95static DEFINE_RWLOCK(fib_hash_lock);
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE) {
105 return kzalloc(size, GFP_KERNEL);
106 } else {
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109 }
110}
111
112/* The fib hash lock must be held when this is called. */
113static inline void fn_rebuild_zone(struct fn_zone *fz,
114 struct hlist_head *old_ht,
115 int old_divisor)
116{
117 int i;
118
119 for (i = 0; i < old_divisor; i++) {
120 struct hlist_node *node, *n;
121 struct fib_node *f;
122
123 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
124 struct hlist_head *new_head;
125
126 hlist_del(&f->fn_hash);
127
128 new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
129 hlist_add_head(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case 16:
154 new_divisor = 256;
155 break;
156 case 256:
157 new_divisor = 1024;
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
161 printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
172 fz->fz_order, old_divisor);
173#endif
174
175 ht = fz_hash_alloc(new_divisor);
176
177 if (ht) {
178 write_lock_bh(&fib_hash_lock);
179 old_ht = fz->fz_hash;
180 fz->fz_hash = ht;
181 fz->fz_hashmask = new_hashmask;
182 fz->fz_divisor = new_divisor;
183 fn_rebuild_zone(fz, old_ht, old_divisor);
184 fib_hash_genid++;
185 write_unlock_bh(&fib_hash_lock);
186
187 fz_hash_free(old_ht, old_divisor);
188 }
189}
190
191static inline void fn_free_node(struct fib_node * f)
192{
193 kmem_cache_free(fn_hash_kmem, f);
194}
195
196static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
197{
198 fib_release_info(fa->fa_info);
199 if (fa == &f->fn_embedded_alias)
200 fa->fa_info = NULL;
201 else
202 kmem_cache_free(fn_alias_kmem, fa);
203}
204
205static struct fn_zone *
206fn_new_zone(struct fn_hash *table, int z)
207{
208 int i;
209 struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
210 if (!fz)
211 return NULL;
212
213 if (z) {
214 fz->fz_divisor = 16;
215 } else {
216 fz->fz_divisor = 1;
217 }
218 fz->fz_hashmask = (fz->fz_divisor - 1);
219 fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
220 if (!fz->fz_hash) {
221 kfree(fz);
222 return NULL;
223 }
224 fz->fz_order = z;
225 fz->fz_mask = inet_make_mask(z);
226
227 /* Find the first not empty zone with more specific mask */
228 for (i=z+1; i<=32; i++)
229 if (table->fn_zones[i])
230 break;
231 write_lock_bh(&fib_hash_lock);
232 if (i>32) {
233 /* No more specific masks, we are the first. */
234 fz->fz_next = table->fn_zone_list;
235 table->fn_zone_list = fz;
236 } else {
237 fz->fz_next = table->fn_zones[i]->fz_next;
238 table->fn_zones[i]->fz_next = fz;
239 }
240 table->fn_zones[z] = fz;
241 fib_hash_genid++;
242 write_unlock_bh(&fib_hash_lock);
243 return fz;
244}
245
246int fib_table_lookup(struct fib_table *tb,
247 const struct flowi *flp, struct fib_result *res)
248{
249 int err;
250 struct fn_zone *fz;
251 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
252
253 read_lock(&fib_hash_lock);
254 for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
255 struct hlist_head *head;
256 struct hlist_node *node;
257 struct fib_node *f;
258 __be32 k = fz_key(flp->fl4_dst, fz);
259
260 head = &fz->fz_hash[fn_hash(k, fz)];
261 hlist_for_each_entry(f, node, head, fn_hash) {
262 if (f->fn_key != k)
263 continue;
264
265 err = fib_semantic_match(&f->fn_alias,
266 flp, res,
267 fz->fz_order);
268 if (err <= 0)
269 goto out;
270 }
271 }
272 err = 1;
273out:
274 read_unlock(&fib_hash_lock);
275 return err;
276}
277
278void fib_table_select_default(struct fib_table *tb,
279 const struct flowi *flp, struct fib_result *res)
280{
281 int order, last_idx;
282 struct hlist_node *node;
283 struct fib_node *f;
284 struct fib_info *fi = NULL;
285 struct fib_info *last_resort;
286 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
287 struct fn_zone *fz = t->fn_zones[0];
288
289 if (fz == NULL)
290 return;
291
292 last_idx = -1;
293 last_resort = NULL;
294 order = -1;
295
296 read_lock(&fib_hash_lock);
297 hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
298 struct fib_alias *fa;
299
300 list_for_each_entry(fa, &f->fn_alias, fa_list) {
301 struct fib_info *next_fi = fa->fa_info;
302
303 if (fa->fa_scope != res->scope ||
304 fa->fa_type != RTN_UNICAST)
305 continue;
306
307 if (next_fi->fib_priority > res->fi->fib_priority)
308 break;
309 if (!next_fi->fib_nh[0].nh_gw ||
310 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
311 continue;
312 fa->fa_state |= FA_S_ACCESSED;
313
314 if (fi == NULL) {
315 if (next_fi != res->fi)
316 break;
317 } else if (!fib_detect_death(fi, order, &last_resort,
318 &last_idx, tb->tb_default)) {
319 fib_result_assign(res, fi);
320 tb->tb_default = order;
321 goto out;
322 }
323 fi = next_fi;
324 order++;
325 }
326 }
327
328 if (order <= 0 || fi == NULL) {
329 tb->tb_default = -1;
330 goto out;
331 }
332
333 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
334 tb->tb_default)) {
335 fib_result_assign(res, fi);
336 tb->tb_default = order;
337 goto out;
338 }
339
340 if (last_idx >= 0)
341 fib_result_assign(res, last_resort);
342 tb->tb_default = last_idx;
343out:
344 read_unlock(&fib_hash_lock);
345}
346
347/* Insert node F to FZ. */
348static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
349{
350 struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
351
352 hlist_add_head(&f->fn_hash, head);
353}
354
355/* Return the node in FZ matching KEY. */
356static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
357{
358 struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
359 struct hlist_node *node;
360 struct fib_node *f;
361
362 hlist_for_each_entry(f, node, head, fn_hash) {
363 if (f->fn_key == key)
364 return f;
365 }
366
367 return NULL;
368}
369
370int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
371{
372 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
373 struct fib_node *new_f = NULL;
374 struct fib_node *f;
375 struct fib_alias *fa, *new_fa;
376 struct fn_zone *fz;
377 struct fib_info *fi;
378 u8 tos = cfg->fc_tos;
379 __be32 key;
380 int err;
381
382 if (cfg->fc_dst_len > 32)
383 return -EINVAL;
384
385 fz = table->fn_zones[cfg->fc_dst_len];
386 if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
387 return -ENOBUFS;
388
389 key = 0;
390 if (cfg->fc_dst) {
391 if (cfg->fc_dst & ~FZ_MASK(fz))
392 return -EINVAL;
393 key = fz_key(cfg->fc_dst, fz);
394 }
395
396 fi = fib_create_info(cfg);
397 if (IS_ERR(fi))
398 return PTR_ERR(fi);
399
400 if (fz->fz_nent > (fz->fz_divisor<<1) &&
401 fz->fz_divisor < FZ_MAX_DIVISOR &&
402 (cfg->fc_dst_len == 32 ||
403 (1 << cfg->fc_dst_len) > fz->fz_divisor))
404 fn_rehash_zone(fz);
405
406 f = fib_find_node(fz, key);
407
408 if (!f)
409 fa = NULL;
410 else
411 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
412
413 /* Now fa, if non-NULL, points to the first fib alias
414 * with the same keys [prefix,tos,priority], if such key already
415 * exists or to the node before which we will insert new one.
416 *
417 * If fa is NULL, we will need to allocate a new one and
418 * insert to the head of f.
419 *
420 * If f is NULL, no fib node matched the destination key
421 * and we need to allocate a new one of those as well.
422 */
423
424 if (fa && fa->fa_tos == tos &&
425 fa->fa_info->fib_priority == fi->fib_priority) {
426 struct fib_alias *fa_first, *fa_match;
427
428 err = -EEXIST;
429 if (cfg->fc_nlflags & NLM_F_EXCL)
430 goto out;
431
432 /* We have 2 goals:
433 * 1. Find exact match for type, scope, fib_info to avoid
434 * duplicate routes
435 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
436 */
437 fa_match = NULL;
438 fa_first = fa;
439 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
440 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
441 if (fa->fa_tos != tos)
442 break;
443 if (fa->fa_info->fib_priority != fi->fib_priority)
444 break;
445 if (fa->fa_type == cfg->fc_type &&
446 fa->fa_scope == cfg->fc_scope &&
447 fa->fa_info == fi) {
448 fa_match = fa;
449 break;
450 }
451 }
452
453 if (cfg->fc_nlflags & NLM_F_REPLACE) {
454 struct fib_info *fi_drop;
455 u8 state;
456
457 fa = fa_first;
458 if (fa_match) {
459 if (fa == fa_match)
460 err = 0;
461 goto out;
462 }
463 write_lock_bh(&fib_hash_lock);
464 fi_drop = fa->fa_info;
465 fa->fa_info = fi;
466 fa->fa_type = cfg->fc_type;
467 fa->fa_scope = cfg->fc_scope;
468 state = fa->fa_state;
469 fa->fa_state &= ~FA_S_ACCESSED;
470 fib_hash_genid++;
471 write_unlock_bh(&fib_hash_lock);
472
473 fib_release_info(fi_drop);
474 if (state & FA_S_ACCESSED)
475 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
476 rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
477 &cfg->fc_nlinfo, NLM_F_REPLACE);
478 return 0;
479 }
480
481 /* Error if we find a perfect match which
482 * uses the same scope, type, and nexthop
483 * information.
484 */
485 if (fa_match)
486 goto out;
487
488 if (!(cfg->fc_nlflags & NLM_F_APPEND))
489 fa = fa_first;
490 }
491
492 err = -ENOENT;
493 if (!(cfg->fc_nlflags & NLM_F_CREATE))
494 goto out;
495
496 err = -ENOBUFS;
497
498 if (!f) {
499 new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
500 if (new_f == NULL)
501 goto out;
502
503 INIT_HLIST_NODE(&new_f->fn_hash);
504 INIT_LIST_HEAD(&new_f->fn_alias);
505 new_f->fn_key = key;
506 f = new_f;
507 }
508
509 new_fa = &f->fn_embedded_alias;
510 if (new_fa->fa_info != NULL) {
511 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
512 if (new_fa == NULL)
513 goto out;
514 }
515 new_fa->fa_info = fi;
516 new_fa->fa_tos = tos;
517 new_fa->fa_type = cfg->fc_type;
518 new_fa->fa_scope = cfg->fc_scope;
519 new_fa->fa_state = 0;
520
521 /*
522 * Insert new entry to the list.
523 */
524
525 write_lock_bh(&fib_hash_lock);
526 if (new_f)
527 fib_insert_node(fz, new_f);
528 list_add_tail(&new_fa->fa_list,
529 (fa ? &fa->fa_list : &f->fn_alias));
530 fib_hash_genid++;
531 write_unlock_bh(&fib_hash_lock);
532
533 if (new_f)
534 fz->fz_nent++;
535 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
536
537 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
538 &cfg->fc_nlinfo, 0);
539 return 0;
540
541out:
542 if (new_f)
543 kmem_cache_free(fn_hash_kmem, new_f);
544 fib_release_info(fi);
545 return err;
546}
547
548int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
549{
550 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
551 struct fib_node *f;
552 struct fib_alias *fa, *fa_to_delete;
553 struct fn_zone *fz;
554 __be32 key;
555
556 if (cfg->fc_dst_len > 32)
557 return -EINVAL;
558
559 if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
560 return -ESRCH;
561
562 key = 0;
563 if (cfg->fc_dst) {
564 if (cfg->fc_dst & ~FZ_MASK(fz))
565 return -EINVAL;
566 key = fz_key(cfg->fc_dst, fz);
567 }
568
569 f = fib_find_node(fz, key);
570
571 if (!f)
572 fa = NULL;
573 else
574 fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
575 if (!fa)
576 return -ESRCH;
577
578 fa_to_delete = NULL;
579 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
580 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
581 struct fib_info *fi = fa->fa_info;
582
583 if (fa->fa_tos != cfg->fc_tos)
584 break;
585
586 if ((!cfg->fc_type ||
587 fa->fa_type == cfg->fc_type) &&
588 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
589 fa->fa_scope == cfg->fc_scope) &&
590 (!cfg->fc_protocol ||
591 fi->fib_protocol == cfg->fc_protocol) &&
592 fib_nh_match(cfg, fi) == 0) {
593 fa_to_delete = fa;
594 break;
595 }
596 }
597
598 if (fa_to_delete) {
599 int kill_fn;
600
601 fa = fa_to_delete;
602 rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
603 tb->tb_id, &cfg->fc_nlinfo, 0);
604
605 kill_fn = 0;
606 write_lock_bh(&fib_hash_lock);
607 list_del(&fa->fa_list);
608 if (list_empty(&f->fn_alias)) {
609 hlist_del(&f->fn_hash);
610 kill_fn = 1;
611 }
612 fib_hash_genid++;
613 write_unlock_bh(&fib_hash_lock);
614
615 if (fa->fa_state & FA_S_ACCESSED)
616 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
617 fn_free_alias(fa, f);
618 if (kill_fn) {
619 fn_free_node(f);
620 fz->fz_nent--;
621 }
622
623 return 0;
624 }
625 return -ESRCH;
626}
627
628static int fn_flush_list(struct fn_zone *fz, int idx)
629{
630 struct hlist_head *head = &fz->fz_hash[idx];
631 struct hlist_node *node, *n;
632 struct fib_node *f;
633 int found = 0;
634
635 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
636 struct fib_alias *fa, *fa_node;
637 int kill_f;
638
639 kill_f = 0;
640 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
641 struct fib_info *fi = fa->fa_info;
642
643 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
644 write_lock_bh(&fib_hash_lock);
645 list_del(&fa->fa_list);
646 if (list_empty(&f->fn_alias)) {
647 hlist_del(&f->fn_hash);
648 kill_f = 1;
649 }
650 fib_hash_genid++;
651 write_unlock_bh(&fib_hash_lock);
652
653 fn_free_alias(fa, f);
654 found++;
655 }
656 }
657 if (kill_f) {
658 fn_free_node(f);
659 fz->fz_nent--;
660 }
661 }
662 return found;
663}
664
665int fib_table_flush(struct fib_table *tb)
666{
667 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
668 struct fn_zone *fz;
669 int found = 0;
670
671 for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
672 int i;
673
674 for (i = fz->fz_divisor - 1; i >= 0; i--)
675 found += fn_flush_list(fz, i);
676 }
677 return found;
678}
679
680
681static inline int
682fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
683 struct fib_table *tb,
684 struct fn_zone *fz,
685 struct hlist_head *head)
686{
687 struct hlist_node *node;
688 struct fib_node *f;
689 int i, s_i;
690
691 s_i = cb->args[4];
692 i = 0;
693 hlist_for_each_entry(f, node, head, fn_hash) {
694 struct fib_alias *fa;
695
696 list_for_each_entry(fa, &f->fn_alias, fa_list) {
697 if (i < s_i)
698 goto next;
699
700 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
701 cb->nlh->nlmsg_seq,
702 RTM_NEWROUTE,
703 tb->tb_id,
704 fa->fa_type,
705 fa->fa_scope,
706 f->fn_key,
707 fz->fz_order,
708 fa->fa_tos,
709 fa->fa_info,
710 NLM_F_MULTI) < 0) {
711 cb->args[4] = i;
712 return -1;
713 }
714 next:
715 i++;
716 }
717 }
718 cb->args[4] = i;
719 return skb->len;
720}
721
722static inline int
723fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
724 struct fib_table *tb,
725 struct fn_zone *fz)
726{
727 int h, s_h;
728
729 if (fz->fz_hash == NULL)
730 return skb->len;
731 s_h = cb->args[3];
732 for (h = s_h; h < fz->fz_divisor; h++) {
733 if (hlist_empty(&fz->fz_hash[h]))
734 continue;
735 if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) {
736 cb->args[3] = h;
737 return -1;
738 }
739 memset(&cb->args[4], 0,
740 sizeof(cb->args) - 4*sizeof(cb->args[0]));
741 }
742 cb->args[3] = h;
743 return skb->len;
744}
745
746int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
747 struct netlink_callback *cb)
748{
749 int m, s_m;
750 struct fn_zone *fz;
751 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
752
753 s_m = cb->args[2];
754 read_lock(&fib_hash_lock);
755 for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
756 if (m < s_m) continue;
757 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
758 cb->args[2] = m;
759 read_unlock(&fib_hash_lock);
760 return -1;
761 }
762 memset(&cb->args[3], 0,
763 sizeof(cb->args) - 3*sizeof(cb->args[0]));
764 }
765 read_unlock(&fib_hash_lock);
766 cb->args[2] = m;
767 return skb->len;
768}
769
770void __init fib_hash_init(void)
771{
772 fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
773 0, SLAB_PANIC, NULL);
774
775 fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
776 0, SLAB_PANIC, NULL);
777
778}
779
780struct fib_table *fib_hash_table(u32 id)
781{
782 struct fib_table *tb;
783
784 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
785 GFP_KERNEL);
786 if (tb == NULL)
787 return NULL;
788
789 tb->tb_id = id;
790 tb->tb_default = -1;
791
792 memset(tb->tb_data, 0, sizeof(struct fn_hash));
793 return tb;
794}
795
796/* ------------------------------------------------------------------------ */
797#ifdef CONFIG_PROC_FS
798
799struct fib_iter_state {
800 struct seq_net_private p;
801 struct fn_zone *zone;
802 int bucket;
803 struct hlist_head *hash_head;
804 struct fib_node *fn;
805 struct fib_alias *fa;
806 loff_t pos;
807 unsigned int genid;
808 int valid;
809};
810
811static struct fib_alias *fib_get_first(struct seq_file *seq)
812{
813 struct fib_iter_state *iter = seq->private;
814 struct fib_table *main_table;
815 struct fn_hash *table;
816
817 main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
818 table = (struct fn_hash *)main_table->tb_data;
819
820 iter->bucket = 0;
821 iter->hash_head = NULL;
822 iter->fn = NULL;
823 iter->fa = NULL;
824 iter->pos = 0;
825 iter->genid = fib_hash_genid;
826 iter->valid = 1;
827
828 for (iter->zone = table->fn_zone_list; iter->zone;
829 iter->zone = iter->zone->fz_next) {
830 int maxslot;
831
832 if (!iter->zone->fz_nent)
833 continue;
834
835 iter->hash_head = iter->zone->fz_hash;
836 maxslot = iter->zone->fz_divisor;
837
838 for (iter->bucket = 0; iter->bucket < maxslot;
839 ++iter->bucket, ++iter->hash_head) {
840 struct hlist_node *node;
841 struct fib_node *fn;
842
843 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
844 struct fib_alias *fa;
845
846 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
847 iter->fn = fn;
848 iter->fa = fa;
849 goto out;
850 }
851 }
852 }
853 }
854out:
855 return iter->fa;
856}
857
858static struct fib_alias *fib_get_next(struct seq_file *seq)
859{
860 struct fib_iter_state *iter = seq->private;
861 struct fib_node *fn;
862 struct fib_alias *fa;
863
864 /* Advance FA, if any. */
865 fn = iter->fn;
866 fa = iter->fa;
867 if (fa) {
868 BUG_ON(!fn);
869 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
870 iter->fa = fa;
871 goto out;
872 }
873 }
874
875 fa = iter->fa = NULL;
876
877 /* Advance FN. */
878 if (fn) {
879 struct hlist_node *node = &fn->fn_hash;
880 hlist_for_each_entry_continue(fn, node, fn_hash) {
881 iter->fn = fn;
882
883 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
884 iter->fa = fa;
885 goto out;
886 }
887 }
888 }
889
890 fn = iter->fn = NULL;
891
892 /* Advance hash chain. */
893 if (!iter->zone)
894 goto out;
895
896 for (;;) {
897 struct hlist_node *node;
898 int maxslot;
899
900 maxslot = iter->zone->fz_divisor;
901
902 while (++iter->bucket < maxslot) {
903 iter->hash_head++;
904
905 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
906 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
907 iter->fn = fn;
908 iter->fa = fa;
909 goto out;
910 }
911 }
912 }
913
914 iter->zone = iter->zone->fz_next;
915
916 if (!iter->zone)
917 goto out;
918
919 iter->bucket = 0;
920 iter->hash_head = iter->zone->fz_hash;
921
922 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
923 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
924 iter->fn = fn;
925 iter->fa = fa;
926 goto out;
927 }
928 }
929 }
930out:
931 iter->pos++;
932 return fa;
933}
934
935static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
936{
937 struct fib_iter_state *iter = seq->private;
938 struct fib_alias *fa;
939
940 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
941 fa = iter->fa;
942 pos -= iter->pos;
943 } else
944 fa = fib_get_first(seq);
945
946 if (fa)
947 while (pos && (fa = fib_get_next(seq)))
948 --pos;
949 return pos ? NULL : fa;
950}
951
952static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
953 __acquires(fib_hash_lock)
954{
955 void *v = NULL;
956
957 read_lock(&fib_hash_lock);
958 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
959 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
960 return v;
961}
962
963static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
964{
965 ++*pos;
966 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
967}
968
969static void fib_seq_stop(struct seq_file *seq, void *v)
970 __releases(fib_hash_lock)
971{
972 read_unlock(&fib_hash_lock);
973}
974
975static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
976{
977 static const unsigned type2flags[RTN_MAX + 1] = {
978 [7] = RTF_REJECT, [8] = RTF_REJECT,
979 };
980 unsigned flags = type2flags[type];
981
982 if (fi && fi->fib_nh->nh_gw)
983 flags |= RTF_GATEWAY;
984 if (mask == htonl(0xFFFFFFFF))
985 flags |= RTF_HOST;
986 flags |= RTF_UP;
987 return flags;
988}
989
990/*
991 * This outputs /proc/net/route.
992 *
993 * It always works in backward compatibility mode.
994 * The format of the file is not supposed to be changed.
995 */
996static int fib_seq_show(struct seq_file *seq, void *v)
997{
998 struct fib_iter_state *iter;
999 int len;
1000 __be32 prefix, mask;
1001 unsigned flags;
1002 struct fib_node *f;
1003 struct fib_alias *fa;
1004 struct fib_info *fi;
1005
1006 if (v == SEQ_START_TOKEN) {
1007 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1008 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1009 "\tWindow\tIRTT");
1010 goto out;
1011 }
1012
1013 iter = seq->private;
1014 f = iter->fn;
1015 fa = iter->fa;
1016 fi = fa->fa_info;
1017 prefix = f->fn_key;
1018 mask = FZ_MASK(iter->zone);
1019 flags = fib_flag_trans(fa->fa_type, mask, fi);
1020 if (fi)
1021 seq_printf(seq,
1022 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1023 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1024 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1025 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1026 fi->fib_window,
1027 fi->fib_rtt >> 3, &len);
1028 else
1029 seq_printf(seq,
1030 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1031 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
1032
1033 seq_printf(seq, "%*s\n", 127 - len, "");
1034out:
1035 return 0;
1036}
1037
1038static const struct seq_operations fib_seq_ops = {
1039 .start = fib_seq_start,
1040 .next = fib_seq_next,
1041 .stop = fib_seq_stop,
1042 .show = fib_seq_show,
1043};
1044
1045static int fib_seq_open(struct inode *inode, struct file *file)
1046{
1047 return seq_open_net(inode, file, &fib_seq_ops,
1048 sizeof(struct fib_iter_state));
1049}
1050
1051static const struct file_operations fib_seq_fops = {
1052 .owner = THIS_MODULE,
1053 .open = fib_seq_open,
1054 .read = seq_read,
1055 .llseek = seq_lseek,
1056 .release = seq_release_net,
1057};
1058
1059int __net_init fib_proc_init(struct net *net)
1060{
1061 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
1062 return -ENOMEM;
1063 return 0;
1064}
1065
1066void __net_exit fib_proc_exit(struct net *net)
1067{
1068 proc_net_remove(net, "route");
1069}
1070#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..af0f14aba169 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -10,24 +10,25 @@ struct fib_alias {
10 struct fib_info *fa_info; 10 struct fib_info *fa_info;
11 u8 fa_tos; 11 u8 fa_tos;
12 u8 fa_type; 12 u8 fa_type;
13 u8 fa_scope;
14 u8 fa_state; 13 u8 fa_state;
15#ifdef CONFIG_IP_FIB_TRIE
16 struct rcu_head rcu; 14 struct rcu_head rcu;
17#endif
18}; 15};
19 16
20#define FA_S_ACCESSED 0x01 17#define FA_S_ACCESSED 0x01
21 18
19/* Dont write on fa_state unless needed, to keep it shared on all cpus */
20static inline void fib_alias_accessed(struct fib_alias *fa)
21{
22 if (!(fa->fa_state & FA_S_ACCESSED))
23 fa->fa_state |= FA_S_ACCESSED;
24}
25
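fib_alias_accessed() above only writes fa_state when the bit is not already set, so the common hot path never dirties a cache line that every CPU reads. The same guard in a trivial standalone form (assume many readers call mark_accessed() on a read-mostly object):

    #include <stdio.h>

    struct alias { unsigned char state; };

    #define ACCESSED 0x01

    /* Write only when needed so read-mostly callers keep the line shared. */
    static void mark_accessed(struct alias *a)
    {
            if (!(a->state & ACCESSED))
                    a->state |= ACCESSED;
    }

    int main(void)
    {
            struct alias a = { 0 };

            mark_accessed(&a);   /* first call writes          */
            mark_accessed(&a);   /* later calls are read-only  */
            printf("state=%u\n", a.state);
            return 0;
    }
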
22/* Exported by fib_semantics.c */ 26/* Exported by fib_semantics.c */
23extern int fib_semantic_match(struct list_head *head,
24 const struct flowi *flp,
25 struct fib_result *res, int prefixlen);
26extern void fib_release_info(struct fib_info *); 27extern void fib_release_info(struct fib_info *);
27extern struct fib_info *fib_create_info(struct fib_config *cfg); 28extern struct fib_info *fib_create_info(struct fib_config *cfg);
28extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 29extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
29extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 30extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
30 u32 tb_id, u8 type, u8 scope, __be32 dst, 31 u32 tb_id, u8 type, __be32 dst,
31 int dst_len, u8 tos, struct fib_info *fi, 32 int dst_len, u8 tos, struct fib_info *fi,
32 unsigned int); 33 unsigned int);
33extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, 34extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
@@ -42,11 +43,15 @@ extern int fib_detect_death(struct fib_info *fi, int order,
42static inline void fib_result_assign(struct fib_result *res, 43static inline void fib_result_assign(struct fib_result *res,
43 struct fib_info *fi) 44 struct fib_info *fi)
44{ 45{
45 if (res->fi != NULL) 46 /* we used to play games with refcounts, but we now use RCU */
46 fib_info_put(res->fi);
47 res->fi = fi; 47 res->fi = fi;
48 if (fi != NULL)
49 atomic_inc(&fi->fib_clntref);
50} 48}
51 49
50struct fib_prop {
51 int error;
52 u8 scope;
53};
54
55extern const struct fib_prop fib_props[RTN_MAX + 1];
56
52#endif /* _FIB_LOOKUP_H */ 57#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff564..a53bb1b5b118 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
6 * IPv4 Forwarding Information Base: policy rules. 6 * IPv4 Forwarding Information Base: policy rules.
7 * 7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Thomas Graf <tgraf@suug.ch> 9 * Thomas Graf <tgraf@suug.ch>
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 * 15 *
16 * Fixes: 16 * Fixes:
17 * Rani Assaf : local_rule cannot be deleted 17 * Rani Assaf : local_rule cannot be deleted
18 * Marc Boucher : routing by fwmark 18 * Marc Boucher : routing by fwmark
19 */ 19 */
20 20
@@ -32,8 +32,7 @@
32#include <net/ip_fib.h> 32#include <net/ip_fib.h>
33#include <net/fib_rules.h> 33#include <net/fib_rules.h>
34 34
35struct fib4_rule 35struct fib4_rule {
36{
37 struct fib_rule common; 36 struct fib_rule common;
38 u8 dst_len; 37 u8 dst_len;
39 u8 src_len; 38 u8 src_len;
@@ -42,26 +41,27 @@ struct fib4_rule
42 __be32 srcmask; 41 __be32 srcmask;
43 __be32 dst; 42 __be32 dst;
44 __be32 dstmask; 43 __be32 dstmask;
45#ifdef CONFIG_NET_CLS_ROUTE 44#ifdef CONFIG_IP_ROUTE_CLASSID
46 u32 tclassid; 45 u32 tclassid;
47#endif 46#endif
48}; 47};
49 48
50#ifdef CONFIG_NET_CLS_ROUTE 49#ifdef CONFIG_IP_ROUTE_CLASSID
51u32 fib_rules_tclass(struct fib_result *res) 50u32 fib_rules_tclass(const struct fib_result *res)
52{ 51{
53 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; 52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
54} 53}
55#endif 54#endif
56 55
57int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) 56int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
58{ 57{
59 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
60 .result = res, 59 .result = res,
60 .flags = FIB_LOOKUP_NOREF,
61 }; 61 };
62 int err; 62 int err;
63 63
64 err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); 64 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
65 res->r = arg.rule; 65 res->r = arg.rule;
66 66
67 return err; 67 return err;
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
91 goto errout; 91 goto errout;
92 } 92 }
93 93
94 if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) 94 tbl = fib_get_table(rule->fr_net, rule->table);
95 if (!tbl)
95 goto errout; 96 goto errout;
96 97
97 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); 98 err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
98 if (err > 0) 99 if (err > 0)
99 err = -EAGAIN; 100 err = -EAGAIN;
100errout: 101errout:
@@ -105,14 +106,15 @@ errout:
105static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) 106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
106{ 107{
107 struct fib4_rule *r = (struct fib4_rule *) rule; 108 struct fib4_rule *r = (struct fib4_rule *) rule;
108 __be32 daddr = fl->fl4_dst; 109 struct flowi4 *fl4 = &fl->u.ip4;
109 __be32 saddr = fl->fl4_src; 110 __be32 daddr = fl4->daddr;
111 __be32 saddr = fl4->saddr;
110 112
111 if (((saddr ^ r->src) & r->srcmask) || 113 if (((saddr ^ r->src) & r->srcmask) ||
112 ((daddr ^ r->dst) & r->dstmask)) 114 ((daddr ^ r->dst) & r->dstmask))
113 return 0; 115 return 0;
114 116
115 if (r->tos && (r->tos != fl->fl4_tos)) 117 if (r->tos && (r->tos != fl4->flowi4_tos))
116 return 0; 118 return 0;
117 119
118 return 1; 120 return 1;
@@ -164,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
164 if (frh->dst_len) 166 if (frh->dst_len)
165 rule4->dst = nla_get_be32(tb[FRA_DST]); 167 rule4->dst = nla_get_be32(tb[FRA_DST]);
166 168
167#ifdef CONFIG_NET_CLS_ROUTE 169#ifdef CONFIG_IP_ROUTE_CLASSID
168 if (tb[FRA_FLOW]) 170 if (tb[FRA_FLOW])
169 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 171 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
170#endif 172#endif
@@ -194,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
194 if (frh->tos && (rule4->tos != frh->tos)) 196 if (frh->tos && (rule4->tos != frh->tos))
195 return 0; 197 return 0;
196 198
197#ifdef CONFIG_NET_CLS_ROUTE 199#ifdef CONFIG_IP_ROUTE_CLASSID
198 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) 200 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
199 return 0; 201 return 0;
200#endif 202#endif
@@ -223,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
223 if (rule4->src_len) 225 if (rule4->src_len)
224 NLA_PUT_BE32(skb, FRA_SRC, rule4->src); 226 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
225 227
226#ifdef CONFIG_NET_CLS_ROUTE 228#ifdef CONFIG_IP_ROUTE_CLASSID
227 if (rule4->tclassid) 229 if (rule4->tclassid)
228 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); 230 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
229#endif 231#endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e8..33e2c35b74b7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
49static DEFINE_SPINLOCK(fib_info_lock); 49static DEFINE_SPINLOCK(fib_info_lock);
50static struct hlist_head *fib_info_hash; 50static struct hlist_head *fib_info_hash;
51static struct hlist_head *fib_info_laddrhash; 51static struct hlist_head *fib_info_laddrhash;
52static unsigned int fib_hash_size; 52static unsigned int fib_info_hash_size;
53static unsigned int fib_info_cnt; 53static unsigned int fib_info_cnt;
54 54
55#define DEVINDEX_HASHBITS 8 55#define DEVINDEX_HASHBITS 8
@@ -60,89 +60,93 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60 60
61static DEFINE_SPINLOCK(fib_multipath_lock); 61static DEFINE_SPINLOCK(fib_multipath_lock);
62 62
63#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 63#define for_nexthops(fi) { \
64for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64 int nhsel; const struct fib_nh *nh; \
65 for (nhsel = 0, nh = (fi)->fib_nh; \
66 nhsel < (fi)->fib_nhs; \
67 nh++, nhsel++)
65 68
66#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ 69#define change_nexthops(fi) { \
67for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) 70 int nhsel; struct fib_nh *nexthop_nh; \
71 for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
72 nhsel < (fi)->fib_nhs; \
73 nexthop_nh++, nhsel++)
68 74
69#else /* CONFIG_IP_ROUTE_MULTIPATH */ 75#else /* CONFIG_IP_ROUTE_MULTIPATH */
70 76
71/* Hope, that gcc will optimize it to get rid of dummy loop */ 77/* Hope, that gcc will optimize it to get rid of dummy loop */
72 78
73#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ 79#define for_nexthops(fi) { \
74for (nhsel=0; nhsel < 1; nhsel++) 80 int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \
81 for (nhsel = 0; nhsel < 1; nhsel++)
75 82
76#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ 83#define change_nexthops(fi) { \
77for (nhsel=0; nhsel < 1; nhsel++) 84 int nhsel; \
85 struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
86 for (nhsel = 0; nhsel < 1; nhsel++)
78 87
79#endif /* CONFIG_IP_ROUTE_MULTIPATH */ 88#endif /* CONFIG_IP_ROUTE_MULTIPATH */
80 89
81#define endfor_nexthops(fi) } 90#define endfor_nexthops(fi) }
82 91
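The for_nexthops()/endfor_nexthops() pair being re-indented above is an open/close macro idiom: the first macro opens a scope and a loop, the second supplies the closing brace. A tiny generic version just to show the shape (not the kernel macros themselves):

    #include <stdio.h>

    #define for_items(arr, count) {                             \
            int idx;                                            \
            for (idx = 0; idx < (count); idx++)

    #define endfor_items }

    int main(void)
    {
            int vals[] = { 3, 5, 8 };

            for_items(vals, 3) {
                    printf("item %d = %d\n", idx, vals[idx]);
            } endfor_items
            return 0;
    }
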
83 92
84static const struct 93const struct fib_prop fib_props[RTN_MAX + 1] = {
85{ 94 [RTN_UNSPEC] = {
86 int error;
87 u8 scope;
88} fib_props[RTN_MAX + 1] = {
89 {
90 .error = 0, 95 .error = 0,
91 .scope = RT_SCOPE_NOWHERE, 96 .scope = RT_SCOPE_NOWHERE,
92 }, /* RTN_UNSPEC */ 97 },
93 { 98 [RTN_UNICAST] = {
94 .error = 0, 99 .error = 0,
95 .scope = RT_SCOPE_UNIVERSE, 100 .scope = RT_SCOPE_UNIVERSE,
96 }, /* RTN_UNICAST */ 101 },
97 { 102 [RTN_LOCAL] = {
98 .error = 0, 103 .error = 0,
99 .scope = RT_SCOPE_HOST, 104 .scope = RT_SCOPE_HOST,
100 }, /* RTN_LOCAL */ 105 },
101 { 106 [RTN_BROADCAST] = {
102 .error = 0, 107 .error = 0,
103 .scope = RT_SCOPE_LINK, 108 .scope = RT_SCOPE_LINK,
104 }, /* RTN_BROADCAST */ 109 },
105 { 110 [RTN_ANYCAST] = {
106 .error = 0, 111 .error = 0,
107 .scope = RT_SCOPE_LINK, 112 .scope = RT_SCOPE_LINK,
108 }, /* RTN_ANYCAST */ 113 },
109 { 114 [RTN_MULTICAST] = {
110 .error = 0, 115 .error = 0,
111 .scope = RT_SCOPE_UNIVERSE, 116 .scope = RT_SCOPE_UNIVERSE,
112 }, /* RTN_MULTICAST */ 117 },
113 { 118 [RTN_BLACKHOLE] = {
114 .error = -EINVAL, 119 .error = -EINVAL,
115 .scope = RT_SCOPE_UNIVERSE, 120 .scope = RT_SCOPE_UNIVERSE,
116 }, /* RTN_BLACKHOLE */ 121 },
117 { 122 [RTN_UNREACHABLE] = {
118 .error = -EHOSTUNREACH, 123 .error = -EHOSTUNREACH,
119 .scope = RT_SCOPE_UNIVERSE, 124 .scope = RT_SCOPE_UNIVERSE,
120 }, /* RTN_UNREACHABLE */ 125 },
121 { 126 [RTN_PROHIBIT] = {
122 .error = -EACCES, 127 .error = -EACCES,
123 .scope = RT_SCOPE_UNIVERSE, 128 .scope = RT_SCOPE_UNIVERSE,
124 }, /* RTN_PROHIBIT */ 129 },
125 { 130 [RTN_THROW] = {
126 .error = -EAGAIN, 131 .error = -EAGAIN,
127 .scope = RT_SCOPE_UNIVERSE, 132 .scope = RT_SCOPE_UNIVERSE,
128 }, /* RTN_THROW */ 133 },
129 { 134 [RTN_NAT] = {
130 .error = -EINVAL, 135 .error = -EINVAL,
131 .scope = RT_SCOPE_NOWHERE, 136 .scope = RT_SCOPE_NOWHERE,
132 }, /* RTN_NAT */ 137 },
133 { 138 [RTN_XRESOLVE] = {
134 .error = -EINVAL, 139 .error = -EINVAL,
135 .scope = RT_SCOPE_NOWHERE, 140 .scope = RT_SCOPE_NOWHERE,
136 }, /* RTN_XRESOLVE */ 141 },
137}; 142};
138 143
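Switching fib_props to designated initializers, as above, ties each slot to its RTN_* index explicitly instead of relying on positional comments. A compact userspace illustration of the same construct (the enum and table contents here are illustrative, not the kernel's):

    #include <stdio.h>

    enum rtype { T_UNSPEC, T_UNICAST, T_BLACKHOLE, T_MAX };

    struct prop { int error; const char *scope; };

    static const struct prop props[T_MAX] = {
            [T_UNSPEC]    = { .error = 0,   .scope = "nowhere"  },
            [T_UNICAST]   = { .error = 0,   .scope = "universe" },
            [T_BLACKHOLE] = { .error = -22, .scope = "universe" },
    };

    int main(void)
    {
            printf("blackhole: error=%d scope=%s\n",
                   props[T_BLACKHOLE].error, props[T_BLACKHOLE].scope);
            return 0;
    }
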
139
140/* Release a nexthop info record */ 144/* Release a nexthop info record */
141 145
142void free_fib_info(struct fib_info *fi) 146void free_fib_info(struct fib_info *fi)
143{ 147{
144 if (fi->fib_dead == 0) { 148 if (fi->fib_dead == 0) {
145 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); 149 pr_warning("Freeing alive fib_info %p\n", fi);
146 return; 150 return;
147 } 151 }
148 change_nexthops(fi) { 152 change_nexthops(fi) {
@@ -152,7 +156,7 @@ void free_fib_info(struct fib_info *fi)
152 } endfor_nexthops(fi); 156 } endfor_nexthops(fi);
153 fib_info_cnt--; 157 fib_info_cnt--;
154 release_net(fi->fib_net); 158 release_net(fi->fib_net);
155 kfree(fi); 159 kfree_rcu(fi, rcu);
156} 160}
157 161
158void fib_release_info(struct fib_info *fi) 162void fib_release_info(struct fib_info *fi)
@@ -173,7 +177,7 @@ void fib_release_info(struct fib_info *fi)
173 spin_unlock_bh(&fib_info_lock); 177 spin_unlock_bh(&fib_info_lock);
174} 178}
175 179
176static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) 180static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
177{ 181{
178 const struct fib_nh *onh = ofi->fib_nh; 182 const struct fib_nh *onh = ofi->fib_nh;
179 183
@@ -184,10 +188,10 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
184#ifdef CONFIG_IP_ROUTE_MULTIPATH 188#ifdef CONFIG_IP_ROUTE_MULTIPATH
185 nh->nh_weight != onh->nh_weight || 189 nh->nh_weight != onh->nh_weight ||
186#endif 190#endif
187#ifdef CONFIG_NET_CLS_ROUTE 191#ifdef CONFIG_IP_ROUTE_CLASSID
188 nh->nh_tclassid != onh->nh_tclassid || 192 nh->nh_tclassid != onh->nh_tclassid ||
189#endif 193#endif
190 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) 194 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
191 return -1; 195 return -1;
192 onh++; 196 onh++;
193 } endfor_nexthops(fi); 197 } endfor_nexthops(fi);
@@ -205,10 +209,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
205 209
206static inline unsigned int fib_info_hashfn(const struct fib_info *fi) 210static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
207{ 211{
208 unsigned int mask = (fib_hash_size - 1); 212 unsigned int mask = (fib_info_hash_size - 1);
209 unsigned int val = fi->fib_nhs; 213 unsigned int val = fi->fib_nhs;
210 214
211 val ^= fi->fib_protocol; 215 val ^= (fi->fib_protocol << 8) | fi->fib_scope;
212 val ^= (__force u32)fi->fib_prefsrc; 216 val ^= (__force u32)fi->fib_prefsrc;
213 val ^= fi->fib_priority; 217 val ^= fi->fib_priority;
214 for_nexthops(fi) { 218 for_nexthops(fi) {
@@ -234,11 +238,12 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
234 if (fi->fib_nhs != nfi->fib_nhs) 238 if (fi->fib_nhs != nfi->fib_nhs)
235 continue; 239 continue;
236 if (nfi->fib_protocol == fi->fib_protocol && 240 if (nfi->fib_protocol == fi->fib_protocol &&
241 nfi->fib_scope == fi->fib_scope &&
237 nfi->fib_prefsrc == fi->fib_prefsrc && 242 nfi->fib_prefsrc == fi->fib_prefsrc &&
238 nfi->fib_priority == fi->fib_priority && 243 nfi->fib_priority == fi->fib_priority &&
239 memcmp(nfi->fib_metrics, fi->fib_metrics, 244 memcmp(nfi->fib_metrics, fi->fib_metrics,
240 sizeof(fi->fib_metrics)) == 0 && 245 sizeof(u32) * RTAX_MAX) == 0 &&
241 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && 246 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
242 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 247 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
243 return fi; 248 return fi;
244 } 249 }
@@ -247,9 +252,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
247} 252}
248 253
249/* Check, that the gateway is already configured. 254/* Check, that the gateway is already configured.
250 Used only by redirect accept routine. 255 * Used only by redirect accept routine.
251 */ 256 */
252
253int ip_fib_check_default(__be32 gw, struct net_device *dev) 257int ip_fib_check_default(__be32 gw, struct net_device *dev)
254{ 258{
255 struct hlist_head *head; 259 struct hlist_head *head;
@@ -264,7 +268,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
264 hlist_for_each_entry(nh, node, head, nh_hash) { 268 hlist_for_each_entry(nh, node, head, nh_hash) {
265 if (nh->nh_dev == dev && 269 if (nh->nh_dev == dev &&
266 nh->nh_gw == gw && 270 nh->nh_gw == gw &&
267 !(nh->nh_flags&RTNH_F_DEAD)) { 271 !(nh->nh_flags & RTNH_F_DEAD)) {
268 spin_unlock(&fib_info_lock); 272 spin_unlock(&fib_info_lock);
269 return 0; 273 return 0;
270 } 274 }
@@ -315,7 +319,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
315 goto errout; 319 goto errout;
316 320
317 err = fib_dump_info(skb, info->pid, seq, event, tb_id, 321 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
318 fa->fa_type, fa->fa_scope, key, dst_len, 322 fa->fa_type, key, dst_len,
319 fa->fa_tos, fa->fa_info, nlm_flags); 323 fa->fa_tos, fa->fa_info, nlm_flags);
320 if (err < 0) { 324 if (err < 0) {
321 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ 325 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -362,10 +366,10 @@ int fib_detect_death(struct fib_info *fi, int order,
362 } 366 }
363 if (state == NUD_REACHABLE) 367 if (state == NUD_REACHABLE)
364 return 0; 368 return 0;
365 if ((state&NUD_VALID) && order != dflt) 369 if ((state & NUD_VALID) && order != dflt)
366 return 0; 370 return 0;
367 if ((state&NUD_VALID) || 371 if ((state & NUD_VALID) ||
368 (*last_idx<0 && order > dflt)) { 372 (*last_idx < 0 && order > dflt)) {
369 *last_resort = fi; 373 *last_resort = fi;
370 *last_idx = order; 374 *last_idx = order;
371 } 375 }
@@ -407,7 +411,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
407 411
408 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 412 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
409 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; 413 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
410#ifdef CONFIG_NET_CLS_ROUTE 414#ifdef CONFIG_IP_ROUTE_CLASSID
411 nla = nla_find(attrs, attrlen, RTA_FLOW); 415 nla = nla_find(attrs, attrlen, RTA_FLOW);
412 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 416 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
413#endif 417#endif
@@ -461,7 +465,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
461 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 465 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
462 if (nla && nla_get_be32(nla) != nh->nh_gw) 466 if (nla && nla_get_be32(nla) != nh->nh_gw)
463 return 1; 467 return 1;
464#ifdef CONFIG_NET_CLS_ROUTE 468#ifdef CONFIG_IP_ROUTE_CLASSID
465 nla = nla_find(attrs, attrlen, RTA_FLOW); 469 nla = nla_find(attrs, attrlen, RTA_FLOW);
466 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 470 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
467 return 1; 471 return 1;
@@ -476,145 +480,146 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 480
477 481
478/* 482/*
479 Picture 483 * Picture
480 ------- 484 * -------
481 485 *
482 Semantics of nexthop is very messy by historical reasons. 486 * Semantics of nexthop is very messy by historical reasons.
483 We have to take into account, that: 487 * We have to take into account, that:
484 a) gateway can be actually local interface address, 488 * a) gateway can be actually local interface address,
485 so that gatewayed route is direct. 489 * so that gatewayed route is direct.
486 b) gateway must be on-link address, possibly 490 * b) gateway must be on-link address, possibly
487 described not by an ifaddr, but also by a direct route. 491 * described not by an ifaddr, but also by a direct route.
488 c) If both gateway and interface are specified, they should not 492 * c) If both gateway and interface are specified, they should not
489 contradict. 493 * contradict.
490 d) If we use tunnel routes, gateway could be not on-link. 494 * d) If we use tunnel routes, gateway could be not on-link.
491 495 *
492 Attempt to reconcile all of these (alas, self-contradictory) conditions 496 * Attempt to reconcile all of these (alas, self-contradictory) conditions
493 results in pretty ugly and hairy code with obscure logic. 497 * results in pretty ugly and hairy code with obscure logic.
494 498 *
 495 I chose to generalize it instead, so that the size 499 * I chose to generalize it instead, so that the size
496 of code does not increase practically, but it becomes 500 * of code does not increase practically, but it becomes
497 much more general. 501 * much more general.
498 Every prefix is assigned a "scope" value: "host" is local address, 502 * Every prefix is assigned a "scope" value: "host" is local address,
499 "link" is direct route, 503 * "link" is direct route,
500 [ ... "site" ... "interior" ... ] 504 * [ ... "site" ... "interior" ... ]
501 and "universe" is true gateway route with global meaning. 505 * and "universe" is true gateway route with global meaning.
502 506 *
503 Every prefix refers to a set of "nexthop"s (gw, oif), 507 * Every prefix refers to a set of "nexthop"s (gw, oif),
504 where gw must have narrower scope. This recursion stops 508 * where gw must have narrower scope. This recursion stops
505 when gw has LOCAL scope or if "nexthop" is declared ONLINK, 509 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
506 which means that gw is forced to be on link. 510 * which means that gw is forced to be on link.
507 511 *
508 Code is still hairy, but now it is apparently logically 512 * Code is still hairy, but now it is apparently logically
509 consistent and very flexible. F.e. as by-product it allows 513 * consistent and very flexible. F.e. as by-product it allows
 510 to co-exist in peace independent exterior and interior 514 * to co-exist in peace independent exterior and interior
511 routing processes. 515 * routing processes.
512 516 *
 513 Normally it looks as follows. 517 * Normally it looks as follows.
514 518 *
515 {universe prefix} -> (gw, oif) [scope link] 519 * {universe prefix} -> (gw, oif) [scope link]
516 | 520 * |
517 |-> {link prefix} -> (gw, oif) [scope local] 521 * |-> {link prefix} -> (gw, oif) [scope local]
518 | 522 * |
519 |-> {local prefix} (terminal node) 523 * |-> {local prefix} (terminal node)
520 */ 524 */
521
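The scope rule described in the comment above is easier to see in a toy resolver: every prefix carries a scope, a gateway must itself resolve through a prefix of strictly narrower scope, and the walk terminates at a host-scope (local) entry. The sketch below is a userspace-only illustration with invented names; the kernel performs the equivalent check one level at a time in fib_check_nh() via fib_lookup(), not recursively.

```c
/*
 * Toy illustration (not kernel code) of the scope-narrowing rule: larger
 * scope value == narrower scope, and a gateway must resolve through a
 * route of strictly narrower scope until a host-scope entry is reached.
 */
#include <stdio.h>

enum { SCOPE_UNIVERSE = 0, SCOPE_LINK = 253, SCOPE_HOST = 254 };

struct toy_route {
    unsigned int prefix;   /* pretend prefix id */
    int scope;             /* scope of this prefix */
    int gw_prefix;         /* prefix the gateway falls under, -1 = none */
};

/* Hypothetical table: 0 = default route, 1 = on-link subnet, 2 = local addr */
static const struct toy_route table[] = {
    { 0, SCOPE_UNIVERSE, 1 },   /* default via a gateway on subnet 1 */
    { 1, SCOPE_LINK,     2 },   /* subnet is reached via a local address */
    { 2, SCOPE_HOST,    -1 },   /* local interface address, terminal */
};

static int resolve_scope(int prefix, int min_scope)
{
    const struct toy_route *r = &table[prefix];

    if (r->scope < min_scope)
        return -1;              /* would not narrow: reject, like -EINVAL */
    if (r->scope >= SCOPE_HOST || r->gw_prefix < 0)
        return r->scope;        /* terminal node, the walk stops here */
    /* the gateway must resolve with a strictly narrower scope */
    return resolve_scope(r->gw_prefix, r->scope + 1);
}

int main(void)
{
    printf("default route resolves at scope %d\n",
           resolve_scope(0, SCOPE_UNIVERSE));
    return 0;
}
```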
522static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, 525static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
523 struct fib_nh *nh) 526 struct fib_nh *nh)
524{ 527{
525 int err; 528 int err;
526 struct net *net; 529 struct net *net;
530 struct net_device *dev;
527 531
528 net = cfg->fc_nlinfo.nl_net; 532 net = cfg->fc_nlinfo.nl_net;
529 if (nh->nh_gw) { 533 if (nh->nh_gw) {
530 struct fib_result res; 534 struct fib_result res;
531 535
532 if (nh->nh_flags&RTNH_F_ONLINK) { 536 if (nh->nh_flags & RTNH_F_ONLINK) {
533 struct net_device *dev;
534 537
535 if (cfg->fc_scope >= RT_SCOPE_LINK) 538 if (cfg->fc_scope >= RT_SCOPE_LINK)
536 return -EINVAL; 539 return -EINVAL;
537 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) 540 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
538 return -EINVAL; 541 return -EINVAL;
539 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) 542 dev = __dev_get_by_index(net, nh->nh_oif);
543 if (!dev)
540 return -ENODEV; 544 return -ENODEV;
541 if (!(dev->flags&IFF_UP)) 545 if (!(dev->flags & IFF_UP))
542 return -ENETDOWN; 546 return -ENETDOWN;
543 nh->nh_dev = dev; 547 nh->nh_dev = dev;
544 dev_hold(dev); 548 dev_hold(dev);
545 nh->nh_scope = RT_SCOPE_LINK; 549 nh->nh_scope = RT_SCOPE_LINK;
546 return 0; 550 return 0;
547 } 551 }
552 rcu_read_lock();
548 { 553 {
549 struct flowi fl = { 554 struct flowi4 fl4 = {
550 .nl_u = { 555 .daddr = nh->nh_gw,
551 .ip4_u = { 556 .flowi4_scope = cfg->fc_scope + 1,
552 .daddr = nh->nh_gw, 557 .flowi4_oif = nh->nh_oif,
553 .scope = cfg->fc_scope + 1,
554 },
555 },
556 .oif = nh->nh_oif,
557 }; 558 };
558 559
559 /* It is not necessary, but requires a bit of thinking */ 560 /* It is not necessary, but requires a bit of thinking */
560 if (fl.fl4_scope < RT_SCOPE_LINK) 561 if (fl4.flowi4_scope < RT_SCOPE_LINK)
561 fl.fl4_scope = RT_SCOPE_LINK; 562 fl4.flowi4_scope = RT_SCOPE_LINK;
562 if ((err = fib_lookup(net, &fl, &res)) != 0) 563 err = fib_lookup(net, &fl4, &res);
564 if (err) {
565 rcu_read_unlock();
563 return err; 566 return err;
567 }
564 } 568 }
565 err = -EINVAL; 569 err = -EINVAL;
566 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) 570 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
567 goto out; 571 goto out;
568 nh->nh_scope = res.scope; 572 nh->nh_scope = res.scope;
569 nh->nh_oif = FIB_RES_OIF(res); 573 nh->nh_oif = FIB_RES_OIF(res);
570 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) 574 nh->nh_dev = dev = FIB_RES_DEV(res);
575 if (!dev)
571 goto out; 576 goto out;
572 dev_hold(nh->nh_dev); 577 dev_hold(dev);
573 err = -ENETDOWN; 578 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
574 if (!(nh->nh_dev->flags & IFF_UP))
575 goto out;
576 err = 0;
577out:
578 fib_res_put(&res);
579 return err;
580 } else { 579 } else {
581 struct in_device *in_dev; 580 struct in_device *in_dev;
582 581
583 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) 582 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
584 return -EINVAL; 583 return -EINVAL;
585 584
585 rcu_read_lock();
586 err = -ENODEV;
586 in_dev = inetdev_by_index(net, nh->nh_oif); 587 in_dev = inetdev_by_index(net, nh->nh_oif);
587 if (in_dev == NULL) 588 if (in_dev == NULL)
588 return -ENODEV; 589 goto out;
589 if (!(in_dev->dev->flags&IFF_UP)) { 590 err = -ENETDOWN;
590 in_dev_put(in_dev); 591 if (!(in_dev->dev->flags & IFF_UP))
591 return -ENETDOWN; 592 goto out;
592 }
593 nh->nh_dev = in_dev->dev; 593 nh->nh_dev = in_dev->dev;
594 dev_hold(nh->nh_dev); 594 dev_hold(nh->nh_dev);
595 nh->nh_scope = RT_SCOPE_HOST; 595 nh->nh_scope = RT_SCOPE_HOST;
596 in_dev_put(in_dev); 596 err = 0;
597 } 597 }
598 return 0; 598out:
599 rcu_read_unlock();
600 return err;
599} 601}
600 602
601static inline unsigned int fib_laddr_hashfn(__be32 val) 603static inline unsigned int fib_laddr_hashfn(__be32 val)
602{ 604{
603 unsigned int mask = (fib_hash_size - 1); 605 unsigned int mask = (fib_info_hash_size - 1);
604 606
605 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; 607 return ((__force u32)val ^
608 ((__force u32)val >> 7) ^
609 ((__force u32)val >> 14)) & mask;
606} 610}
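fib_laddr_hashfn() folds the 32-bit address onto itself with two shifted XORs and masks with (table size - 1), which only spreads keys well when the table size is a power of two (fib_info_hash_size is always grown by doubling). A minimal userspace replica, ignoring byte order:

```c
/* Userspace replica of the xor-fold hash above, assuming a power-of-two
 * table size so that (size - 1) works as a mask. Byte order is ignored. */
#include <stdint.h>
#include <stdio.h>

static unsigned int laddr_hash(uint32_t val, unsigned int table_size)
{
    unsigned int mask = table_size - 1;   /* table_size must be 2^n */

    return (val ^ (val >> 7) ^ (val >> 14)) & mask;
}

int main(void)
{
    /* 192.0.2.1 and 192.0.2.2 in host byte order, just for illustration */
    uint32_t a = (192u << 24) | (2u << 8) | 1u;
    uint32_t b = (192u << 24) | (2u << 8) | 2u;

    printf("%u %u\n", laddr_hash(a, 256), laddr_hash(b, 256));
    return 0;
}
```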
607 611
608static struct hlist_head *fib_hash_alloc(int bytes) 612static struct hlist_head *fib_info_hash_alloc(int bytes)
609{ 613{
610 if (bytes <= PAGE_SIZE) 614 if (bytes <= PAGE_SIZE)
611 return kzalloc(bytes, GFP_KERNEL); 615 return kzalloc(bytes, GFP_KERNEL);
612 else 616 else
613 return (struct hlist_head *) 617 return (struct hlist_head *)
614 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); 618 __get_free_pages(GFP_KERNEL | __GFP_ZERO,
619 get_order(bytes));
615} 620}
616 621
617static void fib_hash_free(struct hlist_head *hash, int bytes) 622static void fib_info_hash_free(struct hlist_head *hash, int bytes)
618{ 623{
619 if (!hash) 624 if (!hash)
620 return; 625 return;
@@ -625,18 +630,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
625 free_pages((unsigned long) hash, get_order(bytes)); 630 free_pages((unsigned long) hash, get_order(bytes));
626} 631}
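fib_info_hash_alloc()/fib_info_hash_free() switch from the slab to whole zeroed pages once the table no longer fits in a single page. A rough userspace analogue of that size-threshold strategy (calloc below one page, anonymous mmap above; this is not the kernel API):

```c
/* Userspace analogue (not kernel code): small tables from the heap,
 * larger ones as whole zeroed pages, mirroring the kzalloc /
 * __get_free_pages split above. */
#define _DEFAULT_SOURCE
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

static void *table_alloc(size_t bytes)
{
    if (bytes <= (size_t)sysconf(_SC_PAGESIZE))
        return calloc(1, bytes);

    void *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    return p == MAP_FAILED ? NULL : p;   /* already zero-filled */
}

static void table_free(void *p, size_t bytes)
{
    if (!p)
        return;
    if (bytes <= (size_t)sysconf(_SC_PAGESIZE))
        free(p);
    else
        munmap(p, bytes);
}

int main(void)
{
    void *small = table_alloc(64);
    void *big   = table_alloc(16 * 4096);

    table_free(small, 64);
    table_free(big, 16 * 4096);
    return 0;
}
```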
627 632
628static void fib_hash_move(struct hlist_head *new_info_hash, 633static void fib_info_hash_move(struct hlist_head *new_info_hash,
629 struct hlist_head *new_laddrhash, 634 struct hlist_head *new_laddrhash,
630 unsigned int new_size) 635 unsigned int new_size)
631{ 636{
632 struct hlist_head *old_info_hash, *old_laddrhash; 637 struct hlist_head *old_info_hash, *old_laddrhash;
633 unsigned int old_size = fib_hash_size; 638 unsigned int old_size = fib_info_hash_size;
634 unsigned int i, bytes; 639 unsigned int i, bytes;
635 640
636 spin_lock_bh(&fib_info_lock); 641 spin_lock_bh(&fib_info_lock);
637 old_info_hash = fib_info_hash; 642 old_info_hash = fib_info_hash;
638 old_laddrhash = fib_info_laddrhash; 643 old_laddrhash = fib_info_laddrhash;
639 fib_hash_size = new_size; 644 fib_info_hash_size = new_size;
640 645
641 for (i = 0; i < old_size; i++) { 646 for (i = 0; i < old_size; i++) {
642 struct hlist_head *head = &fib_info_hash[i]; 647 struct hlist_head *head = &fib_info_hash[i];
@@ -677,8 +682,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
677 spin_unlock_bh(&fib_info_lock); 682 spin_unlock_bh(&fib_info_lock);
678 683
679 bytes = old_size * sizeof(struct hlist_head *); 684 bytes = old_size * sizeof(struct hlist_head *);
680 fib_hash_free(old_info_hash, bytes); 685 fib_info_hash_free(old_info_hash, bytes);
681 fib_hash_free(old_laddrhash, bytes); 686 fib_info_hash_free(old_laddrhash, bytes);
687}
688
689__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
690{
691 nh->nh_saddr = inet_select_addr(nh->nh_dev,
692 nh->nh_gw,
693 nh->nh_parent->fib_scope);
694 nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
695
696 return nh->nh_saddr;
682} 697}
683 698
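The new fib_info_update_nh_saddr() caches the preferred source address together with a generation number taken from the per-netns dev_addr_genid counter, so the cached value only has to be recomputed after an address change. A sketch of that generation-counter pattern, with made-up names and no locking:

```c
/* Sketch of the generation-counter caching idea: recompute the cached
 * value only when the global "address configuration changed" counter has
 * moved. All names here are invented for illustration. */
#include <stdio.h>

static unsigned int dev_addr_genid;       /* bumped on every address change */

struct cached_saddr {
    unsigned int value;
    unsigned int genid;
};

static unsigned int pick_source_address(void)
{
    return 0xc0000201u;                   /* pretend inet_select_addr() result */
}

static unsigned int get_saddr(struct cached_saddr *c)
{
    if (c->genid != dev_addr_genid) {     /* stale: addresses changed */
        c->value = pick_source_address();
        c->genid = dev_addr_genid;
    }
    return c->value;
}

int main(void)
{
    struct cached_saddr c = { 0, (unsigned int)-1 };

    printf("%#x\n", get_saddr(&c));       /* recomputed */
    dev_addr_genid++;                     /* e.g. an address was added */
    printf("%#x\n", get_saddr(&c));       /* recomputed again */
    return 0;
}
```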
684struct fib_info *fib_create_info(struct fib_config *cfg) 699struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -689,6 +704,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
689 int nhs = 1; 704 int nhs = 1;
690 struct net *net = cfg->fc_nlinfo.nl_net; 705 struct net *net = cfg->fc_nlinfo.nl_net;
691 706
707 if (cfg->fc_type > RTN_MAX)
708 goto err_inval;
709
692 /* Fast check to catch the most weird cases */ 710 /* Fast check to catch the most weird cases */
693 if (fib_props[cfg->fc_type].scope > cfg->fc_scope) 711 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
694 goto err_inval; 712 goto err_inval;
@@ -702,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
702#endif 720#endif
703 721
704 err = -ENOBUFS; 722 err = -ENOBUFS;
705 if (fib_info_cnt >= fib_hash_size) { 723 if (fib_info_cnt >= fib_info_hash_size) {
706 unsigned int new_size = fib_hash_size << 1; 724 unsigned int new_size = fib_info_hash_size << 1;
707 struct hlist_head *new_info_hash; 725 struct hlist_head *new_info_hash;
708 struct hlist_head *new_laddrhash; 726 struct hlist_head *new_laddrhash;
709 unsigned int bytes; 727 unsigned int bytes;
@@ -711,25 +729,32 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
711 if (!new_size) 729 if (!new_size)
712 new_size = 1; 730 new_size = 1;
713 bytes = new_size * sizeof(struct hlist_head *); 731 bytes = new_size * sizeof(struct hlist_head *);
714 new_info_hash = fib_hash_alloc(bytes); 732 new_info_hash = fib_info_hash_alloc(bytes);
715 new_laddrhash = fib_hash_alloc(bytes); 733 new_laddrhash = fib_info_hash_alloc(bytes);
716 if (!new_info_hash || !new_laddrhash) { 734 if (!new_info_hash || !new_laddrhash) {
717 fib_hash_free(new_info_hash, bytes); 735 fib_info_hash_free(new_info_hash, bytes);
718 fib_hash_free(new_laddrhash, bytes); 736 fib_info_hash_free(new_laddrhash, bytes);
719 } else 737 } else
720 fib_hash_move(new_info_hash, new_laddrhash, new_size); 738 fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
721 739
722 if (!fib_hash_size) 740 if (!fib_info_hash_size)
723 goto failure; 741 goto failure;
724 } 742 }
725 743
726 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); 744 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
727 if (fi == NULL) 745 if (fi == NULL)
728 goto failure; 746 goto failure;
747 if (cfg->fc_mx) {
748 fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
749 if (!fi->fib_metrics)
750 goto failure;
751 } else
752 fi->fib_metrics = (u32 *) dst_default_metrics;
729 fib_info_cnt++; 753 fib_info_cnt++;
730 754
731 fi->fib_net = hold_net(net); 755 fi->fib_net = hold_net(net);
732 fi->fib_protocol = cfg->fc_protocol; 756 fi->fib_protocol = cfg->fc_protocol;
757 fi->fib_scope = cfg->fc_scope;
733 fi->fib_flags = cfg->fc_flags; 758 fi->fib_flags = cfg->fc_flags;
734 fi->fib_priority = cfg->fc_priority; 759 fi->fib_priority = cfg->fc_priority;
735 fi->fib_prefsrc = cfg->fc_prefsrc; 760 fi->fib_prefsrc = cfg->fc_prefsrc;
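fib_create_info() now allocates a private RTAX_MAX metrics array only when the route actually carries metrics (cfg->fc_mx); everything else points at the shared, read-only dst_default_metrics. A simplified sketch of that sharing scheme (TOY_RTAX_MAX and the flag are stand-ins, not the kernel names):

```c
/* Sketch of the metrics-sharing scheme: routes with no explicit metrics
 * point at one shared, read-only default array instead of each carrying
 * their own copy. */
#include <stdint.h>
#include <stdlib.h>

#define TOY_RTAX_MAX 16

static const uint32_t default_metrics[TOY_RTAX_MAX];   /* all zeroes, shared */

struct toy_route {
    uint32_t *metrics;
};

static int route_init(struct toy_route *rt, int has_explicit_metrics)
{
    if (has_explicit_metrics) {
        rt->metrics = calloc(TOY_RTAX_MAX, sizeof(uint32_t));
        if (!rt->metrics)
            return -1;
    } else {
        /* cast away const: the shared array is never written through this */
        rt->metrics = (uint32_t *)default_metrics;
    }
    return 0;
}

static void route_release(struct toy_route *rt)
{
    if (rt->metrics != (uint32_t *)default_metrics)
        free(rt->metrics);
}

int main(void)
{
    struct toy_route plain, tuned;

    route_init(&plain, 0);    /* shares default_metrics */
    route_init(&tuned, 1);    /* gets its own writable array */
    tuned.metrics[1] = 1500;  /* e.g. a per-route metric */
    route_release(&tuned);
    route_release(&plain);
    return 0;
}
```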
@@ -763,7 +788,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
763 goto err_inval; 788 goto err_inval;
764 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) 789 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
765 goto err_inval; 790 goto err_inval;
766#ifdef CONFIG_NET_CLS_ROUTE 791#ifdef CONFIG_IP_ROUTE_CLASSID
767 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) 792 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
768 goto err_inval; 793 goto err_inval;
769#endif 794#endif
@@ -776,7 +801,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
776 nh->nh_oif = cfg->fc_oif; 801 nh->nh_oif = cfg->fc_oif;
777 nh->nh_gw = cfg->fc_gw; 802 nh->nh_gw = cfg->fc_gw;
778 nh->nh_flags = cfg->fc_flags; 803 nh->nh_flags = cfg->fc_flags;
779#ifdef CONFIG_NET_CLS_ROUTE 804#ifdef CONFIG_IP_ROUTE_CLASSID
780 nh->nh_tclassid = cfg->fc_flow; 805 nh->nh_tclassid = cfg->fc_flow;
781#endif 806#endif
782#ifdef CONFIG_IP_ROUTE_MULTIPATH 807#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -788,6 +813,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
788 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) 813 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
789 goto err_inval; 814 goto err_inval;
790 goto link_it; 815 goto link_it;
816 } else {
817 switch (cfg->fc_type) {
818 case RTN_UNICAST:
819 case RTN_LOCAL:
820 case RTN_BROADCAST:
821 case RTN_ANYCAST:
822 case RTN_MULTICAST:
823 break;
824 default:
825 goto err_inval;
826 }
791 } 827 }
792 828
793 if (cfg->fc_scope > RT_SCOPE_HOST) 829 if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -806,7 +842,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
806 goto failure; 842 goto failure;
807 } else { 843 } else {
808 change_nexthops(fi) { 844 change_nexthops(fi) {
809 if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) 845 err = fib_check_nh(cfg, fi, nexthop_nh);
846 if (err != 0)
810 goto failure; 847 goto failure;
811 } endfor_nexthops(fi) 848 } endfor_nexthops(fi)
812 } 849 }
@@ -818,8 +855,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
818 goto err_inval; 855 goto err_inval;
819 } 856 }
820 857
858 change_nexthops(fi) {
859 fib_info_update_nh_saddr(net, nexthop_nh);
860 } endfor_nexthops(fi)
861
821link_it: 862link_it:
822 if ((ofi = fib_find_info(fi)) != NULL) { 863 ofi = fib_find_info(fi);
864 if (ofi) {
823 fi->fib_dead = 1; 865 fi->fib_dead = 1;
824 free_fib_info(fi); 866 free_fib_info(fi);
825 ofi->fib_treeref++; 867 ofi->fib_treeref++;
@@ -862,86 +904,8 @@ failure:
862 return ERR_PTR(err); 904 return ERR_PTR(err);
863} 905}
864 906
865/* Note! fib_semantic_match intentionally uses RCU list functions. */
866int fib_semantic_match(struct list_head *head, const struct flowi *flp,
867 struct fib_result *res, int prefixlen)
868{
869 struct fib_alias *fa;
870 int nh_sel = 0;
871
872 list_for_each_entry_rcu(fa, head, fa_list) {
873 int err;
874
875 if (fa->fa_tos &&
876 fa->fa_tos != flp->fl4_tos)
877 continue;
878
879 if (fa->fa_scope < flp->fl4_scope)
880 continue;
881
882 fa->fa_state |= FA_S_ACCESSED;
883
884 err = fib_props[fa->fa_type].error;
885 if (err == 0) {
886 struct fib_info *fi = fa->fa_info;
887
888 if (fi->fib_flags & RTNH_F_DEAD)
889 continue;
890
891 switch (fa->fa_type) {
892 case RTN_UNICAST:
893 case RTN_LOCAL:
894 case RTN_BROADCAST:
895 case RTN_ANYCAST:
896 case RTN_MULTICAST:
897 for_nexthops(fi) {
898 if (nh->nh_flags&RTNH_F_DEAD)
899 continue;
900 if (!flp->oif || flp->oif == nh->nh_oif)
901 break;
902 }
903#ifdef CONFIG_IP_ROUTE_MULTIPATH
904 if (nhsel < fi->fib_nhs) {
905 nh_sel = nhsel;
906 goto out_fill_res;
907 }
908#else
909 if (nhsel < 1) {
910 goto out_fill_res;
911 }
912#endif
913 endfor_nexthops(fi);
914 continue;
915
916 default:
917 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
918 fa->fa_type);
919 return -EINVAL;
920 }
921 }
922 return err;
923 }
924 return 1;
925
926out_fill_res:
927 res->prefixlen = prefixlen;
928 res->nh_sel = nh_sel;
929 res->type = fa->fa_type;
930 res->scope = fa->fa_scope;
931 res->fi = fa->fa_info;
932 atomic_inc(&res->fi->fib_clntref);
933 return 0;
934}
935
936/* Find appropriate source address to this destination */
937
938__be32 __fib_res_prefsrc(struct fib_result *res)
939{
940 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
941}
942
943int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 907int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
944 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, 908 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
945 struct fib_info *fi, unsigned int flags) 909 struct fib_info *fi, unsigned int flags)
946{ 910{
947 struct nlmsghdr *nlh; 911 struct nlmsghdr *nlh;
@@ -963,7 +927,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
963 NLA_PUT_U32(skb, RTA_TABLE, tb_id); 927 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
964 rtm->rtm_type = type; 928 rtm->rtm_type = type;
965 rtm->rtm_flags = fi->fib_flags; 929 rtm->rtm_flags = fi->fib_flags;
966 rtm->rtm_scope = scope; 930 rtm->rtm_scope = fi->fib_scope;
967 rtm->rtm_protocol = fi->fib_protocol; 931 rtm->rtm_protocol = fi->fib_protocol;
968 932
969 if (rtm->rtm_dst_len) 933 if (rtm->rtm_dst_len)
@@ -984,7 +948,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
984 948
985 if (fi->fib_nh->nh_oif) 949 if (fi->fib_nh->nh_oif)
986 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 950 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
987#ifdef CONFIG_NET_CLS_ROUTE 951#ifdef CONFIG_IP_ROUTE_CLASSID
988 if (fi->fib_nh[0].nh_tclassid) 952 if (fi->fib_nh[0].nh_tclassid)
989 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 953 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
990#endif 954#endif
@@ -1009,7 +973,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1009 973
1010 if (nh->nh_gw) 974 if (nh->nh_gw)
1011 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 975 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1012#ifdef CONFIG_NET_CLS_ROUTE 976#ifdef CONFIG_IP_ROUTE_CLASSID
1013 if (nh->nh_tclassid) 977 if (nh->nh_tclassid)
1014 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 978 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1015#endif 979#endif
@@ -1028,10 +992,10 @@ nla_put_failure:
1028} 992}
1029 993
1030/* 994/*
1031 Update FIB if: 995 * Update FIB if:
1032 - local address disappeared -> we must delete all the entries 996 * - local address disappeared -> we must delete all the entries
1033 referring to it. 997 * referring to it.
1034 - device went down -> we must shutdown all nexthops going via it. 998 * - device went down -> we must shutdown all nexthops going via it.
1035 */ 999 */
1036int fib_sync_down_addr(struct net *net, __be32 local) 1000int fib_sync_down_addr(struct net *net, __be32 local)
1037{ 1001{
@@ -1078,7 +1042,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1078 prev_fi = fi; 1042 prev_fi = fi;
1079 dead = 0; 1043 dead = 0;
1080 change_nexthops(fi) { 1044 change_nexthops(fi) {
1081 if (nexthop_nh->nh_flags&RTNH_F_DEAD) 1045 if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1082 dead++; 1046 dead++;
1083 else if (nexthop_nh->nh_dev == dev && 1047 else if (nexthop_nh->nh_dev == dev &&
1084 nexthop_nh->nh_scope != scope) { 1048 nexthop_nh->nh_scope != scope) {
@@ -1107,13 +1071,68 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1107 return ret; 1071 return ret;
1108} 1072}
1109 1073
1074/* Must be invoked inside of an RCU protected region. */
1075void fib_select_default(struct fib_result *res)
1076{
1077 struct fib_info *fi = NULL, *last_resort = NULL;
1078 struct list_head *fa_head = res->fa_head;
1079 struct fib_table *tb = res->table;
1080 int order = -1, last_idx = -1;
1081 struct fib_alias *fa;
1082
1083 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1084 struct fib_info *next_fi = fa->fa_info;
1085
1086 if (next_fi->fib_scope != res->scope ||
1087 fa->fa_type != RTN_UNICAST)
1088 continue;
1089
1090 if (next_fi->fib_priority > res->fi->fib_priority)
1091 break;
1092 if (!next_fi->fib_nh[0].nh_gw ||
1093 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1094 continue;
1095
1096 fib_alias_accessed(fa);
1097
1098 if (fi == NULL) {
1099 if (next_fi != res->fi)
1100 break;
1101 } else if (!fib_detect_death(fi, order, &last_resort,
1102 &last_idx, tb->tb_default)) {
1103 fib_result_assign(res, fi);
1104 tb->tb_default = order;
1105 goto out;
1106 }
1107 fi = next_fi;
1108 order++;
1109 }
1110
1111 if (order <= 0 || fi == NULL) {
1112 tb->tb_default = -1;
1113 goto out;
1114 }
1115
1116 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1117 tb->tb_default)) {
1118 fib_result_assign(res, fi);
1119 tb->tb_default = order;
1120 goto out;
1121 }
1122
1123 if (last_idx >= 0)
1124 fib_result_assign(res, last_resort);
1125 tb->tb_default = last_idx;
1126out:
1127 return;
1128}
1129
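fib_select_default() above walks the candidate default routes in priority order, prefers the first one whose gateway neighbour still looks reachable (via fib_detect_death()), and keeps a last resort to fall back on. A heavily simplified skeleton of that selection, ignoring the tb_default caching and the exact ordering rules:

```c
/* Much-simplified skeleton, purely illustrative: walk candidates in
 * order, prefer the first one whose neighbour still looks alive, and
 * remember a fallback. The kernel logic is more involved. */
struct candidate {
    int alive;          /* stand-in for the neighbour-state check */
    int usable;         /* gateway present, scope link, etc. */
};

static int select_default(const struct candidate *c, int n)
{
    int last_resort = -1;

    for (int i = 0; i < n; i++) {
        if (!c[i].usable)
            continue;
        if (c[i].alive)
            return i;            /* first healthy candidate wins */
        if (last_resort < 0)
            last_resort = i;     /* keep something to fall back on */
    }
    return last_resort;          /* may still be -1: nothing usable */
}

int main(void)
{
    const struct candidate cands[] = {
        { .alive = 0, .usable = 1 },   /* preferred but neighbour dead */
        { .alive = 1, .usable = 1 },   /* healthy second choice */
    };

    return select_default(cands, 2) == 1 ? 0 : 1;
}
```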
1110#ifdef CONFIG_IP_ROUTE_MULTIPATH 1130#ifdef CONFIG_IP_ROUTE_MULTIPATH
1111 1131
1112/* 1132/*
1113 Dead device goes up. We wake up dead nexthops. 1133 * Dead device goes up. We wake up dead nexthops.
 1114 It makes sense only on multipath routes. 1134 * It makes sense only on multipath routes.
1115 */ 1135 */
1116
1117int fib_sync_up(struct net_device *dev) 1136int fib_sync_up(struct net_device *dev)
1118{ 1137{
1119 struct fib_info *prev_fi; 1138 struct fib_info *prev_fi;
@@ -1123,7 +1142,7 @@ int fib_sync_up(struct net_device *dev)
1123 struct fib_nh *nh; 1142 struct fib_nh *nh;
1124 int ret; 1143 int ret;
1125 1144
1126 if (!(dev->flags&IFF_UP)) 1145 if (!(dev->flags & IFF_UP))
1127 return 0; 1146 return 0;
1128 1147
1129 prev_fi = NULL; 1148 prev_fi = NULL;
@@ -1142,12 +1161,12 @@ int fib_sync_up(struct net_device *dev)
1142 prev_fi = fi; 1161 prev_fi = fi;
1143 alive = 0; 1162 alive = 0;
1144 change_nexthops(fi) { 1163 change_nexthops(fi) {
1145 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1164 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1146 alive++; 1165 alive++;
1147 continue; 1166 continue;
1148 } 1167 }
1149 if (nexthop_nh->nh_dev == NULL || 1168 if (nexthop_nh->nh_dev == NULL ||
1150 !(nexthop_nh->nh_dev->flags&IFF_UP)) 1169 !(nexthop_nh->nh_dev->flags & IFF_UP))
1151 continue; 1170 continue;
1152 if (nexthop_nh->nh_dev != dev || 1171 if (nexthop_nh->nh_dev != dev ||
1153 !__in_dev_get_rtnl(dev)) 1172 !__in_dev_get_rtnl(dev))
@@ -1169,11 +1188,10 @@ int fib_sync_up(struct net_device *dev)
1169} 1188}
1170 1189
1171/* 1190/*
1172 The algorithm is suboptimal, but it provides really 1191 * The algorithm is suboptimal, but it provides really
1173 fair weighted route distribution. 1192 * fair weighted route distribution.
1174 */ 1193 */
1175 1194void fib_select_multipath(struct fib_result *res)
1176void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177{ 1195{
1178 struct fib_info *fi = res->fi; 1196 struct fib_info *fi = res->fi;
1179 int w; 1197 int w;
@@ -1182,7 +1200,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1182 if (fi->fib_power <= 0) { 1200 if (fi->fib_power <= 0) {
1183 int power = 0; 1201 int power = 0;
1184 change_nexthops(fi) { 1202 change_nexthops(fi) {
1185 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { 1203 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1186 power += nexthop_nh->nh_weight; 1204 power += nexthop_nh->nh_weight;
1187 nexthop_nh->nh_power = nexthop_nh->nh_weight; 1205 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1188 } 1206 }
@@ -1198,15 +1216,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1198 1216
1199 1217
1200 /* w should be random number [0..fi->fib_power-1], 1218 /* w should be random number [0..fi->fib_power-1],
1201 it is pretty bad approximation. 1219 * it is pretty bad approximation.
1202 */ 1220 */
1203 1221
1204 w = jiffies % fi->fib_power; 1222 w = jiffies % fi->fib_power;
1205 1223
1206 change_nexthops(fi) { 1224 change_nexthops(fi) {
1207 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && 1225 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
1208 nexthop_nh->nh_power) { 1226 nexthop_nh->nh_power) {
1209 if ((w -= nexthop_nh->nh_power) <= 0) { 1227 w -= nexthop_nh->nh_power;
1228 if (w <= 0) {
1210 nexthop_nh->nh_power--; 1229 nexthop_nh->nh_power--;
1211 fi->fib_power--; 1230 fi->fib_power--;
1212 res->nh_sel = nhsel; 1231 res->nh_sel = nhsel;
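fib_select_multipath() implements weighted selection by giving each nexthop a credit ("power") refilled from its weight, drawing a pseudo-random value in [0, total power) and charging it against the nexthops in order. A standalone version that can be compiled and run to check the resulting split (rand() stands in for jiffies, all paths assumed alive):

```c
/* Standalone weighted nexthop selection in the style of the code above.
 * Error handling omitted; names are not the kernel's. */
#include <stdio.h>
#include <stdlib.h>

struct path {
    int weight;
    int power;
};

static int select_path(struct path *p, int n, int *total_power)
{
    if (*total_power <= 0) {              /* refill credits from weights */
        *total_power = 0;
        for (int i = 0; i < n; i++) {
            p[i].power = p[i].weight;
            *total_power += p[i].weight;
        }
    }

    int w = rand() % *total_power;        /* kernel uses jiffies here */
    for (int i = 0; i < n; i++) {
        if (p[i].power <= 0)
            continue;
        w -= p[i].power;
        if (w <= 0) {
            p[i].power--;
            (*total_power)--;
            return i;
        }
    }
    return 0;                             /* not reached with sane input */
}

int main(void)
{
    struct path paths[2] = { { .weight = 3 }, { .weight = 1 } };
    int total = 0, hits[2] = { 0, 0 };

    for (int i = 0; i < 4000; i++)
        hits[select_path(paths, 2, &total)]++;
    printf("path0=%d path1=%d (3:1 split)\n", hits[0], hits[1]);
    return 0;
}
```

Because each draw spends one unit of credit and the pool is only refilled when empty, every refill cycle hands out picks exactly in proportion to the weights, whatever the random source does.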
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4a8e370862bc..58c25ea5a5c1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -12,11 +12,11 @@
12 * 12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet 13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 * 14 *
15 * This work is based on the LPC-trie which is originally descibed in: 15 * This work is based on the LPC-trie which is originally described in:
16 * 16 *
17 * An experimental study of compression methods for dynamic tries 17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 19 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20 * 20 *
21 * 21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
@@ -72,6 +72,7 @@
72#include <linux/init.h> 72#include <linux/init.h>
73#include <linux/list.h> 73#include <linux/list.h>
74#include <linux/slab.h> 74#include <linux/slab.h>
75#include <linux/prefetch.h>
75#include <net/net_namespace.h> 76#include <net/net_namespace.h>
76#include <net/ip.h> 77#include <net/ip.h>
77#include <net/protocol.h> 78#include <net/protocol.h>
@@ -95,7 +96,7 @@ typedef unsigned int t_key;
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 96#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF) 97#define IS_LEAF(n) (n->parent & T_LEAF)
97 98
98struct node { 99struct rt_trie_node {
99 unsigned long parent; 100 unsigned long parent;
100 t_key key; 101 t_key key;
101}; 102};
@@ -126,7 +127,7 @@ struct tnode {
126 struct work_struct work; 127 struct work_struct work;
127 struct tnode *tnode_free; 128 struct tnode *tnode_free;
128 }; 129 };
129 struct node *child[0]; 130 struct rt_trie_node __rcu *child[0];
130}; 131};
131 132
132#ifdef CONFIG_IP_FIB_TRIE_STATS 133#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +152,16 @@ struct trie_stat {
151}; 152};
152 153
153struct trie { 154struct trie {
154 struct node *trie; 155 struct rt_trie_node __rcu *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 156#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 157 struct trie_use_stats stats;
157#endif 158#endif
158}; 159};
159 160
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 161static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 int wasfull); 163 int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn); 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn); 165static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn); 166static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */ 167/* tnodes to free after resize(); protected by RTNL */
@@ -177,43 +178,58 @@ static const int sync_pages = 128;
177static struct kmem_cache *fn_alias_kmem __read_mostly; 178static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly; 179static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 180
180static inline struct tnode *node_parent(struct node *node) 181/*
182 * caller must hold RTNL
183 */
184static inline struct tnode *node_parent(const struct rt_trie_node *node)
181{ 185{
182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); 186 unsigned long parent;
187
188 parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
189
190 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
183} 191}
184 192
185static inline struct tnode *node_parent_rcu(struct node *node) 193/*
194 * caller must hold RCU read lock or RTNL
195 */
196static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
186{ 197{
187 struct tnode *ret = node_parent(node); 198 unsigned long parent;
199
200 parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
201 lockdep_rtnl_is_held());
188 202
189 return rcu_dereference_check(ret, 203 return (struct tnode *)(parent & ~NODE_TYPE_MASK);
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
192} 204}
193 205
194/* Same as rcu_assign_pointer 206/* Same as rcu_assign_pointer
195 * but that macro() assumes that value is a pointer. 207 * but that macro() assumes that value is a pointer.
196 */ 208 */
197static inline void node_set_parent(struct node *node, struct tnode *ptr) 209static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
198{ 210{
199 smp_wmb(); 211 smp_wmb();
200 node->parent = (unsigned long)ptr | NODE_TYPE(node); 212 node->parent = (unsigned long)ptr | NODE_TYPE(node);
201} 213}
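node_parent()/node_set_parent() keep the node's own type (leaf vs. internal tnode) in the low bit of its parent word and mask that bit off when reading the pointer back, relying on struct alignment to keep the low pointer bits free. A minimal tagged-pointer illustration with invented names:

```c
/* Minimal tagged-pointer sketch: the type bit describes the node itself
 * and lives in the low bit of its parent word. Struct alignment keeps
 * that bit zero in a real pointer. */
#include <assert.h>
#include <stdio.h>

#define TOY_T_LEAF    1UL
#define TOY_TYPE_MASK 1UL

struct toy_node {
    unsigned long parent;   /* parent pointer | own type bit */
};

static void set_parent(struct toy_node *n, struct toy_node *parent, int is_leaf)
{
    n->parent = (unsigned long)parent | (is_leaf ? TOY_T_LEAF : 0);
}

static struct toy_node *get_parent(const struct toy_node *n)
{
    return (struct toy_node *)(n->parent & ~TOY_TYPE_MASK);
}

static int node_is_leaf(const struct toy_node *n)
{
    return n->parent & TOY_T_LEAF;
}

int main(void)
{
    struct toy_node root = { 0 }, leaf = { 0 };

    set_parent(&leaf, &root, 1);
    assert(get_parent(&leaf) == &root);
    printf("leaf? %d\n", node_is_leaf(&leaf));
    return 0;
}
```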
202 214
203static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) 215/*
216 * caller must hold RTNL
217 */
218static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
204{ 219{
205 BUG_ON(i >= 1U << tn->bits); 220 BUG_ON(i >= 1U << tn->bits);
206 221
207 return tn->child[i]; 222 return rtnl_dereference(tn->child[i]);
208} 223}
209 224
210static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) 225/*
226 * caller must hold RCU read lock or RTNL
227 */
228static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
211{ 229{
212 struct node *ret = tnode_get_child(tn, i); 230 BUG_ON(i >= 1U << tn->bits);
213 231
214 return rcu_dereference_check(ret, 232 return rcu_dereference_rtnl(tn->child[i]);
215 rcu_read_lock_held() ||
216 lockdep_rtnl_is_held());
217} 233}
218 234
219static inline int tnode_child_length(const struct tnode *tn) 235static inline int tnode_child_length(const struct tnode *tn)
@@ -221,12 +237,12 @@ static inline int tnode_child_length(const struct tnode *tn)
221 return 1 << tn->bits; 237 return 1 << tn->bits;
222} 238}
223 239
224static inline t_key mask_pfx(t_key k, unsigned short l) 240static inline t_key mask_pfx(t_key k, unsigned int l)
225{ 241{
226 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 242 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
227} 243}
228 244
229static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 245static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
230{ 246{
231 if (offset < KEYLENGTH) 247 if (offset < KEYLENGTH)
232 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 248 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
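mask_pfx() and tkey_extract_bits() are plain shift tricks on the 32-bit key: keep the top l bits, or pull out a bits-wide chunk starting offset bits from the most significant end. A userspace copy with a tiny check (host byte order, purely illustrative):

```c
/* Userspace copy of the two key helpers above; KEYLENGTH is 32 for IPv4. */
#include <stdint.h>
#include <stdio.h>

#define KEYLENGTH 32
typedef uint32_t t_key;

static t_key mask_pfx(t_key k, unsigned int l)
{
    return (l == 0) ? 0 : k >> (KEYLENGTH - l) << (KEYLENGTH - l);
}

static t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
    if (offset < KEYLENGTH)
        return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
    return 0;
}

int main(void)
{
    t_key key = 0xc0000201;   /* 192.0.2.1 as a host-order key */

    /* top 24 bits of the key, i.e. the /24 prefix */
    printf("prefix: %#x\n", mask_pfx(key, 24));
    /* 8 bits starting at the MSB: the first octet, 192 */
    printf("octet0: %u\n", tkey_extract_bits(key, 0, 8));
    return 0;
}
```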
@@ -354,14 +370,9 @@ static inline void free_leaf(struct leaf *l)
354 call_rcu_bh(&l->rcu, __leaf_free_rcu); 370 call_rcu_bh(&l->rcu, __leaf_free_rcu);
355} 371}
356 372
357static void __leaf_info_free_rcu(struct rcu_head *head)
358{
359 kfree(container_of(head, struct leaf_info, rcu));
360}
361
362static inline void free_leaf_info(struct leaf_info *leaf) 373static inline void free_leaf_info(struct leaf_info *leaf)
363{ 374{
364 call_rcu(&leaf->rcu, __leaf_info_free_rcu); 375 kfree_rcu(leaf, rcu);
365} 376}
366 377
367static struct tnode *tnode_alloc(size_t size) 378static struct tnode *tnode_alloc(size_t size)
@@ -369,7 +380,7 @@ static struct tnode *tnode_alloc(size_t size)
369 if (size <= PAGE_SIZE) 380 if (size <= PAGE_SIZE)
370 return kzalloc(size, GFP_KERNEL); 381 return kzalloc(size, GFP_KERNEL);
371 else 382 else
372 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 383 return vzalloc(size);
373} 384}
374 385
375static void __tnode_vfree(struct work_struct *arg) 386static void __tnode_vfree(struct work_struct *arg)
@@ -382,7 +393,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
382{ 393{
383 struct tnode *tn = container_of(head, struct tnode, rcu); 394 struct tnode *tn = container_of(head, struct tnode, rcu);
384 size_t size = sizeof(struct tnode) + 395 size_t size = sizeof(struct tnode) +
385 (sizeof(struct node *) << tn->bits); 396 (sizeof(struct rt_trie_node *) << tn->bits);
386 397
387 if (size <= PAGE_SIZE) 398 if (size <= PAGE_SIZE)
388 kfree(tn); 399 kfree(tn);
@@ -406,7 +417,7 @@ static void tnode_free_safe(struct tnode *tn)
406 tn->tnode_free = tnode_free_head; 417 tn->tnode_free = tnode_free_head;
407 tnode_free_head = tn; 418 tnode_free_head = tn;
408 tnode_free_size += sizeof(struct tnode) + 419 tnode_free_size += sizeof(struct tnode) +
409 (sizeof(struct node *) << tn->bits); 420 (sizeof(struct rt_trie_node *) << tn->bits);
410} 421}
411 422
412static void tnode_free_flush(void) 423static void tnode_free_flush(void)
@@ -447,7 +458,7 @@ static struct leaf_info *leaf_info_new(int plen)
447 458
448static struct tnode *tnode_new(t_key key, int pos, int bits) 459static struct tnode *tnode_new(t_key key, int pos, int bits)
449{ 460{
450 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); 461 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
451 struct tnode *tn = tnode_alloc(sz); 462 struct tnode *tn = tnode_alloc(sz);
452 463
453 if (tn) { 464 if (tn) {
@@ -459,8 +470,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
459 tn->empty_children = 1<<bits; 470 tn->empty_children = 1<<bits;
460 } 471 }
461 472
462 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode), 473 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
463 (unsigned long) (sizeof(struct node) << bits)); 474 sizeof(struct rt_trie_node) << bits);
464 return tn; 475 return tn;
465} 476}
466 477
@@ -469,7 +480,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
469 * and no bits are skipped. See discussion in dyntree paper p. 6 480 * and no bits are skipped. See discussion in dyntree paper p. 6
470 */ 481 */
471 482
472static inline int tnode_full(const struct tnode *tn, const struct node *n) 483static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
473{ 484{
474 if (n == NULL || IS_LEAF(n)) 485 if (n == NULL || IS_LEAF(n))
475 return 0; 486 return 0;
@@ -478,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
478} 489}
479 490
480static inline void put_child(struct trie *t, struct tnode *tn, int i, 491static inline void put_child(struct trie *t, struct tnode *tn, int i,
481 struct node *n) 492 struct rt_trie_node *n)
482{ 493{
483 tnode_put_child_reorg(tn, i, n, -1); 494 tnode_put_child_reorg(tn, i, n, -1);
484} 495}
@@ -488,10 +499,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
488 * Update the value of full_children and empty_children. 499 * Update the value of full_children and empty_children.
489 */ 500 */
490 501
491static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 502static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
492 int wasfull) 503 int wasfull)
493{ 504{
494 struct node *chi = tn->child[i]; 505 struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
495 int isfull; 506 int isfull;
496 507
497 BUG_ON(i >= 1<<tn->bits); 508 BUG_ON(i >= 1<<tn->bits);
@@ -519,7 +530,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
519} 530}
520 531
521#define MAX_WORK 10 532#define MAX_WORK 10
522static struct node *resize(struct trie *t, struct tnode *tn) 533static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
523{ 534{
524 int i; 535 int i;
525 struct tnode *old_tn; 536 struct tnode *old_tn;
@@ -609,11 +620,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
609 620
610 /* Keep root node larger */ 621 /* Keep root node larger */
611 622
612 if (!node_parent((struct node*) tn)) { 623 if (!node_parent((struct rt_trie_node *)tn)) {
613 inflate_threshold_use = inflate_threshold_root; 624 inflate_threshold_use = inflate_threshold_root;
614 halve_threshold_use = halve_threshold_root; 625 halve_threshold_use = halve_threshold_root;
615 } 626 } else {
616 else {
617 inflate_threshold_use = inflate_threshold; 627 inflate_threshold_use = inflate_threshold;
618 halve_threshold_use = halve_threshold; 628 halve_threshold_use = halve_threshold;
619 } 629 }
@@ -639,8 +649,8 @@ static struct node *resize(struct trie *t, struct tnode *tn)
639 check_tnode(tn); 649 check_tnode(tn);
640 650
641 /* Return if at least one inflate is run */ 651 /* Return if at least one inflate is run */
642 if( max_work != MAX_WORK) 652 if (max_work != MAX_WORK)
643 return (struct node *) tn; 653 return (struct rt_trie_node *) tn;
644 654
645 /* 655 /*
646 * Halve as long as the number of empty children in this 656 * Halve as long as the number of empty children in this
@@ -668,9 +678,9 @@ static struct node *resize(struct trie *t, struct tnode *tn)
668 if (tn->empty_children == tnode_child_length(tn) - 1) { 678 if (tn->empty_children == tnode_child_length(tn) - 1) {
669one_child: 679one_child:
670 for (i = 0; i < tnode_child_length(tn); i++) { 680 for (i = 0; i < tnode_child_length(tn); i++) {
671 struct node *n; 681 struct rt_trie_node *n;
672 682
673 n = tn->child[i]; 683 n = rtnl_dereference(tn->child[i]);
674 if (!n) 684 if (!n)
675 continue; 685 continue;
676 686
@@ -681,7 +691,21 @@ one_child:
681 return n; 691 return n;
682 } 692 }
683 } 693 }
684 return (struct node *) tn; 694 return (struct rt_trie_node *) tn;
695}
696
697
698static void tnode_clean_free(struct tnode *tn)
699{
700 int i;
701 struct tnode *tofree;
702
703 for (i = 0; i < tnode_child_length(tn); i++) {
704 tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
705 if (tofree)
706 tnode_free(tofree);
707 }
708 tnode_free(tn);
685} 709}
686 710
687static struct tnode *inflate(struct trie *t, struct tnode *tn) 711static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -728,14 +752,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
728 goto nomem; 752 goto nomem;
729 } 753 }
730 754
731 put_child(t, tn, 2*i, (struct node *) left); 755 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
732 put_child(t, tn, 2*i+1, (struct node *) right); 756 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
733 } 757 }
734 } 758 }
735 759
736 for (i = 0; i < olen; i++) { 760 for (i = 0; i < olen; i++) {
737 struct tnode *inode; 761 struct tnode *inode;
738 struct node *node = tnode_get_child(oldtnode, i); 762 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
739 struct tnode *left, *right; 763 struct tnode *left, *right;
740 int size, j; 764 int size, j;
741 765
@@ -760,8 +784,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
760 inode = (struct tnode *) node; 784 inode = (struct tnode *) node;
761 785
762 if (inode->bits == 1) { 786 if (inode->bits == 1) {
763 put_child(t, tn, 2*i, inode->child[0]); 787 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
764 put_child(t, tn, 2*i+1, inode->child[1]); 788 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
765 789
766 tnode_free_safe(inode); 790 tnode_free_safe(inode);
767 continue; 791 continue;
@@ -802,8 +826,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
802 826
803 size = tnode_child_length(left); 827 size = tnode_child_length(left);
804 for (j = 0; j < size; j++) { 828 for (j = 0; j < size; j++) {
805 put_child(t, left, j, inode->child[j]); 829 put_child(t, left, j, rtnl_dereference(inode->child[j]));
806 put_child(t, right, j, inode->child[j + size]); 830 put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
807 } 831 }
808 put_child(t, tn, 2*i, resize(t, left)); 832 put_child(t, tn, 2*i, resize(t, left));
809 put_child(t, tn, 2*i+1, resize(t, right)); 833 put_child(t, tn, 2*i+1, resize(t, right));
@@ -813,24 +837,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
813 tnode_free_safe(oldtnode); 837 tnode_free_safe(oldtnode);
814 return tn; 838 return tn;
815nomem: 839nomem:
816 { 840 tnode_clean_free(tn);
817 int size = tnode_child_length(tn); 841 return ERR_PTR(-ENOMEM);
818 int j;
819
820 for (j = 0; j < size; j++)
821 if (tn->child[j])
822 tnode_free((struct tnode *)tn->child[j]);
823
824 tnode_free(tn);
825
826 return ERR_PTR(-ENOMEM);
827 }
828} 842}
829 843
830static struct tnode *halve(struct trie *t, struct tnode *tn) 844static struct tnode *halve(struct trie *t, struct tnode *tn)
831{ 845{
832 struct tnode *oldtnode = tn; 846 struct tnode *oldtnode = tn;
833 struct node *left, *right; 847 struct rt_trie_node *left, *right;
834 int i; 848 int i;
835 int olen = tnode_child_length(tn); 849 int olen = tnode_child_length(tn);
836 850
@@ -861,7 +875,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
861 if (!newn) 875 if (!newn)
862 goto nomem; 876 goto nomem;
863 877
864 put_child(t, tn, i/2, (struct node *)newn); 878 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
865 } 879 }
866 880
867 } 881 }
@@ -895,18 +909,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
895 tnode_free_safe(oldtnode); 909 tnode_free_safe(oldtnode);
896 return tn; 910 return tn;
897nomem: 911nomem:
898 { 912 tnode_clean_free(tn);
899 int size = tnode_child_length(tn); 913 return ERR_PTR(-ENOMEM);
900 int j;
901
902 for (j = 0; j < size; j++)
903 if (tn->child[j])
904 tnode_free((struct tnode *)tn->child[j]);
905
906 tnode_free(tn);
907
908 return ERR_PTR(-ENOMEM);
909 }
910} 914}
911 915
912/* readside must use rcu_read_lock currently dump routines 916/* readside must use rcu_read_lock currently dump routines
@@ -963,12 +967,10 @@ fib_find_node(struct trie *t, u32 key)
963{ 967{
964 int pos; 968 int pos;
965 struct tnode *tn; 969 struct tnode *tn;
966 struct node *n; 970 struct rt_trie_node *n;
967 971
968 pos = 0; 972 pos = 0;
969 n = rcu_dereference_check(t->trie, 973 n = rcu_dereference_rtnl(t->trie);
970 rcu_read_lock_held() ||
971 lockdep_rtnl_is_held());
972 974
973 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 975 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
974 tn = (struct tnode *) n; 976 tn = (struct tnode *) n;
@@ -1000,17 +1002,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1000 1002
1001 key = tn->key; 1003 key = tn->key;
1002 1004
1003 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 1005 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1004 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1006 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1005 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1007 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1006 tn = (struct tnode *) resize(t, (struct tnode *)tn); 1008 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1007 1009
1008 tnode_put_child_reorg((struct tnode *)tp, cindex, 1010 tnode_put_child_reorg((struct tnode *)tp, cindex,
1009 (struct node *)tn, wasfull); 1011 (struct rt_trie_node *)tn, wasfull);
1010 1012
1011 tp = node_parent((struct node *) tn); 1013 tp = node_parent((struct rt_trie_node *) tn);
1012 if (!tp) 1014 if (!tp)
1013 rcu_assign_pointer(t->trie, (struct node *)tn); 1015 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1014 1016
1015 tnode_free_flush(); 1017 tnode_free_flush();
1016 if (!tp) 1018 if (!tp)
@@ -1022,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1022 if (IS_TNODE(tn)) 1024 if (IS_TNODE(tn))
1023 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1025 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1024 1026
1025 rcu_assign_pointer(t->trie, (struct node *)tn); 1027 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1026 tnode_free_flush(); 1028 tnode_free_flush();
1027} 1029}
1028 1030
@@ -1032,7 +1034,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1032{ 1034{
1033 int pos, newpos; 1035 int pos, newpos;
1034 struct tnode *tp = NULL, *tn = NULL; 1036 struct tnode *tp = NULL, *tn = NULL;
1035 struct node *n; 1037 struct rt_trie_node *n;
1036 struct leaf *l; 1038 struct leaf *l;
1037 int missbit; 1039 int missbit;
1038 struct list_head *fa_head = NULL; 1040 struct list_head *fa_head = NULL;
@@ -1040,7 +1042,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1040 t_key cindex; 1042 t_key cindex;
1041 1043
1042 pos = 0; 1044 pos = 0;
1043 n = t->trie; 1045 n = rtnl_dereference(t->trie);
1044 1046
1045 /* If we point to NULL, stop. Either the tree is empty and we should 1047 /* If we point to NULL, stop. Either the tree is empty and we should
1046 * just put a new leaf in if, or we have reached an empty child slot, 1048 * just put a new leaf in if, or we have reached an empty child slot,
@@ -1118,10 +1120,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1118 if (t->trie && n == NULL) { 1120 if (t->trie && n == NULL) {
1119 /* Case 2: n is NULL, and will just insert a new leaf */ 1121 /* Case 2: n is NULL, and will just insert a new leaf */
1120 1122
1121 node_set_parent((struct node *)l, tp); 1123 node_set_parent((struct rt_trie_node *)l, tp);
1122 1124
1123 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1124 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1126 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1125 } else { 1127 } else {
1126 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1128 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1127 /* 1129 /*
@@ -1148,18 +1150,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1148 return NULL; 1150 return NULL;
1149 } 1151 }
1150 1152
1151 node_set_parent((struct node *)tn, tp); 1153 node_set_parent((struct rt_trie_node *)tn, tp);
1152 1154
1153 missbit = tkey_extract_bits(key, newpos, 1); 1155 missbit = tkey_extract_bits(key, newpos, 1);
1154 put_child(t, tn, missbit, (struct node *)l); 1156 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1155 put_child(t, tn, 1-missbit, n); 1157 put_child(t, tn, 1-missbit, n);
1156 1158
1157 if (tp) { 1159 if (tp) {
1158 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1160 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1159 put_child(t, (struct tnode *)tp, cindex, 1161 put_child(t, (struct tnode *)tp, cindex,
1160 (struct node *)tn); 1162 (struct rt_trie_node *)tn);
1161 } else { 1163 } else {
1162 rcu_assign_pointer(t->trie, (struct node *)tn); 1164 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1163 tp = tn; 1165 tp = tn;
1164 } 1166 }
1165 } 1167 }
@@ -1252,7 +1254,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1252 if (fa->fa_info->fib_priority != fi->fib_priority) 1254 if (fa->fa_info->fib_priority != fi->fib_priority)
1253 break; 1255 break;
1254 if (fa->fa_type == cfg->fc_type && 1256 if (fa->fa_type == cfg->fc_type &&
1255 fa->fa_scope == cfg->fc_scope &&
1256 fa->fa_info == fi) { 1257 fa->fa_info == fi) {
1257 fa_match = fa; 1258 fa_match = fa;
1258 break; 1259 break;
@@ -1278,7 +1279,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1278 new_fa->fa_tos = fa->fa_tos; 1279 new_fa->fa_tos = fa->fa_tos;
1279 new_fa->fa_info = fi; 1280 new_fa->fa_info = fi;
1280 new_fa->fa_type = cfg->fc_type; 1281 new_fa->fa_type = cfg->fc_type;
1281 new_fa->fa_scope = cfg->fc_scope;
1282 state = fa->fa_state; 1282 state = fa->fa_state;
1283 new_fa->fa_state = state & ~FA_S_ACCESSED; 1283 new_fa->fa_state = state & ~FA_S_ACCESSED;
1284 1284
@@ -1315,7 +1315,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1315 new_fa->fa_info = fi; 1315 new_fa->fa_info = fi;
1316 new_fa->fa_tos = tos; 1316 new_fa->fa_tos = tos;
1317 new_fa->fa_type = cfg->fc_type; 1317 new_fa->fa_type = cfg->fc_type;
1318 new_fa->fa_scope = cfg->fc_scope;
1319 new_fa->fa_state = 0; 1318 new_fa->fa_state = 0;
1320 /* 1319 /*
1321 * Insert new entry to the list. 1320 * Insert new entry to the list.
@@ -1329,6 +1328,9 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1329 } 1328 }
1330 } 1329 }
1331 1330
1331 if (!plen)
1332 tb->tb_num_default++;
1333
1332 list_add_tail_rcu(&new_fa->fa_list, 1334 list_add_tail_rcu(&new_fa->fa_list,
1333 (fa ? &fa->fa_list : fa_head)); 1335 (fa ? &fa->fa_list : fa_head));
1334 1336
@@ -1347,52 +1349,86 @@ err:
1347} 1349}
1348 1350
1349/* should be called with rcu_read_lock */ 1351/* should be called with rcu_read_lock */
1350static int check_leaf(struct trie *t, struct leaf *l, 1352static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1351 t_key key, const struct flowi *flp, 1353 t_key key, const struct flowi4 *flp,
1352 struct fib_result *res) 1354 struct fib_result *res, int fib_flags)
1353{ 1355{
1354 struct leaf_info *li; 1356 struct leaf_info *li;
1355 struct hlist_head *hhead = &l->list; 1357 struct hlist_head *hhead = &l->list;
1356 struct hlist_node *node; 1358 struct hlist_node *node;
1357 1359
1358 hlist_for_each_entry_rcu(li, node, hhead, hlist) { 1360 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1359 int err; 1361 struct fib_alias *fa;
1360 int plen = li->plen; 1362 int plen = li->plen;
1361 __be32 mask = inet_make_mask(plen); 1363 __be32 mask = inet_make_mask(plen);
1362 1364
1363 if (l->key != (key & ntohl(mask))) 1365 if (l->key != (key & ntohl(mask)))
1364 continue; 1366 continue;
1365 1367
1366 err = fib_semantic_match(&li->falh, flp, res, plen); 1368 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1369 struct fib_info *fi = fa->fa_info;
1370 int nhsel, err;
1367 1371
1372 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1373 continue;
1374 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1375 continue;
1376 fib_alias_accessed(fa);
1377 err = fib_props[fa->fa_type].error;
1378 if (err) {
1368#ifdef CONFIG_IP_FIB_TRIE_STATS 1379#ifdef CONFIG_IP_FIB_TRIE_STATS
1369 if (err <= 0) 1380 t->stats.semantic_match_passed++;
1370 t->stats.semantic_match_passed++; 1381#endif
1371 else 1382 return err;
1372 t->stats.semantic_match_miss++; 1383 }
1384 if (fi->fib_flags & RTNH_F_DEAD)
1385 continue;
1386 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1387 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1388
1389 if (nh->nh_flags & RTNH_F_DEAD)
1390 continue;
1391 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1392 continue;
1393
1394#ifdef CONFIG_IP_FIB_TRIE_STATS
1395 t->stats.semantic_match_passed++;
1396#endif
1397 res->prefixlen = plen;
1398 res->nh_sel = nhsel;
1399 res->type = fa->fa_type;
1400 res->scope = fa->fa_info->fib_scope;
1401 res->fi = fi;
1402 res->table = tb;
1403 res->fa_head = &li->falh;
1404 if (!(fib_flags & FIB_LOOKUP_NOREF))
1405 atomic_inc(&res->fi->fib_clntref);
1406 return 0;
1407 }
1408 }
1409
1410#ifdef CONFIG_IP_FIB_TRIE_STATS
1411 t->stats.semantic_match_miss++;
1373#endif 1412#endif
1374 if (err <= 0)
1375 return err;
1376 } 1413 }
1377 1414
1378 return 1; 1415 return 1;
1379} 1416}
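check_leaf() now performs the semantic match inline: aliases under the matching prefix are filtered by TOS and scope, route types with a fixed error (blackhole, prohibit, and so on) return it immediately, and otherwise the first alive nexthop compatible with the requested output interface wins. A compressed sketch over simplified stand-in structures (return values are conflated into one int here, unlike the kernel):

```c
/* Compressed sketch of the alias/nexthop matching above. Types and names
 * are simplified stand-ins, not the kernel structures. */
struct toy_nh   { int dead; int oif; };
struct toy_info { int dead; int nnh; const struct toy_nh *nh; };
struct toy_alias {
    int tos;
    int scope;
    int type_error;              /* nonzero for blackhole/prohibit/... */
    const struct toy_info *info;
};

/* returns nexthop index, the alias' error code, or -1 for "no match" */
static int semantic_match(const struct toy_alias *fa, int nfa,
                          int want_tos, int want_scope, int want_oif)
{
    for (int i = 0; i < nfa; i++) {
        if (fa[i].tos && fa[i].tos != want_tos)
            continue;
        if (fa[i].scope < want_scope)
            continue;
        if (fa[i].type_error)
            return fa[i].type_error;
        if (fa[i].info->dead)
            continue;
        for (int n = 0; n < fa[i].info->nnh; n++) {
            const struct toy_nh *nh = &fa[i].info->nh[n];

            if (nh->dead)
                continue;
            if (want_oif && want_oif != nh->oif)
                continue;
            return n;            /* the kernel fills in fib_result here */
        }
    }
    return -1;
}

int main(void)
{
    static const struct toy_nh nhs[] = { { .dead = 1, .oif = 2 },
                                         { .dead = 0, .oif = 3 } };
    static const struct toy_info info = { .dead = 0, .nnh = 2, .nh = nhs };
    static const struct toy_alias alias = { .tos = 0, .scope = 0,
                                            .type_error = 0, .info = &info };

    /* wants oif 3: the first alive, matching nexthop is index 1 */
    return semantic_match(&alias, 1, 0, 0, 3) == 1 ? 0 : 1;
}
```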
1380 1417
1381int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1418int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1382 struct fib_result *res) 1419 struct fib_result *res, int fib_flags)
1383{ 1420{
1384 struct trie *t = (struct trie *) tb->tb_data; 1421 struct trie *t = (struct trie *) tb->tb_data;
1385 int ret; 1422 int ret;
1386 struct node *n; 1423 struct rt_trie_node *n;
1387 struct tnode *pn; 1424 struct tnode *pn;
1388 int pos, bits; 1425 unsigned int pos, bits;
1389 t_key key = ntohl(flp->fl4_dst); 1426 t_key key = ntohl(flp->daddr);
1390 int chopped_off; 1427 unsigned int chopped_off;
1391 t_key cindex = 0; 1428 t_key cindex = 0;
1392 int current_prefix_length = KEYLENGTH; 1429 unsigned int current_prefix_length = KEYLENGTH;
1393 struct tnode *cn; 1430 struct tnode *cn;
1394 t_key node_prefix, key_prefix, pref_mismatch; 1431 t_key pref_mismatch;
1395 int mp;
1396 1432
1397 rcu_read_lock(); 1433 rcu_read_lock();
1398 1434
@@ -1406,7 +1442,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1406 1442
1407 /* Just a leaf? */ 1443 /* Just a leaf? */
1408 if (IS_LEAF(n)) { 1444 if (IS_LEAF(n)) {
1409 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1445 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1410 goto found; 1446 goto found;
1411 } 1447 }
1412 1448
@@ -1431,7 +1467,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1431 } 1467 }
1432 1468
1433 if (IS_LEAF(n)) { 1469 if (IS_LEAF(n)) {
1434 ret = check_leaf(t, (struct leaf *)n, key, flp, res); 1470 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1435 if (ret > 0) 1471 if (ret > 0)
1436 goto backtrace; 1472 goto backtrace;
1437 goto found; 1473 goto found;
@@ -1507,10 +1543,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1507 * matching prefix. 1543 * matching prefix.
1508 */ 1544 */
1509 1545
1510 node_prefix = mask_pfx(cn->key, cn->pos); 1546 pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1511 key_prefix = mask_pfx(key, cn->pos);
1512 pref_mismatch = key_prefix^node_prefix;
1513 mp = 0;
1514 1547
1515 /* 1548 /*
1516 * In short: If skipped bits in this node do not match 1549 * In short: If skipped bits in this node do not match
@@ -1518,13 +1551,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1518 * state.directly. 1551 * state.directly.
1519 */ 1552 */
1520 if (pref_mismatch) { 1553 if (pref_mismatch) {
1521 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1554 int mp = KEYLENGTH - fls(pref_mismatch);
1522 mp++;
1523 pref_mismatch = pref_mismatch << 1;
1524 }
1525 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1526 1555
1527 if (key_prefix != 0) 1556 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1528 goto backtrace; 1557 goto backtrace;
1529 1558
1530 if (current_prefix_length >= cn->pos) 1559 if (current_prefix_length >= cn->pos)
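The loop that walked pref_mismatch one bit at a time is replaced by KEYLENGTH - fls(pref_mismatch): fls() returns the 1-based index of the highest set bit, so the difference is exactly the number of leading zero bits, i.e. the position of the first mismatching bit counted from the MSB. A quick equivalence check (fls() emulated with __builtin_clz(), GCC/Clang only; the mask is never zero on this path, and the old loop would not terminate for zero either):

```c
/* Check that KEYLENGTH - fls(x) matches the shift loop it replaces. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KEYLENGTH 32

static int fls32(uint32_t x)
{
    return x ? 32 - __builtin_clz(x) : 0;
}

static int old_way(uint32_t pref_mismatch)
{
    int mp = 0;

    while (!(pref_mismatch & (1u << (KEYLENGTH - 1)))) {
        mp++;
        pref_mismatch <<= 1;
    }
    return mp;
}

int main(void)
{
    uint32_t samples[] = { 0x80000000u, 0x00010000u, 0x00000001u, 0x0f0f0f0fu };

    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        int new_mp = KEYLENGTH - fls32(samples[i]);

        assert(new_mp == old_way(samples[i]));
        printf("%#010x -> mp=%d\n", samples[i], new_mp);
    }
    return 0;
}
```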
@@ -1556,7 +1585,7 @@ backtrace:
1556 if (chopped_off <= pn->bits) { 1585 if (chopped_off <= pn->bits) {
1557 cindex &= ~(1 << (chopped_off-1)); 1586 cindex &= ~(1 << (chopped_off-1));
1558 } else { 1587 } else {
1559 struct tnode *parent = node_parent_rcu((struct node *) pn); 1588 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1560 if (!parent) 1589 if (!parent)
1561 goto failed; 1590 goto failed;
1562 1591
@@ -1583,7 +1612,7 @@ found:
1583 */ 1612 */
1584static void trie_leaf_remove(struct trie *t, struct leaf *l) 1613static void trie_leaf_remove(struct trie *t, struct leaf *l)
1585{ 1614{
1586 struct tnode *tp = node_parent((struct node *) l); 1615 struct tnode *tp = node_parent((struct rt_trie_node *) l);
1587 1616
1588 pr_debug("entering trie_leaf_remove(%p)\n", l); 1617 pr_debug("entering trie_leaf_remove(%p)\n", l);
1589 1618
@@ -1644,7 +1673,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1644 1673
1645 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && 1674 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1646 (cfg->fc_scope == RT_SCOPE_NOWHERE || 1675 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1647 fa->fa_scope == cfg->fc_scope) && 1676 fa->fa_info->fib_scope == cfg->fc_scope) &&
1677 (!cfg->fc_prefsrc ||
1678 fi->fib_prefsrc == cfg->fc_prefsrc) &&
1648 (!cfg->fc_protocol || 1679 (!cfg->fc_protocol ||
1649 fi->fib_protocol == cfg->fc_protocol) && 1680 fi->fib_protocol == cfg->fc_protocol) &&
1650 fib_nh_match(cfg, fi) == 0) { 1681 fib_nh_match(cfg, fi) == 0) {
@@ -1665,6 +1696,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1665 1696
1666 list_del_rcu(&fa->fa_list); 1697 list_del_rcu(&fa->fa_list);
1667 1698
1699 if (!plen)
1700 tb->tb_num_default--;
1701
1668 if (list_empty(fa_head)) { 1702 if (list_empty(fa_head)) {
1669 hlist_del_rcu(&li->hlist); 1703 hlist_del_rcu(&li->hlist);
1670 free_leaf_info(li); 1704 free_leaf_info(li);
@@ -1721,7 +1755,7 @@ static int trie_flush_leaf(struct leaf *l)
1721 * Scan for the next right leaf starting at node p->child[idx] 1755 * Scan for the next right leaf starting at node p->child[idx]
1722 * Since we have back pointer, no recursion necessary. 1756 * Since we have back pointer, no recursion necessary.
1723 */ 1757 */
1724static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) 1758static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1725{ 1759{
1726 do { 1760 do {
1727 t_key idx; 1761 t_key idx;
@@ -1737,7 +1771,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1737 continue; 1771 continue;
1738 1772
1739 if (IS_LEAF(c)) { 1773 if (IS_LEAF(c)) {
1740 prefetch(p->child[idx]); 1774 prefetch(rcu_dereference_rtnl(p->child[idx]));
1741 return (struct leaf *) c; 1775 return (struct leaf *) c;
1742 } 1776 }
1743 1777
@@ -1747,17 +1781,15 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1747 } 1781 }
1748 1782
1749 /* Node empty, walk back up to parent */ 1783 /* Node empty, walk back up to parent */
1750 c = (struct node *) p; 1784 c = (struct rt_trie_node *) p;
1751 } while ( (p = node_parent_rcu(c)) != NULL); 1785 } while ((p = node_parent_rcu(c)) != NULL);
1752 1786
1753 return NULL; /* Root of trie */ 1787 return NULL; /* Root of trie */
1754} 1788}
1755 1789
1756static struct leaf *trie_firstleaf(struct trie *t) 1790static struct leaf *trie_firstleaf(struct trie *t)
1757{ 1791{
1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, 1792 struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
1761 1793
1762 if (!n) 1794 if (!n)
1763 return NULL; 1795 return NULL;
@@ -1770,7 +1802,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
1770 1802
1771static struct leaf *trie_nextleaf(struct leaf *l) 1803static struct leaf *trie_nextleaf(struct leaf *l)
1772{ 1804{
1773 struct node *c = (struct node *) l; 1805 struct rt_trie_node *c = (struct rt_trie_node *) l;
1774 struct tnode *p = node_parent_rcu(c); 1806 struct tnode *p = node_parent_rcu(c);
1775 1807
1776 if (!p) 1808 if (!p)
@@ -1814,77 +1846,9 @@ int fib_table_flush(struct fib_table *tb)
1814 return found; 1846 return found;
1815} 1847}
1816 1848
1817void fib_table_select_default(struct fib_table *tb, 1849void fib_free_table(struct fib_table *tb)
1818 const struct flowi *flp,
1819 struct fib_result *res)
1820{ 1850{
1821 struct trie *t = (struct trie *) tb->tb_data; 1851 kfree(tb);
1822 int order, last_idx;
1823 struct fib_info *fi = NULL;
1824 struct fib_info *last_resort;
1825 struct fib_alias *fa = NULL;
1826 struct list_head *fa_head;
1827 struct leaf *l;
1828
1829 last_idx = -1;
1830 last_resort = NULL;
1831 order = -1;
1832
1833 rcu_read_lock();
1834
1835 l = fib_find_node(t, 0);
1836 if (!l)
1837 goto out;
1838
1839 fa_head = get_fa_head(l, 0);
1840 if (!fa_head)
1841 goto out;
1842
1843 if (list_empty(fa_head))
1844 goto out;
1845
1846 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1847 struct fib_info *next_fi = fa->fa_info;
1848
1849 if (fa->fa_scope != res->scope ||
1850 fa->fa_type != RTN_UNICAST)
1851 continue;
1852
1853 if (next_fi->fib_priority > res->fi->fib_priority)
1854 break;
1855 if (!next_fi->fib_nh[0].nh_gw ||
1856 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1857 continue;
1858 fa->fa_state |= FA_S_ACCESSED;
1859
1860 if (fi == NULL) {
1861 if (next_fi != res->fi)
1862 break;
1863 } else if (!fib_detect_death(fi, order, &last_resort,
1864 &last_idx, tb->tb_default)) {
1865 fib_result_assign(res, fi);
1866 tb->tb_default = order;
1867 goto out;
1868 }
1869 fi = next_fi;
1870 order++;
1871 }
1872 if (order <= 0 || fi == NULL) {
1873 tb->tb_default = -1;
1874 goto out;
1875 }
1876
1877 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1878 tb->tb_default)) {
1879 fib_result_assign(res, fi);
1880 tb->tb_default = order;
1881 goto out;
1882 }
1883 if (last_idx >= 0)
1884 fib_result_assign(res, last_resort);
1885 tb->tb_default = last_idx;
1886out:
1887 rcu_read_unlock();
1888} 1852}
1889 1853
1890static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, 1854static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
@@ -1911,7 +1875,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1911 RTM_NEWROUTE, 1875 RTM_NEWROUTE,
1912 tb->tb_id, 1876 tb->tb_id,
1913 fa->fa_type, 1877 fa->fa_type,
1914 fa->fa_scope,
1915 xkey, 1878 xkey,
1916 plen, 1879 plen,
1917 fa->fa_tos, 1880 fa->fa_tos,
@@ -2001,7 +1964,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
2001 return skb->len; 1964 return skb->len;
2002} 1965}
2003 1966
2004void __init fib_hash_init(void) 1967void __init fib_trie_init(void)
2005{ 1968{
2006 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1969 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2007 sizeof(struct fib_alias), 1970 sizeof(struct fib_alias),
@@ -2014,8 +1977,7 @@ void __init fib_hash_init(void)
2014} 1977}
2015 1978
2016 1979
2017/* Fix more generic FIB names for init later */ 1980struct fib_table *fib_trie_table(u32 id)
2018struct fib_table *fib_hash_table(u32 id)
2019{ 1981{
2020 struct fib_table *tb; 1982 struct fib_table *tb;
2021 struct trie *t; 1983 struct trie *t;
@@ -2027,13 +1989,11 @@ struct fib_table *fib_hash_table(u32 id)
2027 1989
2028 tb->tb_id = id; 1990 tb->tb_id = id;
2029 tb->tb_default = -1; 1991 tb->tb_default = -1;
1992 tb->tb_num_default = 0;
2030 1993
2031 t = (struct trie *) tb->tb_data; 1994 t = (struct trie *) tb->tb_data;
2032 memset(t, 0, sizeof(*t)); 1995 memset(t, 0, sizeof(*t));
2033 1996
2034 if (id == RT_TABLE_LOCAL)
2035 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2036
2037 return tb; 1997 return tb;
2038} 1998}
2039 1999
@@ -2043,14 +2003,14 @@ struct fib_trie_iter {
2043 struct seq_net_private p; 2003 struct seq_net_private p;
2044 struct fib_table *tb; 2004 struct fib_table *tb;
2045 struct tnode *tnode; 2005 struct tnode *tnode;
2046 unsigned index; 2006 unsigned int index;
2047 unsigned depth; 2007 unsigned int depth;
2048}; 2008};
2049 2009
2050static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 2010static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2051{ 2011{
2052 struct tnode *tn = iter->tnode; 2012 struct tnode *tn = iter->tnode;
2053 unsigned cindex = iter->index; 2013 unsigned int cindex = iter->index;
2054 struct tnode *p; 2014 struct tnode *p;
2055 2015
2056 /* A single entry routing table */ 2016 /* A single entry routing table */
@@ -2061,7 +2021,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2061 iter->tnode, iter->index, iter->depth); 2021 iter->tnode, iter->index, iter->depth);
2062rescan: 2022rescan:
2063 while (cindex < (1<<tn->bits)) { 2023 while (cindex < (1<<tn->bits)) {
2064 struct node *n = tnode_get_child_rcu(tn, cindex); 2024 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2065 2025
2066 if (n) { 2026 if (n) {
2067 if (IS_LEAF(n)) { 2027 if (IS_LEAF(n)) {
@@ -2080,7 +2040,7 @@ rescan:
2080 } 2040 }
2081 2041
2082 /* Current node exhausted, pop back up */ 2042 /* Current node exhausted, pop back up */
2083 p = node_parent_rcu((struct node *)tn); 2043 p = node_parent_rcu((struct rt_trie_node *)tn);
2084 if (p) { 2044 if (p) {
2085 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 2045 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2086 tn = p; 2046 tn = p;
@@ -2092,10 +2052,10 @@ rescan:
2092 return NULL; 2052 return NULL;
2093} 2053}
2094 2054
2095static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2055static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2096 struct trie *t) 2056 struct trie *t)
2097{ 2057{
2098 struct node *n; 2058 struct rt_trie_node *n;
2099 2059
2100 if (!t) 2060 if (!t)
2101 return NULL; 2061 return NULL;
@@ -2119,7 +2079,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2119 2079
2120static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2080static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2121{ 2081{
2122 struct node *n; 2082 struct rt_trie_node *n;
2123 struct fib_trie_iter iter; 2083 struct fib_trie_iter iter;
2124 2084
2125 memset(s, 0, sizeof(*s)); 2085 memset(s, 0, sizeof(*s));
@@ -2159,7 +2119,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2159 */ 2119 */
2160static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2120static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2161{ 2121{
2162 unsigned i, max, pointers, bytes, avdepth; 2122 unsigned int i, max, pointers, bytes, avdepth;
2163 2123
2164 if (stat->leaves) 2124 if (stat->leaves)
2165 avdepth = stat->totdepth*100 / stat->leaves; 2125 avdepth = stat->totdepth*100 / stat->leaves;
@@ -2192,7 +2152,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2192 seq_putc(seq, '\n'); 2152 seq_putc(seq, '\n');
2193 seq_printf(seq, "\tPointers: %u\n", pointers); 2153 seq_printf(seq, "\tPointers: %u\n", pointers);
2194 2154
2195 bytes += sizeof(struct node *) * pointers; 2155 bytes += sizeof(struct rt_trie_node *) * pointers;
2196 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2156 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2197 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2157 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2198} 2158}
@@ -2273,7 +2233,7 @@ static const struct file_operations fib_triestat_fops = {
2273 .release = single_release_net, 2233 .release = single_release_net,
2274}; 2234};
2275 2235
2276static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2236static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2277{ 2237{
2278 struct fib_trie_iter *iter = seq->private; 2238 struct fib_trie_iter *iter = seq->private;
2279 struct net *net = seq_file_net(seq); 2239 struct net *net = seq_file_net(seq);
@@ -2286,7 +2246,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2286 struct fib_table *tb; 2246 struct fib_table *tb;
2287 2247
2288 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { 2248 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2289 struct node *n; 2249 struct rt_trie_node *n;
2290 2250
2291 for (n = fib_trie_get_first(iter, 2251 for (n = fib_trie_get_first(iter,
2292 (struct trie *) tb->tb_data); 2252 (struct trie *) tb->tb_data);
@@ -2315,7 +2275,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2315 struct fib_table *tb = iter->tb; 2275 struct fib_table *tb = iter->tb;
2316 struct hlist_node *tb_node; 2276 struct hlist_node *tb_node;
2317 unsigned int h; 2277 unsigned int h;
2318 struct node *n; 2278 struct rt_trie_node *n;
2319 2279
2320 ++*pos; 2280 ++*pos;
2321 /* next node in same table */ 2281 /* next node in same table */
@@ -2325,7 +2285,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2325 2285
2326 /* walk rest of this hash chain */ 2286 /* walk rest of this hash chain */
2327 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); 2287 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2328 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { 2288 while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2329 tb = hlist_entry(tb_node, struct fib_table, tb_hlist); 2289 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2330 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); 2290 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2331 if (n) 2291 if (n)
@@ -2356,7 +2316,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2356 2316
2357static void seq_indent(struct seq_file *seq, int n) 2317static void seq_indent(struct seq_file *seq, int n)
2358{ 2318{
2359 while (n-- > 0) seq_puts(seq, " "); 2319 while (n-- > 0)
2320 seq_puts(seq, " ");
2360} 2321}
2361 2322
2362static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) 2323static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2388,7 +2349,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
2388 [RTN_XRESOLVE] = "XRESOLVE", 2349 [RTN_XRESOLVE] = "XRESOLVE",
2389}; 2350};
2390 2351
2391static inline const char *rtn_type(char *buf, size_t len, unsigned t) 2352static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2392{ 2353{
2393 if (t < __RTN_MAX && rtn_type_names[t]) 2354 if (t < __RTN_MAX && rtn_type_names[t])
2394 return rtn_type_names[t]; 2355 return rtn_type_names[t];
@@ -2400,7 +2361,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2400static int fib_trie_seq_show(struct seq_file *seq, void *v) 2361static int fib_trie_seq_show(struct seq_file *seq, void *v)
2401{ 2362{
2402 const struct fib_trie_iter *iter = seq->private; 2363 const struct fib_trie_iter *iter = seq->private;
2403 struct node *n = v; 2364 struct rt_trie_node *n = v;
2404 2365
2405 if (!node_parent_rcu(n)) 2366 if (!node_parent_rcu(n))
2406 fib_table_print(seq, iter->tb); 2367 fib_table_print(seq, iter->tb);
@@ -2432,7 +2393,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2432 seq_indent(seq, iter->depth+1); 2393 seq_indent(seq, iter->depth+1);
2433 seq_printf(seq, " /%d %s %s", li->plen, 2394 seq_printf(seq, " /%d %s %s", li->plen,
2434 rtn_scope(buf1, sizeof(buf1), 2395 rtn_scope(buf1, sizeof(buf1),
2435 fa->fa_scope), 2396 fa->fa_info->fib_scope),
2436 rtn_type(buf2, sizeof(buf2), 2397 rtn_type(buf2, sizeof(buf2),
2437 fa->fa_type)); 2398 fa->fa_type));
2438 if (fa->fa_tos) 2399 if (fa->fa_tos)
@@ -2544,13 +2505,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
2544 rcu_read_unlock(); 2505 rcu_read_unlock();
2545} 2506}
2546 2507
2547static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) 2508static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2548{ 2509{
2549 static unsigned type2flags[RTN_MAX + 1] = { 2510 unsigned int flags = 0;
2550 [7] = RTF_REJECT, [8] = RTF_REJECT,
2551 };
2552 unsigned flags = type2flags[type];
2553 2511
2512 if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2513 flags = RTF_REJECT;
2554 if (fi && fi->fib_nh->nh_gw) 2514 if (fi && fi->fib_nh->nh_gw)
2555 flags |= RTF_GATEWAY; 2515 flags |= RTF_GATEWAY;
2556 if (mask == htonl(0xFFFFFFFF)) 2516 if (mask == htonl(0xFFFFFFFF))
@@ -2562,7 +2522,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2562/* 2522/*
2563 * This outputs /proc/net/route. 2523 * This outputs /proc/net/route.
2564 * The format of the file is not supposed to be changed 2524 * The format of the file is not supposed to be changed
2565 * and needs to be same as fib_hash output to avoid breaking 2525 * and needs to be same as fib_hash output to avoid breaking
2566 * legacy utilities 2526 * legacy utilities
2567 */ 2527 */
2568static int fib_route_seq_show(struct seq_file *seq, void *v) 2528static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2587,7 +2547,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
2587 2547
2588 list_for_each_entry_rcu(fa, &li->falh, fa_list) { 2548 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2589 const struct fib_info *fi = fa->fa_info; 2549 const struct fib_info *fi = fa->fa_info;
2590 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); 2550 unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2591 int len; 2551 int len;
2592 2552
2593 if (fa->fa_type == RTN_BROADCAST 2553 if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..c6933f2ea310
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,152 @@
1/*
2 * GRE over IPv4 demultiplexer driver
3 *
4 * Authors: Dmitry Kozlov (xeb@mail.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/kernel.h>
15#include <linux/kmod.h>
16#include <linux/skbuff.h>
17#include <linux/in.h>
18#include <linux/netdevice.h>
19#include <linux/version.h>
20#include <linux/spinlock.h>
21#include <net/protocol.h>
22#include <net/gre.h>
23
24
25static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
26static DEFINE_SPINLOCK(gre_proto_lock);
27
28int gre_add_protocol(const struct gre_protocol *proto, u8 version)
29{
30 if (version >= GREPROTO_MAX)
31 goto err_out;
32
33 spin_lock(&gre_proto_lock);
34 if (gre_proto[version])
35 goto err_out_unlock;
36
37 rcu_assign_pointer(gre_proto[version], proto);
38 spin_unlock(&gre_proto_lock);
39 return 0;
40
41err_out_unlock:
42 spin_unlock(&gre_proto_lock);
43err_out:
44 return -1;
45}
46EXPORT_SYMBOL_GPL(gre_add_protocol);
47
48int gre_del_protocol(const struct gre_protocol *proto, u8 version)
49{
50 if (version >= GREPROTO_MAX)
51 goto err_out;
52
53 spin_lock(&gre_proto_lock);
54 if (rcu_dereference_protected(gre_proto[version],
55 lockdep_is_held(&gre_proto_lock)) != proto)
56 goto err_out_unlock;
57 rcu_assign_pointer(gre_proto[version], NULL);
58 spin_unlock(&gre_proto_lock);
59 synchronize_rcu();
60 return 0;
61
62err_out_unlock:
63 spin_unlock(&gre_proto_lock);
64err_out:
65 return -1;
66}
67EXPORT_SYMBOL_GPL(gre_del_protocol);
68
69static int gre_rcv(struct sk_buff *skb)
70{
71 const struct gre_protocol *proto;
72 u8 ver;
73 int ret;
74
75 if (!pskb_may_pull(skb, 12))
76 goto drop;
77
78 ver = skb->data[1]&0x7f;
79 if (ver >= GREPROTO_MAX)
80 goto drop;
81
82 rcu_read_lock();
83 proto = rcu_dereference(gre_proto[ver]);
84 if (!proto || !proto->handler)
85 goto drop_unlock;
86 ret = proto->handler(skb);
87 rcu_read_unlock();
88 return ret;
89
90drop_unlock:
91 rcu_read_unlock();
92drop:
93 kfree_skb(skb);
94 return NET_RX_DROP;
95}
96
97static void gre_err(struct sk_buff *skb, u32 info)
98{
99 const struct gre_protocol *proto;
100 u8 ver;
101
102 if (!pskb_may_pull(skb, 12))
103 goto drop;
104
105 ver = skb->data[1]&0x7f;
106 if (ver >= GREPROTO_MAX)
107 goto drop;
108
109 rcu_read_lock();
110 proto = rcu_dereference(gre_proto[ver]);
111 if (!proto || !proto->err_handler)
112 goto drop_unlock;
113 proto->err_handler(skb, info);
114 rcu_read_unlock();
115 return;
116
117drop_unlock:
118 rcu_read_unlock();
119drop:
120 kfree_skb(skb);
121}
122
123static const struct net_protocol net_gre_protocol = {
124 .handler = gre_rcv,
125 .err_handler = gre_err,
126 .netns_ok = 1,
127};
128
129static int __init gre_init(void)
130{
131 pr_info("GRE over IPv4 demultiplexor driver");
132
133 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
134 pr_err("gre: can't add protocol\n");
135 return -EAGAIN;
136 }
137
138 return 0;
139}
140
141static void __exit gre_exit(void)
142{
143 inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
144}
145
146module_init(gre_init);
147module_exit(gre_exit);
148
149MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
150MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
151MODULE_LICENSE("GPL");
152
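Editor's note on the new gre.c above: the file is a tiny array-based demultiplexer, keyed by the GRE version bits and protected by RCU for readers and a spinlock for writers. A hedged registration sketch, not part of this commit, shows how a GRE sub-protocol module (PPTP, for example) would plug into it; the GREPROTO_PPTP constant and the struct gre_protocol layout are assumed from include/net/gre.h, and mapping the -1 error return to -EBUSY is purely illustrative:

	#include <linux/module.h>
	#include <linux/errno.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>
	#include <net/gre.h>

	static int my_gre_handler(struct sk_buff *skb)
	{
		/* parse the version-1 GRE header here; the handler owns the skb */
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

	static const struct gre_protocol my_gre_proto = {
		.handler	= my_gre_handler,
		/* .err_handler is optional; gre_err() checks for NULL */
	};

	static int __init my_gre_init(void)
	{
		/* gre_add_protocol() returns -1 if the slot is taken or invalid */
		return gre_add_protocol(&my_gre_proto, GREPROTO_PPTP) ? -EBUSY : 0;
	}

	static void __exit my_gre_exit(void)
	{
		gre_del_protocol(&my_gre_proto, GREPROTO_PPTP);
	}

	module_init(my_gre_init);
	module_exit(my_gre_exit);
	MODULE_LICENSE("GPL");

With such a module loaded, gre_rcv() dispatches any GRE packet whose version field selects that slot to the handler under rcu_read_lock(), and the synchronize_rcu() inside gre_del_protocol() makes unloading safe against in-flight receivers.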
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a0d847c7cba5..5395e45dcce6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -83,6 +83,7 @@
83#include <net/tcp.h> 83#include <net/tcp.h>
84#include <net/udp.h> 84#include <net/udp.h>
85#include <net/raw.h> 85#include <net/raw.h>
86#include <net/ping.h>
86#include <linux/skbuff.h> 87#include <linux/skbuff.h>
87#include <net/sock.h> 88#include <net/sock.h>
88#include <linux/errno.h> 89#include <linux/errno.h>
@@ -108,8 +109,7 @@ struct icmp_bxm {
108 __be32 times[3]; 109 __be32 times[3];
109 } data; 110 } data;
110 int head_len; 111 int head_len;
111 struct ip_options replyopts; 112 struct ip_options_data replyopts;
112 unsigned char optbuf[40];
113}; 113};
114 114
115/* An array of errno for error messages from dest unreach. */ 115/* An array of errno for error messages from dest unreach. */
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
233 * Send an ICMP frame. 233 * Send an ICMP frame.
234 */ 234 */
235 235
236/* 236static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
237 * Check transmit rate limitation for given message. 237 struct flowi4 *fl4, int type, int code)
238 * The rate information is held in the destination cache now.
239 * This function is generic and could be used for other purposes
240 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
241 *
242 * Note that the same dst_entry fields are modified by functions in
243 * route.c too, but these work for packet destinations while xrlim_allow
244 * works for icmp destinations. This means the rate limiting information
245 * for one "ip object" is shared - and these ICMPs are twice limited:
246 * by source and by destination.
247 *
248 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
249 * SHOULD allow setting of rate limits
250 *
251 * Shared between ICMPv4 and ICMPv6.
252 */
253#define XRLIM_BURST_FACTOR 6
254int xrlim_allow(struct dst_entry *dst, int timeout)
255{
256 unsigned long now, token = dst->rate_tokens;
257 int rc = 0;
258
259 now = jiffies;
260 token += now - dst->rate_last;
261 dst->rate_last = now;
262 if (token > XRLIM_BURST_FACTOR * timeout)
263 token = XRLIM_BURST_FACTOR * timeout;
264 if (token >= timeout) {
265 token -= timeout;
266 rc = 1;
267 }
268 dst->rate_tokens = token;
269 return rc;
270}
271EXPORT_SYMBOL(xrlim_allow);
272
273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
274 int type, int code)
275{ 238{
276 struct dst_entry *dst = &rt->dst; 239 struct dst_entry *dst = &rt->dst;
277 int rc = 1; 240 bool rc = true;
278 241
279 if (type > NR_ICMP_TYPES) 242 if (type > NR_ICMP_TYPES)
280 goto out; 243 goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
288 goto out; 251 goto out;
289 252
290 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
291 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
292 rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); 255 if (!rt->peer)
256 rt_bind_peer(rt, fl4->daddr, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
258 net->ipv4.sysctl_icmp_ratelimit);
259 }
293out: 260out:
294 return rc; 261 return rc;
295} 262}
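Editor's note on the hunk above: the long comment and xrlim_allow() are removed, but the behaviour they describe, a token bucket per destination, is now supplied by inet_peer_xrlim_allow() on the bound inet_peer. A small userspace sketch of the same arithmetic (not in the patch; the timeout of 1000 jiffies stands in for sysctl_icmp_ratelimit, and 6 is the old XRLIM_BURST_FACTOR) makes the burst-then-steady pattern concrete:

	#include <stdio.h>

	#define XRLIM_BURST_FACTOR 6

	static unsigned long rate_tokens, rate_last;

	static int xrlim_allow_demo(unsigned long now, unsigned long timeout)
	{
		unsigned long token = rate_tokens + (now - rate_last);
		int rc = 0;

		rate_last = now;
		if (token > XRLIM_BURST_FACTOR * timeout)
			token = XRLIM_BURST_FACTOR * timeout;
		if (token >= timeout) {
			token -= timeout;
			rc = 1;
		}
		rate_tokens = token;
		return rc;
	}

	int main(void)
	{
		unsigned long now;

		rate_tokens = XRLIM_BURST_FACTOR * 1000;	/* start with a full bucket */
		for (now = 0; now < 8; now++)			/* 8 requests, one jiffy apart */
			printf("%d", xrlim_allow_demo(now, 1000));
		printf("\n");					/* prints 11111100 */
		return 0;
	}

Starting from a full bucket, six requests in quick succession are allowed and later ones are refused until a full timeout's worth of tokens has accumulated again, which is the SHOULD-level rate limit RFC 1812 asks for.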
@@ -324,13 +291,14 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
324} 291}
325 292
326static void icmp_push_reply(struct icmp_bxm *icmp_param, 293static void icmp_push_reply(struct icmp_bxm *icmp_param,
294 struct flowi4 *fl4,
327 struct ipcm_cookie *ipc, struct rtable **rt) 295 struct ipcm_cookie *ipc, struct rtable **rt)
328{ 296{
329 struct sock *sk; 297 struct sock *sk;
330 struct sk_buff *skb; 298 struct sk_buff *skb;
331 299
332 sk = icmp_sk(dev_net((*rt)->dst.dev)); 300 sk = icmp_sk(dev_net((*rt)->dst.dev));
333 if (ip_append_data(sk, icmp_glue_bits, icmp_param, 301 if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
334 icmp_param->data_len+icmp_param->head_len, 302 icmp_param->data_len+icmp_param->head_len,
335 icmp_param->head_len, 303 icmp_param->head_len,
336 ipc, rt, MSG_DONTWAIT) < 0) { 304 ipc, rt, MSG_DONTWAIT) < 0) {
@@ -349,7 +317,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
349 icmp_param->head_len, csum); 317 icmp_param->head_len, csum);
350 icmph->checksum = csum_fold(csum); 318 icmph->checksum = csum_fold(csum);
351 skb->ip_summed = CHECKSUM_NONE; 319 skb->ip_summed = CHECKSUM_NONE;
352 ip_push_pending_frames(sk); 320 ip_push_pending_frames(sk, fl4);
353 } 321 }
354} 322}
355 323
@@ -362,11 +330,12 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
362 struct ipcm_cookie ipc; 330 struct ipcm_cookie ipc;
363 struct rtable *rt = skb_rtable(skb); 331 struct rtable *rt = skb_rtable(skb);
364 struct net *net = dev_net(rt->dst.dev); 332 struct net *net = dev_net(rt->dst.dev);
333 struct flowi4 fl4;
365 struct sock *sk; 334 struct sock *sk;
366 struct inet_sock *inet; 335 struct inet_sock *inet;
367 __be32 daddr; 336 __be32 daddr;
368 337
369 if (ip_options_echo(&icmp_param->replyopts, skb)) 338 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
370 return; 339 return;
371 340
372 sk = icmp_xmit_lock(net); 341 sk = icmp_xmit_lock(net);
@@ -377,32 +346,120 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
377 icmp_param->data.icmph.checksum = 0; 346 icmp_param->data.icmph.checksum = 0;
378 347
379 inet->tos = ip_hdr(skb)->tos; 348 inet->tos = ip_hdr(skb)->tos;
380 daddr = ipc.addr = rt->rt_src; 349 daddr = ipc.addr = ip_hdr(skb)->saddr;
381 ipc.opt = NULL; 350 ipc.opt = NULL;
382 ipc.shtx.flags = 0; 351 ipc.tx_flags = 0;
383 if (icmp_param->replyopts.optlen) { 352 if (icmp_param->replyopts.opt.opt.optlen) {
384 ipc.opt = &icmp_param->replyopts; 353 ipc.opt = &icmp_param->replyopts.opt;
385 if (ipc.opt->srr) 354 if (ipc.opt->opt.srr)
386 daddr = icmp_param->replyopts.faddr; 355 daddr = icmp_param->replyopts.opt.opt.faddr;
387 } 356 }
388 { 357 memset(&fl4, 0, sizeof(fl4));
389 struct flowi fl = { .nl_u = { .ip4_u = 358 fl4.daddr = daddr;
390 { .daddr = daddr, 359 fl4.saddr = rt->rt_spec_dst;
391 .saddr = rt->rt_spec_dst, 360 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
392 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 361 fl4.flowi4_proto = IPPROTO_ICMP;
393 .proto = IPPROTO_ICMP }; 362 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
394 security_skb_classify_flow(skb, &fl); 363 rt = ip_route_output_key(net, &fl4);
395 if (ip_route_output_key(net, &rt, &fl)) 364 if (IS_ERR(rt))
396 goto out_unlock; 365 goto out_unlock;
397 } 366 if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
398 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
399 icmp_param->data.icmph.code)) 367 icmp_param->data.icmph.code))
400 icmp_push_reply(icmp_param, &ipc, &rt); 368 icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
401 ip_rt_put(rt); 369 ip_rt_put(rt);
402out_unlock: 370out_unlock:
403 icmp_xmit_unlock(sk); 371 icmp_xmit_unlock(sk);
404} 372}
405 373
374static struct rtable *icmp_route_lookup(struct net *net,
375 struct flowi4 *fl4,
376 struct sk_buff *skb_in,
377 const struct iphdr *iph,
378 __be32 saddr, u8 tos,
379 int type, int code,
380 struct icmp_bxm *param)
381{
382 struct rtable *rt, *rt2;
383 int err;
384
385 memset(fl4, 0, sizeof(*fl4));
386 fl4->daddr = (param->replyopts.opt.opt.srr ?
387 param->replyopts.opt.opt.faddr : iph->saddr);
388 fl4->saddr = saddr;
389 fl4->flowi4_tos = RT_TOS(tos);
390 fl4->flowi4_proto = IPPROTO_ICMP;
391 fl4->fl4_icmp_type = type;
392 fl4->fl4_icmp_code = code;
393 security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
394 rt = __ip_route_output_key(net, fl4);
395 if (IS_ERR(rt))
396 return rt;
397
398 /* No need to clone since we're just using its address. */
399 rt2 = rt;
400
401 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
402 flowi4_to_flowi(fl4), NULL, 0);
403 if (!IS_ERR(rt)) {
404 if (rt != rt2)
405 return rt;
406 } else if (PTR_ERR(rt) == -EPERM) {
407 rt = NULL;
408 } else
409 return rt;
410
411 err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(fl4), AF_INET);
412 if (err)
413 goto relookup_failed;
414
415 if (inet_addr_type(net, fl4->saddr) == RTN_LOCAL) {
416 rt2 = __ip_route_output_key(net, fl4);
417 if (IS_ERR(rt2))
418 err = PTR_ERR(rt2);
419 } else {
420 struct flowi4 fl4_2 = {};
421 unsigned long orefdst;
422
423 fl4_2.daddr = fl4->saddr;
424 rt2 = ip_route_output_key(net, &fl4_2);
425 if (IS_ERR(rt2)) {
426 err = PTR_ERR(rt2);
427 goto relookup_failed;
428 }
429 /* Ugh! */
430 orefdst = skb_in->_skb_refdst; /* save old refdst */
431 err = ip_route_input(skb_in, fl4->daddr, fl4->saddr,
432 RT_TOS(tos), rt2->dst.dev);
433
434 dst_release(&rt2->dst);
435 rt2 = skb_rtable(skb_in);
436 skb_in->_skb_refdst = orefdst; /* restore old refdst */
437 }
438
439 if (err)
440 goto relookup_failed;
441
442 rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
443 flowi4_to_flowi(fl4), NULL,
444 XFRM_LOOKUP_ICMP);
445 if (!IS_ERR(rt2)) {
446 dst_release(&rt->dst);
447 rt = rt2;
448 } else if (PTR_ERR(rt2) == -EPERM) {
449 if (rt)
450 dst_release(&rt->dst);
451 return rt2;
452 } else {
453 err = PTR_ERR(rt2);
454 goto relookup_failed;
455 }
456 return rt;
457
458relookup_failed:
459 if (rt)
460 return rt;
461 return ERR_PTR(err);
462}
406 463
407/* 464/*
408 * Send an ICMP message in response to a situation 465 * Send an ICMP message in response to a situation
@@ -422,6 +479,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
422 struct icmp_bxm icmp_param; 479 struct icmp_bxm icmp_param;
423 struct rtable *rt = skb_rtable(skb_in); 480 struct rtable *rt = skb_rtable(skb_in);
424 struct ipcm_cookie ipc; 481 struct ipcm_cookie ipc;
482 struct flowi4 fl4;
425 __be32 saddr; 483 __be32 saddr;
426 u8 tos; 484 u8 tos;
427 struct net *net; 485 struct net *net;
@@ -506,9 +564,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
506 struct net_device *dev = NULL; 564 struct net_device *dev = NULL;
507 565
508 rcu_read_lock(); 566 rcu_read_lock();
509 if (rt->fl.iif && 567 if (rt_is_input_route(rt) &&
510 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 568 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
511 dev = dev_get_by_index_rcu(net, rt->fl.iif); 569 dev = dev_get_by_index_rcu(net, rt->rt_iif);
512 570
513 if (dev) 571 if (dev)
514 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 572 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -521,7 +579,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
521 IPTOS_PREC_INTERNETCONTROL) : 579 IPTOS_PREC_INTERNETCONTROL) :
522 iph->tos; 580 iph->tos;
523 581
524 if (ip_options_echo(&icmp_param.replyopts, skb_in)) 582 if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in))
525 goto out_unlock; 583 goto out_unlock;
526 584
527 585
@@ -537,96 +595,15 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
537 icmp_param.offset = skb_network_offset(skb_in); 595 icmp_param.offset = skb_network_offset(skb_in);
538 inet_sk(sk)->tos = tos; 596 inet_sk(sk)->tos = tos;
539 ipc.addr = iph->saddr; 597 ipc.addr = iph->saddr;
540 ipc.opt = &icmp_param.replyopts; 598 ipc.opt = &icmp_param.replyopts.opt;
541 ipc.shtx.flags = 0; 599 ipc.tx_flags = 0;
542
543 {
544 struct flowi fl = {
545 .nl_u = {
546 .ip4_u = {
547 .daddr = icmp_param.replyopts.srr ?
548 icmp_param.replyopts.faddr :
549 iph->saddr,
550 .saddr = saddr,
551 .tos = RT_TOS(tos)
552 }
553 },
554 .proto = IPPROTO_ICMP,
555 .uli_u = {
556 .icmpt = {
557 .type = type,
558 .code = code
559 }
560 }
561 };
562 int err;
563 struct rtable *rt2;
564
565 security_skb_classify_flow(skb_in, &fl);
566 if (__ip_route_output_key(net, &rt, &fl))
567 goto out_unlock;
568
569 /* No need to clone since we're just using its address. */
570 rt2 = rt;
571
572 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
573 switch (err) {
574 case 0:
575 if (rt != rt2)
576 goto route_done;
577 break;
578 case -EPERM:
579 rt = NULL;
580 break;
581 default:
582 goto out_unlock;
583 }
584
585 if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
586 goto relookup_failed;
587
588 if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
589 err = __ip_route_output_key(net, &rt2, &fl);
590 else {
591 struct flowi fl2 = {};
592 unsigned long orefdst;
593 600
594 fl2.fl4_dst = fl.fl4_src; 601 rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos,
595 if (ip_route_output_key(net, &rt2, &fl2)) 602 type, code, &icmp_param);
596 goto relookup_failed; 603 if (IS_ERR(rt))
597 604 goto out_unlock;
598 /* Ugh! */
599 orefdst = skb_in->_skb_refdst; /* save old refdst */
600 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
601 RT_TOS(tos), rt2->dst.dev);
602
603 dst_release(&rt2->dst);
604 rt2 = skb_rtable(skb_in);
605 skb_in->_skb_refdst = orefdst; /* restore old refdst */
606 }
607
608 if (err)
609 goto relookup_failed;
610
611 err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
612 XFRM_LOOKUP_ICMP);
613 switch (err) {
614 case 0:
615 dst_release(&rt->dst);
616 rt = rt2;
617 break;
618 case -EPERM:
619 goto ende;
620 default:
621relookup_failed:
622 if (!rt)
623 goto out_unlock;
624 break;
625 }
626 }
627 605
628route_done: 606 if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
629 if (!icmpv4_xrlim_allow(net, rt, type, code))
630 goto ende; 607 goto ende;
631 608
632 /* RFC says return as much as we can without exceeding 576 bytes. */ 609 /* RFC says return as much as we can without exceeding 576 bytes. */
@@ -634,7 +611,7 @@ route_done:
634 room = dst_mtu(&rt->dst); 611 room = dst_mtu(&rt->dst);
635 if (room > 576) 612 if (room > 576)
636 room = 576; 613 room = 576;
637 room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; 614 room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
638 room -= sizeof(struct icmphdr); 615 room -= sizeof(struct icmphdr);
639 616
640 icmp_param.data_len = skb_in->len - icmp_param.offset; 617 icmp_param.data_len = skb_in->len - icmp_param.offset;
@@ -642,7 +619,7 @@ route_done:
642 icmp_param.data_len = room; 619 icmp_param.data_len = room;
643 icmp_param.head_len = sizeof(struct icmphdr); 620 icmp_param.head_len = sizeof(struct icmphdr);
644 621
645 icmp_push_reply(&icmp_param, &ipc, &rt); 622 icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
646ende: 623ende:
647 ip_rt_put(rt); 624 ip_rt_put(rt);
648out_unlock: 625out_unlock:
@@ -658,7 +635,7 @@ EXPORT_SYMBOL(icmp_send);
658 635
659static void icmp_unreach(struct sk_buff *skb) 636static void icmp_unreach(struct sk_buff *skb)
660{ 637{
661 struct iphdr *iph; 638 const struct iphdr *iph;
662 struct icmphdr *icmph; 639 struct icmphdr *icmph;
663 int hash, protocol; 640 int hash, protocol;
664 const struct net_protocol *ipprot; 641 const struct net_protocol *ipprot;
@@ -677,7 +654,7 @@ static void icmp_unreach(struct sk_buff *skb)
677 goto out_err; 654 goto out_err;
678 655
679 icmph = icmp_hdr(skb); 656 icmph = icmp_hdr(skb);
680 iph = (struct iphdr *)skb->data; 657 iph = (const struct iphdr *)skb->data;
681 658
682 if (iph->ihl < 5) /* Mangled header, drop. */ 659 if (iph->ihl < 5) /* Mangled header, drop. */
683 goto out_err; 660 goto out_err;
@@ -725,7 +702,7 @@ static void icmp_unreach(struct sk_buff *skb)
725 */ 702 */
726 703
727 /* 704 /*
728 * Check the other end isnt violating RFC 1122. Some routers send 705 * Check the other end isn't violating RFC 1122. Some routers send
729 * bogus responses to broadcast frames. If you see this message 706 * bogus responses to broadcast frames. If you see this message
730 * first check your netmask matches at both ends, if it does then 707 * first check your netmask matches at both ends, if it does then
731 * get the other vendor to fix their kit. 708 * get the other vendor to fix their kit.
@@ -750,7 +727,7 @@ static void icmp_unreach(struct sk_buff *skb)
750 if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) 727 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
751 goto out; 728 goto out;
752 729
753 iph = (struct iphdr *)skb->data; 730 iph = (const struct iphdr *)skb->data;
754 protocol = iph->protocol; 731 protocol = iph->protocol;
755 732
756 /* 733 /*
@@ -779,7 +756,7 @@ out_err:
779 756
780static void icmp_redirect(struct sk_buff *skb) 757static void icmp_redirect(struct sk_buff *skb)
781{ 758{
782 struct iphdr *iph; 759 const struct iphdr *iph;
783 760
784 if (skb->len < sizeof(struct iphdr)) 761 if (skb->len < sizeof(struct iphdr))
785 goto out_err; 762 goto out_err;
@@ -790,7 +767,7 @@ static void icmp_redirect(struct sk_buff *skb)
790 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 767 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
791 goto out; 768 goto out;
792 769
793 iph = (struct iphdr *)skb->data; 770 iph = (const struct iphdr *)skb->data;
794 771
795 switch (icmp_hdr(skb)->code & 7) { 772 switch (icmp_hdr(skb)->code & 7) {
796 case ICMP_REDIR_NET: 773 case ICMP_REDIR_NET:
@@ -805,6 +782,15 @@ static void icmp_redirect(struct sk_buff *skb)
805 iph->saddr, skb->dev); 782 iph->saddr, skb->dev);
806 break; 783 break;
807 } 784 }
785
786 /* Ping wants to see redirects.
787 * Let's pretend they are errors of sorts... */
788 if (iph->protocol == IPPROTO_ICMP &&
789 iph->ihl >= 5 &&
790 pskb_may_pull(skb, (iph->ihl<<2)+8)) {
791 ping_err(skb, icmp_hdr(skb)->un.gateway);
792 }
793
808out: 794out:
809 return; 795 return;
810out_err: 796out_err:
@@ -954,12 +940,12 @@ static void icmp_address_reply(struct sk_buff *skb)
954 BUG_ON(mp == NULL); 940 BUG_ON(mp == NULL);
955 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { 941 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
956 if (*mp == ifa->ifa_mask && 942 if (*mp == ifa->ifa_mask &&
957 inet_ifa_match(rt->rt_src, ifa)) 943 inet_ifa_match(ip_hdr(skb)->saddr, ifa))
958 break; 944 break;
959 } 945 }
960 if (!ifa && net_ratelimit()) { 946 if (!ifa && net_ratelimit()) {
961 printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n", 947 printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
962 mp, dev->name, &rt->rt_src); 948 mp, dev->name, &ip_hdr(skb)->saddr);
963 } 949 }
964 } 950 }
965} 951}
@@ -1065,7 +1051,7 @@ error:
1065 */ 1051 */
1066static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { 1052static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
1067 [ICMP_ECHOREPLY] = { 1053 [ICMP_ECHOREPLY] = {
1068 .handler = icmp_discard, 1054 .handler = ping_rcv,
1069 }, 1055 },
1070 [1] = { 1056 [1] = {
1071 .handler = icmp_discard, 1057 .handler = icmp_discard,
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 2a4bb76f2132..f1d27f6c9351 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -153,17 +153,27 @@ static void ip_ma_put(struct ip_mc_list *im)
153{ 153{
154 if (atomic_dec_and_test(&im->refcnt)) { 154 if (atomic_dec_and_test(&im->refcnt)) {
155 in_dev_put(im->interface); 155 in_dev_put(im->interface);
156 kfree(im); 156 kfree_rcu(im, rcu);
157 } 157 }
158} 158}
159 159
160#define for_each_pmc_rcu(in_dev, pmc) \
161 for (pmc = rcu_dereference(in_dev->mc_list); \
162 pmc != NULL; \
163 pmc = rcu_dereference(pmc->next_rcu))
164
165#define for_each_pmc_rtnl(in_dev, pmc) \
166 for (pmc = rtnl_dereference(in_dev->mc_list); \
167 pmc != NULL; \
168 pmc = rtnl_dereference(pmc->next_rcu))
169
160#ifdef CONFIG_IP_MULTICAST 170#ifdef CONFIG_IP_MULTICAST
161 171
162/* 172/*
163 * Timer management 173 * Timer management
164 */ 174 */
165 175
166static __inline__ void igmp_stop_timer(struct ip_mc_list *im) 176static void igmp_stop_timer(struct ip_mc_list *im)
167{ 177{
168 spin_lock_bh(&im->lock); 178 spin_lock_bh(&im->lock);
169 if (del_timer(&im->timer)) 179 if (del_timer(&im->timer))
@@ -284,6 +294,8 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
284 return scount; 294 return scount;
285} 295}
286 296
297#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
298
287static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) 299static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
288{ 300{
289 struct sk_buff *skb; 301 struct sk_buff *skb;
@@ -291,24 +303,24 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
291 struct iphdr *pip; 303 struct iphdr *pip;
292 struct igmpv3_report *pig; 304 struct igmpv3_report *pig;
293 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
306 struct flowi4 fl4;
294 307
295 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); 308 while (1) {
296 if (skb == NULL) 309 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
297 return NULL; 310 GFP_ATOMIC | __GFP_NOWARN);
298 311 if (skb)
299 { 312 break;
300 struct flowi fl = { .oif = dev->ifindex, 313 size >>= 1;
301 .nl_u = { .ip4_u = { 314 if (size < 256)
302 .daddr = IGMPV3_ALL_MCR } },
303 .proto = IPPROTO_IGMP };
304 if (ip_route_output_key(net, &rt, &fl)) {
305 kfree_skb(skb);
306 return NULL; 315 return NULL;
307 }
308 } 316 }
309 if (rt->rt_src == 0) { 317 igmp_skb_size(skb) = size;
318
319 rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
320 0, 0,
321 IPPROTO_IGMP, 0, dev->ifindex);
322 if (IS_ERR(rt)) {
310 kfree_skb(skb); 323 kfree_skb(skb);
311 ip_rt_put(rt);
312 return NULL; 324 return NULL;
313 } 325 }
314 326
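Editor's note on the hunk above: igmpv3_newpack() now retries the allocation with a halved size (down to a floor of 256 bytes) instead of failing outright, and records the size it actually obtained in skb->cb via igmp_skb_size(), so that AVAILABLE() later measures room against the granted size rather than dev->mtu. A userspace sketch of that shrink-on-failure pattern, with assumed sizes only:

	#include <stdio.h>
	#include <stdlib.h>

	/* Halve the request until it succeeds or drops below 'floor';
	 * report the size actually granted so the caller can budget writes. */
	static void *alloc_shrinking(size_t size, size_t floor, size_t *granted)
	{
		void *buf;

		while (1) {
			buf = malloc(size);
			if (buf)
				break;
			size >>= 1;
			if (size < floor)
				return NULL;
		}
		*granted = size;
		return buf;
	}

	int main(void)
	{
		size_t got;
		void *p = alloc_shrinking(64 * 1024, 256, &got);

		if (p) {
			printf("got %zu bytes\n", got);
			free(p);
		}
		return 0;
	}

The key point mirrored from the kernel change is that callers must not assume they received the size they asked for, which is exactly why the granted size is stashed in the skb control buffer for add_grec() to consult.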
@@ -326,8 +338,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
326 pip->tos = 0xc0; 338 pip->tos = 0xc0;
327 pip->frag_off = htons(IP_DF); 339 pip->frag_off = htons(IP_DF);
328 pip->ttl = 1; 340 pip->ttl = 1;
329 pip->daddr = rt->rt_dst; 341 pip->daddr = fl4.daddr;
330 pip->saddr = rt->rt_src; 342 pip->saddr = fl4.saddr;
331 pip->protocol = IPPROTO_IGMP; 343 pip->protocol = IPPROTO_IGMP;
332 pip->tot_len = 0; /* filled in later */ 344 pip->tot_len = 0; /* filled in later */
333 ip_select_ident(pip, &rt->dst, NULL); 345 ip_select_ident(pip, &rt->dst, NULL);
@@ -384,7 +396,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
384 return skb; 396 return skb;
385} 397}
386 398
387#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ 399#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
388 skb_tailroom(skb)) : 0) 400 skb_tailroom(skb)) : 0)
389 401
390static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, 402static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
@@ -502,8 +514,8 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
502 int type; 514 int type;
503 515
504 if (!pmc) { 516 if (!pmc) {
505 read_lock(&in_dev->mc_list_lock); 517 rcu_read_lock();
506 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 518 for_each_pmc_rcu(in_dev, pmc) {
507 if (pmc->multiaddr == IGMP_ALL_HOSTS) 519 if (pmc->multiaddr == IGMP_ALL_HOSTS)
508 continue; 520 continue;
509 spin_lock_bh(&pmc->lock); 521 spin_lock_bh(&pmc->lock);
@@ -514,7 +526,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
514 skb = add_grec(skb, pmc, type, 0, 0); 526 skb = add_grec(skb, pmc, type, 0, 0);
515 spin_unlock_bh(&pmc->lock); 527 spin_unlock_bh(&pmc->lock);
516 } 528 }
517 read_unlock(&in_dev->mc_list_lock); 529 rcu_read_unlock();
518 } else { 530 } else {
519 spin_lock_bh(&pmc->lock); 531 spin_lock_bh(&pmc->lock);
520 if (pmc->sfcount[MCAST_EXCLUDE]) 532 if (pmc->sfcount[MCAST_EXCLUDE])
@@ -556,7 +568,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
556 struct sk_buff *skb = NULL; 568 struct sk_buff *skb = NULL;
557 int type, dtype; 569 int type, dtype;
558 570
559 read_lock(&in_dev->mc_list_lock); 571 rcu_read_lock();
560 spin_lock_bh(&in_dev->mc_tomb_lock); 572 spin_lock_bh(&in_dev->mc_tomb_lock);
561 573
562 /* deleted MCA's */ 574 /* deleted MCA's */
@@ -593,7 +605,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
593 spin_unlock_bh(&in_dev->mc_tomb_lock); 605 spin_unlock_bh(&in_dev->mc_tomb_lock);
594 606
595 /* change recs */ 607 /* change recs */
596 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 608 for_each_pmc_rcu(in_dev, pmc) {
597 spin_lock_bh(&pmc->lock); 609 spin_lock_bh(&pmc->lock);
598 if (pmc->sfcount[MCAST_EXCLUDE]) { 610 if (pmc->sfcount[MCAST_EXCLUDE]) {
599 type = IGMPV3_BLOCK_OLD_SOURCES; 611 type = IGMPV3_BLOCK_OLD_SOURCES;
@@ -616,7 +628,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
616 } 628 }
617 spin_unlock_bh(&pmc->lock); 629 spin_unlock_bh(&pmc->lock);
618 } 630 }
619 read_unlock(&in_dev->mc_list_lock); 631 rcu_read_unlock();
620 632
621 if (!skb) 633 if (!skb)
622 return; 634 return;
@@ -633,6 +645,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
633 struct net_device *dev = in_dev->dev; 645 struct net_device *dev = in_dev->dev;
634 struct net *net = dev_net(dev); 646 struct net *net = dev_net(dev);
635 __be32 group = pmc ? pmc->multiaddr : 0; 647 __be32 group = pmc ? pmc->multiaddr : 0;
648 struct flowi4 fl4;
636 __be32 dst; 649 __be32 dst;
637 650
638 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) 651 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
@@ -642,17 +655,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
642 else 655 else
643 dst = group; 656 dst = group;
644 657
645 { 658 rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
646 struct flowi fl = { .oif = dev->ifindex, 659 0, 0,
647 .nl_u = { .ip4_u = { .daddr = dst } }, 660 IPPROTO_IGMP, 0, dev->ifindex);
648 .proto = IPPROTO_IGMP }; 661 if (IS_ERR(rt))
649 if (ip_route_output_key(net, &rt, &fl))
650 return -1;
651 }
652 if (rt->rt_src == 0) {
653 ip_rt_put(rt);
654 return -1; 662 return -1;
655 }
656 663
657 skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC); 664 skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
658 if (skb == NULL) { 665 if (skb == NULL) {
@@ -674,7 +681,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
674 iph->frag_off = htons(IP_DF); 681 iph->frag_off = htons(IP_DF);
675 iph->ttl = 1; 682 iph->ttl = 1;
676 iph->daddr = dst; 683 iph->daddr = dst;
677 iph->saddr = rt->rt_src; 684 iph->saddr = fl4.saddr;
678 iph->protocol = IPPROTO_IGMP; 685 iph->protocol = IPPROTO_IGMP;
679 ip_select_ident(iph, &rt->dst, NULL); 686 ip_select_ident(iph, &rt->dst, NULL);
680 ((u8*)&iph[1])[0] = IPOPT_RA; 687 ((u8*)&iph[1])[0] = IPOPT_RA;
@@ -813,14 +820,14 @@ static void igmp_heard_report(struct in_device *in_dev, __be32 group)
813 if (group == IGMP_ALL_HOSTS) 820 if (group == IGMP_ALL_HOSTS)
814 return; 821 return;
815 822
816 read_lock(&in_dev->mc_list_lock); 823 rcu_read_lock();
817 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 824 for_each_pmc_rcu(in_dev, im) {
818 if (im->multiaddr == group) { 825 if (im->multiaddr == group) {
819 igmp_stop_timer(im); 826 igmp_stop_timer(im);
820 break; 827 break;
821 } 828 }
822 } 829 }
823 read_unlock(&in_dev->mc_list_lock); 830 rcu_read_unlock();
824} 831}
825 832
826static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, 833static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
@@ -906,8 +913,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
906 * - Use the igmp->igmp_code field as the maximum 913 * - Use the igmp->igmp_code field as the maximum
907 * delay possible 914 * delay possible
908 */ 915 */
909 read_lock(&in_dev->mc_list_lock); 916 rcu_read_lock();
910 for (im=in_dev->mc_list; im!=NULL; im=im->next) { 917 for_each_pmc_rcu(in_dev, im) {
911 int changed; 918 int changed;
912 919
913 if (group && group != im->multiaddr) 920 if (group && group != im->multiaddr)
@@ -925,7 +932,7 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
925 if (changed) 932 if (changed)
926 igmp_mod_timer(im, max_delay); 933 igmp_mod_timer(im, max_delay);
927 } 934 }
928 read_unlock(&in_dev->mc_list_lock); 935 rcu_read_unlock();
929} 936}
930 937
931/* called in rcu_read_lock() section */ 938/* called in rcu_read_lock() section */
@@ -961,7 +968,7 @@ int igmp_rcv(struct sk_buff *skb)
961 case IGMP_HOST_MEMBERSHIP_REPORT: 968 case IGMP_HOST_MEMBERSHIP_REPORT:
962 case IGMPV2_HOST_MEMBERSHIP_REPORT: 969 case IGMPV2_HOST_MEMBERSHIP_REPORT:
963 /* Is it our report looped back? */ 970 /* Is it our report looped back? */
964 if (skb_rtable(skb)->fl.iif == 0) 971 if (rt_is_output_route(skb_rtable(skb)))
965 break; 972 break;
966 /* don't rely on MC router hearing unicast reports */ 973 /* don't rely on MC router hearing unicast reports */
967 if (skb->pkt_type == PACKET_MULTICAST || 974 if (skb->pkt_type == PACKET_MULTICAST ||
@@ -1110,8 +1117,8 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1110 kfree(pmc); 1117 kfree(pmc);
1111 } 1118 }
1112 /* clear dead sources, too */ 1119 /* clear dead sources, too */
1113 read_lock(&in_dev->mc_list_lock); 1120 rcu_read_lock();
1114 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1121 for_each_pmc_rcu(in_dev, pmc) {
1115 struct ip_sf_list *psf, *psf_next; 1122 struct ip_sf_list *psf, *psf_next;
1116 1123
1117 spin_lock_bh(&pmc->lock); 1124 spin_lock_bh(&pmc->lock);
@@ -1123,7 +1130,7 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
1123 kfree(psf); 1130 kfree(psf);
1124 } 1131 }
1125 } 1132 }
1126 read_unlock(&in_dev->mc_list_lock); 1133 rcu_read_unlock();
1127} 1134}
1128#endif 1135#endif
1129 1136
@@ -1148,20 +1155,18 @@ static void igmp_group_dropped(struct ip_mc_list *im)
1148 1155
1149 if (!in_dev->dead) { 1156 if (!in_dev->dead) {
1150 if (IGMP_V1_SEEN(in_dev)) 1157 if (IGMP_V1_SEEN(in_dev))
1151 goto done; 1158 return;
1152 if (IGMP_V2_SEEN(in_dev)) { 1159 if (IGMP_V2_SEEN(in_dev)) {
1153 if (reporter) 1160 if (reporter)
1154 igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); 1161 igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
1155 goto done; 1162 return;
1156 } 1163 }
1157 /* IGMPv3 */ 1164 /* IGMPv3 */
1158 igmpv3_add_delrec(in_dev, im); 1165 igmpv3_add_delrec(in_dev, im);
1159 1166
1160 igmp_ifc_event(in_dev); 1167 igmp_ifc_event(in_dev);
1161 } 1168 }
1162done:
1163#endif 1169#endif
1164 ip_mc_clear_src(im);
1165} 1170}
1166 1171
1167static void igmp_group_added(struct ip_mc_list *im) 1172static void igmp_group_added(struct ip_mc_list *im)
@@ -1209,7 +1214,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1209 1214
1210 ASSERT_RTNL(); 1215 ASSERT_RTNL();
1211 1216
1212 for (im=in_dev->mc_list; im; im=im->next) { 1217 for_each_pmc_rtnl(in_dev, im) {
1213 if (im->multiaddr == addr) { 1218 if (im->multiaddr == addr) {
1214 im->users++; 1219 im->users++;
1215 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); 1220 ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
@@ -1217,7 +1222,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1217 } 1222 }
1218 } 1223 }
1219 1224
1220 im = kmalloc(sizeof(*im), GFP_KERNEL); 1225 im = kzalloc(sizeof(*im), GFP_KERNEL);
1221 if (!im) 1226 if (!im)
1222 goto out; 1227 goto out;
1223 1228
@@ -1227,26 +1232,18 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
1227 im->multiaddr = addr; 1232 im->multiaddr = addr;
1228 /* initial mode is (EX, empty) */ 1233 /* initial mode is (EX, empty) */
1229 im->sfmode = MCAST_EXCLUDE; 1234 im->sfmode = MCAST_EXCLUDE;
1230 im->sfcount[MCAST_INCLUDE] = 0;
1231 im->sfcount[MCAST_EXCLUDE] = 1; 1235 im->sfcount[MCAST_EXCLUDE] = 1;
1232 im->sources = NULL;
1233 im->tomb = NULL;
1234 im->crcount = 0;
1235 atomic_set(&im->refcnt, 1); 1236 atomic_set(&im->refcnt, 1);
1236 spin_lock_init(&im->lock); 1237 spin_lock_init(&im->lock);
1237#ifdef CONFIG_IP_MULTICAST 1238#ifdef CONFIG_IP_MULTICAST
1238 im->tm_running = 0;
1239 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im); 1239 setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
1240 im->unsolicit_count = IGMP_Unsolicited_Report_Count; 1240 im->unsolicit_count = IGMP_Unsolicited_Report_Count;
1241 im->reporter = 0;
1242 im->gsquery = 0;
1243#endif 1241#endif
1244 im->loaded = 0; 1242
1245 write_lock_bh(&in_dev->mc_list_lock); 1243 im->next_rcu = in_dev->mc_list;
1246 im->next = in_dev->mc_list;
1247 in_dev->mc_list = im;
1248 in_dev->mc_count++; 1244 in_dev->mc_count++;
1249 write_unlock_bh(&in_dev->mc_list_lock); 1245 rcu_assign_pointer(in_dev->mc_list, im);
1246
1250#ifdef CONFIG_IP_MULTICAST 1247#ifdef CONFIG_IP_MULTICAST
1251 igmpv3_del_delrec(in_dev, im->multiaddr); 1248 igmpv3_del_delrec(in_dev, im->multiaddr);
1252#endif 1249#endif
@@ -1260,26 +1257,32 @@ EXPORT_SYMBOL(ip_mc_inc_group);
1260 1257
1261/* 1258/*
1262 * Resend IGMP JOIN report; used for bonding. 1259 * Resend IGMP JOIN report; used for bonding.
1260 * Called with rcu_read_lock()
1263 */ 1261 */
1264void ip_mc_rejoin_group(struct ip_mc_list *im) 1262void ip_mc_rejoin_groups(struct in_device *in_dev)
1265{ 1263{
1266#ifdef CONFIG_IP_MULTICAST 1264#ifdef CONFIG_IP_MULTICAST
1267 struct in_device *in_dev = im->interface; 1265 struct ip_mc_list *im;
1266 int type;
1268 1267
1269 if (im->multiaddr == IGMP_ALL_HOSTS) 1268 for_each_pmc_rcu(in_dev, im) {
1270 return; 1269 if (im->multiaddr == IGMP_ALL_HOSTS)
1270 continue;
1271 1271
1272 if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { 1272 /* a failover is happening and switches
1273 igmp_mod_timer(im, IGMP_Initial_Report_Delay); 1273 * must be notified immediately
1274 return; 1274 */
1275 if (IGMP_V1_SEEN(in_dev))
1276 type = IGMP_HOST_MEMBERSHIP_REPORT;
1277 else if (IGMP_V2_SEEN(in_dev))
1278 type = IGMPV2_HOST_MEMBERSHIP_REPORT;
1279 else
1280 type = IGMPV3_HOST_MEMBERSHIP_REPORT;
1281 igmp_send_report(in_dev, im, type);
1275 } 1282 }
1276 /* else, v3 */
1277 im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
1278 IGMP_Unsolicited_Report_Count;
1279 igmp_ifc_event(in_dev);
1280#endif 1283#endif
1281} 1284}
1282EXPORT_SYMBOL(ip_mc_rejoin_group); 1285EXPORT_SYMBOL(ip_mc_rejoin_groups);
1283 1286
1284/* 1287/*
1285 * A socket has left a multicast group on device dev 1288 * A socket has left a multicast group on device dev
@@ -1287,18 +1290,20 @@ EXPORT_SYMBOL(ip_mc_rejoin_group);
1287 1290
1288void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) 1291void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
1289{ 1292{
1290 struct ip_mc_list *i, **ip; 1293 struct ip_mc_list *i;
1294 struct ip_mc_list __rcu **ip;
1291 1295
1292 ASSERT_RTNL(); 1296 ASSERT_RTNL();
1293 1297
1294 for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { 1298 for (ip = &in_dev->mc_list;
1299 (i = rtnl_dereference(*ip)) != NULL;
1300 ip = &i->next_rcu) {
1295 if (i->multiaddr == addr) { 1301 if (i->multiaddr == addr) {
1296 if (--i->users == 0) { 1302 if (--i->users == 0) {
1297 write_lock_bh(&in_dev->mc_list_lock); 1303 *ip = i->next_rcu;
1298 *ip = i->next;
1299 in_dev->mc_count--; 1304 in_dev->mc_count--;
1300 write_unlock_bh(&in_dev->mc_list_lock);
1301 igmp_group_dropped(i); 1305 igmp_group_dropped(i);
1306 ip_mc_clear_src(i);
1302 1307
1303 if (!in_dev->dead) 1308 if (!in_dev->dead)
1304 ip_rt_multicast_event(in_dev); 1309 ip_rt_multicast_event(in_dev);
@@ -1316,34 +1321,34 @@ EXPORT_SYMBOL(ip_mc_dec_group);
1316 1321
1317void ip_mc_unmap(struct in_device *in_dev) 1322void ip_mc_unmap(struct in_device *in_dev)
1318{ 1323{
1319 struct ip_mc_list *i; 1324 struct ip_mc_list *pmc;
1320 1325
1321 ASSERT_RTNL(); 1326 ASSERT_RTNL();
1322 1327
1323 for (i = in_dev->mc_list; i; i = i->next) 1328 for_each_pmc_rtnl(in_dev, pmc)
1324 igmp_group_dropped(i); 1329 igmp_group_dropped(pmc);
1325} 1330}
1326 1331
1327void ip_mc_remap(struct in_device *in_dev) 1332void ip_mc_remap(struct in_device *in_dev)
1328{ 1333{
1329 struct ip_mc_list *i; 1334 struct ip_mc_list *pmc;
1330 1335
1331 ASSERT_RTNL(); 1336 ASSERT_RTNL();
1332 1337
1333 for (i = in_dev->mc_list; i; i = i->next) 1338 for_each_pmc_rtnl(in_dev, pmc)
1334 igmp_group_added(i); 1339 igmp_group_added(pmc);
1335} 1340}
1336 1341
1337/* Device going down */ 1342/* Device going down */
1338 1343
1339void ip_mc_down(struct in_device *in_dev) 1344void ip_mc_down(struct in_device *in_dev)
1340{ 1345{
1341 struct ip_mc_list *i; 1346 struct ip_mc_list *pmc;
1342 1347
1343 ASSERT_RTNL(); 1348 ASSERT_RTNL();
1344 1349
1345 for (i=in_dev->mc_list; i; i=i->next) 1350 for_each_pmc_rtnl(in_dev, pmc)
1346 igmp_group_dropped(i); 1351 igmp_group_dropped(pmc);
1347 1352
1348#ifdef CONFIG_IP_MULTICAST 1353#ifdef CONFIG_IP_MULTICAST
1349 in_dev->mr_ifc_count = 0; 1354 in_dev->mr_ifc_count = 0;
@@ -1374,7 +1379,6 @@ void ip_mc_init_dev(struct in_device *in_dev)
1374 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; 1379 in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
1375#endif 1380#endif
1376 1381
1377 rwlock_init(&in_dev->mc_list_lock);
1378 spin_lock_init(&in_dev->mc_tomb_lock); 1382 spin_lock_init(&in_dev->mc_tomb_lock);
1379} 1383}
1380 1384
@@ -1382,14 +1386,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
1382 1386
1383void ip_mc_up(struct in_device *in_dev) 1387void ip_mc_up(struct in_device *in_dev)
1384{ 1388{
1385 struct ip_mc_list *i; 1389 struct ip_mc_list *pmc;
1386 1390
1387 ASSERT_RTNL(); 1391 ASSERT_RTNL();
1388 1392
1389 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); 1393 ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
1390 1394
1391 for (i=in_dev->mc_list; i; i=i->next) 1395 for_each_pmc_rtnl(in_dev, pmc)
1392 igmp_group_added(i); 1396 igmp_group_added(pmc);
1393} 1397}
1394 1398
1395/* 1399/*
@@ -1405,43 +1409,40 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1405 /* Deactivate timers */ 1409 /* Deactivate timers */
1406 ip_mc_down(in_dev); 1410 ip_mc_down(in_dev);
1407 1411
1408 write_lock_bh(&in_dev->mc_list_lock); 1412 while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
1409 while ((i = in_dev->mc_list) != NULL) { 1413 in_dev->mc_list = i->next_rcu;
1410 in_dev->mc_list = i->next;
1411 in_dev->mc_count--; 1414 in_dev->mc_count--;
1412 write_unlock_bh(&in_dev->mc_list_lock);
1413 igmp_group_dropped(i);
1414 ip_ma_put(i);
1415 1415
1416 write_lock_bh(&in_dev->mc_list_lock); 1416 /* We've dropped the groups in ip_mc_down already */
1417 ip_mc_clear_src(i);
1418 ip_ma_put(i);
1417 } 1419 }
1418 write_unlock_bh(&in_dev->mc_list_lock);
1419} 1420}
1420 1421
1422/* RTNL is locked */
1421static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1423static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1422{ 1424{
1423 struct flowi fl = { .nl_u = { .ip4_u =
1424 { .daddr = imr->imr_multiaddr.s_addr } } };
1425 struct rtable *rt;
1426 struct net_device *dev = NULL; 1425 struct net_device *dev = NULL;
1427 struct in_device *idev = NULL; 1426 struct in_device *idev = NULL;
1428 1427
1429 if (imr->imr_ifindex) { 1428 if (imr->imr_ifindex) {
1430 idev = inetdev_by_index(net, imr->imr_ifindex); 1429 idev = inetdev_by_index(net, imr->imr_ifindex);
1431 if (idev)
1432 __in_dev_put(idev);
1433 return idev; 1430 return idev;
1434 } 1431 }
1435 if (imr->imr_address.s_addr) { 1432 if (imr->imr_address.s_addr) {
1436 dev = ip_dev_find(net, imr->imr_address.s_addr); 1433 dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
1437 if (!dev) 1434 if (!dev)
1438 return NULL; 1435 return NULL;
1439 dev_put(dev);
1440 } 1436 }
1441 1437
1442 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1438 if (!dev) {
1443 dev = rt->dst.dev; 1439 struct rtable *rt = ip_route_output(net,
1444 ip_rt_put(rt); 1440 imr->imr_multiaddr.s_addr,
1441 0, 0, 0);
1442 if (!IS_ERR(rt)) {
1443 dev = rt->dst.dev;
1444 ip_rt_put(rt);
1445 }
1445 } 1446 }
1446 if (dev) { 1447 if (dev) {
1447 imr->imr_ifindex = dev->ifindex; 1448 imr->imr_ifindex = dev->ifindex;
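Note: the rewritten ip_mc_find_dev() above shows the routing-API convention used throughout this series: ip_route_output() returns the rtable itself (or an ERR_PTR-encoded error) instead of filling a pointer argument, so callers test IS_ERR() rather than a return code. A minimal sketch of that calling convention; mcast_route_dev() is an illustrative name, not a function in the tree:

	/* Sketch only: resolve the egress device for a multicast group when the
	 * caller supplied neither an ifindex nor a local address. */
	static struct net_device *mcast_route_dev(struct net *net, __be32 group)
	{
		struct rtable *rt = ip_route_output(net, group, 0, 0, 0);
		struct net_device *dev = NULL;

		if (!IS_ERR(rt)) {
			dev = rt->dst.dev;	/* device the resolved route uses */
			ip_rt_put(rt);		/* drop the route reference */
		}
		return dev;
	}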
@@ -1515,18 +1516,18 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1515 1516
1516 if (!in_dev) 1517 if (!in_dev)
1517 return -ENODEV; 1518 return -ENODEV;
1518 read_lock(&in_dev->mc_list_lock); 1519 rcu_read_lock();
1519 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1520 for_each_pmc_rcu(in_dev, pmc) {
1520 if (*pmca == pmc->multiaddr) 1521 if (*pmca == pmc->multiaddr)
1521 break; 1522 break;
1522 } 1523 }
1523 if (!pmc) { 1524 if (!pmc) {
1524 /* MCA not found?? bug */ 1525 /* MCA not found?? bug */
1525 read_unlock(&in_dev->mc_list_lock); 1526 rcu_read_unlock();
1526 return -ESRCH; 1527 return -ESRCH;
1527 } 1528 }
1528 spin_lock_bh(&pmc->lock); 1529 spin_lock_bh(&pmc->lock);
1529 read_unlock(&in_dev->mc_list_lock); 1530 rcu_read_unlock();
1530#ifdef CONFIG_IP_MULTICAST 1531#ifdef CONFIG_IP_MULTICAST
1531 sf_markstate(pmc); 1532 sf_markstate(pmc);
1532#endif 1533#endif
@@ -1687,18 +1688,18 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1687 1688
1688 if (!in_dev) 1689 if (!in_dev)
1689 return -ENODEV; 1690 return -ENODEV;
1690 read_lock(&in_dev->mc_list_lock); 1691 rcu_read_lock();
1691 for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { 1692 for_each_pmc_rcu(in_dev, pmc) {
1692 if (*pmca == pmc->multiaddr) 1693 if (*pmca == pmc->multiaddr)
1693 break; 1694 break;
1694 } 1695 }
1695 if (!pmc) { 1696 if (!pmc) {
1696 /* MCA not found?? bug */ 1697 /* MCA not found?? bug */
1697 read_unlock(&in_dev->mc_list_lock); 1698 rcu_read_unlock();
1698 return -ESRCH; 1699 return -ESRCH;
1699 } 1700 }
1700 spin_lock_bh(&pmc->lock); 1701 spin_lock_bh(&pmc->lock);
1701 read_unlock(&in_dev->mc_list_lock); 1702 rcu_read_unlock();
1702 1703
1703#ifdef CONFIG_IP_MULTICAST 1704#ifdef CONFIG_IP_MULTICAST
1704 sf_markstate(pmc); 1705 sf_markstate(pmc);
@@ -1795,7 +1796,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1795 1796
1796 err = -EADDRINUSE; 1797 err = -EADDRINUSE;
1797 ifindex = imr->imr_ifindex; 1798 ifindex = imr->imr_ifindex;
1798 for (i = inet->mc_list; i; i = i->next) { 1799 for_each_pmc_rtnl(inet, i) {
1799 if (i->multi.imr_multiaddr.s_addr == addr && 1800 if (i->multi.imr_multiaddr.s_addr == addr &&
1800 i->multi.imr_ifindex == ifindex) 1801 i->multi.imr_ifindex == ifindex)
1801 goto done; 1802 goto done;
@@ -1809,7 +1810,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1809 goto done; 1810 goto done;
1810 1811
1811 memcpy(&iml->multi, imr, sizeof(*imr)); 1812 memcpy(&iml->multi, imr, sizeof(*imr));
1812 iml->next = inet->mc_list; 1813 iml->next_rcu = inet->mc_list;
1813 iml->sflist = NULL; 1814 iml->sflist = NULL;
1814 iml->sfmode = MCAST_EXCLUDE; 1815 iml->sfmode = MCAST_EXCLUDE;
1815 rcu_assign_pointer(inet->mc_list, iml); 1816 rcu_assign_pointer(inet->mc_list, iml);
@@ -1821,19 +1822,10 @@ done:
1821} 1822}
1822EXPORT_SYMBOL(ip_mc_join_group); 1823EXPORT_SYMBOL(ip_mc_join_group);
1823 1824
1824static void ip_sf_socklist_reclaim(struct rcu_head *rp)
1825{
1826 struct ip_sf_socklist *psf;
1827
1828 psf = container_of(rp, struct ip_sf_socklist, rcu);
1829 /* sk_omem_alloc should have been decreased by the caller*/
1830 kfree(psf);
1831}
1832
1833static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, 1825static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1834 struct in_device *in_dev) 1826 struct in_device *in_dev)
1835{ 1827{
1836 struct ip_sf_socklist *psf = iml->sflist; 1828 struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
1837 int err; 1829 int err;
1838 1830
1839 if (psf == NULL) { 1831 if (psf == NULL) {
@@ -1846,21 +1838,10 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1846 rcu_assign_pointer(iml->sflist, NULL); 1838 rcu_assign_pointer(iml->sflist, NULL);
1847 /* decrease mem now to avoid the memleak warning */ 1839 /* decrease mem now to avoid the memleak warning */
1848 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); 1840 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
1849 call_rcu(&psf->rcu, ip_sf_socklist_reclaim); 1841 kfree_rcu(psf, rcu);
1850 return err; 1842 return err;
1851} 1843}
1852 1844
1853
1854static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1855{
1856 struct ip_mc_socklist *iml;
1857
1858 iml = container_of(rp, struct ip_mc_socklist, rcu);
1859 /* sk_omem_alloc should have been decreased by the caller*/
1860 kfree(iml);
1861}
1862
1863
1864/* 1845/*
1865 * Ask a socket to leave a group. 1846 * Ask a socket to leave a group.
1866 */ 1847 */
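Note: the two reclaim callbacks deleted above existed only to kfree() their container after a grace period; kfree_rcu() does the same without a dedicated callback, provided the structure embeds a struct rcu_head. A small before/after sketch, with example_obj standing in for ip_sf_socklist / ip_mc_socklist:

	struct example_obj {		/* stand-in for ip_{sf,mc}_socklist */
		int payload;
		struct rcu_head rcu;	/* kfree_rcu() needs an embedded rcu_head */
	};

	static void drop_obj(struct example_obj *obj)
	{
		/* before: call_rcu(&obj->rcu, example_obj_reclaim), where the
		 * callback only called kfree(); after: */
		kfree_rcu(obj, rcu);	/* frees obj once a grace period elapses */
	}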
@@ -1868,7 +1849,8 @@ static void ip_mc_socklist_reclaim(struct rcu_head *rp)
1868int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) 1849int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1869{ 1850{
1870 struct inet_sock *inet = inet_sk(sk); 1851 struct inet_sock *inet = inet_sk(sk);
1871 struct ip_mc_socklist *iml, **imlp; 1852 struct ip_mc_socklist *iml;
1853 struct ip_mc_socklist __rcu **imlp;
1872 struct in_device *in_dev; 1854 struct in_device *in_dev;
1873 struct net *net = sock_net(sk); 1855 struct net *net = sock_net(sk);
1874 __be32 group = imr->imr_multiaddr.s_addr; 1856 __be32 group = imr->imr_multiaddr.s_addr;
@@ -1878,7 +1860,9 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1878 rtnl_lock(); 1860 rtnl_lock();
1879 in_dev = ip_mc_find_dev(net, imr); 1861 in_dev = ip_mc_find_dev(net, imr);
1880 ifindex = imr->imr_ifindex; 1862 ifindex = imr->imr_ifindex;
1881 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1863 for (imlp = &inet->mc_list;
1864 (iml = rtnl_dereference(*imlp)) != NULL;
1865 imlp = &iml->next_rcu) {
1882 if (iml->multi.imr_multiaddr.s_addr != group) 1866 if (iml->multi.imr_multiaddr.s_addr != group)
1883 continue; 1867 continue;
1884 if (ifindex) { 1868 if (ifindex) {
@@ -1890,14 +1874,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1890 1874
1891 (void) ip_mc_leave_src(sk, iml, in_dev); 1875 (void) ip_mc_leave_src(sk, iml, in_dev);
1892 1876
1893 rcu_assign_pointer(*imlp, iml->next); 1877 *imlp = iml->next_rcu;
1894 1878
1895 if (in_dev) 1879 if (in_dev)
1896 ip_mc_dec_group(in_dev, group); 1880 ip_mc_dec_group(in_dev, group);
1897 rtnl_unlock(); 1881 rtnl_unlock();
1898 /* decrease mem now to avoid the memleak warning */ 1882 /* decrease mem now to avoid the memleak warning */
1899 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 1883 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
1900 call_rcu(&iml->rcu, ip_mc_socklist_reclaim); 1884 kfree_rcu(iml, rcu);
1901 return 0; 1885 return 0;
1902 } 1886 }
1903 if (!in_dev) 1887 if (!in_dev)
@@ -1936,7 +1920,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1936 } 1920 }
1937 err = -EADDRNOTAVAIL; 1921 err = -EADDRNOTAVAIL;
1938 1922
1939 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1923 for_each_pmc_rtnl(inet, pmc) {
1940 if ((pmc->multi.imr_multiaddr.s_addr == 1924 if ((pmc->multi.imr_multiaddr.s_addr ==
1941 imr.imr_multiaddr.s_addr) && 1925 imr.imr_multiaddr.s_addr) &&
1942 (pmc->multi.imr_ifindex == imr.imr_ifindex)) 1926 (pmc->multi.imr_ifindex == imr.imr_ifindex))
@@ -1960,7 +1944,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1960 pmc->sfmode = omode; 1944 pmc->sfmode = omode;
1961 } 1945 }
1962 1946
1963 psl = pmc->sflist; 1947 psl = rtnl_dereference(pmc->sflist);
1964 if (!add) { 1948 if (!add) {
1965 if (!psl) 1949 if (!psl)
1966 goto done; /* err = -EADDRNOTAVAIL */ 1950 goto done; /* err = -EADDRNOTAVAIL */
@@ -2014,7 +1998,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
2014 newpsl->sl_addr[i] = psl->sl_addr[i]; 1998 newpsl->sl_addr[i] = psl->sl_addr[i];
2015 /* decrease mem now to avoid the memleak warning */ 1999 /* decrease mem now to avoid the memleak warning */
2016 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); 2000 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2017 call_rcu(&psl->rcu, ip_sf_socklist_reclaim); 2001 kfree_rcu(psl, rcu);
2018 } 2002 }
2019 rcu_assign_pointer(pmc->sflist, newpsl); 2003 rcu_assign_pointer(pmc->sflist, newpsl);
2020 psl = newpsl; 2004 psl = newpsl;
@@ -2079,7 +2063,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2079 goto done; 2063 goto done;
2080 } 2064 }
2081 2065
2082 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2066 for_each_pmc_rtnl(inet, pmc) {
2083 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2067 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2084 pmc->multi.imr_ifindex == imr.imr_ifindex) 2068 pmc->multi.imr_ifindex == imr.imr_ifindex)
2085 break; 2069 break;
@@ -2109,13 +2093,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
2109 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, 2093 (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
2110 msf->imsf_fmode, 0, NULL, 0); 2094 msf->imsf_fmode, 0, NULL, 0);
2111 } 2095 }
2112 psl = pmc->sflist; 2096 psl = rtnl_dereference(pmc->sflist);
2113 if (psl) { 2097 if (psl) {
2114 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2098 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2115 psl->sl_count, psl->sl_addr, 0); 2099 psl->sl_count, psl->sl_addr, 0);
2116 /* decrease mem now to avoid the memleak warning */ 2100 /* decrease mem now to avoid the memleak warning */
2117 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); 2101 atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
2118 call_rcu(&psl->rcu, ip_sf_socklist_reclaim); 2102 kfree_rcu(psl, rcu);
2119 } else 2103 } else
2120 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 2104 (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
2121 0, NULL, 0); 2105 0, NULL, 0);
@@ -2157,7 +2141,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2157 } 2141 }
2158 err = -EADDRNOTAVAIL; 2142 err = -EADDRNOTAVAIL;
2159 2143
2160 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2144 for_each_pmc_rtnl(inet, pmc) {
2161 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 2145 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
2162 pmc->multi.imr_ifindex == imr.imr_ifindex) 2146 pmc->multi.imr_ifindex == imr.imr_ifindex)
2163 break; 2147 break;
@@ -2165,7 +2149,7 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
2165 if (!pmc) /* must have a prior join */ 2149 if (!pmc) /* must have a prior join */
2166 goto done; 2150 goto done;
2167 msf->imsf_fmode = pmc->sfmode; 2151 msf->imsf_fmode = pmc->sfmode;
2168 psl = pmc->sflist; 2152 psl = rtnl_dereference(pmc->sflist);
2169 rtnl_unlock(); 2153 rtnl_unlock();
2170 if (!psl) { 2154 if (!psl) {
2171 len = 0; 2155 len = 0;
@@ -2210,7 +2194,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2210 2194
2211 err = -EADDRNOTAVAIL; 2195 err = -EADDRNOTAVAIL;
2212 2196
2213 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 2197 for_each_pmc_rtnl(inet, pmc) {
2214 if (pmc->multi.imr_multiaddr.s_addr == addr && 2198 if (pmc->multi.imr_multiaddr.s_addr == addr &&
2215 pmc->multi.imr_ifindex == gsf->gf_interface) 2199 pmc->multi.imr_ifindex == gsf->gf_interface)
2216 break; 2200 break;
@@ -2218,7 +2202,7 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
2218 if (!pmc) /* must have a prior join */ 2202 if (!pmc) /* must have a prior join */
2219 goto done; 2203 goto done;
2220 gsf->gf_fmode = pmc->sfmode; 2204 gsf->gf_fmode = pmc->sfmode;
2221 psl = pmc->sflist; 2205 psl = rtnl_dereference(pmc->sflist);
2222 rtnl_unlock(); 2206 rtnl_unlock();
2223 count = psl ? psl->sl_count : 0; 2207 count = psl ? psl->sl_count : 0;
2224 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; 2208 copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
@@ -2259,7 +2243,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2259 goto out; 2243 goto out;
2260 2244
2261 rcu_read_lock(); 2245 rcu_read_lock();
2262 for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { 2246 for_each_pmc_rcu(inet, pmc) {
2263 if (pmc->multi.imr_multiaddr.s_addr == loc_addr && 2247 if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
2264 pmc->multi.imr_ifindex == dif) 2248 pmc->multi.imr_ifindex == dif)
2265 break; 2249 break;
@@ -2267,7 +2251,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
2267 ret = inet->mc_all; 2251 ret = inet->mc_all;
2268 if (!pmc) 2252 if (!pmc)
2269 goto unlock; 2253 goto unlock;
2270 psl = pmc->sflist; 2254 psl = rcu_dereference(pmc->sflist);
2271 ret = (pmc->sfmode == MCAST_EXCLUDE); 2255 ret = (pmc->sfmode == MCAST_EXCLUDE);
2272 if (!psl) 2256 if (!psl)
2273 goto unlock; 2257 goto unlock;
@@ -2302,31 +2286,29 @@ void ip_mc_drop_socket(struct sock *sk)
2302 return; 2286 return;
2303 2287
2304 rtnl_lock(); 2288 rtnl_lock();
2305 while ((iml = inet->mc_list) != NULL) { 2289 while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
2306 struct in_device *in_dev; 2290 struct in_device *in_dev;
2307 rcu_assign_pointer(inet->mc_list, iml->next);
2308 2291
2292 inet->mc_list = iml->next_rcu;
2309 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); 2293 in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
2310 (void) ip_mc_leave_src(sk, iml, in_dev); 2294 (void) ip_mc_leave_src(sk, iml, in_dev);
2311 if (in_dev != NULL) { 2295 if (in_dev != NULL)
2312 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); 2296 ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
2313 in_dev_put(in_dev);
2314 }
2315 /* decrease mem now to avoid the memleak warning */ 2297 /* decrease mem now to avoid the memleak warning */
2316 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); 2298 atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
2317 call_rcu(&iml->rcu, ip_mc_socklist_reclaim); 2299 kfree_rcu(iml, rcu);
2318 } 2300 }
2319 rtnl_unlock(); 2301 rtnl_unlock();
2320} 2302}
2321 2303
2322int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) 2304/* called with rcu_read_lock() */
2305int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
2323{ 2306{
2324 struct ip_mc_list *im; 2307 struct ip_mc_list *im;
2325 struct ip_sf_list *psf; 2308 struct ip_sf_list *psf;
2326 int rv = 0; 2309 int rv = 0;
2327 2310
2328 read_lock(&in_dev->mc_list_lock); 2311 for_each_pmc_rcu(in_dev, im) {
2329 for (im=in_dev->mc_list; im; im=im->next) {
2330 if (im->multiaddr == mc_addr) 2312 if (im->multiaddr == mc_addr)
2331 break; 2313 break;
2332 } 2314 }
@@ -2347,7 +2329,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2347 } else 2329 } else
2348 rv = 1; /* unspecified source; tentatively allow */ 2330 rv = 1; /* unspecified source; tentatively allow */
2349 } 2331 }
2350 read_unlock(&in_dev->mc_list_lock);
2351 return rv; 2332 return rv;
2352} 2333}
2353 2334
@@ -2373,13 +2354,11 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2373 in_dev = __in_dev_get_rcu(state->dev); 2354 in_dev = __in_dev_get_rcu(state->dev);
2374 if (!in_dev) 2355 if (!in_dev)
2375 continue; 2356 continue;
2376 read_lock(&in_dev->mc_list_lock); 2357 im = rcu_dereference(in_dev->mc_list);
2377 im = in_dev->mc_list;
2378 if (im) { 2358 if (im) {
2379 state->in_dev = in_dev; 2359 state->in_dev = in_dev;
2380 break; 2360 break;
2381 } 2361 }
2382 read_unlock(&in_dev->mc_list_lock);
2383 } 2362 }
2384 return im; 2363 return im;
2385} 2364}
@@ -2387,11 +2366,9 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
2387static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) 2366static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
2388{ 2367{
2389 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2368 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2390 im = im->next;
2391 while (!im) {
2392 if (likely(state->in_dev != NULL))
2393 read_unlock(&state->in_dev->mc_list_lock);
2394 2369
2370 im = rcu_dereference(im->next_rcu);
2371 while (!im) {
2395 state->dev = next_net_device_rcu(state->dev); 2372 state->dev = next_net_device_rcu(state->dev);
2396 if (!state->dev) { 2373 if (!state->dev) {
2397 state->in_dev = NULL; 2374 state->in_dev = NULL;
@@ -2400,8 +2377,7 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
2400 state->in_dev = __in_dev_get_rcu(state->dev); 2377 state->in_dev = __in_dev_get_rcu(state->dev);
2401 if (!state->in_dev) 2378 if (!state->in_dev)
2402 continue; 2379 continue;
2403 read_lock(&state->in_dev->mc_list_lock); 2380 im = rcu_dereference(state->in_dev->mc_list);
2404 im = state->in_dev->mc_list;
2405 } 2381 }
2406 return im; 2382 return im;
2407} 2383}
@@ -2437,10 +2413,8 @@ static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
2437 __releases(rcu) 2413 __releases(rcu)
2438{ 2414{
2439 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2415 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2440 if (likely(state->in_dev != NULL)) { 2416
2441 read_unlock(&state->in_dev->mc_list_lock); 2417 state->in_dev = NULL;
2442 state->in_dev = NULL;
2443 }
2444 state->dev = NULL; 2418 state->dev = NULL;
2445 rcu_read_unlock(); 2419 rcu_read_unlock();
2446} 2420}
@@ -2462,7 +2436,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2462 querier = "NONE"; 2436 querier = "NONE";
2463#endif 2437#endif
2464 2438
2465 if (state->in_dev->mc_list == im) { 2439 if (rcu_dereference(state->in_dev->mc_list) == im) {
2466 seq_printf(seq, "%d\t%-10s: %5d %7s\n", 2440 seq_printf(seq, "%d\t%-10s: %5d %7s\n",
2467 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); 2441 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
2468 } 2442 }
@@ -2521,8 +2495,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2521 idev = __in_dev_get_rcu(state->dev); 2495 idev = __in_dev_get_rcu(state->dev);
2522 if (unlikely(idev == NULL)) 2496 if (unlikely(idev == NULL))
2523 continue; 2497 continue;
2524 read_lock(&idev->mc_list_lock); 2498 im = rcu_dereference(idev->mc_list);
2525 im = idev->mc_list;
2526 if (likely(im != NULL)) { 2499 if (likely(im != NULL)) {
2527 spin_lock_bh(&im->lock); 2500 spin_lock_bh(&im->lock);
2528 psf = im->sources; 2501 psf = im->sources;
@@ -2533,7 +2506,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
2533 } 2506 }
2534 spin_unlock_bh(&im->lock); 2507 spin_unlock_bh(&im->lock);
2535 } 2508 }
2536 read_unlock(&idev->mc_list_lock);
2537 } 2509 }
2538 return psf; 2510 return psf;
2539} 2511}
@@ -2547,9 +2519,6 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2547 spin_unlock_bh(&state->im->lock); 2519 spin_unlock_bh(&state->im->lock);
2548 state->im = state->im->next; 2520 state->im = state->im->next;
2549 while (!state->im) { 2521 while (!state->im) {
2550 if (likely(state->idev != NULL))
2551 read_unlock(&state->idev->mc_list_lock);
2552
2553 state->dev = next_net_device_rcu(state->dev); 2522 state->dev = next_net_device_rcu(state->dev);
2554 if (!state->dev) { 2523 if (!state->dev) {
2555 state->idev = NULL; 2524 state->idev = NULL;
@@ -2558,8 +2527,7 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
2558 state->idev = __in_dev_get_rcu(state->dev); 2527 state->idev = __in_dev_get_rcu(state->dev);
2559 if (!state->idev) 2528 if (!state->idev)
2560 continue; 2529 continue;
2561 read_lock(&state->idev->mc_list_lock); 2530 state->im = rcu_dereference(state->idev->mc_list);
2562 state->im = state->idev->mc_list;
2563 } 2531 }
2564 if (!state->im) 2532 if (!state->im)
2565 break; 2533 break;
@@ -2605,10 +2573,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
2605 spin_unlock_bh(&state->im->lock); 2573 spin_unlock_bh(&state->im->lock);
2606 state->im = NULL; 2574 state->im = NULL;
2607 } 2575 }
2608 if (likely(state->idev != NULL)) { 2576 state->idev = NULL;
2609 read_unlock(&state->idev->mc_list_lock);
2610 state->idev = NULL;
2611 }
2612 state->dev = NULL; 2577 state->dev = NULL;
2613 rcu_read_unlock(); 2578 rcu_read_unlock();
2614} 2579}
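Note: taken together, the igmp.c hunks replace the mc_list_lock rwlock with RCU: readers walk in_dev->mc_list under rcu_read_lock(), writers rely on the RTNL mutex, new entries are published with rcu_assign_pointer(), and frees are deferred with kfree_rcu(). The for_each_pmc_rcu()/for_each_pmc_rtnl() helpers used throughout are defined earlier in this file; they look roughly like this (sketch, not copied from the diff):

	#define for_each_pmc_rcu(in_dev, pmc)				\
		for (pmc = rcu_dereference(in_dev->mc_list);		\
		     pmc != NULL;					\
		     pmc = rcu_dereference(pmc->next_rcu))

	#define for_each_pmc_rtnl(in_dev, pmc)				\
		for (pmc = rtnl_dereference(in_dev->mc_list);		\
		     pmc != NULL;					\
		     pmc = rtnl_dereference(pmc->next_rcu))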
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7174370b1195..c14d88ad348d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
33 * This struct holds the first and last local port number. 33 * This struct holds the first and last local port number.
34 */ 34 */
35struct local_ports sysctl_local_ports __read_mostly = { 35struct local_ports sysctl_local_ports __read_mostly = {
36 .lock = SEQLOCK_UNLOCKED, 36 .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock),
37 .range = { 32768, 61000 }, 37 .range = { 32768, 61000 },
38}; 38};
39 39
@@ -55,7 +55,6 @@ EXPORT_SYMBOL(inet_get_local_port_range);
55int inet_csk_bind_conflict(const struct sock *sk, 55int inet_csk_bind_conflict(const struct sock *sk,
56 const struct inet_bind_bucket *tb) 56 const struct inet_bind_bucket *tb)
57{ 57{
58 const __be32 sk_rcv_saddr = inet_rcv_saddr(sk);
59 struct sock *sk2; 58 struct sock *sk2;
60 struct hlist_node *node; 59 struct hlist_node *node;
61 int reuse = sk->sk_reuse; 60 int reuse = sk->sk_reuse;
@@ -75,9 +74,9 @@ int inet_csk_bind_conflict(const struct sock *sk,
75 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 74 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
76 if (!reuse || !sk2->sk_reuse || 75 if (!reuse || !sk2->sk_reuse ||
77 sk2->sk_state == TCP_LISTEN) { 76 sk2->sk_state == TCP_LISTEN) {
78 const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); 77 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
79 if (!sk2_rcv_saddr || !sk_rcv_saddr || 78 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
80 sk2_rcv_saddr == sk_rcv_saddr) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
81 break; 80 break;
82 } 81 }
83 } 82 }
@@ -351,30 +350,24 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
351EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); 350EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
352 351
353struct dst_entry *inet_csk_route_req(struct sock *sk, 352struct dst_entry *inet_csk_route_req(struct sock *sk,
353 struct flowi4 *fl4,
354 const struct request_sock *req) 354 const struct request_sock *req)
355{ 355{
356 struct rtable *rt; 356 struct rtable *rt;
357 const struct inet_request_sock *ireq = inet_rsk(req); 357 const struct inet_request_sock *ireq = inet_rsk(req);
358 struct ip_options *opt = inet_rsk(req)->opt; 358 struct ip_options_rcu *opt = inet_rsk(req)->opt;
359 struct flowi fl = { .oif = sk->sk_bound_dev_if,
360 .mark = sk->sk_mark,
361 .nl_u = { .ip4_u =
362 { .daddr = ((opt && opt->srr) ?
363 opt->faddr :
364 ireq->rmt_addr),
365 .saddr = ireq->loc_addr,
366 .tos = RT_CONN_FLAGS(sk) } },
367 .proto = sk->sk_protocol,
368 .flags = inet_sk_flowi_flags(sk),
369 .uli_u = { .ports =
370 { .sport = inet_sk(sk)->inet_sport,
371 .dport = ireq->rmt_port } } };
372 struct net *net = sock_net(sk); 359 struct net *net = sock_net(sk);
373 360
374 security_req_classify_flow(req, &fl); 361 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
375 if (ip_route_output_flow(net, &rt, &fl, sk, 0)) 362 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
363 sk->sk_protocol, inet_sk_flowi_flags(sk),
364 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
365 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
366 security_req_classify_flow(req, flowi4_to_flowi(fl4));
367 rt = ip_route_output_flow(net, fl4, sk);
368 if (IS_ERR(rt))
376 goto no_route; 369 goto no_route;
377 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 370 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
378 goto route_err; 371 goto route_err;
379 return &rt->dst; 372 return &rt->dst;
380 373
@@ -386,6 +379,39 @@ no_route:
386} 379}
387EXPORT_SYMBOL_GPL(inet_csk_route_req); 380EXPORT_SYMBOL_GPL(inet_csk_route_req);
388 381
382struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
383 struct sock *newsk,
384 const struct request_sock *req)
385{
386 const struct inet_request_sock *ireq = inet_rsk(req);
387 struct inet_sock *newinet = inet_sk(newsk);
388 struct ip_options_rcu *opt = ireq->opt;
389 struct net *net = sock_net(sk);
390 struct flowi4 *fl4;
391 struct rtable *rt;
392
393 fl4 = &newinet->cork.fl.u.ip4;
394 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
395 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
396 sk->sk_protocol, inet_sk_flowi_flags(sk),
397 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
398 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
399 security_req_classify_flow(req, flowi4_to_flowi(fl4));
400 rt = ip_route_output_flow(net, fl4, sk);
401 if (IS_ERR(rt))
402 goto no_route;
403 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
404 goto route_err;
405 return &rt->dst;
406
407route_err:
408 ip_rt_put(rt);
409no_route:
410 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
411 return NULL;
412}
413EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
414
389static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, 415static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
390 const u32 rnd, const u32 synq_hsize) 416 const u32 rnd, const u32 synq_hsize)
391{ 417{
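Note: inet_csk_route_req() now takes a caller-provided struct flowi4 and fills it via flowi4_init_output(), and the new inet_csk_route_child_sock() applies the same flow setup when a request socket becomes a full child socket, storing the flow in the child's inet cork. A hedged sketch of how an accept path might consume the new helper; example_attach_route() is illustrative and not part of this diff (the real user is the TCP accept path):

	static int example_attach_route(struct sock *listener, struct sock *newsk,
					const struct request_sock *req)
	{
		struct dst_entry *dst = inet_csk_route_child_sock(listener, newsk, req);

		if (!dst)
			return -ENETUNREACH;	/* helper already counted OUTNOROUTES */
		sk_setup_caps(newsk, dst);	/* child socket now owns the route */
		return 0;
	}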
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce320..3267d3898437 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -124,7 +124,7 @@ static int inet_csk_diag_fill(struct sock *sk,
124 124
125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
126 if (r->idiag_family == AF_INET6) { 126 if (r->idiag_family == AF_INET6) {
127 struct ipv6_pinfo *np = inet6_sk(sk); 127 const struct ipv6_pinfo *np = inet6_sk(sk);
128 128
129 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, 129 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
130 &np->rcv_saddr); 130 &np->rcv_saddr);
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
425 bc += op->no; 425 bc += op->no;
426 } 426 }
427 } 427 }
428 return (len == 0); 428 return len == 0;
429} 429}
430 430
431static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
@@ -437,7 +437,7 @@ static int valid_cc(const void *bc, int len, int cc)
437 return 0; 437 return 0;
438 if (cc == len) 438 if (cc == len)
439 return 1; 439 return 1;
440 if (op->yes < 4) 440 if (op->yes < 4 || op->yes & 3)
441 return 0; 441 return 0;
442 len -= op->yes; 442 len -= op->yes;
443 bc += op->yes; 443 bc += op->yes;
@@ -447,11 +447,11 @@ static int valid_cc(const void *bc, int len, int cc)
447 447
448static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) 448static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
449{ 449{
450 const unsigned char *bc = bytecode; 450 const void *bc = bytecode;
451 int len = bytecode_len; 451 int len = bytecode_len;
452 452
453 while (len > 0) { 453 while (len > 0) {
454 struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; 454 const struct inet_diag_bc_op *op = bc;
455 455
456//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); 456//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
457 switch (op->code) { 457 switch (op->code) {
@@ -462,22 +462,20 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
462 case INET_DIAG_BC_S_LE: 462 case INET_DIAG_BC_S_LE:
463 case INET_DIAG_BC_D_GE: 463 case INET_DIAG_BC_D_GE:
464 case INET_DIAG_BC_D_LE: 464 case INET_DIAG_BC_D_LE:
465 if (op->yes < 4 || op->yes > len + 4)
466 return -EINVAL;
467 case INET_DIAG_BC_JMP: 465 case INET_DIAG_BC_JMP:
468 if (op->no < 4 || op->no > len + 4) 466 if (op->no < 4 || op->no > len + 4 || op->no & 3)
469 return -EINVAL; 467 return -EINVAL;
470 if (op->no < len && 468 if (op->no < len &&
471 !valid_cc(bytecode, bytecode_len, len - op->no)) 469 !valid_cc(bytecode, bytecode_len, len - op->no))
472 return -EINVAL; 470 return -EINVAL;
473 break; 471 break;
474 case INET_DIAG_BC_NOP: 472 case INET_DIAG_BC_NOP:
475 if (op->yes < 4 || op->yes > len + 4)
476 return -EINVAL;
477 break; 473 break;
478 default: 474 default:
479 return -EINVAL; 475 return -EINVAL;
480 } 476 }
477 if (op->yes < 4 || op->yes > len + 4 || op->yes & 3)
478 return -EINVAL;
481 bc += op->yes; 479 bc += op->yes;
482 len -= op->yes; 480 len -= op->yes;
483 } 481 }
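Note: the audit rewrite applies one rule to both jump fields: an offset must be at least one op (4 bytes), must not run past the remaining bytecode, and must be a multiple of 4 so following it always lands on an op boundary; the check is hoisted below the switch so no opcode can bypass it. Restated as a small predicate (bc_offset_ok() is illustrative):

	/* struct inet_diag_bc_op is 4 bytes, so valid jump offsets are
	 * 4-byte multiples within the remaining bytecode (the +4 allows a
	 * jump that steps exactly past the end). */
	static bool bc_offset_ok(unsigned int offset, int remaining)
	{
		return offset >= 4 && offset <= remaining + 4 && !(offset & 3);
	}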
@@ -490,9 +488,11 @@ static int inet_csk_diag_dump(struct sock *sk,
490{ 488{
491 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 489 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
492 490
493 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 491 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
494 struct inet_diag_entry entry; 492 struct inet_diag_entry entry;
495 struct rtattr *bc = (struct rtattr *)(r + 1); 493 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
494 sizeof(*r),
495 INET_DIAG_REQ_BYTECODE);
496 struct inet_sock *inet = inet_sk(sk); 496 struct inet_sock *inet = inet_sk(sk);
497 497
498 entry.family = sk->sk_family; 498 entry.family = sk->sk_family;
@@ -512,7 +512,7 @@ static int inet_csk_diag_dump(struct sock *sk,
512 entry.dport = ntohs(inet->inet_dport); 512 entry.dport = ntohs(inet->inet_dport);
513 entry.userlocks = sk->sk_userlocks; 513 entry.userlocks = sk->sk_userlocks;
514 514
515 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) 515 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
516 return 0; 516 return 0;
517 } 517 }
518 518
@@ -527,9 +527,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
527{ 527{
528 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 528 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
529 529
530 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 530 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
531 struct inet_diag_entry entry; 531 struct inet_diag_entry entry;
532 struct rtattr *bc = (struct rtattr *)(r + 1); 532 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
533 sizeof(*r),
534 INET_DIAG_REQ_BYTECODE);
533 535
534 entry.family = tw->tw_family; 536 entry.family = tw->tw_family;
535#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 537#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -548,7 +550,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
548 entry.dport = ntohs(tw->tw_dport); 550 entry.dport = ntohs(tw->tw_dport);
549 entry.userlocks = 0; 551 entry.userlocks = 0;
550 552
551 if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) 553 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
552 return 0; 554 return 0;
553 } 555 }
554 556
@@ -618,7 +620,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
618 struct inet_diag_req *r = NLMSG_DATA(cb->nlh); 620 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
619 struct inet_connection_sock *icsk = inet_csk(sk); 621 struct inet_connection_sock *icsk = inet_csk(sk);
620 struct listen_sock *lopt; 622 struct listen_sock *lopt;
621 struct rtattr *bc = NULL; 623 const struct nlattr *bc = NULL;
622 struct inet_sock *inet = inet_sk(sk); 624 struct inet_sock *inet = inet_sk(sk);
623 int j, s_j; 625 int j, s_j;
624 int reqnum, s_reqnum; 626 int reqnum, s_reqnum;
@@ -638,8 +640,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
638 if (!lopt || !lopt->qlen) 640 if (!lopt || !lopt->qlen)
639 goto out; 641 goto out;
640 642
641 if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { 643 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
642 bc = (struct rtattr *)(r + 1); 644 bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
645 INET_DIAG_REQ_BYTECODE);
643 entry.sport = inet->inet_num; 646 entry.sport = inet->inet_num;
644 entry.userlocks = sk->sk_userlocks; 647 entry.userlocks = sk->sk_userlocks;
645 } 648 }
@@ -672,8 +675,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
672 &ireq->rmt_addr; 675 &ireq->rmt_addr;
673 entry.dport = ntohs(ireq->rmt_port); 676 entry.dport = ntohs(ireq->rmt_port);
674 677
675 if (!inet_diag_bc_run(RTA_DATA(bc), 678 if (!inet_diag_bc_run(nla_data(bc),
676 RTA_PAYLOAD(bc), &entry)) 679 nla_len(bc), &entry))
677 continue; 680 continue;
678 } 681 }
679 682
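Note: all three dump paths (full sockets, timewait sockets, request sockets) now locate the filter the same way: check nlmsg_attrlen() for appended attributes, fetch INET_DIAG_REQ_BYTECODE with nlmsg_find_attr(), and hand nla_data()/nla_len() to inet_diag_bc_run(). The shared lookup, pulled into a hypothetical helper for illustration:

	static const struct nlattr *diag_find_bytecode(struct nlmsghdr *nlh)
	{
		if (!nlmsg_attrlen(nlh, sizeof(struct inet_diag_req)))
			return NULL;	/* no attributes follow the request header */
		return nlmsg_find_attr(nlh, sizeof(struct inet_diag_req),
				       INET_DIAG_REQ_BYTECODE);
	}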
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fb7ad5a21ff3..3c0369a3a663 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -101,19 +101,42 @@ void inet_put_port(struct sock *sk)
101} 101}
102EXPORT_SYMBOL(inet_put_port); 102EXPORT_SYMBOL(inet_put_port);
103 103
104void __inet_inherit_port(struct sock *sk, struct sock *child) 104int __inet_inherit_port(struct sock *sk, struct sock *child)
105{ 105{
106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; 106 struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
107 const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, 107 unsigned short port = inet_sk(child)->inet_num;
108 const int bhash = inet_bhashfn(sock_net(sk), port,
108 table->bhash_size); 109 table->bhash_size);
109 struct inet_bind_hashbucket *head = &table->bhash[bhash]; 110 struct inet_bind_hashbucket *head = &table->bhash[bhash];
110 struct inet_bind_bucket *tb; 111 struct inet_bind_bucket *tb;
111 112
112 spin_lock(&head->lock); 113 spin_lock(&head->lock);
113 tb = inet_csk(sk)->icsk_bind_hash; 114 tb = inet_csk(sk)->icsk_bind_hash;
114 sk_add_bind_node(child, &tb->owners); 115 if (tb->port != port) {
115 inet_csk(child)->icsk_bind_hash = tb; 116 /* NOTE: using tproxy and redirecting skbs to a proxy
117 * on a different listener port breaks the assumption
118 * that the listener socket's icsk_bind_hash is the same
119 * as that of the child socket. We have to look up or
120 * create a new bind bucket for the child here. */
121 struct hlist_node *node;
122 inet_bind_bucket_for_each(tb, node, &head->chain) {
123 if (net_eq(ib_net(tb), sock_net(sk)) &&
124 tb->port == port)
125 break;
126 }
127 if (!node) {
128 tb = inet_bind_bucket_create(table->bind_bucket_cachep,
129 sock_net(sk), head, port);
130 if (!tb) {
131 spin_unlock(&head->lock);
132 return -ENOMEM;
133 }
134 }
135 }
136 inet_bind_hash(child, tb, port);
116 spin_unlock(&head->lock); 137 spin_unlock(&head->lock);
138
139 return 0;
117} 140}
118EXPORT_SYMBOL_GPL(__inet_inherit_port); 141EXPORT_SYMBOL_GPL(__inet_inherit_port);
119 142
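Note: because __inet_inherit_port() may now have to allocate a bind bucket (the tproxy case where the child's port differs from the listener's), it returns an int and can fail with -ENOMEM, so callers must unwind instead of assuming success. A hedged sketch of the caller-side shape; example_finish_child() is illustrative, and the real accept paths elsewhere in this series do more cleanup than shown here:

	static struct sock *example_finish_child(struct sock *listener,
						 struct sock *child)
	{
		if (__inet_inherit_port(listener, child) < 0) {
			/* no bind bucket could be found or created for child's port */
			sock_put(child);
			return NULL;
		}
		return child;
	}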
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 47038cb6c138..85a0f75dae64 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -51,8 +51,8 @@ MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
51 * Basic tcp checks whether packet is suitable for LRO 51 * Basic tcp checks whether packet is suitable for LRO
52 */ 52 */
53 53
54static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, 54static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
55 int len, struct net_lro_desc *lro_desc) 55 int len, const struct net_lro_desc *lro_desc)
56{ 56{
57 /* check ip header: don't aggregate padded frames */ 57 /* check ip header: don't aggregate padded frames */
58 if (ntohs(iph->tot_len) != len) 58 if (ntohs(iph->tot_len) != len)
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c5af909cf701..3c8dfa16614d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -505,7 +505,9 @@ restart:
505 } 505 }
506 506
507 rcu_read_unlock(); 507 rcu_read_unlock();
508 local_bh_disable();
508 inet_twsk_deschedule(tw, twdr); 509 inet_twsk_deschedule(tw, twdr);
510 local_bh_enable();
509 inet_twsk_put(tw); 511 inet_twsk_put(tw);
510 goto restart_rcu; 512 goto restart_rcu;
511 } 513 }
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 9ffa24b9a804..ce616d92cc54 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -63,7 +63,7 @@
63 * refcnt: atomically against modifications on other CPU; 63 * refcnt: atomically against modifications on other CPU;
64 * usually under some other lock to prevent node disappearing 64 * usually under some other lock to prevent node disappearing
65 * dtime: unused node list lock 65 * dtime: unused node list lock
66 * v4daddr: unchangeable 66 * daddr: unchangeable
67 * ip_id_count: atomic value (no lock needed) 67 * ip_id_count: atomic value (no lock needed)
68 */ 68 */
69 69
@@ -72,21 +72,31 @@ static struct kmem_cache *peer_cachep __read_mostly;
72#define node_height(x) x->avl_height 72#define node_height(x) x->avl_height
73 73
74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) 74#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
75#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
75static const struct inet_peer peer_fake_node = { 76static const struct inet_peer peer_fake_node = {
76 .avl_left = peer_avl_empty, 77 .avl_left = peer_avl_empty_rcu,
77 .avl_right = peer_avl_empty, 78 .avl_right = peer_avl_empty_rcu,
78 .avl_height = 0 79 .avl_height = 0
79}; 80};
80 81
81static struct { 82struct inet_peer_base {
82 struct inet_peer *root; 83 struct inet_peer __rcu *root;
83 spinlock_t lock; 84 seqlock_t lock;
84 int total; 85 int total;
85} peers = { 86};
86 .root = peer_avl_empty, 87
87 .lock = __SPIN_LOCK_UNLOCKED(peers.lock), 88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu,
90 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
91 .total = 0,
92};
93
94static struct inet_peer_base v6_peers = {
95 .root = peer_avl_empty_rcu,
96 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
88 .total = 0, 97 .total = 0,
89}; 98};
99
90#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 100#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
91 101
92/* Exported for sysctl_net_ipv4. */ 102/* Exported for sysctl_net_ipv4. */
@@ -144,62 +154,99 @@ void __init inet_initpeers(void)
144/* Called with or without local BH being disabled. */ 154/* Called with or without local BH being disabled. */
145static void unlink_from_unused(struct inet_peer *p) 155static void unlink_from_unused(struct inet_peer *p)
146{ 156{
147 if (!list_empty(&p->unused)) { 157 spin_lock_bh(&unused_peers.lock);
148 spin_lock_bh(&unused_peers.lock); 158 list_del_init(&p->unused);
149 list_del_init(&p->unused); 159 spin_unlock_bh(&unused_peers.lock);
150 spin_unlock_bh(&unused_peers.lock); 160}
161
162static int addr_compare(const struct inetpeer_addr *a,
163 const struct inetpeer_addr *b)
164{
165 int i, n = (a->family == AF_INET ? 1 : 4);
166
167 for (i = 0; i < n; i++) {
168 if (a->addr.a6[i] == b->addr.a6[i])
169 continue;
170 if (a->addr.a6[i] < b->addr.a6[i])
171 return -1;
172 return 1;
151 } 173 }
174
175 return 0;
152} 176}
153 177
178#define rcu_deref_locked(X, BASE) \
179 rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
180
154/* 181/*
155 * Called with local BH disabled and the pool lock held. 182 * Called with local BH disabled and the pool lock held.
156 */ 183 */
157#define lookup(_daddr, _stack) \ 184#define lookup(_daddr, _stack, _base) \
158({ \ 185({ \
159 struct inet_peer *u, **v; \ 186 struct inet_peer *u; \
187 struct inet_peer __rcu **v; \
160 \ 188 \
161 stackptr = _stack; \ 189 stackptr = _stack; \
162 *stackptr++ = &peers.root; \ 190 *stackptr++ = &_base->root; \
163 for (u = peers.root; u != peer_avl_empty; ) { \ 191 for (u = rcu_deref_locked(_base->root, _base); \
164 if (_daddr == u->v4daddr) \ 192 u != peer_avl_empty; ) { \
193 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \
165 break; \ 195 break; \
166 if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ 196 if (cmp == -1) \
167 v = &u->avl_left; \ 197 v = &u->avl_left; \
168 else \ 198 else \
169 v = &u->avl_right; \ 199 v = &u->avl_right; \
170 *stackptr++ = v; \ 200 *stackptr++ = v; \
171 u = *v; \ 201 u = rcu_deref_locked(*v, _base); \
172 } \ 202 } \
173 u; \ 203 u; \
174}) 204})
175 205
206static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
207{
208 int cur, old = atomic_read(ptr);
209
210 while (old != u) {
211 *newv = old + a;
212 cur = atomic_cmpxchg(ptr, old, *newv);
213 if (cur == old)
214 return true;
215 old = cur;
216 }
217 return false;
218}
219
176/* 220/*
177 * Called with rcu_read_lock_bh() 221 * Called with rcu_read_lock()
178 * Because we hold no lock against a writer, its quite possible we fall 222 * Because we hold no lock against a writer, its quite possible we fall
179 * in an endless loop. 223 * in an endless loop.
180 * But every pointer we follow is guaranteed to be valid thanks to RCU. 224 * But every pointer we follow is guaranteed to be valid thanks to RCU.
181 * We exit from this function if number of links exceeds PEER_MAXDEPTH 225 * We exit from this function if number of links exceeds PEER_MAXDEPTH
182 */ 226 */
183static struct inet_peer *lookup_rcu_bh(__be32 daddr) 227static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
228 struct inet_peer_base *base,
229 int *newrefcnt)
184{ 230{
185 struct inet_peer *u = rcu_dereference_bh(peers.root); 231 struct inet_peer *u = rcu_dereference(base->root);
186 int count = 0; 232 int count = 0;
187 233
188 while (u != peer_avl_empty) { 234 while (u != peer_avl_empty) {
189 if (daddr == u->v4daddr) { 235 int cmp = addr_compare(daddr, &u->daddr);
236 if (cmp == 0) {
190 /* Before taking a reference, check if this entry was 237 /* Before taking a reference, check if this entry was
191 * deleted, unlink_from_pool() sets refcnt=-1 to make 238 * deleted, unlink_from_pool() sets refcnt=-1 to make
192 * distinction between an unused entry (refcnt=0) and 239 * distinction between an unused entry (refcnt=0) and
193 * a freed one. 240 * a freed one.
194 */ 241 */
195 if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1))) 242 if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
196 u = NULL; 243 u = NULL;
197 return u; 244 return u;
198 } 245 }
199 if ((__force __u32)daddr < (__force __u32)u->v4daddr) 246 if (cmp == -1)
200 u = rcu_dereference_bh(u->avl_left); 247 u = rcu_dereference(u->avl_left);
201 else 248 else
202 u = rcu_dereference_bh(u->avl_right); 249 u = rcu_dereference(u->avl_right);
203 if (unlikely(++count == PEER_MAXDEPTH)) 250 if (unlikely(++count == PEER_MAXDEPTH))
204 break; 251 break;
205 } 252 }
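Note: with the tree now keyed by struct inetpeer_addr, addr_compare() gives IPv4 and IPv6 entries a single total order (one 32-bit word compared for AF_INET, four for AF_INET6), and lookup_rcu() reports through *newrefcnt whether its reference was the first one taken. A sketch of the key an IPv4 caller would build for the reworked inet_getpeer(); the skb variable is assumed to be in scope:

	struct inetpeer_addr key = {
		.family  = AF_INET,
		.addr.a4 = ip_hdr(skb)->daddr,	/* network byte order, as stored in the tree */
	};
	struct inet_peer *peer = inet_getpeer(&key, 1);	/* create if missing */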
@@ -207,15 +254,17 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
207} 254}
208 255
209/* Called with local BH disabled and the pool lock held. */ 256/* Called with local BH disabled and the pool lock held. */
210#define lookup_rightempty(start) \ 257#define lookup_rightempty(start, base) \
211({ \ 258({ \
212 struct inet_peer *u, **v; \ 259 struct inet_peer *u; \
260 struct inet_peer __rcu **v; \
213 *stackptr++ = &start->avl_left; \ 261 *stackptr++ = &start->avl_left; \
214 v = &start->avl_left; \ 262 v = &start->avl_left; \
215 for (u = *v; u->avl_right != peer_avl_empty; ) { \ 263 for (u = rcu_deref_locked(*v, base); \
264 u->avl_right != peer_avl_empty_rcu; ) { \
216 v = &u->avl_right; \ 265 v = &u->avl_right; \
217 *stackptr++ = v; \ 266 *stackptr++ = v; \
218 u = *v; \ 267 u = rcu_deref_locked(*v, base); \
219 } \ 268 } \
220 u; \ 269 u; \
221}) 270})
@@ -224,74 +273,76 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr)
224 * Variable names are the proof of operation correctness. 273 * Variable names are the proof of operation correctness.
225 * Look into mm/map_avl.c for more detail description of the ideas. 274 * Look into mm/map_avl.c for more detail description of the ideas.
226 */ 275 */
227static void peer_avl_rebalance(struct inet_peer **stack[], 276static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
228 struct inet_peer ***stackend) 277 struct inet_peer __rcu ***stackend,
278 struct inet_peer_base *base)
229{ 279{
230 struct inet_peer **nodep, *node, *l, *r; 280 struct inet_peer __rcu **nodep;
281 struct inet_peer *node, *l, *r;
231 int lh, rh; 282 int lh, rh;
232 283
233 while (stackend > stack) { 284 while (stackend > stack) {
234 nodep = *--stackend; 285 nodep = *--stackend;
235 node = *nodep; 286 node = rcu_deref_locked(*nodep, base);
236 l = node->avl_left; 287 l = rcu_deref_locked(node->avl_left, base);
237 r = node->avl_right; 288 r = rcu_deref_locked(node->avl_right, base);
238 lh = node_height(l); 289 lh = node_height(l);
239 rh = node_height(r); 290 rh = node_height(r);
240 if (lh > rh + 1) { /* l: RH+2 */ 291 if (lh > rh + 1) { /* l: RH+2 */
241 struct inet_peer *ll, *lr, *lrl, *lrr; 292 struct inet_peer *ll, *lr, *lrl, *lrr;
242 int lrh; 293 int lrh;
243 ll = l->avl_left; 294 ll = rcu_deref_locked(l->avl_left, base);
244 lr = l->avl_right; 295 lr = rcu_deref_locked(l->avl_right, base);
245 lrh = node_height(lr); 296 lrh = node_height(lr);
246 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 297 if (lrh <= node_height(ll)) { /* ll: RH+1 */
247 node->avl_left = lr; /* lr: RH or RH+1 */ 298 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
248 node->avl_right = r; /* r: RH */ 299 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
249 node->avl_height = lrh + 1; /* RH+1 or RH+2 */ 300 node->avl_height = lrh + 1; /* RH+1 or RH+2 */
250 l->avl_left = ll; /* ll: RH+1 */ 301 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */
251 l->avl_right = node; /* node: RH+1 or RH+2 */ 302 RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */
252 l->avl_height = node->avl_height + 1; 303 l->avl_height = node->avl_height + 1;
253 *nodep = l; 304 RCU_INIT_POINTER(*nodep, l);
254 } else { /* ll: RH, lr: RH+1 */ 305 } else { /* ll: RH, lr: RH+1 */
255 lrl = lr->avl_left; /* lrl: RH or RH-1 */ 306 lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
256 lrr = lr->avl_right; /* lrr: RH or RH-1 */ 307 lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
257 node->avl_left = lrr; /* lrr: RH or RH-1 */ 308 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
258 node->avl_right = r; /* r: RH */ 309 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
259 node->avl_height = rh + 1; /* node: RH+1 */ 310 node->avl_height = rh + 1; /* node: RH+1 */
260 l->avl_left = ll; /* ll: RH */ 311 RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */
261 l->avl_right = lrl; /* lrl: RH or RH-1 */ 312 RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */
262 l->avl_height = rh + 1; /* l: RH+1 */ 313 l->avl_height = rh + 1; /* l: RH+1 */
263 lr->avl_left = l; /* l: RH+1 */ 314 RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */
264 lr->avl_right = node; /* node: RH+1 */ 315 RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */
265 lr->avl_height = rh + 2; 316 lr->avl_height = rh + 2;
266 *nodep = lr; 317 RCU_INIT_POINTER(*nodep, lr);
267 } 318 }
268 } else if (rh > lh + 1) { /* r: LH+2 */ 319 } else if (rh > lh + 1) { /* r: LH+2 */
269 struct inet_peer *rr, *rl, *rlr, *rll; 320 struct inet_peer *rr, *rl, *rlr, *rll;
270 int rlh; 321 int rlh;
271 rr = r->avl_right; 322 rr = rcu_deref_locked(r->avl_right, base);
272 rl = r->avl_left; 323 rl = rcu_deref_locked(r->avl_left, base);
273 rlh = node_height(rl); 324 rlh = node_height(rl);
274 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 325 if (rlh <= node_height(rr)) { /* rr: LH+1 */
275 node->avl_right = rl; /* rl: LH or LH+1 */ 326 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
276 node->avl_left = l; /* l: LH */ 327 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
277 node->avl_height = rlh + 1; /* LH+1 or LH+2 */ 328 node->avl_height = rlh + 1; /* LH+1 or LH+2 */
278 r->avl_right = rr; /* rr: LH+1 */ 329 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */
279 r->avl_left = node; /* node: LH+1 or LH+2 */ 330 RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */
280 r->avl_height = node->avl_height + 1; 331 r->avl_height = node->avl_height + 1;
281 *nodep = r; 332 RCU_INIT_POINTER(*nodep, r);
282 } else { /* rr: RH, rl: RH+1 */ 333 } else { /* rr: RH, rl: RH+1 */
283 rlr = rl->avl_right; /* rlr: LH or LH-1 */ 334 rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
284 rll = rl->avl_left; /* rll: LH or LH-1 */ 335 rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
285 node->avl_right = rll; /* rll: LH or LH-1 */ 336 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
286 node->avl_left = l; /* l: LH */ 337 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
287 node->avl_height = lh + 1; /* node: LH+1 */ 338 node->avl_height = lh + 1; /* node: LH+1 */
288 r->avl_right = rr; /* rr: LH */ 339 RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */
289 r->avl_left = rlr; /* rlr: LH or LH-1 */ 340 RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */
290 r->avl_height = lh + 1; /* r: LH+1 */ 341 r->avl_height = lh + 1; /* r: LH+1 */
291 rl->avl_right = r; /* r: LH+1 */ 342 RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */
292 rl->avl_left = node; /* node: LH+1 */ 343 RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */
293 rl->avl_height = lh + 2; 344 rl->avl_height = lh + 2;
294 *nodep = rl; 345 RCU_INIT_POINTER(*nodep, rl);
295 } 346 }
296 } else { 347 } else {
297 node->avl_height = (lh > rh ? lh : rh) + 1; 348 node->avl_height = (lh > rh ? lh : rh) + 1;
@@ -300,14 +351,14 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
300} 351}
301 352
302/* Called with local BH disabled and the pool lock held. */ 353/* Called with local BH disabled and the pool lock held. */
303#define link_to_pool(n) \ 354#define link_to_pool(n, base) \
304do { \ 355do { \
305 n->avl_height = 1; \ 356 n->avl_height = 1; \
306 n->avl_left = peer_avl_empty; \ 357 n->avl_left = peer_avl_empty_rcu; \
307 n->avl_right = peer_avl_empty; \ 358 n->avl_right = peer_avl_empty_rcu; \
308 smp_wmb(); /* lockless readers can catch us now */ \ 359 /* lockless readers can catch us now */ \
309 **--stackptr = n; \ 360 rcu_assign_pointer(**--stackptr, n); \
310 peer_avl_rebalance(stack, stackptr); \ 361 peer_avl_rebalance(stack, stackptr, base); \
311} while (0) 362} while (0)
312 363
313static void inetpeer_free_rcu(struct rcu_head *head) 364static void inetpeer_free_rcu(struct rcu_head *head)
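Note: in link_to_pool() the explicit smp_wmb() followed by a plain store is folded into rcu_assign_pointer(), which supplies the publication barrier itself; lookup_rcu() pairs it with rcu_dereference() on the reader side. The pairing, reduced to two hypothetical helpers:

	static void publish_left_child(struct inet_peer *parent, struct inet_peer *n)
	{
		/* writer side, pool lock held: barrier + store in one call */
		rcu_assign_pointer(parent->avl_left, n);
	}

	static struct inet_peer *read_left_child(struct inet_peer *parent)
	{
		/* reader side, under rcu_read_lock(): dependency-ordered load */
		return rcu_dereference(parent->avl_left);
	}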
@@ -316,13 +367,14 @@ static void inetpeer_free_rcu(struct rcu_head *head)
316} 367}
317 368
318/* May be called with local BH enabled. */ 369/* May be called with local BH enabled. */
319static void unlink_from_pool(struct inet_peer *p) 370static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
371 struct inet_peer __rcu **stack[PEER_MAXDEPTH])
320{ 372{
321 int do_free; 373 int do_free;
322 374
323 do_free = 0; 375 do_free = 0;
324 376
325 spin_lock_bh(&peers.lock); 377 write_seqlock_bh(&base->lock);
326 /* Check the reference counter. It was artificially incremented by 1 378 /* Check the reference counter. It was artificially incremented by 1
327 * in cleanup() function to prevent sudden disappearing. If we can 379 * in cleanup() function to prevent sudden disappearing. If we can
328 * atomically (because of lockless readers) take this last reference, 380 * atomically (because of lockless readers) take this last reference,
@@ -330,38 +382,37 @@ static void unlink_from_pool(struct inet_peer *p)
330 * We use refcnt=-1 to alert lockless readers this entry is deleted. 382 * We use refcnt=-1 to alert lockless readers this entry is deleted.
331 */ 383 */
332 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { 384 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
333 struct inet_peer **stack[PEER_MAXDEPTH]; 385 struct inet_peer __rcu ***stackptr, ***delp;
334 struct inet_peer ***stackptr, ***delp; 386 if (lookup(&p->daddr, stack, base) != p)
335 if (lookup(p->v4daddr, stack) != p)
336 BUG(); 387 BUG();
337 delp = stackptr - 1; /* *delp[0] == p */ 388 delp = stackptr - 1; /* *delp[0] == p */
338 if (p->avl_left == peer_avl_empty) { 389 if (p->avl_left == peer_avl_empty_rcu) {
339 *delp[0] = p->avl_right; 390 *delp[0] = p->avl_right;
340 --stackptr; 391 --stackptr;
341 } else { 392 } else {
342 /* look for a node to insert instead of p */ 393 /* look for a node to insert instead of p */
343 struct inet_peer *t; 394 struct inet_peer *t;
344 t = lookup_rightempty(p); 395 t = lookup_rightempty(p, base);
345 BUG_ON(*stackptr[-1] != t); 396 BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
346 **--stackptr = t->avl_left; 397 **--stackptr = t->avl_left;
347 /* t is removed, t->v4daddr > x->v4daddr for any 398 /* t is removed, t->daddr > x->daddr for any
348 * x in p->avl_left subtree. 399 * x in p->avl_left subtree.
349 * Put t in the old place of p. */ 400 * Put t in the old place of p. */
350 *delp[0] = t; 401 RCU_INIT_POINTER(*delp[0], t);
351 t->avl_left = p->avl_left; 402 t->avl_left = p->avl_left;
352 t->avl_right = p->avl_right; 403 t->avl_right = p->avl_right;
353 t->avl_height = p->avl_height; 404 t->avl_height = p->avl_height;
354 BUG_ON(delp[1] != &p->avl_left); 405 BUG_ON(delp[1] != &p->avl_left);
355 delp[1] = &t->avl_left; /* was &p->avl_left */ 406 delp[1] = &t->avl_left; /* was &p->avl_left */
356 } 407 }
357 peer_avl_rebalance(stack, stackptr); 408 peer_avl_rebalance(stack, stackptr, base);
358 peers.total--; 409 base->total--;
359 do_free = 1; 410 do_free = 1;
360 } 411 }
361 spin_unlock_bh(&peers.lock); 412 write_sequnlock_bh(&base->lock);
362 413
363 if (do_free) 414 if (do_free)
364 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 415 call_rcu(&p->rcu, inetpeer_free_rcu);
365 else 416 else
366 /* The node is used again. Decrease the reference counter 417 /* The node is used again. Decrease the reference counter
367 * back. The loop "cleanup -> unlink_from_unused 418 * back. The loop "cleanup -> unlink_from_unused
@@ -373,8 +424,18 @@ static void unlink_from_pool(struct inet_peer *p)
373 inet_putpeer(p); 424 inet_putpeer(p);
374} 425}
375 426
427static struct inet_peer_base *family_to_base(int family)
428{
429 return (family == AF_INET ? &v4_peers : &v6_peers);
430}
431
432static struct inet_peer_base *peer_to_base(struct inet_peer *p)
433{
434 return family_to_base(p->daddr.family);
435}
436
376/* May be called with local BH enabled. */ 437/* May be called with local BH enabled. */
377static int cleanup_once(unsigned long ttl) 438static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
378{ 439{
379 struct inet_peer *p = NULL; 440 struct inet_peer *p = NULL;
380 441
@@ -406,79 +467,101 @@ static int cleanup_once(unsigned long ttl)
406 * happen because of entry limits in route cache. */ 467 * happen because of entry limits in route cache. */
407 return -1; 468 return -1;
408 469
409 unlink_from_pool(p); 470 unlink_from_pool(p, peer_to_base(p), stack);
410 return 0; 471 return 0;
411} 472}
412 473
413/* Called with or without local BH being disabled. */ 474/* Called with or without local BH being disabled. */
414struct inet_peer *inet_getpeer(__be32 daddr, int create) 475struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
415{ 476{
477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(daddr->family);
416 struct inet_peer *p; 479 struct inet_peer *p;
417 struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; 480 unsigned int sequence;
481 int invalidated, newrefcnt = 0;
418 482
419 /* Look up for the address quickly, lockless. 483 /* Look up for the address quickly, lockless.
420 * Because of a concurrent writer, we might not find an existing entry. 484 * Because of a concurrent writer, we might not find an existing entry.
421 */ 485 */
422 rcu_read_lock_bh(); 486 rcu_read_lock();
423 p = lookup_rcu_bh(daddr); 487 sequence = read_seqbegin(&base->lock);
424 rcu_read_unlock_bh(); 488 p = lookup_rcu(daddr, base, &newrefcnt);
489 invalidated = read_seqretry(&base->lock, sequence);
490 rcu_read_unlock();
425 491
426 if (p) { 492 if (p) {
427 /* The existing node has been found. 493found: /* The existing node has been found.
428 * Remove the entry from unused list if it was there. 494 * Remove the entry from unused list if it was there.
429 */ 495 */
430 unlink_from_unused(p); 496 if (newrefcnt == 1)
497 unlink_from_unused(p);
431 return p; 498 return p;
432 } 499 }
433 500
501 /* If no writer did a change during our lookup, we can return early. */
502 if (!create && !invalidated)
503 return NULL;
504
434 /* retry an exact lookup, taking the lock before. 505 /* retry an exact lookup, taking the lock before.
435 * At least, nodes should be hot in our cache. 506 * At least, nodes should be hot in our cache.
436 */ 507 */
437 spin_lock_bh(&peers.lock); 508 write_seqlock_bh(&base->lock);
438 p = lookup(daddr, stack); 509 p = lookup(daddr, stack, base);
439 if (p != peer_avl_empty) { 510 if (p != peer_avl_empty) {
440 atomic_inc(&p->refcnt); 511 newrefcnt = atomic_inc_return(&p->refcnt);
441 spin_unlock_bh(&peers.lock); 512 write_sequnlock_bh(&base->lock);
442 /* Remove the entry from unused list if it was there. */ 513 goto found;
443 unlink_from_unused(p);
444 return p;
445 } 514 }
446 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; 515 p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
447 if (p) { 516 if (p) {
448 p->v4daddr = daddr; 517 p->daddr = *daddr;
449 atomic_set(&p->refcnt, 1); 518 atomic_set(&p->refcnt, 1);
450 atomic_set(&p->rid, 0); 519 atomic_set(&p->rid, 0);
451 atomic_set(&p->ip_id_count, secure_ip_id(daddr)); 520 atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
452 p->tcp_ts_stamp = 0; 521 p->tcp_ts_stamp = 0;
522 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
523 p->rate_tokens = 0;
524 p->rate_last = 0;
525 p->pmtu_expires = 0;
526 p->pmtu_orig = 0;
527 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
453 INIT_LIST_HEAD(&p->unused); 528 INIT_LIST_HEAD(&p->unused);
454 529
455 530
456 /* Link the node. */ 531 /* Link the node. */
457 link_to_pool(p); 532 link_to_pool(p, base);
458 peers.total++; 533 base->total++;
459 } 534 }
460 spin_unlock_bh(&peers.lock); 535 write_sequnlock_bh(&base->lock);
461 536
462 if (peers.total >= inet_peer_threshold) 537 if (base->total >= inet_peer_threshold)
463 /* Remove one less-recently-used entry. */ 538 /* Remove one less-recently-used entry. */
464 cleanup_once(0); 539 cleanup_once(0, stack);
465 540
466 return p; 541 return p;
467} 542}
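
[Editor's note] The fast path of inet_getpeer() above reads the base->lock sequence, walks the tree with no lock held, then re-checks the sequence: if a writer ran in between, a miss cannot be trusted and the code falls through to the locked lookup. Below is a stripped-down, standalone sketch of that read-retry pattern using a C11 sequence counter over two fields. The field names and the retry cap are invented for the example, and the memory-ordering details are only an approximation of what the kernel's seqlock primitives guarantee.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* even sequence = stable, odd = writer in progress */
static atomic_uint  seq;
static atomic_ulong peer_addr = 0x0a000001;
static atomic_long  peer_rtt  = 40;

static void writer_update(long new_rtt)
{
    unsigned s = atomic_load_explicit(&seq, memory_order_relaxed);
    atomic_store_explicit(&seq, s + 1, memory_order_relaxed);   /* go odd */
    atomic_thread_fence(memory_order_release);
    atomic_store_explicit(&peer_rtt, new_rtt, memory_order_relaxed);
    atomic_thread_fence(memory_order_release);
    atomic_store_explicit(&seq, s + 2, memory_order_release);   /* go even */
}

/* Optimistic read: returns false if a writer interfered every time. */
static bool reader_snapshot(unsigned long *addr, long *rtt)
{
    for (int tries = 0; tries < 100; tries++) {
        unsigned s1 = atomic_load_explicit(&seq, memory_order_acquire);
        if (s1 & 1)
            continue;                           /* writer active, retry */
        *addr = atomic_load_explicit(&peer_addr, memory_order_relaxed);
        *rtt  = atomic_load_explicit(&peer_rtt,  memory_order_relaxed);
        atomic_thread_fence(memory_order_acquire);
        unsigned s2 = atomic_load_explicit(&seq, memory_order_relaxed);
        if (s1 == s2)
            return true;        /* nothing changed under us, result is usable */
    }
    return false;               /* give up and take the slow, locked path */
}

int main(void)
{
    unsigned long addr;
    long rtt;

    writer_update(55);
    if (reader_snapshot(&addr, &rtt))
        printf("addr=%#lx rtt=%ld\n", addr, rtt);
    return 0;
}
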
468 543
544static int compute_total(void)
545{
546 return v4_peers.total + v6_peers.total;
547}
548EXPORT_SYMBOL_GPL(inet_getpeer);
549
469/* Called with local BH disabled. */ 550/* Called with local BH disabled. */
470static void peer_check_expire(unsigned long dummy) 551static void peer_check_expire(unsigned long dummy)
471{ 552{
472 unsigned long now = jiffies; 553 unsigned long now = jiffies;
473 int ttl; 554 int ttl, total;
555 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
474 556
475 if (peers.total >= inet_peer_threshold) 557 total = compute_total();
558 if (total >= inet_peer_threshold)
476 ttl = inet_peer_minttl; 559 ttl = inet_peer_minttl;
477 else 560 else
478 ttl = inet_peer_maxttl 561 ttl = inet_peer_maxttl
479 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 562 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
480 peers.total / inet_peer_threshold * HZ; 563 total / inet_peer_threshold * HZ;
481 while (!cleanup_once(ttl)) { 564 while (!cleanup_once(ttl, stack)) {
482 if (jiffies != now) 565 if (jiffies != now)
483 break; 566 break;
484 } 567 }
@@ -486,13 +569,14 @@ static void peer_check_expire(unsigned long dummy)
486 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime 569 /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
487 * interval depending on the total number of entries (more entries, 570 * interval depending on the total number of entries (more entries,
488 * less interval). */ 571 * less interval). */
489 if (peers.total >= inet_peer_threshold) 572 total = compute_total();
573 if (total >= inet_peer_threshold)
490 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime; 574 peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
491 else 575 else
492 peer_periodic_timer.expires = jiffies 576 peer_periodic_timer.expires = jiffies
493 + inet_peer_gc_maxtime 577 + inet_peer_gc_maxtime
494 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * 578 - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
495 peers.total / inet_peer_threshold * HZ; 579 total / inet_peer_threshold * HZ;
496 add_timer(&peer_periodic_timer); 580 add_timer(&peer_periodic_timer);
497} 581}
498 582
@@ -508,3 +592,45 @@ void inet_putpeer(struct inet_peer *p)
508 592
509 local_bh_enable(); 593 local_bh_enable();
510} 594}
595EXPORT_SYMBOL_GPL(inet_putpeer);
596
597/*
598 * Check transmit rate limitation for given message.
599 * The rate information is held in the inet_peer entries now.
600 * This function is generic and could be used for other purposes
601 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
602 *
603 * Note that the same inet_peer fields are modified by functions in
604 * route.c too, but these work for packet destinations while xrlim_allow
605 * works for icmp destinations. This means the rate limiting information
606 * for one "ip object" is shared - and these ICMPs are twice limited:
607 * by source and by destination.
608 *
609 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
610 * SHOULD allow setting of rate limits
611 *
612 * Shared between ICMPv4 and ICMPv6.
613 */
614#define XRLIM_BURST_FACTOR 6
615bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
616{
617 unsigned long now, token;
618 bool rc = false;
619
620 if (!peer)
621 return true;
622
623 token = peer->rate_tokens;
624 now = jiffies;
625 token += now - peer->rate_last;
626 peer->rate_last = now;
627 if (token > XRLIM_BURST_FACTOR * timeout)
628 token = XRLIM_BURST_FACTOR * timeout;
629 if (token >= timeout) {
630 token -= timeout;
631 rc = true;
632 }
633 peer->rate_tokens = token;
634 return rc;
635}
636EXPORT_SYMBOL(inet_peer_xrlim_allow);
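
[Editor's note] The comment block above describes the token-bucket policy in prose. A small self-contained C rendition of the same arithmetic may make the refill/burst behaviour easier to see; plain integer "seconds" stand in for jiffies and the main() driver is made up for the example, but the core function mirrors inet_peer_xrlim_allow() line for line.

#include <stdbool.h>
#include <stdio.h>

#define XRLIM_BURST_FACTOR 6

struct rate_state {
    unsigned long tokens;   /* accumulated credit, same unit as timeout */
    unsigned long last;     /* time of the previous check */
};

/* Credit grows with elapsed time, is capped at XRLIM_BURST_FACTOR * timeout,
 * and each permitted message costs `timeout` worth of credit. */
static bool xrlim_allow(struct rate_state *st, unsigned long now,
                        unsigned long timeout)
{
    unsigned long token = st->tokens + (now - st->last);
    bool ok = false;

    st->last = now;
    if (token > XRLIM_BURST_FACTOR * timeout)
        token = XRLIM_BURST_FACTOR * timeout;
    if (token >= timeout) {
        token -= timeout;
        ok = true;
    }
    st->tokens = token;
    return ok;
}

int main(void)
{
    /* Start empty, as the kernel does (rate_tokens = 0): with timeout 3,
     * roughly one message in three gets through; an idle period of up to
     * 6 * timeout builds credit for a burst. */
    struct rate_state st = { .tokens = 0, .last = 0 };

    for (unsigned long now = 0; now < 12; now++)
        printf("t=%lu -> %s\n", now, xrlim_allow(&st, now, 3) ? "send" : "drop");
    return 0;
}
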
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 99461f09320f..3b34d1c86270 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb)
84 84
85 rt = skb_rtable(skb); 85 rt = skb_rtable(skb);
86 86
87 if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 87 if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway)
88 goto sr_failed; 88 goto sr_failed;
89 89
90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && 90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b7c41654dde5..0ad6035f6366 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -45,6 +45,7 @@
45#include <linux/udp.h> 45#include <linux/udp.h>
46#include <linux/inet.h> 46#include <linux/inet.h>
47#include <linux/netfilter_ipv4.h> 47#include <linux/netfilter_ipv4.h>
48#include <net/inet_ecn.h>
48 49
49/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 50/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
50 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c 51 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -70,11 +71,46 @@ struct ipq {
70 __be32 daddr; 71 __be32 daddr;
71 __be16 id; 72 __be16 id;
72 u8 protocol; 73 u8 protocol;
74 u8 ecn; /* RFC3168 support */
73 int iif; 75 int iif;
74 unsigned int rid; 76 unsigned int rid;
75 struct inet_peer *peer; 77 struct inet_peer *peer;
76}; 78};
77 79
80/* RFC 3168 support :
 81 * We want to check ECN values of all fragments, to detect invalid combinations. 
82 * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
83 */
84#define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */
85#define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */
86#define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */
87#define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */
88
89static inline u8 ip4_frag_ecn(u8 tos)
90{
91 return 1 << (tos & INET_ECN_MASK);
92}
93
94/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
95 * Value : 0xff if frame should be dropped.
96 * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
97 */
98static const u8 ip4_frag_ecn_table[16] = {
99 /* at least one fragment had CE, and others ECT_0 or ECT_1 */
100 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
101 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
102 [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
103
104 /* invalid combinations : drop frame */
105 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
106 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
107 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
108 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
109 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
110 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
111 [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
112};
113
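
[Editor's note] To see how the table above is meant to be used: each fragment contributes one bit via ip4_frag_ecn(), the bits are OR-ed into qp->ecn as fragments arrive, and the final value indexes the table to yield either a tos adjustment or the 0xff "drop" marker. A small standalone illustration of that lookup follows; the INET_ECN_* numeric values are assumed to match include/net/inet_ecn.h (they are the standard RFC 3168 codepoints), and the driver in main() is invented for the example.

#include <stdint.h>
#include <stdio.h>

/* ECN codepoints in the low two bits of the IPv4 tos field. */
#define INET_ECN_NOT_ECT 0
#define INET_ECN_ECT_1   1
#define INET_ECN_ECT_0   2
#define INET_ECN_CE      3
#define INET_ECN_MASK    3

#define IPFRAG_ECN_NOT_ECT 0x01
#define IPFRAG_ECN_ECT_1   0x02
#define IPFRAG_ECN_ECT_0   0x04
#define IPFRAG_ECN_CE      0x08

/* One bit per codepoint, exactly as ip4_frag_ecn() does. */
static uint8_t frag_ecn_bit(uint8_t tos)
{
    return 1 << (tos & INET_ECN_MASK);
}

/* Same shape as ip4_frag_ecn_table: 0 = leave tos alone, INET_ECN_CE = mark
 * the reassembled packet CE, 0xff = invalid mix, drop the datagram. */
static const uint8_t ecn_table[16] = {
    [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]                          = INET_ECN_CE,
    [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]                          = INET_ECN_CE,
    [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]       = INET_ECN_CE,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE]                        = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0]                     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1]                     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1]  = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0]     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1]     = 0xff,
    [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};

int main(void)
{
    /* First fragment arrived ECT(0), a later one was marked CE in transit. */
    uint8_t acc = frag_ecn_bit(INET_ECN_ECT_0) | frag_ecn_bit(INET_ECN_CE);
    printf("ECT_0 + CE   -> table[%#x] = %#x (propagate CE)\n", acc, ecn_table[acc]);

    /* Mixing Not-ECT with an ECT fragment is not a valid RFC 3168 combination. */
    acc = frag_ecn_bit(INET_ECN_NOT_ECT) | frag_ecn_bit(INET_ECN_ECT_1);
    printf("NotECT+ECT_1 -> table[%#x] = %#x (0xff means drop)\n", acc, ecn_table[acc]);
    return 0;
}
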
78static struct inet_frags ip4_frags; 114static struct inet_frags ip4_frags;
79 115
80int ip_frag_nqueues(struct net *net) 116int ip_frag_nqueues(struct net *net)
@@ -116,11 +152,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
116 struct ip4_create_arg *arg = a; 152 struct ip4_create_arg *arg = a;
117 153
118 qp = container_of(q, struct ipq, q); 154 qp = container_of(q, struct ipq, q);
119 return (qp->id == arg->iph->id && 155 return qp->id == arg->iph->id &&
120 qp->saddr == arg->iph->saddr && 156 qp->saddr == arg->iph->saddr &&
121 qp->daddr == arg->iph->daddr && 157 qp->daddr == arg->iph->daddr &&
122 qp->protocol == arg->iph->protocol && 158 qp->protocol == arg->iph->protocol &&
123 qp->user == arg->user); 159 qp->user == arg->user;
124} 160}
125 161
126/* Memory Tracking Functions. */ 162/* Memory Tracking Functions. */
@@ -137,11 +173,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
137 173
138 qp->protocol = arg->iph->protocol; 174 qp->protocol = arg->iph->protocol;
139 qp->id = arg->iph->id; 175 qp->id = arg->iph->id;
176 qp->ecn = ip4_frag_ecn(arg->iph->tos);
140 qp->saddr = arg->iph->saddr; 177 qp->saddr = arg->iph->saddr;
141 qp->daddr = arg->iph->daddr; 178 qp->daddr = arg->iph->daddr;
142 qp->user = arg->user; 179 qp->user = arg->user;
143 qp->peer = sysctl_ipfrag_max_dist ? 180 qp->peer = sysctl_ipfrag_max_dist ?
144 inet_getpeer(arg->iph->saddr, 1) : NULL; 181 inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
145} 182}
146 183
147static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 184static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -204,31 +241,30 @@ static void ip_expire(unsigned long arg)
204 241
205 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 242 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
206 struct sk_buff *head = qp->q.fragments; 243 struct sk_buff *head = qp->q.fragments;
244 const struct iphdr *iph;
245 int err;
207 246
208 rcu_read_lock(); 247 rcu_read_lock();
209 head->dev = dev_get_by_index_rcu(net, qp->iif); 248 head->dev = dev_get_by_index_rcu(net, qp->iif);
210 if (!head->dev) 249 if (!head->dev)
211 goto out_rcu_unlock; 250 goto out_rcu_unlock;
212 251
252 /* skb dst is stale, drop it, and perform route lookup again */
253 skb_dst_drop(head);
254 iph = ip_hdr(head);
255 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
256 iph->tos, head->dev);
257 if (err)
258 goto out_rcu_unlock;
259
213 /* 260 /*
214 * Only search router table for the head fragment, 261 * Only an end host needs to send an ICMP
215 * when defraging timeout at PRE_ROUTING HOOK. 262 * "Fragment Reassembly Timeout" message, per RFC792.
216 */ 263 */
217 if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { 264 if (qp->user == IP_DEFRAG_CONNTRACK_IN &&
218 const struct iphdr *iph = ip_hdr(head); 265 skb_rtable(head)->rt_type != RTN_LOCAL)
219 int err = ip_route_input(head, iph->daddr, iph->saddr, 266 goto out_rcu_unlock;
220 iph->tos, head->dev);
221 if (unlikely(err))
222 goto out_rcu_unlock;
223
224 /*
225 * Only an end host needs to send an ICMP
226 * "Fragment Reassembly Timeout" message, per RFC792.
227 */
228 if (skb_rtable(head)->rt_type != RTN_LOCAL)
229 goto out_rcu_unlock;
230 267
231 }
232 268
233 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 269 /* Send an ICMP "Fragment Reassembly Timeout" message. */
234 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 270 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
@@ -316,6 +352,7 @@ static int ip_frag_reinit(struct ipq *qp)
316 qp->q.fragments = NULL; 352 qp->q.fragments = NULL;
317 qp->q.fragments_tail = NULL; 353 qp->q.fragments_tail = NULL;
318 qp->iif = 0; 354 qp->iif = 0;
355 qp->ecn = 0;
319 356
320 return 0; 357 return 0;
321} 358}
@@ -328,6 +365,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
328 int flags, offset; 365 int flags, offset;
329 int ihl, end; 366 int ihl, end;
330 int err = -ENOENT; 367 int err = -ENOENT;
368 u8 ecn;
331 369
332 if (qp->q.last_in & INET_FRAG_COMPLETE) 370 if (qp->q.last_in & INET_FRAG_COMPLETE)
333 goto err; 371 goto err;
@@ -339,6 +377,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
339 goto err; 377 goto err;
340 } 378 }
341 379
380 ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
342 offset = ntohs(ip_hdr(skb)->frag_off); 381 offset = ntohs(ip_hdr(skb)->frag_off);
343 flags = offset & ~IP_OFFSET; 382 flags = offset & ~IP_OFFSET;
344 offset &= IP_OFFSET; 383 offset &= IP_OFFSET;
@@ -472,6 +511,7 @@ found:
472 } 511 }
473 qp->q.stamp = skb->tstamp; 512 qp->q.stamp = skb->tstamp;
474 qp->q.meat += skb->len; 513 qp->q.meat += skb->len;
514 qp->ecn |= ecn;
475 atomic_add(skb->truesize, &qp->q.net->mem); 515 atomic_add(skb->truesize, &qp->q.net->mem);
476 if (offset == 0) 516 if (offset == 0)
477 qp->q.last_in |= INET_FRAG_FIRST_IN; 517 qp->q.last_in |= INET_FRAG_FIRST_IN;
@@ -502,9 +542,15 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
502 int len; 542 int len;
503 int ihlen; 543 int ihlen;
504 int err; 544 int err;
545 u8 ecn;
505 546
506 ipq_kill(qp); 547 ipq_kill(qp);
507 548
549 ecn = ip4_frag_ecn_table[qp->ecn];
550 if (unlikely(ecn == 0xff)) {
551 err = -EINVAL;
552 goto out_fail;
553 }
508 /* Make the one we just received the head. */ 554 /* Make the one we just received the head. */
509 if (prev) { 555 if (prev) {
510 head = prev->next; 556 head = prev->next;
@@ -542,7 +588,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
542 /* If the first fragment is fragmented itself, we split 588 /* If the first fragment is fragmented itself, we split
543 * it to two chunks: the first with data and paged part 589 * it to two chunks: the first with data and paged part
544 * and the second, holding only fragments. */ 590 * and the second, holding only fragments. */
545 if (skb_has_frags(head)) { 591 if (skb_has_frag_list(head)) {
546 struct sk_buff *clone; 592 struct sk_buff *clone;
547 int i, plen = 0; 593 int i, plen = 0;
548 594
@@ -583,6 +629,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
583 iph = ip_hdr(head); 629 iph = ip_hdr(head);
584 iph->frag_off = 0; 630 iph->frag_off = 0;
585 iph->tot_len = htons(len); 631 iph->tot_len = htons(len);
632 iph->tos |= ecn;
586 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 633 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
587 qp->q.fragments = NULL; 634 qp->q.fragments = NULL;
588 qp->q.fragments_tail = NULL; 635 qp->q.fragments_tail = NULL;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 35c93e8b6a46..8871067560db 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,6 +44,7 @@
44#include <net/net_namespace.h> 44#include <net/net_namespace.h>
45#include <net/netns/generic.h> 45#include <net/netns/generic.h>
46#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
47#include <net/gre.h>
47 48
48#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49#include <net/ipv6.h> 50#include <net/ipv6.h>
@@ -63,13 +64,13 @@
63 We cannot track such dead loops during route installation, 64 We cannot track such dead loops during route installation,
64 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
65 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
66 and silently drop packet when it expires. It is the best 67 and silently drop packet when it expires. It is a good
67 solution, but it supposes maintaing new variable in ALL 68 solution, but it supposes maintaing new variable in ALL
68 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
69 70
70 Current solution: HARD_TX_LOCK lock breaks dead loops. 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
71 72 counter, since when we enter the first ndo_xmit(), cpu migration is
72 73 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
73 74
74 2. Networking dead loops would not kill routers, but would really 75 2. Networking dead loops would not kill routers, but would really
75 kill network. IP hop limit plays role of "t->recursion" in this case, 76 kill network. IP hop limit plays role of "t->recursion" in this case,
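
[Editor's note] The rewritten comment refers to the xmit_recursion per-cpu counter introduced elsewhere in this series (in net/core/dev.c, where RECURSION_LIMIT is defined); the details below are an assumption based on that comment, not part of this file. A rough single-threaded sketch of such a guard, with a thread-local counter standing in for the per-cpu variable and a toy transmit path that deliberately loops back into itself:

#include <stdio.h>

#define RECURSION_LIMIT 10   /* a small bound of the same order as the kernel's */

/* Thread-local here; the kernel can use a per-cpu counter because migration
 * is not possible while a packet is being transmitted. */
static _Thread_local int xmit_recursion;

static int dev_xmit(int depth_hint);

/* A toy "tunnel" that re-enters the transmit path, as a misconfigured
 * gre-over-gre loop would. */
static int tunnel_xmit(int depth_hint)
{
    return dev_xmit(depth_hint + 1);
}

static int dev_xmit(int depth_hint)
{
    if (xmit_recursion > RECURSION_LIMIT) {
        fprintf(stderr, "dead loop detected at depth %d, dropping packet\n",
                depth_hint);
        return -1;
    }
    xmit_recursion++;
    int ret = tunnel_xmit(depth_hint);   /* would normally hand off to a device */
    xmit_recursion--;
    return ret;
}

int main(void)
{
    return dev_xmit(0) == -1 ? 0 : 1;
}
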
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
128 129
129static int ipgre_net_id __read_mostly; 130static int ipgre_net_id __read_mostly;
130struct ipgre_net { 131struct ipgre_net {
131 struct ip_tunnel *tunnels[4][HASH_SIZE]; 132 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
132 133
133 struct net_device *fb_tunnel_dev; 134 struct net_device *fb_tunnel_dev;
134}; 135};
@@ -158,13 +159,40 @@ struct ipgre_net {
158#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
159#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
160/* 161/*
161 * Locking : hash tables are protected by RCU and a spinlock 162 * Locking : hash tables are protected by RCU and RTNL
162 */ 163 */
163static DEFINE_SPINLOCK(ipgre_lock);
164 164
165#define for_each_ip_tunnel_rcu(start) \ 165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 167
168/* often modified stats are per cpu, other are shared (netdev->stats) */
169struct pcpu_tstats {
170 unsigned long rx_packets;
171 unsigned long rx_bytes;
172 unsigned long tx_packets;
173 unsigned long tx_bytes;
174};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177{
178 struct pcpu_tstats sum = { 0 };
179 int i;
180
181 for_each_possible_cpu(i) {
182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184 sum.rx_packets += tstats->rx_packets;
185 sum.rx_bytes += tstats->rx_bytes;
186 sum.tx_packets += tstats->tx_packets;
187 sum.tx_bytes += tstats->tx_bytes;
188 }
189 dev->stats.rx_packets = sum.rx_packets;
190 dev->stats.rx_bytes = sum.rx_bytes;
191 dev->stats.tx_packets = sum.tx_packets;
192 dev->stats.tx_bytes = sum.tx_bytes;
193 return &dev->stats;
194}
195
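
[Editor's note] ipgre_get_stats() above is the read side of the new scheme: the hot path bumps cheap per-cpu counters without any lock, and the rare stats read folds them together. A condensed userspace sketch of the same split follows, with one counter slot per worker thread instead of per CPU; the thread count and field names are arbitrary.

#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

struct tstats { unsigned long rx_packets; unsigned long rx_bytes; };

/* One private slot per worker: the fast path never shares a lock. */
static struct tstats per_worker[NWORKERS];

static void *rx_worker(void *arg)
{
    struct tstats *my = &per_worker[(long)arg];

    for (int i = 0; i < 100000; i++) {      /* pretend each loop is one packet */
        my->rx_packets++;
        my->rx_bytes += 1500;
    }
    return NULL;
}

/* The equivalent of ipgre_get_stats(): sum the per-worker slots. */
static struct tstats sum_stats(void)
{
    struct tstats sum = { 0, 0 };

    for (int i = 0; i < NWORKERS; i++) {
        sum.rx_packets += per_worker[i].rx_packets;
        sum.rx_bytes   += per_worker[i].rx_bytes;
    }
    return sum;
}

int main(void)
{
    pthread_t tid[NWORKERS];

    for (long i = 0; i < NWORKERS; i++)
        pthread_create(&tid[i], NULL, rx_worker, (void *)i);
    for (int i = 0; i < NWORKERS; i++)
        pthread_join(tid[i], NULL);

    struct tstats total = sum_stats();
    printf("rx_packets=%lu rx_bytes=%lu\n", total.rx_packets, total.rx_bytes);
    return 0;
}
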
168/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
169 197
170static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
173{ 201{
174 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
175 int link = dev->ifindex; 203 int link = dev->ifindex;
176 unsigned h0 = HASH(remote); 204 unsigned int h0 = HASH(remote);
177 unsigned h1 = HASH(key); 205 unsigned int h1 = HASH(key);
178 struct ip_tunnel *t, *cand = NULL; 206 struct ip_tunnel *t, *cand = NULL;
179 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 207 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ? 208 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
289 return NULL; 317 return NULL;
290} 318}
291 319
292static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, 320static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
293 struct ip_tunnel_parm *parms) 321 struct ip_tunnel_parm *parms)
294{ 322{
295 __be32 remote = parms->iph.daddr; 323 __be32 remote = parms->iph.daddr;
296 __be32 local = parms->iph.saddr; 324 __be32 local = parms->iph.saddr;
297 __be32 key = parms->i_key; 325 __be32 key = parms->i_key;
298 unsigned h = HASH(key); 326 unsigned int h = HASH(key);
299 int prio = 0; 327 int prio = 0;
300 328
301 if (local) 329 if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
308 return &ign->tunnels[prio][h]; 336 return &ign->tunnels[prio][h];
309} 337}
310 338
311static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, 339static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
312 struct ip_tunnel *t) 340 struct ip_tunnel *t)
313{ 341{
314 return __ipgre_bucket(ign, &t->parms); 342 return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
316 344
317static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) 345static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318{ 346{
319 struct ip_tunnel **tp = ipgre_bucket(ign, t); 347 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
320 348
321 spin_lock_bh(&ipgre_lock); 349 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
322 t->next = *tp;
323 rcu_assign_pointer(*tp, t); 350 rcu_assign_pointer(*tp, t);
324 spin_unlock_bh(&ipgre_lock);
325} 351}
326 352
327static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) 353static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328{ 354{
329 struct ip_tunnel **tp; 355 struct ip_tunnel __rcu **tp;
330 356 struct ip_tunnel *iter;
331 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { 357
332 if (t == *tp) { 358 for (tp = ipgre_bucket(ign, t);
333 spin_lock_bh(&ipgre_lock); 359 (iter = rtnl_dereference(*tp)) != NULL;
334 *tp = t->next; 360 tp = &iter->next) {
335 spin_unlock_bh(&ipgre_lock); 361 if (t == iter) {
362 rcu_assign_pointer(*tp, t->next);
336 break; 363 break;
337 } 364 }
338 } 365 }
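
[Editor's note] The link/unlink rewrite above drops the spinlock: updates are serialized by RTNL, rcu_assign_pointer() publishes fully-initialized nodes, and readers walk the chain under rcu_read_lock(). The heart of that is safe pointer publication. Below is a stripped-down C11 sketch of just that part (release store on publish, acquire load on traversal); it deliberately never frees nodes, because the grace-period half of RCU is what call_rcu()/synchronize_net() provide and is not reproduced here, and all names are illustrative.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct tunnel {
    unsigned long key;
    struct tunnel *_Atomic next;
};

static struct tunnel *_Atomic bucket;   /* head of one hash chain */

/* Writer side (single writer, like code running under RTNL): fill in the
 * node first, then publish it with a release store so readers that see the
 * pointer also see the initialized contents. */
static void tunnel_link(unsigned long key)
{
    struct tunnel *t = calloc(1, sizeof(*t));

    t->key = key;
    atomic_store_explicit(&t->next,
                          atomic_load_explicit(&bucket, memory_order_relaxed),
                          memory_order_relaxed);
    atomic_store_explicit(&bucket, t, memory_order_release);
}

/* Reader side: acquire loads pair with the release store above, a rough
 * analogue of rcu_dereference() inside rcu_read_lock(). */
static struct tunnel *tunnel_lookup(unsigned long key)
{
    struct tunnel *t = atomic_load_explicit(&bucket, memory_order_acquire);

    while (t) {
        if (t->key == key)
            return t;
        t = atomic_load_explicit(&t->next, memory_order_acquire);
    }
    return NULL;
}

int main(void)
{
    tunnel_link(1);
    tunnel_link(2);
    printf("lookup(1) %s\n", tunnel_lookup(1) ? "found" : "missing");
    printf("lookup(3) %s\n", tunnel_lookup(3) ? "found" : "missing");
    return 0;
}
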
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
346 __be32 local = parms->iph.saddr; 373 __be32 local = parms->iph.saddr;
347 __be32 key = parms->i_key; 374 __be32 key = parms->i_key;
348 int link = parms->link; 375 int link = parms->link;
349 struct ip_tunnel *t, **tp; 376 struct ip_tunnel *t;
377 struct ip_tunnel __rcu **tp;
350 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 378 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351 379
352 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) 380 for (tp = __ipgre_bucket(ign, parms);
381 (t = rtnl_dereference(*tp)) != NULL;
382 tp = &t->next)
353 if (local == t->parms.iph.saddr && 383 if (local == t->parms.iph.saddr &&
354 remote == t->parms.iph.daddr && 384 remote == t->parms.iph.daddr &&
355 key == t->parms.i_key && 385 key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
360 return t; 390 return t;
361} 391}
362 392
363static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 393static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
364 struct ip_tunnel_parm *parms, int create) 394 struct ip_tunnel_parm *parms, int create)
365{ 395{
366 struct ip_tunnel *t, *nt; 396 struct ip_tunnel *t, *nt;
@@ -375,19 +405,14 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
375 if (parms->name[0]) 405 if (parms->name[0])
376 strlcpy(name, parms->name, IFNAMSIZ); 406 strlcpy(name, parms->name, IFNAMSIZ);
377 else 407 else
378 sprintf(name, "gre%%d"); 408 strcpy(name, "gre%d");
379 409
380 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); 410 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
381 if (!dev) 411 if (!dev)
382 return NULL; 412 return NULL;
383 413
384 dev_net_set(dev, net); 414 dev_net_set(dev, net);
385 415
386 if (strchr(name, '%')) {
387 if (dev_alloc_name(dev, name) < 0)
388 goto failed_free;
389 }
390
391 nt = netdev_priv(dev); 416 nt = netdev_priv(dev);
392 nt->parms = *parms; 417 nt->parms = *parms;
393 dev->rtnl_link_ops = &ipgre_link_ops; 418 dev->rtnl_link_ops = &ipgre_link_ops;
@@ -432,7 +457,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
432 by themself??? 457 by themself???
433 */ 458 */
434 459
435 struct iphdr *iph = (struct iphdr *)skb->data; 460 const struct iphdr *iph = (const struct iphdr *)skb->data;
436 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2)); 461 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
437 int grehlen = (iph->ihl<<2) + 4; 462 int grehlen = (iph->ihl<<2) + 4;
438 const int type = icmp_hdr(skb)->type; 463 const int type = icmp_hdr(skb)->type;
@@ -504,7 +529,7 @@ out:
504 rcu_read_unlock(); 529 rcu_read_unlock();
505} 530}
506 531
507static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) 532static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
508{ 533{
509 if (INET_ECN_is_ce(iph->tos)) { 534 if (INET_ECN_is_ce(iph->tos)) {
510 if (skb->protocol == htons(ETH_P_IP)) { 535 if (skb->protocol == htons(ETH_P_IP)) {
@@ -516,19 +541,19 @@ static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
516} 541}
517 542
518static inline u8 543static inline u8
519ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) 544ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
520{ 545{
521 u8 inner = 0; 546 u8 inner = 0;
522 if (skb->protocol == htons(ETH_P_IP)) 547 if (skb->protocol == htons(ETH_P_IP))
523 inner = old_iph->tos; 548 inner = old_iph->tos;
524 else if (skb->protocol == htons(ETH_P_IPV6)) 549 else if (skb->protocol == htons(ETH_P_IPV6))
525 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 550 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
526 return INET_ECN_encapsulate(tos, inner); 551 return INET_ECN_encapsulate(tos, inner);
527} 552}
528 553
529static int ipgre_rcv(struct sk_buff *skb) 554static int ipgre_rcv(struct sk_buff *skb)
530{ 555{
531 struct iphdr *iph; 556 const struct iphdr *iph;
532 u8 *h; 557 u8 *h;
533 __be16 flags; 558 __be16 flags;
534 __sum16 csum = 0; 559 __sum16 csum = 0;
@@ -582,7 +607,7 @@ static int ipgre_rcv(struct sk_buff *skb)
582 if ((tunnel = ipgre_tunnel_lookup(skb->dev, 607 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
583 iph->saddr, iph->daddr, key, 608 iph->saddr, iph->daddr, key,
584 gre_proto))) { 609 gre_proto))) {
585 struct net_device_stats *stats = &tunnel->dev->stats; 610 struct pcpu_tstats *tstats;
586 611
587 secpath_reset(skb); 612 secpath_reset(skb);
588 613
@@ -604,24 +629,24 @@ static int ipgre_rcv(struct sk_buff *skb)
604#ifdef CONFIG_NET_IPGRE_BROADCAST 629#ifdef CONFIG_NET_IPGRE_BROADCAST
605 if (ipv4_is_multicast(iph->daddr)) { 630 if (ipv4_is_multicast(iph->daddr)) {
606 /* Looped back packet, drop it! */ 631 /* Looped back packet, drop it! */
607 if (skb_rtable(skb)->fl.iif == 0) 632 if (rt_is_output_route(skb_rtable(skb)))
608 goto drop; 633 goto drop;
609 stats->multicast++; 634 tunnel->dev->stats.multicast++;
610 skb->pkt_type = PACKET_BROADCAST; 635 skb->pkt_type = PACKET_BROADCAST;
611 } 636 }
612#endif 637#endif
613 638
614 if (((flags&GRE_CSUM) && csum) || 639 if (((flags&GRE_CSUM) && csum) ||
615 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { 640 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
616 stats->rx_crc_errors++; 641 tunnel->dev->stats.rx_crc_errors++;
617 stats->rx_errors++; 642 tunnel->dev->stats.rx_errors++;
618 goto drop; 643 goto drop;
619 } 644 }
620 if (tunnel->parms.i_flags&GRE_SEQ) { 645 if (tunnel->parms.i_flags&GRE_SEQ) {
621 if (!(flags&GRE_SEQ) || 646 if (!(flags&GRE_SEQ) ||
622 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { 647 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
623 stats->rx_fifo_errors++; 648 tunnel->dev->stats.rx_fifo_errors++;
624 stats->rx_errors++; 649 tunnel->dev->stats.rx_errors++;
625 goto drop; 650 goto drop;
626 } 651 }
627 tunnel->i_seqno = seqno + 1; 652 tunnel->i_seqno = seqno + 1;
@@ -630,8 +655,8 @@ static int ipgre_rcv(struct sk_buff *skb)
630 /* Warning: All skb pointers will be invalidated! */ 655 /* Warning: All skb pointers will be invalidated! */
631 if (tunnel->dev->type == ARPHRD_ETHER) { 656 if (tunnel->dev->type == ARPHRD_ETHER) {
632 if (!pskb_may_pull(skb, ETH_HLEN)) { 657 if (!pskb_may_pull(skb, ETH_HLEN)) {
633 stats->rx_length_errors++; 658 tunnel->dev->stats.rx_length_errors++;
634 stats->rx_errors++; 659 tunnel->dev->stats.rx_errors++;
635 goto drop; 660 goto drop;
636 } 661 }
637 662
@@ -640,14 +665,19 @@ static int ipgre_rcv(struct sk_buff *skb)
640 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 665 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641 } 666 }
642 667
643 skb_tunnel_rx(skb, tunnel->dev); 668 tstats = this_cpu_ptr(tunnel->dev->tstats);
669 tstats->rx_packets++;
670 tstats->rx_bytes += skb->len;
671
672 __skb_tunnel_rx(skb, tunnel->dev);
644 673
645 skb_reset_network_header(skb); 674 skb_reset_network_header(skb);
646 ipgre_ecn_decapsulate(iph, skb); 675 ipgre_ecn_decapsulate(iph, skb);
647 676
648 netif_rx(skb); 677 netif_rx(skb);
678
649 rcu_read_unlock(); 679 rcu_read_unlock();
650 return(0); 680 return 0;
651 } 681 }
652 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 682 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
653 683
@@ -655,20 +685,20 @@ drop:
655 rcu_read_unlock(); 685 rcu_read_unlock();
656drop_nolock: 686drop_nolock:
657 kfree_skb(skb); 687 kfree_skb(skb);
658 return(0); 688 return 0;
659} 689}
660 690
661static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 691static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
662{ 692{
663 struct ip_tunnel *tunnel = netdev_priv(dev); 693 struct ip_tunnel *tunnel = netdev_priv(dev);
664 struct net_device_stats *stats = &dev->stats; 694 struct pcpu_tstats *tstats;
665 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); 695 const struct iphdr *old_iph = ip_hdr(skb);
666 struct iphdr *old_iph = ip_hdr(skb); 696 const struct iphdr *tiph;
667 struct iphdr *tiph; 697 struct flowi4 fl4;
668 u8 tos; 698 u8 tos;
669 __be16 df; 699 __be16 df;
670 struct rtable *rt; /* Route to the other host */ 700 struct rtable *rt; /* Route to the other host */
671 struct net_device *tdev; /* Device to other host */ 701 struct net_device *tdev; /* Device to other host */
672 struct iphdr *iph; /* Our new IP header */ 702 struct iphdr *iph; /* Our new IP header */
673 unsigned int max_headroom; /* The extra header space needed */ 703 unsigned int max_headroom; /* The extra header space needed */
674 int gre_hlen; 704 int gre_hlen;
@@ -680,7 +710,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
680 710
681 if (dev->header_ops && dev->type == ARPHRD_IPGRE) { 711 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
682 gre_hlen = 0; 712 gre_hlen = 0;
683 tiph = (struct iphdr *)skb->data; 713 tiph = (const struct iphdr *)skb->data;
684 } else { 714 } else {
685 gre_hlen = tunnel->hlen; 715 gre_hlen = tunnel->hlen;
686 tiph = &tunnel->parms.iph; 716 tiph = &tunnel->parms.iph;
@@ -690,7 +720,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
690 /* NBMA tunnel */ 720 /* NBMA tunnel */
691 721
692 if (skb_dst(skb) == NULL) { 722 if (skb_dst(skb) == NULL) {
693 stats->tx_fifo_errors++; 723 dev->stats.tx_fifo_errors++;
694 goto tx_error; 724 goto tx_error;
695 } 725 }
696 726
@@ -701,14 +731,14 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
701 } 731 }
702#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 732#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
703 else if (skb->protocol == htons(ETH_P_IPV6)) { 733 else if (skb->protocol == htons(ETH_P_IPV6)) {
704 struct in6_addr *addr6; 734 const struct in6_addr *addr6;
705 int addr_type; 735 int addr_type;
706 struct neighbour *neigh = skb_dst(skb)->neighbour; 736 struct neighbour *neigh = skb_dst(skb)->neighbour;
707 737
708 if (neigh == NULL) 738 if (neigh == NULL)
709 goto tx_error; 739 goto tx_error;
710 740
711 addr6 = (struct in6_addr *)&neigh->primary_key; 741 addr6 = (const struct in6_addr *)&neigh->primary_key;
712 addr_type = ipv6_addr_type(addr6); 742 addr_type = ipv6_addr_type(addr6);
713 743
714 if (addr_type == IPV6_ADDR_ANY) { 744 if (addr_type == IPV6_ADDR_ANY) {
@@ -732,26 +762,21 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
732 if (skb->protocol == htons(ETH_P_IP)) 762 if (skb->protocol == htons(ETH_P_IP))
733 tos = old_iph->tos; 763 tos = old_iph->tos;
734 else if (skb->protocol == htons(ETH_P_IPV6)) 764 else if (skb->protocol == htons(ETH_P_IPV6))
735 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 765 tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
736 } 766 }
737 767
738 { 768 rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
739 struct flowi fl = { .oif = tunnel->parms.link, 769 tunnel->parms.o_key, RT_TOS(tos),
740 .nl_u = { .ip4_u = 770 tunnel->parms.link);
741 { .daddr = dst, 771 if (IS_ERR(rt)) {
742 .saddr = tiph->saddr, 772 dev->stats.tx_carrier_errors++;
743 .tos = RT_TOS(tos) } }, 773 goto tx_error;
744 .proto = IPPROTO_GRE };
745 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
746 stats->tx_carrier_errors++;
747 goto tx_error;
748 }
749 } 774 }
750 tdev = rt->dst.dev; 775 tdev = rt->dst.dev;
751 776
752 if (tdev == dev) { 777 if (tdev == dev) {
753 ip_rt_put(rt); 778 ip_rt_put(rt);
754 stats->collisions++; 779 dev->stats.collisions++;
755 goto tx_error; 780 goto tx_error;
756 } 781 }
757 782
@@ -783,7 +808,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
783 !ipv4_is_multicast(tunnel->parms.iph.daddr)) || 808 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
784 rt6->rt6i_dst.plen == 128) { 809 rt6->rt6i_dst.plen == 128) {
785 rt6->rt6i_flags |= RTF_MODIFIED; 810 rt6->rt6i_flags |= RTF_MODIFIED;
786 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu; 811 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
787 } 812 }
788 } 813 }
789 814
@@ -814,7 +839,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
814 dev->needed_headroom = max_headroom; 839 dev->needed_headroom = max_headroom;
815 if (!new_skb) { 840 if (!new_skb) {
816 ip_rt_put(rt); 841 ip_rt_put(rt);
817 txq->tx_dropped++; 842 dev->stats.tx_dropped++;
818 dev_kfree_skb(skb); 843 dev_kfree_skb(skb);
819 return NETDEV_TX_OK; 844 return NETDEV_TX_OK;
820 } 845 }
@@ -844,18 +869,18 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
844 iph->frag_off = df; 869 iph->frag_off = df;
845 iph->protocol = IPPROTO_GRE; 870 iph->protocol = IPPROTO_GRE;
846 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); 871 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
847 iph->daddr = rt->rt_dst; 872 iph->daddr = fl4.daddr;
848 iph->saddr = rt->rt_src; 873 iph->saddr = fl4.saddr;
849 874
850 if ((iph->ttl = tiph->ttl) == 0) { 875 if ((iph->ttl = tiph->ttl) == 0) {
851 if (skb->protocol == htons(ETH_P_IP)) 876 if (skb->protocol == htons(ETH_P_IP))
852 iph->ttl = old_iph->ttl; 877 iph->ttl = old_iph->ttl;
853#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 878#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
854 else if (skb->protocol == htons(ETH_P_IPV6)) 879 else if (skb->protocol == htons(ETH_P_IPV6))
855 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit; 880 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
856#endif 881#endif
857 else 882 else
858 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT); 883 iph->ttl = ip4_dst_hoplimit(&rt->dst);
859 } 884 }
860 885
861 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags; 886 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -881,15 +906,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
881 } 906 }
882 907
883 nf_reset(skb); 908 nf_reset(skb);
884 909 tstats = this_cpu_ptr(dev->tstats);
885 IPTUNNEL_XMIT(); 910 __IPTUNNEL_XMIT(tstats, &dev->stats);
886 return NETDEV_TX_OK; 911 return NETDEV_TX_OK;
887 912
888tx_error_icmp: 913tx_error_icmp:
889 dst_link_failure(skb); 914 dst_link_failure(skb);
890 915
891tx_error: 916tx_error:
892 stats->tx_errors++; 917 dev->stats.tx_errors++;
893 dev_kfree_skb(skb); 918 dev_kfree_skb(skb);
894 return NETDEV_TX_OK; 919 return NETDEV_TX_OK;
895} 920}
@@ -898,7 +923,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
898{ 923{
899 struct net_device *tdev = NULL; 924 struct net_device *tdev = NULL;
900 struct ip_tunnel *tunnel; 925 struct ip_tunnel *tunnel;
901 struct iphdr *iph; 926 const struct iphdr *iph;
902 int hlen = LL_MAX_HEADER; 927 int hlen = LL_MAX_HEADER;
903 int mtu = ETH_DATA_LEN; 928 int mtu = ETH_DATA_LEN;
904 int addend = sizeof(struct iphdr) + 4; 929 int addend = sizeof(struct iphdr) + 4;
@@ -909,14 +934,15 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
909 /* Guess output device to choose reasonable mtu and needed_headroom */ 934 /* Guess output device to choose reasonable mtu and needed_headroom */
910 935
911 if (iph->daddr) { 936 if (iph->daddr) {
912 struct flowi fl = { .oif = tunnel->parms.link, 937 struct flowi4 fl4;
913 .nl_u = { .ip4_u =
914 { .daddr = iph->daddr,
915 .saddr = iph->saddr,
916 .tos = RT_TOS(iph->tos) } },
917 .proto = IPPROTO_GRE };
918 struct rtable *rt; 938 struct rtable *rt;
919 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 939
940 rt = ip_route_output_gre(dev_net(dev), &fl4,
941 iph->daddr, iph->saddr,
942 tunnel->parms.o_key,
943 RT_TOS(iph->tos),
944 tunnel->parms.link);
945 if (!IS_ERR(rt)) {
920 tdev = rt->dst.dev; 946 tdev = rt->dst.dev;
921 ip_rt_put(rt); 947 ip_rt_put(rt);
922 } 948 }
@@ -1012,7 +1038,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1012 break; 1038 break;
1013 } 1039 }
1014 } else { 1040 } else {
1015 unsigned nflags = 0; 1041 unsigned int nflags = 0;
1016 1042
1017 t = netdev_priv(dev); 1043 t = netdev_priv(dev);
1018 1044
@@ -1026,6 +1052,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1026 break; 1052 break;
1027 } 1053 }
1028 ipgre_tunnel_unlink(ign, t); 1054 ipgre_tunnel_unlink(ign, t);
1055 synchronize_net();
1029 t->parms.iph.saddr = p.iph.saddr; 1056 t->parms.iph.saddr = p.iph.saddr;
1030 t->parms.iph.daddr = p.iph.daddr; 1057 t->parms.iph.daddr = p.iph.daddr;
1031 t->parms.i_key = p.i_key; 1058 t->parms.i_key = p.i_key;
@@ -1125,7 +1152,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1125 1152
1126static int ipgre_header(struct sk_buff *skb, struct net_device *dev, 1153static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1127 unsigned short type, 1154 unsigned short type,
1128 const void *daddr, const void *saddr, unsigned len) 1155 const void *daddr, const void *saddr, unsigned int len)
1129{ 1156{
1130 struct ip_tunnel *t = netdev_priv(dev); 1157 struct ip_tunnel *t = netdev_priv(dev);
1131 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1158 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1151,7 +1178,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1151 1178
1152static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr) 1179static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1153{ 1180{
1154 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb); 1181 const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1155 memcpy(haddr, &iph->saddr, 4); 1182 memcpy(haddr, &iph->saddr, 4);
1156 return 4; 1183 return 4;
1157} 1184}
@@ -1167,14 +1194,16 @@ static int ipgre_open(struct net_device *dev)
1167 struct ip_tunnel *t = netdev_priv(dev); 1194 struct ip_tunnel *t = netdev_priv(dev);
1168 1195
1169 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1196 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1170 struct flowi fl = { .oif = t->parms.link, 1197 struct flowi4 fl4;
1171 .nl_u = { .ip4_u =
1172 { .daddr = t->parms.iph.daddr,
1173 .saddr = t->parms.iph.saddr,
1174 .tos = RT_TOS(t->parms.iph.tos) } },
1175 .proto = IPPROTO_GRE };
1176 struct rtable *rt; 1198 struct rtable *rt;
1177 if (ip_route_output_key(dev_net(dev), &rt, &fl)) 1199
1200 rt = ip_route_output_gre(dev_net(dev), &fl4,
1201 t->parms.iph.daddr,
1202 t->parms.iph.saddr,
1203 t->parms.o_key,
1204 RT_TOS(t->parms.iph.tos),
1205 t->parms.link);
1206 if (IS_ERR(rt))
1178 return -EADDRNOTAVAIL; 1207 return -EADDRNOTAVAIL;
1179 dev = rt->dst.dev; 1208 dev = rt->dst.dev;
1180 ip_rt_put(rt); 1209 ip_rt_put(rt);
@@ -1193,10 +1222,8 @@ static int ipgre_close(struct net_device *dev)
1193 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { 1222 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1194 struct in_device *in_dev; 1223 struct in_device *in_dev;
1195 in_dev = inetdev_by_index(dev_net(dev), t->mlink); 1224 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196 if (in_dev) { 1225 if (in_dev)
1197 ip_mc_dec_group(in_dev, t->parms.iph.daddr); 1226 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198 in_dev_put(in_dev);
1199 }
1200 } 1227 }
1201 return 0; 1228 return 0;
1202} 1229}
@@ -1213,12 +1240,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
1213 .ndo_start_xmit = ipgre_tunnel_xmit, 1240 .ndo_start_xmit = ipgre_tunnel_xmit,
1214 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1241 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1215 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1242 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1243 .ndo_get_stats = ipgre_get_stats,
1216}; 1244};
1217 1245
1246static void ipgre_dev_free(struct net_device *dev)
1247{
1248 free_percpu(dev->tstats);
1249 free_netdev(dev);
1250}
1251
1218static void ipgre_tunnel_setup(struct net_device *dev) 1252static void ipgre_tunnel_setup(struct net_device *dev)
1219{ 1253{
1220 dev->netdev_ops = &ipgre_netdev_ops; 1254 dev->netdev_ops = &ipgre_netdev_ops;
1221 dev->destructor = free_netdev; 1255 dev->destructor = ipgre_dev_free;
1222 1256
1223 dev->type = ARPHRD_IPGRE; 1257 dev->type = ARPHRD_IPGRE;
1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1258 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1256,6 +1290,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
1256 } else 1290 } else
1257 dev->header_ops = &ipgre_header_ops; 1291 dev->header_ops = &ipgre_header_ops;
1258 1292
1293 dev->tstats = alloc_percpu(struct pcpu_tstats);
1294 if (!dev->tstats)
1295 return -ENOMEM;
1296
1259 return 0; 1297 return 0;
1260} 1298}
1261 1299
@@ -1263,7 +1301,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1263{ 1301{
1264 struct ip_tunnel *tunnel = netdev_priv(dev); 1302 struct ip_tunnel *tunnel = netdev_priv(dev);
1265 struct iphdr *iph = &tunnel->parms.iph; 1303 struct iphdr *iph = &tunnel->parms.iph;
1266 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1267 1304
1268 tunnel->dev = dev; 1305 tunnel->dev = dev;
1269 strcpy(tunnel->parms.name, dev->name); 1306 strcpy(tunnel->parms.name, dev->name);
@@ -1274,14 +1311,12 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
1274 tunnel->hlen = sizeof(struct iphdr) + 4; 1311 tunnel->hlen = sizeof(struct iphdr) + 4;
1275 1312
1276 dev_hold(dev); 1313 dev_hold(dev);
1277 ign->tunnels_wc[0] = tunnel;
1278} 1314}
1279 1315
1280 1316
1281static const struct net_protocol ipgre_protocol = { 1317static const struct gre_protocol ipgre_protocol = {
1282 .handler = ipgre_rcv, 1318 .handler = ipgre_rcv,
1283 .err_handler = ipgre_err, 1319 .err_handler = ipgre_err,
1284 .netns_ok = 1,
1285}; 1320};
1286 1321
1287static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) 1322static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1291,11 +1326,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1291 for (prio = 0; prio < 4; prio++) { 1326 for (prio = 0; prio < 4; prio++) {
1292 int h; 1327 int h;
1293 for (h = 0; h < HASH_SIZE; h++) { 1328 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t = ign->tunnels[prio][h]; 1329 struct ip_tunnel *t;
1330
1331 t = rtnl_dereference(ign->tunnels[prio][h]);
1295 1332
1296 while (t != NULL) { 1333 while (t != NULL) {
1297 unregister_netdevice_queue(t->dev, head); 1334 unregister_netdevice_queue(t->dev, head);
1298 t = t->next; 1335 t = rtnl_dereference(t->next);
1299 } 1336 }
1300 } 1337 }
1301 } 1338 }
@@ -1320,10 +1357,12 @@ static int __net_init ipgre_init_net(struct net *net)
1320 if ((err = register_netdev(ign->fb_tunnel_dev))) 1357 if ((err = register_netdev(ign->fb_tunnel_dev)))
1321 goto err_reg_dev; 1358 goto err_reg_dev;
1322 1359
1360 rcu_assign_pointer(ign->tunnels_wc[0],
1361 netdev_priv(ign->fb_tunnel_dev));
1323 return 0; 1362 return 0;
1324 1363
1325err_reg_dev: 1364err_reg_dev:
1326 free_netdev(ign->fb_tunnel_dev); 1365 ipgre_dev_free(ign->fb_tunnel_dev);
1327err_alloc_dev: 1366err_alloc_dev:
1328 return err; 1367 return err;
1329} 1368}
@@ -1441,6 +1480,10 @@ static int ipgre_tap_init(struct net_device *dev)
1441 1480
1442 ipgre_tunnel_bind_dev(dev); 1481 ipgre_tunnel_bind_dev(dev);
1443 1482
1483 dev->tstats = alloc_percpu(struct pcpu_tstats);
1484 if (!dev->tstats)
1485 return -ENOMEM;
1486
1444 return 0; 1487 return 0;
1445} 1488}
1446 1489
@@ -1451,6 +1494,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1451 .ndo_set_mac_address = eth_mac_addr, 1494 .ndo_set_mac_address = eth_mac_addr,
1452 .ndo_validate_addr = eth_validate_addr, 1495 .ndo_validate_addr = eth_validate_addr,
1453 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1496 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1497 .ndo_get_stats = ipgre_get_stats,
1454}; 1498};
1455 1499
1456static void ipgre_tap_setup(struct net_device *dev) 1500static void ipgre_tap_setup(struct net_device *dev)
@@ -1459,7 +1503,7 @@ static void ipgre_tap_setup(struct net_device *dev)
1459 ether_setup(dev); 1503 ether_setup(dev);
1460 1504
1461 dev->netdev_ops = &ipgre_tap_netdev_ops; 1505 dev->netdev_ops = &ipgre_tap_netdev_ops;
1462 dev->destructor = free_netdev; 1506 dev->destructor = ipgre_dev_free;
1463 1507
1464 dev->iflink = 0; 1508 dev->iflink = 0;
1465 dev->features |= NETIF_F_NETNS_LOCAL; 1509 dev->features |= NETIF_F_NETNS_LOCAL;
@@ -1487,6 +1531,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
1487 if (!tb[IFLA_MTU]) 1531 if (!tb[IFLA_MTU])
1488 dev->mtu = mtu; 1532 dev->mtu = mtu;
1489 1533
1534 /* Can use a lockless transmit, unless we generate output sequences */
1535 if (!(nt->parms.o_flags & GRE_SEQ))
1536 dev->features |= NETIF_F_LLTX;
1537
1490 err = register_netdevice(dev); 1538 err = register_netdevice(dev);
1491 if (err) 1539 if (err)
1492 goto out; 1540 goto out;
@@ -1522,7 +1570,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1522 t = nt; 1570 t = nt;
1523 1571
1524 if (dev->type != ARPHRD_ETHER) { 1572 if (dev->type != ARPHRD_ETHER) {
1525 unsigned nflags = 0; 1573 unsigned int nflags = 0;
1526 1574
1527 if (ipv4_is_multicast(p.iph.daddr)) 1575 if (ipv4_is_multicast(p.iph.daddr))
1528 nflags = IFF_BROADCAST; 1576 nflags = IFF_BROADCAST;
@@ -1663,7 +1711,7 @@ static int __init ipgre_init(void)
1663 if (err < 0) 1711 if (err < 0)
1664 return err; 1712 return err;
1665 1713
1666 err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); 1714 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1667 if (err < 0) { 1715 if (err < 0) {
1668 printk(KERN_INFO "ipgre init: can't add protocol\n"); 1716 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669 goto add_proto_failed; 1717 goto add_proto_failed;
@@ -1683,7 +1731,7 @@ out:
1683tap_ops_failed: 1731tap_ops_failed:
1684 rtnl_link_unregister(&ipgre_link_ops); 1732 rtnl_link_unregister(&ipgre_link_ops);
1685rtnl_link_failed: 1733rtnl_link_failed:
1686 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1734 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1687add_proto_failed: 1735add_proto_failed:
1688 unregister_pernet_device(&ipgre_net_ops); 1736 unregister_pernet_device(&ipgre_net_ops);
1689 goto out; 1737 goto out;
@@ -1693,7 +1741,7 @@ static void __exit ipgre_fini(void)
1693{ 1741{
1694 rtnl_link_unregister(&ipgre_tap_ops); 1742 rtnl_link_unregister(&ipgre_tap_ops);
1695 rtnl_link_unregister(&ipgre_link_ops); 1743 rtnl_link_unregister(&ipgre_link_ops);
1696 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1744 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1697 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1745 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698 unregister_pernet_device(&ipgre_net_ops); 1746 unregister_pernet_device(&ipgre_net_ops);
1699} 1747}
@@ -1703,3 +1751,4 @@ module_exit(ipgre_fini);
1703MODULE_LICENSE("GPL"); 1751MODULE_LICENSE("GPL");
1704MODULE_ALIAS_RTNL_LINK("gre"); 1752MODULE_ALIAS_RTNL_LINK("gre");
1705MODULE_ALIAS_RTNL_LINK("gretap"); 1753MODULE_ALIAS_RTNL_LINK("gretap");
1754MODULE_ALIAS_NETDEV("gre0");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..c8f48efc5fd3 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -268,7 +268,7 @@ int ip_local_deliver(struct sk_buff *skb)
268static inline int ip_rcv_options(struct sk_buff *skb) 268static inline int ip_rcv_options(struct sk_buff *skb)
269{ 269{
270 struct ip_options *opt; 270 struct ip_options *opt;
271 struct iphdr *iph; 271 const struct iphdr *iph;
272 struct net_device *dev = skb->dev; 272 struct net_device *dev = skb->dev;
273 273
274 /* It looks as overkill, because not all 274 /* It looks as overkill, because not all
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 } 340 }
341 } 341 }
342 342
343#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_IP_ROUTE_CLASSID
344 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
346 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
@@ -374,7 +374,7 @@ drop:
374 */ 374 */
375int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) 375int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
376{ 376{
377 struct iphdr *iph; 377 const struct iphdr *iph;
378 u32 len; 378 u32 len;
379 379
380 /* When the interface is in promisc. mode, drop all the crap 380 /* When the interface is in promisc. mode, drop all the crap
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488ed..ec93335901dd 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/types.h> 15#include <linux/types.h>
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/unaligned.h>
17#include <linux/skbuff.h> 18#include <linux/skbuff.h>
18#include <linux/ip.h> 19#include <linux/ip.h>
19#include <linux/icmp.h> 20#include <linux/icmp.h>
@@ -36,8 +37,8 @@
36 * saddr is address of outgoing interface. 37 * saddr is address of outgoing interface.
37 */ 38 */
38 39
39void ip_options_build(struct sk_buff * skb, struct ip_options * opt, 40void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
40 __be32 daddr, struct rtable *rt, int is_frag) 41 __be32 daddr, struct rtable *rt, int is_frag)
41{ 42{
42 unsigned char *iph = skb_network_header(skb); 43 unsigned char *iph = skb_network_header(skb);
43 44
@@ -50,9 +51,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
50 51
51 if (!is_frag) { 52 if (!is_frag) {
52 if (opt->rr_needaddr) 53 if (opt->rr_needaddr)
53 ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); 54 ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
54 if (opt->ts_needaddr) 55 if (opt->ts_needaddr)
55 ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); 56 ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
56 if (opt->ts_needtime) { 57 if (opt->ts_needtime) {
57 struct timespec tv; 58 struct timespec tv;
58 __be32 midtime; 59 __be32 midtime;
@@ -83,9 +84,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
83 * NOTE: dopt cannot point to skb. 84 * NOTE: dopt cannot point to skb.
84 */ 85 */
85 86
86int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) 87int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
87{ 88{
88 struct ip_options *sopt; 89 const struct ip_options *sopt;
89 unsigned char *sptr, *dptr; 90 unsigned char *sptr, *dptr;
90 int soffset, doffset; 91 int soffset, doffset;
91 int optlen; 92 int optlen;
@@ -95,10 +96,8 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
95 96
96 sopt = &(IPCB(skb)->opt); 97 sopt = &(IPCB(skb)->opt);
97 98
98 if (sopt->optlen == 0) { 99 if (sopt->optlen == 0)
99 dopt->optlen = 0;
100 return 0; 100 return 0;
101 }
102 101
103 sptr = skb_network_header(skb); 102 sptr = skb_network_header(skb);
104 dptr = dopt->__data; 103 dptr = dopt->__data;
@@ -140,11 +139,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
140 } else { 139 } else {
141 dopt->ts_needtime = 0; 140 dopt->ts_needtime = 0;
142 141
143 if (soffset + 8 <= optlen) { 142 if (soffset + 7 <= optlen) {
144 __be32 addr; 143 __be32 addr;
145 144
146 memcpy(&addr, sptr+soffset-1, 4); 145 memcpy(&addr, dptr+soffset-1, 4);
147 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { 146 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
148 dopt->ts_needtime = 1; 147 dopt->ts_needtime = 1;
149 soffset += 8; 148 soffset += 8;
150 } 149 }
@@ -157,7 +156,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
157 dopt->optlen += optlen; 156 dopt->optlen += optlen;
158 } 157 }
159 if (sopt->srr) { 158 if (sopt->srr) {
160 unsigned char * start = sptr+sopt->srr; 159 unsigned char *start = sptr+sopt->srr;
161 __be32 faddr; 160 __be32 faddr;
162 161
163 optlen = start[1]; 162 optlen = start[1];
@@ -329,7 +328,7 @@ int ip_options_compile(struct net *net,
329 pp_ptr = optptr + 2; 328 pp_ptr = optptr + 2;
330 goto error; 329 goto error;
331 } 330 }
332 if (skb) { 331 if (rt) {
333 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 332 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
334 opt->is_changed = 1; 333 opt->is_changed = 1;
335 } 334 }
@@ -352,7 +351,7 @@ int ip_options_compile(struct net *net,
352 goto error; 351 goto error;
353 } 352 }
354 if (optptr[2] <= optlen) { 353 if (optptr[2] <= optlen) {
355 __be32 *timeptr = NULL; 354 unsigned char *timeptr = NULL;
356 if (optptr[2]+3 > optptr[1]) { 355 if (optptr[2]+3 > optptr[1]) {
357 pp_ptr = optptr + 2; 356 pp_ptr = optptr + 2;
358 goto error; 357 goto error;
@@ -361,7 +360,7 @@ int ip_options_compile(struct net *net,
361 case IPOPT_TS_TSONLY: 360 case IPOPT_TS_TSONLY:
362 opt->ts = optptr - iph; 361 opt->ts = optptr - iph;
363 if (skb) 362 if (skb)
364 timeptr = (__be32*)&optptr[optptr[2]-1]; 363 timeptr = &optptr[optptr[2]-1];
365 opt->ts_needtime = 1; 364 opt->ts_needtime = 1;
366 optptr[2] += 4; 365 optptr[2] += 4;
367 break; 366 break;
@@ -371,9 +370,9 @@ int ip_options_compile(struct net *net,
371 goto error; 370 goto error;
372 } 371 }
373 opt->ts = optptr - iph; 372 opt->ts = optptr - iph;
374 if (skb) { 373 if (rt) {
375 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 374 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
376 timeptr = (__be32*)&optptr[optptr[2]+3]; 375 timeptr = &optptr[optptr[2]+3];
377 } 376 }
378 opt->ts_needaddr = 1; 377 opt->ts_needaddr = 1;
379 opt->ts_needtime = 1; 378 opt->ts_needtime = 1;
@@ -391,7 +390,7 @@ int ip_options_compile(struct net *net,
391 if (inet_addr_type(net, addr) == RTN_UNICAST) 390 if (inet_addr_type(net, addr) == RTN_UNICAST)
392 break; 391 break;
393 if (skb) 392 if (skb)
394 timeptr = (__be32*)&optptr[optptr[2]+3]; 393 timeptr = &optptr[optptr[2]+3];
395 } 394 }
396 opt->ts_needtime = 1; 395 opt->ts_needtime = 1;
397 optptr[2] += 8; 396 optptr[2] += 8;
@@ -405,10 +404,10 @@ int ip_options_compile(struct net *net,
405 } 404 }
406 if (timeptr) { 405 if (timeptr) {
407 struct timespec tv; 406 struct timespec tv;
408 __be32 midtime; 407 u32 midtime;
409 getnstimeofday(&tv); 408 getnstimeofday(&tv);
410 midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); 409 midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
411 memcpy(timeptr, &midtime, sizeof(__be32)); 410 put_unaligned_be32(midtime, timeptr);
412 opt->is_changed = 1; 411 opt->is_changed = 1;
413 } 412 }
414 } else { 413 } else {
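
The timestamp hunks above switch timeptr from a __be32 pointer to a plain unsigned char pointer and store the value with put_unaligned_be32() instead of htonl()+memcpy(). The reason is that the timestamp slot inside the IP option area is not guaranteed to be 4-byte aligned, so a cast-and-dereference store is unsafe on strict-alignment architectures; put_unaligned_be32() also does the byte-order conversion, which is why midtime becomes a host-order u32. A minimal illustration of the helper, not taken from the patch (the function name is invented):

#include <asm/unaligned.h>

/* Store a 32-bit value in network byte order at an arbitrarily
 * aligned slot inside an option buffer.  Illustrative only. */
static void example_store_option_ts(unsigned char *slot, u32 msecs_since_midnight)
{
	put_unaligned_be32(msecs_since_midnight, slot);
}
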
@@ -466,7 +465,7 @@ error:
466 } 465 }
467 return -EINVAL; 466 return -EINVAL;
468} 467}
469 468EXPORT_SYMBOL(ip_options_compile);
470 469
471/* 470/*
472 * Undo all the changes done by ip_options_compile(). 471 * Undo all the changes done by ip_options_compile().
@@ -499,19 +498,19 @@ void ip_options_undo(struct ip_options * opt)
499 } 498 }
500} 499}
501 500
502static struct ip_options *ip_options_get_alloc(const int optlen) 501static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
503{ 502{
504 return kzalloc(sizeof(struct ip_options) + ((optlen + 3) & ~3), 503 return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
505 GFP_KERNEL); 504 GFP_KERNEL);
506} 505}
507 506
508static int ip_options_get_finish(struct net *net, struct ip_options **optp, 507static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
509 struct ip_options *opt, int optlen) 508 struct ip_options_rcu *opt, int optlen)
510{ 509{
511 while (optlen & 3) 510 while (optlen & 3)
512 opt->__data[optlen++] = IPOPT_END; 511 opt->opt.__data[optlen++] = IPOPT_END;
513 opt->optlen = optlen; 512 opt->opt.optlen = optlen;
514 if (optlen && ip_options_compile(net, opt, NULL)) { 513 if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
515 kfree(opt); 514 kfree(opt);
516 return -EINVAL; 515 return -EINVAL;
517 } 516 }
@@ -520,29 +519,29 @@ static int ip_options_get_finish(struct net *net, struct ip_options **optp,
520 return 0; 519 return 0;
521} 520}
522 521
523int ip_options_get_from_user(struct net *net, struct ip_options **optp, 522int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
524 unsigned char __user *data, int optlen) 523 unsigned char __user *data, int optlen)
525{ 524{
526 struct ip_options *opt = ip_options_get_alloc(optlen); 525 struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
527 526
528 if (!opt) 527 if (!opt)
529 return -ENOMEM; 528 return -ENOMEM;
530 if (optlen && copy_from_user(opt->__data, data, optlen)) { 529 if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
531 kfree(opt); 530 kfree(opt);
532 return -EFAULT; 531 return -EFAULT;
533 } 532 }
534 return ip_options_get_finish(net, optp, opt, optlen); 533 return ip_options_get_finish(net, optp, opt, optlen);
535} 534}
536 535
537int ip_options_get(struct net *net, struct ip_options **optp, 536int ip_options_get(struct net *net, struct ip_options_rcu **optp,
538 unsigned char *data, int optlen) 537 unsigned char *data, int optlen)
539{ 538{
540 struct ip_options *opt = ip_options_get_alloc(optlen); 539 struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
541 540
542 if (!opt) 541 if (!opt)
543 return -ENOMEM; 542 return -ENOMEM;
544 if (optlen) 543 if (optlen)
545 memcpy(opt->__data, data, optlen); 544 memcpy(opt->opt.__data, data, optlen);
546 return ip_options_get_finish(net, optp, opt, optlen); 545 return ip_options_get_finish(net, optp, opt, optlen);
547} 546}
548 547
@@ -555,7 +554,7 @@ void ip_forward_options(struct sk_buff *skb)
555 554
556 if (opt->rr_needaddr) { 555 if (opt->rr_needaddr) {
557 optptr = (unsigned char *)raw + opt->rr; 556 optptr = (unsigned char *)raw + opt->rr;
558 ip_rt_get_source(&optptr[optptr[2]-5], rt); 557 ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
559 opt->is_changed = 1; 558 opt->is_changed = 1;
560 } 559 }
561 if (opt->srr_is_hit) { 560 if (opt->srr_is_hit) {
@@ -569,19 +568,18 @@ void ip_forward_options(struct sk_buff *skb)
569 ) { 568 ) {
570 if (srrptr + 3 > srrspace) 569 if (srrptr + 3 > srrspace)
571 break; 570 break;
572 if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) 571 if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0)
573 break; 572 break;
574 } 573 }
575 if (srrptr + 3 <= srrspace) { 574 if (srrptr + 3 <= srrspace) {
576 opt->is_changed = 1; 575 opt->is_changed = 1;
577 ip_rt_get_source(&optptr[srrptr-1], rt); 576 ip_rt_get_source(&optptr[srrptr-1], skb, rt);
578 ip_hdr(skb)->daddr = rt->rt_dst;
579 optptr[2] = srrptr+4; 577 optptr[2] = srrptr+4;
580 } else if (net_ratelimit()) 578 } else if (net_ratelimit())
581 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); 579 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
582 if (opt->ts_needaddr) { 580 if (opt->ts_needaddr) {
583 optptr = raw + opt->ts; 581 optptr = raw + opt->ts;
584 ip_rt_get_source(&optptr[optptr[2]-9], rt); 582 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
585 opt->is_changed = 1; 583 opt->is_changed = 1;
586 } 584 }
587 } 585 }
@@ -603,7 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
603 unsigned long orefdst; 601 unsigned long orefdst;
604 int err; 602 int err;
605 603
606 if (!opt->srr) 604 if (!rt)
607 return 0; 605 return 0;
608 606
609 if (skb->pkt_type != PACKET_HOST) 607 if (skb->pkt_type != PACKET_HOST)
@@ -637,7 +635,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
637 if (rt2->rt_type != RTN_LOCAL) 635 if (rt2->rt_type != RTN_LOCAL)
638 break; 636 break;
639 /* Superfast 8) loopback forward */ 637 /* Superfast 8) loopback forward */
640 memcpy(&iph->daddr, &optptr[srrptr-1], 4); 638 iph->daddr = nexthop;
641 opt->is_changed = 1; 639 opt->is_changed = 1;
642 } 640 }
643 if (srrptr <= srrspace) { 641 if (srrptr <= srrspace) {
@@ -646,3 +644,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
646 } 644 }
647 return 0; 645 return 0;
648} 646}
647EXPORT_SYMBOL(ip_options_rcv_srr);
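
Throughout this file the option helpers now traffic in struct ip_options_rcu, which embeds the old struct ip_options as its ->opt member next to an rcu_head, so a socket's option block can be replaced without blocking readers. A rough sketch of a caller using the new ip_options_get() signature; the function name and the optdata buffer are invented for illustration, and a real caller would also release any previously installed block via call_rcu(), as do_ip_setsockopt() does later in this patch:

#include <net/ip.h>
#include <net/inet_sock.h>

static int example_install_options(struct net *net, struct sock *sk,
				   unsigned char *optdata, int optlen)
{
	struct ip_options_rcu *opt = NULL;
	int err;

	/* Copies optdata, pads it to a 4-byte boundary and compiles it. */
	err = ip_options_get(net, &opt, optdata, optlen);
	if (err)
		return err;

	/* Publish for RCU readers on the transmit path. */
	rcu_assign_pointer(inet_sk(sk)->inet_opt, opt);
	return 0;
}
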
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7649d7750075..84f26e8e6c60 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -82,6 +82,7 @@
82#include <linux/tcp.h> 82#include <linux/tcp.h>
83 83
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; 84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 86
86/* Generate a checksum for an outgoing IP datagram. */ 87/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph) 88__inline__ void ip_send_check(struct iphdr *iph)
@@ -130,7 +131,7 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 int ttl = inet->uc_ttl; 131 int ttl = inet->uc_ttl;
131 132
132 if (ttl < 0) 133 if (ttl < 0)
133 ttl = dst_metric(dst, RTAX_HOPLIMIT); 134 ttl = ip4_dst_hoplimit(dst);
134 return ttl; 135 return ttl;
135} 136}
136 137
@@ -139,14 +140,14 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
139 * 140 *
140 */ 141 */
141int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, 142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
142 __be32 saddr, __be32 daddr, struct ip_options *opt) 143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
143{ 144{
144 struct inet_sock *inet = inet_sk(sk); 145 struct inet_sock *inet = inet_sk(sk);
145 struct rtable *rt = skb_rtable(skb); 146 struct rtable *rt = skb_rtable(skb);
146 struct iphdr *iph; 147 struct iphdr *iph;
147 148
148 /* Build the IP header. */ 149 /* Build the IP header. */
149 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
150 skb_reset_network_header(skb); 151 skb_reset_network_header(skb);
151 iph = ip_hdr(skb); 152 iph = ip_hdr(skb);
152 iph->version = 4; 153 iph->version = 4;
@@ -157,14 +158,14 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
157 else 158 else
158 iph->frag_off = 0; 159 iph->frag_off = 0;
159 iph->ttl = ip_select_ttl(inet, &rt->dst); 160 iph->ttl = ip_select_ttl(inet, &rt->dst);
160 iph->daddr = rt->rt_dst; 161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
161 iph->saddr = rt->rt_src; 162 iph->saddr = saddr;
162 iph->protocol = sk->sk_protocol; 163 iph->protocol = sk->sk_protocol;
163 ip_select_ident(iph, &rt->dst, sk); 164 ip_select_ident(iph, &rt->dst, sk);
164 165
165 if (opt && opt->optlen) { 166 if (opt && opt->opt.optlen) {
166 iph->ihl += opt->optlen>>2; 167 iph->ihl += opt->opt.optlen>>2;
167 ip_options_build(skb, opt, daddr, rt, 0); 168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
168 } 169 }
169 170
170 skb->priority = sk->sk_priority; 171 skb->priority = sk->sk_priority;
@@ -311,11 +312,12 @@ int ip_output(struct sk_buff *skb)
311 !(IPCB(skb)->flags & IPSKB_REROUTED)); 312 !(IPCB(skb)->flags & IPSKB_REROUTED));
312} 313}
313 314
314int ip_queue_xmit(struct sk_buff *skb) 315int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
315{ 316{
316 struct sock *sk = skb->sk; 317 struct sock *sk = skb->sk;
317 struct inet_sock *inet = inet_sk(sk); 318 struct inet_sock *inet = inet_sk(sk);
318 struct ip_options *opt = inet->opt; 319 struct ip_options_rcu *inet_opt;
320 struct flowi4 *fl4;
319 struct rtable *rt; 321 struct rtable *rt;
320 struct iphdr *iph; 322 struct iphdr *iph;
321 int res; 323 int res;
@@ -324,6 +326,8 @@ int ip_queue_xmit(struct sk_buff *skb)
324 * f.e. by something like SCTP. 326 * f.e. by something like SCTP.
325 */ 327 */
326 rcu_read_lock(); 328 rcu_read_lock();
329 inet_opt = rcu_dereference(inet->inet_opt);
330 fl4 = &fl->u.ip4;
327 rt = skb_rtable(skb); 331 rt = skb_rtable(skb);
328 if (rt != NULL) 332 if (rt != NULL)
329 goto packet_routed; 333 goto packet_routed;
@@ -335,40 +339,32 @@ int ip_queue_xmit(struct sk_buff *skb)
335 339
336 /* Use correct destination address if we have options. */ 340 /* Use correct destination address if we have options. */
337 daddr = inet->inet_daddr; 341 daddr = inet->inet_daddr;
338 if(opt && opt->srr) 342 if (inet_opt && inet_opt->opt.srr)
339 daddr = opt->faddr; 343 daddr = inet_opt->opt.faddr;
340 344
341 { 345 /* If this fails, retransmit mechanism of transport layer will
342 struct flowi fl = { .oif = sk->sk_bound_dev_if, 346 * keep trying until route appears or the connection times
343 .mark = sk->sk_mark, 347 * itself out.
344 .nl_u = { .ip4_u = 348 */
345 { .daddr = daddr, 349 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
346 .saddr = inet->inet_saddr, 350 daddr, inet->inet_saddr,
347 .tos = RT_CONN_FLAGS(sk) } }, 351 inet->inet_dport,
348 .proto = sk->sk_protocol, 352 inet->inet_sport,
349 .flags = inet_sk_flowi_flags(sk), 353 sk->sk_protocol,
350 .uli_u = { .ports = 354 RT_CONN_FLAGS(sk),
351 { .sport = inet->inet_sport, 355 sk->sk_bound_dev_if);
352 .dport = inet->inet_dport } } }; 356 if (IS_ERR(rt))
353 357 goto no_route;
354 /* If this fails, retransmit mechanism of transport layer will
355 * keep trying until route appears or the connection times
356 * itself out.
357 */
358 security_sk_classify_flow(sk, &fl);
359 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
360 goto no_route;
361 }
362 sk_setup_caps(sk, &rt->dst); 358 sk_setup_caps(sk, &rt->dst);
363 } 359 }
364 skb_dst_set_noref(skb, &rt->dst); 360 skb_dst_set_noref(skb, &rt->dst);
365 361
366packet_routed: 362packet_routed:
367 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 363 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
368 goto no_route; 364 goto no_route;
369 365
370 /* OK, we know where to send it, allocate and build IP header. */ 366 /* OK, we know where to send it, allocate and build IP header. */
371 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); 367 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
372 skb_reset_network_header(skb); 368 skb_reset_network_header(skb);
373 iph = ip_hdr(skb); 369 iph = ip_hdr(skb);
374 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); 370 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
@@ -378,13 +374,13 @@ packet_routed:
378 iph->frag_off = 0; 374 iph->frag_off = 0;
379 iph->ttl = ip_select_ttl(inet, &rt->dst); 375 iph->ttl = ip_select_ttl(inet, &rt->dst);
380 iph->protocol = sk->sk_protocol; 376 iph->protocol = sk->sk_protocol;
381 iph->saddr = rt->rt_src; 377 iph->saddr = fl4->saddr;
382 iph->daddr = rt->rt_dst; 378 iph->daddr = fl4->daddr;
383 /* Transport layer set skb->h.foo itself. */ 379 /* Transport layer set skb->h.foo itself. */
384 380
385 if (opt && opt->optlen) { 381 if (inet_opt && inet_opt->opt.optlen) {
386 iph->ihl += opt->optlen >> 2; 382 iph->ihl += inet_opt->opt.optlen >> 2;
387 ip_options_build(skb, opt, inet->inet_daddr, rt, 0); 383 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
388 } 384 }
389 385
390 ip_select_ident_more(iph, &rt->dst, sk, 386 ip_select_ident_more(iph, &rt->dst, sk,
@@ -487,7 +483,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
487 * LATER: this step can be merged to real generation of fragments, 483 * LATER: this step can be merged to real generation of fragments,
488 * we can switch to copy when see the first bad fragment. 484 * we can switch to copy when see the first bad fragment.
489 */ 485 */
490 if (skb_has_frags(skb)) { 486 if (skb_has_frag_list(skb)) {
491 struct sk_buff *frag, *frag2; 487 struct sk_buff *frag, *frag2;
492 int first_len = skb_pagelen(skb); 488 int first_len = skb_pagelen(skb);
493 489
@@ -610,7 +606,7 @@ slow_path:
610 /* IF: it doesn't fit, use 'mtu' - the data space left */ 606 /* IF: it doesn't fit, use 'mtu' - the data space left */
611 if (len > mtu) 607 if (len > mtu)
612 len = mtu; 608 len = mtu;
613 /* IF: we are not sending upto and including the packet end 609 /* IF: we are not sending up to and including the packet end
614 then align the next start on an eight byte boundary */ 610 then align the next start on an eight byte boundary */
615 if (len < left) { 611 if (len < left) {
616 len &= ~7; 612 len &= ~7;
@@ -734,6 +730,7 @@ csum_page(struct page *page, int offset, int copy)
734} 730}
735 731
736static inline int ip_ufo_append_data(struct sock *sk, 732static inline int ip_ufo_append_data(struct sock *sk,
733 struct sk_buff_head *queue,
737 int getfrag(void *from, char *to, int offset, int len, 734 int getfrag(void *from, char *to, int offset, int len,
738 int odd, struct sk_buff *skb), 735 int odd, struct sk_buff *skb),
739 void *from, int length, int hh_len, int fragheaderlen, 736 void *from, int length, int hh_len, int fragheaderlen,
@@ -746,7 +743,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
746 * device, so create one single skb packet containing complete 743 * device, so create one single skb packet containing complete
747 * udp datagram 744 * udp datagram
748 */ 745 */
749 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 746 if ((skb = skb_peek_tail(queue)) == NULL) {
750 skb = sock_alloc_send_skb(sk, 747 skb = sock_alloc_send_skb(sk,
751 hh_len + fragheaderlen + transhdrlen + 20, 748 hh_len + fragheaderlen + transhdrlen + 20,
752 (flags & MSG_DONTWAIT), &err); 749 (flags & MSG_DONTWAIT), &err);
@@ -768,40 +765,30 @@ static inline int ip_ufo_append_data(struct sock *sk,
768 765
769 skb->ip_summed = CHECKSUM_PARTIAL; 766 skb->ip_summed = CHECKSUM_PARTIAL;
770 skb->csum = 0; 767 skb->csum = 0;
771 sk->sk_sndmsg_off = 0;
772 768
773 /* specify the length of each IP datagram fragment */ 769 /* specify the length of each IP datagram fragment */
774 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 770 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
775 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 771 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
776 __skb_queue_tail(&sk->sk_write_queue, skb); 772 __skb_queue_tail(queue, skb);
777 } 773 }
778 774
779 return skb_append_datato_frags(sk, skb, getfrag, from, 775 return skb_append_datato_frags(sk, skb, getfrag, from,
780 (length - transhdrlen)); 776 (length - transhdrlen));
781} 777}
782 778
783/* 779static int __ip_append_data(struct sock *sk,
784 * ip_append_data() and ip_append_page() can make one large IP datagram 780 struct flowi4 *fl4,
785 * from many pieces of data. Each pieces will be holded on the socket 781 struct sk_buff_head *queue,
786 * until ip_push_pending_frames() is called. Each piece can be a page 782 struct inet_cork *cork,
787 * or non-page data. 783 int getfrag(void *from, char *to, int offset,
788 * 784 int len, int odd, struct sk_buff *skb),
789 * Not only UDP, other transport protocols - e.g. raw sockets - can use 785 void *from, int length, int transhdrlen,
790 * this interface potentially. 786 unsigned int flags)
791 *
792 * LATER: length must be adjusted by pad at tail, when it is required.
793 */
794int ip_append_data(struct sock *sk,
795 int getfrag(void *from, char *to, int offset, int len,
796 int odd, struct sk_buff *skb),
797 void *from, int length, int transhdrlen,
798 struct ipcm_cookie *ipc, struct rtable **rtp,
799 unsigned int flags)
800{ 787{
801 struct inet_sock *inet = inet_sk(sk); 788 struct inet_sock *inet = inet_sk(sk);
802 struct sk_buff *skb; 789 struct sk_buff *skb;
803 790
804 struct ip_options *opt = NULL; 791 struct ip_options *opt = cork->opt;
805 int hh_len; 792 int hh_len;
806 int exthdrlen; 793 int exthdrlen;
807 int mtu; 794 int mtu;
@@ -810,60 +797,20 @@ int ip_append_data(struct sock *sk,
810 int offset = 0; 797 int offset = 0;
811 unsigned int maxfraglen, fragheaderlen; 798 unsigned int maxfraglen, fragheaderlen;
812 int csummode = CHECKSUM_NONE; 799 int csummode = CHECKSUM_NONE;
813 struct rtable *rt; 800 struct rtable *rt = (struct rtable *)cork->dst;
814 801
815 if (flags&MSG_PROBE) 802 skb = skb_peek_tail(queue);
816 return 0;
817 803
818 if (skb_queue_empty(&sk->sk_write_queue)) { 804 exthdrlen = !skb ? rt->dst.header_len : 0;
819 /* 805 mtu = cork->fragsize;
820 * setup for corking.
821 */
822 opt = ipc->opt;
823 if (opt) {
824 if (inet->cork.opt == NULL) {
825 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
826 if (unlikely(inet->cork.opt == NULL))
827 return -ENOBUFS;
828 }
829 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
830 inet->cork.flags |= IPCORK_OPT;
831 inet->cork.addr = ipc->addr;
832 }
833 rt = *rtp;
834 if (unlikely(!rt))
835 return -EFAULT;
836 /*
837 * We steal reference to this route, caller should not release it
838 */
839 *rtp = NULL;
840 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
841 rt->dst.dev->mtu :
842 dst_mtu(rt->dst.path);
843 inet->cork.dst = &rt->dst;
844 inet->cork.length = 0;
845 sk->sk_sndmsg_page = NULL;
846 sk->sk_sndmsg_off = 0;
847 if ((exthdrlen = rt->dst.header_len) != 0) {
848 length += exthdrlen;
849 transhdrlen += exthdrlen;
850 }
851 } else {
852 rt = (struct rtable *)inet->cork.dst;
853 if (inet->cork.flags & IPCORK_OPT)
854 opt = inet->cork.opt;
855 806
856 transhdrlen = 0;
857 exthdrlen = 0;
858 mtu = inet->cork.fragsize;
859 }
860 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 807 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
861 808
862 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 809 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
863 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 810 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
864 811
865 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 812 if (cork->length + length > 0xFFFF - fragheaderlen) {
866 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 813 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
867 mtu-exthdrlen); 814 mtu-exthdrlen);
868 return -EMSGSIZE; 815 return -EMSGSIZE;
869 } 816 }
@@ -878,15 +825,13 @@ int ip_append_data(struct sock *sk,
878 !exthdrlen) 825 !exthdrlen)
879 csummode = CHECKSUM_PARTIAL; 826 csummode = CHECKSUM_PARTIAL;
880 827
881 skb = skb_peek_tail(&sk->sk_write_queue); 828 cork->length += length;
882
883 inet->cork.length += length;
884 if (((length > mtu) || (skb && skb_is_gso(skb))) && 829 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
885 (sk->sk_protocol == IPPROTO_UDP) && 830 (sk->sk_protocol == IPPROTO_UDP) &&
886 (rt->dst.dev->features & NETIF_F_UFO)) { 831 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
887 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 832 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
888 fragheaderlen, transhdrlen, mtu, 833 hh_len, fragheaderlen, transhdrlen,
889 flags); 834 mtu, flags);
890 if (err) 835 if (err)
891 goto error; 836 goto error;
892 return 0; 837 return 0;
@@ -934,7 +879,9 @@ alloc_new_skb:
934 !(rt->dst.dev->features&NETIF_F_SG)) 879 !(rt->dst.dev->features&NETIF_F_SG))
935 alloclen = mtu; 880 alloclen = mtu;
936 else 881 else
937 alloclen = datalen + fragheaderlen; 882 alloclen = fraglen;
883
884 alloclen += exthdrlen;
938 885
939 /* The last fragment gets additional space at tail. 886 /* The last fragment gets additional space at tail.
940 * Note, with MSG_MORE we overallocate on fragments, 887 * Note, with MSG_MORE we overallocate on fragments,
@@ -960,7 +907,7 @@ alloc_new_skb:
960 else 907 else
961 /* only the initial fragment is 908 /* only the initial fragment is
962 time stamped */ 909 time stamped */
963 ipc->shtx.flags = 0; 910 cork->tx_flags = 0;
964 } 911 }
965 if (skb == NULL) 912 if (skb == NULL)
966 goto error; 913 goto error;
@@ -971,16 +918,16 @@ alloc_new_skb:
971 skb->ip_summed = csummode; 918 skb->ip_summed = csummode;
972 skb->csum = 0; 919 skb->csum = 0;
973 skb_reserve(skb, hh_len); 920 skb_reserve(skb, hh_len);
974 *skb_tx(skb) = ipc->shtx; 921 skb_shinfo(skb)->tx_flags = cork->tx_flags;
975 922
976 /* 923 /*
977 * Find where to start putting bytes. 924 * Find where to start putting bytes.
978 */ 925 */
979 data = skb_put(skb, fraglen); 926 data = skb_put(skb, fraglen + exthdrlen);
980 skb_set_network_header(skb, exthdrlen); 927 skb_set_network_header(skb, exthdrlen);
981 skb->transport_header = (skb->network_header + 928 skb->transport_header = (skb->network_header +
982 fragheaderlen); 929 fragheaderlen);
983 data += fragheaderlen; 930 data += fragheaderlen + exthdrlen;
984 931
985 if (fraggap) { 932 if (fraggap) {
986 skb->csum = skb_copy_and_csum_bits( 933 skb->csum = skb_copy_and_csum_bits(
@@ -1008,7 +955,7 @@ alloc_new_skb:
1008 /* 955 /*
1009 * Put the packet on the pending queue. 956 * Put the packet on the pending queue.
1010 */ 957 */
1011 __skb_queue_tail(&sk->sk_write_queue, skb); 958 __skb_queue_tail(queue, skb);
1012 continue; 959 continue;
1013 } 960 }
1014 961
@@ -1028,8 +975,8 @@ alloc_new_skb:
1028 } else { 975 } else {
1029 int i = skb_shinfo(skb)->nr_frags; 976 int i = skb_shinfo(skb)->nr_frags;
1030 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 977 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1031 struct page *page = sk->sk_sndmsg_page; 978 struct page *page = cork->page;
1032 int off = sk->sk_sndmsg_off; 979 int off = cork->off;
1033 unsigned int left; 980 unsigned int left;
1034 981
1035 if (page && (left = PAGE_SIZE - off) > 0) { 982 if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1041,7 +988,7 @@ alloc_new_skb:
1041 goto error; 988 goto error;
1042 } 989 }
1043 get_page(page); 990 get_page(page);
1044 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 991 skb_fill_page_desc(skb, i, page, off, 0);
1045 frag = &skb_shinfo(skb)->frags[i]; 992 frag = &skb_shinfo(skb)->frags[i];
1046 } 993 }
1047 } else if (i < MAX_SKB_FRAGS) { 994 } else if (i < MAX_SKB_FRAGS) {
@@ -1052,8 +999,8 @@ alloc_new_skb:
1052 err = -ENOMEM; 999 err = -ENOMEM;
1053 goto error; 1000 goto error;
1054 } 1001 }
1055 sk->sk_sndmsg_page = page; 1002 cork->page = page;
1056 sk->sk_sndmsg_off = 0; 1003 cork->off = 0;
1057 1004
1058 skb_fill_page_desc(skb, i, page, 0, 0); 1005 skb_fill_page_desc(skb, i, page, 0, 0);
1059 frag = &skb_shinfo(skb)->frags[i]; 1006 frag = &skb_shinfo(skb)->frags[i];
@@ -1065,7 +1012,7 @@ alloc_new_skb:
1065 err = -EFAULT; 1012 err = -EFAULT;
1066 goto error; 1013 goto error;
1067 } 1014 }
1068 sk->sk_sndmsg_off += copy; 1015 cork->off += copy;
1069 frag->size += copy; 1016 frag->size += copy;
1070 skb->len += copy; 1017 skb->len += copy;
1071 skb->data_len += copy; 1018 skb->data_len += copy;
@@ -1079,18 +1026,95 @@ alloc_new_skb:
1079 return 0; 1026 return 0;
1080 1027
1081error: 1028error:
1082 inet->cork.length -= length; 1029 cork->length -= length;
1083 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1084 return err; 1031 return err;
1085} 1032}
1086 1033
1087ssize_t ip_append_page(struct sock *sk, struct page *page, 1034static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1035 struct ipcm_cookie *ipc, struct rtable **rtp)
1036{
1037 struct inet_sock *inet = inet_sk(sk);
1038 struct ip_options_rcu *opt;
1039 struct rtable *rt;
1040
1041 /*
1042 * setup for corking.
1043 */
1044 opt = ipc->opt;
1045 if (opt) {
1046 if (cork->opt == NULL) {
1047 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1048 sk->sk_allocation);
1049 if (unlikely(cork->opt == NULL))
1050 return -ENOBUFS;
1051 }
1052 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1053 cork->flags |= IPCORK_OPT;
1054 cork->addr = ipc->addr;
1055 }
1056 rt = *rtp;
1057 if (unlikely(!rt))
1058 return -EFAULT;
1059 /*
1060 * We steal reference to this route, caller should not release it
1061 */
1062 *rtp = NULL;
1063 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1064 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1065 cork->dst = &rt->dst;
1066 cork->length = 0;
1067 cork->tx_flags = ipc->tx_flags;
1068 cork->page = NULL;
1069 cork->off = 0;
1070
1071 return 0;
1072}
1073
1074/*
1075 * ip_append_data() and ip_append_page() can make one large IP datagram
1076 * from many pieces of data. Each pieces will be holded on the socket
1077 * until ip_push_pending_frames() is called. Each piece can be a page
1078 * or non-page data.
1079 *
1080 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1081 * this interface potentially.
1082 *
1083 * LATER: length must be adjusted by pad at tail, when it is required.
1084 */
1085int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1086 int getfrag(void *from, char *to, int offset, int len,
1087 int odd, struct sk_buff *skb),
1088 void *from, int length, int transhdrlen,
1089 struct ipcm_cookie *ipc, struct rtable **rtp,
1090 unsigned int flags)
1091{
1092 struct inet_sock *inet = inet_sk(sk);
1093 int err;
1094
1095 if (flags&MSG_PROBE)
1096 return 0;
1097
1098 if (skb_queue_empty(&sk->sk_write_queue)) {
1099 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1100 if (err)
1101 return err;
1102 } else {
1103 transhdrlen = 0;
1104 }
1105
1106 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1107 from, length, transhdrlen, flags);
1108}
1109
1110ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1088 int offset, size_t size, int flags) 1111 int offset, size_t size, int flags)
1089{ 1112{
1090 struct inet_sock *inet = inet_sk(sk); 1113 struct inet_sock *inet = inet_sk(sk);
1091 struct sk_buff *skb; 1114 struct sk_buff *skb;
1092 struct rtable *rt; 1115 struct rtable *rt;
1093 struct ip_options *opt = NULL; 1116 struct ip_options *opt = NULL;
1117 struct inet_cork *cork;
1094 int hh_len; 1118 int hh_len;
1095 int mtu; 1119 int mtu;
1096 int len; 1120 int len;
@@ -1106,28 +1130,29 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1106 if (skb_queue_empty(&sk->sk_write_queue)) 1130 if (skb_queue_empty(&sk->sk_write_queue))
1107 return -EINVAL; 1131 return -EINVAL;
1108 1132
1109 rt = (struct rtable *)inet->cork.dst; 1133 cork = &inet->cork.base;
1110 if (inet->cork.flags & IPCORK_OPT) 1134 rt = (struct rtable *)cork->dst;
1111 opt = inet->cork.opt; 1135 if (cork->flags & IPCORK_OPT)
1136 opt = cork->opt;
1112 1137
1113 if (!(rt->dst.dev->features&NETIF_F_SG)) 1138 if (!(rt->dst.dev->features&NETIF_F_SG))
1114 return -EOPNOTSUPP; 1139 return -EOPNOTSUPP;
1115 1140
1116 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1141 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1117 mtu = inet->cork.fragsize; 1142 mtu = cork->fragsize;
1118 1143
1119 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 1144 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1120 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 1145 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1121 1146
1122 if (inet->cork.length + size > 0xFFFF - fragheaderlen) { 1147 if (cork->length + size > 0xFFFF - fragheaderlen) {
1123 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); 1148 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1124 return -EMSGSIZE; 1149 return -EMSGSIZE;
1125 } 1150 }
1126 1151
1127 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) 1152 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1128 return -EINVAL; 1153 return -EINVAL;
1129 1154
1130 inet->cork.length += size; 1155 cork->length += size;
1131 if ((size + skb->len > mtu) && 1156 if ((size + skb->len > mtu) &&
1132 (sk->sk_protocol == IPPROTO_UDP) && 1157 (sk->sk_protocol == IPPROTO_UDP) &&
1133 (rt->dst.dev->features & NETIF_F_UFO)) { 1158 (rt->dst.dev->features & NETIF_F_UFO)) {
@@ -1222,45 +1247,47 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
1222 return 0; 1247 return 0;
1223 1248
1224error: 1249error:
1225 inet->cork.length -= size; 1250 cork->length -= size;
1226 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1251 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1227 return err; 1252 return err;
1228} 1253}
1229 1254
1230static void ip_cork_release(struct inet_sock *inet) 1255static void ip_cork_release(struct inet_cork *cork)
1231{ 1256{
1232 inet->cork.flags &= ~IPCORK_OPT; 1257 cork->flags &= ~IPCORK_OPT;
1233 kfree(inet->cork.opt); 1258 kfree(cork->opt);
1234 inet->cork.opt = NULL; 1259 cork->opt = NULL;
1235 dst_release(inet->cork.dst); 1260 dst_release(cork->dst);
1236 inet->cork.dst = NULL; 1261 cork->dst = NULL;
1237} 1262}
1238 1263
1239/* 1264/*
1240 * Combined all pending IP fragments on the socket as one IP datagram 1265 * Combined all pending IP fragments on the socket as one IP datagram
1241 * and push them out. 1266 * and push them out.
1242 */ 1267 */
1243int ip_push_pending_frames(struct sock *sk) 1268struct sk_buff *__ip_make_skb(struct sock *sk,
1269 struct flowi4 *fl4,
1270 struct sk_buff_head *queue,
1271 struct inet_cork *cork)
1244{ 1272{
1245 struct sk_buff *skb, *tmp_skb; 1273 struct sk_buff *skb, *tmp_skb;
1246 struct sk_buff **tail_skb; 1274 struct sk_buff **tail_skb;
1247 struct inet_sock *inet = inet_sk(sk); 1275 struct inet_sock *inet = inet_sk(sk);
1248 struct net *net = sock_net(sk); 1276 struct net *net = sock_net(sk);
1249 struct ip_options *opt = NULL; 1277 struct ip_options *opt = NULL;
1250 struct rtable *rt = (struct rtable *)inet->cork.dst; 1278 struct rtable *rt = (struct rtable *)cork->dst;
1251 struct iphdr *iph; 1279 struct iphdr *iph;
1252 __be16 df = 0; 1280 __be16 df = 0;
1253 __u8 ttl; 1281 __u8 ttl;
1254 int err = 0;
1255 1282
1256 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1283 if ((skb = __skb_dequeue(queue)) == NULL)
1257 goto out; 1284 goto out;
1258 tail_skb = &(skb_shinfo(skb)->frag_list); 1285 tail_skb = &(skb_shinfo(skb)->frag_list);
1259 1286
1260 /* move skb->data to ip header from ext header */ 1287 /* move skb->data to ip header from ext header */
1261 if (skb->data < skb_network_header(skb)) 1288 if (skb->data < skb_network_header(skb))
1262 __skb_pull(skb, skb_network_offset(skb)); 1289 __skb_pull(skb, skb_network_offset(skb));
1263 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1290 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1264 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1291 __skb_pull(tmp_skb, skb_network_header_len(skb));
1265 *tail_skb = tmp_skb; 1292 *tail_skb = tmp_skb;
1266 tail_skb = &(tmp_skb->next); 1293 tail_skb = &(tmp_skb->next);
@@ -1286,8 +1313,8 @@ int ip_push_pending_frames(struct sock *sk)
1286 ip_dont_fragment(sk, &rt->dst))) 1313 ip_dont_fragment(sk, &rt->dst)))
1287 df = htons(IP_DF); 1314 df = htons(IP_DF);
1288 1315
1289 if (inet->cork.flags & IPCORK_OPT) 1316 if (cork->flags & IPCORK_OPT)
1290 opt = inet->cork.opt; 1317 opt = cork->opt;
1291 1318
1292 if (rt->rt_type == RTN_MULTICAST) 1319 if (rt->rt_type == RTN_MULTICAST)
1293 ttl = inet->mc_ttl; 1320 ttl = inet->mc_ttl;
@@ -1297,17 +1324,18 @@ int ip_push_pending_frames(struct sock *sk)
1297 iph = (struct iphdr *)skb->data; 1324 iph = (struct iphdr *)skb->data;
1298 iph->version = 4; 1325 iph->version = 4;
1299 iph->ihl = 5; 1326 iph->ihl = 5;
1300 if (opt) {
1301 iph->ihl += opt->optlen>>2;
1302 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1303 }
1304 iph->tos = inet->tos; 1327 iph->tos = inet->tos;
1305 iph->frag_off = df; 1328 iph->frag_off = df;
1306 ip_select_ident(iph, &rt->dst, sk); 1329 ip_select_ident(iph, &rt->dst, sk);
1307 iph->ttl = ttl; 1330 iph->ttl = ttl;
1308 iph->protocol = sk->sk_protocol; 1331 iph->protocol = sk->sk_protocol;
1309 iph->saddr = rt->rt_src; 1332 iph->saddr = fl4->saddr;
1310 iph->daddr = rt->rt_dst; 1333 iph->daddr = fl4->daddr;
1334
1335 if (opt) {
1336 iph->ihl += opt->optlen>>2;
1337 ip_options_build(skb, opt, cork->addr, rt, 0);
1338 }
1311 1339
1312 skb->priority = sk->sk_priority; 1340 skb->priority = sk->sk_priority;
1313 skb->mark = sk->sk_mark; 1341 skb->mark = sk->sk_mark;
@@ -1315,44 +1343,99 @@ int ip_push_pending_frames(struct sock *sk)
1315 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1343 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1316 * on dst refcount 1344 * on dst refcount
1317 */ 1345 */
1318 inet->cork.dst = NULL; 1346 cork->dst = NULL;
1319 skb_dst_set(skb, &rt->dst); 1347 skb_dst_set(skb, &rt->dst);
1320 1348
1321 if (iph->protocol == IPPROTO_ICMP) 1349 if (iph->protocol == IPPROTO_ICMP)
1322 icmp_out_count(net, ((struct icmphdr *) 1350 icmp_out_count(net, ((struct icmphdr *)
1323 skb_transport_header(skb))->type); 1351 skb_transport_header(skb))->type);
1324 1352
1325 /* Netfilter gets whole the not fragmented skb. */ 1353 ip_cork_release(cork);
1354out:
1355 return skb;
1356}
1357
1358int ip_send_skb(struct sk_buff *skb)
1359{
1360 struct net *net = sock_net(skb->sk);
1361 int err;
1362
1326 err = ip_local_out(skb); 1363 err = ip_local_out(skb);
1327 if (err) { 1364 if (err) {
1328 if (err > 0) 1365 if (err > 0)
1329 err = net_xmit_errno(err); 1366 err = net_xmit_errno(err);
1330 if (err) 1367 if (err)
1331 goto error; 1368 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1332 } 1369 }
1333 1370
1334out:
1335 ip_cork_release(inet);
1336 return err; 1371 return err;
1372}
1337 1373
1338error: 1374int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1339 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1375{
1340 goto out; 1376 struct sk_buff *skb;
1377
1378 skb = ip_finish_skb(sk, fl4);
1379 if (!skb)
1380 return 0;
1381
1382 /* Netfilter gets whole the not fragmented skb. */
1383 return ip_send_skb(skb);
1341} 1384}
1342 1385
1343/* 1386/*
1344 * Throw away all pending data on the socket. 1387 * Throw away all pending data on the socket.
1345 */ 1388 */
1346void ip_flush_pending_frames(struct sock *sk) 1389static void __ip_flush_pending_frames(struct sock *sk,
1390 struct sk_buff_head *queue,
1391 struct inet_cork *cork)
1347{ 1392{
1348 struct sk_buff *skb; 1393 struct sk_buff *skb;
1349 1394
1350 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1395 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1351 kfree_skb(skb); 1396 kfree_skb(skb);
1352 1397
1353 ip_cork_release(inet_sk(sk)); 1398 ip_cork_release(cork);
1354} 1399}
1355 1400
1401void ip_flush_pending_frames(struct sock *sk)
1402{
1403 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1404}
1405
1406struct sk_buff *ip_make_skb(struct sock *sk,
1407 struct flowi4 *fl4,
1408 int getfrag(void *from, char *to, int offset,
1409 int len, int odd, struct sk_buff *skb),
1410 void *from, int length, int transhdrlen,
1411 struct ipcm_cookie *ipc, struct rtable **rtp,
1412 unsigned int flags)
1413{
1414 struct inet_cork cork;
1415 struct sk_buff_head queue;
1416 int err;
1417
1418 if (flags & MSG_PROBE)
1419 return NULL;
1420
1421 __skb_queue_head_init(&queue);
1422
1423 cork.flags = 0;
1424 cork.addr = 0;
1425 cork.opt = NULL;
1426 err = ip_setup_cork(sk, &cork, ipc, rtp);
1427 if (err)
1428 return ERR_PTR(err);
1429
1430 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1431 from, length, transhdrlen, flags);
1432 if (err) {
1433 __ip_flush_pending_frames(sk, &queue, &cork);
1434 return ERR_PTR(err);
1435 }
1436
1437 return __ip_make_skb(sk, fl4, &queue, &cork);
1438}
1356 1439
1357/* 1440/*
1358 * Fetch data from kernel space and fill in checksum if needed. 1441 * Fetch data from kernel space and fill in checksum if needed.
@@ -1374,48 +1457,39 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1374 * Should run single threaded per socket because it uses the sock 1457 * Should run single threaded per socket because it uses the sock
1375 * structure to pass arguments. 1458 * structure to pass arguments.
1376 */ 1459 */
1377void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, 1460void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1378 unsigned int len) 1461 struct ip_reply_arg *arg, unsigned int len)
1379{ 1462{
1380 struct inet_sock *inet = inet_sk(sk); 1463 struct inet_sock *inet = inet_sk(sk);
1381 struct { 1464 struct ip_options_data replyopts;
1382 struct ip_options opt;
1383 char data[40];
1384 } replyopts;
1385 struct ipcm_cookie ipc; 1465 struct ipcm_cookie ipc;
1386 __be32 daddr; 1466 struct flowi4 fl4;
1387 struct rtable *rt = skb_rtable(skb); 1467 struct rtable *rt = skb_rtable(skb);
1388 1468
1389 if (ip_options_echo(&replyopts.opt, skb)) 1469 if (ip_options_echo(&replyopts.opt.opt, skb))
1390 return; 1470 return;
1391 1471
1392 daddr = ipc.addr = rt->rt_src; 1472 ipc.addr = daddr;
1393 ipc.opt = NULL; 1473 ipc.opt = NULL;
1394 ipc.shtx.flags = 0; 1474 ipc.tx_flags = 0;
1395 1475
1396 if (replyopts.opt.optlen) { 1476 if (replyopts.opt.opt.optlen) {
1397 ipc.opt = &replyopts.opt; 1477 ipc.opt = &replyopts.opt;
1398 1478
1399 if (ipc.opt->srr) 1479 if (replyopts.opt.opt.srr)
1400 daddr = replyopts.opt.faddr; 1480 daddr = replyopts.opt.opt.faddr;
1401 } 1481 }
1402 1482
1403 { 1483 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1404 struct flowi fl = { .oif = arg->bound_dev_if, 1484 RT_TOS(ip_hdr(skb)->tos),
1405 .nl_u = { .ip4_u = 1485 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1406 { .daddr = daddr, 1486 ip_reply_arg_flowi_flags(arg),
1407 .saddr = rt->rt_spec_dst, 1487 daddr, rt->rt_spec_dst,
1408 .tos = RT_TOS(ip_hdr(skb)->tos) } }, 1488 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1409 /* Not quite clean, but right. */ 1489 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1410 .uli_u = { .ports = 1490 rt = ip_route_output_key(sock_net(sk), &fl4);
1411 { .sport = tcp_hdr(skb)->dest, 1491 if (IS_ERR(rt))
1412 .dport = tcp_hdr(skb)->source } }, 1492 return;
1413 .proto = sk->sk_protocol,
1414 .flags = ip_reply_arg_flowi_flags(arg) };
1415 security_skb_classify_flow(skb, &fl);
1416 if (ip_route_output_key(sock_net(sk), &rt, &fl))
1417 return;
1418 }
1419 1493
1420 /* And let IP do all the hard work. 1494 /* And let IP do all the hard work.
1421 1495
@@ -1428,7 +1502,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1428 sk->sk_priority = skb->priority; 1502 sk->sk_priority = skb->priority;
1429 sk->sk_protocol = ip_hdr(skb)->protocol; 1503 sk->sk_protocol = ip_hdr(skb)->protocol;
1430 sk->sk_bound_dev_if = arg->bound_dev_if; 1504 sk->sk_bound_dev_if = arg->bound_dev_if;
1431 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1505 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1432 &ipc, &rt, MSG_DONTWAIT); 1506 &ipc, &rt, MSG_DONTWAIT);
1433 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1507 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1434 if (arg->csumoffset >= 0) 1508 if (arg->csumoffset >= 0)
@@ -1436,7 +1510,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1436 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1510 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1437 arg->csum)); 1511 arg->csum));
1438 skb->ip_summed = CHECKSUM_NONE; 1512 skb->ip_summed = CHECKSUM_NONE;
1439 ip_push_pending_frames(sk); 1513 ip_push_pending_frames(sk, &fl4);
1440 } 1514 }
1441 1515
1442 bh_unlock_sock(sk); 1516 bh_unlock_sock(sk);
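
The corking rework above splits the per-socket state out of ip_append_data(): __ip_append_data() now works on an explicit queue and struct inet_cork, and the new ip_make_skb()/ip_send_skb() pair lets a datagram protocol build one complete skb without corking the socket. A hedged sketch of that calling pattern; example_send_dgram() and my_getfrag() are invented names, and fl4, ipc and rt are assumed to have been prepared by an earlier route lookup, roughly as udp_sendmsg() would do:

#include <net/ip.h>
#include <linux/udp.h>

/* payload copier with the getfrag() signature, provided elsewhere */
static int my_getfrag(void *from, char *to, int offset, int len,
		      int odd, struct sk_buff *skb);

static int example_send_dgram(struct sock *sk, struct flowi4 *fl4,
			      struct ipcm_cookie *ipc, struct rtable *rt,
			      void *payload, int len)
{
	struct sk_buff *skb;

	/* Builds a single datagram on a private queue; the rt reference
	 * is stolen by the cork setup, so the caller must not release it. */
	skb = ip_make_skb(sk, fl4, my_getfrag, payload, len,
			  sizeof(struct udphdr), ipc, &rt, MSG_DONTWAIT);
	if (IS_ERR_OR_NULL(skb))
		return PTR_ERR(skb);

	/* ... fill in the transport header (ports, checksum) here ... */

	return ip_send_skb(skb);	/* hands the skb to ip_local_out() */
}
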
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64b70ad162e3..ab0c9efd1efa 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -131,7 +131,7 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
131static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) 131static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
132{ 132{
133 struct sockaddr_in sin; 133 struct sockaddr_in sin;
134 struct iphdr *iph = ip_hdr(skb); 134 const struct iphdr *iph = ip_hdr(skb);
135 __be16 *ports = (__be16 *)skb_transport_header(skb); 135 __be16 *ports = (__be16 *)skb_transport_header(skb);
136 136
137 if (skb_transport_offset(skb) + 4 > skb->len) 137 if (skb_transport_offset(skb) + 4 > skb->len)
@@ -238,7 +238,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
238 but receiver should be enough clever f.e. to forward mtrace requests, 238 but receiver should be enough clever f.e. to forward mtrace requests,
239 sent to multicast group to reach destination designated router. 239 sent to multicast group to reach destination designated router.
240 */ 240 */
241struct ip_ra_chain *ip_ra_chain; 241struct ip_ra_chain __rcu *ip_ra_chain;
242static DEFINE_SPINLOCK(ip_ra_lock); 242static DEFINE_SPINLOCK(ip_ra_lock);
243 243
244 244
@@ -253,7 +253,8 @@ static void ip_ra_destroy_rcu(struct rcu_head *head)
253int ip_ra_control(struct sock *sk, unsigned char on, 253int ip_ra_control(struct sock *sk, unsigned char on,
254 void (*destructor)(struct sock *)) 254 void (*destructor)(struct sock *))
255{ 255{
256 struct ip_ra_chain *ra, *new_ra, **rap; 256 struct ip_ra_chain *ra, *new_ra;
257 struct ip_ra_chain __rcu **rap;
257 258
258 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) 259 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
259 return -EINVAL; 260 return -EINVAL;
@@ -261,7 +262,10 @@ int ip_ra_control(struct sock *sk, unsigned char on,
261 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 262 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
262 263
263 spin_lock_bh(&ip_ra_lock); 264 spin_lock_bh(&ip_ra_lock);
264 for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { 265 for (rap = &ip_ra_chain;
266 (ra = rcu_dereference_protected(*rap,
267 lockdep_is_held(&ip_ra_lock))) != NULL;
268 rap = &ra->next) {
265 if (ra->sk == sk) { 269 if (ra->sk == sk) {
266 if (on) { 270 if (on) {
267 spin_unlock_bh(&ip_ra_lock); 271 spin_unlock_bh(&ip_ra_lock);
@@ -447,6 +451,11 @@ out:
447} 451}
448 452
449 453
454static void opt_kfree_rcu(struct rcu_head *head)
455{
456 kfree(container_of(head, struct ip_options_rcu, rcu));
457}
458
450/* 459/*
451 * Socket option code for IP. This is the end of the line after any 460 * Socket option code for IP. This is the end of the line after any
452 * TCP,UDP etc options on an IP socket. 461 * TCP,UDP etc options on an IP socket.
@@ -493,13 +502,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
493 switch (optname) { 502 switch (optname) {
494 case IP_OPTIONS: 503 case IP_OPTIONS:
495 { 504 {
496 struct ip_options *opt = NULL; 505 struct ip_options_rcu *old, *opt = NULL;
506
497 if (optlen > 40) 507 if (optlen > 40)
498 goto e_inval; 508 goto e_inval;
499 err = ip_options_get_from_user(sock_net(sk), &opt, 509 err = ip_options_get_from_user(sock_net(sk), &opt,
500 optval, optlen); 510 optval, optlen);
501 if (err) 511 if (err)
502 break; 512 break;
513 old = rcu_dereference_protected(inet->inet_opt,
514 sock_owned_by_user(sk));
503 if (inet->is_icsk) { 515 if (inet->is_icsk) {
504 struct inet_connection_sock *icsk = inet_csk(sk); 516 struct inet_connection_sock *icsk = inet_csk(sk);
505#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 517#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -508,17 +520,18 @@ static int do_ip_setsockopt(struct sock *sk, int level,
508 (TCPF_LISTEN | TCPF_CLOSE)) && 520 (TCPF_LISTEN | TCPF_CLOSE)) &&
509 inet->inet_daddr != LOOPBACK4_IPV6)) { 521 inet->inet_daddr != LOOPBACK4_IPV6)) {
510#endif 522#endif
511 if (inet->opt) 523 if (old)
512 icsk->icsk_ext_hdr_len -= inet->opt->optlen; 524 icsk->icsk_ext_hdr_len -= old->opt.optlen;
513 if (opt) 525 if (opt)
514 icsk->icsk_ext_hdr_len += opt->optlen; 526 icsk->icsk_ext_hdr_len += opt->opt.optlen;
515 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); 527 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
516#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 528#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
517 } 529 }
518#endif 530#endif
519 } 531 }
520 opt = xchg(&inet->opt, opt); 532 rcu_assign_pointer(inet->inet_opt, opt);
521 kfree(opt); 533 if (old)
534 call_rcu(&old->rcu, opt_kfree_rcu);
522 break; 535 break;
523 } 536 }
524 case IP_PKTINFO: 537 case IP_PKTINFO:
@@ -1077,12 +1090,16 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1077 case IP_OPTIONS: 1090 case IP_OPTIONS:
1078 { 1091 {
1079 unsigned char optbuf[sizeof(struct ip_options)+40]; 1092 unsigned char optbuf[sizeof(struct ip_options)+40];
1080 struct ip_options * opt = (struct ip_options *)optbuf; 1093 struct ip_options *opt = (struct ip_options *)optbuf;
1094 struct ip_options_rcu *inet_opt;
1095
1096 inet_opt = rcu_dereference_protected(inet->inet_opt,
1097 sock_owned_by_user(sk));
1081 opt->optlen = 0; 1098 opt->optlen = 0;
1082 if (inet->opt) 1099 if (inet_opt)
1083 memcpy(optbuf, inet->opt, 1100 memcpy(optbuf, &inet_opt->opt,
1084 sizeof(struct ip_options)+ 1101 sizeof(struct ip_options) +
1085 inet->opt->optlen); 1102 inet_opt->opt.optlen);
1086 release_sock(sk); 1103 release_sock(sk);
1087 1104
1088 if (opt->optlen == 0) 1105 if (opt->optlen == 0)
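
With this change IP_OPTIONS swaps inet->inet_opt under the socket lock with rcu_assign_pointer() and defers freeing the old block through call_rcu(..., opt_kfree_rcu), so readers on the transmit path only need rcu_read_lock(). A minimal read-side sketch in the style ip_queue_xmit() uses after this series; the function name is invented:

static void example_peek_options(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		pr_debug("source routed, first hop %pI4\n",
			 &inet_opt->opt.faddr);
	rcu_read_unlock();	/* inet_opt may be freed after this point */
}
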
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 629067571f02..c857f6f49b03 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -27,7 +27,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
27{ 27{
28 struct net *net = dev_net(skb->dev); 28 struct net *net = dev_net(skb->dev);
29 __be32 spi; 29 __be32 spi;
30 struct iphdr *iph = (struct iphdr *)skb->data; 30 const struct iphdr *iph = (const struct iphdr *)skb->data;
31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
32 struct xfrm_state *x; 32 struct xfrm_state *x;
33 33
@@ -36,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
36 return; 36 return;
37 37
38 spi = htonl(ntohs(ipch->cpi)); 38 spi = htonl(ntohs(ipch->cpi));
39 x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, 39 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
40 spi, IPPROTO_COMP, AF_INET); 40 spi, IPPROTO_COMP, AF_INET);
41 if (!x) 41 if (!x)
42 return; 42 return;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 3a6e1ec5e9ae..ab7e5542c1cf 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -87,8 +87,8 @@
87#endif 87#endif
88 88
89/* Define the friendly delay before and after opening net devices */ 89/* Define the friendly delay before and after opening net devices */
90#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */ 90#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
91#define CONF_POST_OPEN 1 /* After opening: 1 second */ 91#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */
92 92
93/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ 93/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
94#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ 94#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
@@ -188,14 +188,14 @@ struct ic_device {
188static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ 188static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
189static struct net_device *ic_dev __initdata = NULL; /* Selected device */ 189static struct net_device *ic_dev __initdata = NULL; /* Selected device */
190 190
191static bool __init ic_device_match(struct net_device *dev) 191static bool __init ic_is_init_dev(struct net_device *dev)
192{ 192{
193 if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : 193 if (dev->flags & IFF_LOOPBACK)
194 return false;
195 return user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
194 (!(dev->flags & IFF_LOOPBACK) && 196 (!(dev->flags & IFF_LOOPBACK) &&
195 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && 197 (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
196 strncmp(dev->name, "dummy", 5))) 198 strncmp(dev->name, "dummy", 5));
197 return true;
198 return false;
199} 199}
200 200
201static int __init ic_open_devs(void) 201static int __init ic_open_devs(void)
@@ -203,6 +203,7 @@ static int __init ic_open_devs(void)
203 struct ic_device *d, **last; 203 struct ic_device *d, **last;
204 struct net_device *dev; 204 struct net_device *dev;
205 unsigned short oflags; 205 unsigned short oflags;
206 unsigned long start;
206 207
207 last = &ic_first_dev; 208 last = &ic_first_dev;
208 rtnl_lock(); 209 rtnl_lock();
@@ -216,9 +217,7 @@ static int __init ic_open_devs(void)
216 } 217 }
217 218
218 for_each_netdev(&init_net, dev) { 219 for_each_netdev(&init_net, dev) {
219 if (dev->flags & IFF_LOOPBACK) 220 if (ic_is_init_dev(dev)) {
220 continue;
221 if (ic_device_match(dev)) {
222 int able = 0; 221 int able = 0;
223 if (dev->mtu >= 364) 222 if (dev->mtu >= 364)
224 able |= IC_BOOTP; 223 able |= IC_BOOTP;
@@ -252,6 +251,17 @@ static int __init ic_open_devs(void)
252 dev->name, able, d->xid)); 251 dev->name, able, d->xid));
253 } 252 }
254 } 253 }
254
255 /* wait for a carrier on at least one device */
256 start = jiffies;
257 while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
258 for_each_netdev(&init_net, dev)
259 if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
260 goto have_carrier;
261
262 msleep(1);
263 }
264have_carrier:
255 rtnl_unlock(); 265 rtnl_unlock();
256 266
257 *last = NULL; 267 *last = NULL;
@@ -1191,13 +1201,13 @@ static int __init ic_dynamic(void)
1191 (ic_proto_enabled & IC_USE_DHCP) && 1201 (ic_proto_enabled & IC_USE_DHCP) &&
1192 ic_dhcp_msgtype != DHCPACK) { 1202 ic_dhcp_msgtype != DHCPACK) {
1193 ic_got_reply = 0; 1203 ic_got_reply = 0;
1194 printk(","); 1204 printk(KERN_CONT ",");
1195 continue; 1205 continue;
1196 } 1206 }
1197#endif /* IPCONFIG_DHCP */ 1207#endif /* IPCONFIG_DHCP */
1198 1208
1199 if (ic_got_reply) { 1209 if (ic_got_reply) {
1200 printk(" OK\n"); 1210 printk(KERN_CONT " OK\n");
1201 break; 1211 break;
1202 } 1212 }
1203 1213
@@ -1205,7 +1215,7 @@ static int __init ic_dynamic(void)
1205 continue; 1215 continue;
1206 1216
1207 if (! --retries) { 1217 if (! --retries) {
1208 printk(" timed out!\n"); 1218 printk(KERN_CONT " timed out!\n");
1209 break; 1219 break;
1210 } 1220 }
1211 1221
@@ -1215,7 +1225,7 @@ static int __init ic_dynamic(void)
1215 if (timeout > CONF_TIMEOUT_MAX) 1225 if (timeout > CONF_TIMEOUT_MAX)
1216 timeout = CONF_TIMEOUT_MAX; 1226 timeout = CONF_TIMEOUT_MAX;
1217 1227
1218 printk("."); 1228 printk(KERN_CONT ".");
1219 } 1229 }
1220 1230
1221#ifdef IPCONFIG_BOOTP 1231#ifdef IPCONFIG_BOOTP
@@ -1236,7 +1246,7 @@ static int __init ic_dynamic(void)
1236 ((ic_got_reply & IC_RARP) ? "RARP" 1246 ((ic_got_reply & IC_RARP) ? "RARP"
1237 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1247 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1238 &ic_servaddr); 1248 &ic_servaddr);
1239 printk("my address is %pI4\n", &ic_myaddr); 1249 printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
1240 1250
1241 return 0; 1251 return 0;
1242} 1252}
@@ -1324,14 +1334,13 @@ static int __init wait_for_devices(void)
1324{ 1334{
1325 int i; 1335 int i;
1326 1336
1327 msleep(CONF_PRE_OPEN);
1328 for (i = 0; i < DEVICE_WAIT_MAX; i++) { 1337 for (i = 0; i < DEVICE_WAIT_MAX; i++) {
1329 struct net_device *dev; 1338 struct net_device *dev;
1330 int found = 0; 1339 int found = 0;
1331 1340
1332 rtnl_lock(); 1341 rtnl_lock();
1333 for_each_netdev(&init_net, dev) { 1342 for_each_netdev(&init_net, dev) {
1334 if (ic_device_match(dev)) { 1343 if (ic_is_init_dev(dev)) {
1335 found = 1; 1344 found = 1;
1336 break; 1345 break;
1337 } 1346 }
@@ -1378,7 +1387,7 @@ static int __init ip_auto_config(void)
1378 return err; 1387 return err;
1379 1388
1380 /* Give drivers a chance to settle */ 1389 /* Give drivers a chance to settle */
1381 ssleep(CONF_POST_OPEN); 1390 msleep(CONF_POST_OPEN);
1382 1391
1383 /* 1392 /*
1384 * If the config information is insufficient (e.g., our IP address or 1393 * If the config information is insufficient (e.g., our IP address or
@@ -1444,7 +1453,7 @@ static int __init ip_auto_config(void)
1444 root_server_addr = addr; 1453 root_server_addr = addr;
1445 1454
1446 /* 1455 /*
1447 * Use defaults whereever applicable. 1456 * Use defaults wherever applicable.
1448 */ 1457 */
1449 if (ic_defaults() < 0) 1458 if (ic_defaults() < 0)
1450 return -1; 1459 return -1;
@@ -1468,19 +1477,19 @@ static int __init ip_auto_config(void)
1468 /* 1477 /*
1469 * Clue in the operator. 1478 * Clue in the operator.
1470 */ 1479 */
1471 printk("IP-Config: Complete:"); 1480 printk("IP-Config: Complete:\n");
1472 printk("\n device=%s", ic_dev->name); 1481 printk(" device=%s", ic_dev->name);
1473 printk(", addr=%pI4", &ic_myaddr); 1482 printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
1474 printk(", mask=%pI4", &ic_netmask); 1483 printk(KERN_CONT ", mask=%pI4", &ic_netmask);
1475 printk(", gw=%pI4", &ic_gateway); 1484 printk(KERN_CONT ", gw=%pI4", &ic_gateway);
1476 printk(",\n host=%s, domain=%s, nis-domain=%s", 1485 printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
1477 utsname()->nodename, ic_domain, utsname()->domainname); 1486 utsname()->nodename, ic_domain, utsname()->domainname);
1478 printk(",\n bootserver=%pI4", &ic_servaddr); 1487 printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
1479 printk(", rootserver=%pI4", &root_server_addr); 1488 printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
1480 printk(", rootpath=%s", root_server_path); 1489 printk(KERN_CONT ", rootpath=%s", root_server_path);
1481 if (ic_dev_mtu) 1490 if (ic_dev_mtu)
1482 printk(", mtu=%d", ic_dev_mtu); 1491 printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
1483 printk("\n"); 1492 printk(KERN_CONT "\n");
1484#endif /* !SILENT */ 1493#endif /* !SILENT */
1485 1494
1486 return 0; 1495 return 0;
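
Besides the KERN_CONT printk cleanups, ipconfig now drops the fixed pre-open delay and instead waits up to CONF_CARRIER_TIMEOUT (120 s) for at least one candidate interface to report carrier. The patch open-codes the deadline as jiffies arithmetic; the sketch below expresses the same loop with the time_before() idiom purely for illustration, it is not the patch's code, and the caller is assumed to hold the RTNL as ic_open_devs() does:

static void example_wait_for_carrier(void)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT);
	struct net_device *dev;

	while (time_before(jiffies, deadline)) {
		for_each_netdev(&init_net, dev)
			if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
				return;		/* a link is up, stop waiting */
		msleep(1);
	}
	/* timed out: autoconfiguration proceeds and may still succeed */
}
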
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec036731a70b..378b20b7ca6e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
122 122
123static int ipip_net_id __read_mostly; 123static int ipip_net_id __read_mostly;
124struct ipip_net { 124struct ipip_net {
125 struct ip_tunnel *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
126 struct ip_tunnel *tunnels_r[HASH_SIZE]; 126 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
127 struct ip_tunnel *tunnels_l[HASH_SIZE]; 127 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
128 struct ip_tunnel *tunnels_wc[1]; 128 struct ip_tunnel __rcu *tunnels_wc[1];
129 struct ip_tunnel **tunnels[4]; 129 struct ip_tunnel __rcu **tunnels[4];
130 130
131 struct net_device *fb_tunnel_dev; 131 struct net_device *fb_tunnel_dev;
132}; 132};
133 133
134static void ipip_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
135static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
136static void ipip_dev_free(struct net_device *dev);
136 137
137/* 138/*
138 * Locking : hash tables are protected by RCU and a spinlock 139 * Locking : hash tables are protected by RCU and RTNL
139 */ 140 */
140static DEFINE_SPINLOCK(ipip_lock);
141 141
142#define for_each_ip_tunnel_rcu(start) \ 142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) 143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144 144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154{
155 struct pcpu_tstats sum = { 0 };
156 int i;
157
158 for_each_possible_cpu(i) {
159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160
161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
171}
172
145static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
146 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
147{ 175{
148 unsigned h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
149 unsigned h1 = HASH(local); 177 unsigned int h1 = HASH(local);
150 struct ip_tunnel *t; 178 struct ip_tunnel *t;
151 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
152 180
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
169 return NULL; 197 return NULL;
170} 198}
171 199
172static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, 200static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
173 struct ip_tunnel_parm *parms) 201 struct ip_tunnel_parm *parms)
174{ 202{
175 __be32 remote = parms->iph.daddr; 203 __be32 remote = parms->iph.daddr;
176 __be32 local = parms->iph.saddr; 204 __be32 local = parms->iph.saddr;
177 unsigned h = 0; 205 unsigned int h = 0;
178 int prio = 0; 206 int prio = 0;
179 207
180 if (remote) { 208 if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
188 return &ipn->tunnels[prio][h]; 216 return &ipn->tunnels[prio][h];
189} 217}
190 218
191static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, 219static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
192 struct ip_tunnel *t) 220 struct ip_tunnel *t)
193{ 221{
194 return __ipip_bucket(ipn, &t->parms); 222 return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
196 224
197static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) 225static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
198{ 226{
199 struct ip_tunnel **tp; 227 struct ip_tunnel __rcu **tp;
200 228 struct ip_tunnel *iter;
201 for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { 229
202 if (t == *tp) { 230 for (tp = ipip_bucket(ipn, t);
203 spin_lock_bh(&ipip_lock); 231 (iter = rtnl_dereference(*tp)) != NULL;
204 *tp = t->next; 232 tp = &iter->next) {
205 spin_unlock_bh(&ipip_lock); 233 if (t == iter) {
234 rcu_assign_pointer(*tp, t->next);
206 break; 235 break;
207 } 236 }
208 } 237 }
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
210 239
211static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) 240static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
212{ 241{
213 struct ip_tunnel **tp = ipip_bucket(ipn, t); 242 struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
214 243
215 spin_lock_bh(&ipip_lock); 244 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
216 t->next = *tp;
217 rcu_assign_pointer(*tp, t); 245 rcu_assign_pointer(*tp, t);
218 spin_unlock_bh(&ipip_lock);
219} 246}
220 247
221static struct ip_tunnel * ipip_tunnel_locate(struct net *net, 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
223{ 250{
224 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
225 __be32 local = parms->iph.saddr; 252 __be32 local = parms->iph.saddr;
226 struct ip_tunnel *t, **tp, *nt; 253 struct ip_tunnel *t, *nt;
254 struct ip_tunnel __rcu **tp;
227 struct net_device *dev; 255 struct net_device *dev;
228 char name[IFNAMSIZ]; 256 char name[IFNAMSIZ];
229 struct ipip_net *ipn = net_generic(net, ipip_net_id); 257 struct ipip_net *ipn = net_generic(net, ipip_net_id);
230 258
231 for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { 259 for (tp = __ipip_bucket(ipn, parms);
260 (t = rtnl_dereference(*tp)) != NULL;
261 tp = &t->next) {
232 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) 262 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
233 return t; 263 return t;
234 } 264 }
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
238 if (parms->name[0]) 268 if (parms->name[0])
239 strlcpy(name, parms->name, IFNAMSIZ); 269 strlcpy(name, parms->name, IFNAMSIZ);
240 else 270 else
241 sprintf(name, "tunl%%d"); 271 strcpy(name, "tunl%d");
242 272
243 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); 273 dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
244 if (dev == NULL) 274 if (dev == NULL)
@@ -246,15 +276,11 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
246 276
247 dev_net_set(dev, net); 277 dev_net_set(dev, net);
248 278
249 if (strchr(name, '%')) {
250 if (dev_alloc_name(dev, name) < 0)
251 goto failed_free;
252 }
253
254 nt = netdev_priv(dev); 279 nt = netdev_priv(dev);
255 nt->parms = *parms; 280 nt->parms = *parms;
256 281
257 ipip_tunnel_init(dev); 282 if (ipip_tunnel_init(dev) < 0)
283 goto failed_free;
258 284
259 if (register_netdevice(dev) < 0) 285 if (register_netdevice(dev) < 0)
260 goto failed_free; 286 goto failed_free;
@@ -264,20 +290,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
264 return nt; 290 return nt;
265 291
266failed_free: 292failed_free:
267 free_netdev(dev); 293 ipip_dev_free(dev);
268 return NULL; 294 return NULL;
269} 295}
270 296
297/* called with RTNL */
271static void ipip_tunnel_uninit(struct net_device *dev) 298static void ipip_tunnel_uninit(struct net_device *dev)
272{ 299{
273 struct net *net = dev_net(dev); 300 struct net *net = dev_net(dev);
274 struct ipip_net *ipn = net_generic(net, ipip_net_id); 301 struct ipip_net *ipn = net_generic(net, ipip_net_id);
275 302
276 if (dev == ipn->fb_tunnel_dev) { 303 if (dev == ipn->fb_tunnel_dev)
277 spin_lock_bh(&ipip_lock); 304 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
278 ipn->tunnels_wc[0] = NULL; 305 else
279 spin_unlock_bh(&ipip_lock);
280 } else
281 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 306 ipip_tunnel_unlink(ipn, netdev_priv(dev));
282 dev_put(dev); 307 dev_put(dev);
283} 308}
@@ -289,7 +314,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
289 8 bytes of packet payload. It means, that precise relaying of 314 8 bytes of packet payload. It means, that precise relaying of
290 ICMP in the real Internet is absolutely infeasible. 315 ICMP in the real Internet is absolutely infeasible.
291 */ 316 */
292 struct iphdr *iph = (struct iphdr *)skb->data; 317 const struct iphdr *iph = (const struct iphdr *)skb->data;
293 const int type = icmp_hdr(skb)->type; 318 const int type = icmp_hdr(skb)->type;
294 const int code = icmp_hdr(skb)->code; 319 const int code = icmp_hdr(skb)->code;
295 struct ip_tunnel *t; 320 struct ip_tunnel *t;
@@ -359,8 +384,10 @@ static int ipip_rcv(struct sk_buff *skb)
359 const struct iphdr *iph = ip_hdr(skb); 384 const struct iphdr *iph = ip_hdr(skb);
360 385
361 rcu_read_lock(); 386 rcu_read_lock();
362 if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), 387 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
363 iph->saddr, iph->daddr)) != NULL) { 388 if (tunnel != NULL) {
389 struct pcpu_tstats *tstats;
390
364 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { 391 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
365 rcu_read_unlock(); 392 rcu_read_unlock();
366 kfree_skb(skb); 393 kfree_skb(skb);
@@ -374,10 +401,16 @@ static int ipip_rcv(struct sk_buff *skb)
374 skb->protocol = htons(ETH_P_IP); 401 skb->protocol = htons(ETH_P_IP);
375 skb->pkt_type = PACKET_HOST; 402 skb->pkt_type = PACKET_HOST;
376 403
377 skb_tunnel_rx(skb, tunnel->dev); 404 tstats = this_cpu_ptr(tunnel->dev->tstats);
405 tstats->rx_packets++;
406 tstats->rx_bytes += skb->len;
407
408 __skb_tunnel_rx(skb, tunnel->dev);
378 409
379 ipip_ecn_decapsulate(iph, skb); 410 ipip_ecn_decapsulate(iph, skb);
411
380 netif_rx(skb); 412 netif_rx(skb);
413
381 rcu_read_unlock(); 414 rcu_read_unlock();
382 return 0; 415 return 0;
383 } 416 }
@@ -394,52 +427,49 @@ static int ipip_rcv(struct sk_buff *skb)
394static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 427static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
395{ 428{
396 struct ip_tunnel *tunnel = netdev_priv(dev); 429 struct ip_tunnel *tunnel = netdev_priv(dev);
397 struct net_device_stats *stats = &dev->stats; 430 struct pcpu_tstats *tstats;
398 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); 431 const struct iphdr *tiph = &tunnel->parms.iph;
399 struct iphdr *tiph = &tunnel->parms.iph;
400 u8 tos = tunnel->parms.iph.tos; 432 u8 tos = tunnel->parms.iph.tos;
401 __be16 df = tiph->frag_off; 433 __be16 df = tiph->frag_off;
402 struct rtable *rt; /* Route to the other host */ 434 struct rtable *rt; /* Route to the other host */
403 struct net_device *tdev; /* Device to other host */ 435 struct net_device *tdev; /* Device to other host */
404 struct iphdr *old_iph = ip_hdr(skb); 436 const struct iphdr *old_iph = ip_hdr(skb);
405 struct iphdr *iph; /* Our new IP header */ 437 struct iphdr *iph; /* Our new IP header */
406 unsigned int max_headroom; /* The extra header space needed */ 438 unsigned int max_headroom; /* The extra header space needed */
407 __be32 dst = tiph->daddr; 439 __be32 dst = tiph->daddr;
440 struct flowi4 fl4;
408 int mtu; 441 int mtu;
409 442
410 if (skb->protocol != htons(ETH_P_IP)) 443 if (skb->protocol != htons(ETH_P_IP))
411 goto tx_error; 444 goto tx_error;
412 445
413 if (tos&1) 446 if (tos & 1)
414 tos = old_iph->tos; 447 tos = old_iph->tos;
415 448
416 if (!dst) { 449 if (!dst) {
417 /* NBMA tunnel */ 450 /* NBMA tunnel */
418 if ((rt = skb_rtable(skb)) == NULL) { 451 if ((rt = skb_rtable(skb)) == NULL) {
419 stats->tx_fifo_errors++; 452 dev->stats.tx_fifo_errors++;
420 goto tx_error; 453 goto tx_error;
421 } 454 }
422 if ((dst = rt->rt_gateway) == 0) 455 if ((dst = rt->rt_gateway) == 0)
423 goto tx_error_icmp; 456 goto tx_error_icmp;
424 } 457 }
425 458
426 { 459 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
427 struct flowi fl = { .oif = tunnel->parms.link, 460 dst, tiph->saddr,
428 .nl_u = { .ip4_u = 461 0, 0,
429 { .daddr = dst, 462 IPPROTO_IPIP, RT_TOS(tos),
430 .saddr = tiph->saddr, 463 tunnel->parms.link);
431 .tos = RT_TOS(tos) } }, 464 if (IS_ERR(rt)) {
432 .proto = IPPROTO_IPIP }; 465 dev->stats.tx_carrier_errors++;
433 if (ip_route_output_key(dev_net(dev), &rt, &fl)) { 466 goto tx_error_icmp;
434 stats->tx_carrier_errors++;
435 goto tx_error_icmp;
436 }
437 } 467 }
438 tdev = rt->dst.dev; 468 tdev = rt->dst.dev;
439 469
440 if (tdev == dev) { 470 if (tdev == dev) {
441 ip_rt_put(rt); 471 ip_rt_put(rt);
442 stats->collisions++; 472 dev->stats.collisions++;
443 goto tx_error; 473 goto tx_error;
444 } 474 }
445 475
@@ -449,7 +479,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
449 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 479 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
450 480
451 if (mtu < 68) { 481 if (mtu < 68) {
452 stats->collisions++; 482 dev->stats.collisions++;
453 ip_rt_put(rt); 483 ip_rt_put(rt);
454 goto tx_error; 484 goto tx_error;
455 } 485 }
@@ -485,7 +515,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
485 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); 515 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
486 if (!new_skb) { 516 if (!new_skb) {
487 ip_rt_put(rt); 517 ip_rt_put(rt);
488 txq->tx_dropped++; 518 dev->stats.tx_dropped++;
489 dev_kfree_skb(skb); 519 dev_kfree_skb(skb);
490 return NETDEV_TX_OK; 520 return NETDEV_TX_OK;
491 } 521 }
@@ -515,21 +545,21 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
515 iph->frag_off = df; 545 iph->frag_off = df;
516 iph->protocol = IPPROTO_IPIP; 546 iph->protocol = IPPROTO_IPIP;
517 iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); 547 iph->tos = INET_ECN_encapsulate(tos, old_iph->tos);
518 iph->daddr = rt->rt_dst; 548 iph->daddr = fl4.daddr;
519 iph->saddr = rt->rt_src; 549 iph->saddr = fl4.saddr;
520 550
521 if ((iph->ttl = tiph->ttl) == 0) 551 if ((iph->ttl = tiph->ttl) == 0)
522 iph->ttl = old_iph->ttl; 552 iph->ttl = old_iph->ttl;
523 553
524 nf_reset(skb); 554 nf_reset(skb);
525 555 tstats = this_cpu_ptr(dev->tstats);
526 IPTUNNEL_XMIT(); 556 __IPTUNNEL_XMIT(tstats, &dev->stats);
527 return NETDEV_TX_OK; 557 return NETDEV_TX_OK;
528 558
529tx_error_icmp: 559tx_error_icmp:
530 dst_link_failure(skb); 560 dst_link_failure(skb);
531tx_error: 561tx_error:
532 stats->tx_errors++; 562 dev->stats.tx_errors++;
533 dev_kfree_skb(skb); 563 dev_kfree_skb(skb);
534 return NETDEV_TX_OK; 564 return NETDEV_TX_OK;
535} 565}
@@ -538,20 +568,22 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
538{ 568{
539 struct net_device *tdev = NULL; 569 struct net_device *tdev = NULL;
540 struct ip_tunnel *tunnel; 570 struct ip_tunnel *tunnel;
541 struct iphdr *iph; 571 const struct iphdr *iph;
542 572
543 tunnel = netdev_priv(dev); 573 tunnel = netdev_priv(dev);
544 iph = &tunnel->parms.iph; 574 iph = &tunnel->parms.iph;
545 575
546 if (iph->daddr) { 576 if (iph->daddr) {
547 struct flowi fl = { .oif = tunnel->parms.link,
548 .nl_u = { .ip4_u =
549 { .daddr = iph->daddr,
550 .saddr = iph->saddr,
551 .tos = RT_TOS(iph->tos) } },
552 .proto = IPPROTO_IPIP };
553 struct rtable *rt; 577 struct rtable *rt;
554 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { 578 struct flowi4 fl4;
579
580 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
581 iph->daddr, iph->saddr,
582 0, 0,
583 IPPROTO_IPIP,
584 RT_TOS(iph->tos),
585 tunnel->parms.link);
586 if (!IS_ERR(rt)) {
555 tdev = rt->dst.dev; 587 tdev = rt->dst.dev;
556 ip_rt_put(rt); 588 ip_rt_put(rt);
557 } 589 }
@@ -627,6 +659,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
627 } 659 }
628 t = netdev_priv(dev); 660 t = netdev_priv(dev);
629 ipip_tunnel_unlink(ipn, t); 661 ipip_tunnel_unlink(ipn, t);
662 synchronize_net();
630 t->parms.iph.saddr = p.iph.saddr; 663 t->parms.iph.saddr = p.iph.saddr;
631 t->parms.iph.daddr = p.iph.daddr; 664 t->parms.iph.daddr = p.iph.daddr;
632 memcpy(dev->dev_addr, &p.iph.saddr, 4); 665 memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -696,13 +729,19 @@ static const struct net_device_ops ipip_netdev_ops = {
696 .ndo_start_xmit = ipip_tunnel_xmit, 729 .ndo_start_xmit = ipip_tunnel_xmit,
697 .ndo_do_ioctl = ipip_tunnel_ioctl, 730 .ndo_do_ioctl = ipip_tunnel_ioctl,
698 .ndo_change_mtu = ipip_tunnel_change_mtu, 731 .ndo_change_mtu = ipip_tunnel_change_mtu,
699 732 .ndo_get_stats = ipip_get_stats,
700}; 733};
701 734
735static void ipip_dev_free(struct net_device *dev)
736{
737 free_percpu(dev->tstats);
738 free_netdev(dev);
739}
740
702static void ipip_tunnel_setup(struct net_device *dev) 741static void ipip_tunnel_setup(struct net_device *dev)
703{ 742{
704 dev->netdev_ops = &ipip_netdev_ops; 743 dev->netdev_ops = &ipip_netdev_ops;
705 dev->destructor = free_netdev; 744 dev->destructor = ipip_dev_free;
706 745
707 dev->type = ARPHRD_TUNNEL; 746 dev->type = ARPHRD_TUNNEL;
708 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); 747 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +750,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
711 dev->iflink = 0; 750 dev->iflink = 0;
712 dev->addr_len = 4; 751 dev->addr_len = 4;
713 dev->features |= NETIF_F_NETNS_LOCAL; 752 dev->features |= NETIF_F_NETNS_LOCAL;
753 dev->features |= NETIF_F_LLTX;
714 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 754 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
715} 755}
716 756
717static void ipip_tunnel_init(struct net_device *dev) 757static int ipip_tunnel_init(struct net_device *dev)
718{ 758{
719 struct ip_tunnel *tunnel = netdev_priv(dev); 759 struct ip_tunnel *tunnel = netdev_priv(dev);
720 760
@@ -725,9 +765,15 @@ static void ipip_tunnel_init(struct net_device *dev)
725 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 765 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
726 766
727 ipip_tunnel_bind_dev(dev); 767 ipip_tunnel_bind_dev(dev);
768
769 dev->tstats = alloc_percpu(struct pcpu_tstats);
770 if (!dev->tstats)
771 return -ENOMEM;
772
773 return 0;
728} 774}
729 775
730static void __net_init ipip_fb_tunnel_init(struct net_device *dev) 776static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
731{ 777{
732 struct ip_tunnel *tunnel = netdev_priv(dev); 778 struct ip_tunnel *tunnel = netdev_priv(dev);
733 struct iphdr *iph = &tunnel->parms.iph; 779 struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +786,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
740 iph->protocol = IPPROTO_IPIP; 786 iph->protocol = IPPROTO_IPIP;
741 iph->ihl = 5; 787 iph->ihl = 5;
742 788
789 dev->tstats = alloc_percpu(struct pcpu_tstats);
790 if (!dev->tstats)
791 return -ENOMEM;
792
743 dev_hold(dev); 793 dev_hold(dev);
744 ipn->tunnels_wc[0] = tunnel; 794 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
795 return 0;
745} 796}
746 797
747static struct xfrm_tunnel ipip_handler = { 798static struct xfrm_tunnel ipip_handler __read_mostly = {
748 .handler = ipip_rcv, 799 .handler = ipip_rcv,
749 .err_handler = ipip_err, 800 .err_handler = ipip_err,
750 .priority = 1, 801 .priority = 1,
@@ -760,11 +811,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
760 for (prio = 1; prio < 4; prio++) { 811 for (prio = 1; prio < 4; prio++) {
761 int h; 812 int h;
762 for (h = 0; h < HASH_SIZE; h++) { 813 for (h = 0; h < HASH_SIZE; h++) {
763 struct ip_tunnel *t = ipn->tunnels[prio][h]; 814 struct ip_tunnel *t;
764 815
816 t = rtnl_dereference(ipn->tunnels[prio][h]);
765 while (t != NULL) { 817 while (t != NULL) {
766 unregister_netdevice_queue(t->dev, head); 818 unregister_netdevice_queue(t->dev, head);
767 t = t->next; 819 t = rtnl_dereference(t->next);
768 } 820 }
769 } 821 }
770 } 822 }
@@ -789,7 +841,9 @@ static int __net_init ipip_init_net(struct net *net)
789 } 841 }
790 dev_net_set(ipn->fb_tunnel_dev, net); 842 dev_net_set(ipn->fb_tunnel_dev, net);
791 843
792 ipip_fb_tunnel_init(ipn->fb_tunnel_dev); 844 err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
845 if (err)
846 goto err_reg_dev;
793 847
794 if ((err = register_netdev(ipn->fb_tunnel_dev))) 848 if ((err = register_netdev(ipn->fb_tunnel_dev)))
795 goto err_reg_dev; 849 goto err_reg_dev;
@@ -797,7 +851,7 @@ static int __net_init ipip_init_net(struct net *net)
797 return 0; 851 return 0;
798 852
799err_reg_dev: 853err_reg_dev:
800 free_netdev(ipn->fb_tunnel_dev); 854 ipip_dev_free(ipn->fb_tunnel_dev);
801err_alloc_dev: 855err_alloc_dev:
802 /* nothing */ 856 /* nothing */
803 return err; 857 return err;
@@ -850,3 +904,4 @@ static void __exit ipip_fini(void)
850module_init(ipip_init); 904module_init(ipip_init);
851module_exit(ipip_fini); 905module_exit(ipip_fini);
852MODULE_LICENSE("GPL"); 906MODULE_LICENSE("GPL");
907MODULE_ALIAS_NETDEV("tunl0");
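
The ipip.c changes above drop the per-tunnel spinlock and protect the hash chains with RCU plus RTNL: the chain pointers are annotated __rcu, writers (which hold RTNL) publish with rcu_assign_pointer() and walk chains with rtnl_dereference(), and readers such as ipip_tunnel_lookup() traverse under rcu_read_lock(). A stripped-down sketch of that link/unlink/lookup pattern, using a simplified stand-in structure rather than struct ip_tunnel:

#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>

/* Simplified stand-in for one tunnel hash chain; not the driver itself. */
struct tun {
	struct tun __rcu *next;
	__be32 key;
};

static struct tun __rcu *chain;

/* Writer side: caller holds RTNL, so rtnl_dereference() is safe. */
static void tun_link(struct tun *t)
{
	rcu_assign_pointer(t->next, rtnl_dereference(chain));
	rcu_assign_pointer(chain, t);
}

static void tun_unlink(struct tun *t)
{
	struct tun __rcu **tp;
	struct tun *iter;

	for (tp = &chain;
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (iter == t) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

/* Reader side: caller holds rcu_read_lock(). */
static struct tun *tun_find(__be32 key)
{
	struct tun *t;

	for (t = rcu_dereference(chain); t; t = rcu_dereference(t->next))
		if (t->key == key)
			return t;
	return NULL;
}

An unlinked entry may only be freed after a grace period; the SIOCCHGTUNNEL path above likewise calls synchronize_net() after ipip_tunnel_unlink() before it rewrites the key fields, so readers never observe a half-updated tunnel.
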
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866fc..30a7763c400e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
60#include <linux/notifier.h> 60#include <linux/notifier.h>
61#include <linux/if_arp.h> 61#include <linux/if_arp.h>
62#include <linux/netfilter_ipv4.h> 62#include <linux/netfilter_ipv4.h>
63#include <linux/compat.h>
63#include <net/ipip.h> 64#include <net/ipip.h>
64#include <net/checksum.h> 65#include <net/checksum.h>
65#include <net/netlink.h> 66#include <net/netlink.h>
@@ -75,7 +76,7 @@ struct mr_table {
75 struct net *net; 76 struct net *net;
76#endif 77#endif
77 u32 id; 78 u32 id;
78 struct sock *mroute_sk; 79 struct sock __rcu *mroute_sk;
79 struct timer_list ipmr_expire_timer; 80 struct timer_list ipmr_expire_timer;
80 struct list_head mfc_unres_queue; 81 struct list_head mfc_unres_queue;
81 struct list_head mfc_cache_array[MFC_LINES]; 82 struct list_head mfc_cache_array[MFC_LINES];
@@ -98,7 +99,7 @@ struct ipmr_result {
98}; 99};
99 100
100/* Big lock, protecting vif table, mrt cache and mroute socket state. 101/* Big lock, protecting vif table, mrt cache and mroute socket state.
101 Note that the changes are semaphored via rtnl_lock. 102 * Note that the changes are semaphored via rtnl_lock.
102 */ 103 */
103 104
104static DEFINE_RWLOCK(mrt_lock); 105static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +114,11 @@ static DEFINE_RWLOCK(mrt_lock);
113static DEFINE_SPINLOCK(mfc_unres_lock); 114static DEFINE_SPINLOCK(mfc_unres_lock);
114 115
115/* We return to original Alan's scheme. Hash table of resolved 116/* We return to original Alan's scheme. Hash table of resolved
116 entries is changed only in process context and protected 117 * entries is changed only in process context and protected
117 with weak lock mrt_lock. Queue of unresolved entries is protected 118 * with weak lock mrt_lock. Queue of unresolved entries is protected
118 with strong spinlock mfc_unres_lock. 119 * with strong spinlock mfc_unres_lock.
119 120 *
120 In this case data path is free of exclusive locks at all. 121 * In this case data path is free of exclusive locks at all.
121 */ 122 */
122 123
123static struct kmem_cache *mrt_cachep __read_mostly; 124static struct kmem_cache *mrt_cachep __read_mostly;
@@ -147,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
147 return NULL; 148 return NULL;
148} 149}
149 150
150static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 151static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
151 struct mr_table **mrt) 152 struct mr_table **mrt)
152{ 153{
153 struct ipmr_result res; 154 struct ipmr_result res;
154 struct fib_lookup_arg arg = { .result = &res, }; 155 struct fib_lookup_arg arg = { .result = &res, };
155 int err; 156 int err;
156 157
157 err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); 158 err = fib_rules_lookup(net->ipv4.mr_rules_ops,
159 flowi4_to_flowi(flp4), 0, &arg);
158 if (err < 0) 160 if (err < 0)
159 return err; 161 return err;
160 *mrt = res.mrt; 162 *mrt = res.mrt;
@@ -282,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
282 return net->ipv4.mrt; 284 return net->ipv4.mrt;
283} 285}
284 286
285static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 287static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
286 struct mr_table **mrt) 288 struct mr_table **mrt)
287{ 289{
288 *mrt = net->ipv4.mrt; 290 *mrt = net->ipv4.mrt;
@@ -396,9 +398,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
396 set_fs(KERNEL_DS); 398 set_fs(KERNEL_DS);
397 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); 399 err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
398 set_fs(oldfs); 400 set_fs(oldfs);
399 } else 401 } else {
400 err = -EOPNOTSUPP; 402 err = -EOPNOTSUPP;
401 403 }
402 dev = NULL; 404 dev = NULL;
403 405
404 if (err == 0 && 406 if (err == 0 &&
@@ -434,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
434{ 436{
435 struct net *net = dev_net(dev); 437 struct net *net = dev_net(dev);
436 struct mr_table *mrt; 438 struct mr_table *mrt;
437 struct flowi fl = { 439 struct flowi4 fl4 = {
438 .oif = dev->ifindex, 440 .flowi4_oif = dev->ifindex,
439 .iif = skb->skb_iif, 441 .flowi4_iif = skb->skb_iif,
440 .mark = skb->mark, 442 .flowi4_mark = skb->mark,
441 }; 443 };
442 int err; 444 int err;
443 445
444 err = ipmr_fib_lookup(net, &fl, &mrt); 446 err = ipmr_fib_lookup(net, &fl4, &mrt);
445 if (err < 0) { 447 if (err < 0) {
446 kfree_skb(skb); 448 kfree_skb(skb);
447 return err; 449 return err;
@@ -495,7 +497,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
495 dev->iflink = 0; 497 dev->iflink = 0;
496 498
497 rcu_read_lock(); 499 rcu_read_lock();
498 if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { 500 in_dev = __in_dev_get_rcu(dev);
501 if (!in_dev) {
499 rcu_read_unlock(); 502 rcu_read_unlock();
500 goto failure; 503 goto failure;
501 } 504 }
@@ -552,9 +555,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
552 mrt->mroute_reg_vif_num = -1; 555 mrt->mroute_reg_vif_num = -1;
553#endif 556#endif
554 557
555 if (vifi+1 == mrt->maxvif) { 558 if (vifi + 1 == mrt->maxvif) {
556 int tmp; 559 int tmp;
557 for (tmp=vifi-1; tmp>=0; tmp--) { 560
561 for (tmp = vifi - 1; tmp >= 0; tmp--) {
558 if (VIF_EXISTS(mrt, tmp)) 562 if (VIF_EXISTS(mrt, tmp))
559 break; 563 break;
560 } 564 }
@@ -565,25 +569,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
565 569
566 dev_set_allmulti(dev, -1); 570 dev_set_allmulti(dev, -1);
567 571
568 if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { 572 in_dev = __in_dev_get_rtnl(dev);
573 if (in_dev) {
569 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 574 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
570 ip_rt_multicast_event(in_dev); 575 ip_rt_multicast_event(in_dev);
571 } 576 }
572 577
573 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) 578 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
574 unregister_netdevice_queue(dev, head); 579 unregister_netdevice_queue(dev, head);
575 580
576 dev_put(dev); 581 dev_put(dev);
577 return 0; 582 return 0;
578} 583}
579 584
580static inline void ipmr_cache_free(struct mfc_cache *c) 585static void ipmr_cache_free_rcu(struct rcu_head *head)
581{ 586{
587 struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
588
582 kmem_cache_free(mrt_cachep, c); 589 kmem_cache_free(mrt_cachep, c);
583} 590}
584 591
592static inline void ipmr_cache_free(struct mfc_cache *c)
593{
594 call_rcu(&c->rcu, ipmr_cache_free_rcu);
595}
596
585/* Destroy an unresolved cache entry, killing queued skbs 597/* Destroy an unresolved cache entry, killing queued skbs
586 and reporting error to netlink readers. 598 * and reporting error to netlink readers.
587 */ 599 */
588 600
589static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) 601static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +617,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
605 memset(&e->msg, 0, sizeof(e->msg)); 617 memset(&e->msg, 0, sizeof(e->msg));
606 618
607 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 619 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
608 } else 620 } else {
609 kfree_skb(skb); 621 kfree_skb(skb);
622 }
610 } 623 }
611 624
612 ipmr_cache_free(c); 625 ipmr_cache_free(c);
@@ -724,13 +737,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
724 case 0: 737 case 0:
725 if (vifc->vifc_flags == VIFF_USE_IFINDEX) { 738 if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
726 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); 739 dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
727 if (dev && dev->ip_ptr == NULL) { 740 if (dev && __in_dev_get_rtnl(dev) == NULL) {
728 dev_put(dev); 741 dev_put(dev);
729 return -EADDRNOTAVAIL; 742 return -EADDRNOTAVAIL;
730 } 743 }
731 } else 744 } else {
732 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); 745 dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
733 746 }
734 if (!dev) 747 if (!dev)
735 return -EADDRNOTAVAIL; 748 return -EADDRNOTAVAIL;
736 err = dev_set_allmulti(dev, 1); 749 err = dev_set_allmulti(dev, 1);
@@ -743,16 +756,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
743 return -EINVAL; 756 return -EINVAL;
744 } 757 }
745 758
746 if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { 759 in_dev = __in_dev_get_rtnl(dev);
760 if (!in_dev) {
747 dev_put(dev); 761 dev_put(dev);
748 return -EADDRNOTAVAIL; 762 return -EADDRNOTAVAIL;
749 } 763 }
750 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 764 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
751 ip_rt_multicast_event(in_dev); 765 ip_rt_multicast_event(in_dev);
752 766
753 /* 767 /* Fill in the VIF structures */
754 * Fill in the VIF structures 768
755 */
756 v->rate_limit = vifc->vifc_rate_limit; 769 v->rate_limit = vifc->vifc_rate_limit;
757 v->local = vifc->vifc_lcl_addr.s_addr; 770 v->local = vifc->vifc_lcl_addr.s_addr;
758 v->remote = vifc->vifc_rmt_addr.s_addr; 771 v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +778,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
765 v->pkt_in = 0; 778 v->pkt_in = 0;
766 v->pkt_out = 0; 779 v->pkt_out = 0;
767 v->link = dev->ifindex; 780 v->link = dev->ifindex;
768 if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) 781 if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
769 v->link = dev->iflink; 782 v->link = dev->iflink;
770 783
771 /* And finish update writing critical data */ 784 /* And finish update writing critical data */
772 write_lock_bh(&mrt_lock); 785 write_lock_bh(&mrt_lock);
773 v->dev = dev; 786 v->dev = dev;
774#ifdef CONFIG_IP_PIMSM 787#ifdef CONFIG_IP_PIMSM
775 if (v->flags&VIFF_REGISTER) 788 if (v->flags & VIFF_REGISTER)
776 mrt->mroute_reg_vif_num = vifi; 789 mrt->mroute_reg_vif_num = vifi;
777#endif 790#endif
778 if (vifi+1 > mrt->maxvif) 791 if (vifi+1 > mrt->maxvif)
@@ -781,6 +794,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
781 return 0; 794 return 0;
782} 795}
783 796
797/* called with rcu_read_lock() */
784static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, 798static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
785 __be32 origin, 799 __be32 origin,
786 __be32 mcastgrp) 800 __be32 mcastgrp)
@@ -788,7 +802,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
788 int line = MFC_HASH(mcastgrp, origin); 802 int line = MFC_HASH(mcastgrp, origin);
789 struct mfc_cache *c; 803 struct mfc_cache *c;
790 804
791 list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { 805 list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
792 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) 806 if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
793 return c; 807 return c;
794 } 808 }
@@ -801,19 +815,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
801static struct mfc_cache *ipmr_cache_alloc(void) 815static struct mfc_cache *ipmr_cache_alloc(void)
802{ 816{
803 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); 817 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
804 if (c == NULL) 818
805 return NULL; 819 if (c)
806 c->mfc_un.res.minvif = MAXVIFS; 820 c->mfc_un.res.minvif = MAXVIFS;
807 return c; 821 return c;
808} 822}
809 823
810static struct mfc_cache *ipmr_cache_alloc_unres(void) 824static struct mfc_cache *ipmr_cache_alloc_unres(void)
811{ 825{
812 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); 826 struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
813 if (c == NULL) 827
814 return NULL; 828 if (c) {
815 skb_queue_head_init(&c->mfc_un.unres.unresolved); 829 skb_queue_head_init(&c->mfc_un.unres.unresolved);
816 c->mfc_un.unres.expires = jiffies + 10*HZ; 830 c->mfc_un.unres.expires = jiffies + 10*HZ;
831 }
817 return c; 832 return c;
818} 833}
819 834
@@ -827,17 +842,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
827 struct sk_buff *skb; 842 struct sk_buff *skb;
828 struct nlmsgerr *e; 843 struct nlmsgerr *e;
829 844
830 /* 845 /* Play the pending entries through our router */
831 * Play the pending entries through our router
832 */
833 846
834 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { 847 while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
835 if (ip_hdr(skb)->version == 0) { 848 if (ip_hdr(skb)->version == 0) {
836 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); 849 struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
837 850
838 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { 851 if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
839 nlh->nlmsg_len = (skb_tail_pointer(skb) - 852 nlh->nlmsg_len = skb_tail_pointer(skb) -
840 (u8 *)nlh); 853 (u8 *)nlh;
841 } else { 854 } else {
842 nlh->nlmsg_type = NLMSG_ERROR; 855 nlh->nlmsg_type = NLMSG_ERROR;
843 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 856 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +861,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
848 } 861 }
849 862
850 rtnl_unicast(skb, net, NETLINK_CB(skb).pid); 863 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
851 } else 864 } else {
852 ip_mr_forward(net, mrt, skb, c, 0); 865 ip_mr_forward(net, mrt, skb, c, 0);
866 }
853 } 867 }
854} 868}
855 869
@@ -867,6 +881,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
867 const int ihl = ip_hdrlen(pkt); 881 const int ihl = ip_hdrlen(pkt);
868 struct igmphdr *igmp; 882 struct igmphdr *igmp;
869 struct igmpmsg *msg; 883 struct igmpmsg *msg;
884 struct sock *mroute_sk;
870 int ret; 885 int ret;
871 886
872#ifdef CONFIG_IP_PIMSM 887#ifdef CONFIG_IP_PIMSM
@@ -882,9 +897,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
882#ifdef CONFIG_IP_PIMSM 897#ifdef CONFIG_IP_PIMSM
883 if (assert == IGMPMSG_WHOLEPKT) { 898 if (assert == IGMPMSG_WHOLEPKT) {
884 /* Ugly, but we have no choice with this interface. 899 /* Ugly, but we have no choice with this interface.
885 Duplicate old header, fix ihl, length etc. 900 * Duplicate old header, fix ihl, length etc.
886 And all this only to mangle msg->im_msgtype and 901 * And all this only to mangle msg->im_msgtype and
887 to set msg->im_mbz to "mbz" :-) 902 * to set msg->im_mbz to "mbz" :-)
888 */ 903 */
889 skb_push(skb, sizeof(struct iphdr)); 904 skb_push(skb, sizeof(struct iphdr));
890 skb_reset_network_header(skb); 905 skb_reset_network_header(skb);
@@ -901,39 +916,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
901#endif 916#endif
902 { 917 {
903 918
904 /* 919 /* Copy the IP header */
905 * Copy the IP header
906 */
907 920
908 skb->network_header = skb->tail; 921 skb->network_header = skb->tail;
909 skb_put(skb, ihl); 922 skb_put(skb, ihl);
910 skb_copy_to_linear_data(skb, pkt->data, ihl); 923 skb_copy_to_linear_data(skb, pkt->data, ihl);
911 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ 924 ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
912 msg = (struct igmpmsg *)skb_network_header(skb); 925 msg = (struct igmpmsg *)skb_network_header(skb);
913 msg->im_vif = vifi; 926 msg->im_vif = vifi;
914 skb_dst_set(skb, dst_clone(skb_dst(pkt))); 927 skb_dst_set(skb, dst_clone(skb_dst(pkt)));
915 928
916 /* 929 /* Add our header */
917 * Add our header
918 */
919 930
920 igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 931 igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
921 igmp->type = 932 igmp->type =
922 msg->im_msgtype = assert; 933 msg->im_msgtype = assert;
923 igmp->code = 0; 934 igmp->code = 0;
924 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ 935 ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
925 skb->transport_header = skb->network_header; 936 skb->transport_header = skb->network_header;
926 } 937 }
927 938
928 if (mrt->mroute_sk == NULL) { 939 rcu_read_lock();
940 mroute_sk = rcu_dereference(mrt->mroute_sk);
941 if (mroute_sk == NULL) {
942 rcu_read_unlock();
929 kfree_skb(skb); 943 kfree_skb(skb);
930 return -EINVAL; 944 return -EINVAL;
931 } 945 }
932 946
933 /* 947 /* Deliver to mrouted */
934 * Deliver to mrouted 948
935 */ 949 ret = sock_queue_rcv_skb(mroute_sk, skb);
936 ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); 950 rcu_read_unlock();
937 if (ret < 0) { 951 if (ret < 0) {
938 if (net_ratelimit()) 952 if (net_ratelimit())
939 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); 953 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +979,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
965 } 979 }
966 980
967 if (!found) { 981 if (!found) {
968 /* 982 /* Create a new entry if allowable */
969 * Create a new entry if allowable
970 */
971 983
972 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || 984 if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
973 (c = ipmr_cache_alloc_unres()) == NULL) { 985 (c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +989,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
977 return -ENOBUFS; 989 return -ENOBUFS;
978 } 990 }
979 991
980 /* 992 /* Fill in the new cache entry */
981 * Fill in the new cache entry 993
982 */
983 c->mfc_parent = -1; 994 c->mfc_parent = -1;
984 c->mfc_origin = iph->saddr; 995 c->mfc_origin = iph->saddr;
985 c->mfc_mcastgrp = iph->daddr; 996 c->mfc_mcastgrp = iph->daddr;
986 997
987 /* 998 /* Reflect first query at mrouted. */
988 * Reflect first query at mrouted. 999
989 */
990 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); 1000 err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
991 if (err < 0) { 1001 if (err < 0) {
992 /* If the report failed throw the cache entry 1002 /* If the report failed throw the cache entry
@@ -1006,10 +1016,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
1006 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); 1016 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
1007 } 1017 }
1008 1018
1009 /* 1019 /* See if we can append the packet */
1010 * See if we can append the packet 1020
1011 */ 1021 if (c->mfc_un.unres.unresolved.qlen > 3) {
1012 if (c->mfc_un.unres.unresolved.qlen>3) {
1013 kfree_skb(skb); 1022 kfree_skb(skb);
1014 err = -ENOBUFS; 1023 err = -ENOBUFS;
1015 } else { 1024 } else {
@@ -1035,9 +1044,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1035 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { 1044 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
1036 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1045 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1037 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1046 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1038 write_lock_bh(&mrt_lock); 1047 list_del_rcu(&c->list);
1039 list_del(&c->list);
1040 write_unlock_bh(&mrt_lock);
1041 1048
1042 ipmr_cache_free(c); 1049 ipmr_cache_free(c);
1043 return 0; 1050 return 0;
@@ -1090,9 +1097,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1090 if (!mrtsock) 1097 if (!mrtsock)
1091 c->mfc_flags |= MFC_STATIC; 1098 c->mfc_flags |= MFC_STATIC;
1092 1099
1093 write_lock_bh(&mrt_lock); 1100 list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
1094 list_add(&c->list, &mrt->mfc_cache_array[line]);
1095 write_unlock_bh(&mrt_lock);
1096 1101
1097 /* 1102 /*
1098 * Check to see if we resolved a queued list. If so we 1103 * Check to see if we resolved a queued list. If so we
@@ -1130,26 +1135,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
1130 LIST_HEAD(list); 1135 LIST_HEAD(list);
1131 struct mfc_cache *c, *next; 1136 struct mfc_cache *c, *next;
1132 1137
1133 /* 1138 /* Shut down all active vif entries */
1134 * Shut down all active vif entries 1139
1135 */
1136 for (i = 0; i < mrt->maxvif; i++) { 1140 for (i = 0; i < mrt->maxvif; i++) {
1137 if (!(mrt->vif_table[i].flags&VIFF_STATIC)) 1141 if (!(mrt->vif_table[i].flags & VIFF_STATIC))
1138 vif_delete(mrt, i, 0, &list); 1142 vif_delete(mrt, i, 0, &list);
1139 } 1143 }
1140 unregister_netdevice_many(&list); 1144 unregister_netdevice_many(&list);
1141 1145
1142 /* 1146 /* Wipe the cache */
1143 * Wipe the cache 1147
1144 */
1145 for (i = 0; i < MFC_LINES; i++) { 1148 for (i = 0; i < MFC_LINES; i++) {
1146 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { 1149 list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1147 if (c->mfc_flags&MFC_STATIC) 1150 if (c->mfc_flags & MFC_STATIC)
1148 continue; 1151 continue;
1149 write_lock_bh(&mrt_lock); 1152 list_del_rcu(&c->list);
1150 list_del(&c->list);
1151 write_unlock_bh(&mrt_lock);
1152
1153 ipmr_cache_free(c); 1153 ipmr_cache_free(c);
1154 } 1154 }
1155 } 1155 }
@@ -1164,6 +1164,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
1164 } 1164 }
1165} 1165}
1166 1166
1167/* called from ip_ra_control(), before an RCU grace period,
1168 * we dont need to call synchronize_rcu() here
1169 */
1167static void mrtsock_destruct(struct sock *sk) 1170static void mrtsock_destruct(struct sock *sk)
1168{ 1171{
1169 struct net *net = sock_net(sk); 1172 struct net *net = sock_net(sk);
@@ -1171,13 +1174,9 @@ static void mrtsock_destruct(struct sock *sk)
1171 1174
1172 rtnl_lock(); 1175 rtnl_lock();
1173 ipmr_for_each_table(mrt, net) { 1176 ipmr_for_each_table(mrt, net) {
1174 if (sk == mrt->mroute_sk) { 1177 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1175 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1178 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1176 1179 rcu_assign_pointer(mrt->mroute_sk, NULL);
1177 write_lock_bh(&mrt_lock);
1178 mrt->mroute_sk = NULL;
1179 write_unlock_bh(&mrt_lock);
1180
1181 mroute_clean_tables(mrt); 1180 mroute_clean_tables(mrt);
1182 } 1181 }
1183 } 1182 }
@@ -1204,7 +1203,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1204 return -ENOENT; 1203 return -ENOENT;
1205 1204
1206 if (optname != MRT_INIT) { 1205 if (optname != MRT_INIT) {
1207 if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) 1206 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1207 !capable(CAP_NET_ADMIN))
1208 return -EACCES; 1208 return -EACCES;
1209 } 1209 }
1210 1210
@@ -1217,23 +1217,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1217 return -ENOPROTOOPT; 1217 return -ENOPROTOOPT;
1218 1218
1219 rtnl_lock(); 1219 rtnl_lock();
1220 if (mrt->mroute_sk) { 1220 if (rtnl_dereference(mrt->mroute_sk)) {
1221 rtnl_unlock(); 1221 rtnl_unlock();
1222 return -EADDRINUSE; 1222 return -EADDRINUSE;
1223 } 1223 }
1224 1224
1225 ret = ip_ra_control(sk, 1, mrtsock_destruct); 1225 ret = ip_ra_control(sk, 1, mrtsock_destruct);
1226 if (ret == 0) { 1226 if (ret == 0) {
1227 write_lock_bh(&mrt_lock); 1227 rcu_assign_pointer(mrt->mroute_sk, sk);
1228 mrt->mroute_sk = sk;
1229 write_unlock_bh(&mrt_lock);
1230
1231 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1228 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1232 } 1229 }
1233 rtnl_unlock(); 1230 rtnl_unlock();
1234 return ret; 1231 return ret;
1235 case MRT_DONE: 1232 case MRT_DONE:
1236 if (sk != mrt->mroute_sk) 1233 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1237 return -EACCES; 1234 return -EACCES;
1238 return ip_ra_control(sk, 0, NULL); 1235 return ip_ra_control(sk, 0, NULL);
1239 case MRT_ADD_VIF: 1236 case MRT_ADD_VIF:
@@ -1246,7 +1243,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1246 return -ENFILE; 1243 return -ENFILE;
1247 rtnl_lock(); 1244 rtnl_lock();
1248 if (optname == MRT_ADD_VIF) { 1245 if (optname == MRT_ADD_VIF) {
1249 ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); 1246 ret = vif_add(net, mrt, &vif,
1247 sk == rtnl_dereference(mrt->mroute_sk));
1250 } else { 1248 } else {
1251 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); 1249 ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
1252 } 1250 }
@@ -1267,7 +1265,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1267 if (optname == MRT_DEL_MFC) 1265 if (optname == MRT_DEL_MFC)
1268 ret = ipmr_mfc_delete(mrt, &mfc); 1266 ret = ipmr_mfc_delete(mrt, &mfc);
1269 else 1267 else
1270 ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); 1268 ret = ipmr_mfc_add(net, mrt, &mfc,
1269 sk == rtnl_dereference(mrt->mroute_sk));
1271 rtnl_unlock(); 1270 rtnl_unlock();
1272 return ret; 1271 return ret;
1273 /* 1272 /*
@@ -1276,7 +1275,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1276 case MRT_ASSERT: 1275 case MRT_ASSERT:
1277 { 1276 {
1278 int v; 1277 int v;
1279 if (get_user(v,(int __user *)optval)) 1278 if (get_user(v, (int __user *)optval))
1280 return -EFAULT; 1279 return -EFAULT;
1281 mrt->mroute_do_assert = (v) ? 1 : 0; 1280 mrt->mroute_do_assert = (v) ? 1 : 0;
1282 return 0; 1281 return 0;
@@ -1286,7 +1285,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1286 { 1285 {
1287 int v; 1286 int v;
1288 1287
1289 if (get_user(v,(int __user *)optval)) 1288 if (get_user(v, (int __user *)optval))
1290 return -EFAULT; 1289 return -EFAULT;
1291 v = (v) ? 1 : 0; 1290 v = (v) ? 1 : 0;
1292 1291
@@ -1309,14 +1308,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1309 return -EINVAL; 1308 return -EINVAL;
1310 if (get_user(v, (u32 __user *)optval)) 1309 if (get_user(v, (u32 __user *)optval))
1311 return -EFAULT; 1310 return -EFAULT;
1312 if (sk == mrt->mroute_sk)
1313 return -EBUSY;
1314 1311
1315 rtnl_lock(); 1312 rtnl_lock();
1316 ret = 0; 1313 ret = 0;
1317 if (!ipmr_new_table(net, v)) 1314 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1318 ret = -ENOMEM; 1315 ret = -EBUSY;
1319 raw_sk(sk)->ipmr_table = v; 1316 } else {
1317 if (!ipmr_new_table(net, v))
1318 ret = -ENOMEM;
1319 raw_sk(sk)->ipmr_table = v;
1320 }
1320 rtnl_unlock(); 1321 rtnl_unlock();
1321 return ret; 1322 return ret;
1322 } 1323 }
@@ -1347,9 +1348,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1347 1348
1348 if (optname != MRT_VERSION && 1349 if (optname != MRT_VERSION &&
1349#ifdef CONFIG_IP_PIMSM 1350#ifdef CONFIG_IP_PIMSM
1350 optname!=MRT_PIM && 1351 optname != MRT_PIM &&
1351#endif 1352#endif
1352 optname!=MRT_ASSERT) 1353 optname != MRT_ASSERT)
1353 return -ENOPROTOOPT; 1354 return -ENOPROTOOPT;
1354 1355
1355 if (get_user(olr, optlen)) 1356 if (get_user(olr, optlen))
@@ -1416,24 +1417,99 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1416 if (copy_from_user(&sr, arg, sizeof(sr))) 1417 if (copy_from_user(&sr, arg, sizeof(sr)))
1417 return -EFAULT; 1418 return -EFAULT;
1418 1419
1419 read_lock(&mrt_lock); 1420 rcu_read_lock();
1420 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); 1421 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1421 if (c) { 1422 if (c) {
1422 sr.pktcnt = c->mfc_un.res.pkt; 1423 sr.pktcnt = c->mfc_un.res.pkt;
1423 sr.bytecnt = c->mfc_un.res.bytes; 1424 sr.bytecnt = c->mfc_un.res.bytes;
1424 sr.wrong_if = c->mfc_un.res.wrong_if; 1425 sr.wrong_if = c->mfc_un.res.wrong_if;
1425 read_unlock(&mrt_lock); 1426 rcu_read_unlock();
1426 1427
1427 if (copy_to_user(arg, &sr, sizeof(sr))) 1428 if (copy_to_user(arg, &sr, sizeof(sr)))
1428 return -EFAULT; 1429 return -EFAULT;
1429 return 0; 1430 return 0;
1430 } 1431 }
1432 rcu_read_unlock();
1433 return -EADDRNOTAVAIL;
1434 default:
1435 return -ENOIOCTLCMD;
1436 }
1437}
1438
1439#ifdef CONFIG_COMPAT
1440struct compat_sioc_sg_req {
1441 struct in_addr src;
1442 struct in_addr grp;
1443 compat_ulong_t pktcnt;
1444 compat_ulong_t bytecnt;
1445 compat_ulong_t wrong_if;
1446};
1447
1448struct compat_sioc_vif_req {
1449 vifi_t vifi; /* Which iface */
1450 compat_ulong_t icount;
1451 compat_ulong_t ocount;
1452 compat_ulong_t ibytes;
1453 compat_ulong_t obytes;
1454};
1455
1456int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
1457{
1458 struct compat_sioc_sg_req sr;
1459 struct compat_sioc_vif_req vr;
1460 struct vif_device *vif;
1461 struct mfc_cache *c;
1462 struct net *net = sock_net(sk);
1463 struct mr_table *mrt;
1464
1465 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1466 if (mrt == NULL)
1467 return -ENOENT;
1468
1469 switch (cmd) {
1470 case SIOCGETVIFCNT:
1471 if (copy_from_user(&vr, arg, sizeof(vr)))
1472 return -EFAULT;
1473 if (vr.vifi >= mrt->maxvif)
1474 return -EINVAL;
1475 read_lock(&mrt_lock);
1476 vif = &mrt->vif_table[vr.vifi];
1477 if (VIF_EXISTS(mrt, vr.vifi)) {
1478 vr.icount = vif->pkt_in;
1479 vr.ocount = vif->pkt_out;
1480 vr.ibytes = vif->bytes_in;
1481 vr.obytes = vif->bytes_out;
1482 read_unlock(&mrt_lock);
1483
1484 if (copy_to_user(arg, &vr, sizeof(vr)))
1485 return -EFAULT;
1486 return 0;
1487 }
1431 read_unlock(&mrt_lock); 1488 read_unlock(&mrt_lock);
1432 return -EADDRNOTAVAIL; 1489 return -EADDRNOTAVAIL;
1490 case SIOCGETSGCNT:
1491 if (copy_from_user(&sr, arg, sizeof(sr)))
1492 return -EFAULT;
1493
1494 rcu_read_lock();
1495 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1496 if (c) {
1497 sr.pktcnt = c->mfc_un.res.pkt;
1498 sr.bytecnt = c->mfc_un.res.bytes;
1499 sr.wrong_if = c->mfc_un.res.wrong_if;
1500 rcu_read_unlock();
1501
1502 if (copy_to_user(arg, &sr, sizeof(sr)))
1503 return -EFAULT;
1504 return 0;
1505 }
1506 rcu_read_unlock();
1507 return -EADDRNOTAVAIL;
1433 default: 1508 default:
1434 return -ENOIOCTLCMD; 1509 return -ENOIOCTLCMD;
1435 } 1510 }
1436} 1511}
1512#endif
1437 1513
1438 1514
1439static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) 1515static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
@@ -1465,7 +1541,7 @@ static struct notifier_block ip_mr_notifier = {
1465}; 1541};
1466 1542
1467/* 1543/*
1468 * Encapsulate a packet by attaching a valid IPIP header to it. 1544 * Encapsulate a packet by attaching a valid IPIP header to it.
1469 * This avoids tunnel drivers and other mess and gives us the speed so 1545 * This avoids tunnel drivers and other mess and gives us the speed so
1470 * important for multicast video. 1546 * important for multicast video.
1471 */ 1547 */
@@ -1473,14 +1549,14 @@ static struct notifier_block ip_mr_notifier = {
1473static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) 1549static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1474{ 1550{
1475 struct iphdr *iph; 1551 struct iphdr *iph;
1476 struct iphdr *old_iph = ip_hdr(skb); 1552 const struct iphdr *old_iph = ip_hdr(skb);
1477 1553
1478 skb_push(skb, sizeof(struct iphdr)); 1554 skb_push(skb, sizeof(struct iphdr));
1479 skb->transport_header = skb->network_header; 1555 skb->transport_header = skb->network_header;
1480 skb_reset_network_header(skb); 1556 skb_reset_network_header(skb);
1481 iph = ip_hdr(skb); 1557 iph = ip_hdr(skb);
1482 1558
1483 iph->version = 4; 1559 iph->version = 4;
1484 iph->tos = old_iph->tos; 1560 iph->tos = old_iph->tos;
1485 iph->ttl = old_iph->ttl; 1561 iph->ttl = old_iph->ttl;
1486 iph->frag_off = 0; 1562 iph->frag_off = 0;
@@ -1498,7 +1574,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1498 1574
1499static inline int ipmr_forward_finish(struct sk_buff *skb) 1575static inline int ipmr_forward_finish(struct sk_buff *skb)
1500{ 1576{
1501 struct ip_options * opt = &(IPCB(skb)->opt); 1577 struct ip_options *opt = &(IPCB(skb)->opt);
1502 1578
1503 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1579 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1504 1580
@@ -1519,6 +1595,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1519 struct vif_device *vif = &mrt->vif_table[vifi]; 1595 struct vif_device *vif = &mrt->vif_table[vifi];
1520 struct net_device *dev; 1596 struct net_device *dev;
1521 struct rtable *rt; 1597 struct rtable *rt;
1598 struct flowi4 fl4;
1522 int encap = 0; 1599 int encap = 0;
1523 1600
1524 if (vif->dev == NULL) 1601 if (vif->dev == NULL)
@@ -1535,23 +1612,21 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1535 } 1612 }
1536#endif 1613#endif
1537 1614
1538 if (vif->flags&VIFF_TUNNEL) { 1615 if (vif->flags & VIFF_TUNNEL) {
1539 struct flowi fl = { .oif = vif->link, 1616 rt = ip_route_output_ports(net, &fl4, NULL,
1540 .nl_u = { .ip4_u = 1617 vif->remote, vif->local,
1541 { .daddr = vif->remote, 1618 0, 0,
1542 .saddr = vif->local, 1619 IPPROTO_IPIP,
1543 .tos = RT_TOS(iph->tos) } }, 1620 RT_TOS(iph->tos), vif->link);
1544 .proto = IPPROTO_IPIP }; 1621 if (IS_ERR(rt))
1545 if (ip_route_output_key(net, &rt, &fl))
1546 goto out_free; 1622 goto out_free;
1547 encap = sizeof(struct iphdr); 1623 encap = sizeof(struct iphdr);
1548 } else { 1624 } else {
1549 struct flowi fl = { .oif = vif->link, 1625 rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
1550 .nl_u = { .ip4_u = 1626 0, 0,
1551 { .daddr = iph->daddr, 1627 IPPROTO_IPIP,
1552 .tos = RT_TOS(iph->tos) } }, 1628 RT_TOS(iph->tos), vif->link);
1553 .proto = IPPROTO_IPIP }; 1629 if (IS_ERR(rt))
1554 if (ip_route_output_key(net, &rt, &fl))
1555 goto out_free; 1630 goto out_free;
1556 } 1631 }
1557 1632
@@ -1559,8 +1634,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1559 1634
1560 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { 1635 if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
1561 /* Do not fragment multicasts. Alas, IPv4 does not 1636 /* Do not fragment multicasts. Alas, IPv4 does not
1562 allow to send ICMP, so that packets will disappear 1637 * allow to send ICMP, so that packets will disappear
1563 to blackhole. 1638 * to blackhole.
1564 */ 1639 */
1565 1640
1566 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 1641 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1583,7 +1658,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1583 ip_decrease_ttl(ip_hdr(skb)); 1658 ip_decrease_ttl(ip_hdr(skb));
1584 1659
1585 /* FIXME: forward and output firewalls used to be called here. 1660 /* FIXME: forward and output firewalls used to be called here.
1586 * What do we do with netfilter? -- RR */ 1661 * What do we do with netfilter? -- RR
1662 */
1587 if (vif->flags & VIFF_TUNNEL) { 1663 if (vif->flags & VIFF_TUNNEL) {
1588 ip_encap(skb, vif->local, vif->remote); 1664 ip_encap(skb, vif->local, vif->remote);
1589 /* FIXME: extra output firewall step used to be here. --RR */ 1665 /* FIXME: extra output firewall step used to be here. --RR */
@@ -1642,17 +1718,17 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1642 if (mrt->vif_table[vif].dev != skb->dev) { 1718 if (mrt->vif_table[vif].dev != skb->dev) {
1643 int true_vifi; 1719 int true_vifi;
1644 1720
1645 if (skb_rtable(skb)->fl.iif == 0) { 1721 if (rt_is_output_route(skb_rtable(skb))) {
1646 /* It is our own packet, looped back. 1722 /* It is our own packet, looped back.
1647 Very complicated situation... 1723 * Very complicated situation...
1648 1724 *
1649 The best workaround until routing daemons will be 1725 * The best workaround until routing daemons will be
1650 fixed is not to redistribute packet, if it was 1726 * fixed is not to redistribute packet, if it was
1651 send through wrong interface. It means, that 1727 * send through wrong interface. It means, that
1652 multicast applications WILL NOT work for 1728 * multicast applications WILL NOT work for
1653 (S,G), which have default multicast route pointing 1729 * (S,G), which have default multicast route pointing
1654 to wrong oif. In any case, it is not a good 1730 * to wrong oif. In any case, it is not a good
1655 idea to use multicasting applications on router. 1731 * idea to use multicasting applications on router.
1656 */ 1732 */
1657 goto dont_forward; 1733 goto dont_forward;
1658 } 1734 }
@@ -1662,9 +1738,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1662 1738
1663 if (true_vifi >= 0 && mrt->mroute_do_assert && 1739 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1664 /* pimsm uses asserts, when switching from RPT to SPT, 1740 /* pimsm uses asserts, when switching from RPT to SPT,
1665 so that we cannot check that packet arrived on an oif. 1741 * so that we cannot check that packet arrived on an oif.
1666 It is bad, but otherwise we would need to move pretty 1742 * It is bad, but otherwise we would need to move pretty
1667 large chunk of pimd to kernel. Ough... --ANK 1743 * large chunk of pimd to kernel. Ough... --ANK
1668 */ 1744 */
1669 (mrt->mroute_do_pim || 1745 (mrt->mroute_do_pim ||
1670 cache->mfc_un.res.ttls[true_vifi] < 255) && 1746 cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1758,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1682 /* 1758 /*
1683 * Forward the frame 1759 * Forward the frame
1684 */ 1760 */
1685 for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { 1761 for (ct = cache->mfc_un.res.maxvif - 1;
1762 ct >= cache->mfc_un.res.minvif; ct--) {
1686 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { 1763 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1687 if (psend != -1) { 1764 if (psend != -1) {
1688 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1765 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1766
1689 if (skb2) 1767 if (skb2)
1690 ipmr_queue_xmit(net, mrt, skb2, cache, 1768 ipmr_queue_xmit(net, mrt, skb2, cache,
1691 psend); 1769 psend);
@@ -1696,6 +1774,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1696 if (psend != -1) { 1774 if (psend != -1) {
1697 if (local) { 1775 if (local) {
1698 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1776 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1777
1699 if (skb2) 1778 if (skb2)
1700 ipmr_queue_xmit(net, mrt, skb2, cache, psend); 1779 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1701 } else { 1780 } else {
@@ -1710,9 +1789,30 @@ dont_forward:
1710 return 0; 1789 return 0;
1711} 1790}
1712 1791
1792static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1793{
1794 struct rtable *rt = skb_rtable(skb);
1795 struct iphdr *iph = ip_hdr(skb);
1796 struct flowi4 fl4 = {
1797 .daddr = iph->daddr,
1798 .saddr = iph->saddr,
1799 .flowi4_tos = iph->tos,
1800 .flowi4_oif = rt->rt_oif,
1801 .flowi4_iif = rt->rt_iif,
1802 .flowi4_mark = rt->rt_mark,
1803 };
1804 struct mr_table *mrt;
1805 int err;
1806
1807 err = ipmr_fib_lookup(net, &fl4, &mrt);
1808 if (err)
1809 return ERR_PTR(err);
1810 return mrt;
1811}
1713 1812
1714/* 1813/*
1715 * Multicast packets for forwarding arrive here 1814 * Multicast packets for forwarding arrive here
1815 * Called with rcu_read_lock();
1716 */ 1816 */
1717 1817
1718int ip_mr_input(struct sk_buff *skb) 1818int ip_mr_input(struct sk_buff *skb)
@@ -1721,43 +1821,41 @@ int ip_mr_input(struct sk_buff *skb)
1721 struct net *net = dev_net(skb->dev); 1821 struct net *net = dev_net(skb->dev);
1722 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 1822 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1723 struct mr_table *mrt; 1823 struct mr_table *mrt;
1724 int err;
1725 1824
1726 /* Packet is looped back after forward, it should not be 1825 /* Packet is looped back after forward, it should not be
1727 forwarded second time, but still can be delivered locally. 1826 * forwarded second time, but still can be delivered locally.
1728 */ 1827 */
1729 if (IPCB(skb)->flags&IPSKB_FORWARDED) 1828 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1730 goto dont_forward; 1829 goto dont_forward;
1731 1830
1732 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1831 mrt = ipmr_rt_fib_lookup(net, skb);
1733 if (err < 0) { 1832 if (IS_ERR(mrt)) {
1734 kfree_skb(skb); 1833 kfree_skb(skb);
1735 return err; 1834 return PTR_ERR(mrt);
1736 } 1835 }
1737
1738 if (!local) { 1836 if (!local) {
1739 if (IPCB(skb)->opt.router_alert) { 1837 if (IPCB(skb)->opt.router_alert) {
1740 if (ip_call_ra_chain(skb)) 1838 if (ip_call_ra_chain(skb))
1741 return 0; 1839 return 0;
1742 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ 1840 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1743 /* IGMPv1 (and broken IGMPv2 implementations sort of 1841 /* IGMPv1 (and broken IGMPv2 implementations sort of
1744 Cisco IOS <= 11.2(8)) do not put router alert 1842 * Cisco IOS <= 11.2(8)) do not put router alert
1745 option to IGMP packets destined to routable 1843 * option to IGMP packets destined to routable
1746 groups. It is very bad, because it means 1844 * groups. It is very bad, because it means
1747 that we can forward NO IGMP messages. 1845 * that we can forward NO IGMP messages.
1748 */ 1846 */
1749 read_lock(&mrt_lock); 1847 struct sock *mroute_sk;
1750 if (mrt->mroute_sk) { 1848
1751 nf_reset(skb); 1849 mroute_sk = rcu_dereference(mrt->mroute_sk);
1752 raw_rcv(mrt->mroute_sk, skb); 1850 if (mroute_sk) {
1753 read_unlock(&mrt_lock); 1851 nf_reset(skb);
1754 return 0; 1852 raw_rcv(mroute_sk, skb);
1755 } 1853 return 0;
1756 read_unlock(&mrt_lock); 1854 }
1757 } 1855 }
1758 } 1856 }
1759 1857
1760 read_lock(&mrt_lock); 1858 /* already under rcu_read_lock() */
1761 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); 1859 cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1762 1860
1763 /* 1861 /*
@@ -1769,13 +1867,12 @@ int ip_mr_input(struct sk_buff *skb)
1769 if (local) { 1867 if (local) {
1770 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1868 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1771 ip_local_deliver(skb); 1869 ip_local_deliver(skb);
1772 if (skb2 == NULL) { 1870 if (skb2 == NULL)
1773 read_unlock(&mrt_lock);
1774 return -ENOBUFS; 1871 return -ENOBUFS;
1775 }
1776 skb = skb2; 1872 skb = skb2;
1777 } 1873 }
1778 1874
1875 read_lock(&mrt_lock);
1779 vif = ipmr_find_vif(mrt, skb->dev); 1876 vif = ipmr_find_vif(mrt, skb->dev);
1780 if (vif >= 0) { 1877 if (vif >= 0) {
1781 int err2 = ipmr_cache_unresolved(mrt, vif, skb); 1878 int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1885,8 @@ int ip_mr_input(struct sk_buff *skb)
1788 return -ENODEV; 1885 return -ENODEV;
1789 } 1886 }
1790 1887
1888 read_lock(&mrt_lock);
1791 ip_mr_forward(net, mrt, skb, cache, local); 1889 ip_mr_forward(net, mrt, skb, cache, local);
1792
1793 read_unlock(&mrt_lock); 1890 read_unlock(&mrt_lock);
1794 1891
1795 if (local) 1892 if (local)
@@ -1805,6 +1902,7 @@ dont_forward:
1805} 1902}
1806 1903
1807#ifdef CONFIG_IP_PIMSM 1904#ifdef CONFIG_IP_PIMSM
1905/* called with rcu_read_lock() */
1808static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, 1906static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1809 unsigned int pimlen) 1907 unsigned int pimlen)
1810{ 1908{
@@ -1813,10 +1911,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1813 1911
1814 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); 1912 encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1815 /* 1913 /*
1816 Check that: 1914 * Check that:
1817 a. packet is really destinted to a multicast group 1915 * a. packet is really sent to a multicast group
1818 b. packet is not a NULL-REGISTER 1916 * b. packet is not a NULL-REGISTER
1819 c. packet is not truncated 1917 * c. packet is not truncated
1820 */ 1918 */
1821 if (!ipv4_is_multicast(encap->daddr) || 1919 if (!ipv4_is_multicast(encap->daddr) ||
1822 encap->tot_len == 0 || 1920 encap->tot_len == 0 ||
@@ -1826,26 +1924,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1826 read_lock(&mrt_lock); 1924 read_lock(&mrt_lock);
1827 if (mrt->mroute_reg_vif_num >= 0) 1925 if (mrt->mroute_reg_vif_num >= 0)
1828 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; 1926 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1829 if (reg_dev)
1830 dev_hold(reg_dev);
1831 read_unlock(&mrt_lock); 1927 read_unlock(&mrt_lock);
1832 1928
1833 if (reg_dev == NULL) 1929 if (reg_dev == NULL)
1834 return 1; 1930 return 1;
1835 1931
1836 skb->mac_header = skb->network_header; 1932 skb->mac_header = skb->network_header;
1837 skb_pull(skb, (u8*)encap - skb->data); 1933 skb_pull(skb, (u8 *)encap - skb->data);
1838 skb_reset_network_header(skb); 1934 skb_reset_network_header(skb);
1839 skb->protocol = htons(ETH_P_IP); 1935 skb->protocol = htons(ETH_P_IP);
1840 skb->ip_summed = 0; 1936 skb->ip_summed = CHECKSUM_NONE;
1841 skb->pkt_type = PACKET_HOST; 1937 skb->pkt_type = PACKET_HOST;
1842 1938
1843 skb_tunnel_rx(skb, reg_dev); 1939 skb_tunnel_rx(skb, reg_dev);
1844 1940
1845 netif_rx(skb); 1941 netif_rx(skb);
1846 dev_put(reg_dev);
1847 1942
1848 return 0; 1943 return NET_RX_SUCCESS;
1849} 1944}
1850#endif 1945#endif
1851 1946
@@ -1854,7 +1949,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1854 * Handle IGMP messages of PIMv1 1949 * Handle IGMP messages of PIMv1
1855 */ 1950 */
1856 1951
1857int pim_rcv_v1(struct sk_buff * skb) 1952int pim_rcv_v1(struct sk_buff *skb)
1858{ 1953{
1859 struct igmphdr *pim; 1954 struct igmphdr *pim;
1860 struct net *net = dev_net(skb->dev); 1955 struct net *net = dev_net(skb->dev);
@@ -1865,9 +1960,9 @@ int pim_rcv_v1(struct sk_buff * skb)
1865 1960
1866 pim = igmp_hdr(skb); 1961 pim = igmp_hdr(skb);
1867 1962
1868 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1963 mrt = ipmr_rt_fib_lookup(net, skb);
1964 if (IS_ERR(mrt))
1869 goto drop; 1965 goto drop;
1870
1871 if (!mrt->mroute_do_pim || 1966 if (!mrt->mroute_do_pim ||
1872 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1967 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1873 goto drop; 1968 goto drop;
@@ -1881,7 +1976,7 @@ drop:
1881#endif 1976#endif
1882 1977
1883#ifdef CONFIG_IP_PIMSM_V2 1978#ifdef CONFIG_IP_PIMSM_V2
1884static int pim_rcv(struct sk_buff * skb) 1979static int pim_rcv(struct sk_buff *skb)
1885{ 1980{
1886 struct pimreghdr *pim; 1981 struct pimreghdr *pim;
1887 struct net *net = dev_net(skb->dev); 1982 struct net *net = dev_net(skb->dev);
@@ -1891,15 +1986,15 @@ static int pim_rcv(struct sk_buff * skb)
1891 goto drop; 1986 goto drop;
1892 1987
1893 pim = (struct pimreghdr *)skb_transport_header(skb); 1988 pim = (struct pimreghdr *)skb_transport_header(skb);
1894 if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || 1989 if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
1895 (pim->flags&PIM_NULL_REGISTER) || 1990 (pim->flags & PIM_NULL_REGISTER) ||
1896 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 1991 (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1897 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1992 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1898 goto drop; 1993 goto drop;
1899 1994
1900 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1995 mrt = ipmr_rt_fib_lookup(net, skb);
1996 if (IS_ERR(mrt))
1901 goto drop; 1997 goto drop;
1902
1903 if (__pim_rcv(mrt, skb, sizeof(*pim))) { 1998 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1904drop: 1999drop:
1905 kfree_skb(skb); 2000 kfree_skb(skb);
@@ -1946,40 +2041,45 @@ rtattr_failure:
1946 return -EMSGSIZE; 2041 return -EMSGSIZE;
1947} 2042}
1948 2043
1949int ipmr_get_route(struct net *net, 2044int ipmr_get_route(struct net *net, struct sk_buff *skb,
1950 struct sk_buff *skb, struct rtmsg *rtm, int nowait) 2045 __be32 saddr, __be32 daddr,
2046 struct rtmsg *rtm, int nowait)
1951{ 2047{
1952 int err;
1953 struct mr_table *mrt;
1954 struct mfc_cache *cache; 2048 struct mfc_cache *cache;
1955 struct rtable *rt = skb_rtable(skb); 2049 struct mr_table *mrt;
2050 int err;
1956 2051
1957 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); 2052 mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1958 if (mrt == NULL) 2053 if (mrt == NULL)
1959 return -ENOENT; 2054 return -ENOENT;
1960 2055
1961 read_lock(&mrt_lock); 2056 rcu_read_lock();
1962 cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); 2057 cache = ipmr_cache_find(mrt, saddr, daddr);
1963 2058
1964 if (cache == NULL) { 2059 if (cache == NULL) {
1965 struct sk_buff *skb2; 2060 struct sk_buff *skb2;
1966 struct iphdr *iph; 2061 struct iphdr *iph;
1967 struct net_device *dev; 2062 struct net_device *dev;
1968 int vif; 2063 int vif = -1;
1969 2064
1970 if (nowait) { 2065 if (nowait) {
1971 read_unlock(&mrt_lock); 2066 rcu_read_unlock();
1972 return -EAGAIN; 2067 return -EAGAIN;
1973 } 2068 }
1974 2069
1975 dev = skb->dev; 2070 dev = skb->dev;
1976 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { 2071 read_lock(&mrt_lock);
2072 if (dev)
2073 vif = ipmr_find_vif(mrt, dev);
2074 if (vif < 0) {
1977 read_unlock(&mrt_lock); 2075 read_unlock(&mrt_lock);
2076 rcu_read_unlock();
1978 return -ENODEV; 2077 return -ENODEV;
1979 } 2078 }
1980 skb2 = skb_clone(skb, GFP_ATOMIC); 2079 skb2 = skb_clone(skb, GFP_ATOMIC);
1981 if (!skb2) { 2080 if (!skb2) {
1982 read_unlock(&mrt_lock); 2081 read_unlock(&mrt_lock);
2082 rcu_read_unlock();
1983 return -ENOMEM; 2083 return -ENOMEM;
1984 } 2084 }
1985 2085
@@ -1987,18 +2087,21 @@ int ipmr_get_route(struct net *net,
1987 skb_reset_network_header(skb2); 2087 skb_reset_network_header(skb2);
1988 iph = ip_hdr(skb2); 2088 iph = ip_hdr(skb2);
1989 iph->ihl = sizeof(struct iphdr) >> 2; 2089 iph->ihl = sizeof(struct iphdr) >> 2;
1990 iph->saddr = rt->rt_src; 2090 iph->saddr = saddr;
1991 iph->daddr = rt->rt_dst; 2091 iph->daddr = daddr;
1992 iph->version = 0; 2092 iph->version = 0;
1993 err = ipmr_cache_unresolved(mrt, vif, skb2); 2093 err = ipmr_cache_unresolved(mrt, vif, skb2);
1994 read_unlock(&mrt_lock); 2094 read_unlock(&mrt_lock);
2095 rcu_read_unlock();
1995 return err; 2096 return err;
1996 } 2097 }
1997 2098
1998 if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) 2099 read_lock(&mrt_lock);
2100 if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1999 cache->mfc_flags |= MFC_NOTIFY; 2101 cache->mfc_flags |= MFC_NOTIFY;
2000 err = __ipmr_fill_mroute(mrt, skb, cache, rtm); 2102 err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
2001 read_unlock(&mrt_lock); 2103 read_unlock(&mrt_lock);
2104 rcu_read_unlock();
2002 return err; 2105 return err;
2003} 2106}
2004 2107
@@ -2050,14 +2153,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2050 s_h = cb->args[1]; 2153 s_h = cb->args[1];
2051 s_e = cb->args[2]; 2154 s_e = cb->args[2];
2052 2155
2053 read_lock(&mrt_lock); 2156 rcu_read_lock();
2054 ipmr_for_each_table(mrt, net) { 2157 ipmr_for_each_table(mrt, net) {
2055 if (t < s_t) 2158 if (t < s_t)
2056 goto next_table; 2159 goto next_table;
2057 if (t > s_t) 2160 if (t > s_t)
2058 s_h = 0; 2161 s_h = 0;
2059 for (h = s_h; h < MFC_LINES; h++) { 2162 for (h = s_h; h < MFC_LINES; h++) {
2060 list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { 2163 list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2061 if (e < s_e) 2164 if (e < s_e)
2062 goto next_entry; 2165 goto next_entry;
2063 if (ipmr_fill_mroute(mrt, skb, 2166 if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2178,7 @@ next_table:
2075 t++; 2178 t++;
2076 } 2179 }
2077done: 2180done:
2078 read_unlock(&mrt_lock); 2181 rcu_read_unlock();
2079 2182
2080 cb->args[2] = e; 2183 cb->args[2] = e;
2081 cb->args[1] = h; 2184 cb->args[1] = h;
@@ -2086,7 +2189,8 @@ done:
2086 2189
2087#ifdef CONFIG_PROC_FS 2190#ifdef CONFIG_PROC_FS
2088/* 2191/*
2089 * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif 2192 * The /proc interfaces to multicast routing :
2193 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
2090 */ 2194 */
2091struct ipmr_vif_iter { 2195struct ipmr_vif_iter {
2092 struct seq_net_private p; 2196 struct seq_net_private p;
@@ -2208,14 +2312,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2208 struct mr_table *mrt = it->mrt; 2312 struct mr_table *mrt = it->mrt;
2209 struct mfc_cache *mfc; 2313 struct mfc_cache *mfc;
2210 2314
2211 read_lock(&mrt_lock); 2315 rcu_read_lock();
2212 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { 2316 for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2213 it->cache = &mrt->mfc_cache_array[it->ct]; 2317 it->cache = &mrt->mfc_cache_array[it->ct];
2214 list_for_each_entry(mfc, it->cache, list) 2318 list_for_each_entry_rcu(mfc, it->cache, list)
2215 if (pos-- == 0) 2319 if (pos-- == 0)
2216 return mfc; 2320 return mfc;
2217 } 2321 }
2218 read_unlock(&mrt_lock); 2322 rcu_read_unlock();
2219 2323
2220 spin_lock_bh(&mfc_unres_lock); 2324 spin_lock_bh(&mfc_unres_lock);
2221 it->cache = &mrt->mfc_unres_queue; 2325 it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2378,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2274 } 2378 }
2275 2379
2276 /* exhausted cache_array, show unresolved */ 2380 /* exhausted cache_array, show unresolved */
2277 read_unlock(&mrt_lock); 2381 rcu_read_unlock();
2278 it->cache = &mrt->mfc_unres_queue; 2382 it->cache = &mrt->mfc_unres_queue;
2279 it->ct = 0; 2383 it->ct = 0;
2280 2384
@@ -2282,7 +2386,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2282 if (!list_empty(it->cache)) 2386 if (!list_empty(it->cache))
2283 return list_first_entry(it->cache, struct mfc_cache, list); 2387 return list_first_entry(it->cache, struct mfc_cache, list);
2284 2388
2285 end_of_list: 2389end_of_list:
2286 spin_unlock_bh(&mfc_unres_lock); 2390 spin_unlock_bh(&mfc_unres_lock);
2287 it->cache = NULL; 2391 it->cache = NULL;
2288 2392
@@ -2297,7 +2401,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2297 if (it->cache == &mrt->mfc_unres_queue) 2401 if (it->cache == &mrt->mfc_unres_queue)
2298 spin_unlock_bh(&mfc_unres_lock); 2402 spin_unlock_bh(&mfc_unres_lock);
2299 else if (it->cache == &mrt->mfc_cache_array[it->ct]) 2403 else if (it->cache == &mrt->mfc_cache_array[it->ct])
2300 read_unlock(&mrt_lock); 2404 rcu_read_unlock();
2301} 2405}
2302 2406
2303static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) 2407static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2427,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2323 mfc->mfc_un.res.bytes, 2427 mfc->mfc_un.res.bytes,
2324 mfc->mfc_un.res.wrong_if); 2428 mfc->mfc_un.res.wrong_if);
2325 for (n = mfc->mfc_un.res.minvif; 2429 for (n = mfc->mfc_un.res.minvif;
2326 n < mfc->mfc_un.res.maxvif; n++ ) { 2430 n < mfc->mfc_un.res.maxvif; n++) {
2327 if (VIF_EXISTS(mrt, n) && 2431 if (VIF_EXISTS(mrt, n) &&
2328 mfc->mfc_un.res.ttls[n] < 255) 2432 mfc->mfc_un.res.ttls[n] < 255)
2329 seq_printf(seq, 2433 seq_printf(seq,
@@ -2421,7 +2525,7 @@ int __init ip_mr_init(void)
2421 2525
2422 mrt_cachep = kmem_cache_create("ip_mrt_cache", 2526 mrt_cachep = kmem_cache_create("ip_mrt_cache",
2423 sizeof(struct mfc_cache), 2527 sizeof(struct mfc_cache),
2424 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2528 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
2425 NULL); 2529 NULL);
2426 if (!mrt_cachep) 2530 if (!mrt_cachep)
2427 return -ENOMEM; 2531 return -ENOMEM;
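[Editorial note] The ipmr.c hunks above convert the MFC cache readers (ip_mr_input(), ipmr_get_route(), the /proc iterators, the rtnetlink dump) from read_lock(&mrt_lock) to rcu_read_lock() with list_for_each_entry_rcu(). Below is a minimal sketch of that read-side pattern, assuming kernel context; the demo_* names are illustrative stand-ins, not the actual ipmr helpers.

    #include <linux/rculist.h>
    #include <linux/types.h>

    struct demo_entry {
            struct list_head list;
            u32 key;
    };

    /* Lookup runs inside rcu_read_lock()/rcu_read_unlock(); writers publish
     * entries with list_add_rcu() and defer freeing past a grace period. */
    static struct demo_entry *demo_find(struct list_head *head, u32 key)
    {
            struct demo_entry *e;

            list_for_each_entry_rcu(e, head, list)
                    if (e->key == key)
                            return e;       /* valid only within the RCU read section */
            return NULL;
    }

In the diff, the caller supplies that bracketing: ip_mr_input() is documented as "Called with rcu_read_lock()", and the seq/file and dump paths above now take rcu_read_lock() themselves where they used to take mrt_lock for reading.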
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index d88a46c54fd1..2e97e3ec1eb7 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -16,60 +16,47 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16 struct net *net = dev_net(skb_dst(skb)->dev); 16 struct net *net = dev_net(skb_dst(skb)->dev);
17 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
18 struct rtable *rt; 18 struct rtable *rt;
19 struct flowi fl = {}; 19 struct flowi4 fl4 = {};
20 unsigned long orefdst; 20 __be32 saddr = iph->saddr;
21 __u8 flags = 0;
21 unsigned int hh_len; 22 unsigned int hh_len;
22 unsigned int type;
23 23
24 type = inet_addr_type(net, iph->saddr); 24 if (!skb->sk && addr_type != RTN_LOCAL) {
25 if (skb->sk && inet_sk(skb->sk)->transparent) 25 if (addr_type == RTN_UNSPEC)
26 type = RTN_LOCAL; 26 addr_type = inet_addr_type(net, saddr);
27 if (addr_type == RTN_UNSPEC) 27 if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
28 addr_type = type; 28 flags |= FLOWI_FLAG_ANYSRC;
29 else
30 saddr = 0;
31 }
29 32
30 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause 33 /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 34 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
32 */ 35 */
33 if (addr_type == RTN_LOCAL) { 36 fl4.daddr = iph->daddr;
34 fl.nl_u.ip4_u.daddr = iph->daddr; 37 fl4.saddr = saddr;
35 if (type == RTN_LOCAL) 38 fl4.flowi4_tos = RT_TOS(iph->tos);
36 fl.nl_u.ip4_u.saddr = iph->saddr; 39 fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
37 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); 40 fl4.flowi4_mark = skb->mark;
38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 41 fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : flags;
39 fl.mark = skb->mark; 42 rt = ip_route_output_key(net, &fl4);
40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 43 if (IS_ERR(rt))
41 if (ip_route_output_key(net, &rt, &fl) != 0) 44 return -1;
42 return -1;
43
44 /* Drop old route. */
45 skb_dst_drop(skb);
46 skb_dst_set(skb, &rt->dst);
47 } else {
48 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */
50 fl.nl_u.ip4_u.daddr = iph->saddr;
51 if (ip_route_output_key(net, &rt, &fl) != 0)
52 return -1;
53 45
54 orefdst = skb->_skb_refdst; 46 /* Drop old route. */
55 if (ip_route_input(skb, iph->daddr, iph->saddr, 47 skb_dst_drop(skb);
56 RT_TOS(iph->tos), rt->dst.dev) != 0) { 48 skb_dst_set(skb, &rt->dst);
57 dst_release(&rt->dst);
58 return -1;
59 }
60 dst_release(&rt->dst);
61 refdst_drop(orefdst);
62 }
63 49
64 if (skb_dst(skb)->error) 50 if (skb_dst(skb)->error)
65 return -1; 51 return -1;
66 52
67#ifdef CONFIG_XFRM 53#ifdef CONFIG_XFRM
68 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 54 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
69 xfrm_decode_session(skb, &fl, AF_INET) == 0) { 55 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
70 struct dst_entry *dst = skb_dst(skb); 56 struct dst_entry *dst = skb_dst(skb);
71 skb_dst_set(skb, NULL); 57 skb_dst_set(skb, NULL);
72 if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) 58 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
59 if (IS_ERR(dst))
73 return -1; 60 return -1;
74 skb_dst_set(skb, dst); 61 skb_dst_set(skb, dst);
75 } 62 }
@@ -102,7 +89,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
102 dst = ((struct xfrm_dst *)dst)->route; 89 dst = ((struct xfrm_dst *)dst)->route;
103 dst_hold(dst); 90 dst_hold(dst);
104 91
105 if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) 92 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
93 if (IS_ERR(dst))
106 return -1; 94 return -1;
107 95
108 skb_dst_drop(skb); 96 skb_dst_drop(skb);
@@ -217,9 +205,14 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
217 return csum; 205 return csum;
218} 206}
219 207
220static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) 208static int nf_ip_route(struct net *net, struct dst_entry **dst,
209 struct flowi *fl, bool strict __always_unused)
221{ 210{
222 return ip_route_output_key(&init_net, (struct rtable **)dst, fl); 211 struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
212 if (IS_ERR(rt))
213 return PTR_ERR(rt);
214 *dst = &rt->dst;
215 return 0;
223} 216}
224 217
225static const struct nf_afinfo nf_ip_afinfo = { 218static const struct nf_afinfo nf_ip_afinfo = {
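[Editorial note] The ip_route_me_harder() and nf_ip_route() rewrites above reflect the routing API change this merge brings in: ip_route_output_key() now takes a struct flowi4 and returns a struct rtable * (ERR_PTR-encoded on failure) instead of filling in a pointer and returning an int. A minimal sketch of the new call pattern, assuming kernel context; demo_route() itself is hypothetical.

    #include <net/route.h>
    #include <linux/err.h>

    static int demo_route(struct net *net, __be32 daddr, __be32 saddr, u8 tos)
    {
            struct flowi4 fl4 = {
                    .daddr       = daddr,
                    .saddr       = saddr,
                    .flowi4_tos  = RT_TOS(tos),
            };
            struct rtable *rt;

            rt = ip_route_output_key(net, &fl4);    /* ERR_PTR() on failure */
            if (IS_ERR(rt))
                    return PTR_ERR(rt);

            /* ... attach rt->dst to an skb, or otherwise use the route ... */
            ip_rt_put(rt);
            return 0;
    }

The same IS_ERR()/PTR_ERR() handling shows up in the ipmr.c and xfrm_lookup() hunks above, which is why the separate "rt != NULL" and integer-return checks disappear from the diff.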
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..1dfc18a03fd4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -64,16 +64,6 @@ config IP_NF_IPTABLES
64if IP_NF_IPTABLES 64if IP_NF_IPTABLES
65 65
66# The matches. 66# The matches.
67config IP_NF_MATCH_ADDRTYPE
68 tristate '"addrtype" address type match support'
69 depends on NETFILTER_ADVANCED
70 help
71 This option allows you to match what routing thinks of an address,
72 eg. UNICAST, LOCAL, BROADCAST, ...
73
74 If you want to compile it as a module, say M here and read
75 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
76
77config IP_NF_MATCH_AH 67config IP_NF_MATCH_AH
78 tristate '"ah" match support' 68 tristate '"ah" match support'
79 depends on NETFILTER_ADVANCED 69 depends on NETFILTER_ADVANCED
@@ -147,7 +137,7 @@ config IP_NF_TARGET_ULOG
147 which can only be viewed through syslog. 137 which can only be viewed through syslog.
148 138
149 The appropriate userspace logging daemon (ulogd) may be obtained from 139 The appropriate userspace logging daemon (ulogd) may be obtained from
150 <http://www.gnumonks.org/projects/ulogd/> 140 <http://www.netfilter.org/projects/ulogd/index.html>
151 141
152 To compile it as a module, choose M here. If unsure, say N. 142 To compile it as a module, choose M here. If unsure, say N.
153 143
@@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT
206 196
207config NF_NAT_SNMP_BASIC 197config NF_NAT_SNMP_BASIC
208 tristate "Basic SNMP-ALG support" 198 tristate "Basic SNMP-ALG support"
209 depends on NF_NAT 199 depends on NF_CONNTRACK_SNMP && NF_NAT
210 depends on NETFILTER_ADVANCED 200 depends on NETFILTER_ADVANCED
201 default NF_NAT && NF_CONNTRACK_SNMP
211 ---help--- 202 ---help---
212 203
213 This module implements an Application Layer Gateway (ALG) for 204 This module implements an Application Layer Gateway (ALG) for
@@ -324,10 +315,10 @@ config IP_NF_TARGET_ECN
324 315
325config IP_NF_TARGET_TTL 316config IP_NF_TARGET_TTL
326 tristate '"TTL" target support' 317 tristate '"TTL" target support'
327 depends on NETFILTER_ADVANCED 318 depends on NETFILTER_ADVANCED && IP_NF_MANGLE
328 select NETFILTER_XT_TARGET_HL 319 select NETFILTER_XT_TARGET_HL
329 ---help--- 320 ---help---
330 This is a backwards-compat option for the user's convenience 321 This is a backwards-compatible option for the user's convenience
331 (e.g. when running oldconfig). It selects 322 (e.g. when running oldconfig). It selects
332 CONFIG_NETFILTER_XT_TARGET_HL. 323 CONFIG_NETFILTER_XT_TARGET_HL.
333 324
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 48111594ee9b..dca2082ec683 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,15 +3,15 @@
3# 3#
4 4
5# objects for l3 independent conntrack 5# objects for l3 independent conntrack
6nf_conntrack_ipv4-objs := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o 6nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y) 7ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
8ifeq ($(CONFIG_PROC_FS),y) 8ifeq ($(CONFIG_PROC_FS),y)
9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o 9nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-objs := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o 13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-objs := nf_nat_rule.o nf_nat_standalone.o 14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15 15
16# connection tracking 16# connection tracking
17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 49
50# matches 50# matches
51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
54 53
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e8f4f9a57f12..fd7a3f68917f 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,11 +72,11 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
72 for (i = 0; i < len; i++) 72 for (i = 0; i < len; i++)
73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; 73 ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
74 74
75 return (ret != 0); 75 return ret != 0;
76} 76}
77 77
78/* 78/*
79 * Unfortunatly, _b and _mask are not aligned to an int (or long int) 79 * Unfortunately, _b and _mask are not aligned to an int (or long int)
80 * Some arches dont care, unrolling the loop is a win on them. 80 * Some arches dont care, unrolling the loop is a win on them.
81 * For other arches, we only have a 16bit alignement. 81 * For other arches, we only have a 16bit alignement.
82 */ 82 */
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
228 return NF_DROP; 228 return NF_DROP;
229} 229}
230 230
231static inline const struct arpt_entry_target * 231static inline const struct xt_entry_target *
232arpt_get_target_c(const struct arpt_entry *e) 232arpt_get_target_c(const struct arpt_entry *e)
233{ 233{
234 return arpt_get_target((struct arpt_entry *)e); 234 return arpt_get_target((struct arpt_entry *)e);
@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
260 void *table_base; 260 void *table_base;
261 const struct xt_table_info *private; 261 const struct xt_table_info *private;
262 struct xt_action_param acpar; 262 struct xt_action_param acpar;
263 unsigned int addend;
263 264
264 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) 265 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
265 return NF_DROP; 266 return NF_DROP;
@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
267 indev = in ? in->name : nulldevname; 268 indev = in ? in->name : nulldevname;
268 outdev = out ? out->name : nulldevname; 269 outdev = out ? out->name : nulldevname;
269 270
270 xt_info_rdlock_bh(); 271 local_bh_disable();
272 addend = xt_write_recseq_begin();
271 private = table->private; 273 private = table->private;
272 table_base = private->entries[smp_processor_id()]; 274 table_base = private->entries[smp_processor_id()];
273 275
@@ -282,7 +284,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
282 284
283 arp = arp_hdr(skb); 285 arp = arp_hdr(skb);
284 do { 286 do {
285 const struct arpt_entry_target *t; 287 const struct xt_entry_target *t;
286 288
287 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { 289 if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
288 e = arpt_next_entry(e); 290 e = arpt_next_entry(e);
@@ -297,10 +299,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
297 if (!t->u.kernel.target->target) { 299 if (!t->u.kernel.target->target) {
298 int v; 300 int v;
299 301
300 v = ((struct arpt_standard_target *)t)->verdict; 302 v = ((struct xt_standard_target *)t)->verdict;
301 if (v < 0) { 303 if (v < 0) {
302 /* Pop from stack? */ 304 /* Pop from stack? */
303 if (v != ARPT_RETURN) { 305 if (v != XT_RETURN) {
304 verdict = (unsigned)(-v) - 1; 306 verdict = (unsigned)(-v) - 1;
305 break; 307 break;
306 } 308 }
@@ -332,13 +334,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,
332 /* Target might have changed stuff. */ 334 /* Target might have changed stuff. */
333 arp = arp_hdr(skb); 335 arp = arp_hdr(skb);
334 336
335 if (verdict == ARPT_CONTINUE) 337 if (verdict == XT_CONTINUE)
336 e = arpt_next_entry(e); 338 e = arpt_next_entry(e);
337 else 339 else
338 /* Verdict */ 340 /* Verdict */
339 break; 341 break;
340 } while (!acpar.hotdrop); 342 } while (!acpar.hotdrop);
341 xt_info_rdunlock_bh(); 343 xt_write_recseq_end(addend);
344 local_bh_enable();
342 345
343 if (acpar.hotdrop) 346 if (acpar.hotdrop)
344 return NF_DROP; 347 return NF_DROP;
@@ -377,7 +380,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
377 e->counters.pcnt = pos; 380 e->counters.pcnt = pos;
378 381
379 for (;;) { 382 for (;;) {
380 const struct arpt_standard_target *t 383 const struct xt_standard_target *t
381 = (void *)arpt_get_target_c(e); 384 = (void *)arpt_get_target_c(e);
382 int visited = e->comefrom & (1 << hook); 385 int visited = e->comefrom & (1 << hook);
383 386
@@ -392,13 +395,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
392 /* Unconditional return/END. */ 395 /* Unconditional return/END. */
393 if ((e->target_offset == sizeof(struct arpt_entry) && 396 if ((e->target_offset == sizeof(struct arpt_entry) &&
394 (strcmp(t->target.u.user.name, 397 (strcmp(t->target.u.user.name,
395 ARPT_STANDARD_TARGET) == 0) && 398 XT_STANDARD_TARGET) == 0) &&
396 t->verdict < 0 && unconditional(&e->arp)) || 399 t->verdict < 0 && unconditional(&e->arp)) ||
397 visited) { 400 visited) {
398 unsigned int oldpos, size; 401 unsigned int oldpos, size;
399 402
400 if ((strcmp(t->target.u.user.name, 403 if ((strcmp(t->target.u.user.name,
401 ARPT_STANDARD_TARGET) == 0) && 404 XT_STANDARD_TARGET) == 0) &&
402 t->verdict < -NF_MAX_VERDICT - 1) { 405 t->verdict < -NF_MAX_VERDICT - 1) {
403 duprintf("mark_source_chains: bad " 406 duprintf("mark_source_chains: bad "
404 "negative verdict (%i)\n", 407 "negative verdict (%i)\n",
@@ -433,7 +436,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
433 int newpos = t->verdict; 436 int newpos = t->verdict;
434 437
435 if (strcmp(t->target.u.user.name, 438 if (strcmp(t->target.u.user.name,
436 ARPT_STANDARD_TARGET) == 0 && 439 XT_STANDARD_TARGET) == 0 &&
437 newpos >= 0) { 440 newpos >= 0) {
438 if (newpos > newinfo->size - 441 if (newpos > newinfo->size -
439 sizeof(struct arpt_entry)) { 442 sizeof(struct arpt_entry)) {
@@ -464,14 +467,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
464 467
465static inline int check_entry(const struct arpt_entry *e, const char *name) 468static inline int check_entry(const struct arpt_entry *e, const char *name)
466{ 469{
467 const struct arpt_entry_target *t; 470 const struct xt_entry_target *t;
468 471
469 if (!arp_checkentry(&e->arp)) { 472 if (!arp_checkentry(&e->arp)) {
470 duprintf("arp_tables: arp check failed %p %s.\n", e, name); 473 duprintf("arp_tables: arp check failed %p %s.\n", e, name);
471 return -EINVAL; 474 return -EINVAL;
472 } 475 }
473 476
474 if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) 477 if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
475 return -EINVAL; 478 return -EINVAL;
476 479
477 t = arpt_get_target_c(e); 480 t = arpt_get_target_c(e);
@@ -483,7 +486,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
483 486
484static inline int check_target(struct arpt_entry *e, const char *name) 487static inline int check_target(struct arpt_entry *e, const char *name)
485{ 488{
486 struct arpt_entry_target *t = arpt_get_target(e); 489 struct xt_entry_target *t = arpt_get_target(e);
487 int ret; 490 int ret;
488 struct xt_tgchk_param par = { 491 struct xt_tgchk_param par = {
489 .table = name, 492 .table = name,
@@ -506,7 +509,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
506static inline int 509static inline int
507find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) 510find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
508{ 511{
509 struct arpt_entry_target *t; 512 struct xt_entry_target *t;
510 struct xt_target *target; 513 struct xt_target *target;
511 int ret; 514 int ret;
512 515
@@ -536,7 +539,7 @@ out:
536 539
537static bool check_underflow(const struct arpt_entry *e) 540static bool check_underflow(const struct arpt_entry *e)
538{ 541{
539 const struct arpt_entry_target *t; 542 const struct xt_entry_target *t;
540 unsigned int verdict; 543 unsigned int verdict;
541 544
542 if (!unconditional(&e->arp)) 545 if (!unconditional(&e->arp))
@@ -544,7 +547,7 @@ static bool check_underflow(const struct arpt_entry *e)
544 t = arpt_get_target_c(e); 547 t = arpt_get_target_c(e);
545 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 548 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
546 return false; 549 return false;
547 verdict = ((struct arpt_standard_target *)t)->verdict; 550 verdict = ((struct xt_standard_target *)t)->verdict;
548 verdict = -verdict - 1; 551 verdict = -verdict - 1;
549 return verdict == NF_DROP || verdict == NF_ACCEPT; 552 return verdict == NF_DROP || verdict == NF_ACCEPT;
550} 553}
@@ -566,7 +569,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
566 } 569 }
567 570
568 if (e->next_offset 571 if (e->next_offset
569 < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { 572 < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
570 duprintf("checking: element %p size %u\n", 573 duprintf("checking: element %p size %u\n",
571 e, e->next_offset); 574 e, e->next_offset);
572 return -EINVAL; 575 return -EINVAL;
@@ -598,7 +601,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
598static inline void cleanup_entry(struct arpt_entry *e) 601static inline void cleanup_entry(struct arpt_entry *e)
599{ 602{
600 struct xt_tgdtor_param par; 603 struct xt_tgdtor_param par;
601 struct arpt_entry_target *t; 604 struct xt_entry_target *t;
602 605
603 t = arpt_get_target(e); 606 t = arpt_get_target(e);
604 par.target = t->u.kernel.target; 607 par.target = t->u.kernel.target;
@@ -710,42 +713,25 @@ static void get_counters(const struct xt_table_info *t,
710 struct arpt_entry *iter; 713 struct arpt_entry *iter;
711 unsigned int cpu; 714 unsigned int cpu;
712 unsigned int i; 715 unsigned int i;
713 unsigned int curcpu = get_cpu();
714
715 /* Instead of clearing (by a previous call to memset())
716 * the counters and using adds, we set the counters
717 * with data used by 'current' CPU
718 *
719 * Bottom half has to be disabled to prevent deadlock
720 * if new softirq were to run and call ipt_do_table
721 */
722 local_bh_disable();
723 i = 0;
724 xt_entry_foreach(iter, t->entries[curcpu], t->size) {
725 SET_COUNTER(counters[i], iter->counters.bcnt,
726 iter->counters.pcnt);
727 ++i;
728 }
729 local_bh_enable();
730 /* Processing counters from other cpus, we can let bottom half enabled,
731 * (preemption is disabled)
732 */
733 716
734 for_each_possible_cpu(cpu) { 717 for_each_possible_cpu(cpu) {
735 if (cpu == curcpu) 718 seqcount_t *s = &per_cpu(xt_recseq, cpu);
736 continue; 719
737 i = 0; 720 i = 0;
738 local_bh_disable();
739 xt_info_wrlock(cpu);
740 xt_entry_foreach(iter, t->entries[cpu], t->size) { 721 xt_entry_foreach(iter, t->entries[cpu], t->size) {
741 ADD_COUNTER(counters[i], iter->counters.bcnt, 722 u64 bcnt, pcnt;
742 iter->counters.pcnt); 723 unsigned int start;
724
725 do {
726 start = read_seqcount_begin(s);
727 bcnt = iter->counters.bcnt;
728 pcnt = iter->counters.pcnt;
729 } while (read_seqcount_retry(s, start));
730
731 ADD_COUNTER(counters[i], bcnt, pcnt);
743 ++i; 732 ++i;
744 } 733 }
745 xt_info_wrunlock(cpu);
746 local_bh_enable();
747 } 734 }
748 put_cpu();
749} 735}
750 736
751static struct xt_counters *alloc_counters(const struct xt_table *table) 737static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +745,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
759 * about). 745 * about).
760 */ 746 */
761 countersize = sizeof(struct xt_counters) * private->number; 747 countersize = sizeof(struct xt_counters) * private->number;
762 counters = vmalloc(countersize); 748 counters = vzalloc(countersize);
763 749
764 if (counters == NULL) 750 if (counters == NULL)
765 return ERR_PTR(-ENOMEM); 751 return ERR_PTR(-ENOMEM);
@@ -794,7 +780,7 @@ static int copy_entries_to_user(unsigned int total_size,
794 /* FIXME: use iterator macros --RR */ 780 /* FIXME: use iterator macros --RR */
795 /* ... then go back and fix counters and names */ 781 /* ... then go back and fix counters and names */
796 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 782 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
797 const struct arpt_entry_target *t; 783 const struct xt_entry_target *t;
798 784
799 e = (struct arpt_entry *)(loc_cpu_entry + off); 785 e = (struct arpt_entry *)(loc_cpu_entry + off);
800 if (copy_to_user(userptr + off 786 if (copy_to_user(userptr + off
@@ -807,7 +793,7 @@ static int copy_entries_to_user(unsigned int total_size,
807 793
808 t = arpt_get_target_c(e); 794 t = arpt_get_target_c(e);
809 if (copy_to_user(userptr + off + e->target_offset 795 if (copy_to_user(userptr + off + e->target_offset
810 + offsetof(struct arpt_entry_target, 796 + offsetof(struct xt_entry_target,
811 u.user.name), 797 u.user.name),
812 t->u.kernel.target->name, 798 t->u.kernel.target->name,
813 strlen(t->u.kernel.target->name)+1) != 0) { 799 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -844,7 +830,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
844 const struct xt_table_info *info, 830 const struct xt_table_info *info,
845 const void *base, struct xt_table_info *newinfo) 831 const void *base, struct xt_table_info *newinfo)
846{ 832{
847 const struct arpt_entry_target *t; 833 const struct xt_entry_target *t;
848 unsigned int entry_offset; 834 unsigned int entry_offset;
849 int off, i, ret; 835 int off, i, ret;
850 836
@@ -883,6 +869,7 @@ static int compat_table_info(const struct xt_table_info *info,
883 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 869 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
884 newinfo->initial_entries = 0; 870 newinfo->initial_entries = 0;
885 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 871 loc_cpu_entry = info->entries[raw_smp_processor_id()];
872 xt_compat_init_offsets(NFPROTO_ARP, info->number);
886 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 873 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
887 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 874 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
888 if (ret != 0) 875 if (ret != 0)
@@ -895,7 +882,7 @@ static int compat_table_info(const struct xt_table_info *info,
895static int get_info(struct net *net, void __user *user, 882static int get_info(struct net *net, void __user *user,
896 const int *len, int compat) 883 const int *len, int compat)
897{ 884{
898 char name[ARPT_TABLE_MAXNAMELEN]; 885 char name[XT_TABLE_MAXNAMELEN];
899 struct xt_table *t; 886 struct xt_table *t;
900 int ret; 887 int ret;
901 888
@@ -908,7 +895,7 @@ static int get_info(struct net *net, void __user *user,
908 if (copy_from_user(name, user, sizeof(name)) != 0) 895 if (copy_from_user(name, user, sizeof(name)) != 0)
909 return -EFAULT; 896 return -EFAULT;
910 897
911 name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; 898 name[XT_TABLE_MAXNAMELEN-1] = '\0';
912#ifdef CONFIG_COMPAT 899#ifdef CONFIG_COMPAT
913 if (compat) 900 if (compat)
914 xt_compat_lock(NFPROTO_ARP); 901 xt_compat_lock(NFPROTO_ARP);
@@ -927,6 +914,7 @@ static int get_info(struct net *net, void __user *user,
927 private = &tmp; 914 private = &tmp;
928 } 915 }
929#endif 916#endif
917 memset(&info, 0, sizeof(info));
930 info.valid_hooks = t->valid_hooks; 918 info.valid_hooks = t->valid_hooks;
931 memcpy(info.hook_entry, private->hook_entry, 919 memcpy(info.hook_entry, private->hook_entry,
932 sizeof(info.hook_entry)); 920 sizeof(info.hook_entry));
@@ -1006,7 +994,7 @@ static int __do_replace(struct net *net, const char *name,
1006 struct arpt_entry *iter; 994 struct arpt_entry *iter;
1007 995
1008 ret = 0; 996 ret = 0;
1009 counters = vmalloc(num_counters * sizeof(struct xt_counters)); 997 counters = vzalloc(num_counters * sizeof(struct xt_counters));
1010 if (!counters) { 998 if (!counters) {
1011 ret = -ENOMEM; 999 ret = -ENOMEM;
1012 goto out; 1000 goto out;
@@ -1081,6 +1069,7 @@ static int do_replace(struct net *net, const void __user *user,
1081 /* overflow check */ 1069 /* overflow check */
1082 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1070 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1083 return -ENOMEM; 1071 return -ENOMEM;
1072 tmp.name[sizeof(tmp.name)-1] = 0;
1084 1073
1085 newinfo = xt_alloc_table_info(tmp.size); 1074 newinfo = xt_alloc_table_info(tmp.size);
1086 if (!newinfo) 1075 if (!newinfo)
@@ -1129,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1129 int ret = 0; 1118 int ret = 0;
1130 void *loc_cpu_entry; 1119 void *loc_cpu_entry;
1131 struct arpt_entry *iter; 1120 struct arpt_entry *iter;
1121 unsigned int addend;
1132#ifdef CONFIG_COMPAT 1122#ifdef CONFIG_COMPAT
1133 struct compat_xt_counters_info compat_tmp; 1123 struct compat_xt_counters_info compat_tmp;
1134 1124
@@ -1185,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user,
1185 /* Choose the copy that is on our node */ 1175 /* Choose the copy that is on our node */
1186 curcpu = smp_processor_id(); 1176 curcpu = smp_processor_id();
1187 loc_cpu_entry = private->entries[curcpu]; 1177 loc_cpu_entry = private->entries[curcpu];
1188 xt_info_wrlock(curcpu); 1178 addend = xt_write_recseq_begin();
1189 xt_entry_foreach(iter, loc_cpu_entry, private->size) { 1179 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1190 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); 1180 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1191 ++i; 1181 ++i;
1192 } 1182 }
1193 xt_info_wrunlock(curcpu); 1183 xt_write_recseq_end(addend);
1194 unlock_up_free: 1184 unlock_up_free:
1195 local_bh_enable(); 1185 local_bh_enable();
1196 xt_table_unlock(t); 1186 xt_table_unlock(t);
@@ -1204,7 +1194,7 @@ static int do_add_counters(struct net *net, const void __user *user,
1204#ifdef CONFIG_COMPAT 1194#ifdef CONFIG_COMPAT
1205static inline void compat_release_entry(struct compat_arpt_entry *e) 1195static inline void compat_release_entry(struct compat_arpt_entry *e)
1206{ 1196{
1207 struct arpt_entry_target *t; 1197 struct xt_entry_target *t;
1208 1198
1209 t = compat_arpt_get_target(e); 1199 t = compat_arpt_get_target(e);
1210 module_put(t->u.kernel.target->me); 1200 module_put(t->u.kernel.target->me);
@@ -1220,7 +1210,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1220 const unsigned int *underflows, 1210 const unsigned int *underflows,
1221 const char *name) 1211 const char *name)
1222{ 1212{
1223 struct arpt_entry_target *t; 1213 struct xt_entry_target *t;
1224 struct xt_target *target; 1214 struct xt_target *target;
1225 unsigned int entry_offset; 1215 unsigned int entry_offset;
1226 int ret, off, h; 1216 int ret, off, h;
@@ -1288,7 +1278,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
1288 unsigned int *size, const char *name, 1278 unsigned int *size, const char *name,
1289 struct xt_table_info *newinfo, unsigned char *base) 1279 struct xt_table_info *newinfo, unsigned char *base)
1290{ 1280{
1291 struct arpt_entry_target *t; 1281 struct xt_entry_target *t;
1292 struct xt_target *target; 1282 struct xt_target *target;
1293 struct arpt_entry *de; 1283 struct arpt_entry *de;
1294 unsigned int origsize; 1284 unsigned int origsize;
@@ -1349,6 +1339,7 @@ static int translate_compat_table(const char *name,
1349 duprintf("translate_compat_table: size %u\n", info->size); 1339 duprintf("translate_compat_table: size %u\n", info->size);
1350 j = 0; 1340 j = 0;
1351 xt_compat_lock(NFPROTO_ARP); 1341 xt_compat_lock(NFPROTO_ARP);
1342 xt_compat_init_offsets(NFPROTO_ARP, number);
1352 /* Walk through entries, checking offsets. */ 1343 /* Walk through entries, checking offsets. */
1353 xt_entry_foreach(iter0, entry0, total_size) { 1344 xt_entry_foreach(iter0, entry0, total_size) {
1354 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1345 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1474,7 +1465,7 @@ out_unlock:
1474} 1465}
1475 1466
1476struct compat_arpt_replace { 1467struct compat_arpt_replace {
1477 char name[ARPT_TABLE_MAXNAMELEN]; 1468 char name[XT_TABLE_MAXNAMELEN];
1478 u32 valid_hooks; 1469 u32 valid_hooks;
1479 u32 num_entries; 1470 u32 num_entries;
1480 u32 size; 1471 u32 size;
@@ -1502,6 +1493,7 @@ static int compat_do_replace(struct net *net, void __user *user,
1502 return -ENOMEM; 1493 return -ENOMEM;
1503 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1494 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1504 return -ENOMEM; 1495 return -ENOMEM;
1496 tmp.name[sizeof(tmp.name)-1] = 0;
1505 1497
1506 newinfo = xt_alloc_table_info(tmp.size); 1498 newinfo = xt_alloc_table_info(tmp.size);
1507 if (!newinfo) 1499 if (!newinfo)
@@ -1567,7 +1559,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
1567 struct xt_counters *counters, 1559 struct xt_counters *counters,
1568 unsigned int i) 1560 unsigned int i)
1569{ 1561{
1570 struct arpt_entry_target *t; 1562 struct xt_entry_target *t;
1571 struct compat_arpt_entry __user *ce; 1563 struct compat_arpt_entry __user *ce;
1572 u_int16_t target_offset, next_offset; 1564 u_int16_t target_offset, next_offset;
1573 compat_uint_t origsize; 1565 compat_uint_t origsize;
@@ -1628,7 +1620,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
1628} 1620}
1629 1621
1630struct compat_arpt_get_entries { 1622struct compat_arpt_get_entries {
1631 char name[ARPT_TABLE_MAXNAMELEN]; 1623 char name[XT_TABLE_MAXNAMELEN];
1632 compat_uint_t size; 1624 compat_uint_t size;
1633 struct compat_arpt_entry entrytable[0]; 1625 struct compat_arpt_entry entrytable[0];
1634}; 1626};
@@ -1754,6 +1746,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1754 ret = -EFAULT; 1746 ret = -EFAULT;
1755 break; 1747 break;
1756 } 1748 }
1749 rev.name[sizeof(rev.name)-1] = 0;
1757 1750
1758 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, 1751 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
1759 rev.revision, 1, &ret), 1752 rev.revision, 1, &ret),
@@ -1828,7 +1821,7 @@ void arpt_unregister_table(struct xt_table *table)
1828/* The built-in targets: standard (NULL) and error. */ 1821/* The built-in targets: standard (NULL) and error. */
1829static struct xt_target arpt_builtin_tg[] __read_mostly = { 1822static struct xt_target arpt_builtin_tg[] __read_mostly = {
1830 { 1823 {
1831 .name = ARPT_STANDARD_TARGET, 1824 .name = XT_STANDARD_TARGET,
1832 .targetsize = sizeof(int), 1825 .targetsize = sizeof(int),
1833 .family = NFPROTO_ARP, 1826 .family = NFPROTO_ARP,
1834#ifdef CONFIG_COMPAT 1827#ifdef CONFIG_COMPAT
@@ -1838,9 +1831,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
1838#endif 1831#endif
1839 }, 1832 },
1840 { 1833 {
1841 .name = ARPT_ERROR_TARGET, 1834 .name = XT_ERROR_TARGET,
1842 .target = arpt_error, 1835 .target = arpt_error,
1843 .targetsize = ARPT_FUNCTION_MAXNAMELEN, 1836 .targetsize = XT_FUNCTION_MAXNAMELEN,
1844 .family = NFPROTO_ARP, 1837 .family = NFPROTO_ARP,
1845 }, 1838 },
1846}; 1839};
@@ -1885,7 +1878,7 @@ static int __init arp_tables_init(void)
1885 if (ret < 0) 1878 if (ret < 0)
1886 goto err1; 1879 goto err1;
1887 1880
1888 /* Noone else will be downing sem now, so we won't sleep */ 1881 /* No one else will be downing sem now, so we won't sleep */
1889 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); 1882 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1890 if (ret < 0) 1883 if (ret < 0)
1891 goto err2; 1884 goto err2;
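[Editorial note] In arp_tables.c (and ip_tables.c further down), the per-CPU rwlock scheme (xt_info_rdlock_bh()/xt_info_wrlock()) is replaced by a per-CPU seqcount: the packet path brackets counter updates with xt_write_recseq_begin()/xt_write_recseq_end(), and get_counters() takes a lock-free snapshot with a retry loop. A minimal sketch of that reader loop, assuming kernel context; the demo types stand in for the per-CPU xt_recseq and xt_counters used above.

    #include <linux/seqlock.h>
    #include <linux/types.h>

    struct demo_counter {
            u64 bcnt;
            u64 pcnt;
    };

    static void demo_read_counter(const seqcount_t *s,
                                  const struct demo_counter *c,
                                  struct demo_counter *snap)
    {
            unsigned int start;

            do {
                    start = read_seqcount_begin(s);  /* writer bumps the sequence */
                    snap->bcnt = c->bcnt;
                    snap->pcnt = c->pcnt;
            } while (read_seqcount_retry(s, start)); /* retry if a writer raced */
    }

This is why the old "copy current CPU first, then wrlock the others" block in get_counters() is simply deleted: a torn read is detected by the sequence check and redone, so no per-CPU write lock is needed just to read.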
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171b..a5e52a9f0a12 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par)
60 60
61 if (mangle->flags & ~ARPT_MANGLE_MASK || 61 if (mangle->flags & ~ARPT_MANGLE_MASK ||
62 !(mangle->flags & ARPT_MANGLE_MASK)) 62 !(mangle->flags & ARPT_MANGLE_MASK))
63 return false; 63 return -EINVAL;
64 64
65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && 65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
66 mangle->target != ARPT_CONTINUE) 66 mangle->target != XT_CONTINUE)
67 return false; 67 return -EINVAL;
68 return true; 68 return 0;
69} 69}
70 70
71static struct xt_target arpt_mangle_reg __read_mostly = { 71static struct xt_target arpt_mangle_reg __read_mostly = {
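[Editorial note] The arpt_mangle checkentry() change above follows the xtables convention where check hooks return 0 or a negative errno rather than a bool. A small sketch of that convention, with purely hypothetical demo fields:

    #include <linux/errno.h>

    struct demo_tginfo {
            unsigned int flags;
    };
    #define DEMO_VALID_FLAGS 0x3

    static int demo_checkentry(const struct demo_tginfo *info)
    {
            if (info->flags & ~DEMO_VALID_FLAGS)
                    return -EINVAL;  /* was "return false" under the old bool API */
            return 0;                /* was "return true" */
    }

The errno form lets the core report a meaningful error to userspace instead of collapsing every rejection to the same failure.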
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index d2c1311cb28d..5c9b9d963918 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -203,7 +203,8 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
203 else 203 else
204 pmsg->outdev_name[0] = '\0'; 204 pmsg->outdev_name[0] = '\0';
205 205
206 if (entry->indev && entry->skb->dev) { 206 if (entry->indev && entry->skb->dev &&
207 entry->skb->mac_header != entry->skb->network_header) {
207 pmsg->hw_type = entry->skb->dev->type; 208 pmsg->hw_type = entry->skb->dev->type;
208 pmsg->hw_addrlen = dev_parse_header(entry->skb, 209 pmsg->hw_addrlen = dev_parse_header(entry->skb,
209 pmsg->hw_addr); 210 pmsg->hw_addr);
@@ -402,7 +403,8 @@ ipq_dev_drop(int ifindex)
402static inline void 403static inline void
403__ipq_rcv_skb(struct sk_buff *skb) 404__ipq_rcv_skb(struct sk_buff *skb)
404{ 405{
405 int status, type, pid, flags, nlmsglen, skblen; 406 int status, type, pid, flags;
407 unsigned int nlmsglen, skblen;
406 struct nlmsghdr *nlh; 408 struct nlmsghdr *nlh;
407 409
408 skblen = skb->len; 410 skblen = skb->len;
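[Editorial note] The ip_queue.c hunk above only copies a hardware address into the queue message when the skb actually carries a link-layer header distinct from the network header. A minimal sketch of that guard, assuming kernel context; demo_copy_hw_addr() is illustrative, not the ip_queue code itself.

    #include <linux/skbuff.h>
    #include <linux/netdevice.h>

    static int demo_copy_hw_addr(const struct sk_buff *skb, unsigned char *buf)
    {
            /* Locally generated packets may have no MAC header; without this
             * check dev_parse_header() would read whatever sits there. */
            if (!skb->dev || skb->mac_header == skb->network_header)
                    return 0;
            return dev_parse_header(skb, buf);
    }

The companion change, making nlmsglen/skblen unsigned, keeps the length comparisons in __ipq_rcv_skb() well-defined for oversized or malformed netlink messages.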
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d163f2e3b2e9..24e556e83a3b 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)
68} 68}
69EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); 69EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
70 70
71/*
72 We keep a set of rules for each CPU, so we can avoid write-locking
73 them in the softirq when updating the counters and therefore
74 only need to read-lock in the softirq; doing a write_lock_bh() in user
75 context stops packets coming through and allows user context to read
76 the counters or update the rules.
77
78 Hence the start of any table is given by get_table() below. */
79
80/* Returns whether matches rule or not. */ 71/* Returns whether matches rule or not. */
81/* Performance critical - called for every packet */ 72/* Performance critical - called for every packet */
82static inline bool 73static inline bool
@@ -186,7 +177,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
186} 177}
187 178
188/* for const-correctness */ 179/* for const-correctness */
189static inline const struct ipt_entry_target * 180static inline const struct xt_entry_target *
190ipt_get_target_c(const struct ipt_entry *e) 181ipt_get_target_c(const struct ipt_entry *e)
191{ 182{
192 return ipt_get_target((struct ipt_entry *)e); 183 return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +221,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
230 const char *hookname, const char **chainname, 221 const char *hookname, const char **chainname,
231 const char **comment, unsigned int *rulenum) 222 const char **comment, unsigned int *rulenum)
232{ 223{
233 const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); 224 const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
234 225
235 if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { 226 if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
236 /* Head of user chain: ERROR target with chainname */ 227 /* Head of user chain: ERROR target with chainname */
237 *chainname = t->target.data; 228 *chainname = t->target.data;
238 (*rulenum) = 0; 229 (*rulenum) = 0;
@@ -241,7 +232,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
241 232
242 if (s->target_offset == sizeof(struct ipt_entry) && 233 if (s->target_offset == sizeof(struct ipt_entry) &&
243 strcmp(t->target.u.kernel.target->name, 234 strcmp(t->target.u.kernel.target->name,
244 IPT_STANDARD_TARGET) == 0 && 235 XT_STANDARD_TARGET) == 0 &&
245 t->verdict < 0 && 236 t->verdict < 0 &&
246 unconditional(&s->ip)) { 237 unconditional(&s->ip)) {
247 /* Tail of chains: STANDARD target (return/policy) */ 238 /* Tail of chains: STANDARD target (return/policy) */
@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,
311 unsigned int *stackptr, origptr, cpu; 302 unsigned int *stackptr, origptr, cpu;
312 const struct xt_table_info *private; 303 const struct xt_table_info *private;
313 struct xt_action_param acpar; 304 struct xt_action_param acpar;
305 unsigned int addend;
314 306
315 /* Initialization */ 307 /* Initialization */
316 ip = ip_hdr(skb); 308 ip = ip_hdr(skb);
@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb,
331 acpar.hooknum = hook; 323 acpar.hooknum = hook;
332 324
333 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 325 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
334 xt_info_rdlock_bh(); 326 local_bh_disable();
327 addend = xt_write_recseq_begin();
335 private = table->private; 328 private = table->private;
336 cpu = smp_processor_id(); 329 cpu = smp_processor_id();
337 table_base = private->entries[cpu]; 330 table_base = private->entries[cpu];
@@ -346,7 +339,7 @@ ipt_do_table(struct sk_buff *skb,
346 get_entry(table_base, private->underflow[hook])); 339 get_entry(table_base, private->underflow[hook]));
347 340
348 do { 341 do {
349 const struct ipt_entry_target *t; 342 const struct xt_entry_target *t;
350 const struct xt_entry_match *ematch; 343 const struct xt_entry_match *ematch;
351 344
352 IP_NF_ASSERT(e); 345 IP_NF_ASSERT(e);
@@ -380,14 +373,14 @@ ipt_do_table(struct sk_buff *skb,
380 if (!t->u.kernel.target->target) { 373 if (!t->u.kernel.target->target) {
381 int v; 374 int v;
382 375
383 v = ((struct ipt_standard_target *)t)->verdict; 376 v = ((struct xt_standard_target *)t)->verdict;
384 if (v < 0) { 377 if (v < 0) {
385 /* Pop from stack? */ 378 /* Pop from stack? */
386 if (v != IPT_RETURN) { 379 if (v != XT_RETURN) {
387 verdict = (unsigned)(-v) - 1; 380 verdict = (unsigned)(-v) - 1;
388 break; 381 break;
389 } 382 }
390 if (*stackptr == 0) { 383 if (*stackptr <= origptr) {
391 e = get_entry(table_base, 384 e = get_entry(table_base,
392 private->underflow[hook]); 385 private->underflow[hook]);
393 pr_debug("Underflow (this is normal) " 386 pr_debug("Underflow (this is normal) "
@@ -421,16 +414,18 @@ ipt_do_table(struct sk_buff *skb,
421 verdict = t->u.kernel.target->target(skb, &acpar); 414 verdict = t->u.kernel.target->target(skb, &acpar);
422 /* Target might have changed stuff. */ 415 /* Target might have changed stuff. */
423 ip = ip_hdr(skb); 416 ip = ip_hdr(skb);
424 if (verdict == IPT_CONTINUE) 417 if (verdict == XT_CONTINUE)
425 e = ipt_next_entry(e); 418 e = ipt_next_entry(e);
426 else 419 else
427 /* Verdict */ 420 /* Verdict */
428 break; 421 break;
429 } while (!acpar.hotdrop); 422 } while (!acpar.hotdrop);
430 xt_info_rdunlock_bh();
431 pr_debug("Exiting %s; resetting sp from %u to %u\n", 423 pr_debug("Exiting %s; resetting sp from %u to %u\n",
432 __func__, *stackptr, origptr); 424 __func__, *stackptr, origptr);
433 *stackptr = origptr; 425 *stackptr = origptr;
426 xt_write_recseq_end(addend);
427 local_bh_enable();
428
434#ifdef DEBUG_ALLOW_ALL 429#ifdef DEBUG_ALLOW_ALL
435 return NF_ACCEPT; 430 return NF_ACCEPT;
436#else 431#else
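
The hunk above replaces xt_info_rdlock_bh()/xt_info_rdunlock_bh() with local_bh_disable() plus xt_write_recseq_begin()/xt_write_recseq_end(): while traversing the table, the packet path now only bumps a per-CPU sequence counter around its counter updates instead of taking a recursive read lock. A minimal userspace sketch of the writer side of that protocol follows; the names, the single global counter and the lack of per-CPU state are illustrative only, and the real xt_write_recseq pair additionally handles re-entrant table traversal via the returned "addend".

#include <stdint.h>
#include <stdio.h>

struct counters { uint64_t bytes, pkts; };

static unsigned int seq;            /* per-CPU in the kernel; one copy here */
static struct counters cnt;

/* Writer: make the counter pair look atomic to readers that sample
 * 'seq' before and after copying the counters. */
static void count_packet(unsigned int len)
{
	seq++;                  /* odd: update in progress */
	__sync_synchronize();   /* order counter stores after the bump */
	cnt.bytes += len;
	cnt.pkts  += 1;
	__sync_synchronize();
	seq++;                  /* even again: update complete */
}

int main(void)
{
	count_packet(1500);
	count_packet(40);
	printf("%llu packets, %llu bytes\n",
	       (unsigned long long)cnt.pkts, (unsigned long long)cnt.bytes);
	return 0;
}
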
@@ -461,7 +456,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
461 e->counters.pcnt = pos; 456 e->counters.pcnt = pos;
462 457
463 for (;;) { 458 for (;;) {
464 const struct ipt_standard_target *t 459 const struct xt_standard_target *t
465 = (void *)ipt_get_target_c(e); 460 = (void *)ipt_get_target_c(e);
466 int visited = e->comefrom & (1 << hook); 461 int visited = e->comefrom & (1 << hook);
467 462
@@ -475,13 +470,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
475 /* Unconditional return/END. */ 470 /* Unconditional return/END. */
476 if ((e->target_offset == sizeof(struct ipt_entry) && 471 if ((e->target_offset == sizeof(struct ipt_entry) &&
477 (strcmp(t->target.u.user.name, 472 (strcmp(t->target.u.user.name,
478 IPT_STANDARD_TARGET) == 0) && 473 XT_STANDARD_TARGET) == 0) &&
479 t->verdict < 0 && unconditional(&e->ip)) || 474 t->verdict < 0 && unconditional(&e->ip)) ||
480 visited) { 475 visited) {
481 unsigned int oldpos, size; 476 unsigned int oldpos, size;
482 477
483 if ((strcmp(t->target.u.user.name, 478 if ((strcmp(t->target.u.user.name,
484 IPT_STANDARD_TARGET) == 0) && 479 XT_STANDARD_TARGET) == 0) &&
485 t->verdict < -NF_MAX_VERDICT - 1) { 480 t->verdict < -NF_MAX_VERDICT - 1) {
486 duprintf("mark_source_chains: bad " 481 duprintf("mark_source_chains: bad "
487 "negative verdict (%i)\n", 482 "negative verdict (%i)\n",
@@ -524,7 +519,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
524 int newpos = t->verdict; 519 int newpos = t->verdict;
525 520
526 if (strcmp(t->target.u.user.name, 521 if (strcmp(t->target.u.user.name,
527 IPT_STANDARD_TARGET) == 0 && 522 XT_STANDARD_TARGET) == 0 &&
528 newpos >= 0) { 523 newpos >= 0) {
529 if (newpos > newinfo->size - 524 if (newpos > newinfo->size -
530 sizeof(struct ipt_entry)) { 525 sizeof(struct ipt_entry)) {
@@ -552,7 +547,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
552 return 1; 547 return 1;
553} 548}
554 549
555static void cleanup_match(struct ipt_entry_match *m, struct net *net) 550static void cleanup_match(struct xt_entry_match *m, struct net *net)
556{ 551{
557 struct xt_mtdtor_param par; 552 struct xt_mtdtor_param par;
558 553
@@ -568,14 +563,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
568static int 563static int
569check_entry(const struct ipt_entry *e, const char *name) 564check_entry(const struct ipt_entry *e, const char *name)
570{ 565{
571 const struct ipt_entry_target *t; 566 const struct xt_entry_target *t;
572 567
573 if (!ip_checkentry(&e->ip)) { 568 if (!ip_checkentry(&e->ip)) {
574 duprintf("ip check failed %p %s.\n", e, par->match->name); 569 duprintf("ip check failed %p %s.\n", e, name);
575 return -EINVAL; 570 return -EINVAL;
576 } 571 }
577 572
578 if (e->target_offset + sizeof(struct ipt_entry_target) > 573 if (e->target_offset + sizeof(struct xt_entry_target) >
579 e->next_offset) 574 e->next_offset)
580 return -EINVAL; 575 return -EINVAL;
581 576
@@ -587,7 +582,7 @@ check_entry(const struct ipt_entry *e, const char *name)
587} 582}
588 583
589static int 584static int
590check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 585check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
591{ 586{
592 const struct ipt_ip *ip = par->entryinfo; 587 const struct ipt_ip *ip = par->entryinfo;
593 int ret; 588 int ret;
@@ -605,7 +600,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
605} 600}
606 601
607static int 602static int
608find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) 603find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
609{ 604{
610 struct xt_match *match; 605 struct xt_match *match;
611 int ret; 606 int ret;
@@ -630,7 +625,7 @@ err:
630 625
631static int check_target(struct ipt_entry *e, struct net *net, const char *name) 626static int check_target(struct ipt_entry *e, struct net *net, const char *name)
632{ 627{
633 struct ipt_entry_target *t = ipt_get_target(e); 628 struct xt_entry_target *t = ipt_get_target(e);
634 struct xt_tgchk_param par = { 629 struct xt_tgchk_param par = {
635 .net = net, 630 .net = net,
636 .table = name, 631 .table = name,
@@ -656,7 +651,7 @@ static int
656find_check_entry(struct ipt_entry *e, struct net *net, const char *name, 651find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
657 unsigned int size) 652 unsigned int size)
658{ 653{
659 struct ipt_entry_target *t; 654 struct xt_entry_target *t;
660 struct xt_target *target; 655 struct xt_target *target;
661 int ret; 656 int ret;
662 unsigned int j; 657 unsigned int j;
@@ -707,7 +702,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
707 702
708static bool check_underflow(const struct ipt_entry *e) 703static bool check_underflow(const struct ipt_entry *e)
709{ 704{
710 const struct ipt_entry_target *t; 705 const struct xt_entry_target *t;
711 unsigned int verdict; 706 unsigned int verdict;
712 707
713 if (!unconditional(&e->ip)) 708 if (!unconditional(&e->ip))
@@ -715,7 +710,7 @@ static bool check_underflow(const struct ipt_entry *e)
715 t = ipt_get_target_c(e); 710 t = ipt_get_target_c(e);
716 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) 711 if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
717 return false; 712 return false;
718 verdict = ((struct ipt_standard_target *)t)->verdict; 713 verdict = ((struct xt_standard_target *)t)->verdict;
719 verdict = -verdict - 1; 714 verdict = -verdict - 1;
720 return verdict == NF_DROP || verdict == NF_ACCEPT; 715 return verdict == NF_DROP || verdict == NF_ACCEPT;
721} 716}
@@ -738,7 +733,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
738 } 733 }
739 734
740 if (e->next_offset 735 if (e->next_offset
741 < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { 736 < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
742 duprintf("checking: element %p size %u\n", 737 duprintf("checking: element %p size %u\n",
743 e, e->next_offset); 738 e, e->next_offset);
744 return -EINVAL; 739 return -EINVAL;
@@ -771,7 +766,7 @@ static void
771cleanup_entry(struct ipt_entry *e, struct net *net) 766cleanup_entry(struct ipt_entry *e, struct net *net)
772{ 767{
773 struct xt_tgdtor_param par; 768 struct xt_tgdtor_param par;
774 struct ipt_entry_target *t; 769 struct xt_entry_target *t;
775 struct xt_entry_match *ematch; 770 struct xt_entry_match *ematch;
776 771
777 /* Cleanup all matches */ 772 /* Cleanup all matches */
@@ -884,42 +879,25 @@ get_counters(const struct xt_table_info *t,
884 struct ipt_entry *iter; 879 struct ipt_entry *iter;
885 unsigned int cpu; 880 unsigned int cpu;
886 unsigned int i; 881 unsigned int i;
887 unsigned int curcpu = get_cpu();
888
889 /* Instead of clearing (by a previous call to memset())
890 * the counters and using adds, we set the counters
891 * with data used by 'current' CPU.
892 *
893 * Bottom half has to be disabled to prevent deadlock
894 * if new softirq were to run and call ipt_do_table
895 */
896 local_bh_disable();
897 i = 0;
898 xt_entry_foreach(iter, t->entries[curcpu], t->size) {
899 SET_COUNTER(counters[i], iter->counters.bcnt,
900 iter->counters.pcnt);
901 ++i;
902 }
903 local_bh_enable();
904 /* Processing counters from other cpus, we can let bottom half enabled,
905 * (preemption is disabled)
906 */
907 882
908 for_each_possible_cpu(cpu) { 883 for_each_possible_cpu(cpu) {
909 if (cpu == curcpu) 884 seqcount_t *s = &per_cpu(xt_recseq, cpu);
910 continue; 885
911 i = 0; 886 i = 0;
912 local_bh_disable();
913 xt_info_wrlock(cpu);
914 xt_entry_foreach(iter, t->entries[cpu], t->size) { 887 xt_entry_foreach(iter, t->entries[cpu], t->size) {
915 ADD_COUNTER(counters[i], iter->counters.bcnt, 888 u64 bcnt, pcnt;
916 iter->counters.pcnt); 889 unsigned int start;
890
891 do {
892 start = read_seqcount_begin(s);
893 bcnt = iter->counters.bcnt;
894 pcnt = iter->counters.pcnt;
895 } while (read_seqcount_retry(s, start));
896
897 ADD_COUNTER(counters[i], bcnt, pcnt);
917 ++i; /* macro does multi eval of i */ 898 ++i; /* macro does multi eval of i */
918 } 899 }
919 xt_info_wrunlock(cpu);
920 local_bh_enable();
921 } 900 }
922 put_cpu();
923} 901}
924 902
925static struct xt_counters *alloc_counters(const struct xt_table *table) 903static struct xt_counters *alloc_counters(const struct xt_table *table)
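
On the reader side, get_counters() now snapshots each CPU's 64-bit counters inside a read_seqcount_begin()/read_seqcount_retry() loop instead of write-locking every CPU, so summing counters for userspace no longer stalls packet processing; together with the switch to vzalloc() for the counter array just below, the special-cased "current CPU" pass also disappears. A hedged, self-contained sketch of the read-retry idiom (plain C, not the kernel seqcount_t API):

#include <stdint.h>
#include <stdio.h>

struct counters { uint64_t bytes, pkts; };

static volatile unsigned int seq;   /* bumped twice per writer update */
static struct counters cnt = { 60000, 40 };

/* Reader: retry until a snapshot was taken with no writer in progress
 * (seq even) and no writer completed in between (seq unchanged). */
static struct counters read_counters(void)
{
	struct counters snap;
	unsigned int start;

	do {
		start = seq;
		__sync_synchronize();
		snap = cnt;
		__sync_synchronize();
	} while ((start & 1) || seq != start);

	return snap;
}

int main(void)
{
	struct counters c = read_counters();

	printf("%llu packets, %llu bytes\n",
	       (unsigned long long)c.pkts, (unsigned long long)c.bytes);
	return 0;
}
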
@@ -932,7 +910,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
932 (other than comefrom, which userspace doesn't care 910 (other than comefrom, which userspace doesn't care
933 about). */ 911 about). */
934 countersize = sizeof(struct xt_counters) * private->number; 912 countersize = sizeof(struct xt_counters) * private->number;
935 counters = vmalloc(countersize); 913 counters = vzalloc(countersize);
936 914
937 if (counters == NULL) 915 if (counters == NULL)
938 return ERR_PTR(-ENOMEM); 916 return ERR_PTR(-ENOMEM);
@@ -972,8 +950,8 @@ copy_entries_to_user(unsigned int total_size,
972 /* ... then go back and fix counters and names */ 950 /* ... then go back and fix counters and names */
973 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ 951 for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
974 unsigned int i; 952 unsigned int i;
975 const struct ipt_entry_match *m; 953 const struct xt_entry_match *m;
976 const struct ipt_entry_target *t; 954 const struct xt_entry_target *t;
977 955
978 e = (struct ipt_entry *)(loc_cpu_entry + off); 956 e = (struct ipt_entry *)(loc_cpu_entry + off);
979 if (copy_to_user(userptr + off 957 if (copy_to_user(userptr + off
@@ -990,7 +968,7 @@ copy_entries_to_user(unsigned int total_size,
990 m = (void *)e + i; 968 m = (void *)e + i;
991 969
992 if (copy_to_user(userptr + off + i 970 if (copy_to_user(userptr + off + i
993 + offsetof(struct ipt_entry_match, 971 + offsetof(struct xt_entry_match,
994 u.user.name), 972 u.user.name),
995 m->u.kernel.match->name, 973 m->u.kernel.match->name,
996 strlen(m->u.kernel.match->name)+1) 974 strlen(m->u.kernel.match->name)+1)
@@ -1002,7 +980,7 @@ copy_entries_to_user(unsigned int total_size,
1002 980
1003 t = ipt_get_target_c(e); 981 t = ipt_get_target_c(e);
1004 if (copy_to_user(userptr + off + e->target_offset 982 if (copy_to_user(userptr + off + e->target_offset
1005 + offsetof(struct ipt_entry_target, 983 + offsetof(struct xt_entry_target,
1006 u.user.name), 984 u.user.name),
1007 t->u.kernel.target->name, 985 t->u.kernel.target->name,
1008 strlen(t->u.kernel.target->name)+1) != 0) { 986 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1040,7 +1018,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
1040 const void *base, struct xt_table_info *newinfo) 1018 const void *base, struct xt_table_info *newinfo)
1041{ 1019{
1042 const struct xt_entry_match *ematch; 1020 const struct xt_entry_match *ematch;
1043 const struct ipt_entry_target *t; 1021 const struct xt_entry_target *t;
1044 unsigned int entry_offset; 1022 unsigned int entry_offset;
1045 int off, i, ret; 1023 int off, i, ret;
1046 1024
@@ -1080,6 +1058,7 @@ static int compat_table_info(const struct xt_table_info *info,
1080 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1058 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1081 newinfo->initial_entries = 0; 1059 newinfo->initial_entries = 0;
1082 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1060 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1061 xt_compat_init_offsets(AF_INET, info->number);
1083 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1062 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1084 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1063 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1085 if (ret != 0) 1064 if (ret != 0)
@@ -1092,7 +1071,7 @@ static int compat_table_info(const struct xt_table_info *info,
1092static int get_info(struct net *net, void __user *user, 1071static int get_info(struct net *net, void __user *user,
1093 const int *len, int compat) 1072 const int *len, int compat)
1094{ 1073{
1095 char name[IPT_TABLE_MAXNAMELEN]; 1074 char name[XT_TABLE_MAXNAMELEN];
1096 struct xt_table *t; 1075 struct xt_table *t;
1097 int ret; 1076 int ret;
1098 1077
@@ -1105,7 +1084,7 @@ static int get_info(struct net *net, void __user *user,
1105 if (copy_from_user(name, user, sizeof(name)) != 0) 1084 if (copy_from_user(name, user, sizeof(name)) != 0)
1106 return -EFAULT; 1085 return -EFAULT;
1107 1086
1108 name[IPT_TABLE_MAXNAMELEN-1] = '\0'; 1087 name[XT_TABLE_MAXNAMELEN-1] = '\0';
1109#ifdef CONFIG_COMPAT 1088#ifdef CONFIG_COMPAT
1110 if (compat) 1089 if (compat)
1111 xt_compat_lock(AF_INET); 1090 xt_compat_lock(AF_INET);
@@ -1124,6 +1103,7 @@ static int get_info(struct net *net, void __user *user,
1124 private = &tmp; 1103 private = &tmp;
1125 } 1104 }
1126#endif 1105#endif
1106 memset(&info, 0, sizeof(info));
1127 info.valid_hooks = t->valid_hooks; 1107 info.valid_hooks = t->valid_hooks;
1128 memcpy(info.hook_entry, private->hook_entry, 1108 memcpy(info.hook_entry, private->hook_entry,
1129 sizeof(info.hook_entry)); 1109 sizeof(info.hook_entry));
@@ -1202,7 +1182,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1202 struct ipt_entry *iter; 1182 struct ipt_entry *iter;
1203 1183
1204 ret = 0; 1184 ret = 0;
1205 counters = vmalloc(num_counters * sizeof(struct xt_counters)); 1185 counters = vzalloc(num_counters * sizeof(struct xt_counters));
1206 if (!counters) { 1186 if (!counters) {
1207 ret = -ENOMEM; 1187 ret = -ENOMEM;
1208 goto out; 1188 goto out;
@@ -1277,6 +1257,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
1277 /* overflow check */ 1257 /* overflow check */
1278 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1258 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1279 return -ENOMEM; 1259 return -ENOMEM;
1260 tmp.name[sizeof(tmp.name)-1] = 0;
1280 1261
1281 newinfo = xt_alloc_table_info(tmp.size); 1262 newinfo = xt_alloc_table_info(tmp.size);
1282 if (!newinfo) 1263 if (!newinfo)
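
do_replace() (and, further down, compat_do_replace() and the revision lookup) now forces a terminating NUL into the name copied in from userspace before it is ever treated as a C string; without it, an unterminated name could make later strcmp()/printk() calls read past the end of the fixed-size field. A small sketch of the idiom, with made-up struct and size names standing in for the real ipt_replace/XT_TABLE_MAXNAMELEN:

#include <string.h>
#include <stdio.h>

#define TBL_NAMELEN 32                     /* illustrative size only */

struct replace_req { char name[TBL_NAMELEN]; };

/* 'raw' stands in for an untrusted buffer copied in from userspace. */
static void load_request(struct replace_req *req, const char *raw, size_t rawlen)
{
	size_t n = rawlen < sizeof(req->name) ? rawlen : sizeof(req->name);

	memcpy(req->name, raw, n);
	req->name[sizeof(req->name) - 1] = '\0';  /* never trust it to be terminated */
}

int main(void)
{
	char evil[TBL_NAMELEN];
	struct replace_req req;

	memset(evil, 'A', sizeof(evil));          /* no NUL anywhere */
	load_request(&req, evil, sizeof(evil));
	printf("name=%s len=%zu\n", req.name, strlen(req.name));  /* bounded */
	return 0;
}
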
@@ -1326,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user,
1326 int ret = 0; 1307 int ret = 0;
1327 void *loc_cpu_entry; 1308 void *loc_cpu_entry;
1328 struct ipt_entry *iter; 1309 struct ipt_entry *iter;
1310 unsigned int addend;
1329#ifdef CONFIG_COMPAT 1311#ifdef CONFIG_COMPAT
1330 struct compat_xt_counters_info compat_tmp; 1312 struct compat_xt_counters_info compat_tmp;
1331 1313
@@ -1382,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user,
1382 /* Choose the copy that is on our node */ 1364 /* Choose the copy that is on our node */
1383 curcpu = smp_processor_id(); 1365 curcpu = smp_processor_id();
1384 loc_cpu_entry = private->entries[curcpu]; 1366 loc_cpu_entry = private->entries[curcpu];
1385 xt_info_wrlock(curcpu); 1367 addend = xt_write_recseq_begin();
1386 xt_entry_foreach(iter, loc_cpu_entry, private->size) { 1368 xt_entry_foreach(iter, loc_cpu_entry, private->size) {
1387 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); 1369 ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
1388 ++i; 1370 ++i;
1389 } 1371 }
1390 xt_info_wrunlock(curcpu); 1372 xt_write_recseq_end(addend);
1391 unlock_up_free: 1373 unlock_up_free:
1392 local_bh_enable(); 1374 local_bh_enable();
1393 xt_table_unlock(t); 1375 xt_table_unlock(t);
@@ -1400,14 +1382,14 @@ do_add_counters(struct net *net, const void __user *user,
1400 1382
1401#ifdef CONFIG_COMPAT 1383#ifdef CONFIG_COMPAT
1402struct compat_ipt_replace { 1384struct compat_ipt_replace {
1403 char name[IPT_TABLE_MAXNAMELEN]; 1385 char name[XT_TABLE_MAXNAMELEN];
1404 u32 valid_hooks; 1386 u32 valid_hooks;
1405 u32 num_entries; 1387 u32 num_entries;
1406 u32 size; 1388 u32 size;
1407 u32 hook_entry[NF_INET_NUMHOOKS]; 1389 u32 hook_entry[NF_INET_NUMHOOKS];
1408 u32 underflow[NF_INET_NUMHOOKS]; 1390 u32 underflow[NF_INET_NUMHOOKS];
1409 u32 num_counters; 1391 u32 num_counters;
1410 compat_uptr_t counters; /* struct ipt_counters * */ 1392 compat_uptr_t counters; /* struct xt_counters * */
1411 struct compat_ipt_entry entries[0]; 1393 struct compat_ipt_entry entries[0];
1412}; 1394};
1413 1395
@@ -1416,7 +1398,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1416 unsigned int *size, struct xt_counters *counters, 1398 unsigned int *size, struct xt_counters *counters,
1417 unsigned int i) 1399 unsigned int i)
1418{ 1400{
1419 struct ipt_entry_target *t; 1401 struct xt_entry_target *t;
1420 struct compat_ipt_entry __user *ce; 1402 struct compat_ipt_entry __user *ce;
1421 u_int16_t target_offset, next_offset; 1403 u_int16_t target_offset, next_offset;
1422 compat_uint_t origsize; 1404 compat_uint_t origsize;
@@ -1451,7 +1433,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
1451} 1433}
1452 1434
1453static int 1435static int
1454compat_find_calc_match(struct ipt_entry_match *m, 1436compat_find_calc_match(struct xt_entry_match *m,
1455 const char *name, 1437 const char *name,
1456 const struct ipt_ip *ip, 1438 const struct ipt_ip *ip,
1457 unsigned int hookmask, 1439 unsigned int hookmask,
@@ -1473,7 +1455,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
1473 1455
1474static void compat_release_entry(struct compat_ipt_entry *e) 1456static void compat_release_entry(struct compat_ipt_entry *e)
1475{ 1457{
1476 struct ipt_entry_target *t; 1458 struct xt_entry_target *t;
1477 struct xt_entry_match *ematch; 1459 struct xt_entry_match *ematch;
1478 1460
1479 /* Cleanup all matches */ 1461 /* Cleanup all matches */
@@ -1494,7 +1476,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
1494 const char *name) 1476 const char *name)
1495{ 1477{
1496 struct xt_entry_match *ematch; 1478 struct xt_entry_match *ematch;
1497 struct ipt_entry_target *t; 1479 struct xt_entry_target *t;
1498 struct xt_target *target; 1480 struct xt_target *target;
1499 unsigned int entry_offset; 1481 unsigned int entry_offset;
1500 unsigned int j; 1482 unsigned int j;
@@ -1576,7 +1558,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
1576 unsigned int *size, const char *name, 1558 unsigned int *size, const char *name,
1577 struct xt_table_info *newinfo, unsigned char *base) 1559 struct xt_table_info *newinfo, unsigned char *base)
1578{ 1560{
1579 struct ipt_entry_target *t; 1561 struct xt_entry_target *t;
1580 struct xt_target *target; 1562 struct xt_target *target;
1581 struct ipt_entry *de; 1563 struct ipt_entry *de;
1582 unsigned int origsize; 1564 unsigned int origsize;
@@ -1680,6 +1662,7 @@ translate_compat_table(struct net *net,
1680 duprintf("translate_compat_table: size %u\n", info->size); 1662 duprintf("translate_compat_table: size %u\n", info->size);
1681 j = 0; 1663 j = 0;
1682 xt_compat_lock(AF_INET); 1664 xt_compat_lock(AF_INET);
1665 xt_compat_init_offsets(AF_INET, number);
1683 /* Walk through entries, checking offsets. */ 1666 /* Walk through entries, checking offsets. */
1684 xt_entry_foreach(iter0, entry0, total_size) { 1667 xt_entry_foreach(iter0, entry0, total_size) {
1685 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1668 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1821,6 +1804,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1821 return -ENOMEM; 1804 return -ENOMEM;
1822 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1805 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1823 return -ENOMEM; 1806 return -ENOMEM;
1807 tmp.name[sizeof(tmp.name)-1] = 0;
1824 1808
1825 newinfo = xt_alloc_table_info(tmp.size); 1809 newinfo = xt_alloc_table_info(tmp.size);
1826 if (!newinfo) 1810 if (!newinfo)
@@ -1884,7 +1868,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
1884} 1868}
1885 1869
1886struct compat_ipt_get_entries { 1870struct compat_ipt_get_entries {
1887 char name[IPT_TABLE_MAXNAMELEN]; 1871 char name[XT_TABLE_MAXNAMELEN];
1888 compat_uint_t size; 1872 compat_uint_t size;
1889 struct compat_ipt_entry entrytable[0]; 1873 struct compat_ipt_entry entrytable[0];
1890}; 1874};
@@ -2039,7 +2023,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2039 2023
2040 case IPT_SO_GET_REVISION_MATCH: 2024 case IPT_SO_GET_REVISION_MATCH:
2041 case IPT_SO_GET_REVISION_TARGET: { 2025 case IPT_SO_GET_REVISION_TARGET: {
2042 struct ipt_get_revision rev; 2026 struct xt_get_revision rev;
2043 int target; 2027 int target;
2044 2028
2045 if (*len != sizeof(rev)) { 2029 if (*len != sizeof(rev)) {
@@ -2050,6 +2034,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2050 ret = -EFAULT; 2034 ret = -EFAULT;
2051 break; 2035 break;
2052 } 2036 }
2037 rev.name[sizeof(rev.name)-1] = 0;
2053 2038
2054 if (cmd == IPT_SO_GET_REVISION_TARGET) 2039 if (cmd == IPT_SO_GET_REVISION_TARGET)
2055 target = 1; 2040 target = 1;
@@ -2176,7 +2161,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
2176 2161
2177static struct xt_target ipt_builtin_tg[] __read_mostly = { 2162static struct xt_target ipt_builtin_tg[] __read_mostly = {
2178 { 2163 {
2179 .name = IPT_STANDARD_TARGET, 2164 .name = XT_STANDARD_TARGET,
2180 .targetsize = sizeof(int), 2165 .targetsize = sizeof(int),
2181 .family = NFPROTO_IPV4, 2166 .family = NFPROTO_IPV4,
2182#ifdef CONFIG_COMPAT 2167#ifdef CONFIG_COMPAT
@@ -2186,9 +2171,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
2186#endif 2171#endif
2187 }, 2172 },
2188 { 2173 {
2189 .name = IPT_ERROR_TARGET, 2174 .name = XT_ERROR_TARGET,
2190 .target = ipt_error, 2175 .target = ipt_error,
2191 .targetsize = IPT_FUNCTION_MAXNAMELEN, 2176 .targetsize = XT_FUNCTION_MAXNAMELEN,
2192 .family = NFPROTO_IPV4, 2177 .family = NFPROTO_IPV4,
2193 }, 2178 },
2194}; 2179};
@@ -2244,7 +2229,7 @@ static int __init ip_tables_init(void)
2244 if (ret < 0) 2229 if (ret < 0)
2245 goto err1; 2230 goto err1;
2246 2231
2247 /* Noone else will be downing sem now, so we won't sleep */ 2232 /* No one else will be downing sem now, so we won't sleep */
2248 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); 2233 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2249 if (ret < 0) 2234 if (ret < 0)
2250 goto err2; 2235 goto err2;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a43cf36db87..5c9e97c79017 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
29#include <net/netfilter/nf_conntrack.h> 29#include <net/netfilter/nf_conntrack.h>
30#include <net/net_namespace.h> 30#include <net/net_namespace.h>
31#include <net/checksum.h> 31#include <net/checksum.h>
32#include <net/ip.h>
32 33
33#define CLUSTERIP_VERSION "0.8" 34#define CLUSTERIP_VERSION "0.8"
34 35
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
231{ 232{
232 const struct iphdr *iph = ip_hdr(skb); 233 const struct iphdr *iph = ip_hdr(skb);
233 unsigned long hashval; 234 unsigned long hashval;
234 u_int16_t sport, dport; 235 u_int16_t sport = 0, dport = 0;
235 const u_int16_t *ports; 236 int poff;
236 237
237 switch (iph->protocol) { 238 poff = proto_ports_offset(iph->protocol);
238 case IPPROTO_TCP: 239 if (poff >= 0) {
239 case IPPROTO_UDP: 240 const u_int16_t *ports;
240 case IPPROTO_UDPLITE: 241 u16 _ports[2];
241 case IPPROTO_SCTP: 242
242 case IPPROTO_DCCP: 243 ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
243 case IPPROTO_ICMP: 244 if (ports) {
244 ports = (const void *)iph+iph->ihl*4; 245 sport = ports[0];
245 sport = ports[0]; 246 dport = ports[1];
246 dport = ports[1]; 247 }
247 break; 248 } else {
248 default:
249 if (net_ratelimit()) 249 if (net_ratelimit())
250 pr_info("unknown protocol %u\n", iph->protocol); 250 pr_info("unknown protocol %u\n", iph->protocol);
251 sport = dport = 0;
252 } 251 }
253 252
254 switch (config->hash_mode) { 253 switch (config->hash_mode) {
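
The CLUSTERIP hash no longer open-codes a protocol switch and dereferences the packet directly past the IP header: it asks proto_ports_offset() where the 16-bit port pair sits for the given protocol and pulls it out with skb_header_pointer(), which copies the bytes even when they live in a non-linear part of the skb. A rough userspace model of that lookup-then-bounded-read pattern (the protocol numbers are real; the helper names and the flat packet buffer are illustrative, and the table is not the kernel's exact one):

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Offset of the port pair inside the transport header, or -1 when the
 * protocol has no ports (the idea behind proto_ports_offset()). */
static int ports_offset(uint8_t proto)
{
	switch (proto) {
	case 6: case 17: case 136: case 132: case 33:  /* TCP/UDP/UDPLITE/SCTP/DCCP */
		return 0;
	default:
		return -1;
	}
}

/* Bounded copy of 'len' bytes at 'off', like skb_header_pointer() for a
 * flat buffer: fails instead of reading past the packet. */
static const void *hdr_ptr(const uint8_t *pkt, size_t pktlen,
			   size_t off, size_t len, void *buf)
{
	if (off + len > pktlen)
		return NULL;
	memcpy(buf, pkt + off, len);
	return buf;
}

int main(void)
{
	/* 20-byte IPv4 header (ihl=5, proto=TCP) followed by a TCP header stub. */
	uint8_t pkt[24] = { 0x45, 0, 0, 24, 0, 0, 0, 0, 64, 6 };
	uint16_t sport = 0, dport = 0, ports[2];
	size_t ihl = (pkt[0] & 0x0f) * 4;
	int poff = ports_offset(pkt[9]);

	pkt[20] = 0x00; pkt[21] = 0x50;   /* sport 80   */
	pkt[22] = 0x1f; pkt[23] = 0x90;   /* dport 8080 */

	if (poff >= 0 && hdr_ptr(pkt, sizeof(pkt), ihl + poff, 4, ports)) {
		sport = ntohs(ports[0]);
		dport = ntohs(ports[1]);
	}
	printf("proto=%u sport=%u dport=%u\n", pkt[9], sport, dport);
	return 0;
}
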
@@ -301,19 +300,14 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
301 * that the ->target() function isn't called after ->destroy() */ 300 * that the ->target() function isn't called after ->destroy() */
302 301
303 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
304 if (ct == NULL) { 303 if (ct == NULL)
305 pr_info("no conntrack!\n");
306 /* FIXME: need to drop invalid ones, since replies
307 * to outgoing connections of other nodes will be
308 * marked as INVALID */
309 return NF_DROP; 304 return NF_DROP;
310 }
311 305
312 /* special case: ICMP error handling. conntrack distinguishes between 306 /* special case: ICMP error handling. conntrack distinguishes between
313 * error messages (RELATED) and information requests (see below) */ 307 * error messages (RELATED) and information requests (see below) */
314 if (ip_hdr(skb)->protocol == IPPROTO_ICMP && 308 if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
315 (ctinfo == IP_CT_RELATED || 309 (ctinfo == IP_CT_RELATED ||
316 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) 310 ctinfo == IP_CT_RELATED_REPLY))
317 return XT_CONTINUE; 311 return XT_CONTINUE;
318 312
319 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 313 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -327,12 +321,12 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
327 ct->mark = hash; 321 ct->mark = hash;
328 break; 322 break;
329 case IP_CT_RELATED: 323 case IP_CT_RELATED:
330 case IP_CT_RELATED+IP_CT_IS_REPLY: 324 case IP_CT_RELATED_REPLY:
331 /* FIXME: we don't handle expectations at the 325 /* FIXME: we don't handle expectations at the
332 * moment. they can arrive on a different node than 326 * moment. they can arrive on a different node than
333 * the master connection (e.g. FTP passive mode) */ 327 * the master connection (e.g. FTP passive mode) */
334 case IP_CT_ESTABLISHED: 328 case IP_CT_ESTABLISHED:
335 case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: 329 case IP_CT_ESTABLISHED_REPLY:
336 break; 330 break;
337 default: 331 default:
338 break; 332 break;
@@ -670,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
670 char buffer[PROC_WRITELEN+1]; 664 char buffer[PROC_WRITELEN+1];
671 unsigned long nodenum; 665 unsigned long nodenum;
672 666
673 if (copy_from_user(buffer, input, PROC_WRITELEN)) 667 if (size > PROC_WRITELEN)
668 return -EIO;
669 if (copy_from_user(buffer, input, size))
674 return -EFAULT; 670 return -EFAULT;
671 buffer[size] = 0;
675 672
676 if (*buffer == '+') { 673 if (*buffer == '+') {
677 nodenum = simple_strtoul(buffer+1, NULL, 10); 674 nodenum = simple_strtoul(buffer+1, NULL, 10);
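
clusterip_proc_write() used to copy a fixed PROC_WRITELEN bytes no matter how much the caller actually wrote; the new code rejects oversized writes, copies only 'size' bytes and terminates the buffer, so short writes no longer pick up stale stack bytes and long writes cannot run past the buffer. A minimal sketch of the bounded-write pattern, with an illustrative buffer size:

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define WRITE_MAX 8                /* stands in for PROC_WRITELEN */

/* Returns the parsed node number, or a negative errno-style value. */
static long handle_write(const char *input, size_t size)
{
	char buf[WRITE_MAX + 1];

	if (size > WRITE_MAX)
		return -EIO;               /* refuse oversized writes */
	memcpy(buf, input, size);          /* copy_from_user() in the kernel */
	buf[size] = '\0';                  /* terminate exactly what was written */

	if (buf[0] == '+')
		return strtol(buf + 1, NULL, 10);
	return -EINVAL;
}

int main(void)
{
	printf("%ld\n", handle_write("+3", 2));          /* 3 */
	printf("%ld\n", handle_write("+123456789", 10)); /* -EIO: too long */
	return 0;
}
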
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 915fc17d7ce2..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -24,16 +24,15 @@
24#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
25#include <linux/netfilter_ipv4/ipt_LOG.h> 25#include <linux/netfilter_ipv4/ipt_LOG.h>
26#include <net/netfilter/nf_log.h> 26#include <net/netfilter/nf_log.h>
27#include <net/netfilter/xt_log.h>
27 28
28MODULE_LICENSE("GPL"); 29MODULE_LICENSE("GPL");
29MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 30MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
30MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); 31MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
31 32
32/* Use lock to serialize, so printks don't overlap */
33static DEFINE_SPINLOCK(log_lock);
34
35/* One level of recursion won't kill us */ 33/* One level of recursion won't kill us */
36static void dump_packet(const struct nf_loginfo *info, 34static void dump_packet(struct sbuff *m,
35 const struct nf_loginfo *info,
37 const struct sk_buff *skb, 36 const struct sk_buff *skb,
38 unsigned int iphoff) 37 unsigned int iphoff)
39{ 38{
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
48 47
49 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); 48 ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
50 if (ih == NULL) { 49 if (ih == NULL) {
51 printk("TRUNCATED"); 50 sb_add(m, "TRUNCATED");
52 return; 51 return;
53 } 52 }
54 53
55 /* Important fields: 54 /* Important fields:
56 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ 55 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
57 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ 56 /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
58 printk("SRC=%pI4 DST=%pI4 ", 57 sb_add(m, "SRC=%pI4 DST=%pI4 ",
59 &ih->saddr, &ih->daddr); 58 &ih->saddr, &ih->daddr);
60 59
61 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ 60 /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
62 printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", 61 sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
63 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, 62 ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
64 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); 63 ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
65 64
66 /* Max length: 6 "CE DF MF " */ 65 /* Max length: 6 "CE DF MF " */
67 if (ntohs(ih->frag_off) & IP_CE) 66 if (ntohs(ih->frag_off) & IP_CE)
68 printk("CE "); 67 sb_add(m, "CE ");
69 if (ntohs(ih->frag_off) & IP_DF) 68 if (ntohs(ih->frag_off) & IP_DF)
70 printk("DF "); 69 sb_add(m, "DF ");
71 if (ntohs(ih->frag_off) & IP_MF) 70 if (ntohs(ih->frag_off) & IP_MF)
72 printk("MF "); 71 sb_add(m, "MF ");
73 72
74 /* Max length: 11 "FRAG:65535 " */ 73 /* Max length: 11 "FRAG:65535 " */
75 if (ntohs(ih->frag_off) & IP_OFFSET) 74 if (ntohs(ih->frag_off) & IP_OFFSET)
76 printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); 75 sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
77 76
78 if ((logflags & IPT_LOG_IPOPT) && 77 if ((logflags & IPT_LOG_IPOPT) &&
79 ih->ihl * 4 > sizeof(struct iphdr)) { 78 ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
85 op = skb_header_pointer(skb, iphoff+sizeof(_iph), 84 op = skb_header_pointer(skb, iphoff+sizeof(_iph),
86 optsize, _opt); 85 optsize, _opt);
87 if (op == NULL) { 86 if (op == NULL) {
88 printk("TRUNCATED"); 87 sb_add(m, "TRUNCATED");
89 return; 88 return;
90 } 89 }
91 90
92 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 91 /* Max length: 127 "OPT (" 15*4*2chars ") " */
93 printk("OPT ("); 92 sb_add(m, "OPT (");
94 for (i = 0; i < optsize; i++) 93 for (i = 0; i < optsize; i++)
95 printk("%02X", op[i]); 94 sb_add(m, "%02X", op[i]);
96 printk(") "); 95 sb_add(m, ") ");
97 } 96 }
98 97
99 switch (ih->protocol) { 98 switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
102 const struct tcphdr *th; 101 const struct tcphdr *th;
103 102
104 /* Max length: 10 "PROTO=TCP " */ 103 /* Max length: 10 "PROTO=TCP " */
105 printk("PROTO=TCP "); 104 sb_add(m, "PROTO=TCP ");
106 105
107 if (ntohs(ih->frag_off) & IP_OFFSET) 106 if (ntohs(ih->frag_off) & IP_OFFSET)
108 break; 107 break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
111 th = skb_header_pointer(skb, iphoff + ih->ihl * 4, 110 th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
112 sizeof(_tcph), &_tcph); 111 sizeof(_tcph), &_tcph);
113 if (th == NULL) { 112 if (th == NULL) {
114 printk("INCOMPLETE [%u bytes] ", 113 sb_add(m, "INCOMPLETE [%u bytes] ",
115 skb->len - iphoff - ih->ihl*4); 114 skb->len - iphoff - ih->ihl*4);
116 break; 115 break;
117 } 116 }
118 117
119 /* Max length: 20 "SPT=65535 DPT=65535 " */ 118 /* Max length: 20 "SPT=65535 DPT=65535 " */
120 printk("SPT=%u DPT=%u ", 119 sb_add(m, "SPT=%u DPT=%u ",
121 ntohs(th->source), ntohs(th->dest)); 120 ntohs(th->source), ntohs(th->dest));
122 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ 121 /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
123 if (logflags & IPT_LOG_TCPSEQ) 122 if (logflags & IPT_LOG_TCPSEQ)
124 printk("SEQ=%u ACK=%u ", 123 sb_add(m, "SEQ=%u ACK=%u ",
125 ntohl(th->seq), ntohl(th->ack_seq)); 124 ntohl(th->seq), ntohl(th->ack_seq));
126 /* Max length: 13 "WINDOW=65535 " */ 125 /* Max length: 13 "WINDOW=65535 " */
127 printk("WINDOW=%u ", ntohs(th->window)); 126 sb_add(m, "WINDOW=%u ", ntohs(th->window));
128 /* Max length: 9 "RES=0x3F " */ 127 /* Max length: 9 "RES=0x3F " */
129 printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); 128 sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
130 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ 129 /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
131 if (th->cwr) 130 if (th->cwr)
132 printk("CWR "); 131 sb_add(m, "CWR ");
133 if (th->ece) 132 if (th->ece)
134 printk("ECE "); 133 sb_add(m, "ECE ");
135 if (th->urg) 134 if (th->urg)
136 printk("URG "); 135 sb_add(m, "URG ");
137 if (th->ack) 136 if (th->ack)
138 printk("ACK "); 137 sb_add(m, "ACK ");
139 if (th->psh) 138 if (th->psh)
140 printk("PSH "); 139 sb_add(m, "PSH ");
141 if (th->rst) 140 if (th->rst)
142 printk("RST "); 141 sb_add(m, "RST ");
143 if (th->syn) 142 if (th->syn)
144 printk("SYN "); 143 sb_add(m, "SYN ");
145 if (th->fin) 144 if (th->fin)
146 printk("FIN "); 145 sb_add(m, "FIN ");
147 /* Max length: 11 "URGP=65535 " */ 146 /* Max length: 11 "URGP=65535 " */
148 printk("URGP=%u ", ntohs(th->urg_ptr)); 147 sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
149 148
150 if ((logflags & IPT_LOG_TCPOPT) && 149 if ((logflags & IPT_LOG_TCPOPT) &&
151 th->doff * 4 > sizeof(struct tcphdr)) { 150 th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
158 iphoff+ih->ihl*4+sizeof(_tcph), 157 iphoff+ih->ihl*4+sizeof(_tcph),
159 optsize, _opt); 158 optsize, _opt);
160 if (op == NULL) { 159 if (op == NULL) {
161 printk("TRUNCATED"); 160 sb_add(m, "TRUNCATED");
162 return; 161 return;
163 } 162 }
164 163
165 /* Max length: 127 "OPT (" 15*4*2chars ") " */ 164 /* Max length: 127 "OPT (" 15*4*2chars ") " */
166 printk("OPT ("); 165 sb_add(m, "OPT (");
167 for (i = 0; i < optsize; i++) 166 for (i = 0; i < optsize; i++)
168 printk("%02X", op[i]); 167 sb_add(m, "%02X", op[i]);
169 printk(") "); 168 sb_add(m, ") ");
170 } 169 }
171 break; 170 break;
172 } 171 }
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
177 176
178 if (ih->protocol == IPPROTO_UDP) 177 if (ih->protocol == IPPROTO_UDP)
179 /* Max length: 10 "PROTO=UDP " */ 178 /* Max length: 10 "PROTO=UDP " */
180 printk("PROTO=UDP " ); 179 sb_add(m, "PROTO=UDP " );
181 else /* Max length: 14 "PROTO=UDPLITE " */ 180 else /* Max length: 14 "PROTO=UDPLITE " */
182 printk("PROTO=UDPLITE "); 181 sb_add(m, "PROTO=UDPLITE ");
183 182
184 if (ntohs(ih->frag_off) & IP_OFFSET) 183 if (ntohs(ih->frag_off) & IP_OFFSET)
185 break; 184 break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
188 uh = skb_header_pointer(skb, iphoff+ih->ihl*4, 187 uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
189 sizeof(_udph), &_udph); 188 sizeof(_udph), &_udph);
190 if (uh == NULL) { 189 if (uh == NULL) {
191 printk("INCOMPLETE [%u bytes] ", 190 sb_add(m, "INCOMPLETE [%u bytes] ",
192 skb->len - iphoff - ih->ihl*4); 191 skb->len - iphoff - ih->ihl*4);
193 break; 192 break;
194 } 193 }
195 194
196 /* Max length: 20 "SPT=65535 DPT=65535 " */ 195 /* Max length: 20 "SPT=65535 DPT=65535 " */
197 printk("SPT=%u DPT=%u LEN=%u ", 196 sb_add(m, "SPT=%u DPT=%u LEN=%u ",
198 ntohs(uh->source), ntohs(uh->dest), 197 ntohs(uh->source), ntohs(uh->dest),
199 ntohs(uh->len)); 198 ntohs(uh->len));
200 break; 199 break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
221 [ICMP_ADDRESSREPLY] = 12 }; 220 [ICMP_ADDRESSREPLY] = 12 };
222 221
223 /* Max length: 11 "PROTO=ICMP " */ 222 /* Max length: 11 "PROTO=ICMP " */
224 printk("PROTO=ICMP "); 223 sb_add(m, "PROTO=ICMP ");
225 224
226 if (ntohs(ih->frag_off) & IP_OFFSET) 225 if (ntohs(ih->frag_off) & IP_OFFSET)
227 break; 226 break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
230 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, 229 ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
231 sizeof(_icmph), &_icmph); 230 sizeof(_icmph), &_icmph);
232 if (ich == NULL) { 231 if (ich == NULL) {
233 printk("INCOMPLETE [%u bytes] ", 232 sb_add(m, "INCOMPLETE [%u bytes] ",
234 skb->len - iphoff - ih->ihl*4); 233 skb->len - iphoff - ih->ihl*4);
235 break; 234 break;
236 } 235 }
237 236
238 /* Max length: 18 "TYPE=255 CODE=255 " */ 237 /* Max length: 18 "TYPE=255 CODE=255 " */
239 printk("TYPE=%u CODE=%u ", ich->type, ich->code); 238 sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
240 239
241 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 240 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
242 if (ich->type <= NR_ICMP_TYPES && 241 if (ich->type <= NR_ICMP_TYPES &&
243 required_len[ich->type] && 242 required_len[ich->type] &&
244 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { 243 skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
245 printk("INCOMPLETE [%u bytes] ", 244 sb_add(m, "INCOMPLETE [%u bytes] ",
246 skb->len - iphoff - ih->ihl*4); 245 skb->len - iphoff - ih->ihl*4);
247 break; 246 break;
248 } 247 }
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
251 case ICMP_ECHOREPLY: 250 case ICMP_ECHOREPLY:
252 case ICMP_ECHO: 251 case ICMP_ECHO:
253 /* Max length: 19 "ID=65535 SEQ=65535 " */ 252 /* Max length: 19 "ID=65535 SEQ=65535 " */
254 printk("ID=%u SEQ=%u ", 253 sb_add(m, "ID=%u SEQ=%u ",
255 ntohs(ich->un.echo.id), 254 ntohs(ich->un.echo.id),
256 ntohs(ich->un.echo.sequence)); 255 ntohs(ich->un.echo.sequence));
257 break; 256 break;
258 257
259 case ICMP_PARAMETERPROB: 258 case ICMP_PARAMETERPROB:
260 /* Max length: 14 "PARAMETER=255 " */ 259 /* Max length: 14 "PARAMETER=255 " */
261 printk("PARAMETER=%u ", 260 sb_add(m, "PARAMETER=%u ",
262 ntohl(ich->un.gateway) >> 24); 261 ntohl(ich->un.gateway) >> 24);
263 break; 262 break;
264 case ICMP_REDIRECT: 263 case ICMP_REDIRECT:
265 /* Max length: 24 "GATEWAY=255.255.255.255 " */ 264 /* Max length: 24 "GATEWAY=255.255.255.255 " */
266 printk("GATEWAY=%pI4 ", &ich->un.gateway); 265 sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
267 /* Fall through */ 266 /* Fall through */
268 case ICMP_DEST_UNREACH: 267 case ICMP_DEST_UNREACH:
269 case ICMP_SOURCE_QUENCH: 268 case ICMP_SOURCE_QUENCH:
270 case ICMP_TIME_EXCEEDED: 269 case ICMP_TIME_EXCEEDED:
271 /* Max length: 3+maxlen */ 270 /* Max length: 3+maxlen */
272 if (!iphoff) { /* Only recurse once. */ 271 if (!iphoff) { /* Only recurse once. */
273 printk("["); 272 sb_add(m, "[");
274 dump_packet(info, skb, 273 dump_packet(m, info, skb,
275 iphoff + ih->ihl*4+sizeof(_icmph)); 274 iphoff + ih->ihl*4+sizeof(_icmph));
276 printk("] "); 275 sb_add(m, "] ");
277 } 276 }
278 277
279 /* Max length: 10 "MTU=65535 " */ 278 /* Max length: 10 "MTU=65535 " */
280 if (ich->type == ICMP_DEST_UNREACH && 279 if (ich->type == ICMP_DEST_UNREACH &&
281 ich->code == ICMP_FRAG_NEEDED) 280 ich->code == ICMP_FRAG_NEEDED)
282 printk("MTU=%u ", ntohs(ich->un.frag.mtu)); 281 sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
283 } 282 }
284 break; 283 break;
285 } 284 }
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
292 break; 291 break;
293 292
294 /* Max length: 9 "PROTO=AH " */ 293 /* Max length: 9 "PROTO=AH " */
295 printk("PROTO=AH "); 294 sb_add(m, "PROTO=AH ");
296 295
297 /* Max length: 25 "INCOMPLETE [65535 bytes] " */ 296 /* Max length: 25 "INCOMPLETE [65535 bytes] " */
298 ah = skb_header_pointer(skb, iphoff+ih->ihl*4, 297 ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
299 sizeof(_ahdr), &_ahdr); 298 sizeof(_ahdr), &_ahdr);
300 if (ah == NULL) { 299 if (ah == NULL) {
301 printk("INCOMPLETE [%u bytes] ", 300 sb_add(m, "INCOMPLETE [%u bytes] ",
302 skb->len - iphoff - ih->ihl*4); 301 skb->len - iphoff - ih->ihl*4);
303 break; 302 break;
304 } 303 }
305 304
306 /* Length: 15 "SPI=0xF1234567 " */ 305 /* Length: 15 "SPI=0xF1234567 " */
307 printk("SPI=0x%x ", ntohl(ah->spi)); 306 sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
308 break; 307 break;
309 } 308 }
310 case IPPROTO_ESP: { 309 case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
312 const struct ip_esp_hdr *eh; 311 const struct ip_esp_hdr *eh;
313 312
314 /* Max length: 10 "PROTO=ESP " */ 313 /* Max length: 10 "PROTO=ESP " */
315 printk("PROTO=ESP "); 314 sb_add(m, "PROTO=ESP ");
316 315
317 if (ntohs(ih->frag_off) & IP_OFFSET) 316 if (ntohs(ih->frag_off) & IP_OFFSET)
318 break; 317 break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
321 eh = skb_header_pointer(skb, iphoff+ih->ihl*4, 320 eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
322 sizeof(_esph), &_esph); 321 sizeof(_esph), &_esph);
323 if (eh == NULL) { 322 if (eh == NULL) {
324 printk("INCOMPLETE [%u bytes] ", 323 sb_add(m, "INCOMPLETE [%u bytes] ",
325 skb->len - iphoff - ih->ihl*4); 324 skb->len - iphoff - ih->ihl*4);
326 break; 325 break;
327 } 326 }
328 327
329 /* Length: 15 "SPI=0xF1234567 " */ 328 /* Length: 15 "SPI=0xF1234567 " */
330 printk("SPI=0x%x ", ntohl(eh->spi)); 329 sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
331 break; 330 break;
332 } 331 }
333 /* Max length: 10 "PROTO 255 " */ 332 /* Max length: 10 "PROTO 255 " */
334 default: 333 default:
335 printk("PROTO=%u ", ih->protocol); 334 sb_add(m, "PROTO=%u ", ih->protocol);
336 } 335 }
337 336
338 /* Max length: 15 "UID=4294967295 " */ 337 /* Max length: 15 "UID=4294967295 " */
339 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { 338 if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
340 read_lock_bh(&skb->sk->sk_callback_lock); 339 read_lock_bh(&skb->sk->sk_callback_lock);
341 if (skb->sk->sk_socket && skb->sk->sk_socket->file) 340 if (skb->sk->sk_socket && skb->sk->sk_socket->file)
342 printk("UID=%u GID=%u ", 341 sb_add(m, "UID=%u GID=%u ",
343 skb->sk->sk_socket->file->f_cred->fsuid, 342 skb->sk->sk_socket->file->f_cred->fsuid,
344 skb->sk->sk_socket->file->f_cred->fsgid); 343 skb->sk->sk_socket->file->f_cred->fsgid);
345 read_unlock_bh(&skb->sk->sk_callback_lock); 344 read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
347 346
348 /* Max length: 16 "MARK=0xFFFFFFFF " */ 347 /* Max length: 16 "MARK=0xFFFFFFFF " */
349 if (!iphoff && skb->mark) 348 if (!iphoff && skb->mark)
350 printk("MARK=0x%x ", skb->mark); 349 sb_add(m, "MARK=0x%x ", skb->mark);
351 350
352 /* Proto Max log string length */ 351 /* Proto Max log string length */
353 /* IP: 40+46+6+11+127 = 230 */ 352 /* IP: 40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
364 /* maxlen = 230+ 91 + 230 + 252 = 803 */ 363 /* maxlen = 230+ 91 + 230 + 252 = 803 */
365} 364}
366 365
367static void dump_mac_header(const struct nf_loginfo *info, 366static void dump_mac_header(struct sbuff *m,
367 const struct nf_loginfo *info,
368 const struct sk_buff *skb) 368 const struct sk_buff *skb)
369{ 369{
370 struct net_device *dev = skb->dev; 370 struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
378 378
379 switch (dev->type) { 379 switch (dev->type) {
380 case ARPHRD_ETHER: 380 case ARPHRD_ETHER:
381 printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", 381 sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 382 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
383 ntohs(eth_hdr(skb)->h_proto)); 383 ntohs(eth_hdr(skb)->h_proto));
384 return; 384 return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
387 } 387 }
388 388
389fallback: 389fallback:
390 printk("MAC="); 390 sb_add(m, "MAC=");
391 if (dev->hard_header_len && 391 if (dev->hard_header_len &&
392 skb->mac_header != skb->network_header) { 392 skb->mac_header != skb->network_header) {
393 const unsigned char *p = skb_mac_header(skb); 393 const unsigned char *p = skb_mac_header(skb);
394 unsigned int i; 394 unsigned int i;
395 395
396 printk("%02x", *p++); 396 sb_add(m, "%02x", *p++);
397 for (i = 1; i < dev->hard_header_len; i++, p++) 397 for (i = 1; i < dev->hard_header_len; i++, p++)
398 printk(":%02x", *p); 398 sb_add(m, ":%02x", *p);
399 } 399 }
400 printk(" "); 400 sb_add(m, " ");
401} 401}
402 402
403static struct nf_loginfo default_loginfo = { 403static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
419 const struct nf_loginfo *loginfo, 419 const struct nf_loginfo *loginfo,
420 const char *prefix) 420 const char *prefix)
421{ 421{
422 struct sbuff *m = sb_open();
423
422 if (!loginfo) 424 if (!loginfo)
423 loginfo = &default_loginfo; 425 loginfo = &default_loginfo;
424 426
425 spin_lock_bh(&log_lock); 427 sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
426 printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
427 prefix, 428 prefix,
428 in ? in->name : "", 429 in ? in->name : "",
429 out ? out->name : ""); 430 out ? out->name : "");
@@ -434,20 +435,19 @@ ipt_log_packet(u_int8_t pf,
434 435
435 physindev = skb->nf_bridge->physindev; 436 physindev = skb->nf_bridge->physindev;
436 if (physindev && in != physindev) 437 if (physindev && in != physindev)
437 printk("PHYSIN=%s ", physindev->name); 438 sb_add(m, "PHYSIN=%s ", physindev->name);
438 physoutdev = skb->nf_bridge->physoutdev; 439 physoutdev = skb->nf_bridge->physoutdev;
439 if (physoutdev && out != physoutdev) 440 if (physoutdev && out != physoutdev)
440 printk("PHYSOUT=%s ", physoutdev->name); 441 sb_add(m, "PHYSOUT=%s ", physoutdev->name);
441 } 442 }
442#endif 443#endif
443 444
444 /* MAC logging for input path only. */ 445 if (in != NULL)
445 if (in && !out) 446 dump_mac_header(m, loginfo, skb);
446 dump_mac_header(loginfo, skb); 447
448 dump_packet(m, loginfo, skb, 0);
447 449
448 dump_packet(loginfo, skb, 0); 450 sb_close(m);
449 printk("\n");
450 spin_unlock_bh(&log_lock);
451} 451}
452 452
453static unsigned int 453static unsigned int
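
ipt_LOG no longer serializes dozens of printk() calls under a global spinlock: each log invocation opens an sbuff, appends every field with sb_add(), and sb_close() emits the finished line in one piece, so CPUs can format concurrently and lines from different packets cannot interleave. A hedged userspace sketch of that append-then-flush buffer (sb_reset/sb_append/sb_flush are stand-ins for the xt_log.h helpers, not their real signatures):

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

struct sbuf { char data[1024]; size_t used; };

static void sb_reset(struct sbuf *m) { m->used = 0; m->data[0] = '\0'; }

/* Append a formatted field; truncate rather than overflow. */
static void sb_append(struct sbuf *m, const char *fmt, ...)
{
	size_t room = sizeof(m->data) - m->used;
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(m->data + m->used, room, fmt, ap);
	va_end(ap);
	if (n > 0)
		m->used += (size_t)n < room ? (size_t)n : room - 1;
}

/* Emit the whole line at once; no other output can interleave mid-line. */
static void sb_flush(struct sbuf *m) { fputs(m->data, stdout); sb_reset(m); }

int main(void)
{
	struct sbuf m;

	sb_reset(&m);
	sb_append(&m, "IN=%s OUT=%s ", "eth0", "");
	sb_append(&m, "SRC=%s DST=%s ", "192.0.2.1", "198.51.100.7");
	sb_append(&m, "PROTO=TCP SPT=%u DPT=%u\n", 34567, 80);
	sb_flush(&m);
	return 0;
}
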
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index d2ed9dc74ebc..9931152a78b5 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -60,7 +60,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
60 nat = nfct_nat(ct); 60 nat = nfct_nat(ct);
61 61
62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || 62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
63 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 63 ctinfo == IP_CT_RELATED_REPLY));
64 64
65 /* Source address is 0.0.0.0 - locally generated packet that is 65 /* Source address is 0.0.0.0 - locally generated packet that is
66 * probably not supposed to be masqueraded. 66 * probably not supposed to be masqueraded.
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 43eec80c0e7c..51f13f8ec724 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -40,7 +40,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
40 struct iphdr *niph; 40 struct iphdr *niph;
41 const struct tcphdr *oth; 41 const struct tcphdr *oth;
42 struct tcphdr _otcph, *tcph; 42 struct tcphdr _otcph, *tcph;
43 unsigned int addr_type;
44 43
45 /* IP header checks: fragment. */ 44 /* IP header checks: fragment. */
46 if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) 45 if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
@@ -55,6 +54,9 @@ static void send_reset(struct sk_buff *oldskb, int hook)
55 if (oth->rst) 54 if (oth->rst)
56 return; 55 return;
57 56
57 if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
58 return;
59
58 /* Check checksum */ 60 /* Check checksum */
59 if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP)) 61 if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
60 return; 62 return;
@@ -101,22 +103,14 @@ static void send_reset(struct sk_buff *oldskb, int hook)
101 nskb->csum_start = (unsigned char *)tcph - nskb->head; 103 nskb->csum_start = (unsigned char *)tcph - nskb->head;
102 nskb->csum_offset = offsetof(struct tcphdr, check); 104 nskb->csum_offset = offsetof(struct tcphdr, check);
103 105
104 addr_type = RTN_UNSPEC;
105 if (hook != NF_INET_FORWARD
106#ifdef CONFIG_BRIDGE_NETFILTER
107 || (nskb->nf_bridge && nskb->nf_bridge->mask & BRNF_BRIDGED)
108#endif
109 )
110 addr_type = RTN_LOCAL;
111
112 /* ip_route_me_harder expects skb->dst to be set */ 106 /* ip_route_me_harder expects skb->dst to be set */
113 skb_dst_set_noref(nskb, skb_dst(oldskb)); 107 skb_dst_set_noref(nskb, skb_dst(oldskb));
114 108
115 nskb->protocol = htons(ETH_P_IP); 109 nskb->protocol = htons(ETH_P_IP);
116 if (ip_route_me_harder(nskb, addr_type)) 110 if (ip_route_me_harder(nskb, RTN_UNSPEC))
117 goto free_nskb; 111 goto free_nskb;
118 112
119 niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); 113 niph->ttl = ip4_dst_hoplimit(skb_dst(nskb));
120 114
121 /* "Never happens" */ 115 /* "Never happens" */
122 if (nskb->len > dst_mtu(skb_dst(nskb))) 116 if (nskb->len > dst_mtu(skb_dst(nskb)))
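
send_reset() now bails out early when the offending packet was routed as broadcast or multicast (the skb_rtable() flags test), so a REJECT --reject-with tcp-reset rule cannot be tricked into answering broadcast traffic, and the old hook-dependent addr_type selection collapses into a plain ip_route_me_harder(..., RTN_UNSPEC). A tiny sketch of the address-class decision the flag test corresponds to; this is a simplification, since the kernel reads the routing entry's flags rather than re-classifying the address, and subnet-directed broadcasts are only caught by the route:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <stdio.h>

/* Suppress a TCP reset when the destination was not a unicast address. */
static bool suppress_reset(const char *dst)
{
	struct in_addr a;

	if (inet_pton(AF_INET, dst, &a) != 1)
		return true;                        /* unparsable: be conservative */
	if (IN_MULTICAST(ntohl(a.s_addr)))          /* 224.0.0.0/4 */
		return true;
	if (ntohl(a.s_addr) == INADDR_BROADCAST)    /* 255.255.255.255 */
		return true;
	return false;
}

int main(void)
{
	printf("10.0.0.1        -> %s\n", suppress_reset("10.0.0.1") ? "skip" : "reset");
	printf("224.0.0.251     -> %s\n", suppress_reset("224.0.0.251") ? "skip" : "reset");
	printf("255.255.255.255 -> %s\n", suppress_reset("255.255.255.255") ? "skip" : "reset");
	return 0;
}
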
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
deleted file mode 100644
index db8bff0fb86d..000000000000
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * iptables module to match inet_addr_type() of an ip.
3 *
4 * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
5 * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/netdevice.h>
16#include <linux/ip.h>
17#include <net/route.h>
18
19#include <linux/netfilter_ipv4/ipt_addrtype.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24MODULE_DESCRIPTION("Xtables: address type match for IPv4");
25
26static inline bool match_type(struct net *net, const struct net_device *dev,
27 __be32 addr, u_int16_t mask)
28{
29 return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
30}
31
32static bool
33addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
34{
35 struct net *net = dev_net(par->in ? par->in : par->out);
36 const struct ipt_addrtype_info *info = par->matchinfo;
37 const struct iphdr *iph = ip_hdr(skb);
38 bool ret = true;
39
40 if (info->source)
41 ret &= match_type(net, NULL, iph->saddr, info->source) ^
42 info->invert_source;
43 if (info->dest)
44 ret &= match_type(net, NULL, iph->daddr, info->dest) ^
45 info->invert_dest;
46
47 return ret;
48}
49
50static bool
51addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
52{
53 struct net *net = dev_net(par->in ? par->in : par->out);
54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
55 const struct iphdr *iph = ip_hdr(skb);
56 const struct net_device *dev = NULL;
57 bool ret = true;
58
59 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
60 dev = par->in;
61 else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
62 dev = par->out;
63
64 if (info->source)
65 ret &= match_type(net, dev, iph->saddr, info->source) ^
66 (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
67 if (ret && info->dest)
68 ret &= match_type(net, dev, iph->daddr, info->dest) ^
69 !!(info->flags & IPT_ADDRTYPE_INVERT_DEST);
70 return ret;
71}
72
73static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
74{
75 struct ipt_addrtype_info_v1 *info = par->matchinfo;
76
77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
79 pr_info("both incoming and outgoing "
80 "interface limitation cannot be selected\n");
81 return -EINVAL;
82 }
83
84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
85 (1 << NF_INET_LOCAL_IN)) &&
86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
87 pr_info("output interface limitation "
88 "not valid in PREROUTING and INPUT\n");
89 return -EINVAL;
90 }
91
92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
93 (1 << NF_INET_LOCAL_OUT)) &&
94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
95 pr_info("input interface limitation "
96 "not valid in POSTROUTING and OUTPUT\n");
97 return -EINVAL;
98 }
99
100 return 0;
101}
102
103static struct xt_match addrtype_mt_reg[] __read_mostly = {
104 {
105 .name = "addrtype",
106 .family = NFPROTO_IPV4,
107 .match = addrtype_mt_v0,
108 .matchsize = sizeof(struct ipt_addrtype_info),
109 .me = THIS_MODULE
110 },
111 {
112 .name = "addrtype",
113 .family = NFPROTO_IPV4,
114 .revision = 1,
115 .match = addrtype_mt_v1,
116 .checkentry = addrtype_mt_checkentry_v1,
117 .matchsize = sizeof(struct ipt_addrtype_info_v1),
118 .me = THIS_MODULE
119 }
120};
121
122static int __init addrtype_mt_init(void)
123{
124 return xt_register_matches(addrtype_mt_reg,
125 ARRAY_SIZE(addrtype_mt_reg));
126}
127
128static void __exit addrtype_mt_exit(void)
129{
130 xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
131}
132
133module_init(addrtype_mt_init);
134module_exit(addrtype_mt_exit);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index af6e9c778345..2b57e52c746c 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -25,7 +25,8 @@ MODULE_LICENSE("GPL");
25static inline bool match_ip(const struct sk_buff *skb, 25static inline bool match_ip(const struct sk_buff *skb,
26 const struct ipt_ecn_info *einfo) 26 const struct ipt_ecn_info *einfo)
27{ 27{
28 return (ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect; 28 return ((ip_hdr(skb)->tos & IPT_ECN_IP_MASK) == einfo->ip_ect) ^
29 !!(einfo->invert & IPT_ECN_OP_MATCH_IP);
29} 30}
30 31
31static inline bool match_tcp(const struct sk_buff *skb, 32static inline bool match_tcp(const struct sk_buff *skb,
@@ -76,8 +77,6 @@ static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
76 return false; 77 return false;
77 78
78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { 79 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
79 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
80 return false;
81 if (!match_tcp(skb, info, &par->hotdrop)) 80 if (!match_tcp(skb, info, &par->hotdrop))
82 return false; 81 return false;
83 } 82 }
@@ -97,7 +96,7 @@ static int ecn_mt_check(const struct xt_mtchk_param *par)
97 return -EINVAL; 96 return -EINVAL;
98 97
99 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && 98 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
100 ip->proto != IPPROTO_TCP) { 99 (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) {
101 pr_info("cannot match TCP bits in rule for non-tcp packets\n"); 100 pr_info("cannot match TCP bits in rule for non-tcp packets\n");
102 return -EINVAL; 101 return -EINVAL;
103 } 102 }
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
61 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
62 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN) {
64 iph = ip_hdr(skb); 64 iph = ip_hdr(skb);
65 65
66 if (iph->saddr != saddr || 66 if (iph->saddr != saddr ||
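
The mangle output hook above snapshots the routing-relevant packet fields before running the table and, for any verdict other than drop or steal, reroutes the skb when one of them changed. A hedged userspace sketch of the compare step; the field set and names here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>

struct route_key {
        uint32_t saddr;
        uint32_t daddr;
        uint32_t mark;
        uint8_t  tos;
};

/* Reroute only when a field that feeds route selection actually changed. */
static bool needs_reroute(const struct route_key *before,
                          const struct route_key *after)
{
        return before->saddr != after->saddr ||
               before->daddr != after->daddr ||
               before->mark  != after->mark  ||
               before->tos   != after->tos;
}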
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5a03c02af999..de9da21113a1 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -101,7 +101,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
101 101
102 /* This is where we call the helper: as the packet goes out. */ 102 /* This is where we call the helper: as the packet goes out. */
103 ct = nf_ct_get(skb, &ctinfo); 103 ct = nf_ct_get(skb, &ctinfo);
104 if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) 104 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
105 goto out; 105 goto out;
106 106
107 help = nfct_help(ct); 107 help = nfct_help(ct);
@@ -121,7 +121,9 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
121 return ret; 121 return ret;
122 } 122 }
123 123
124 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)) { 124 /* adjust seqs for loopback traffic only in outgoing direction */
125 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
126 !nf_is_loopback_packet(skb)) {
125 typeof(nf_nat_seq_adjust_hook) seq_adjust; 127 typeof(nf_nat_seq_adjust_hook) seq_adjust;
126 128
127 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); 129 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d68..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/security.h>
14#include <net/net_namespace.h> 15#include <net/net_namespace.h>
15 16
16#include <linux/netfilter.h> 17#include <linux/netfilter.h>
@@ -19,6 +20,7 @@
19#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
20#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
21#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h>
22 24
23struct ct_iter_state { 25struct ct_iter_state {
24 struct seq_net_private p; 26 struct seq_net_private p;
@@ -34,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
34 for (st->bucket = 0; 36 for (st->bucket = 0;
35 st->bucket < net->ct.htable_size; 37 st->bucket < net->ct.htable_size;
36 st->bucket++) { 38 st->bucket++) {
37 n = rcu_dereference(net->ct.hash[st->bucket].first); 39 n = rcu_dereference(
40 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
38 if (!is_a_nulls(n)) 41 if (!is_a_nulls(n))
39 return n; 42 return n;
40 } 43 }
@@ -47,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
47 struct net *net = seq_file_net(seq); 50 struct net *net = seq_file_net(seq);
48 struct ct_iter_state *st = seq->private; 51 struct ct_iter_state *st = seq->private;
49 52
50 head = rcu_dereference(head->next); 53 head = rcu_dereference(hlist_nulls_next_rcu(head));
51 while (is_a_nulls(head)) { 54 while (is_a_nulls(head)) {
52 if (likely(get_nulls_value(head) == st->bucket)) { 55 if (likely(get_nulls_value(head) == st->bucket)) {
53 if (++st->bucket >= net->ct.htable_size) 56 if (++st->bucket >= net->ct.htable_size)
54 return NULL; 57 return NULL;
55 } 58 }
56 head = rcu_dereference(net->ct.hash[st->bucket].first); 59 head = rcu_dereference(
60 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
57 } 61 }
58 return head; 62 return head;
59} 63}
@@ -87,6 +91,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
87 rcu_read_unlock(); 91 rcu_read_unlock();
88} 92}
89 93
94#ifdef CONFIG_NF_CONNTRACK_SECMARK
95static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
96{
97 int ret;
98 u32 len;
99 char *secctx;
100
101 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
102 if (ret)
103 return 0;
104
105 ret = seq_printf(s, "secctx=%s ", secctx);
106
107 security_release_secctx(secctx, len);
108 return ret;
109}
110#else
111static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
112{
113 return 0;
114}
115#endif
116
90static int ct_seq_show(struct seq_file *s, void *v) 117static int ct_seq_show(struct seq_file *s, void *v)
91{ 118{
92 struct nf_conntrack_tuple_hash *hash = v; 119 struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +175,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
148 goto release; 175 goto release;
149#endif 176#endif
150 177
151#ifdef CONFIG_NF_CONNTRACK_SECMARK 178 if (ct_show_secctx(s, ct))
152 if (seq_printf(s, "secmark=%u ", ct->secmark))
153 goto release; 179 goto release;
154#endif
155 180
156 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 181 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
157 goto release; 182 goto release;
@@ -195,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
195 struct hlist_node *n; 220 struct hlist_node *n;
196 221
197 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 222 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
198 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 223 n = rcu_dereference(
224 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
199 if (n) 225 if (n)
200 return n; 226 return n;
201 } 227 }
@@ -208,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
208 struct net *net = seq_file_net(seq); 234 struct net *net = seq_file_net(seq);
209 struct ct_expect_iter_state *st = seq->private; 235 struct ct_expect_iter_state *st = seq->private;
210 236
211 head = rcu_dereference(head->next); 237 head = rcu_dereference(hlist_next_rcu(head));
212 while (head == NULL) { 238 while (head == NULL) {
213 if (++st->bucket >= nf_ct_expect_hsize) 239 if (++st->bucket >= nf_ct_expect_hsize)
214 return NULL; 240 return NULL;
215 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 241 head = rcu_dereference(
242 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
216 } 243 }
217 return head; 244 return head;
218} 245}
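
The conntrack /proc walkers above switch to the hlist_nulls_first_rcu()/hlist_nulls_next_rcu() accessors. A nulls list ends in an encoded marker rather than NULL, so a lockless RCU reader that falls off a chain can tell which bucket the terminator belongs to and restart if the entry it was following got rehashed mid-walk. A small self-contained sketch of that marker encoding (helper names are made up; the kernel's real ones live in include/linux/list_nulls.h):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Terminator pointers carry the bucket number shifted left, low bit set. */
static uintptr_t make_nulls_marker(unsigned long bucket) { return (bucket << 1) | 1UL; }
static bool is_nulls_marker(uintptr_t p)                 { return p & 1UL; }
static unsigned long nulls_marker_value(uintptr_t p)     { return p >> 1; }

int main(void)
{
        uintptr_t end = make_nulls_marker(42);

        /* A reader reaching a marker for a different bucket than the one it
         * started in knows the chain changed under it and must restart. */
        printf("%d %lu\n", is_nulls_marker(end), nulls_marker_value(end));
        return 0;   /* prints "1 42" */
}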
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 7404bde95994..ab5b27a2916f 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -160,7 +160,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
160 /* Update skb to refer to this connection */ 160 /* Update skb to refer to this connection */
161 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; 161 skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
162 skb->nfctinfo = *ctinfo; 162 skb->nfctinfo = *ctinfo;
163 return -NF_ACCEPT; 163 return NF_ACCEPT;
164} 164}
165 165
166/* Small and modified version of icmp_rcv */ 166/* Small and modified version of icmp_rcv */
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b87668250..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int res;
48
47 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
48 if (nf_ct_expect_related(exp) == 0) 50 res = nf_ct_expect_related(exp);
51 if (res == 0)
52 break;
53 else if (res != -EBUSY) {
54 port = 0;
49 break; 55 break;
56 }
50 } 57 }
51 58
52 if (port == 0) 59 if (port == 0)
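
The port-search loop above (and the matching changes in the FTP, IRC and SIP helpers further down) now distinguishes "this port is already expected" (-EBUSY, keep scanning) from any other failure (give up at once, leaving port == 0 so the helper drops the packet). A self-contained userspace sketch of the pattern, with a toy stand-in for nf_ct_expect_related():

#include <errno.h>
#include <stdio.h>

/* Toy stand-in: pretend everything below 1024 is already taken. */
static int try_register(unsigned short port)
{
        return port < 1024 ? -EBUSY : 0;
}

static unsigned short pick_port(unsigned short wanted)
{
        unsigned short port;

        for (port = wanted; port != 0; port++) {
                int ret = try_register(port);

                if (ret == 0)
                        return port;        /* reserved */
                if (ret != -EBUSY)
                        return 0;           /* hard error: stop scanning */
        }
        return 0;                           /* wrapped around: nothing free */
}

int main(void)
{
        printf("%u\n", pick_port(1000));    /* prints "1024" */
        return 0;
}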
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93c..3346de5d94d0 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
38static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
39 39
40#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
41static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
42 __read_mostly; 42 __read_mostly;
43 43
44static inline const struct nf_nat_protocol * 44static inline const struct nf_nat_protocol *
@@ -47,28 +47,6 @@ __nf_nat_proto_find(u_int8_t protonum)
47 return rcu_dereference(nf_nat_protos[protonum]); 47 return rcu_dereference(nf_nat_protos[protonum]);
48} 48}
49 49
50const struct nf_nat_protocol *
51nf_nat_proto_find_get(u_int8_t protonum)
52{
53 const struct nf_nat_protocol *p;
54
55 rcu_read_lock();
56 p = __nf_nat_proto_find(protonum);
57 if (!try_module_get(p->me))
58 p = &nf_nat_unknown_protocol;
59 rcu_read_unlock();
60
61 return p;
62}
63EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
64
65void
66nf_nat_proto_put(const struct nf_nat_protocol *p)
67{
68 module_put(p->me);
69}
70EXPORT_SYMBOL_GPL(nf_nat_proto_put);
71
72/* We keep an extra hash for each conntrack, for fast searching. */ 50/* We keep an extra hash for each conntrack, for fast searching. */
73static inline unsigned int 51static inline unsigned int
74hash_by_src(const struct net *net, u16 zone, 52hash_by_src(const struct net *net, u16 zone,
@@ -243,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
243 manips not an issue. */ 221 manips not an issue. */
244 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
245 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
246 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { 224 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) {
227 *tuple = *orig_tuple;
228 return;
229 }
230 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
231 range)) {
247 pr_debug("get_unique_tuple: Found current src map\n"); 232 pr_debug("get_unique_tuple: Found current src map\n");
248 if (!nf_nat_used_tuple(tuple, ct)) 233 if (!nf_nat_used_tuple(tuple, ct))
249 return; 234 return;
@@ -262,11 +247,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
262 proto = __nf_nat_proto_find(orig_tuple->dst.protonum); 247 proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
263 248
264 /* Only bother mapping if it's not already in range and unique */ 249 /* Only bother mapping if it's not already in range and unique */
265 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) && 250 if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
266 (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || 251 if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
267 proto->in_range(tuple, maniptype, &range->min, &range->max)) && 252 if (proto->in_range(tuple, maniptype, &range->min,
268 !nf_nat_used_tuple(tuple, ct)) 253 &range->max) &&
269 goto out; 254 (range->min.all == range->max.all ||
255 !nf_nat_used_tuple(tuple, ct)))
256 goto out;
257 } else if (!nf_nat_used_tuple(tuple, ct)) {
258 goto out;
259 }
260 }
270 261
271 /* Last change: get protocol to try to obtain unique tuple. */ 262 /* Last change: get protocol to try to obtain unique tuple. */
272 proto->unique_tuple(tuple, range, maniptype, ct); 263 proto->unique_tuple(tuple, range, maniptype, ct);
@@ -282,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
282 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
283 struct nf_conntrack_tuple curr_tuple, new_tuple; 274 struct nf_conntrack_tuple curr_tuple, new_tuple;
284 struct nf_conn_nat *nat; 275 struct nf_conn_nat *nat;
285 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
286 276
287 /* nat helper or nfctnetlink also setup binding */ 277 /* nat helper or nfctnetlink also setup binding */
288 nat = nfct_nat(ct); 278 nat = nfct_nat(ct);
@@ -322,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
322 ct->status |= IPS_DST_NAT; 312 ct->status |= IPS_DST_NAT;
323 } 313 }
324 314
325 /* Place in source hash if this is the first time. */ 315 if (maniptype == IP_NAT_MANIP_SRC) {
326 if (have_to_hash) {
327 unsigned int srchash; 316 unsigned int srchash;
328 317
329 srchash = hash_by_src(net, nf_ct_zone(ct), 318 srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -339,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
339 328
340 /* It's done. */ 329 /* It's done. */
341 if (maniptype == IP_NAT_MANIP_DST) 330 if (maniptype == IP_NAT_MANIP_DST)
342 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); 331 ct->status |= IPS_DST_NAT_DONE;
343 else 332 else
344 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 333 ct->status |= IPS_SRC_NAT_DONE;
345 334
346 return NF_ACCEPT; 335 return NF_ACCEPT;
347} 336}
@@ -444,7 +433,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
444 433
445 /* Must be RELATED */ 434 /* Must be RELATED */
446 NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED || 435 NF_CT_ASSERT(skb->nfctinfo == IP_CT_RELATED ||
447 skb->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); 436 skb->nfctinfo == IP_CT_RELATED_REPLY);
448 437
449 /* Redirects on non-null nats must be dropped, else they'll 438 /* Redirects on non-null nats must be dropped, else they'll
450 start talking to each other without our translation, and be 439 start talking to each other without our translation, and be
@@ -458,6 +447,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
458 return 0; 447 return 0;
459 } 448 }
460 449
450 if (manip == IP_NAT_MANIP_SRC)
451 statusbit = IPS_SRC_NAT;
452 else
453 statusbit = IPS_DST_NAT;
454
455 /* Invert if this is reply dir. */
456 if (dir == IP_CT_DIR_REPLY)
457 statusbit ^= IPS_NAT_MASK;
458
459 if (!(ct->status & statusbit))
460 return 1;
461
461 pr_debug("icmp_reply_translation: translating error %p manip %u " 462 pr_debug("icmp_reply_translation: translating error %p manip %u "
462 "dir %s\n", skb, manip, 463 "dir %s\n", skb, manip,
463 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); 464 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -492,20 +493,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
492 493
493 /* Change outer to look the reply to an incoming packet 494 /* Change outer to look the reply to an incoming packet
494 * (proto 0 means don't invert per-proto part). */ 495 * (proto 0 means don't invert per-proto part). */
495 if (manip == IP_NAT_MANIP_SRC) 496 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
496 statusbit = IPS_SRC_NAT; 497 if (!manip_pkt(0, skb, 0, &target, manip))
497 else 498 return 0;
498 statusbit = IPS_DST_NAT;
499
500 /* Invert if this is reply dir. */
501 if (dir == IP_CT_DIR_REPLY)
502 statusbit ^= IPS_NAT_MASK;
503
504 if (ct->status & statusbit) {
505 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
506 if (!manip_pkt(0, skb, 0, &target, manip))
507 return 0;
508 }
509 499
510 return 1; 500 return 1;
511} 501}
@@ -517,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
517 int ret = 0; 507 int ret = 0;
518 508
519 spin_lock_bh(&nf_nat_lock); 509 spin_lock_bh(&nf_nat_lock);
520 if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { 510 if (rcu_dereference_protected(
511 nf_nat_protos[proto->protonum],
512 lockdep_is_held(&nf_nat_lock)
513 ) != &nf_nat_unknown_protocol) {
521 ret = -EBUSY; 514 ret = -EBUSY;
522 goto out; 515 goto out;
523 } 516 }
@@ -528,7 +521,7 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
528} 521}
529EXPORT_SYMBOL(nf_nat_protocol_register); 522EXPORT_SYMBOL(nf_nat_protocol_register);
530 523
531/* Noone stores the protocol anywhere; simply delete it. */ 524/* No one stores the protocol anywhere; simply delete it. */
532void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) 525void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
533{ 526{
534 spin_lock_bh(&nf_nat_lock); 527 spin_lock_bh(&nf_nat_lock);
@@ -539,7 +532,7 @@ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
539} 532}
540EXPORT_SYMBOL(nf_nat_protocol_unregister); 533EXPORT_SYMBOL(nf_nat_protocol_unregister);
541 534
542/* Noone using conntrack by the time this called. */ 535/* No one using conntrack by the time this called. */
543static void nf_nat_cleanup_conntrack(struct nf_conn *ct) 536static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
544{ 537{
545 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); 538 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
@@ -547,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
547 if (nat == NULL || nat->ct == NULL) 540 if (nat == NULL || nat->ct == NULL)
548 return; 541 return;
549 542
550 NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); 543 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
551 544
552 spin_lock_bh(&nf_nat_lock); 545 spin_lock_bh(&nf_nat_lock);
553 hlist_del_rcu(&nat->bysource); 546 hlist_del_rcu(&nat->bysource);
@@ -560,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
560 struct nf_conn_nat *old_nat = old; 553 struct nf_conn_nat *old_nat = old;
561 struct nf_conn *ct = old_nat->ct; 554 struct nf_conn *ct = old_nat->ct;
562 555
563 if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) 556 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
564 return; 557 return;
565 558
566 spin_lock_bh(&nf_nat_lock); 559 spin_lock_bh(&nf_nat_lock);
567 new_nat->ct = ct;
568 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); 560 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
569 spin_unlock_bh(&nf_nat_lock); 561 spin_unlock_bh(&nf_nat_lock);
570} 562}
@@ -583,6 +575,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
583#include <linux/netfilter/nfnetlink.h> 575#include <linux/netfilter/nfnetlink.h>
584#include <linux/netfilter/nfnetlink_conntrack.h> 576#include <linux/netfilter/nfnetlink_conntrack.h>
585 577
578static const struct nf_nat_protocol *
579nf_nat_proto_find_get(u_int8_t protonum)
580{
581 const struct nf_nat_protocol *p;
582
583 rcu_read_lock();
584 p = __nf_nat_proto_find(protonum);
585 if (!try_module_get(p->me))
586 p = &nf_nat_unknown_protocol;
587 rcu_read_unlock();
588
589 return p;
590}
591
592static void
593nf_nat_proto_put(const struct nf_nat_protocol *p)
594{
595 module_put(p->me);
596}
597
586static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { 598static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
587 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, 599 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 },
588 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, 600 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 },
@@ -674,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
674{ 686{
675 /* Leave them the same for the moment. */ 687 /* Leave them the same for the moment. */
676 net->ipv4.nat_htable_size = net->ct.htable_size; 688 net->ipv4.nat_htable_size = net->ct.htable_size;
677 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 689 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
678 &net->ipv4.nat_vmalloced, 0);
679 if (!net->ipv4.nat_bysource) 690 if (!net->ipv4.nat_bysource)
680 return -ENOMEM; 691 return -ENOMEM;
681 return 0; 692 return 0;
@@ -697,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
697{ 708{
698 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 709 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
699 synchronize_rcu(); 710 synchronize_rcu();
700 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 711 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
701 net->ipv4.nat_htable_size);
702} 712}
703 713
704static struct pernet_operations nf_nat_net_ops = { 714static struct pernet_operations nf_nat_net_ops = {
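
With the protocol array now annotated __rcu, the registration path above reads the slot under nf_nat_lock via rcu_dereference_protected() with a lockdep expression, so both sparse and lockdep can check the access. A kernel-style sketch of that update-side idiom; the struct and function names are hypothetical, the RCU and locking calls are the real APIs:

static DEFINE_SPINLOCK(slot_lock);
static const struct my_proto __rcu *slot;          /* hypothetical payload type */

static int slot_register(const struct my_proto *p)
{
        int ret = 0;

        spin_lock_bh(&slot_lock);
        /* Writer-side read: safe because we hold the lock, and we tell
         * lockdep exactly which lock justifies the plain dereference. */
        if (rcu_dereference_protected(slot, lockdep_is_held(&slot_lock)))
                ret = -EBUSY;
        else
                rcu_assign_pointer(slot, p);        /* publish to RCU readers */
        spin_unlock_bh(&slot_lock);
        return ret;
}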
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a0..dc73abb3fe27 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
79 79
80 /* Try to get same port: if not, try to change it. */ 80 /* Try to get same port: if not, try to change it. */
81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 81 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
82 int ret;
83
82 exp->tuple.dst.u.tcp.port = htons(port); 84 exp->tuple.dst.u.tcp.port = htons(port);
83 if (nf_ct_expect_related(exp) == 0) 85 ret = nf_ct_expect_related(exp);
86 if (ret == 0)
87 break;
88 else if (ret != -EBUSY) {
89 port = 0;
84 break; 90 break;
91 }
85 } 92 }
86 93
87 if (port == 0) 94 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853c..790f3160e012 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
222 /* Try to get a pair of ports. */ 222 /* Try to get a pair of ports. */
223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); 223 for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
224 nated_port != 0; nated_port += 2) { 224 nated_port != 0; nated_port += 2) {
225 int ret;
226
225 rtp_exp->tuple.dst.u.udp.port = htons(nated_port); 227 rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
226 if (nf_ct_expect_related(rtp_exp) == 0) { 228 ret = nf_ct_expect_related(rtp_exp);
229 if (ret == 0) {
227 rtcp_exp->tuple.dst.u.udp.port = 230 rtcp_exp->tuple.dst.u.udp.port =
228 htons(nated_port + 1); 231 htons(nated_port + 1);
229 if (nf_ct_expect_related(rtcp_exp) == 0) 232 ret = nf_ct_expect_related(rtcp_exp);
233 if (ret == 0)
234 break;
235 else if (ret != -EBUSY) {
236 nf_ct_unexpect_related(rtp_exp);
237 nated_port = 0;
230 break; 238 break;
231 nf_ct_unexpect_related(rtp_exp); 239 }
240 } else if (ret != -EBUSY) {
241 nated_port = 0;
242 break;
232 } 243 }
233 } 244 }
234 245
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
284 295
285 /* Try to get same port: if not, try to change it. */ 296 /* Try to get same port: if not, try to change it. */
286 for (; nated_port != 0; nated_port++) { 297 for (; nated_port != 0; nated_port++) {
298 int ret;
299
287 exp->tuple.dst.u.tcp.port = htons(nated_port); 300 exp->tuple.dst.u.tcp.port = htons(nated_port);
288 if (nf_ct_expect_related(exp) == 0) 301 ret = nf_ct_expect_related(exp);
302 if (ret == 0)
303 break;
304 else if (ret != -EBUSY) {
305 nated_port = 0;
289 break; 306 break;
307 }
290 } 308 }
291 309
292 if (nated_port == 0) { /* No port available */ 310 if (nated_port == 0) { /* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
334 352
335 /* Try to get same port: if not, try to change it. */ 353 /* Try to get same port: if not, try to change it. */
336 for (; nated_port != 0; nated_port++) { 354 for (; nated_port != 0; nated_port++) {
355 int ret;
356
337 exp->tuple.dst.u.tcp.port = htons(nated_port); 357 exp->tuple.dst.u.tcp.port = htons(nated_port);
338 if (nf_ct_expect_related(exp) == 0) 358 ret = nf_ct_expect_related(exp);
359 if (ret == 0)
339 break; 360 break;
361 else if (ret != -EBUSY) {
362 nated_port = 0;
363 break;
364 }
340 } 365 }
341 366
342 if (nated_port == 0) { /* No port available */ 367 if (nated_port == 0) { /* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
418 443
419 /* Try to get same port: if not, try to change it. */ 444 /* Try to get same port: if not, try to change it. */
420 for (; nated_port != 0; nated_port++) { 445 for (; nated_port != 0; nated_port++) {
446 int ret;
447
421 exp->tuple.dst.u.tcp.port = htons(nated_port); 448 exp->tuple.dst.u.tcp.port = htons(nated_port);
422 if (nf_ct_expect_related(exp) == 0) 449 ret = nf_ct_expect_related(exp);
450 if (ret == 0)
451 break;
452 else if (ret != -EBUSY) {
453 nated_port = 0;
423 break; 454 break;
455 }
424 } 456 }
425 457
426 if (nated_port == 0) { /* No port available */ 458 if (nated_port == 0) { /* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
500 532
501 /* Try to get same port: if not, try to change it. */ 533 /* Try to get same port: if not, try to change it. */
502 for (nated_port = ntohs(port); nated_port != 0; nated_port++) { 534 for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
535 int ret;
536
503 exp->tuple.dst.u.tcp.port = htons(nated_port); 537 exp->tuple.dst.u.tcp.port = htons(nated_port);
504 if (nf_ct_expect_related(exp) == 0) 538 ret = nf_ct_expect_related(exp);
539 if (ret == 0)
505 break; 540 break;
541 else if (ret != -EBUSY) {
542 nated_port = 0;
543 break;
544 }
506 } 545 }
507 546
508 if (nated_port == 0) { /* No port available */ 547 if (nated_port == 0) { /* No port available */
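
nat_rtp_rtcp() above reserves an even/odd RTP/RTCP port pair and has to unwind the RTP expectation when the RTCP one cannot be set up. A generic userspace sketch of that reserve-pair-or-roll-back shape, with toy reserve()/unreserve() helpers; this is not a line-for-line copy of the kernel loop:

#include <errno.h>

static int  reserve(unsigned short port)   { return port < 2000 ? -EBUSY : 0; }  /* toy stub */
static void unreserve(unsigned short port) { (void)port; }

static unsigned short pick_rtp_pair(unsigned short wanted)
{
        unsigned short port;

        for (port = wanted; port != 0; port += 2) {
                int ret = reserve(port);            /* RTP, the even port */

                if (ret == -EBUSY)
                        continue;                   /* pair taken, try the next one */
                if (ret < 0)
                        return 0;                   /* hard error */

                ret = reserve(port + 1);            /* RTCP, the odd port above it */
                if (ret == 0)
                        return port;                /* both sides reserved */

                unreserve(port);                    /* roll back the RTP side */
                if (ret != -EBUSY)
                        return 0;
        }
        return 0;
}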
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548eee..ebc5f8894f99 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
153} 153}
154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); 154EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
155 155
156static void nf_nat_csum(struct sk_buff *skb, const struct iphdr *iph, void *data,
157 int datalen, __sum16 *check, int oldlen)
158{
159 struct rtable *rt = skb_rtable(skb);
160
161 if (skb->ip_summed != CHECKSUM_PARTIAL) {
162 if (!(rt->rt_flags & RTCF_LOCAL) &&
163 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
164 skb->ip_summed = CHECKSUM_PARTIAL;
165 skb->csum_start = skb_headroom(skb) +
166 skb_network_offset(skb) +
167 iph->ihl * 4;
168 skb->csum_offset = (void *)check - data;
169 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
170 datalen, iph->protocol, 0);
171 } else {
172 *check = 0;
173 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
174 datalen, iph->protocol,
175 csum_partial(data, datalen,
176 0));
177 if (iph->protocol == IPPROTO_UDP && !*check)
178 *check = CSUM_MANGLED_0;
179 }
180 } else
181 inet_proto_csum_replace2(check, skb,
182 htons(oldlen), htons(datalen), 1);
183}
184
156/* Generic function for mangling variable-length address changes inside 185/* Generic function for mangling variable-length address changes inside
157 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX 186 * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
158 * command in FTP). 187 * command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
169 const char *rep_buffer, 198 const char *rep_buffer,
170 unsigned int rep_len, bool adjust) 199 unsigned int rep_len, bool adjust)
171{ 200{
172 struct rtable *rt = skb_rtable(skb);
173 struct iphdr *iph; 201 struct iphdr *iph;
174 struct tcphdr *tcph; 202 struct tcphdr *tcph;
175 int oldlen, datalen; 203 int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
192 match_offset, match_len, rep_buffer, rep_len); 220 match_offset, match_len, rep_buffer, rep_len);
193 221
194 datalen = skb->len - iph->ihl*4; 222 datalen = skb->len - iph->ihl*4;
195 if (skb->ip_summed != CHECKSUM_PARTIAL) { 223 nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
196 if (!(rt->rt_flags & RTCF_LOCAL) &&
197 skb->dev->features & NETIF_F_V4_CSUM) {
198 skb->ip_summed = CHECKSUM_PARTIAL;
199 skb->csum_start = skb_headroom(skb) +
200 skb_network_offset(skb) +
201 iph->ihl * 4;
202 skb->csum_offset = offsetof(struct tcphdr, check);
203 tcph->check = ~tcp_v4_check(datalen,
204 iph->saddr, iph->daddr, 0);
205 } else {
206 tcph->check = 0;
207 tcph->check = tcp_v4_check(datalen,
208 iph->saddr, iph->daddr,
209 csum_partial(tcph,
210 datalen, 0));
211 }
212 } else
213 inet_proto_csum_replace2(&tcph->check, skb,
214 htons(oldlen), htons(datalen), 1);
215 224
216 if (adjust && rep_len != match_len) 225 if (adjust && rep_len != match_len)
217 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, 226 nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
240 const char *rep_buffer, 249 const char *rep_buffer,
241 unsigned int rep_len) 250 unsigned int rep_len)
242{ 251{
243 struct rtable *rt = skb_rtable(skb);
244 struct iphdr *iph; 252 struct iphdr *iph;
245 struct udphdr *udph; 253 struct udphdr *udph;
246 int datalen, oldlen; 254 int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
274 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) 282 if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
275 return 1; 283 return 1;
276 284
277 if (skb->ip_summed != CHECKSUM_PARTIAL) { 285 nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
278 if (!(rt->rt_flags & RTCF_LOCAL) &&
279 skb->dev->features & NETIF_F_V4_CSUM) {
280 skb->ip_summed = CHECKSUM_PARTIAL;
281 skb->csum_start = skb_headroom(skb) +
282 skb_network_offset(skb) +
283 iph->ihl * 4;
284 skb->csum_offset = offsetof(struct udphdr, check);
285 udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
286 datalen, IPPROTO_UDP,
287 0);
288 } else {
289 udph->check = 0;
290 udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
291 datalen, IPPROTO_UDP,
292 csum_partial(udph,
293 datalen, 0));
294 if (!udph->check)
295 udph->check = CSUM_MANGLED_0;
296 }
297 } else
298 inet_proto_csum_replace2(&udph->check, skb,
299 htons(oldlen), htons(datalen), 1);
300 286
301 return 1; 287 return 1;
302} 288}
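
The new nf_nat_csum() helper above centralizes the three cases the TCP and UDP manglers previously open-coded: leave an already-offloaded CHECKSUM_PARTIAL skb to inet_proto_csum_replace2(), set up partial checksumming when the device can finish it, or fold the full pseudo-header sum in software (remapping 0 to 0xffff for UDP). A tiny userspace illustration of the final 16-bit ones'-complement fold that csum_fold() applies to the 32-bit running sum:

#include <stdint.h>
#include <stdio.h>

static uint16_t fold32(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16);   /* add the carry words back in */
        sum = (sum & 0xffff) + (sum >> 16);   /* once more for the new carry */
        return (uint16_t)~sum;                /* ones'-complement of the fold */
}

int main(void)
{
        /* 0x0001fffe folds to 0xffff, so the checksum comes out 0x0000 --
         * the value a UDP sender must transmit as 0xffff (CSUM_MANGLED_0). */
        printf("0x%04x\n", fold32(0x0001fffeu));
        return 0;
}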
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03e..535e1a802356 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
45 45
46 /* Try to get same port: if not, try to change it. */ 46 /* Try to get same port: if not, try to change it. */
47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 47 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
48 int ret;
49
48 exp->tuple.dst.u.tcp.port = htons(port); 50 exp->tuple.dst.u.tcp.port = htons(port);
49 if (nf_ct_expect_related(exp) == 0) 51 ret = nf_ct_expect_related(exp);
52 if (ret == 0)
53 break;
54 else if (ret != -EBUSY) {
55 port = 0;
50 break; 56 break;
57 }
51 } 58 }
52 59
53 if (port == 0) 60 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ebbd319f62f5..733c9abc1cbd 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -53,7 +53,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
53 53
54 /* Connection must be valid and new. */ 54 /* Connection must be valid and new. */
55 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || 55 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
56 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 56 ctinfo == IP_CT_RELATED_REPLY));
57 NF_CT_ASSERT(par->out != NULL); 57 NF_CT_ASSERT(par->out != NULL);
58 58
59 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); 59 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
106{ 106{
107 /* Force range to this IP; let proto decide mapping for 107 /* Force range to this IP; let proto decide mapping for
108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 108 per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
109 Use reply in case it's already been mangled (eg local packet).
110 */ 109 */
111 __be32 ip 110 struct nf_nat_range range;
112 = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC 111
113 ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip 112 range.flags = 0;
114 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); 113 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
115 struct nf_nat_range range 114 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
116 = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; 115 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
117 116 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
118 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip); 117
119 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); 118 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
120} 119}
121 120
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaaec..e40cf7816fdb 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
307 exp->expectfn = ip_nat_sip_expected; 307 exp->expectfn = ip_nat_sip_expected;
308 308
309 for (; port != 0; port++) { 309 for (; port != 0; port++) {
310 int ret;
311
310 exp->tuple.dst.u.udp.port = htons(port); 312 exp->tuple.dst.u.udp.port = htons(port);
311 if (nf_ct_expect_related(exp) == 0) 313 ret = nf_ct_expect_related(exp);
314 if (ret == 0)
315 break;
316 else if (ret != -EBUSY) {
317 port = 0;
312 break; 318 break;
319 }
313 } 320 }
314 321
315 if (port == 0) 322 if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
480 /* Try to get same pair of ports: if not, try to change them. */ 487 /* Try to get same pair of ports: if not, try to change them. */
481 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); 488 for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
482 port != 0; port += 2) { 489 port != 0; port += 2) {
490 int ret;
491
483 rtp_exp->tuple.dst.u.udp.port = htons(port); 492 rtp_exp->tuple.dst.u.udp.port = htons(port);
484 if (nf_ct_expect_related(rtp_exp) != 0) 493 ret = nf_ct_expect_related(rtp_exp);
494 if (ret == -EBUSY)
485 continue; 495 continue;
496 else if (ret < 0) {
497 port = 0;
498 break;
499 }
486 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); 500 rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
487 if (nf_ct_expect_related(rtcp_exp) == 0) 501 ret = nf_ct_expect_related(rtcp_exp);
502 if (ret == 0)
488 break; 503 break;
489 nf_ct_unexpect_related(rtp_exp); 504 else if (ret != -EBUSY) {
505 nf_ct_unexpect_related(rtp_exp);
506 port = 0;
507 break;
508 }
490 } 509 }
491 510
492 if (port == 0) 511 if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
54#include <net/netfilter/nf_conntrack_expect.h> 54#include <net/netfilter/nf_conntrack_expect.h>
55#include <net/netfilter/nf_conntrack_helper.h> 55#include <net/netfilter/nf_conntrack_helper.h>
56#include <net/netfilter/nf_nat_helper.h> 56#include <net/netfilter/nf_nat_helper.h>
57#include <linux/netfilter/nf_conntrack_snmp.h>
57 58
58MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
59MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
1310{ 1311{
1311 int ret = 0; 1312 int ret = 0;
1312 1313
1313 ret = nf_conntrack_helper_register(&snmp_helper); 1314 BUG_ON(nf_nat_snmp_hook != NULL);
1314 if (ret < 0) 1315 rcu_assign_pointer(nf_nat_snmp_hook, help);
1315 return ret; 1316
1316 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1317 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) { 1318 if (ret < 0) {
1318 nf_conntrack_helper_unregister(&snmp_helper); 1319 nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
1323 1324
1324static void __exit nf_nat_snmp_basic_fini(void) 1325static void __exit nf_nat_snmp_basic_fini(void)
1325{ 1326{
1326 nf_conntrack_helper_unregister(&snmp_helper); 1327 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1327 nf_conntrack_helper_unregister(&snmp_trap_helper); 1328 nf_conntrack_helper_unregister(&snmp_trap_helper);
1328} 1329}
1329 1330
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 95481fee8bdb..483b76d042da 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -31,6 +31,7 @@
31#ifdef CONFIG_XFRM 31#ifdef CONFIG_XFRM
32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) 32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
33{ 33{
34 struct flowi4 *fl4 = &fl->u.ip4;
34 const struct nf_conn *ct; 35 const struct nf_conn *ct;
35 const struct nf_conntrack_tuple *t; 36 const struct nf_conntrack_tuple *t;
36 enum ip_conntrack_info ctinfo; 37 enum ip_conntrack_info ctinfo;
@@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
49 statusbit = IPS_SRC_NAT; 50 statusbit = IPS_SRC_NAT;
50 51
51 if (ct->status & statusbit) { 52 if (ct->status & statusbit) {
52 fl->fl4_dst = t->dst.u3.ip; 53 fl4->daddr = t->dst.u3.ip;
53 if (t->dst.protonum == IPPROTO_TCP || 54 if (t->dst.protonum == IPPROTO_TCP ||
54 t->dst.protonum == IPPROTO_UDP || 55 t->dst.protonum == IPPROTO_UDP ||
55 t->dst.protonum == IPPROTO_UDPLITE || 56 t->dst.protonum == IPPROTO_UDPLITE ||
56 t->dst.protonum == IPPROTO_DCCP || 57 t->dst.protonum == IPPROTO_DCCP ||
57 t->dst.protonum == IPPROTO_SCTP) 58 t->dst.protonum == IPPROTO_SCTP)
58 fl->fl_ip_dport = t->dst.u.tcp.port; 59 fl4->fl4_dport = t->dst.u.tcp.port;
59 } 60 }
60 61
61 statusbit ^= IPS_NAT_MASK; 62 statusbit ^= IPS_NAT_MASK;
62 63
63 if (ct->status & statusbit) { 64 if (ct->status & statusbit) {
64 fl->fl4_src = t->src.u3.ip; 65 fl4->saddr = t->src.u3.ip;
65 if (t->dst.protonum == IPPROTO_TCP || 66 if (t->dst.protonum == IPPROTO_TCP ||
66 t->dst.protonum == IPPROTO_UDP || 67 t->dst.protonum == IPPROTO_UDP ||
67 t->dst.protonum == IPPROTO_UDPLITE || 68 t->dst.protonum == IPPROTO_UDPLITE ||
68 t->dst.protonum == IPPROTO_DCCP || 69 t->dst.protonum == IPPROTO_DCCP ||
69 t->dst.protonum == IPPROTO_SCTP) 70 t->dst.protonum == IPPROTO_SCTP)
70 fl->fl_ip_sport = t->src.u.tcp.port; 71 fl4->fl4_sport = t->src.u.tcp.port;
71 } 72 }
72} 73}
73#endif 74#endif
@@ -115,7 +116,7 @@ nf_nat_fn(unsigned int hooknum,
115 116
116 switch (ctinfo) { 117 switch (ctinfo) {
117 case IP_CT_RELATED: 118 case IP_CT_RELATED:
118 case IP_CT_RELATED+IP_CT_IS_REPLY: 119 case IP_CT_RELATED_REPLY:
119 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { 120 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
120 if (!nf_nat_icmp_reply_translation(ct, ctinfo, 121 if (!nf_nat_icmp_reply_translation(ct, ctinfo,
121 hooknum, skb)) 122 hooknum, skb))
@@ -143,7 +144,7 @@ nf_nat_fn(unsigned int hooknum,
143 default: 144 default:
144 /* ESTABLISHED */ 145 /* ESTABLISHED */
145 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || 146 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
146 ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); 147 ctinfo == IP_CT_ESTABLISHED_REPLY);
147 } 148 }
148 149
149 return nf_nat_packet(ct, ctinfo, hooknum, skb); 150 return nf_nat_packet(ct, ctinfo, hooknum, skb);
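
nat_decode_session() above now writes into the IPv4-specific struct flowi4 members (saddr/daddr plus the fl4_sport/fl4_dport port union) instead of the old protocol-neutral flowi fields. A kernel-style sketch of filling such a key for a connected flow; the helper name is hypothetical, the member names are the flowi4 ones used in the hunk:

static void fill_ipv4_flow_key(struct flowi4 *fl4,
                               __be32 saddr, __be32 daddr,
                               __be16 sport, __be16 dport, u8 proto)
{
        fl4->saddr        = saddr;
        fl4->daddr        = daddr;
        fl4->flowi4_proto = proto;
        if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
            proto == IPPROTO_UDPLITE || proto == IPPROTO_DCCP ||
            proto == IPPROTO_SCTP) {
                fl4->fl4_sport = sport;   /* lives in the flowi4 port union */
                fl4->fl4_dport = dport;
        }
}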
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
new file mode 100644
index 000000000000..39b403f854c6
--- /dev/null
+++ b/net/ipv4/ping.c
@@ -0,0 +1,931 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * "Ping" sockets
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 * Based on ipv4/udp.c code.
14 *
15 * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6),
16 * Pavel Kankovsky (for Linux 2.4.32)
17 *
18 * Pavel gave all rights to bugs to Vasiliy,
19 * none of the bugs are Pavel's now.
20 *
21 */
22
23#include <asm/system.h>
24#include <linux/uaccess.h>
25#include <linux/types.h>
26#include <linux/fcntl.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/in.h>
30#include <linux/errno.h>
31#include <linux/timer.h>
32#include <linux/mm.h>
33#include <linux/inet.h>
34#include <linux/netdevice.h>
35#include <net/snmp.h>
36#include <net/ip.h>
37#include <net/ipv6.h>
38#include <net/icmp.h>
39#include <net/protocol.h>
40#include <linux/skbuff.h>
41#include <linux/proc_fs.h>
42#include <net/sock.h>
43#include <net/ping.h>
44#include <net/udp.h>
45#include <net/route.h>
46#include <net/inet_common.h>
47#include <net/checksum.h>
48
49
50static struct ping_table ping_table;
51
52static u16 ping_port_rover;
53
54static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
55{
56 int res = (num + net_hash_mix(net)) & mask;
57 pr_debug("hash(%d) = %d\n", num, res);
58 return res;
59}
60
61static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
62 struct net *net, unsigned num)
63{
64 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
65}
66
67static int ping_v4_get_port(struct sock *sk, unsigned short ident)
68{
69 struct hlist_nulls_node *node;
70 struct hlist_nulls_head *hlist;
71 struct inet_sock *isk, *isk2;
72 struct sock *sk2 = NULL;
73
74 isk = inet_sk(sk);
75 write_lock_bh(&ping_table.lock);
76 if (ident == 0) {
77 u32 i;
78 u16 result = ping_port_rover + 1;
79
80 for (i = 0; i < (1L << 16); i++, result++) {
81 if (!result)
82 result++; /* avoid zero */
83 hlist = ping_hashslot(&ping_table, sock_net(sk),
84 result);
85 ping_portaddr_for_each_entry(sk2, node, hlist) {
86 isk2 = inet_sk(sk2);
87
88 if (isk2->inet_num == result)
89 goto next_port;
90 }
91
92 /* found */
93 ping_port_rover = ident = result;
94 break;
95next_port:
96 ;
97 }
98 if (i >= (1L << 16))
99 goto fail;
100 } else {
101 hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
102 ping_portaddr_for_each_entry(sk2, node, hlist) {
103 isk2 = inet_sk(sk2);
104
105 if ((isk2->inet_num == ident) &&
106 (sk2 != sk) &&
107 (!sk2->sk_reuse || !sk->sk_reuse))
108 goto fail;
109 }
110 }
111
112 pr_debug("found port/ident = %d\n", ident);
113 isk->inet_num = ident;
114 if (sk_unhashed(sk)) {
115 pr_debug("was not hashed\n");
116 sock_hold(sk);
117 hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
118 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
119 }
120 write_unlock_bh(&ping_table.lock);
121 return 0;
122
123fail:
124 write_unlock_bh(&ping_table.lock);
125 return 1;
126}
127
128static void ping_v4_hash(struct sock *sk)
129{
130 pr_debug("ping_v4_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
131 BUG(); /* "Please do not press this button again." */
132}
133
134static void ping_v4_unhash(struct sock *sk)
135{
136 struct inet_sock *isk = inet_sk(sk);
137 pr_debug("ping_v4_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
138 if (sk_hashed(sk)) {
139 write_lock_bh(&ping_table.lock);
140 hlist_nulls_del(&sk->sk_nulls_node);
141 sock_put(sk);
142 isk->inet_num = isk->inet_sport = 0;
143 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
144 write_unlock_bh(&ping_table.lock);
145 }
146}
147
148static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,
149 u16 ident, int dif)
150{
151 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
152 struct sock *sk = NULL;
153 struct inet_sock *isk;
154 struct hlist_nulls_node *hnode;
155
156 pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n",
157 (int)ident, (unsigned long)daddr, dif);
158 read_lock_bh(&ping_table.lock);
159
160 ping_portaddr_for_each_entry(sk, hnode, hslot) {
161 isk = inet_sk(sk);
162
163 pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk,
164 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr,
165 sk->sk_bound_dev_if);
166
167 pr_debug("iterate\n");
168 if (isk->inet_num != ident)
169 continue;
170 if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != daddr)
171 continue;
172 if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
173 continue;
174
175 sock_hold(sk);
176 goto exit;
177 }
178
179 sk = NULL;
180exit:
181 read_unlock_bh(&ping_table.lock);
182
183 return sk;
184}
185
186static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
187 gid_t *high)
188{
189 gid_t *data = net->ipv4.sysctl_ping_group_range;
190 unsigned seq;
191 do {
192 seq = read_seqbegin(&sysctl_local_ports.lock);
193
194 *low = data[0];
195 *high = data[1];
196 } while (read_seqretry(&sysctl_local_ports.lock, seq));
197}
198
199
200static int ping_init_sock(struct sock *sk)
201{
202 struct net *net = sock_net(sk);
203 gid_t group = current_egid();
204 gid_t range[2];
205 struct group_info *group_info = get_current_groups();
206 int i, j, count = group_info->ngroups;
207
208 inet_get_ping_group_range_net(net, range, range+1);
209 if (range[0] <= group && group <= range[1])
210 return 0;
211
212 for (i = 0; i < group_info->nblocks; i++) {
213 int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
214
215 for (j = 0; j < cp_count; j++) {
216 group = group_info->blocks[i][j];
217 if (range[0] <= group && group <= range[1])
218 return 0;
219 }
220
221 count -= cp_count;
222 }
223
224 return -EACCES;
225}
226
227static void ping_close(struct sock *sk, long timeout)
228{
229 pr_debug("ping_close(sk=%p,sk->num=%u)\n",
230 inet_sk(sk), inet_sk(sk)->inet_num);
231 pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
232
233 sk_common_release(sk);
234}
235
236/*
237 * We need our own bind because there are no privileged id's == local ports.
238 * Moreover, we don't allow binding to multi- and broadcast addresses.
239 */
240
241static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
242{
243 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
244 struct inet_sock *isk = inet_sk(sk);
245 unsigned short snum;
246 int chk_addr_ret;
247 int err;
248
249 if (addr_len < sizeof(struct sockaddr_in))
250 return -EINVAL;
251
252 pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
253 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
254
255 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
256 if (addr->sin_addr.s_addr == INADDR_ANY)
257 chk_addr_ret = RTN_LOCAL;
258
259 if ((sysctl_ip_nonlocal_bind == 0 &&
260 isk->freebind == 0 && isk->transparent == 0 &&
261 chk_addr_ret != RTN_LOCAL) ||
262 chk_addr_ret == RTN_MULTICAST ||
263 chk_addr_ret == RTN_BROADCAST)
264 return -EADDRNOTAVAIL;
265
266 lock_sock(sk);
267
268 err = -EINVAL;
269 if (isk->inet_num != 0)
270 goto out;
271
272 err = -EADDRINUSE;
273 isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr;
274 snum = ntohs(addr->sin_port);
275 if (ping_v4_get_port(sk, snum) != 0) {
276 isk->inet_saddr = isk->inet_rcv_saddr = 0;
277 goto out;
278 }
279
280 pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n",
281 (int)isk->inet_num,
282 (unsigned long) isk->inet_rcv_saddr,
283 (int)sk->sk_bound_dev_if);
284
285 err = 0;
286 if (isk->inet_rcv_saddr)
287 sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
288 if (snum)
289 sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
290 isk->inet_sport = htons(isk->inet_num);
291 isk->inet_daddr = 0;
292 isk->inet_dport = 0;
293 sk_dst_reset(sk);
294out:
295 release_sock(sk);
296 pr_debug("ping_v4_bind -> %d\n", err);
297 return err;
298}
299
300/*
301 * Is this a supported type of ICMP message?
302 */
303
304static inline int ping_supported(int type, int code)
305{
306 if (type == ICMP_ECHO && code == 0)
307 return 1;
308 return 0;
309}
310
311/*
312 * This routine is called by the ICMP module when it gets some
313 * sort of error condition.
314 */
315
316static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
317
318void ping_err(struct sk_buff *skb, u32 info)
319{
320 struct iphdr *iph = (struct iphdr *)skb->data;
321 struct icmphdr *icmph = (struct icmphdr *)(skb->data+(iph->ihl<<2));
322 struct inet_sock *inet_sock;
323 int type = icmph->type;
324 int code = icmph->code;
325 struct net *net = dev_net(skb->dev);
326 struct sock *sk;
327 int harderr;
328 int err;
329
330 /* We assume the packet has already been checked by icmp_unreach */
331
332 if (!ping_supported(icmph->type, icmph->code))
333 return;
334
335 pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
336 code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
337
338 sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
339 ntohs(icmph->un.echo.id), skb->dev->ifindex);
340 if (sk == NULL) {
341 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
342 pr_debug("no socket, dropping\n");
343 return; /* No socket for error */
344 }
345 pr_debug("err on socket %p\n", sk);
346
347 err = 0;
348 harderr = 0;
349 inet_sock = inet_sk(sk);
350
351 switch (type) {
352 default:
353 case ICMP_TIME_EXCEEDED:
354 err = EHOSTUNREACH;
355 break;
356 case ICMP_SOURCE_QUENCH:
357 /* This is not a real error but ping wants to see it.
358 * Report it with some fake errno. */
359 err = EREMOTEIO;
360 break;
361 case ICMP_PARAMETERPROB:
362 err = EPROTO;
363 harderr = 1;
364 break;
365 case ICMP_DEST_UNREACH:
366 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
367 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
368 err = EMSGSIZE;
369 harderr = 1;
370 break;
371 }
372 goto out;
373 }
374 err = EHOSTUNREACH;
375 if (code <= NR_ICMP_UNREACH) {
376 harderr = icmp_err_convert[code].fatal;
377 err = icmp_err_convert[code].errno;
378 }
379 break;
380 case ICMP_REDIRECT:
381 /* See ICMP_SOURCE_QUENCH */
382 err = EREMOTEIO;
383 break;
384 }
385
386 /*
387 * RFC1122: OK. Passes ICMP errors back to application, as per
388 * 4.1.3.3.
389 */
390 if (!inet_sock->recverr) {
391 if (!harderr || sk->sk_state != TCP_ESTABLISHED)
392 goto out;
393 } else {
394 ip_icmp_error(sk, skb, err, 0 /* no remote port */,
395 info, (u8 *)icmph);
396 }
397 sk->sk_err = err;
398 sk->sk_error_report(sk);
399out:
400 sock_put(sk);
401}
402
403/*
404 * Copy and checksum an ICMP Echo packet from user space into a buffer.
405 */
406
407struct pingfakehdr {
408 struct icmphdr icmph;
409 struct iovec *iov;
410 u32 wcheck;
411};
412
413static int ping_getfrag(void *from, char * to,
414 int offset, int fraglen, int odd, struct sk_buff *skb)
415{
416 struct pingfakehdr *pfh = (struct pingfakehdr *)from;
417
418 if (offset == 0) {
419 if (fraglen < sizeof(struct icmphdr))
420 BUG();
421 if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr),
422 pfh->iov, 0, fraglen - sizeof(struct icmphdr),
423 &pfh->wcheck))
424 return -EFAULT;
425
426 return 0;
427 }
428 if (offset < sizeof(struct icmphdr))
429 BUG();
430 if (csum_partial_copy_fromiovecend
431 (to, pfh->iov, offset - sizeof(struct icmphdr),
432 fraglen, &pfh->wcheck))
433 return -EFAULT;
434 return 0;
435}
436
437static int ping_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
438 struct flowi4 *fl4)
439{
440 struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
441
442 pfh->wcheck = csum_partial((char *)&pfh->icmph,
443 sizeof(struct icmphdr), pfh->wcheck);
444 pfh->icmph.checksum = csum_fold(pfh->wcheck);
445 memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr));
446 skb->ip_summed = CHECKSUM_NONE;
447 return ip_push_pending_frames(sk, fl4);
448}
449
450static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
451 size_t len)
452{
453 struct net *net = sock_net(sk);
454 struct flowi4 fl4;
455 struct inet_sock *inet = inet_sk(sk);
456 struct ipcm_cookie ipc;
457 struct icmphdr user_icmph;
458 struct pingfakehdr pfh;
459 struct rtable *rt = NULL;
460 struct ip_options_data opt_copy;
461 int free = 0;
462 u32 saddr, daddr, faddr;
463 u8 tos;
464 int err;
465
466 pr_debug("ping_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
467
468
469 if (len > 0xFFFF)
470 return -EMSGSIZE;
471
472 /*
473 * Check the flags.
474 */
475
476 /* Mirror BSD error message compatibility */
477 if (msg->msg_flags & MSG_OOB)
478 return -EOPNOTSUPP;
479
480 /*
481 * Fetch the ICMP header provided by the userland.
482 * iovec is modified!
483 */
484
485 if (memcpy_fromiovec((u8 *)&user_icmph, msg->msg_iov,
486 sizeof(struct icmphdr)))
487 return -EFAULT;
488 if (!ping_supported(user_icmph.type, user_icmph.code))
489 return -EINVAL;
490
491 /*
492 * Get and verify the address.
493 */
494
495 if (msg->msg_name) {
496 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
497 if (msg->msg_namelen < sizeof(*usin))
498 return -EINVAL;
499 if (usin->sin_family != AF_INET)
500 return -EINVAL;
501 daddr = usin->sin_addr.s_addr;
502 /* no remote port */
503 } else {
504 if (sk->sk_state != TCP_ESTABLISHED)
505 return -EDESTADDRREQ;
506 daddr = inet->inet_daddr;
507 /* no remote port */
508 }
509
510 ipc.addr = inet->inet_saddr;
511 ipc.opt = NULL;
512 ipc.oif = sk->sk_bound_dev_if;
513 ipc.tx_flags = 0;
514 err = sock_tx_timestamp(sk, &ipc.tx_flags);
515 if (err)
516 return err;
517
518 if (msg->msg_controllen) {
519 err = ip_cmsg_send(sock_net(sk), msg, &ipc);
520 if (err)
521 return err;
522 if (ipc.opt)
523 free = 1;
524 }
525 if (!ipc.opt) {
526 struct ip_options_rcu *inet_opt;
527
528 rcu_read_lock();
529 inet_opt = rcu_dereference(inet->inet_opt);
530 if (inet_opt) {
531 memcpy(&opt_copy, inet_opt,
532 sizeof(*inet_opt) + inet_opt->opt.optlen);
533 ipc.opt = &opt_copy.opt;
534 }
535 rcu_read_unlock();
536 }
537
538 saddr = ipc.addr;
539 ipc.addr = faddr = daddr;
540
541 if (ipc.opt && ipc.opt->opt.srr) {
542 if (!daddr)
543 return -EINVAL;
544 faddr = ipc.opt->opt.faddr;
545 }
546 tos = RT_TOS(inet->tos);
547 if (sock_flag(sk, SOCK_LOCALROUTE) ||
548 (msg->msg_flags & MSG_DONTROUTE) ||
549 (ipc.opt && ipc.opt->opt.is_strictroute)) {
550 tos |= RTO_ONLINK;
551 }
552
553 if (ipv4_is_multicast(daddr)) {
554 if (!ipc.oif)
555 ipc.oif = inet->mc_index;
556 if (!saddr)
557 saddr = inet->mc_addr;
558 }
559
560 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
561 RT_SCOPE_UNIVERSE, sk->sk_protocol,
562 inet_sk_flowi_flags(sk), faddr, saddr, 0, 0);
563
564 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
565 rt = ip_route_output_flow(net, &fl4, sk);
566 if (IS_ERR(rt)) {
567 err = PTR_ERR(rt);
568 rt = NULL;
569 if (err == -ENETUNREACH)
570 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
571 goto out;
572 }
573
574 err = -EACCES;
575 if ((rt->rt_flags & RTCF_BROADCAST) &&
576 !sock_flag(sk, SOCK_BROADCAST))
577 goto out;
578
579 if (msg->msg_flags & MSG_CONFIRM)
580 goto do_confirm;
581back_from_confirm:
582
583 if (!ipc.addr)
584 ipc.addr = fl4.daddr;
585
586 lock_sock(sk);
587
588 pfh.icmph.type = user_icmph.type; /* already checked */
589 pfh.icmph.code = user_icmph.code; /* ditto */
590 pfh.icmph.checksum = 0;
591 pfh.icmph.un.echo.id = inet->inet_sport;
592 pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence;
593 pfh.iov = msg->msg_iov;
594 pfh.wcheck = 0;
595
596 err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
597 0, &ipc, &rt, msg->msg_flags);
598 if (err)
599 ip_flush_pending_frames(sk);
600 else
601 err = ping_push_pending_frames(sk, &pfh, &fl4);
602 release_sock(sk);
603
604out:
605 ip_rt_put(rt);
606 if (free)
607 kfree(ipc.opt);
608 if (!err) {
609 icmp_out_count(sock_net(sk), user_icmph.type);
610 return len;
611 }
612 return err;
613
614do_confirm:
615 dst_confirm(&rt->dst);
616 if (!(msg->msg_flags & MSG_PROBE) || len)
617 goto back_from_confirm;
618 err = 0;
619 goto out;
620}
621
622static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
623 size_t len, int noblock, int flags, int *addr_len)
624{
625 struct inet_sock *isk = inet_sk(sk);
626 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
627 struct sk_buff *skb;
628 int copied, err;
629
630 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
631
632 if (flags & MSG_OOB)
633 goto out;
634
635 if (addr_len)
636 *addr_len = sizeof(*sin);
637
638 if (flags & MSG_ERRQUEUE)
639 return ip_recv_error(sk, msg, len);
640
641 skb = skb_recv_datagram(sk, flags, noblock, &err);
642 if (!skb)
643 goto out;
644
645 copied = skb->len;
646 if (copied > len) {
647 msg->msg_flags |= MSG_TRUNC;
648 copied = len;
649 }
650
651 /* Don't bother checking the checksum */
652 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
653 if (err)
654 goto done;
655
656 sock_recv_timestamp(msg, sk, skb);
657
658 /* Copy the address. */
659 if (sin) {
660 sin->sin_family = AF_INET;
661 sin->sin_port = 0 /* skb->h.uh->source */;
662 sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
663 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
664 }
665 if (isk->cmsg_flags)
666 ip_cmsg_recv(msg, skb);
667 err = copied;
668
669done:
670 skb_free_datagram(sk, skb);
671out:
672 pr_debug("ping_recvmsg -> %d\n", err);
673 return err;
674}
675
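/*
 * Charge an incoming echo reply to the owning socket's receive queue;
 * on failure the skb is dropped and ICMP_MIB_INERRORS is bumped.
 */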
676static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
677{
678 pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
679 inet_sk(sk), inet_sk(sk)->inet_num, skb);
680 if (sock_queue_rcv_skb(sk, skb) < 0) {
681 ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS);
682 kfree_skb(skb);
683 pr_debug("ping_queue_rcv_skb -> failed\n");
684 return -1;
685 }
686 return 0;
687}
688
689
690/*
691 * All we need to do is get the socket.
692 */
693
694void ping_rcv(struct sk_buff *skb)
695{
696 struct sock *sk;
697 struct net *net = dev_net(skb->dev);
698 struct iphdr *iph = ip_hdr(skb);
699 struct icmphdr *icmph = icmp_hdr(skb);
700 u32 saddr = iph->saddr;
701 u32 daddr = iph->daddr;
702
703 /* We assume the packet has already been checked by icmp_rcv */
704
705 pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
706 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
707
708 /* Push ICMP header back */
709 skb_push(skb, skb->data - (u8 *)icmph);
710
711 sk = ping_v4_lookup(net, saddr, daddr, ntohs(icmph->un.echo.id),
712 skb->dev->ifindex);
713 if (sk != NULL) {
714 pr_debug("rcv on socket %p\n", sk);
715 ping_queue_rcv_skb(sk, skb_get(skb));
716 sock_put(sk);
717 return;
718 }
719 pr_debug("no socket, dropping\n");
720
721 /* We're called from icmp_rcv(). kfree_skb() is done there. */
722}
723
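/*
 * Protocol hooks for ICMP "ping" sockets: connect/disconnect and the
 * sockopt handlers are shared with the generic IPv4 datagram and UDP
 * code, everything else is implemented in this file.
 */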
724struct proto ping_prot = {
725 .name = "PING",
726 .owner = THIS_MODULE,
727 .init = ping_init_sock,
728 .close = ping_close,
729 .connect = ip4_datagram_connect,
730 .disconnect = udp_disconnect,
731 .setsockopt = ip_setsockopt,
732 .getsockopt = ip_getsockopt,
733 .sendmsg = ping_sendmsg,
734 .recvmsg = ping_recvmsg,
735 .bind = ping_bind,
736 .backlog_rcv = ping_queue_rcv_skb,
737 .hash = ping_v4_hash,
738 .unhash = ping_v4_unhash,
739 .get_port = ping_v4_get_port,
740 .obj_size = sizeof(struct inet_sock),
741};
742EXPORT_SYMBOL(ping_prot);
743
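/*
 * Illustrative user-space sketch (not part of this file): one way the
 * ping_sendmsg()/ping_recvmsg() paths above can be exercised, assuming
 * the kernel lets the caller's group open unprivileged ICMP datagram
 * sockets (see the ping_group_range sysctl).  The kernel rewrites the
 * echo id with the socket's local "port" and computes the checksum, so
 * only the type, code and sequence set here matter.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/ip_icmp.h>
 *	#include <arpa/inet.h>
 *
 *	int main(void)
 *	{
 *		int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
 *		struct sockaddr_in dst = { .sin_family = AF_INET };
 *		struct icmphdr req = { .type = ICMP_ECHO };	/* code 0 */
 *		char buf[192];
 *
 *		if (fd < 0) {
 *			perror("socket");
 *			return 1;
 *		}
 *		inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
 *		req.un.echo.sequence = htons(1);
 *		if (sendto(fd, &req, sizeof(req), 0,
 *			   (struct sockaddr *)&dst, sizeof(dst)) < 0)
 *			perror("sendto");
 *		else if (recv(fd, buf, sizeof(buf), 0) < 0)
 *			perror("recv");
 *		close(fd);
 *		return 0;
 *	}
 */
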
744#ifdef CONFIG_PROC_FS
745
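/*
 * /proc/net/icmp seq_file helpers: iterate the ping hash table,
 * skipping sockets that belong to other network namespaces.
 */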
746static struct sock *ping_get_first(struct seq_file *seq, int start)
747{
748 struct sock *sk;
749 struct ping_iter_state *state = seq->private;
750 struct net *net = seq_file_net(seq);
751
752 for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
753 ++state->bucket) {
754 struct hlist_nulls_node *node;
755 struct hlist_nulls_head *hslot;
756
757 hslot = &ping_table.hash[state->bucket];
758
759 if (hlist_nulls_empty(hslot))
760 continue;
761
762 sk_nulls_for_each(sk, node, hslot) {
763 if (net_eq(sock_net(sk), net))
764 goto found;
765 }
766 }
767 sk = NULL;
768found:
769 return sk;
770}
771
772static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
773{
774 struct ping_iter_state *state = seq->private;
775 struct net *net = seq_file_net(seq);
776
777 do {
778 sk = sk_nulls_next(sk);
779 } while (sk && (!net_eq(sock_net(sk), net)));
780
781 if (!sk)
782 return ping_get_first(seq, state->bucket + 1);
783 return sk;
784}
785
786static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
787{
788 struct sock *sk = ping_get_first(seq, 0);
789
790 if (sk)
791 while (pos && (sk = ping_get_next(seq, sk)) != NULL)
792 --pos;
793 return pos ? NULL : sk;
794}
795
796static void *ping_seq_start(struct seq_file *seq, loff_t *pos)
797{
798 struct ping_iter_state *state = seq->private;
799 state->bucket = 0;
800
801 read_lock_bh(&ping_table.lock);
802
803 return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
804}
805
806static void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
807{
808 struct sock *sk;
809
810 if (v == SEQ_START_TOKEN)
811 sk = ping_get_idx(seq, 0);
812 else
813 sk = ping_get_next(seq, v);
814
815 ++*pos;
816 return sk;
817}
818
819static void ping_seq_stop(struct seq_file *seq, void *v)
820{
821 read_unlock_bh(&ping_table.lock);
822}
823
824static void ping_format_sock(struct sock *sp, struct seq_file *f,
825 int bucket, int *len)
826{
827 struct inet_sock *inet = inet_sk(sp);
828 __be32 dest = inet->inet_daddr;
829 __be32 src = inet->inet_rcv_saddr;
830 __u16 destp = ntohs(inet->inet_dport);
831 __u16 srcp = ntohs(inet->inet_sport);
832
833 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
834 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
835 bucket, src, srcp, dest, destp, sp->sk_state,
836 sk_wmem_alloc_get(sp),
837 sk_rmem_alloc_get(sp),
838 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
839 atomic_read(&sp->sk_refcnt), sp,
840 atomic_read(&sp->sk_drops), len);
841}
842
843static int ping_seq_show(struct seq_file *seq, void *v)
844{
845 if (v == SEQ_START_TOKEN)
846 seq_printf(seq, "%-127s\n",
847 " sl local_address rem_address st tx_queue "
848 "rx_queue tr tm->when retrnsmt uid timeout "
849 "inode ref pointer drops");
850 else {
851 struct ping_iter_state *state = seq->private;
852 int len;
853
854 ping_format_sock(v, seq, state->bucket, &len);
855 seq_printf(seq, "%*s\n", 127 - len, "");
856 }
857 return 0;
858}
859
860static const struct seq_operations ping_seq_ops = {
861 .show = ping_seq_show,
862 .start = ping_seq_start,
863 .next = ping_seq_next,
864 .stop = ping_seq_stop,
865};
866
867static int ping_seq_open(struct inode *inode, struct file *file)
868{
869 return seq_open_net(inode, file, &ping_seq_ops,
870 sizeof(struct ping_iter_state));
871}
872
873static const struct file_operations ping_seq_fops = {
874 .open = ping_seq_open,
875 .read = seq_read,
876 .llseek = seq_lseek,
877 .release = seq_release_net,
878};
879
880static int ping_proc_register(struct net *net)
881{
882 struct proc_dir_entry *p;
883 int rc = 0;
884
885 p = proc_net_fops_create(net, "icmp", S_IRUGO, &ping_seq_fops);
886 if (!p)
887 rc = -ENOMEM;
888 return rc;
889}
890
891static void ping_proc_unregister(struct net *net)
892{
893 proc_net_remove(net, "icmp");
894}
895
896
897static int __net_init ping_proc_init_net(struct net *net)
898{
899 return ping_proc_register(net);
900}
901
902static void __net_exit ping_proc_exit_net(struct net *net)
903{
904 ping_proc_unregister(net);
905}
906
907static struct pernet_operations ping_net_ops = {
908 .init = ping_proc_init_net,
909 .exit = ping_proc_exit_net,
910};
911
912int __init ping_proc_init(void)
913{
914 return register_pernet_subsys(&ping_net_ops);
915}
916
917void ping_proc_exit(void)
918{
919 unregister_pernet_subsys(&ping_net_ops);
920}
921
922#endif
923
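/*
 * Set up the ping socket hash table heads and the rwlock protecting them.
 */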
924void __init ping_init(void)
925{
926 int i;
927
928 for (i = 0; i < PING_HTABLE_SIZE; i++)
929 INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
930 rwlock_init(&ping_table.lock);
931}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4ae1f203f7cb..b14ec7d03b6e 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
59 local_bh_enable(); 59 local_bh_enable();
60 60
61 socket_seq_show(seq); 61 socket_seq_show(seq);
62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", 62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
63 sock_prot_inuse_get(net, &tcp_prot), orphans, 63 sock_prot_inuse_get(net, &tcp_prot), orphans,
64 tcp_death_row.tw_count, sockets, 64 tcp_death_row.tw_count, sockets,
65 atomic_read(&tcp_memory_allocated)); 65 atomic_long_read(&tcp_memory_allocated));
66 seq_printf(seq, "UDP: inuse %d mem %d\n", 66 seq_printf(seq, "UDP: inuse %d mem %ld\n",
67 sock_prot_inuse_get(net, &udp_prot), 67 sock_prot_inuse_get(net, &udp_prot),
68 atomic_read(&udp_memory_allocated)); 68 atomic_long_read(&udp_memory_allocated));
69 seq_printf(seq, "UDPLITE: inuse %d\n", 69 seq_printf(seq, "UDPLITE: inuse %d\n",
70 sock_prot_inuse_get(net, &udplite_prot)); 70 sock_prot_inuse_get(net, &udplite_prot));
71 seq_printf(seq, "RAW: inuse %d\n", 71 seq_printf(seq, "RAW: inuse %d\n",
@@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = {
253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), 253 SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), 254 SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), 255 SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
256 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
256 SNMP_MIB_SENTINEL 257 SNMP_MIB_SENTINEL
257}; 258};
258 259
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index f2d297351405..9ae5c01cd0b2 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29#include <net/protocol.h> 29#include <net/protocol.h>
30 30
31const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; 31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
32static DEFINE_SPINLOCK(inet_proto_lock);
33 32
34/* 33/*
35 * Add a protocol handler to the hash tables 34 * Add a protocol handler to the hash tables
@@ -37,20 +36,10 @@ static DEFINE_SPINLOCK(inet_proto_lock);
37 36
38int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
39{ 38{
40 int hash, ret; 39 int hash = protocol & (MAX_INET_PROTOS - 1);
41 40
42 hash = protocol & (MAX_INET_PROTOS - 1); 41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
43 42 NULL, prot) ? 0 : -1;
44 spin_lock_bh(&inet_proto_lock);
45 if (inet_protos[hash]) {
46 ret = -1;
47 } else {
48 inet_protos[hash] = prot;
49 ret = 0;
50 }
51 spin_unlock_bh(&inet_proto_lock);
52
53 return ret;
54} 43}
55EXPORT_SYMBOL(inet_add_protocol); 44EXPORT_SYMBOL(inet_add_protocol);
56 45
@@ -60,18 +49,10 @@ EXPORT_SYMBOL(inet_add_protocol);
60 49
61int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
62{ 51{
63 int hash, ret; 52 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
64
65 hash = protocol & (MAX_INET_PROTOS - 1);
66 53
67 spin_lock_bh(&inet_proto_lock); 54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
68 if (inet_protos[hash] == prot) { 55 prot, NULL) == prot) ? 0 : -1;
69 inet_protos[hash] = NULL;
70 ret = 0;
71 } else {
72 ret = -1;
73 }
74 spin_unlock_bh(&inet_proto_lock);
75 56
76 synchronize_net(); 57 synchronize_net();
77 58
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 009a7b2aa1ef..c9893d43242e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -76,6 +76,7 @@
76#include <linux/seq_file.h> 76#include <linux/seq_file.h>
77#include <linux/netfilter.h> 77#include <linux/netfilter.h>
78#include <linux/netfilter_ipv4.h> 78#include <linux/netfilter_ipv4.h>
79#include <linux/compat.h>
79 80
80static struct raw_hashinfo raw_v4_hashinfo = { 81static struct raw_hashinfo raw_v4_hashinfo = {
81 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), 82 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -153,7 +154,7 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
153 * RFC 1122: SHOULD pass TOS value up to the transport layer. 154 * RFC 1122: SHOULD pass TOS value up to the transport layer.
154 * -> It does. And not only TOS, but all IP header. 155 * -> It does. And not only TOS, but all IP header.
155 */ 156 */
156static int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) 157static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
157{ 158{
158 struct sock *sk; 159 struct sock *sk;
159 struct hlist_head *head; 160 struct hlist_head *head;
@@ -246,7 +247,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
246 } 247 }
247 248
248 if (inet->recverr) { 249 if (inet->recverr) {
249 struct iphdr *iph = (struct iphdr *)skb->data; 250 const struct iphdr *iph = (const struct iphdr *)skb->data;
250 u8 *payload = skb->data + (iph->ihl << 2); 251 u8 *payload = skb->data + (iph->ihl << 2);
251 252
252 if (inet->hdrincl) 253 if (inet->hdrincl)
@@ -264,7 +265,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
264{ 265{
265 int hash; 266 int hash;
266 struct sock *raw_sk; 267 struct sock *raw_sk;
267 struct iphdr *iph; 268 const struct iphdr *iph;
268 struct net *net; 269 struct net *net;
269 270
270 hash = protocol & (RAW_HTABLE_SIZE - 1); 271 hash = protocol & (RAW_HTABLE_SIZE - 1);
@@ -272,7 +273,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
272 read_lock(&raw_v4_hashinfo.lock); 273 read_lock(&raw_v4_hashinfo.lock);
273 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); 274 raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
274 if (raw_sk != NULL) { 275 if (raw_sk != NULL) {
275 iph = (struct iphdr *)skb->data; 276 iph = (const struct iphdr *)skb->data;
276 net = dev_net(skb->dev); 277 net = dev_net(skb->dev);
277 278
278 while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, 279 while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
@@ -280,7 +281,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
280 skb->dev->ifindex)) != NULL) { 281 skb->dev->ifindex)) != NULL) {
281 raw_err(raw_sk, skb, info); 282 raw_err(raw_sk, skb, info);
282 raw_sk = sk_next(raw_sk); 283 raw_sk = sk_next(raw_sk);
283 iph = (struct iphdr *)skb->data; 284 iph = (const struct iphdr *)skb->data;
284 } 285 }
285 } 286 }
286 read_unlock(&raw_v4_hashinfo.lock); 287 read_unlock(&raw_v4_hashinfo.lock);
@@ -313,9 +314,10 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
313 return 0; 314 return 0;
314} 315}
315 316
316static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, 317static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
317 struct rtable **rtp, 318 void *from, size_t length,
318 unsigned int flags) 319 struct rtable **rtp,
320 unsigned int flags)
319{ 321{
320 struct inet_sock *inet = inet_sk(sk); 322 struct inet_sock *inet = inet_sk(sk);
321 struct net *net = sock_net(sk); 323 struct net *net = sock_net(sk);
@@ -326,7 +328,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
326 struct rtable *rt = *rtp; 328 struct rtable *rt = *rtp;
327 329
328 if (length > rt->dst.dev->mtu) { 330 if (length > rt->dst.dev->mtu) {
329 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 331 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
330 rt->dst.dev->mtu); 332 rt->dst.dev->mtu);
331 return -EMSGSIZE; 333 return -EMSGSIZE;
332 } 334 }
@@ -371,7 +373,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
371 373
372 if (iphlen >= sizeof(*iph)) { 374 if (iphlen >= sizeof(*iph)) {
373 if (!iph->saddr) 375 if (!iph->saddr)
374 iph->saddr = rt->rt_src; 376 iph->saddr = fl4->saddr;
375 iph->check = 0; 377 iph->check = 0;
376 iph->tot_len = htons(length); 378 iph->tot_len = htons(length);
377 if (!iph->id) 379 if (!iph->id)
@@ -401,7 +403,7 @@ error:
401 return err; 403 return err;
402} 404}
403 405
404static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) 406static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
405{ 407{
406 struct iovec *iov; 408 struct iovec *iov;
407 u8 __user *type = NULL; 409 u8 __user *type = NULL;
@@ -417,7 +419,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
417 if (!iov) 419 if (!iov)
418 continue; 420 continue;
419 421
420 switch (fl->proto) { 422 switch (fl4->flowi4_proto) {
421 case IPPROTO_ICMP: 423 case IPPROTO_ICMP:
422 /* check if one-byte field is readable or not. */ 424 /* check if one-byte field is readable or not. */
423 if (iov->iov_base && iov->iov_len < 1) 425 if (iov->iov_base && iov->iov_len < 1)
@@ -432,8 +434,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
432 code = iov->iov_base; 434 code = iov->iov_base;
433 435
434 if (type && code) { 436 if (type && code) {
435 if (get_user(fl->fl_icmp_type, type) || 437 if (get_user(fl4->fl4_icmp_type, type) ||
436 get_user(fl->fl_icmp_code, code)) 438 get_user(fl4->fl4_icmp_code, code))
437 return -EFAULT; 439 return -EFAULT;
438 probed = 1; 440 probed = 1;
439 } 441 }
@@ -454,11 +456,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
454 struct inet_sock *inet = inet_sk(sk); 456 struct inet_sock *inet = inet_sk(sk);
455 struct ipcm_cookie ipc; 457 struct ipcm_cookie ipc;
456 struct rtable *rt = NULL; 458 struct rtable *rt = NULL;
459 struct flowi4 fl4;
457 int free = 0; 460 int free = 0;
458 __be32 daddr; 461 __be32 daddr;
459 __be32 saddr; 462 __be32 saddr;
460 u8 tos; 463 u8 tos;
461 int err; 464 int err;
465 struct ip_options_data opt_copy;
462 466
463 err = -EMSGSIZE; 467 err = -EMSGSIZE;
464 if (len > 0xFFFF) 468 if (len > 0xFFFF)
@@ -505,7 +509,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
505 509
506 ipc.addr = inet->inet_saddr; 510 ipc.addr = inet->inet_saddr;
507 ipc.opt = NULL; 511 ipc.opt = NULL;
508 ipc.shtx.flags = 0; 512 ipc.tx_flags = 0;
509 ipc.oif = sk->sk_bound_dev_if; 513 ipc.oif = sk->sk_bound_dev_if;
510 514
511 if (msg->msg_controllen) { 515 if (msg->msg_controllen) {
@@ -519,8 +523,18 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
519 saddr = ipc.addr; 523 saddr = ipc.addr;
520 ipc.addr = daddr; 524 ipc.addr = daddr;
521 525
522 if (!ipc.opt) 526 if (!ipc.opt) {
523 ipc.opt = inet->opt; 527 struct ip_options_rcu *inet_opt;
528
529 rcu_read_lock();
530 inet_opt = rcu_dereference(inet->inet_opt);
531 if (inet_opt) {
532 memcpy(&opt_copy, inet_opt,
533 sizeof(*inet_opt) + inet_opt->opt.optlen);
534 ipc.opt = &opt_copy.opt;
535 }
536 rcu_read_unlock();
537 }
524 538
525 if (ipc.opt) { 539 if (ipc.opt) {
526 err = -EINVAL; 540 err = -EINVAL;
@@ -529,10 +543,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
529 */ 543 */
530 if (inet->hdrincl) 544 if (inet->hdrincl)
531 goto done; 545 goto done;
532 if (ipc.opt->srr) { 546 if (ipc.opt->opt.srr) {
533 if (!daddr) 547 if (!daddr)
534 goto done; 548 goto done;
535 daddr = ipc.opt->faddr; 549 daddr = ipc.opt->opt.faddr;
536 } 550 }
537 } 551 }
538 tos = RT_CONN_FLAGS(sk); 552 tos = RT_CONN_FLAGS(sk);
@@ -546,27 +560,24 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
546 saddr = inet->mc_addr; 560 saddr = inet->mc_addr;
547 } 561 }
548 562
549 { 563 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
550 struct flowi fl = { .oif = ipc.oif, 564 RT_SCOPE_UNIVERSE,
551 .mark = sk->sk_mark, 565 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
552 .nl_u = { .ip4_u = 566 FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0);
553 { .daddr = daddr,
554 .saddr = saddr,
555 .tos = tos } },
556 .proto = inet->hdrincl ? IPPROTO_RAW :
557 sk->sk_protocol,
558 };
559 if (!inet->hdrincl) {
560 err = raw_probe_proto_opt(&fl, msg);
561 if (err)
562 goto done;
563 }
564 567
565 security_sk_classify_flow(sk, &fl); 568 if (!inet->hdrincl) {
566 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); 569 err = raw_probe_proto_opt(&fl4, msg);
570 if (err)
571 goto done;
567 } 572 }
568 if (err) 573
574 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
575 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
576 if (IS_ERR(rt)) {
577 err = PTR_ERR(rt);
578 rt = NULL;
569 goto done; 579 goto done;
580 }
570 581
571 err = -EACCES; 582 err = -EACCES;
572 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) 583 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
@@ -577,19 +588,20 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
577back_from_confirm: 588back_from_confirm:
578 589
579 if (inet->hdrincl) 590 if (inet->hdrincl)
580 err = raw_send_hdrinc(sk, msg->msg_iov, len, 591 err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len,
581 &rt, msg->msg_flags); 592 &rt, msg->msg_flags);
582 593
583 else { 594 else {
584 if (!ipc.addr) 595 if (!ipc.addr)
585 ipc.addr = rt->rt_dst; 596 ipc.addr = fl4.daddr;
586 lock_sock(sk); 597 lock_sock(sk);
587 err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, 598 err = ip_append_data(sk, &fl4, ip_generic_getfrag,
588 &ipc, &rt, msg->msg_flags); 599 msg->msg_iov, len, 0,
600 &ipc, &rt, msg->msg_flags);
589 if (err) 601 if (err)
590 ip_flush_pending_frames(sk); 602 ip_flush_pending_frames(sk);
591 else if (!(msg->msg_flags & MSG_MORE)) { 603 else if (!(msg->msg_flags & MSG_MORE)) {
592 err = ip_push_pending_frames(sk); 604 err = ip_push_pending_frames(sk, &fl4);
593 if (err == -ENOBUFS && !inet->recverr) 605 if (err == -ENOBUFS && !inet->recverr)
594 err = 0; 606 err = 0;
595 } 607 }
@@ -616,7 +628,7 @@ do_confirm:
616static void raw_close(struct sock *sk, long timeout) 628static void raw_close(struct sock *sk, long timeout)
617{ 629{
618 /* 630 /*
619 * Raw sockets may have direct kernel refereneces. Kill them. 631 * Raw sockets may have direct kernel references. Kill them.
620 */ 632 */
621 ip_ra_control(sk, 0, NULL); 633 ip_ra_control(sk, 0, NULL);
622 634
@@ -839,6 +851,23 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
839 } 851 }
840} 852}
841 853
854#ifdef CONFIG_COMPAT
855static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
856{
857 switch (cmd) {
858 case SIOCOUTQ:
859 case SIOCINQ:
860 return -ENOIOCTLCMD;
861 default:
862#ifdef CONFIG_IP_MROUTE
863 return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
864#else
865 return -ENOIOCTLCMD;
866#endif
867 }
868}
869#endif
870
842struct proto raw_prot = { 871struct proto raw_prot = {
843 .name = "RAW", 872 .name = "RAW",
844 .owner = THIS_MODULE, 873 .owner = THIS_MODULE,
@@ -861,6 +890,7 @@ struct proto raw_prot = {
861#ifdef CONFIG_COMPAT 890#ifdef CONFIG_COMPAT
862 .compat_setsockopt = compat_raw_setsockopt, 891 .compat_setsockopt = compat_raw_setsockopt,
863 .compat_getsockopt = compat_raw_getsockopt, 892 .compat_getsockopt = compat_raw_getsockopt,
893 .compat_ioctl = compat_raw_ioctl,
864#endif 894#endif
865}; 895};
866 896
@@ -949,7 +979,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
949 srcp = inet->inet_num; 979 srcp = inet->inet_num;
950 980
951 seq_printf(seq, "%4d: %08X:%04X %08X:%04X" 981 seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
952 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", 982 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n",
953 i, src, srcp, dest, destp, sp->sk_state, 983 i, src, srcp, dest, destp, sp->sk_state,
954 sk_wmem_alloc_get(sp), 984 sk_wmem_alloc_get(sp),
955 sk_rmem_alloc_get(sp), 985 sk_rmem_alloc_get(sp),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ac6559cb54f9..aa13ef105110 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,8 +109,8 @@
109#include <linux/sysctl.h> 109#include <linux/sysctl.h>
110#endif 110#endif
111 111
112#define RT_FL_TOS(oldflp) \ 112#define RT_FL_TOS(oldflp4) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) 113 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 114
115#define IP_MAX_MTU 0xFFF0 115#define IP_MAX_MTU 0xFFF0
116 116
@@ -131,42 +131,80 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/* 134/*
138 * Interface to generic destination cache. 135 * Interface to generic destination cache.
139 */ 136 */
140 137
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 138static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
139static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
140static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
142static void ipv4_dst_destroy(struct dst_entry *dst); 141static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb); 143static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 144static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148static int rt_garbage_collect(struct dst_ops *ops); 145static int rt_garbage_collect(struct dst_ops *ops);
149 146
147static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
148 int how)
149{
150}
151
152static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153{
154 struct rtable *rt = (struct rtable *) dst;
155 struct inet_peer *peer;
156 u32 *p = NULL;
157
158 if (!rt->peer)
159 rt_bind_peer(rt, rt->rt_dst, 1);
160
161 peer = rt->peer;
162 if (peer) {
163 u32 *old_p = __DST_METRICS_PTR(old);
164 unsigned long prev, new;
165
166 p = peer->metrics;
167 if (inet_metrics_new(peer))
168 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170 new = (unsigned long) p;
171 prev = cmpxchg(&dst->_metrics, old, new);
172
173 if (prev != old) {
174 p = __DST_METRICS_PTR(prev);
175 if (prev & DST_METRICS_READ_ONLY)
176 p = NULL;
177 } else {
178 if (rt->fi) {
179 fib_info_put(rt->fi);
180 rt->fi = NULL;
181 }
182 }
183 }
184 return p;
185}
150 186
151static struct dst_ops ipv4_dst_ops = { 187static struct dst_ops ipv4_dst_ops = {
152 .family = AF_INET, 188 .family = AF_INET,
153 .protocol = cpu_to_be16(ETH_P_IP), 189 .protocol = cpu_to_be16(ETH_P_IP),
154 .gc = rt_garbage_collect, 190 .gc = rt_garbage_collect,
155 .check = ipv4_dst_check, 191 .check = ipv4_dst_check,
192 .default_advmss = ipv4_default_advmss,
193 .default_mtu = ipv4_default_mtu,
194 .cow_metrics = ipv4_cow_metrics,
156 .destroy = ipv4_dst_destroy, 195 .destroy = ipv4_dst_destroy,
157 .ifdown = ipv4_dst_ifdown, 196 .ifdown = ipv4_dst_ifdown,
158 .negative_advice = ipv4_negative_advice, 197 .negative_advice = ipv4_negative_advice,
159 .link_failure = ipv4_link_failure, 198 .link_failure = ipv4_link_failure,
160 .update_pmtu = ip_rt_update_pmtu, 199 .update_pmtu = ip_rt_update_pmtu,
161 .local_out = __ip_local_out, 200 .local_out = __ip_local_out,
162 .entries = ATOMIC_INIT(0),
163}; 201};
164 202
165#define ECN_OR_COST(class) TC_PRIO_##class 203#define ECN_OR_COST(class) TC_PRIO_##class
166 204
167const __u8 ip_tos2prio[16] = { 205const __u8 ip_tos2prio[16] = {
168 TC_PRIO_BESTEFFORT, 206 TC_PRIO_BESTEFFORT,
169 ECN_OR_COST(FILLER), 207 ECN_OR_COST(BESTEFFORT),
170 TC_PRIO_BESTEFFORT, 208 TC_PRIO_BESTEFFORT,
171 ECN_OR_COST(BESTEFFORT), 209 ECN_OR_COST(BESTEFFORT),
172 TC_PRIO_BULK, 210 TC_PRIO_BULK,
@@ -199,7 +237,7 @@ const __u8 ip_tos2prio[16] = {
199 */ 237 */
200 238
201struct rt_hash_bucket { 239struct rt_hash_bucket {
202 struct rtable *chain; 240 struct rtable __rcu *chain;
203}; 241};
204 242
205#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ 243#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -281,7 +319,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
281 struct rtable *r = NULL; 319 struct rtable *r = NULL;
282 320
283 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 321 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
284 if (!rt_hash_table[st->bucket].chain) 322 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
285 continue; 323 continue;
286 rcu_read_lock_bh(); 324 rcu_read_lock_bh();
287 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 325 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
@@ -301,17 +339,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
301{ 339{
302 struct rt_cache_iter_state *st = seq->private; 340 struct rt_cache_iter_state *st = seq->private;
303 341
304 r = r->dst.rt_next; 342 r = rcu_dereference_bh(r->dst.rt_next);
305 while (!r) { 343 while (!r) {
306 rcu_read_unlock_bh(); 344 rcu_read_unlock_bh();
307 do { 345 do {
308 if (--st->bucket < 0) 346 if (--st->bucket < 0)
309 return NULL; 347 return NULL;
310 } while (!rt_hash_table[st->bucket].chain); 348 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
311 rcu_read_lock_bh(); 349 rcu_read_lock_bh();
312 r = rt_hash_table[st->bucket].chain; 350 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
313 } 351 }
314 return rcu_dereference_bh(r); 352 return r;
315} 353}
316 354
317static struct rtable *rt_cache_get_next(struct seq_file *seq, 355static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -382,12 +420,11 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
382 (__force u32)r->rt_gateway, 420 (__force u32)r->rt_gateway,
383 r->rt_flags, atomic_read(&r->dst.__refcnt), 421 r->rt_flags, atomic_read(&r->dst.__refcnt),
384 r->dst.__use, 0, (__force u32)r->rt_src, 422 r->dst.__use, 0, (__force u32)r->rt_src,
385 (dst_metric(&r->dst, RTAX_ADVMSS) ? 423 dst_metric_advmss(&r->dst) + 40,
386 (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
387 dst_metric(&r->dst, RTAX_WINDOW), 424 dst_metric(&r->dst, RTAX_WINDOW),
388 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 425 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
389 dst_metric(&r->dst, RTAX_RTTVAR)), 426 dst_metric(&r->dst, RTAX_RTTVAR)),
390 r->fl.fl4_tos, 427 r->rt_key_tos,
391 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, 428 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
392 r->dst.hh ? (r->dst.hh->hh_output == 429 r->dst.hh ? (r->dst.hh->hh_output ==
393 dev_queue_xmit) : 0, 430 dev_queue_xmit) : 0,
@@ -466,7 +503,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
466 503
467 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 504 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
468 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 505 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469 atomic_read(&ipv4_dst_ops.entries), 506 dst_entries_get_slow(&ipv4_dst_ops),
470 st->in_hit, 507 st->in_hit,
471 st->in_slow_tot, 508 st->in_slow_tot,
472 st->in_slow_mc, 509 st->in_slow_mc,
@@ -510,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
510 .release = seq_release, 547 .release = seq_release,
511}; 548};
512 549
513#ifdef CONFIG_NET_CLS_ROUTE 550#ifdef CONFIG_IP_ROUTE_CLASSID
514static int rt_acct_proc_show(struct seq_file *m, void *v) 551static int rt_acct_proc_show(struct seq_file *m, void *v)
515{ 552{
516 struct ip_rt_acct *dst, *src; 553 struct ip_rt_acct *dst, *src;
@@ -563,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
563 if (!pde) 600 if (!pde)
564 goto err2; 601 goto err2;
565 602
566#ifdef CONFIG_NET_CLS_ROUTE 603#ifdef CONFIG_IP_ROUTE_CLASSID
567 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 604 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
568 if (!pde) 605 if (!pde)
569 goto err3; 606 goto err3;
570#endif 607#endif
571 return 0; 608 return 0;
572 609
573#ifdef CONFIG_NET_CLS_ROUTE 610#ifdef CONFIG_IP_ROUTE_CLASSID
574err3: 611err3:
575 remove_proc_entry("rt_cache", net->proc_net_stat); 612 remove_proc_entry("rt_cache", net->proc_net_stat);
576#endif 613#endif
@@ -584,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
584{ 621{
585 remove_proc_entry("rt_cache", net->proc_net_stat); 622 remove_proc_entry("rt_cache", net->proc_net_stat);
586 remove_proc_entry("rt_cache", net->proc_net); 623 remove_proc_entry("rt_cache", net->proc_net);
587#ifdef CONFIG_NET_CLS_ROUTE 624#ifdef CONFIG_IP_ROUTE_CLASSID
588 remove_proc_entry("rt_acct", net->proc_net); 625 remove_proc_entry("rt_acct", net->proc_net);
589#endif 626#endif
590} 627}
@@ -622,13 +659,13 @@ static inline int rt_fast_clean(struct rtable *rth)
622 	/* Kill broadcast/multicast entries very aggressively, if they 659 	/* Kill broadcast/multicast entries very aggressively, if they
623 collide in hash table with more useful entries */ 660 collide in hash table with more useful entries */
624 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 661 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625 rth->fl.iif && rth->dst.rt_next; 662 rt_is_input_route(rth) && rth->dst.rt_next;
626} 663}
627 664
628static inline int rt_valuable(struct rtable *rth) 665static inline int rt_valuable(struct rtable *rth)
629{ 666{
630 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 667 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 rth->dst.expires; 668 (rth->peer && rth->peer->pmtu_expires);
632} 669}
633 670
634static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 671static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -639,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
639 if (atomic_read(&rth->dst.__refcnt)) 676 if (atomic_read(&rth->dst.__refcnt))
640 goto out; 677 goto out;
641 678
642 ret = 1;
643 if (rth->dst.expires &&
644 time_after_eq(jiffies, rth->dst.expires))
645 goto out;
646
647 age = jiffies - rth->dst.lastuse; 679 age = jiffies - rth->dst.lastuse;
648 ret = 0;
649 if ((age <= tmo1 && !rt_fast_clean(rth)) || 680 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 (age <= tmo2 && rt_valuable(rth))) 681 (age <= tmo2 && rt_valuable(rth)))
651 goto out; 682 goto out;
@@ -667,7 +698,7 @@ static inline u32 rt_score(struct rtable *rt)
667 if (rt_valuable(rt)) 698 if (rt_valuable(rt))
668 score |= (1<<31); 699 score |= (1<<31);
669 700
670 if (!rt->fl.iif || 701 if (rt_is_output_route(rt) ||
671 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) 702 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 score |= (1<<30); 703 score |= (1<<30);
673 704
@@ -680,22 +711,22 @@ static inline bool rt_caching(const struct net *net)
680 net->ipv4.sysctl_rt_cache_rebuild_count; 711 net->ipv4.sysctl_rt_cache_rebuild_count;
681} 712}
682 713
683static inline bool compare_hash_inputs(const struct flowi *fl1, 714static inline bool compare_hash_inputs(const struct rtable *rt1,
684 const struct flowi *fl2) 715 const struct rtable *rt2)
685{ 716{
686 return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 717 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
687 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 718 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
688 (fl1->iif ^ fl2->iif)) == 0); 719 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
689} 720}
690 721
691static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 722static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
692{ 723{
693 return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) | 724 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
694 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) | 725 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
695 (fl1->mark ^ fl2->mark) | 726 (rt1->rt_mark ^ rt2->rt_mark) |
696 (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) | 727 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
697 (fl1->oif ^ fl2->oif) | 728 (rt1->rt_oif ^ rt2->rt_oif) |
698 (fl1->iif ^ fl2->iif)) == 0; 729 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
699} 730}
700 731
701static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 732static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
@@ -713,55 +744,48 @@ static inline int rt_is_expired(struct rtable *rth)
713 * Can be called by a softirq or a process. 744 * Can be called by a softirq or a process.
714 	 * In the latter case, we want to be rescheduled if necessary 745 	 * In the latter case, we want to be rescheduled if necessary
715 */ 746 */
716static void rt_do_flush(int process_context) 747static void rt_do_flush(struct net *net, int process_context)
717{ 748{
718 unsigned int i; 749 unsigned int i;
719 struct rtable *rth, *next; 750 struct rtable *rth, *next;
720 struct rtable * tail;
721 751
722 for (i = 0; i <= rt_hash_mask; i++) { 752 for (i = 0; i <= rt_hash_mask; i++) {
753 struct rtable __rcu **pprev;
754 struct rtable *list;
755
723 if (process_context && need_resched()) 756 if (process_context && need_resched())
724 cond_resched(); 757 cond_resched();
725 rth = rt_hash_table[i].chain; 758 rth = rcu_dereference_raw(rt_hash_table[i].chain);
726 if (!rth) 759 if (!rth)
727 continue; 760 continue;
728 761
729 spin_lock_bh(rt_hash_lock_addr(i)); 762 spin_lock_bh(rt_hash_lock_addr(i));
730#ifdef CONFIG_NET_NS
731 {
732 struct rtable ** prev, * p;
733 763
734 rth = rt_hash_table[i].chain; 764 list = NULL;
765 pprev = &rt_hash_table[i].chain;
766 rth = rcu_dereference_protected(*pprev,
767 lockdep_is_held(rt_hash_lock_addr(i)));
735 768
736 /* defer releasing the head of the list after spin_unlock */ 769 while (rth) {
737 for (tail = rth; tail; tail = tail->dst.rt_next) 770 next = rcu_dereference_protected(rth->dst.rt_next,
738 if (!rt_is_expired(tail)) 771 lockdep_is_held(rt_hash_lock_addr(i)));
739 break; 772
740 if (rth != tail) 773 if (!net ||
741 rt_hash_table[i].chain = tail; 774 net_eq(dev_net(rth->dst.dev), net)) {
742 775 rcu_assign_pointer(*pprev, next);
743 /* call rt_free on entries after the tail requiring flush */ 776 rcu_assign_pointer(rth->dst.rt_next, list);
744 prev = &rt_hash_table[i].chain; 777 list = rth;
745 for (p = *prev; p; p = next) {
746 next = p->dst.rt_next;
747 if (!rt_is_expired(p)) {
748 prev = &p->dst.rt_next;
749 } else { 778 } else {
750 *prev = next; 779 pprev = &rth->dst.rt_next;
751 rt_free(p);
752 } 780 }
781 rth = next;
753 } 782 }
754 } 783
755#else
756 rth = rt_hash_table[i].chain;
757 rt_hash_table[i].chain = NULL;
758 tail = NULL;
759#endif
760 spin_unlock_bh(rt_hash_lock_addr(i)); 784 spin_unlock_bh(rt_hash_lock_addr(i));
761 785
762 for (; rth != tail; rth = next) { 786 for (; list; list = next) {
763 next = rth->dst.rt_next; 787 next = rcu_dereference_protected(list->dst.rt_next, 1);
764 rt_free(rth); 788 rt_free(list);
765 } 789 }
766 } 790 }
767} 791}
@@ -789,104 +813,15 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
789 const struct rtable *aux = head; 813 const struct rtable *aux = head;
790 814
791 while (aux != rth) { 815 while (aux != rth) {
792 if (compare_hash_inputs(&aux->fl, &rth->fl)) 816 if (compare_hash_inputs(aux, rth))
793 return 0; 817 return 0;
794 aux = aux->dst.rt_next; 818 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
795 } 819 }
796 return ONE; 820 return ONE;
797} 821}
798 822
799static void rt_check_expire(void)
800{
801 static unsigned int rover;
802 unsigned int i = rover, goal;
803 struct rtable *rth, **rthp;
804 unsigned long samples = 0;
805 unsigned long sum = 0, sum2 = 0;
806 unsigned long delta;
807 u64 mult;
808
809 delta = jiffies - expires_ljiffies;
810 expires_ljiffies = jiffies;
811 mult = ((u64)delta) << rt_hash_log;
812 if (ip_rt_gc_timeout > 1)
813 do_div(mult, ip_rt_gc_timeout);
814 goal = (unsigned int)mult;
815 if (goal > rt_hash_mask)
816 goal = rt_hash_mask + 1;
817 for (; goal > 0; goal--) {
818 unsigned long tmo = ip_rt_gc_timeout;
819 unsigned long length;
820
821 i = (i + 1) & rt_hash_mask;
822 rthp = &rt_hash_table[i].chain;
823
824 if (need_resched())
825 cond_resched();
826
827 samples++;
828
829 if (*rthp == NULL)
830 continue;
831 length = 0;
832 spin_lock_bh(rt_hash_lock_addr(i));
833 while ((rth = *rthp) != NULL) {
834 prefetch(rth->dst.rt_next);
835 if (rt_is_expired(rth)) {
836 *rthp = rth->dst.rt_next;
837 rt_free(rth);
838 continue;
839 }
840 if (rth->dst.expires) {
841 /* Entry is expired even if it is in use */
842 if (time_before_eq(jiffies, rth->dst.expires)) {
843nofree:
844 tmo >>= 1;
845 rthp = &rth->dst.rt_next;
846 /*
847 * We only count entries on
848 * a chain with equal hash inputs once
849 * so that entries for different QOS
850 * levels, and other non-hash input
851 * attributes don't unfairly skew
852 * the length computation
853 */
854 length += has_noalias(rt_hash_table[i].chain, rth);
855 continue;
856 }
857 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
858 goto nofree;
859
860 /* Cleanup aged off entries. */
861 *rthp = rth->dst.rt_next;
862 rt_free(rth);
863 }
864 spin_unlock_bh(rt_hash_lock_addr(i));
865 sum += length;
866 sum2 += length*length;
867 }
868 if (samples) {
869 unsigned long avg = sum / samples;
870 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
871 rt_chain_length_max = max_t(unsigned long,
872 ip_rt_gc_elasticity,
873 (avg + 4*sd) >> FRACT_BITS);
874 }
875 rover = i;
876}
877
878/*
879 * rt_worker_func() is run in process context.
880 * we call rt_check_expire() to scan part of the hash table
881 */
882static void rt_worker_func(struct work_struct *work)
883{
884 rt_check_expire();
885 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
886}
887
888/* 823/*
889 * Pertubation of rt_genid by a small quantity [1..256] 824 * Perturbation of rt_genid by a small quantity [1..256]
890 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() 825 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
891 * many times (2^24) without giving recent rt_genid. 826 * many times (2^24) without giving recent rt_genid.
892  * Jenkins hash is strong enough that little changes of rt_genid are OK. 827  * Jenkins hash is strong enough that little changes of rt_genid are OK.
@@ -907,13 +842,13 @@ void rt_cache_flush(struct net *net, int delay)
907{ 842{
908 rt_cache_invalidate(net); 843 rt_cache_invalidate(net);
909 if (delay >= 0) 844 if (delay >= 0)
910 rt_do_flush(!in_softirq()); 845 rt_do_flush(net, !in_softirq());
911} 846}
912 847
913/* Flush previous cache invalidated entries from the cache */ 848/* Flush previous cache invalidated entries from the cache */
914void rt_cache_flush_batch(void) 849void rt_cache_flush_batch(struct net *net)
915{ 850{
916 rt_do_flush(!in_softirq()); 851 rt_do_flush(net, !in_softirq());
917} 852}
918 853
919static void rt_emergency_hash_rebuild(struct net *net) 854static void rt_emergency_hash_rebuild(struct net *net)
@@ -942,9 +877,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
942 static unsigned long last_gc; 877 static unsigned long last_gc;
943 static int rover; 878 static int rover;
944 static int equilibrium; 879 static int equilibrium;
945 struct rtable *rth, **rthp; 880 struct rtable *rth;
881 struct rtable __rcu **rthp;
946 unsigned long now = jiffies; 882 unsigned long now = jiffies;
947 int goal; 883 int goal;
884 int entries = dst_entries_get_fast(&ipv4_dst_ops);
948 885
949 /* 886 /*
950 * Garbage collection is pretty expensive, 887 * Garbage collection is pretty expensive,
@@ -954,28 +891,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
954 RT_CACHE_STAT_INC(gc_total); 891 RT_CACHE_STAT_INC(gc_total);
955 892
956 if (now - last_gc < ip_rt_gc_min_interval && 893 if (now - last_gc < ip_rt_gc_min_interval &&
957 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { 894 entries < ip_rt_max_size) {
958 RT_CACHE_STAT_INC(gc_ignored); 895 RT_CACHE_STAT_INC(gc_ignored);
959 goto out; 896 goto out;
960 } 897 }
961 898
899 entries = dst_entries_get_slow(&ipv4_dst_ops);
962 /* Calculate number of entries, which we want to expire now. */ 900 /* Calculate number of entries, which we want to expire now. */
963 goal = atomic_read(&ipv4_dst_ops.entries) - 901 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
964 (ip_rt_gc_elasticity << rt_hash_log);
965 if (goal <= 0) { 902 if (goal <= 0) {
966 if (equilibrium < ipv4_dst_ops.gc_thresh) 903 if (equilibrium < ipv4_dst_ops.gc_thresh)
967 equilibrium = ipv4_dst_ops.gc_thresh; 904 equilibrium = ipv4_dst_ops.gc_thresh;
968 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 905 goal = entries - equilibrium;
969 if (goal > 0) { 906 if (goal > 0) {
970 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); 907 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
971 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 908 goal = entries - equilibrium;
972 } 909 }
973 } else { 910 } else {
974 /* We are in dangerous area. Try to reduce cache really 911 /* We are in dangerous area. Try to reduce cache really
975 * aggressively. 912 * aggressively.
976 */ 913 */
977 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); 914 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
978 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; 915 equilibrium = entries - goal;
979 } 916 }
980 917
981 if (now - last_gc >= ip_rt_gc_min_interval) 918 if (now - last_gc >= ip_rt_gc_min_interval)
@@ -995,7 +932,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
995 k = (k + 1) & rt_hash_mask; 932 k = (k + 1) & rt_hash_mask;
996 rthp = &rt_hash_table[k].chain; 933 rthp = &rt_hash_table[k].chain;
997 spin_lock_bh(rt_hash_lock_addr(k)); 934 spin_lock_bh(rt_hash_lock_addr(k));
998 while ((rth = *rthp) != NULL) { 935 while ((rth = rcu_dereference_protected(*rthp,
936 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
999 if (!rt_is_expired(rth) && 937 if (!rt_is_expired(rth) &&
1000 !rt_may_expire(rth, tmo, expire)) { 938 !rt_may_expire(rth, tmo, expire)) {
1001 tmo >>= 1; 939 tmo >>= 1;
@@ -1030,16 +968,14 @@ static int rt_garbage_collect(struct dst_ops *ops)
1030 break; 968 break;
1031 969
1032 expire >>= 1; 970 expire >>= 1;
1033#if RT_CACHE_DEBUG >= 2
1034 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1035 atomic_read(&ipv4_dst_ops.entries), goal, i);
1036#endif
1037 971
1038 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 972 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1039 goto out; 973 goto out;
1040 } while (!in_softirq() && time_before_eq(jiffies, now)); 974 } while (!in_softirq() && time_before_eq(jiffies, now));
1041 975
1042 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 976 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
977 goto out;
978 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1043 goto out; 979 goto out;
1044 if (net_ratelimit()) 980 if (net_ratelimit())
1045 printk(KERN_WARNING "dst cache overflow\n"); 981 printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,12 +985,9 @@ static int rt_garbage_collect(struct dst_ops *ops)
1049work_done: 985work_done:
1050 expire += ip_rt_gc_min_interval; 986 expire += ip_rt_gc_min_interval;
1051 if (expire > ip_rt_gc_timeout || 987 if (expire > ip_rt_gc_timeout ||
1052 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) 988 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
989 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1053 expire = ip_rt_gc_timeout; 990 expire = ip_rt_gc_timeout;
1054#if RT_CACHE_DEBUG >= 2
1055 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1056 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1057#endif
1058out: return 0; 991out: return 0;
1059} 992}
1060 993
@@ -1068,17 +1001,17 @@ static int slow_chain_length(const struct rtable *head)
1068 1001
1069 while (rth) { 1002 while (rth) {
1070 length += has_noalias(head, rth); 1003 length += has_noalias(head, rth);
1071 rth = rth->dst.rt_next; 1004 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1072 } 1005 }
1073 return length >> FRACT_BITS; 1006 return length >> FRACT_BITS;
1074} 1007}
1075 1008
1076static int rt_intern_hash(unsigned hash, struct rtable *rt, 1009static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1077 struct rtable **rp, struct sk_buff *skb, int ifindex) 1010 struct sk_buff *skb, int ifindex)
1078{ 1011{
1079 struct rtable *rth, **rthp; 1012 struct rtable *rth, *cand;
1013 struct rtable __rcu **rthp, **candp;
1080 unsigned long now; 1014 unsigned long now;
1081 struct rtable *cand, **candp;
1082 u32 min_score; 1015 u32 min_score;
1083 int chain_length; 1016 int chain_length;
1084 int attempts = !in_softirq(); 1017 int attempts = !in_softirq();
@@ -1102,36 +1035,37 @@ restart:
1102 * Note that we do rt_free on this new route entry, so that 1035 * Note that we do rt_free on this new route entry, so that
1103 * once its refcount hits zero, we are still able to reap it 1036 * once its refcount hits zero, we are still able to reap it
1104 * (Thanks Alexey) 1037 * (Thanks Alexey)
1105 * Note also the rt_free uses call_rcu. We don't actually 1038 * Note: To avoid expensive rcu stuff for this uncached dst,
1106 * need rcu protection here, this is just our path to get 1039 * we set DST_NOCACHE so that dst_release() can free dst without
1107 * on the route gc list. 1040 * waiting a grace period.
1108 */ 1041 */
1109 1042
1110 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1043 rt->dst.flags |= DST_NOCACHE;
1044 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1111 int err = arp_bind_neighbour(&rt->dst); 1045 int err = arp_bind_neighbour(&rt->dst);
1112 if (err) { 1046 if (err) {
1113 if (net_ratelimit()) 1047 if (net_ratelimit())
1114 printk(KERN_WARNING 1048 printk(KERN_WARNING
1115 "Neighbour table failure & not caching routes.\n"); 1049 "Neighbour table failure & not caching routes.\n");
1116 rt_drop(rt); 1050 ip_rt_put(rt);
1117 return err; 1051 return ERR_PTR(err);
1118 } 1052 }
1119 } 1053 }
1120 1054
1121 rt_free(rt);
1122 goto skip_hashing; 1055 goto skip_hashing;
1123 } 1056 }
1124 1057
1125 rthp = &rt_hash_table[hash].chain; 1058 rthp = &rt_hash_table[hash].chain;
1126 1059
1127 spin_lock_bh(rt_hash_lock_addr(hash)); 1060 spin_lock_bh(rt_hash_lock_addr(hash));
1128 while ((rth = *rthp) != NULL) { 1061 while ((rth = rcu_dereference_protected(*rthp,
1062 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1129 if (rt_is_expired(rth)) { 1063 if (rt_is_expired(rth)) {
1130 *rthp = rth->dst.rt_next; 1064 *rthp = rth->dst.rt_next;
1131 rt_free(rth); 1065 rt_free(rth);
1132 continue; 1066 continue;
1133 } 1067 }
1134 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1068 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1135 /* Put it first */ 1069 /* Put it first */
1136 *rthp = rth->dst.rt_next; 1070 *rthp = rth->dst.rt_next;
1137 /* 1071 /*
@@ -1151,11 +1085,9 @@ restart:
1151 spin_unlock_bh(rt_hash_lock_addr(hash)); 1085 spin_unlock_bh(rt_hash_lock_addr(hash));
1152 1086
1153 rt_drop(rt); 1087 rt_drop(rt);
1154 if (rp) 1088 if (skb)
1155 *rp = rth;
1156 else
1157 skb_dst_set(skb, &rth->dst); 1089 skb_dst_set(skb, &rth->dst);
1158 return 0; 1090 return rth;
1159 } 1091 }
1160 1092
1161 if (!atomic_read(&rth->dst.__refcnt)) { 1093 if (!atomic_read(&rth->dst.__refcnt)) {
@@ -1196,7 +1128,7 @@ restart:
1196 rt_emergency_hash_rebuild(net); 1128 rt_emergency_hash_rebuild(net);
1197 spin_unlock_bh(rt_hash_lock_addr(hash)); 1129 spin_unlock_bh(rt_hash_lock_addr(hash));
1198 1130
1199 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1131 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200 ifindex, rt_genid(net)); 1132 ifindex, rt_genid(net));
1201 goto restart; 1133 goto restart;
1202 } 1134 }
@@ -1205,14 +1137,14 @@ restart:
1205 /* Try to bind route to arp only if it is output 1137 /* Try to bind route to arp only if it is output
1206 route or unicast forwarding path. 1138 route or unicast forwarding path.
1207 */ 1139 */
1208 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 1140 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1209 int err = arp_bind_neighbour(&rt->dst); 1141 int err = arp_bind_neighbour(&rt->dst);
1210 if (err) { 1142 if (err) {
1211 spin_unlock_bh(rt_hash_lock_addr(hash)); 1143 spin_unlock_bh(rt_hash_lock_addr(hash));
1212 1144
1213 if (err != -ENOBUFS) { 1145 if (err != -ENOBUFS) {
1214 rt_drop(rt); 1146 rt_drop(rt);
1215 return err; 1147 return ERR_PTR(err);
1216 } 1148 }
1217 1149
1218 /* Neighbour tables are full and nothing 1150 /* Neighbour tables are full and nothing
@@ -1233,25 +1165,15 @@ restart:
1233 if (net_ratelimit()) 1165 if (net_ratelimit())
1234 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); 1166 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1235 rt_drop(rt); 1167 rt_drop(rt);
1236 return -ENOBUFS; 1168 return ERR_PTR(-ENOBUFS);
1237 } 1169 }
1238 } 1170 }
1239 1171
1240 rt->dst.rt_next = rt_hash_table[hash].chain; 1172 rt->dst.rt_next = rt_hash_table[hash].chain;
1241 1173
1242#if RT_CACHE_DEBUG >= 2
1243 if (rt->dst.rt_next) {
1244 struct rtable *trt;
1245 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1246 hash, &rt->rt_dst);
1247 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1248 printk(" . %pI4", &trt->rt_dst);
1249 printk("\n");
1250 }
1251#endif
1252 /* 1174 /*
1253 * Since lookup is lockfree, we must make sure 1175 * Since lookup is lockfree, we must make sure
1254 * previous writes to rt are comitted to memory 1176 * previous writes to rt are committed to memory
1255 * before making rt visible to other CPUS. 1177 * before making rt visible to other CPUS.
1256 */ 1178 */
1257 rcu_assign_pointer(rt_hash_table[hash].chain, rt); 1179 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
@@ -1259,28 +1181,28 @@ restart:
1259 spin_unlock_bh(rt_hash_lock_addr(hash)); 1181 spin_unlock_bh(rt_hash_lock_addr(hash));
1260 1182
1261skip_hashing: 1183skip_hashing:
1262 if (rp) 1184 if (skb)
1263 *rp = rt;
1264 else
1265 skb_dst_set(skb, &rt->dst); 1185 skb_dst_set(skb, &rt->dst);
1266 return 0; 1186 return rt;
1187}
1188
1189static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1190
1191static u32 rt_peer_genid(void)
1192{
1193 return atomic_read(&__rt_peer_genid);
1267} 1194}
1268 1195
1269void rt_bind_peer(struct rtable *rt, int create) 1196void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1270{ 1197{
1271 static DEFINE_SPINLOCK(rt_peer_lock);
1272 struct inet_peer *peer; 1198 struct inet_peer *peer;
1273 1199
1274 peer = inet_getpeer(rt->rt_dst, create); 1200 peer = inet_getpeer_v4(daddr, create);
1275 1201
1276 spin_lock_bh(&rt_peer_lock); 1202 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1277 if (rt->peer == NULL) {
1278 rt->peer = peer;
1279 peer = NULL;
1280 }
1281 spin_unlock_bh(&rt_peer_lock);
1282 if (peer)
1283 inet_putpeer(peer); 1203 inet_putpeer(peer);
1204 else
1205 rt->rt_peer_genid = rt_peer_genid();
1284} 1206}
1285 1207
1286/* 1208/*
@@ -1309,7 +1231,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1309 1231
1310 if (rt) { 1232 if (rt) {
1311 if (rt->peer == NULL) 1233 if (rt->peer == NULL)
1312 rt_bind_peer(rt, 1); 1234 rt_bind_peer(rt, rt->rt_dst, 1);
1313 1235
1314 /* If peer is attached to destination, it is never detached, 1236 /* If peer is attached to destination, it is never detached,
1315 so that we need not to grab a lock to dereference it. 1237 so that we need not to grab a lock to dereference it.
@@ -1328,12 +1250,14 @@ EXPORT_SYMBOL(__ip_select_ident);
1328 1250
1329static void rt_del(unsigned hash, struct rtable *rt) 1251static void rt_del(unsigned hash, struct rtable *rt)
1330{ 1252{
1331 struct rtable **rthp, *aux; 1253 struct rtable __rcu **rthp;
1254 struct rtable *aux;
1332 1255
1333 rthp = &rt_hash_table[hash].chain; 1256 rthp = &rt_hash_table[hash].chain;
1334 spin_lock_bh(rt_hash_lock_addr(hash)); 1257 spin_lock_bh(rt_hash_lock_addr(hash));
1335 ip_rt_put(rt); 1258 ip_rt_put(rt);
1336 while ((aux = *rthp) != NULL) { 1259 while ((aux = rcu_dereference_protected(*rthp,
1260 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1337 if (aux == rt || rt_is_expired(aux)) { 1261 if (aux == rt || rt_is_expired(aux)) {
1338 *rthp = aux->dst.rt_next; 1262 *rthp = aux->dst.rt_next;
1339 rt_free(aux); 1263 rt_free(aux);
@@ -1348,12 +1272,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
1348void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1272void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1349 __be32 saddr, struct net_device *dev) 1273 __be32 saddr, struct net_device *dev)
1350{ 1274{
1351 int i, k;
1352 struct in_device *in_dev = __in_dev_get_rcu(dev); 1275 struct in_device *in_dev = __in_dev_get_rcu(dev);
1353 struct rtable *rth, **rthp; 1276 struct inet_peer *peer;
1354 __be32 skeys[2] = { saddr, 0 };
1355 int ikeys[2] = { dev->ifindex, 0 };
1356 struct netevent_redirect netevent;
1357 struct net *net; 1277 struct net *net;
1358 1278
1359 if (!in_dev) 1279 if (!in_dev)
@@ -1365,9 +1285,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1365 ipv4_is_zeronet(new_gw)) 1285 ipv4_is_zeronet(new_gw))
1366 goto reject_redirect; 1286 goto reject_redirect;
1367 1287
1368 if (!rt_caching(net))
1369 goto reject_redirect;
1370
1371 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1288 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1372 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1289 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1373 goto reject_redirect; 1290 goto reject_redirect;
@@ -1378,93 +1295,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1378 goto reject_redirect; 1295 goto reject_redirect;
1379 } 1296 }
1380 1297
1381 for (i = 0; i < 2; i++) { 1298 peer = inet_getpeer_v4(daddr, 1);
1382 for (k = 0; k < 2; k++) { 1299 if (peer) {
1383 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1300 peer->redirect_learned.a4 = new_gw;
1384 rt_genid(net));
1385
1386 rthp=&rt_hash_table[hash].chain;
1387
1388 while ((rth = rcu_dereference(*rthp)) != NULL) {
1389 struct rtable *rt;
1390
1391 if (rth->fl.fl4_dst != daddr ||
1392 rth->fl.fl4_src != skeys[i] ||
1393 rth->fl.oif != ikeys[k] ||
1394 rth->fl.iif != 0 ||
1395 rt_is_expired(rth) ||
1396 !net_eq(dev_net(rth->dst.dev), net)) {
1397 rthp = &rth->dst.rt_next;
1398 continue;
1399 }
1400
1401 if (rth->rt_dst != daddr ||
1402 rth->rt_src != saddr ||
1403 rth->dst.error ||
1404 rth->rt_gateway != old_gw ||
1405 rth->dst.dev != dev)
1406 break;
1407
1408 dst_hold(&rth->dst);
1409
1410 rt = dst_alloc(&ipv4_dst_ops);
1411 if (rt == NULL) {
1412 ip_rt_put(rth);
1413 return;
1414 }
1415
1416 /* Copy all the information. */
1417 *rt = *rth;
1418 rt->dst.__use = 1;
1419 atomic_set(&rt->dst.__refcnt, 1);
1420 rt->dst.child = NULL;
1421 if (rt->dst.dev)
1422 dev_hold(rt->dst.dev);
1423 if (rt->idev)
1424 in_dev_hold(rt->idev);
1425 rt->dst.obsolete = -1;
1426 rt->dst.lastuse = jiffies;
1427 rt->dst.path = &rt->dst;
1428 rt->dst.neighbour = NULL;
1429 rt->dst.hh = NULL;
1430#ifdef CONFIG_XFRM
1431 rt->dst.xfrm = NULL;
1432#endif
1433 rt->rt_genid = rt_genid(net);
1434 rt->rt_flags |= RTCF_REDIRECTED;
1435
1436 /* Gateway is different ... */
1437 rt->rt_gateway = new_gw;
1438
1439 /* Redirect received -> path was valid */
1440 dst_confirm(&rth->dst);
1441
1442 if (rt->peer)
1443 atomic_inc(&rt->peer->refcnt);
1444
1445 if (arp_bind_neighbour(&rt->dst) ||
1446 !(rt->dst.neighbour->nud_state &
1447 NUD_VALID)) {
1448 if (rt->dst.neighbour)
1449 neigh_event_send(rt->dst.neighbour, NULL);
1450 ip_rt_put(rth);
1451 rt_drop(rt);
1452 goto do_next;
1453 }
1454 1301
1455 netevent.old = &rth->dst; 1302 inet_putpeer(peer);
1456 netevent.new = &rt->dst;
1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1458 &netevent);
1459 1303
1460 rt_del(hash, rth); 1304 atomic_inc(&__rt_peer_genid);
1461 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462 ip_rt_put(rt);
1463 goto do_next;
1464 }
1465 do_next:
1466 ;
1467 }
1468 } 1305 }
1469 return; 1306 return;
1470 1307
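The rewritten ip_rt_redirect() above no longer clones and re-hashes every matching cache entry. It records the learned gateway on the destination's inet_peer and bumps the global __rt_peer_genid; cached routes whose rt_peer_genid no longer matches re-pull the peer state lazily in ipv4_dst_check() further down. A compact userspace sketch of that producer/consumer generation-counter pattern (names and types invented, not the kernel interfaces):

#include <stdatomic.h>
#include <stdio.h>

/* Shared state describing the "latest truth" about a destination. */
static atomic_uint peer_genid;          /* bumped whenever peer data changes */
static unsigned int learned_gateway;    /* updated by redirect processing */

struct cached_route {
    unsigned int gateway;
    unsigned int genid_snapshot;        /* genid when this entry last synced */
};

/* Producer: a redirect was accepted, publish it cheaply. */
static void redirect_learned(unsigned int new_gw)
{
    learned_gateway = new_gw;
    atomic_fetch_add(&peer_genid, 1);   /* invalidate all cached snapshots */
}

/* Consumer: called on every use of the cached route (dst_check style). */
static void route_revalidate(struct cached_route *rt)
{
    unsigned int now = atomic_load(&peer_genid);

    if (rt->genid_snapshot != now) {    /* stale: re-read the peer data */
        rt->gateway = learned_gateway;
        rt->genid_snapshot = now;
    }
}

int main(void)
{
    struct cached_route rt = { .gateway = 0x0a000001, .genid_snapshot = 0 };

    route_revalidate(&rt);
    redirect_learned(0x0a0000fe);
    route_revalidate(&rt);              /* picks up the new gateway lazily */
    printf("gateway now %#x\n", rt.gateway);
    return 0;
}
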
@@ -1479,6 +1316,23 @@ reject_redirect:
1479 ; 1316 ;
1480} 1317}
1481 1318
1319static bool peer_pmtu_expired(struct inet_peer *peer)
1320{
1321 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1322
1323 return orig &&
1324 time_after_eq(jiffies, orig) &&
1325 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1326}
1327
1328static bool peer_pmtu_cleaned(struct inet_peer *peer)
1329{
1330 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1331
1332 return orig &&
1333 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1334}
1335
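peer_pmtu_expired() and peer_pmtu_cleaned() above share one idiom: read pmtu_expires once, then let exactly one CPU win the cmpxchg() that swaps it to 0; only that winner goes on to restore pmtu_orig (as ipv4_negative_advice() and ipv4_link_failure() do below), everyone else backs off. A userspace sketch of the claim-once idiom with C11 atomics; the structure is a stand-in, and the plain >= comparison ignores the jiffies wraparound that time_after_eq() handles in the kernel.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct peer_stub {
    atomic_ulong pmtu_expires;      /* 0 means "no learned PMTU pending" */
    unsigned int pmtu_orig;
    unsigned int pmtu_learned;
};

/* Returns true for exactly one caller once the learned PMTU expires;
 * that caller restores the original metric. */
static bool pmtu_expired_claim(struct peer_stub *p, unsigned long now)
{
    unsigned long orig = atomic_load(&p->pmtu_expires);

    return orig != 0 &&
           now >= orig &&
           atomic_compare_exchange_strong(&p->pmtu_expires, &orig, 0);
}

int main(void)
{
    struct peer_stub p = {
        .pmtu_expires = 100, .pmtu_orig = 1500, .pmtu_learned = 1400,
    };

    if (pmtu_expired_claim(&p, 150))
        printf("restore MTU to %u\n", p.pmtu_orig);
    if (!pmtu_expired_claim(&p, 150))
        printf("second claim correctly refused\n");
    return 0;
}
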
1482static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1336static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1483{ 1337{
1484 struct rtable *rt = (struct rtable *)dst; 1338 struct rtable *rt = (struct rtable *)dst;
@@ -1488,18 +1342,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1488 if (dst->obsolete > 0) { 1342 if (dst->obsolete > 0) {
1489 ip_rt_put(rt); 1343 ip_rt_put(rt);
1490 ret = NULL; 1344 ret = NULL;
1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1345 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1492 (rt->dst.expires && 1346 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1493 time_after_eq(jiffies, rt->dst.expires))) { 1347 rt->rt_oif,
1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495 rt->fl.oif,
1496 rt_genid(dev_net(dst->dev))); 1348 rt_genid(dev_net(dst->dev)));
1497#if RT_CACHE_DEBUG >= 1
1498 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1499 &rt->rt_dst, rt->fl.fl4_tos);
1500#endif
1501 rt_del(hash, rt); 1349 rt_del(hash, rt);
1502 ret = NULL; 1350 ret = NULL;
1351 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1352 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1503 } 1353 }
1504 } 1354 }
1505 return ret; 1355 return ret;
@@ -1525,6 +1375,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1525{ 1375{
1526 struct rtable *rt = skb_rtable(skb); 1376 struct rtable *rt = skb_rtable(skb);
1527 struct in_device *in_dev; 1377 struct in_device *in_dev;
1378 struct inet_peer *peer;
1528 int log_martians; 1379 int log_martians;
1529 1380
1530 rcu_read_lock(); 1381 rcu_read_lock();
@@ -1536,36 +1387,44 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1536 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1387 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537 rcu_read_unlock(); 1388 rcu_read_unlock();
1538 1389
1390 if (!rt->peer)
1391 rt_bind_peer(rt, rt->rt_dst, 1);
1392 peer = rt->peer;
1393 if (!peer) {
1394 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1395 return;
1396 }
1397
1539 /* No redirected packets during ip_rt_redirect_silence; 1398 /* No redirected packets during ip_rt_redirect_silence;
1540 * reset the algorithm. 1399 * reset the algorithm.
1541 */ 1400 */
1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) 1401 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1543 rt->dst.rate_tokens = 0; 1402 peer->rate_tokens = 0;
1544 1403
1545 /* Too many ignored redirects; do not send anything 1404 /* Too many ignored redirects; do not send anything
1546 * set dst.rate_last to the last seen redirected packet. 1405 * set dst.rate_last to the last seen redirected packet.
1547 */ 1406 */
1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) { 1407 if (peer->rate_tokens >= ip_rt_redirect_number) {
1549 rt->dst.rate_last = jiffies; 1408 peer->rate_last = jiffies;
1550 return; 1409 return;
1551 } 1410 }
1552 1411
1553 /* Check for load limit; set rate_last to the latest sent 1412 /* Check for load limit; set rate_last to the latest sent
1554 * redirect. 1413 * redirect.
1555 */ 1414 */
1556 if (rt->dst.rate_tokens == 0 || 1415 if (peer->rate_tokens == 0 ||
1557 time_after(jiffies, 1416 time_after(jiffies,
1558 (rt->dst.rate_last + 1417 (peer->rate_last +
1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) { 1418 (ip_rt_redirect_load << peer->rate_tokens)))) {
1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1419 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 rt->dst.rate_last = jiffies; 1420 peer->rate_last = jiffies;
1562 ++rt->dst.rate_tokens; 1421 ++peer->rate_tokens;
1563#ifdef CONFIG_IP_ROUTE_VERBOSE 1422#ifdef CONFIG_IP_ROUTE_VERBOSE
1564 if (log_martians && 1423 if (log_martians &&
1565 rt->dst.rate_tokens == ip_rt_redirect_number && 1424 peer->rate_tokens == ip_rt_redirect_number &&
1566 net_ratelimit()) 1425 net_ratelimit())
1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1426 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568 &rt->rt_src, rt->rt_iif, 1427 &ip_hdr(skb)->saddr, rt->rt_iif,
1569 &rt->rt_dst, &rt->rt_gateway); 1428 &rt->rt_dst, &rt->rt_gateway);
1570#endif 1429#endif
1571 } 1430 }
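
The redirect throttling state (rate_last, rate_tokens) moves from the dst entry to the inet_peer in this hunk, so it is now shared by every cached route to the same host, but the arithmetic is unchanged: after a long silence the counter resets, after ip_rt_redirect_number ignored redirects we stop sending, and otherwise the required gap doubles with every redirect sent (ip_rt_redirect_load << rate_tokens). A self-contained sketch of that exponential backoff, with invented tunable values in place of the sysctls:

#include <stdbool.h>
#include <stdio.h>

/* Tunables standing in for ip_rt_redirect_load / _number / _silence. */
#define REDIRECT_LOAD    5      /* base gap between redirects          */
#define REDIRECT_NUMBER  9      /* give up after this many in a row    */
#define REDIRECT_SILENCE (5 << 9)

struct peer_rate {
    unsigned long rate_last;    /* time of the last redirect we sent   */
    unsigned int  rate_tokens;  /* how many we have sent in this burst */
};

static bool should_send_redirect(struct peer_rate *p, unsigned long now)
{
    /* Long quiet period: the host may have recovered, start over. */
    if (now > p->rate_last + REDIRECT_SILENCE)
        p->rate_tokens = 0;

    /* Host keeps ignoring us: stop sending, just refresh the stamp. */
    if (p->rate_tokens >= REDIRECT_NUMBER) {
        p->rate_last = now;
        return false;
    }

    /* Exponential backoff: required gap doubles with every redirect. */
    if (p->rate_tokens == 0 ||
        now > p->rate_last + (REDIRECT_LOAD << p->rate_tokens)) {
        p->rate_last = now;
        p->rate_tokens++;
        return true;
    }
    return false;
}

int main(void)
{
    struct peer_rate p = { 0, 0 };

    for (unsigned long t = 0; t < 200; t += 10)
        if (should_send_redirect(&p, t))
            printf("t=%lu: send redirect (tokens=%u)\n", t, p.rate_tokens);
    return 0;
}
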
@@ -1574,7 +1433,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1574static int ip_error(struct sk_buff *skb) 1433static int ip_error(struct sk_buff *skb)
1575{ 1434{
1576 struct rtable *rt = skb_rtable(skb); 1435 struct rtable *rt = skb_rtable(skb);
1436 struct inet_peer *peer;
1577 unsigned long now; 1437 unsigned long now;
1438 bool send;
1578 int code; 1439 int code;
1579 1440
1580 switch (rt->dst.error) { 1441 switch (rt->dst.error) {
@@ -1594,15 +1455,24 @@ static int ip_error(struct sk_buff *skb)
1594 break; 1455 break;
1595 } 1456 }
1596 1457
1597 now = jiffies; 1458 if (!rt->peer)
1598 rt->dst.rate_tokens += now - rt->dst.rate_last; 1459 rt_bind_peer(rt, rt->rt_dst, 1);
1599 if (rt->dst.rate_tokens > ip_rt_error_burst) 1460 peer = rt->peer;
1600 rt->dst.rate_tokens = ip_rt_error_burst; 1461
1601 rt->dst.rate_last = now; 1462 send = true;
1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) { 1463 if (peer) {
1603 rt->dst.rate_tokens -= ip_rt_error_cost; 1464 now = jiffies;
1465 peer->rate_tokens += now - peer->rate_last;
1466 if (peer->rate_tokens > ip_rt_error_burst)
1467 peer->rate_tokens = ip_rt_error_burst;
1468 peer->rate_last = now;
1469 if (peer->rate_tokens >= ip_rt_error_cost)
1470 peer->rate_tokens -= ip_rt_error_cost;
1471 else
1472 send = false;
1473 }
1474 if (send)
1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1475 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1605 }
1606 1476
1607out: kfree_skb(skb); 1477out: kfree_skb(skb);
1608 return 0; 1478 return 0;
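
ip_error() keeps its token-bucket limiter, only the bucket now lives on the peer: tokens accrue with the time elapsed since rate_last, are capped at ip_rt_error_burst, and each ICMP error costs ip_rt_error_cost; if no peer can be bound, send stays true and the message goes out unthrottled. A sketch of the bucket arithmetic with placeholder constants:

#include <stdbool.h>
#include <stdio.h>

#define ERROR_COST   100   /* stand-in for ip_rt_error_cost            */
#define ERROR_BURST  500   /* stand-in for ip_rt_error_burst (the cap) */

struct err_bucket {
    unsigned long rate_tokens;  /* accrued "credit", in time units */
    unsigned long rate_last;    /* when we last updated the bucket */
};

/* Returns true if an ICMP error may be emitted at time 'now'. */
static bool error_allowed(struct err_bucket *b, unsigned long now)
{
    b->rate_tokens += now - b->rate_last;    /* credit for elapsed time */
    if (b->rate_tokens > ERROR_BURST)
        b->rate_tokens = ERROR_BURST;        /* cap the burst size      */
    b->rate_last = now;

    if (b->rate_tokens >= ERROR_COST) {
        b->rate_tokens -= ERROR_COST;        /* pay for this message    */
        return true;
    }
    return false;
}

int main(void)
{
    struct err_bucket b = { ERROR_BURST, 0 };
    unsigned int sent = 0;

    /* A flood of 20 errors arriving 10 time units apart: only the
     * initial burst plus the slow refill get through. */
    for (unsigned long t = 0; t < 200; t += 10)
        if (error_allowed(&b, t))
            sent++;
    printf("sent %u of 20\n", sent);
    return 0;
}
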
@@ -1626,88 +1496,148 @@ static inline unsigned short guess_mtu(unsigned short old_mtu)
1626 return 68; 1496 return 68;
1627} 1497}
1628 1498
1629unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, 1499unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1630 unsigned short new_mtu, 1500 unsigned short new_mtu,
1631 struct net_device *dev) 1501 struct net_device *dev)
1632{ 1502{
1633 int i, k;
1634 unsigned short old_mtu = ntohs(iph->tot_len); 1503 unsigned short old_mtu = ntohs(iph->tot_len);
1635 struct rtable *rth;
1636 int ikeys[2] = { dev->ifindex, 0 };
1637 __be32 skeys[2] = { iph->saddr, 0, };
1638 __be32 daddr = iph->daddr;
1639 unsigned short est_mtu = 0; 1504 unsigned short est_mtu = 0;
1505 struct inet_peer *peer;
1640 1506
1641 for (k = 0; k < 2; k++) { 1507 peer = inet_getpeer_v4(iph->daddr, 1);
1642 for (i = 0; i < 2; i++) { 1508 if (peer) {
1643 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1509 unsigned short mtu = new_mtu;
1644 rt_genid(net)); 1510
1645 1511 if (new_mtu < 68 || new_mtu >= old_mtu) {
1646 rcu_read_lock(); 1512 /* BSD 4.2 derived systems incorrectly adjust
1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 1513 * tot_len by the IP header length, and report
1648 rth = rcu_dereference(rth->dst.rt_next)) { 1514 * a zero MTU in the ICMP message.
1649 unsigned short mtu = new_mtu; 1515 */
1650 1516 if (mtu == 0 &&
1651 if (rth->fl.fl4_dst != daddr || 1517 old_mtu >= 68 + (iph->ihl << 2))
1652 rth->fl.fl4_src != skeys[i] || 1518 old_mtu -= iph->ihl << 2;
1653 rth->rt_dst != daddr || 1519 mtu = guess_mtu(old_mtu);
1654 rth->rt_src != iph->saddr || 1520 }
1655 rth->fl.oif != ikeys[k] ||
1656 rth->fl.iif != 0 ||
1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658 !net_eq(dev_net(rth->dst.dev), net) ||
1659 rt_is_expired(rth))
1660 continue;
1661 1521
1662 if (new_mtu < 68 || new_mtu >= old_mtu) { 1522 if (mtu < ip_rt_min_pmtu)
1523 mtu = ip_rt_min_pmtu;
1524 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1525 unsigned long pmtu_expires;
1663 1526
1664 /* BSD 4.2 compatibility hack :-( */ 1527 pmtu_expires = jiffies + ip_rt_mtu_expires;
1665 if (mtu == 0 && 1528 if (!pmtu_expires)
1666 old_mtu >= dst_mtu(&rth->dst) && 1529 pmtu_expires = 1UL;
1667 old_mtu >= 68 + (iph->ihl << 2))
1668 old_mtu -= iph->ihl << 2;
1669 1530
1670 mtu = guess_mtu(old_mtu); 1531 est_mtu = mtu;
1671 } 1532 peer->pmtu_learned = mtu;
1672 if (mtu <= dst_mtu(&rth->dst)) { 1533 peer->pmtu_expires = pmtu_expires;
1673 if (mtu < dst_mtu(&rth->dst)) {
1674 dst_confirm(&rth->dst);
1675 if (mtu < ip_rt_min_pmtu) {
1676 mtu = ip_rt_min_pmtu;
1677 rth->dst.metrics[RTAX_LOCK-1] |=
1678 (1 << RTAX_MTU);
1679 }
1680 rth->dst.metrics[RTAX_MTU-1] = mtu;
1681 dst_set_expires(&rth->dst,
1682 ip_rt_mtu_expires);
1683 }
1684 est_mtu = mtu;
1685 }
1686 }
1687 rcu_read_unlock();
1688 } 1534 }
1535
1536 inet_putpeer(peer);
1537
1538 atomic_inc(&__rt_peer_genid);
1689 } 1539 }
1690 return est_mtu ? : new_mtu; 1540 return est_mtu ? : new_mtu;
1691} 1541}
1692 1542
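ip_rt_frag_needed() now records the learned PMTU on the inet_peer instead of walking hash buckets, but the sanitising of the reported value is preserved: an MTU that is absurd or zero (BSD 4.2 derived stacks add the header length to tot_len and report 0) is replaced by a guess from the original datagram length and then clamped to ip_rt_min_pmtu. A sketch of that sanitising step; the plateau table inside guess_mtu() is not part of this hunk, so the values below are purely illustrative (only its final 68-byte fallback is visible above).

#include <stdio.h>

#define IP_RT_MIN_PMTU 552          /* stand-in for ip_rt_min_pmtu */

/* Illustrative plateau search in the spirit of RFC 1191; the kernel's
 * guess_mtu() table may differ. */
static unsigned short guess_mtu(unsigned short old_mtu)
{
    static const unsigned short plateaus[] =
        { 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

    for (unsigned int i = 0; i < sizeof(plateaus) / sizeof(plateaus[0]); i++)
        if (old_mtu > plateaus[i])
            return plateaus[i];
    return 68;
}

/* new_mtu: MTU reported in the ICMP "frag needed" message.
 * tot_len: total length field of the offending datagram's IP header.
 * ihl:     its header length in 32-bit words. */
static unsigned short sanitize_reported_mtu(unsigned short new_mtu,
                                            unsigned short tot_len,
                                            unsigned char ihl)
{
    unsigned short old_mtu = tot_len;
    unsigned short mtu = new_mtu;

    if (new_mtu < 68 || new_mtu >= old_mtu) {
        /* BSD 4.2 derived systems add the header length to tot_len
         * and report MTU 0; undo that before guessing. */
        if (mtu == 0 && old_mtu >= 68 + (ihl << 2))
            old_mtu -= ihl << 2;
        mtu = guess_mtu(old_mtu);
    }
    if (mtu < IP_RT_MIN_PMTU)
        mtu = IP_RT_MIN_PMTU;
    return mtu;
}

int main(void)
{
    /* Broken peer: reports MTU 0 for a 1520-byte datagram (20-byte header). */
    printf("learned PMTU: %u\n", sanitize_reported_mtu(0, 1520, 5));
    /* Sane peer: reports 1400 directly, which is used as-is. */
    printf("learned PMTU: %u\n", sanitize_reported_mtu(1400, 1500, 5));
    return 0;
}
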
1543static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1544{
1545 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1546
1547 if (!expires)
1548 return;
1549 if (time_before(jiffies, expires)) {
1550 u32 orig_dst_mtu = dst_mtu(dst);
1551 if (peer->pmtu_learned < orig_dst_mtu) {
1552 if (!peer->pmtu_orig)
1553 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1554 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1555 }
1556 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1557 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1558}
1559
1693static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1560static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1694{ 1561{
1695 if (dst_mtu(dst) > mtu && mtu >= 68 && 1562 struct rtable *rt = (struct rtable *) dst;
1696 !(dst_metric_locked(dst, RTAX_MTU))) { 1563 struct inet_peer *peer;
1697 if (mtu < ip_rt_min_pmtu) { 1564
1565 dst_confirm(dst);
1566
1567 if (!rt->peer)
1568 rt_bind_peer(rt, rt->rt_dst, 1);
1569 peer = rt->peer;
1570 if (peer) {
1571 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1572
1573 if (mtu < ip_rt_min_pmtu)
1698 mtu = ip_rt_min_pmtu; 1574 mtu = ip_rt_min_pmtu;
1699 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); 1575 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1576
1577 pmtu_expires = jiffies + ip_rt_mtu_expires;
1578 if (!pmtu_expires)
1579 pmtu_expires = 1UL;
1580
1581 peer->pmtu_learned = mtu;
1582 peer->pmtu_expires = pmtu_expires;
1583
1584 atomic_inc(&__rt_peer_genid);
1585 rt->rt_peer_genid = rt_peer_genid();
1700 } 1586 }
1701 dst->metrics[RTAX_MTU-1] = mtu; 1587 check_peer_pmtu(dst, peer);
1702 dst_set_expires(dst, ip_rt_mtu_expires); 1588 }
1703 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1589}
1590
1591static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1592{
1593 struct rtable *rt = (struct rtable *) dst;
1594 __be32 orig_gw = rt->rt_gateway;
1595
1596 dst_confirm(&rt->dst);
1597
1598 neigh_release(rt->dst.neighbour);
1599 rt->dst.neighbour = NULL;
1600
1601 rt->rt_gateway = peer->redirect_learned.a4;
1602 if (arp_bind_neighbour(&rt->dst) ||
1603 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1604 if (rt->dst.neighbour)
1605 neigh_event_send(rt->dst.neighbour, NULL);
1606 rt->rt_gateway = orig_gw;
1607 return -EAGAIN;
1608 } else {
1609 rt->rt_flags |= RTCF_REDIRECTED;
1610 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1611 rt->dst.neighbour);
1704 } 1612 }
1613 return 0;
1705} 1614}
1706 1615
1707static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1616static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1708{ 1617{
1709 if (rt_is_expired((struct rtable *)dst)) 1618 struct rtable *rt = (struct rtable *) dst;
1619
1620 if (rt_is_expired(rt))
1710 return NULL; 1621 return NULL;
1622 if (rt->rt_peer_genid != rt_peer_genid()) {
1623 struct inet_peer *peer;
1624
1625 if (!rt->peer)
1626 rt_bind_peer(rt, rt->rt_dst, 0);
1627
1628 peer = rt->peer;
1629 if (peer) {
1630 check_peer_pmtu(dst, peer);
1631
1632 if (peer->redirect_learned.a4 &&
1633 peer->redirect_learned.a4 != rt->rt_gateway) {
1634 if (check_peer_redir(dst, peer))
1635 return NULL;
1636 }
1637 }
1638
1639 rt->rt_peer_genid = rt_peer_genid();
1640 }
1711 return dst; 1641 return dst;
1712} 1642}
1713 1643
@@ -1715,33 +1645,17 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1715{ 1645{
1716 struct rtable *rt = (struct rtable *) dst; 1646 struct rtable *rt = (struct rtable *) dst;
1717 struct inet_peer *peer = rt->peer; 1647 struct inet_peer *peer = rt->peer;
1718 struct in_device *idev = rt->idev;
1719 1648
1649 if (rt->fi) {
1650 fib_info_put(rt->fi);
1651 rt->fi = NULL;
1652 }
1720 if (peer) { 1653 if (peer) {
1721 rt->peer = NULL; 1654 rt->peer = NULL;
1722 inet_putpeer(peer); 1655 inet_putpeer(peer);
1723 } 1656 }
1724
1725 if (idev) {
1726 rt->idev = NULL;
1727 in_dev_put(idev);
1728 }
1729} 1657}
1730 1658
1731static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1732 int how)
1733{
1734 struct rtable *rt = (struct rtable *) dst;
1735 struct in_device *idev = rt->idev;
1736 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1737 struct in_device *loopback_idev =
1738 in_dev_get(dev_net(dev)->loopback_dev);
1739 if (loopback_idev) {
1740 rt->idev = loopback_idev;
1741 in_dev_put(idev);
1742 }
1743 }
1744}
1745 1659
1746static void ipv4_link_failure(struct sk_buff *skb) 1660static void ipv4_link_failure(struct sk_buff *skb)
1747{ 1661{
@@ -1750,8 +1664,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
1750 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1664 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1751 1665
1752 rt = skb_rtable(skb); 1666 rt = skb_rtable(skb);
1753 if (rt) 1667 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1754 dst_set_expires(&rt->dst, 0); 1668 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1755} 1669}
1756 1670
1757static int ip_rt_bug(struct sk_buff *skb) 1671static int ip_rt_bug(struct sk_buff *skb)
@@ -1760,6 +1674,7 @@ static int ip_rt_bug(struct sk_buff *skb)
1760 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1674 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1761 skb->dev ? skb->dev->name : "?"); 1675 skb->dev ? skb->dev->name : "?");
1762 kfree_skb(skb); 1676 kfree_skb(skb);
1677 WARN_ON(1);
1763 return 0; 1678 return 0;
1764} 1679}
1765 1680
@@ -1772,23 +1687,39 @@ static int ip_rt_bug(struct sk_buff *skb)
1772 in IP options! 1687 in IP options!
1773 */ 1688 */
1774 1689
1775void ip_rt_get_source(u8 *addr, struct rtable *rt) 1690void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1776{ 1691{
1777 __be32 src; 1692 __be32 src;
1778 struct fib_result res;
1779 1693
1780 if (rt->fl.iif == 0) 1694 if (rt_is_output_route(rt))
1781 src = rt->rt_src; 1695 src = ip_hdr(skb)->saddr;
1782 else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { 1696 else {
1783 src = FIB_RES_PREFSRC(res); 1697 struct fib_result res;
1784 fib_res_put(&res); 1698 struct flowi4 fl4;
1785 } else 1699 struct iphdr *iph;
1786 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1700
1701 iph = ip_hdr(skb);
1702
1703 memset(&fl4, 0, sizeof(fl4));
1704 fl4.daddr = iph->daddr;
1705 fl4.saddr = iph->saddr;
1706 fl4.flowi4_tos = iph->tos;
1707 fl4.flowi4_oif = rt->dst.dev->ifindex;
1708 fl4.flowi4_iif = skb->dev->ifindex;
1709 fl4.flowi4_mark = skb->mark;
1710
1711 rcu_read_lock();
1712 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1713 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1714 else
1715 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1787 RT_SCOPE_UNIVERSE); 1716 RT_SCOPE_UNIVERSE);
1717 rcu_read_unlock();
1718 }
1788 memcpy(addr, &src, 4); 1719 memcpy(addr, &src, 4);
1789} 1720}
1790 1721
1791#ifdef CONFIG_NET_CLS_ROUTE 1722#ifdef CONFIG_IP_ROUTE_CLASSID
1792static void set_class_tag(struct rtable *rt, u32 tag) 1723static void set_class_tag(struct rtable *rt, u32 tag)
1793{ 1724{
1794 if (!(rt->dst.tclassid & 0xFFFF)) 1725 if (!(rt->dst.tclassid & 0xFFFF))
@@ -1798,46 +1729,107 @@ static void set_class_tag(struct rtable *rt, u32 tag)
1798} 1729}
1799#endif 1730#endif
1800 1731
1801static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1732static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1802{ 1733{
1803 struct fib_info *fi = res->fi; 1734 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1735
1736 if (advmss == 0) {
1737 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1738 ip_rt_min_advmss);
1739 if (advmss > 65535 - 40)
1740 advmss = 65535 - 40;
1741 }
1742 return advmss;
1743}
1744
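ipv4_default_advmss() above computes the advertised MSS on demand when the RTAX_ADVMSS metric is unset: device MTU minus 40 bytes of IPv4 plus TCP headers, raised to ip_rt_min_advmss and capped so the segment still fits a 16-bit total length; ipv4_default_mtu() below applies the matching default for the path MTU. A sketch of the advmss arithmetic with a placeholder floor value:

#include <stdio.h>

#define IP_RT_MIN_ADVMSS 256   /* stand-in for the ip_rt_min_advmss sysctl */

/* Default advertised MSS for a route with no explicit RTAX_ADVMSS metric:
 * leave room for 20 bytes of IPv4 header plus 20 bytes of TCP header,
 * never advertise less than the configured floor, and never more than
 * the largest value that still fits a 16-bit total length. */
static unsigned int default_advmss(unsigned int dev_mtu)
{
    unsigned int advmss = dev_mtu > 40 ? dev_mtu - 40 : 0;

    if (advmss < IP_RT_MIN_ADVMSS)
        advmss = IP_RT_MIN_ADVMSS;
    if (advmss > 65535 - 40)
        advmss = 65535 - 40;
    return advmss;
}

int main(void)
{
    printf("ethernet (1500): %u\n", default_advmss(1500));    /* 1460  */
    printf("loopback (65536): %u\n", default_advmss(65536));  /* 65495 */
    return 0;
}
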
1745static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1746{
1747 unsigned int mtu = dst->dev->mtu;
1748
1749 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1750 const struct rtable *rt = (const struct rtable *) dst;
1751
1752 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1753 mtu = 576;
1754 }
1755
1756 if (mtu > IP_MAX_MTU)
1757 mtu = IP_MAX_MTU;
1758
1759 return mtu;
1760}
1761
1762static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1763 struct fib_info *fi)
1764{
1765 struct inet_peer *peer;
1766 int create = 0;
1767
1768 /* If a peer entry exists for this destination, we must hook
1769 * it up in order to get at cached metrics.
1770 */
1771 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1772 create = 1;
1773
1774 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1775 if (peer) {
1776 rt->rt_peer_genid = rt_peer_genid();
1777 if (inet_metrics_new(peer))
1778 memcpy(peer->metrics, fi->fib_metrics,
1779 sizeof(u32) * RTAX_MAX);
1780 dst_init_metrics(&rt->dst, peer->metrics, false);
1781
1782 check_peer_pmtu(&rt->dst, peer);
1783 if (peer->redirect_learned.a4 &&
1784 peer->redirect_learned.a4 != rt->rt_gateway) {
1785 rt->rt_gateway = peer->redirect_learned.a4;
1786 rt->rt_flags |= RTCF_REDIRECTED;
1787 }
1788 } else {
1789 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1790 rt->fi = fi;
1791 atomic_inc(&fi->fib_clntref);
1792 }
1793 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1794 }
1795}
1796
1797static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1798 const struct fib_result *res,
1799 struct fib_info *fi, u16 type, u32 itag)
1800{
1801 struct dst_entry *dst = &rt->dst;
1804 1802
1805 if (fi) { 1803 if (fi) {
1806 if (FIB_RES_GW(*res) && 1804 if (FIB_RES_GW(*res) &&
1807 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1805 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1808 rt->rt_gateway = FIB_RES_GW(*res); 1806 rt->rt_gateway = FIB_RES_GW(*res);
1809 memcpy(rt->dst.metrics, fi->fib_metrics, 1807 rt_init_metrics(rt, fl4, fi);
1810 sizeof(rt->dst.metrics)); 1808#ifdef CONFIG_IP_ROUTE_CLASSID
1811 if (fi->fib_mtu == 0) { 1809 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1812 rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1813 if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1814 rt->rt_gateway != rt->rt_dst &&
1815 rt->dst.dev->mtu > 576)
1816 rt->dst.metrics[RTAX_MTU-1] = 576;
1817 }
1818#ifdef CONFIG_NET_CLS_ROUTE
1819 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1820#endif 1810#endif
1821 } else 1811 }
1822 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; 1812
1823 1813 if (dst_mtu(dst) > IP_MAX_MTU)
1824 if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) 1814 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1825 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1815 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1826 if (dst_mtu(&rt->dst) > IP_MAX_MTU) 1816 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1827 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; 1817
1828 if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) 1818#ifdef CONFIG_IP_ROUTE_CLASSID
1829 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1830 ip_rt_min_advmss);
1831 if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1832 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1833
1834#ifdef CONFIG_NET_CLS_ROUTE
1835#ifdef CONFIG_IP_MULTIPLE_TABLES 1819#ifdef CONFIG_IP_MULTIPLE_TABLES
1836 set_class_tag(rt, fib_rules_tclass(res)); 1820 set_class_tag(rt, fib_rules_tclass(res));
1837#endif 1821#endif
1838 set_class_tag(rt, itag); 1822 set_class_tag(rt, itag);
1839#endif 1823#endif
1840 rt->rt_type = res->type; 1824}
1825
1826static struct rtable *rt_dst_alloc(struct net_device *dev,
1827 bool nopolicy, bool noxfrm)
1828{
1829 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1830 DST_HOST |
1831 (nopolicy ? DST_NOPOLICY : 0) |
1832 (noxfrm ? DST_NOXFRM : 0));
1841} 1833}
1842 1834
1843/* called in rcu_read_lock() section */ 1835/* called in rcu_read_lock() section */
@@ -1865,42 +1857,38 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1865 goto e_inval; 1857 goto e_inval;
1866 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1858 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867 } else { 1859 } else {
1868 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 1860 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1869 &itag, 0); 1861 &itag);
1870 if (err < 0) 1862 if (err < 0)
1871 goto e_err; 1863 goto e_err;
1872 } 1864 }
1873 rth = dst_alloc(&ipv4_dst_ops); 1865 rth = rt_dst_alloc(init_net.loopback_dev,
1866 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1874 if (!rth) 1867 if (!rth)
1875 goto e_nobufs; 1868 goto e_nobufs;
1876 1869
1870#ifdef CONFIG_IP_ROUTE_CLASSID
1871 rth->dst.tclassid = itag;
1872#endif
1877 rth->dst.output = ip_rt_bug; 1873 rth->dst.output = ip_rt_bug;
1878 rth->dst.obsolete = -1;
1879 1874
1880 atomic_set(&rth->dst.__refcnt, 1); 1875 rth->rt_key_dst = daddr;
1881 rth->dst.flags= DST_HOST; 1876 rth->rt_key_src = saddr;
1882 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 1877 rth->rt_genid = rt_genid(dev_net(dev));
1883 rth->dst.flags |= DST_NOPOLICY; 1878 rth->rt_flags = RTCF_MULTICAST;
1884 rth->fl.fl4_dst = daddr; 1879 rth->rt_type = RTN_MULTICAST;
1880 rth->rt_key_tos = tos;
1885 rth->rt_dst = daddr; 1881 rth->rt_dst = daddr;
1886 rth->fl.fl4_tos = tos;
1887 rth->fl.mark = skb->mark;
1888 rth->fl.fl4_src = saddr;
1889 rth->rt_src = saddr; 1882 rth->rt_src = saddr;
1890#ifdef CONFIG_NET_CLS_ROUTE 1883 rth->rt_route_iif = dev->ifindex;
1891 rth->dst.tclassid = itag; 1884 rth->rt_iif = dev->ifindex;
1892#endif 1885 rth->rt_oif = 0;
1893 rth->rt_iif = 1886 rth->rt_mark = skb->mark;
1894 rth->fl.iif = dev->ifindex;
1895 rth->dst.dev = init_net.loopback_dev;
1896 dev_hold(rth->dst.dev);
1897 rth->idev = in_dev_get(rth->dst.dev);
1898 rth->fl.oif = 0;
1899 rth->rt_gateway = daddr; 1887 rth->rt_gateway = daddr;
1900 rth->rt_spec_dst= spec_dst; 1888 rth->rt_spec_dst= spec_dst;
1901 rth->rt_genid = rt_genid(dev_net(dev)); 1889 rth->rt_peer_genid = 0;
1902 rth->rt_flags = RTCF_MULTICAST; 1890 rth->peer = NULL;
1903 rth->rt_type = RTN_MULTICAST; 1891 rth->fi = NULL;
1904 if (our) { 1892 if (our) {
1905 rth->dst.input= ip_local_deliver; 1893 rth->dst.input= ip_local_deliver;
1906 rth->rt_flags |= RTCF_LOCAL; 1894 rth->rt_flags |= RTCF_LOCAL;
@@ -1913,7 +1901,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1913 RT_CACHE_STAT_INC(in_slow_mc); 1901 RT_CACHE_STAT_INC(in_slow_mc);
1914 1902
1915 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1903 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1916 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); 1904 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1905 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1917 1906
1918e_nobufs: 1907e_nobufs:
1919 return -ENOBUFS; 1908 return -ENOBUFS;
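
The conversion visible here runs through the whole input path: rt_intern_hash() no longer takes a struct rtable ** out-parameter and returns an int, it hands back the cache entry directly, with failures encoded as an error pointer and unpacked via IS_ERR()/PTR_ERR(). A userspace sketch of that pointer-encoded-error convention; the helpers below imitate the kernel's err.h macros rather than reuse them.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Imitations of the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() helpers:
 * negative errno values live in the last page of the address space,
 * which no valid allocation can occupy. */
#define MAX_ERRNO 4095

static void *err_ptr(long error)      { return (void *)error; }
static int   is_err(const void *ptr)  { return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO; }
static long  ptr_err(const void *ptr) { return (long)(intptr_t)ptr; }

struct rtable_stub { unsigned int daddr; };

/* Returns a route or an encoded -ENOBUFS, never NULL-plus-errno. */
static struct rtable_stub *make_route(unsigned int daddr, int simulate_oom)
{
    struct rtable_stub *rt;

    if (simulate_oom)
        return err_ptr(-ENOBUFS);
    rt = malloc(sizeof(*rt));
    if (!rt)
        return err_ptr(-ENOBUFS);
    rt->daddr = daddr;
    return rt;
}

int main(void)
{
    struct rtable_stub *rt = make_route(0x0a000001, 0);
    struct rtable_stub *bad = make_route(0x0a000002, 1);

    if (!is_err(rt))
        printf("route ok, daddr=%#x\n", rt->daddr);
    if (is_err(bad))
        printf("route failed: err=%ld\n", ptr_err(bad));
    free(rt);
    return 0;
}
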
@@ -1956,7 +1945,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1956 1945
1957/* called in rcu_read_lock() section */ 1946/* called in rcu_read_lock() section */
1958static int __mkroute_input(struct sk_buff *skb, 1947static int __mkroute_input(struct sk_buff *skb,
1959 struct fib_result *res, 1948 const struct fib_result *res,
1960 struct in_device *in_dev, 1949 struct in_device *in_dev,
1961 __be32 daddr, __be32 saddr, u32 tos, 1950 __be32 daddr, __be32 saddr, u32 tos,
1962 struct rtable **result) 1951 struct rtable **result)
@@ -1978,8 +1967,8 @@ static int __mkroute_input(struct sk_buff *skb,
1978 } 1967 }
1979 1968
1980 1969
1981 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 1970 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1982 in_dev->dev, &spec_dst, &itag, skb->mark); 1971 in_dev->dev, &spec_dst, &itag);
1983 if (err < 0) { 1972 if (err < 0) {
1984 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1973 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1985 saddr); 1974 saddr);
@@ -2010,42 +1999,36 @@ static int __mkroute_input(struct sk_buff *skb,
2010 } 1999 }
2011 } 2000 }
2012 2001
2013 2002 rth = rt_dst_alloc(out_dev->dev,
2014 rth = dst_alloc(&ipv4_dst_ops); 2003 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2004 IN_DEV_CONF_GET(out_dev, NOXFRM));
2015 if (!rth) { 2005 if (!rth) {
2016 err = -ENOBUFS; 2006 err = -ENOBUFS;
2017 goto cleanup; 2007 goto cleanup;
2018 } 2008 }
2019 2009
2020 atomic_set(&rth->dst.__refcnt, 1); 2010 rth->rt_key_dst = daddr;
2021 rth->dst.flags= DST_HOST; 2011 rth->rt_key_src = saddr;
2022 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2012 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2023 rth->dst.flags |= DST_NOPOLICY; 2013 rth->rt_flags = flags;
2024 if (IN_DEV_CONF_GET(out_dev, NOXFRM)) 2014 rth->rt_type = res->type;
2025 rth->dst.flags |= DST_NOXFRM; 2015 rth->rt_key_tos = tos;
2026 rth->fl.fl4_dst = daddr;
2027 rth->rt_dst = daddr; 2016 rth->rt_dst = daddr;
2028 rth->fl.fl4_tos = tos;
2029 rth->fl.mark = skb->mark;
2030 rth->fl.fl4_src = saddr;
2031 rth->rt_src = saddr; 2017 rth->rt_src = saddr;
2018 rth->rt_route_iif = in_dev->dev->ifindex;
2019 rth->rt_iif = in_dev->dev->ifindex;
2020 rth->rt_oif = 0;
2021 rth->rt_mark = skb->mark;
2032 rth->rt_gateway = daddr; 2022 rth->rt_gateway = daddr;
2033 rth->rt_iif =
2034 rth->fl.iif = in_dev->dev->ifindex;
2035 rth->dst.dev = (out_dev)->dev;
2036 dev_hold(rth->dst.dev);
2037 rth->idev = in_dev_get(rth->dst.dev);
2038 rth->fl.oif = 0;
2039 rth->rt_spec_dst= spec_dst; 2023 rth->rt_spec_dst= spec_dst;
2024 rth->rt_peer_genid = 0;
2025 rth->peer = NULL;
2026 rth->fi = NULL;
2040 2027
2041 rth->dst.obsolete = -1;
2042 rth->dst.input = ip_forward; 2028 rth->dst.input = ip_forward;
2043 rth->dst.output = ip_output; 2029 rth->dst.output = ip_output;
2044 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2045
2046 rt_set_nexthop(rth, res, itag);
2047 2030
2048 rth->rt_flags = flags; 2031 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2049 2032
2050 *result = rth; 2033 *result = rth;
2051 err = 0; 2034 err = 0;
@@ -2055,7 +2038,7 @@ static int __mkroute_input(struct sk_buff *skb,
2055 2038
2056static int ip_mkroute_input(struct sk_buff *skb, 2039static int ip_mkroute_input(struct sk_buff *skb,
2057 struct fib_result *res, 2040 struct fib_result *res,
2058 const struct flowi *fl, 2041 const struct flowi4 *fl4,
2059 struct in_device *in_dev, 2042 struct in_device *in_dev,
2060 __be32 daddr, __be32 saddr, u32 tos) 2043 __be32 daddr, __be32 saddr, u32 tos)
2061{ 2044{
@@ -2064,8 +2047,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
2064 unsigned hash; 2047 unsigned hash;
2065 2048
2066#ifdef CONFIG_IP_ROUTE_MULTIPATH 2049#ifdef CONFIG_IP_ROUTE_MULTIPATH
2067 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) 2050 if (res->fi && res->fi->fib_nhs > 1)
2068 fib_select_multipath(fl, res); 2051 fib_select_multipath(res);
2069#endif 2052#endif
2070 2053
2071 /* create a routing cache entry */ 2054 /* create a routing cache entry */
@@ -2074,9 +2057,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
2074 return err; 2057 return err;
2075 2058
2076 /* put it into the cache */ 2059 /* put it into the cache */
2077 hash = rt_hash(daddr, saddr, fl->iif, 2060 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2078 rt_genid(dev_net(rth->dst.dev))); 2061 rt_genid(dev_net(rth->dst.dev)));
2079 return rt_intern_hash(hash, rth, NULL, skb, fl->iif); 2062 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2063 if (IS_ERR(rth))
2064 return PTR_ERR(rth);
2065 return 0;
2080} 2066}
2081 2067
2082/* 2068/*
@@ -2087,6 +2073,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2087 * Such approach solves two big problems: 2073 * Such approach solves two big problems:
2088 * 1. Not simplex devices are handled properly. 2074 * 1. Not simplex devices are handled properly.
2089 * 2. IP spoofing attempts are filtered with 100% of guarantee. 2075 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2076 * called with rcu_read_lock()
2090 */ 2077 */
2091 2078
2092static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2079static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2094,21 +2081,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094{ 2081{
2095 struct fib_result res; 2082 struct fib_result res;
2096 struct in_device *in_dev = __in_dev_get_rcu(dev); 2083 struct in_device *in_dev = __in_dev_get_rcu(dev);
2097 struct flowi fl = { .nl_u = { .ip4_u = 2084 struct flowi4 fl4;
2098 { .daddr = daddr,
2099 .saddr = saddr,
2100 .tos = tos,
2101 .scope = RT_SCOPE_UNIVERSE,
2102 } },
2103 .mark = skb->mark,
2104 .iif = dev->ifindex };
2105 unsigned flags = 0; 2085 unsigned flags = 0;
2106 u32 itag = 0; 2086 u32 itag = 0;
2107 struct rtable * rth; 2087 struct rtable * rth;
2108 unsigned hash; 2088 unsigned hash;
2109 __be32 spec_dst; 2089 __be32 spec_dst;
2110 int err = -EINVAL; 2090 int err = -EINVAL;
2111 int free_res = 0;
2112 struct net * net = dev_net(dev); 2091 struct net * net = dev_net(dev);
2113 2092
2114 /* IP on this device is disabled. */ 2093 /* IP on this device is disabled. */
@@ -2124,7 +2103,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124 ipv4_is_loopback(saddr)) 2103 ipv4_is_loopback(saddr))
2125 goto martian_source; 2104 goto martian_source;
2126 2105
2127 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) 2106 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2128 goto brd_input; 2107 goto brd_input;
2129 2108
2130 /* Accept zero addresses only to limited broadcast; 2109 /* Accept zero addresses only to limited broadcast;
@@ -2133,19 +2112,25 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2133 if (ipv4_is_zeronet(saddr)) 2112 if (ipv4_is_zeronet(saddr))
2134 goto martian_source; 2113 goto martian_source;
2135 2114
2136 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || 2115 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2137 ipv4_is_loopback(daddr))
2138 goto martian_destination; 2116 goto martian_destination;
2139 2117
2140 /* 2118 /*
2141 * Now we are ready to route packet. 2119 * Now we are ready to route packet.
2142 */ 2120 */
2143 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2121 fl4.flowi4_oif = 0;
2122 fl4.flowi4_iif = dev->ifindex;
2123 fl4.flowi4_mark = skb->mark;
2124 fl4.flowi4_tos = tos;
2125 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2126 fl4.daddr = daddr;
2127 fl4.saddr = saddr;
2128 err = fib_lookup(net, &fl4, &res);
2129 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2130 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2131 goto e_hostunreach;
2146 goto no_route; 2132 goto no_route;
2147 } 2133 }
2148 free_res = 1;
2149 2134
2150 RT_CACHE_STAT_INC(in_slow_tot); 2135 RT_CACHE_STAT_INC(in_slow_tot);
2151 2136
@@ -2153,9 +2138,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2153 goto brd_input; 2138 goto brd_input;
2154 2139
2155 if (res.type == RTN_LOCAL) { 2140 if (res.type == RTN_LOCAL) {
2156 err = fib_validate_source(saddr, daddr, tos, 2141 err = fib_validate_source(skb, saddr, daddr, tos,
2157 net->loopback_dev->ifindex, 2142 net->loopback_dev->ifindex,
2158 dev, &spec_dst, &itag, skb->mark); 2143 dev, &spec_dst, &itag);
2159 if (err < 0) 2144 if (err < 0)
2160 goto martian_source_keep_err; 2145 goto martian_source_keep_err;
2161 if (err) 2146 if (err)
@@ -2169,10 +2154,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2169 if (res.type != RTN_UNICAST) 2154 if (res.type != RTN_UNICAST)
2170 goto martian_destination; 2155 goto martian_destination;
2171 2156
2172 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2157 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2173done:
2174 if (free_res)
2175 fib_res_put(&res);
2176out: return err; 2158out: return err;
2177 2159
2178brd_input: 2160brd_input:
@@ -2182,8 +2164,8 @@ brd_input:
2182 if (ipv4_is_zeronet(saddr)) 2164 if (ipv4_is_zeronet(saddr))
2183 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2165 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2184 else { 2166 else {
2185 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 2167 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2186 &itag, skb->mark); 2168 &itag);
2187 if (err < 0) 2169 if (err < 0)
2188 goto martian_source_keep_err; 2170 goto martian_source_keep_err;
2189 if (err) 2171 if (err)
@@ -2194,45 +2176,48 @@ brd_input:
2194 RT_CACHE_STAT_INC(in_brd); 2176 RT_CACHE_STAT_INC(in_brd);
2195 2177
2196local_input: 2178local_input:
2197 rth = dst_alloc(&ipv4_dst_ops); 2179 rth = rt_dst_alloc(net->loopback_dev,
2180 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2198 if (!rth) 2181 if (!rth)
2199 goto e_nobufs; 2182 goto e_nobufs;
2200 2183
2184 rth->dst.input= ip_local_deliver;
2201 rth->dst.output= ip_rt_bug; 2185 rth->dst.output= ip_rt_bug;
2202 rth->dst.obsolete = -1; 2186#ifdef CONFIG_IP_ROUTE_CLASSID
2203 rth->rt_genid = rt_genid(net); 2187 rth->dst.tclassid = itag;
2188#endif
2204 2189
2205 atomic_set(&rth->dst.__refcnt, 1); 2190 rth->rt_key_dst = daddr;
2206 rth->dst.flags= DST_HOST; 2191 rth->rt_key_src = saddr;
2207 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2192 rth->rt_genid = rt_genid(net);
2208 rth->dst.flags |= DST_NOPOLICY; 2193 rth->rt_flags = flags|RTCF_LOCAL;
2209 rth->fl.fl4_dst = daddr; 2194 rth->rt_type = res.type;
2195 rth->rt_key_tos = tos;
2210 rth->rt_dst = daddr; 2196 rth->rt_dst = daddr;
2211 rth->fl.fl4_tos = tos;
2212 rth->fl.mark = skb->mark;
2213 rth->fl.fl4_src = saddr;
2214 rth->rt_src = saddr; 2197 rth->rt_src = saddr;
2215#ifdef CONFIG_NET_CLS_ROUTE 2198#ifdef CONFIG_IP_ROUTE_CLASSID
2216 rth->dst.tclassid = itag; 2199 rth->dst.tclassid = itag;
2217#endif 2200#endif
2218 rth->rt_iif = 2201 rth->rt_route_iif = dev->ifindex;
2219 rth->fl.iif = dev->ifindex; 2202 rth->rt_iif = dev->ifindex;
2220 rth->dst.dev = net->loopback_dev; 2203 rth->rt_oif = 0;
2221 dev_hold(rth->dst.dev); 2204 rth->rt_mark = skb->mark;
2222 rth->idev = in_dev_get(rth->dst.dev);
2223 rth->rt_gateway = daddr; 2205 rth->rt_gateway = daddr;
2224 rth->rt_spec_dst= spec_dst; 2206 rth->rt_spec_dst= spec_dst;
2225 rth->dst.input= ip_local_deliver; 2207 rth->rt_peer_genid = 0;
2226 rth->rt_flags = flags|RTCF_LOCAL; 2208 rth->peer = NULL;
2209 rth->fi = NULL;
2227 if (res.type == RTN_UNREACHABLE) { 2210 if (res.type == RTN_UNREACHABLE) {
2228 rth->dst.input= ip_error; 2211 rth->dst.input= ip_error;
2229 rth->dst.error= -err; 2212 rth->dst.error= -err;
2230 rth->rt_flags &= ~RTCF_LOCAL; 2213 rth->rt_flags &= ~RTCF_LOCAL;
2231 } 2214 }
2232 rth->rt_type = res.type; 2215 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2233 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2216 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2234 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2217 err = 0;
2235 goto done; 2218 if (IS_ERR(rth))
2219 err = PTR_ERR(rth);
2220 goto out;
2236 2221
2237no_route: 2222no_route:
2238 RT_CACHE_STAT_INC(in_no_route); 2223 RT_CACHE_STAT_INC(in_no_route);
@@ -2255,21 +2240,21 @@ martian_destination:
2255 2240
2256e_hostunreach: 2241e_hostunreach:
2257 err = -EHOSTUNREACH; 2242 err = -EHOSTUNREACH;
2258 goto done; 2243 goto out;
2259 2244
2260e_inval: 2245e_inval:
2261 err = -EINVAL; 2246 err = -EINVAL;
2262 goto done; 2247 goto out;
2263 2248
2264e_nobufs: 2249e_nobufs:
2265 err = -ENOBUFS; 2250 err = -ENOBUFS;
2266 goto done; 2251 goto out;
2267 2252
2268martian_source: 2253martian_source:
2269 err = -EINVAL; 2254 err = -EINVAL;
2270martian_source_keep_err: 2255martian_source_keep_err:
2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2256 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2272 goto done; 2257 goto out;
2273} 2258}
2274 2259
2275int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2260int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2293,12 +2278,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2293 2278
2294 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2279 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2295 rth = rcu_dereference(rth->dst.rt_next)) { 2280 rth = rcu_dereference(rth->dst.rt_next)) {
2296 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | 2281 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2297 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | 2282 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2298 (rth->fl.iif ^ iif) | 2283 (rth->rt_iif ^ iif) |
2299 rth->fl.oif | 2284 rth->rt_oif |
2300 (rth->fl.fl4_tos ^ tos)) == 0 && 2285 (rth->rt_key_tos ^ tos)) == 0 &&
2301 rth->fl.mark == skb->mark && 2286 rth->rt_mark == skb->mark &&
2302 net_eq(dev_net(rth->dst.dev), net) && 2287 net_eq(dev_net(rth->dst.dev), net) &&
2303 !rt_is_expired(rth)) { 2288 !rt_is_expired(rth)) {
2304 if (noref) { 2289 if (noref) {
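
The cache probe above compares its whole key in one expression: each cached field is XORed with the candidate, the results are ORed together, and the sum is tested against zero, so a mismatch in any field is detected without a chain of conditional branches (rt_oif is ORed in directly, since an input route must have it zero). A sketch with a simplified key; the structure below only echoes the field names, it is not the kernel's rtable.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct flow_key {
    uint32_t dst;    /* rt_key_dst  */
    uint32_t src;    /* rt_key_src  */
    int      iif;    /* rt_iif      */
    int      oif;    /* rt_oif, must be 0 for an input route */
    uint32_t tos;    /* rt_key_tos  */
    uint32_t mark;   /* rt_mark     */
};

/* Match an input-route cache entry: any differing field makes one of
 * the XOR terms non-zero, so the OR of all terms is non-zero too. */
static bool input_route_match(const struct flow_key *cached,
                              uint32_t daddr, uint32_t saddr,
                              int iif, uint32_t tos, uint32_t mark)
{
    return (((cached->dst ^ daddr) |
             (cached->src ^ saddr) |
             (uint32_t)(cached->iif ^ iif) |
             (uint32_t)cached->oif |        /* output routes never match */
             (cached->tos ^ tos)) == 0) &&
           cached->mark == mark;
}

int main(void)
{
    struct flow_key k = { 0x0a000001, 0x0a000002, 3, 0, 0x10, 7 };

    printf("%d\n", input_route_match(&k, 0x0a000001, 0x0a000002, 3, 0x10, 7));
    printf("%d\n", input_route_match(&k, 0x0a000001, 0x0a000002, 4, 0x10, 7));
    return 0;
}
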
@@ -2331,8 +2316,8 @@ skip_cache:
2331 struct in_device *in_dev = __in_dev_get_rcu(dev); 2316 struct in_device *in_dev = __in_dev_get_rcu(dev);
2332 2317
2333 if (in_dev) { 2318 if (in_dev) {
2334 int our = ip_check_mc(in_dev, daddr, saddr, 2319 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2335 ip_hdr(skb)->protocol); 2320 ip_hdr(skb)->protocol);
2336 if (our 2321 if (our
2337#ifdef CONFIG_IP_MROUTE 2322#ifdef CONFIG_IP_MROUTE
2338 || 2323 ||
@@ -2355,108 +2340,95 @@ skip_cache:
2355} 2340}
2356EXPORT_SYMBOL(ip_route_input_common); 2341EXPORT_SYMBOL(ip_route_input_common);
2357 2342
2358static int __mkroute_output(struct rtable **result, 2343/* called with rcu_read_lock() */
2359 struct fib_result *res, 2344static struct rtable *__mkroute_output(const struct fib_result *res,
2360 const struct flowi *fl, 2345 const struct flowi4 *fl4,
2361 const struct flowi *oldflp, 2346 __be32 orig_daddr, __be32 orig_saddr,
2362 struct net_device *dev_out, 2347 int orig_oif, struct net_device *dev_out,
2363 unsigned flags) 2348 unsigned int flags)
2364{ 2349{
2365 struct rtable *rth; 2350 struct fib_info *fi = res->fi;
2351 u32 tos = RT_FL_TOS(fl4);
2366 struct in_device *in_dev; 2352 struct in_device *in_dev;
2367 u32 tos = RT_FL_TOS(oldflp); 2353 u16 type = res->type;
2368 int err = 0; 2354 struct rtable *rth;
2369 2355
2370 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2356 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2371 return -EINVAL; 2357 return ERR_PTR(-EINVAL);
2372 2358
2373 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2359 if (ipv4_is_lbcast(fl4->daddr))
2374 res->type = RTN_BROADCAST; 2360 type = RTN_BROADCAST;
2375 else if (ipv4_is_multicast(fl->fl4_dst)) 2361 else if (ipv4_is_multicast(fl4->daddr))
2376 res->type = RTN_MULTICAST; 2362 type = RTN_MULTICAST;
2377 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) 2363 else if (ipv4_is_zeronet(fl4->daddr))
2378 return -EINVAL; 2364 return ERR_PTR(-EINVAL);
2379 2365
2380 if (dev_out->flags & IFF_LOOPBACK) 2366 if (dev_out->flags & IFF_LOOPBACK)
2381 flags |= RTCF_LOCAL; 2367 flags |= RTCF_LOCAL;
2382 2368
2383 /* get work reference to inet device */ 2369 in_dev = __in_dev_get_rcu(dev_out);
2384 in_dev = in_dev_get(dev_out);
2385 if (!in_dev) 2370 if (!in_dev)
2386 return -EINVAL; 2371 return ERR_PTR(-EINVAL);
2387 2372
2388 if (res->type == RTN_BROADCAST) { 2373 if (type == RTN_BROADCAST) {
2389 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2374 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390 if (res->fi) { 2375 fi = NULL;
2391 fib_info_put(res->fi); 2376 } else if (type == RTN_MULTICAST) {
2392 res->fi = NULL; 2377 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2393 } 2378 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2394 } else if (res->type == RTN_MULTICAST) { 2379 fl4->flowi4_proto))
2395 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2396 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2397 oldflp->proto))
2398 flags &= ~RTCF_LOCAL; 2380 flags &= ~RTCF_LOCAL;
2399 /* If multicast route do not exist use 2381 /* If multicast route do not exist use
2400 default one, but do not gateway in this case. 2382 * default one, but do not gateway in this case.
2401 Yes, it is hack. 2383 * Yes, it is hack.
2402 */ 2384 */
2403 if (res->fi && res->prefixlen < 4) { 2385 if (fi && res->prefixlen < 4)
2404 fib_info_put(res->fi); 2386 fi = NULL;
2405 res->fi = NULL;
2406 }
2407 } 2387 }
2408 2388
2389 rth = rt_dst_alloc(dev_out,
2390 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2391 IN_DEV_CONF_GET(in_dev, NOXFRM));
2392 if (!rth)
2393 return ERR_PTR(-ENOBUFS);
2409 2394
2410 rth = dst_alloc(&ipv4_dst_ops); 2395 rth->dst.output = ip_output;
2411 if (!rth) {
2412 err = -ENOBUFS;
2413 goto cleanup;
2414 }
2415 2396
2416 atomic_set(&rth->dst.__refcnt, 1); 2397 rth->rt_key_dst = orig_daddr;
2417 rth->dst.flags= DST_HOST; 2398 rth->rt_key_src = orig_saddr;
2418 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2419 rth->dst.flags |= DST_NOXFRM;
2420 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2421 rth->dst.flags |= DST_NOPOLICY;
2422
2423 rth->fl.fl4_dst = oldflp->fl4_dst;
2424 rth->fl.fl4_tos = tos;
2425 rth->fl.fl4_src = oldflp->fl4_src;
2426 rth->fl.oif = oldflp->oif;
2427 rth->fl.mark = oldflp->mark;
2428 rth->rt_dst = fl->fl4_dst;
2429 rth->rt_src = fl->fl4_src;
2430 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2431 /* get references to the devices that are to be hold by the routing
2432 cache entry */
2433 rth->dst.dev = dev_out;
2434 dev_hold(dev_out);
2435 rth->idev = in_dev_get(dev_out);
2436 rth->rt_gateway = fl->fl4_dst;
2437 rth->rt_spec_dst= fl->fl4_src;
2438
2439 rth->dst.output=ip_output;
2440 rth->dst.obsolete = -1;
2441 rth->rt_genid = rt_genid(dev_net(dev_out)); 2399 rth->rt_genid = rt_genid(dev_net(dev_out));
2400 rth->rt_flags = flags;
2401 rth->rt_type = type;
2402 rth->rt_key_tos = tos;
2403 rth->rt_dst = fl4->daddr;
2404 rth->rt_src = fl4->saddr;
2405 rth->rt_route_iif = 0;
2406 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2407 rth->rt_oif = orig_oif;
2408 rth->rt_mark = fl4->flowi4_mark;
2409 rth->rt_gateway = fl4->daddr;
2410 rth->rt_spec_dst= fl4->saddr;
2411 rth->rt_peer_genid = 0;
2412 rth->peer = NULL;
2413 rth->fi = NULL;
2442 2414
2443 RT_CACHE_STAT_INC(out_slow_tot); 2415 RT_CACHE_STAT_INC(out_slow_tot);
2444 2416
2445 if (flags & RTCF_LOCAL) { 2417 if (flags & RTCF_LOCAL) {
2446 rth->dst.input = ip_local_deliver; 2418 rth->dst.input = ip_local_deliver;
2447 rth->rt_spec_dst = fl->fl4_dst; 2419 rth->rt_spec_dst = fl4->daddr;
2448 } 2420 }
2449 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2421 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2450 rth->rt_spec_dst = fl->fl4_src; 2422 rth->rt_spec_dst = fl4->saddr;
2451 if (flags & RTCF_LOCAL && 2423 if (flags & RTCF_LOCAL &&
2452 !(dev_out->flags & IFF_LOOPBACK)) { 2424 !(dev_out->flags & IFF_LOOPBACK)) {
2453 rth->dst.output = ip_mc_output; 2425 rth->dst.output = ip_mc_output;
2454 RT_CACHE_STAT_INC(out_slow_mc); 2426 RT_CACHE_STAT_INC(out_slow_mc);
2455 } 2427 }
2456#ifdef CONFIG_IP_MROUTE 2428#ifdef CONFIG_IP_MROUTE
2457 if (res->type == RTN_MULTICAST) { 2429 if (type == RTN_MULTICAST) {
2458 if (IN_DEV_MFORWARD(in_dev) && 2430 if (IN_DEV_MFORWARD(in_dev) &&
2459 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2431 !ipv4_is_local_multicast(fl4->daddr)) {
2460 rth->dst.input = ip_mr_input; 2432 rth->dst.input = ip_mr_input;
2461 rth->dst.output = ip_mc_output; 2433 rth->dst.output = ip_mc_output;
2462 } 2434 }
@@ -2464,73 +2436,47 @@ static int __mkroute_output(struct rtable **result,
2464#endif 2436#endif
2465 } 2437 }
2466 2438
2467 rt_set_nexthop(rth, res, 0); 2439 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2468
2469 rth->rt_flags = flags;
2470
2471 *result = rth;
2472 cleanup:
2473 /* release work reference to inet device */
2474 in_dev_put(in_dev);
2475
2476 return err;
2477}
2478
2479static int ip_mkroute_output(struct rtable **rp,
2480 struct fib_result *res,
2481 const struct flowi *fl,
2482 const struct flowi *oldflp,
2483 struct net_device *dev_out,
2484 unsigned flags)
2485{
2486 struct rtable *rth = NULL;
2487 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2488 unsigned hash;
2489 if (err == 0) {
2490 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2491 rt_genid(dev_net(dev_out)));
2492 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2493 }
2494 2440
2495 return err; 2441 return rth;
2496} 2442}
2497 2443
2498/* 2444/*
2499 * Major route resolver routine. 2445 * Major route resolver routine.
2446 * called with rcu_read_lock();
2500 */ 2447 */
2501 2448
2502static int ip_route_output_slow(struct net *net, struct rtable **rp, 2449static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2503 const struct flowi *oldflp) 2450{
2504{
2505 u32 tos = RT_FL_TOS(oldflp);
2506 struct flowi fl = { .nl_u = { .ip4_u =
2507 { .daddr = oldflp->fl4_dst,
2508 .saddr = oldflp->fl4_src,
2509 .tos = tos & IPTOS_RT_MASK,
2510 .scope = ((tos & RTO_ONLINK) ?
2511 RT_SCOPE_LINK :
2512 RT_SCOPE_UNIVERSE),
2513 } },
2514 .mark = oldflp->mark,
2515 .iif = net->loopback_dev->ifindex,
2516 .oif = oldflp->oif };
2517 struct fib_result res;
2518 unsigned flags = 0;
2519 struct net_device *dev_out = NULL; 2451 struct net_device *dev_out = NULL;
2520 int free_res = 0; 2452 u32 tos = RT_FL_TOS(fl4);
2521 int err; 2453 unsigned int flags = 0;
2522 2454 struct fib_result res;
2455 struct rtable *rth;
2456 __be32 orig_daddr;
2457 __be32 orig_saddr;
2458 int orig_oif;
2523 2459
2524 res.fi = NULL; 2460 res.fi = NULL;
2525#ifdef CONFIG_IP_MULTIPLE_TABLES 2461#ifdef CONFIG_IP_MULTIPLE_TABLES
2526 res.r = NULL; 2462 res.r = NULL;
2527#endif 2463#endif
2528 2464
2529 if (oldflp->fl4_src) { 2465 orig_daddr = fl4->daddr;
2530 err = -EINVAL; 2466 orig_saddr = fl4->saddr;
2531 if (ipv4_is_multicast(oldflp->fl4_src) || 2467 orig_oif = fl4->flowi4_oif;
2532 ipv4_is_lbcast(oldflp->fl4_src) || 2468
2533 ipv4_is_zeronet(oldflp->fl4_src)) 2469 fl4->flowi4_iif = net->loopback_dev->ifindex;
2470 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2471 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2472 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2473
2474 rcu_read_lock();
2475 if (fl4->saddr) {
2476 rth = ERR_PTR(-EINVAL);
2477 if (ipv4_is_multicast(fl4->saddr) ||
2478 ipv4_is_lbcast(fl4->saddr) ||
2479 ipv4_is_zeronet(fl4->saddr))
2534 goto out; 2480 goto out;
2535 2481
2536 /* I removed check for oif == dev_out->oif here. 2482 /* I removed check for oif == dev_out->oif here.
@@ -2541,11 +2487,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2541 of another iface. --ANK 2487 of another iface. --ANK
2542 */ 2488 */
2543 2489
2544 if (oldflp->oif == 0 && 2490 if (fl4->flowi4_oif == 0 &&
2545 (ipv4_is_multicast(oldflp->fl4_dst) || 2491 (ipv4_is_multicast(fl4->daddr) ||
2546 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2492 ipv4_is_lbcast(fl4->daddr))) {
2547 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2493 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 dev_out = ip_dev_find(net, oldflp->fl4_src); 2494 dev_out = __ip_dev_find(net, fl4->saddr, false);
2549 if (dev_out == NULL) 2495 if (dev_out == NULL)
2550 goto out; 2496 goto out;
2551 2497
@@ -2564,67 +2510,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2564 Luckily, this hack is good workaround. 2510 Luckily, this hack is good workaround.
2565 */ 2511 */
2566 2512
2567 fl.oif = dev_out->ifindex; 2513 fl4->flowi4_oif = dev_out->ifindex;
2568 goto make_route; 2514 goto make_route;
2569 } 2515 }
2570 2516
2571 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2517 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2572 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2518 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2573 dev_out = ip_dev_find(net, oldflp->fl4_src); 2519 if (!__ip_dev_find(net, fl4->saddr, false))
2574 if (dev_out == NULL)
2575 goto out; 2520 goto out;
2576 dev_put(dev_out);
2577 dev_out = NULL;
2578 } 2521 }
2579 } 2522 }
2580 2523
2581 2524
2582 if (oldflp->oif) { 2525 if (fl4->flowi4_oif) {
2583 dev_out = dev_get_by_index(net, oldflp->oif); 2526 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2584 err = -ENODEV; 2527 rth = ERR_PTR(-ENODEV);
2585 if (dev_out == NULL) 2528 if (dev_out == NULL)
2586 goto out; 2529 goto out;
2587 2530
2588 /* RACE: Check return value of inet_select_addr instead. */ 2531 /* RACE: Check return value of inet_select_addr instead. */
2589 if (__in_dev_get_rtnl(dev_out) == NULL) { 2532 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590 dev_put(dev_out); 2533 rth = ERR_PTR(-ENETUNREACH);
2591 goto out; /* Wrong error code */ 2534 goto out;
2592 } 2535 }
2593 2536 if (ipv4_is_local_multicast(fl4->daddr) ||
2594 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2537 ipv4_is_lbcast(fl4->daddr)) {
2595 oldflp->fl4_dst == htonl(0xFFFFFFFF)) { 2538 if (!fl4->saddr)
2596 if (!fl.fl4_src) 2539 fl4->saddr = inet_select_addr(dev_out, 0,
2597 fl.fl4_src = inet_select_addr(dev_out, 0,
2598 RT_SCOPE_LINK); 2540 RT_SCOPE_LINK);
2599 goto make_route; 2541 goto make_route;
2600 } 2542 }
2601 if (!fl.fl4_src) { 2543 if (fl4->saddr) {
2602 if (ipv4_is_multicast(oldflp->fl4_dst)) 2544 if (ipv4_is_multicast(fl4->daddr))
2603 fl.fl4_src = inet_select_addr(dev_out, 0, 2545 fl4->saddr = inet_select_addr(dev_out, 0,
2604 fl.fl4_scope); 2546 fl4->flowi4_scope);
2605 else if (!oldflp->fl4_dst) 2547 else if (!fl4->daddr)
2606 fl.fl4_src = inet_select_addr(dev_out, 0, 2548 fl4->saddr = inet_select_addr(dev_out, 0,
2607 RT_SCOPE_HOST); 2549 RT_SCOPE_HOST);
2608 } 2550 }
2609 } 2551 }
2610 2552
2611 if (!fl.fl4_dst) { 2553 if (!fl4->daddr) {
2612 fl.fl4_dst = fl.fl4_src; 2554 fl4->daddr = fl4->saddr;
2613 if (!fl.fl4_dst) 2555 if (!fl4->daddr)
2614 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2556 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2615 if (dev_out)
2616 dev_put(dev_out);
2617 dev_out = net->loopback_dev; 2557 dev_out = net->loopback_dev;
2618 dev_hold(dev_out); 2558 fl4->flowi4_oif = net->loopback_dev->ifindex;
2619 fl.oif = net->loopback_dev->ifindex;
2620 res.type = RTN_LOCAL; 2559 res.type = RTN_LOCAL;
2621 flags |= RTCF_LOCAL; 2560 flags |= RTCF_LOCAL;
2622 goto make_route; 2561 goto make_route;
2623 } 2562 }
2624 2563
2625 if (fib_lookup(net, &fl, &res)) { 2564 if (fib_lookup(net, fl4, &res)) {
2626 res.fi = NULL; 2565 res.fi = NULL;
2627 if (oldflp->oif) { 2566 if (fl4->flowi4_oif) {
2628 /* Apparently, routing tables are wrong. Assume, 2567 /* Apparently, routing tables are wrong. Assume,
2629 that the destination is on link. 2568 that the destination is on link.
2630 2569
@@ -2643,98 +2582,100 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2643 likely IPv6, but we do not. 2582 likely IPv6, but we do not.
2644 */ 2583 */
2645 2584
2646 if (fl.fl4_src == 0) 2585 if (fl4->saddr == 0)
2647 fl.fl4_src = inet_select_addr(dev_out, 0, 2586 fl4->saddr = inet_select_addr(dev_out, 0,
2648 RT_SCOPE_LINK); 2587 RT_SCOPE_LINK);
2649 res.type = RTN_UNICAST; 2588 res.type = RTN_UNICAST;
2650 goto make_route; 2589 goto make_route;
2651 } 2590 }
2652 if (dev_out) 2591 rth = ERR_PTR(-ENETUNREACH);
2653 dev_put(dev_out);
2654 err = -ENETUNREACH;
2655 goto out; 2592 goto out;
2656 } 2593 }
2657 free_res = 1;
2658 2594
2659 if (res.type == RTN_LOCAL) { 2595 if (res.type == RTN_LOCAL) {
2660 if (!fl.fl4_src) 2596 if (!fl4->saddr) {
2661 fl.fl4_src = fl.fl4_dst; 2597 if (res.fi->fib_prefsrc)
2662 if (dev_out) 2598 fl4->saddr = res.fi->fib_prefsrc;
2663 dev_put(dev_out); 2599 else
2600 fl4->saddr = fl4->daddr;
2601 }
2664 dev_out = net->loopback_dev; 2602 dev_out = net->loopback_dev;
2665 dev_hold(dev_out); 2603 fl4->flowi4_oif = dev_out->ifindex;
2666 fl.oif = dev_out->ifindex;
2667 if (res.fi)
2668 fib_info_put(res.fi);
2669 res.fi = NULL; 2604 res.fi = NULL;
2670 flags |= RTCF_LOCAL; 2605 flags |= RTCF_LOCAL;
2671 goto make_route; 2606 goto make_route;
2672 } 2607 }
2673 2608
2674#ifdef CONFIG_IP_ROUTE_MULTIPATH 2609#ifdef CONFIG_IP_ROUTE_MULTIPATH
2675 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2610 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2676 fib_select_multipath(&fl, &res); 2611 fib_select_multipath(&res);
2677 else 2612 else
2678#endif 2613#endif
2679 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2614 if (!res.prefixlen &&
2680 fib_select_default(net, &fl, &res); 2615 res.table->tb_num_default > 1 &&
2616 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2617 fib_select_default(&res);
2681 2618
2682 if (!fl.fl4_src) 2619 if (!fl4->saddr)
2683 fl.fl4_src = FIB_RES_PREFSRC(res); 2620 fl4->saddr = FIB_RES_PREFSRC(net, res);
2684 2621
2685 if (dev_out)
2686 dev_put(dev_out);
2687 dev_out = FIB_RES_DEV(res); 2622 dev_out = FIB_RES_DEV(res);
2688 dev_hold(dev_out); 2623 fl4->flowi4_oif = dev_out->ifindex;
2689 fl.oif = dev_out->ifindex;
2690 2624
2691 2625
2692make_route: 2626make_route:
2693 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2627 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2628 dev_out, flags);
2629 if (!IS_ERR(rth)) {
2630 unsigned int hash;
2694 2631
2632 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2633 rt_genid(dev_net(dev_out)));
2634 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2635 }
2695 2636
2696 if (free_res) 2637out:
2697 fib_res_put(&res); 2638 rcu_read_unlock();
2698 if (dev_out) 2639 return rth;
2699 dev_put(dev_out);
2700out: return err;
2701} 2640}
2702 2641
2703int __ip_route_output_key(struct net *net, struct rtable **rp, 2642struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2704 const struct flowi *flp)
2705{ 2643{
2706 unsigned hash;
2707 struct rtable *rth; 2644 struct rtable *rth;
2645 unsigned int hash;
2708 2646
2709 if (!rt_caching(net)) 2647 if (!rt_caching(net))
2710 goto slow_output; 2648 goto slow_output;
2711 2649
2712 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2650 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2713 2651
2714 rcu_read_lock_bh(); 2652 rcu_read_lock_bh();
2715 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2653 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2716 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2654 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2717 if (rth->fl.fl4_dst == flp->fl4_dst && 2655 if (rth->rt_key_dst == flp4->daddr &&
2718 rth->fl.fl4_src == flp->fl4_src && 2656 rth->rt_key_src == flp4->saddr &&
2719 rth->fl.iif == 0 && 2657 rt_is_output_route(rth) &&
2720 rth->fl.oif == flp->oif && 2658 rth->rt_oif == flp4->flowi4_oif &&
2721 rth->fl.mark == flp->mark && 2659 rth->rt_mark == flp4->flowi4_mark &&
2722 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2660 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2723 (IPTOS_RT_MASK | RTO_ONLINK)) && 2661 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2724 net_eq(dev_net(rth->dst.dev), net) && 2662 net_eq(dev_net(rth->dst.dev), net) &&
2725 !rt_is_expired(rth)) { 2663 !rt_is_expired(rth)) {
2726 dst_use(&rth->dst, jiffies); 2664 dst_use(&rth->dst, jiffies);
2727 RT_CACHE_STAT_INC(out_hit); 2665 RT_CACHE_STAT_INC(out_hit);
2728 rcu_read_unlock_bh(); 2666 rcu_read_unlock_bh();
2729 *rp = rth; 2667 if (!flp4->saddr)
2730 return 0; 2668 flp4->saddr = rth->rt_src;
2669 if (!flp4->daddr)
2670 flp4->daddr = rth->rt_dst;
2671 return rth;
2731 } 2672 }
2732 RT_CACHE_STAT_INC(out_hlist_search); 2673 RT_CACHE_STAT_INC(out_hlist_search);
2733 } 2674 }
2734 rcu_read_unlock_bh(); 2675 rcu_read_unlock_bh();
2735 2676
2736slow_output: 2677slow_output:
2737 return ip_route_output_slow(net, rp, flp); 2678 return ip_route_output_slow(net, flp4);
2738} 2679}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2680EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2681
@@ -2743,95 +2684,96 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
2743 return NULL; 2684 return NULL;
2744} 2685}
2745 2686
2687static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2688{
2689 return 0;
2690}
2691
2746static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2692static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2747{ 2693{
2748} 2694}
2749 2695
2696static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2697 unsigned long old)
2698{
2699 return NULL;
2700}
2701
2750static struct dst_ops ipv4_dst_blackhole_ops = { 2702static struct dst_ops ipv4_dst_blackhole_ops = {
2751 .family = AF_INET, 2703 .family = AF_INET,
2752 .protocol = cpu_to_be16(ETH_P_IP), 2704 .protocol = cpu_to_be16(ETH_P_IP),
2753 .destroy = ipv4_dst_destroy, 2705 .destroy = ipv4_dst_destroy,
2754 .check = ipv4_blackhole_dst_check, 2706 .check = ipv4_blackhole_dst_check,
2707 .default_mtu = ipv4_blackhole_default_mtu,
2708 .default_advmss = ipv4_default_advmss,
2755 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2709 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2756 .entries = ATOMIC_INIT(0), 2710 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2757}; 2711};
2758 2712
2759 2713struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2760static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2761{ 2714{
2762 struct rtable *ort = *rp; 2715 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2763 struct rtable *rt = (struct rtable *) 2716 struct rtable *ort = (struct rtable *) dst_orig;
2764 dst_alloc(&ipv4_dst_blackhole_ops);
2765 2717
2766 if (rt) { 2718 if (rt) {
2767 struct dst_entry *new = &rt->dst; 2719 struct dst_entry *new = &rt->dst;
2768 2720
2769 atomic_set(&new->__refcnt, 1);
2770 new->__use = 1; 2721 new->__use = 1;
2771 new->input = dst_discard; 2722 new->input = dst_discard;
2772 new->output = dst_discard; 2723 new->output = dst_discard;
2773 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); 2724 dst_copy_metrics(new, &ort->dst);
2774 2725
2775 new->dev = ort->dst.dev; 2726 new->dev = ort->dst.dev;
2776 if (new->dev) 2727 if (new->dev)
2777 dev_hold(new->dev); 2728 dev_hold(new->dev);
2778 2729
2779 rt->fl = ort->fl; 2730 rt->rt_key_dst = ort->rt_key_dst;
2731 rt->rt_key_src = ort->rt_key_src;
2732 rt->rt_key_tos = ort->rt_key_tos;
2733 rt->rt_route_iif = ort->rt_route_iif;
2734 rt->rt_iif = ort->rt_iif;
2735 rt->rt_oif = ort->rt_oif;
2736 rt->rt_mark = ort->rt_mark;
2780 2737
2781 rt->idev = ort->idev;
2782 if (rt->idev)
2783 in_dev_hold(rt->idev);
2784 rt->rt_genid = rt_genid(net); 2738 rt->rt_genid = rt_genid(net);
2785 rt->rt_flags = ort->rt_flags; 2739 rt->rt_flags = ort->rt_flags;
2786 rt->rt_type = ort->rt_type; 2740 rt->rt_type = ort->rt_type;
2787 rt->rt_dst = ort->rt_dst; 2741 rt->rt_dst = ort->rt_dst;
2788 rt->rt_src = ort->rt_src; 2742 rt->rt_src = ort->rt_src;
2789 rt->rt_iif = ort->rt_iif;
2790 rt->rt_gateway = ort->rt_gateway; 2743 rt->rt_gateway = ort->rt_gateway;
2791 rt->rt_spec_dst = ort->rt_spec_dst; 2744 rt->rt_spec_dst = ort->rt_spec_dst;
2792 rt->peer = ort->peer; 2745 rt->peer = ort->peer;
2793 if (rt->peer) 2746 if (rt->peer)
2794 atomic_inc(&rt->peer->refcnt); 2747 atomic_inc(&rt->peer->refcnt);
2748 rt->fi = ort->fi;
2749 if (rt->fi)
2750 atomic_inc(&rt->fi->fib_clntref);
2795 2751
2796 dst_free(new); 2752 dst_free(new);
2797 } 2753 }
2798 2754
2799 dst_release(&(*rp)->dst); 2755 dst_release(dst_orig);
2800 *rp = rt; 2756
2801 return (rt ? 0 : -ENOMEM); 2757 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2802} 2758}
2803 2759
2804int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2760struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2805 struct sock *sk, int flags) 2761 struct sock *sk)
2806{ 2762{
2807 int err; 2763 struct rtable *rt = __ip_route_output_key(net, flp4);
2808
2809 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2810 return err;
2811 2764
2812 if (flp->proto) { 2765 if (IS_ERR(rt))
2813 if (!flp->fl4_src) 2766 return rt;
2814 flp->fl4_src = (*rp)->rt_src;
2815 if (!flp->fl4_dst)
2816 flp->fl4_dst = (*rp)->rt_dst;
2817 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2818 flags ? XFRM_LOOKUP_WAIT : 0);
2819 if (err == -EREMOTE)
2820 err = ipv4_dst_blackhole(net, rp, flp);
2821 2767
2822 return err; 2768 if (flp4->flowi4_proto)
2823 } 2769 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2770 flowi4_to_flowi(flp4),
2771 sk, 0);
2824 2772
2825 return 0; 2773 return rt;
2826} 2774}
2827EXPORT_SYMBOL_GPL(ip_route_output_flow); 2775EXPORT_SYMBOL_GPL(ip_route_output_flow);
2828 2776
2829int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2830{
2831 return ip_route_output_flow(net, rp, flp, NULL, 0);
2832}
2833EXPORT_SYMBOL(ip_route_output_key);
2834
2835static int rt_fill_info(struct net *net, 2777static int rt_fill_info(struct net *net,
2836 struct sk_buff *skb, u32 pid, u32 seq, int event, 2778 struct sk_buff *skb, u32 pid, u32 seq, int event,
2837 int nowait, unsigned int flags) 2779 int nowait, unsigned int flags)
@@ -2839,7 +2781,8 @@ static int rt_fill_info(struct net *net,
2839 struct rtable *rt = skb_rtable(skb); 2781 struct rtable *rt = skb_rtable(skb);
2840 struct rtmsg *r; 2782 struct rtmsg *r;
2841 struct nlmsghdr *nlh; 2783 struct nlmsghdr *nlh;
2842 long expires; 2784 long expires = 0;
2785 const struct inet_peer *peer = rt->peer;
2843 u32 id = 0, ts = 0, tsage = 0, error; 2786 u32 id = 0, ts = 0, tsage = 0, error;
2844 2787
2845 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2788 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
@@ -2850,7 +2793,7 @@ static int rt_fill_info(struct net *net,
2850 r->rtm_family = AF_INET; 2793 r->rtm_family = AF_INET;
2851 r->rtm_dst_len = 32; 2794 r->rtm_dst_len = 32;
2852 r->rtm_src_len = 0; 2795 r->rtm_src_len = 0;
2853 r->rtm_tos = rt->fl.fl4_tos; 2796 r->rtm_tos = rt->rt_key_tos;
2854 r->rtm_table = RT_TABLE_MAIN; 2797 r->rtm_table = RT_TABLE_MAIN;
2855 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2798 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2856 r->rtm_type = rt->rt_type; 2799 r->rtm_type = rt->rt_type;
@@ -2862,48 +2805,52 @@ static int rt_fill_info(struct net *net,
2862 2805
2863 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2806 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2864 2807
2865 if (rt->fl.fl4_src) { 2808 if (rt->rt_key_src) {
2866 r->rtm_src_len = 32; 2809 r->rtm_src_len = 32;
2867 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2810 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2868 } 2811 }
2869 if (rt->dst.dev) 2812 if (rt->dst.dev)
2870 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2813 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2871#ifdef CONFIG_NET_CLS_ROUTE 2814#ifdef CONFIG_IP_ROUTE_CLASSID
2872 if (rt->dst.tclassid) 2815 if (rt->dst.tclassid)
2873 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2816 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2874#endif 2817#endif
2875 if (rt->fl.iif) 2818 if (rt_is_input_route(rt))
2876 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2819 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2877 else if (rt->rt_src != rt->fl.fl4_src) 2820 else if (rt->rt_src != rt->rt_key_src)
2878 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2821 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2879 2822
2880 if (rt->rt_dst != rt->rt_gateway) 2823 if (rt->rt_dst != rt->rt_gateway)
2881 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2824 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2882 2825
2883 if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) 2826 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2884 goto nla_put_failure; 2827 goto nla_put_failure;
2885 2828
2886 if (rt->fl.mark) 2829 if (rt->rt_mark)
2887 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); 2830 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2888 2831
2889 error = rt->dst.error; 2832 error = rt->dst.error;
2890 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 2833 if (peer) {
2891 if (rt->peer) {
2892 inet_peer_refcheck(rt->peer); 2834 inet_peer_refcheck(rt->peer);
2893 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2835 id = atomic_read(&peer->ip_id_count) & 0xffff;
2894 if (rt->peer->tcp_ts_stamp) { 2836 if (peer->tcp_ts_stamp) {
2895 ts = rt->peer->tcp_ts; 2837 ts = peer->tcp_ts;
2896 tsage = get_seconds() - rt->peer->tcp_ts_stamp; 2838 tsage = get_seconds() - peer->tcp_ts_stamp;
2897 } 2839 }
2840 expires = ACCESS_ONCE(peer->pmtu_expires);
2841 if (expires)
2842 expires -= jiffies;
2898 } 2843 }
2899 2844
2900 if (rt->fl.iif) { 2845 if (rt_is_input_route(rt)) {
2901#ifdef CONFIG_IP_MROUTE 2846#ifdef CONFIG_IP_MROUTE
2902 __be32 dst = rt->rt_dst; 2847 __be32 dst = rt->rt_dst;
2903 2848
2904 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2849 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2905 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2850 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2906 int err = ipmr_get_route(net, skb, r, nowait); 2851 int err = ipmr_get_route(net, skb,
2852 rt->rt_src, rt->rt_dst,
2853 r, nowait);
2907 if (err <= 0) { 2854 if (err <= 0) {
2908 if (!nowait) { 2855 if (!nowait) {
2909 if (err == 0) 2856 if (err == 0)
@@ -2917,7 +2864,7 @@ static int rt_fill_info(struct net *net,
2917 } 2864 }
2918 } else 2865 } else
2919#endif 2866#endif
2920 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2867 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2921 } 2868 }
2922 2869
2923 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2870 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -2991,18 +2938,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2991 if (err == 0 && rt->dst.error) 2938 if (err == 0 && rt->dst.error)
2992 err = -rt->dst.error; 2939 err = -rt->dst.error;
2993 } else { 2940 } else {
2994 struct flowi fl = { 2941 struct flowi4 fl4 = {
2995 .nl_u = { 2942 .daddr = dst,
2996 .ip4_u = { 2943 .saddr = src,
2997 .daddr = dst, 2944 .flowi4_tos = rtm->rtm_tos,
2998 .saddr = src, 2945 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2999 .tos = rtm->rtm_tos, 2946 .flowi4_mark = mark,
3000 },
3001 },
3002 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3003 .mark = mark,
3004 }; 2947 };
3005 err = ip_route_output_key(net, &rt, &fl); 2948 rt = ip_route_output_key(net, &fl4);
2949
2950 err = 0;
2951 if (IS_ERR(rt))
2952 err = PTR_ERR(rt);
3006 } 2953 }
3007 2954
3008 if (err) 2955 if (err)
@@ -3285,6 +3232,8 @@ static __net_init int rt_genid_init(struct net *net)
3285{ 3232{
3286 get_random_bytes(&net->ipv4.rt_genid, 3233 get_random_bytes(&net->ipv4.rt_genid,
3287 sizeof(net->ipv4.rt_genid)); 3234 sizeof(net->ipv4.rt_genid));
3235 get_random_bytes(&net->ipv4.dev_addr_genid,
3236 sizeof(net->ipv4.dev_addr_genid));
3288 return 0; 3237 return 0;
3289} 3238}
3290 3239
@@ -3293,9 +3242,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3293}; 3242};
3294 3243
3295 3244
3296#ifdef CONFIG_NET_CLS_ROUTE 3245#ifdef CONFIG_IP_ROUTE_CLASSID
3297struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3246struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3298#endif /* CONFIG_NET_CLS_ROUTE */ 3247#endif /* CONFIG_IP_ROUTE_CLASSID */
3299 3248
3300static __initdata unsigned long rhash_entries; 3249static __initdata unsigned long rhash_entries;
3301static int __init set_rhash_entries(char *str) 3250static int __init set_rhash_entries(char *str)
@@ -3311,7 +3260,7 @@ int __init ip_rt_init(void)
3311{ 3260{
3312 int rc = 0; 3261 int rc = 0;
3313 3262
3314#ifdef CONFIG_NET_CLS_ROUTE 3263#ifdef CONFIG_IP_ROUTE_CLASSID
3315 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3264 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3316 if (!ip_rt_acct) 3265 if (!ip_rt_acct)
3317 panic("IP: failed to allocate ip_rt_acct\n"); 3266 panic("IP: failed to allocate ip_rt_acct\n");
@@ -3323,6 +3272,12 @@ int __init ip_rt_init(void)
3323 3272
3324 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3273 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3325 3274
3275 if (dst_entries_init(&ipv4_dst_ops) < 0)
3276 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3277
3278 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3279 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3280
3326 rt_hash_table = (struct rt_hash_bucket *) 3281 rt_hash_table = (struct rt_hash_bucket *)
3327 alloc_large_system_hash("IP route cache", 3282 alloc_large_system_hash("IP route cache",
3328 sizeof(struct rt_hash_bucket), 3283 sizeof(struct rt_hash_bucket),
@@ -3342,14 +3297,6 @@ int __init ip_rt_init(void)
3342 devinet_init(); 3297 devinet_init();
3343 ip_fib_init(); 3298 ip_fib_init();
3344 3299
3345 /* All the timers, started at system startup tend
3346 to synchronize. Perturb it a bit.
3347 */
3348 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3349 expires_ljiffies = jiffies;
3350 schedule_delayed_work(&expires_work,
3351 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3352
3353 if (ip_rt_proc_init()) 3300 if (ip_rt_proc_init())
3354 printk(KERN_ERR "Unable to create route proc files\n"); 3301 printk(KERN_ERR "Unable to create route proc files\n");
3355#ifdef CONFIG_XFRM 3302#ifdef CONFIG_XFRM
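Note: the route.c hunks above convert the IPv4 output path from the old struct flowi plus struct rtable ** calling convention to struct flowi4 lookups that hand back the route directly, with errors encoded as ERR_PTR() values. A minimal caller-side sketch under that convention (net, dst, src, oif and mark are illustrative placeholders, not taken from the patch):

	struct flowi4 fl4 = {
		.daddr       = dst,	/* destination to route */
		.saddr       = src,	/* may be 0; the lookup fills it in */
		.flowi4_oif  = oif,
		.flowi4_mark = mark,
	};
	struct rtable *rt = ip_route_output_key(net, &fl4);

	if (IS_ERR(rt))
		return PTR_ERR(rt);	/* e.g. -ENETUNREACH; no **rp out-parameter anymore */
	/* ... use rt, then drop the reference with ip_rt_put(rt) ... */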
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650cace2180d..26461492a847 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -321,10 +321,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
321 * the ACK carries the same options again (see RFC1122 4.2.3.8) 321 * the ACK carries the same options again (see RFC1122 4.2.3.8)
322 */ 322 */
323 if (opt && opt->optlen) { 323 if (opt && opt->optlen) {
324 int opt_size = sizeof(struct ip_options) + opt->optlen; 324 int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
325 325
326 ireq->opt = kmalloc(opt_size, GFP_ATOMIC); 326 ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
327 if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) { 327 if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
328 kfree(ireq->opt); 328 kfree(ireq->opt);
329 ireq->opt = NULL; 329 ireq->opt = NULL;
330 } 330 }
@@ -345,20 +345,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
345 * no easy way to do this. 345 * no easy way to do this.
346 */ 346 */
347 { 347 {
348 struct flowi fl = { .mark = sk->sk_mark, 348 struct flowi4 fl4;
349 .nl_u = { .ip4_u = 349
350 { .daddr = ((opt && opt->srr) ? 350 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
351 opt->faddr : 351 RT_SCOPE_UNIVERSE, IPPROTO_TCP,
352 ireq->rmt_addr), 352 inet_sk_flowi_flags(sk),
353 .saddr = ireq->loc_addr, 353 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
354 .tos = RT_CONN_FLAGS(sk) } }, 354 ireq->loc_addr, th->source, th->dest);
355 .proto = IPPROTO_TCP, 355 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
356 .flags = inet_sk_flowi_flags(sk), 356 rt = ip_route_output_key(sock_net(sk), &fl4);
357 .uli_u = { .ports = 357 if (IS_ERR(rt)) {
358 { .sport = th->dest,
359 .dport = th->source } } };
360 security_req_classify_flow(req, &fl);
361 if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
362 reqsk_free(req); 358 reqsk_free(req);
363 goto out; 359 goto out;
364 } 360 }
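Note: flowi4_init_output() in the hunk above replaces the hand-rolled designated-initializer block. Read positionally, the call shown packs the flow key as sketched here (only annotations are added, the arguments are the ones from the hunk):

	flowi4_init_output(&fl4,
			   0,				/* output interface (oif) */
			   sk->sk_mark,			/* mark */
			   RT_CONN_FLAGS(sk),		/* tos */
			   RT_SCOPE_UNIVERSE,		/* scope */
			   IPPROTO_TCP,			/* protocol */
			   inet_sk_flowi_flags(sk),	/* flags */
			   (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,	/* daddr */
			   ireq->loc_addr,		/* saddr */
			   th->source, th->dest);	/* dport, sport of the reply flow */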
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d96c1da4b17c..57d0752e239a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -13,6 +13,7 @@
13#include <linux/seqlock.h> 13#include <linux/seqlock.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/nsproxy.h>
16#include <net/snmp.h> 17#include <net/snmp.h>
17#include <net/icmp.h> 18#include <net/icmp.h>
18#include <net/ip.h> 19#include <net/ip.h>
@@ -21,11 +22,18 @@
21#include <net/udp.h> 22#include <net/udp.h>
22#include <net/cipso_ipv4.h> 23#include <net/cipso_ipv4.h>
23#include <net/inet_frag.h> 24#include <net/inet_frag.h>
25#include <net/ping.h>
24 26
25static int zero; 27static int zero;
26static int tcp_retr1_max = 255; 28static int tcp_retr1_max = 255;
27static int ip_local_port_range_min[] = { 1, 1 }; 29static int ip_local_port_range_min[] = { 1, 1 };
28static int ip_local_port_range_max[] = { 65535, 65535 }; 30static int ip_local_port_range_max[] = { 65535, 65535 };
31static int tcp_adv_win_scale_min = -31;
32static int tcp_adv_win_scale_max = 31;
33static int ip_ttl_min = 1;
34static int ip_ttl_max = 255;
35static int ip_ping_group_range_min[] = { 0, 0 };
36static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
29 37
30/* Update system visible IP port range */ 38/* Update system visible IP port range */
31static void set_local_port_range(int range[2]) 39static void set_local_port_range(int range[2])
@@ -64,6 +72,53 @@ static int ipv4_local_port_range(ctl_table *table, int write,
64 return ret; 72 return ret;
65} 73}
66 74
75
76void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
77{
78 gid_t *data = table->data;
79 unsigned seq;
80 do {
81 seq = read_seqbegin(&sysctl_local_ports.lock);
82
83 *low = data[0];
84 *high = data[1];
85 } while (read_seqretry(&sysctl_local_ports.lock, seq));
86}
87
88/* Update system visible IP port range */
89static void set_ping_group_range(struct ctl_table *table, int range[2])
90{
91 gid_t *data = table->data;
92 write_seqlock(&sysctl_local_ports.lock);
93 data[0] = range[0];
94 data[1] = range[1];
95 write_sequnlock(&sysctl_local_ports.lock);
96}
97
98/* Validate changes from /proc interface. */
99static int ipv4_ping_group_range(ctl_table *table, int write,
100 void __user *buffer,
101 size_t *lenp, loff_t *ppos)
102{
103 int ret;
104 gid_t range[2];
105 ctl_table tmp = {
106 .data = &range,
107 .maxlen = sizeof(range),
108 .mode = table->mode,
109 .extra1 = &ip_ping_group_range_min,
110 .extra2 = &ip_ping_group_range_max,
111 };
112
113 inet_get_ping_group_range_table(table, range, range + 1);
114 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
115
116 if (write && ret == 0)
117 set_ping_group_range(table, range);
118
119 return ret;
120}
121
67static int proc_tcp_congestion_control(ctl_table *ctl, int write, 122static int proc_tcp_congestion_control(ctl_table *ctl, int write,
68 void __user *buffer, size_t *lenp, loff_t *ppos) 123 void __user *buffer, size_t *lenp, loff_t *ppos)
69{ 124{
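Note: ping_group_range is the policy knob for the unprivileged ICMP ("ping") sockets this series introduces (see the new <net/ping.h> include earlier in this file); the per-netns default set later in this file, "1 0", means no group qualifies. A sketch of the intended use, with gid 1000 purely illustrative:

	/* admin side:  echo "1000 1000" > /proc/sys/net/ipv4/ping_group_range */

	/* a process whose gid falls inside that range can then open an ICMP
	 * datagram socket without CAP_NET_RAW: */
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP);
	if (fd < 0)
		perror("ping socket");	/* refused (typically EACCES) outside the range */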
@@ -153,8 +208,9 @@ static struct ctl_table ipv4_table[] = {
153 .data = &sysctl_ip_default_ttl, 208 .data = &sysctl_ip_default_ttl,
154 .maxlen = sizeof(int), 209 .maxlen = sizeof(int),
155 .mode = 0644, 210 .mode = 0644,
156 .proc_handler = ipv4_doint_and_flush, 211 .proc_handler = proc_dointvec_minmax,
157 .extra2 = &init_net, 212 .extra1 = &ip_ttl_min,
213 .extra2 = &ip_ttl_max,
158 }, 214 },
159 { 215 {
160 .procname = "ip_no_pmtu_disc", 216 .procname = "ip_no_pmtu_disc",
@@ -306,7 +362,6 @@ static struct ctl_table ipv4_table[] = {
306 .mode = 0644, 362 .mode = 0644,
307 .proc_handler = proc_do_large_bitmap, 363 .proc_handler = proc_do_large_bitmap,
308 }, 364 },
309#ifdef CONFIG_IP_MULTICAST
310 { 365 {
311 .procname = "igmp_max_memberships", 366 .procname = "igmp_max_memberships",
312 .data = &sysctl_igmp_max_memberships, 367 .data = &sysctl_igmp_max_memberships,
@@ -314,8 +369,6 @@ static struct ctl_table ipv4_table[] = {
314 .mode = 0644, 369 .mode = 0644,
315 .proc_handler = proc_dointvec 370 .proc_handler = proc_dointvec
316 }, 371 },
317
318#endif
319 { 372 {
320 .procname = "igmp_max_msf", 373 .procname = "igmp_max_msf",
321 .data = &sysctl_igmp_max_msf, 374 .data = &sysctl_igmp_max_msf,
@@ -398,7 +451,7 @@ static struct ctl_table ipv4_table[] = {
398 .data = &sysctl_tcp_mem, 451 .data = &sysctl_tcp_mem,
399 .maxlen = sizeof(sysctl_tcp_mem), 452 .maxlen = sizeof(sysctl_tcp_mem),
400 .mode = 0644, 453 .mode = 0644,
401 .proc_handler = proc_dointvec 454 .proc_handler = proc_doulongvec_minmax
402 }, 455 },
403 { 456 {
404 .procname = "tcp_wmem", 457 .procname = "tcp_wmem",
@@ -426,7 +479,9 @@ static struct ctl_table ipv4_table[] = {
426 .data = &sysctl_tcp_adv_win_scale, 479 .data = &sysctl_tcp_adv_win_scale,
427 .maxlen = sizeof(int), 480 .maxlen = sizeof(int),
428 .mode = 0644, 481 .mode = 0644,
429 .proc_handler = proc_dointvec 482 .proc_handler = proc_dointvec_minmax,
483 .extra1 = &tcp_adv_win_scale_min,
484 .extra2 = &tcp_adv_win_scale_max,
430 }, 485 },
431 { 486 {
432 .procname = "tcp_tw_reuse", 487 .procname = "tcp_tw_reuse",
@@ -602,8 +657,7 @@ static struct ctl_table ipv4_table[] = {
602 .data = &sysctl_udp_mem, 657 .data = &sysctl_udp_mem,
603 .maxlen = sizeof(sysctl_udp_mem), 658 .maxlen = sizeof(sysctl_udp_mem),
604 .mode = 0644, 659 .mode = 0644,
605 .proc_handler = proc_dointvec_minmax, 660 .proc_handler = proc_doulongvec_minmax,
606 .extra1 = &zero
607 }, 661 },
608 { 662 {
609 .procname = "udp_rmem_min", 663 .procname = "udp_rmem_min",
@@ -674,6 +728,13 @@ static struct ctl_table ipv4_net_table[] = {
674 .mode = 0644, 728 .mode = 0644,
675 .proc_handler = proc_dointvec 729 .proc_handler = proc_dointvec
676 }, 730 },
731 {
732 .procname = "ping_group_range",
733 .data = &init_net.ipv4.sysctl_ping_group_range,
734 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
735 .mode = 0644,
736 .proc_handler = ipv4_ping_group_range,
737 },
677 { } 738 { }
678}; 739};
679 740
@@ -708,8 +769,18 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
708 &net->ipv4.sysctl_icmp_ratemask; 769 &net->ipv4.sysctl_icmp_ratemask;
709 table[6].data = 770 table[6].data =
710 &net->ipv4.sysctl_rt_cache_rebuild_count; 771 &net->ipv4.sysctl_rt_cache_rebuild_count;
772 table[7].data =
773 &net->ipv4.sysctl_ping_group_range;
774
711 } 775 }
712 776
777 /*
778 * Sane defaults - nobody may create ping sockets.
779 * Boot scripts should set this to distro-specific group.
780 */
781 net->ipv4.sysctl_ping_group_range[0] = 1;
782 net->ipv4.sysctl_ping_group_range[1] = 0;
783
713 net->ipv4.sysctl_rt_cache_rebuild_count = 4; 784 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
714 785
715 net->ipv4.ipv4_hdr = register_net_sysctl_table(net, 786 net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f115ea68a4ef..46febcacb729 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
282struct percpu_counter tcp_orphan_count; 282struct percpu_counter tcp_orphan_count;
283EXPORT_SYMBOL_GPL(tcp_orphan_count); 283EXPORT_SYMBOL_GPL(tcp_orphan_count);
284 284
285int sysctl_tcp_mem[3] __read_mostly; 285long sysctl_tcp_mem[3] __read_mostly;
286int sysctl_tcp_wmem[3] __read_mostly; 286int sysctl_tcp_wmem[3] __read_mostly;
287int sysctl_tcp_rmem[3] __read_mostly; 287int sysctl_tcp_rmem[3] __read_mostly;
288 288
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
290EXPORT_SYMBOL(sysctl_tcp_rmem); 290EXPORT_SYMBOL(sysctl_tcp_rmem);
291EXPORT_SYMBOL(sysctl_tcp_wmem); 291EXPORT_SYMBOL(sysctl_tcp_wmem);
292 292
293atomic_t tcp_memory_allocated; /* Current allocated memory. */ 293atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
294EXPORT_SYMBOL(tcp_memory_allocated); 294EXPORT_SYMBOL(tcp_memory_allocated);
295 295
296/* 296/*
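Note: sysctl_tcp_mem[] and tcp_memory_allocated switch from int/atomic_t to long/atomic_long_t so that page-count limits and the global allocation counter cannot wrap on large-memory 64-bit machines; the matching handler switch (proc_dointvec to proc_doulongvec_minmax for tcp_mem and udp_mem) appears in the sysctl_net_ipv4.c hunks above.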
@@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
505 else 505 else
506 answ = tp->write_seq - tp->snd_una; 506 answ = tp->write_seq - tp->snd_una;
507 break; 507 break;
508 case SIOCOUTQNSD:
509 if (sk->sk_state == TCP_LISTEN)
510 return -EINVAL;
511
512 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
513 answ = 0;
514 else
515 answ = tp->write_seq - tp->snd_nxt;
516 break;
508 default: 517 default:
509 return -ENOIOCTLCMD; 518 return -ENOIOCTLCMD;
510 } 519 }
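Note: SIOCOUTQNSD reports bytes queued but not yet sent (write_seq - snd_nxt), whereas the existing SIOCOUTQ case above reports bytes not yet acknowledged (write_seq - snd_una). A minimal userspace sketch, assuming an established TCP socket fd:

	int unsent = 0;
	if (ioctl(fd, SIOCOUTQNSD, &unsent) == 0)	/* constant from <linux/sockios.h> */
		printf("bytes still unsent in the send queue: %d\n", unsent);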
@@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
873 flags); 882 flags);
874 883
875 lock_sock(sk); 884 lock_sock(sk);
876 TCP_CHECK_TIMER(sk);
877 res = do_tcp_sendpages(sk, &page, offset, size, flags); 885 res = do_tcp_sendpages(sk, &page, offset, size, flags);
878 TCP_CHECK_TIMER(sk);
879 release_sock(sk); 886 release_sock(sk);
880 return res; 887 return res;
881} 888}
@@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
916 long timeo; 923 long timeo;
917 924
918 lock_sock(sk); 925 lock_sock(sk);
919 TCP_CHECK_TIMER(sk);
920 926
921 flags = msg->msg_flags; 927 flags = msg->msg_flags;
922 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 928 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -993,7 +999,8 @@ new_segment:
993 /* We have some space in skb head. Superb! */ 999 /* We have some space in skb head. Superb! */
994 if (copy > skb_tailroom(skb)) 1000 if (copy > skb_tailroom(skb))
995 copy = skb_tailroom(skb); 1001 copy = skb_tailroom(skb);
996 if ((err = skb_add_data(skb, from, copy)) != 0) 1002 err = skb_add_data_nocache(sk, skb, from, copy);
1003 if (err)
997 goto do_fault; 1004 goto do_fault;
998 } else { 1005 } else {
999 int merge = 0; 1006 int merge = 0;
@@ -1036,8 +1043,8 @@ new_segment:
1036 1043
1037 /* Time to copy data. We are close to 1044 /* Time to copy data. We are close to
1038 * the end! */ 1045 * the end! */
1039 err = skb_copy_to_page(sk, from, skb, page, 1046 err = skb_copy_to_page_nocache(sk, from, skb,
1040 off, copy); 1047 page, off, copy);
1041 if (err) { 1048 if (err) {
1042 /* If this page was new, give it to the 1049 /* If this page was new, give it to the
1043 * socket so it does not get leaked. 1050 * socket so it does not get leaked.
@@ -1104,7 +1111,6 @@ wait_for_memory:
1104out: 1111out:
1105 if (copied) 1112 if (copied)
1106 tcp_push(sk, flags, mss_now, tp->nonagle); 1113 tcp_push(sk, flags, mss_now, tp->nonagle);
1107 TCP_CHECK_TIMER(sk);
1108 release_sock(sk); 1114 release_sock(sk);
1109 return copied; 1115 return copied;
1110 1116
@@ -1123,7 +1129,6 @@ do_error:
1123 goto out; 1129 goto out;
1124out_err: 1130out_err:
1125 err = sk_stream_error(sk, flags, err); 1131 err = sk_stream_error(sk, flags, err);
1126 TCP_CHECK_TIMER(sk);
1127 release_sock(sk); 1132 release_sock(sk);
1128 return err; 1133 return err;
1129} 1134}
@@ -1193,7 +1198,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1193 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1198 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1194 1199
1195 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), 1200 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1196 KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", 1201 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1197 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); 1202 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1198#endif 1203#endif
1199 1204
@@ -1415,8 +1420,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1415 1420
1416 lock_sock(sk); 1421 lock_sock(sk);
1417 1422
1418 TCP_CHECK_TIMER(sk);
1419
1420 err = -ENOTCONN; 1423 err = -ENOTCONN;
1421 if (sk->sk_state == TCP_LISTEN) 1424 if (sk->sk_state == TCP_LISTEN)
1422 goto out; 1425 goto out;
@@ -1477,10 +1480,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1477 * shouldn't happen. 1480 * shouldn't happen.
1478 */ 1481 */
1479 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), 1482 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
1480 KERN_INFO "recvmsg bug: copied %X " 1483 "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
1481 "seq %X rcvnxt %X fl %X\n", *seq, 1484 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
1482 TCP_SKB_CB(skb)->seq, tp->rcv_nxt, 1485 flags))
1483 flags))
1484 break; 1486 break;
1485 1487
1486 offset = *seq - TCP_SKB_CB(skb)->seq; 1488 offset = *seq - TCP_SKB_CB(skb)->seq;
@@ -1490,10 +1492,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1490 goto found_ok_skb; 1492 goto found_ok_skb;
1491 if (tcp_hdr(skb)->fin) 1493 if (tcp_hdr(skb)->fin)
1492 goto found_fin_ok; 1494 goto found_fin_ok;
1493 WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " 1495 WARN(!(flags & MSG_PEEK),
1494 "copied %X seq %X rcvnxt %X fl %X\n", 1496 "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
1495 *seq, TCP_SKB_CB(skb)->seq, 1497 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
1496 tp->rcv_nxt, flags);
1497 } 1498 }
1498 1499
1499 /* Well, if we have backlog, try to process it now yet. */ 1500 /* Well, if we have backlog, try to process it now yet. */
@@ -1769,12 +1770,10 @@ skip_copy:
1769 /* Clean up data we have read: This will do ACK frames. */ 1770 /* Clean up data we have read: This will do ACK frames. */
1770 tcp_cleanup_rbuf(sk, copied); 1771 tcp_cleanup_rbuf(sk, copied);
1771 1772
1772 TCP_CHECK_TIMER(sk);
1773 release_sock(sk); 1773 release_sock(sk);
1774 return copied; 1774 return copied;
1775 1775
1776out: 1776out:
1777 TCP_CHECK_TIMER(sk);
1778 release_sock(sk); 1777 release_sock(sk);
1779 return err; 1778 return err;
1780 1779
@@ -2246,7 +2245,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2246 /* Values greater than interface MTU won't take effect. However 2245 /* Values greater than interface MTU won't take effect. However
2247 * at the point when this call is done we typically don't yet 2246 * at the point when this call is done we typically don't yet
2248 * know which interface is going to be used */ 2247 * know which interface is going to be used */
2249 if (val < 8 || val > MAX_TCP_WINDOW) { 2248 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2250 err = -EINVAL; 2249 err = -EINVAL;
2251 break; 2250 break;
2252 } 2251 }
@@ -2392,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2392 err = tp->af_specific->md5_parse(sk, optval, optlen); 2391 err = tp->af_specific->md5_parse(sk, optval, optlen);
2393 break; 2392 break;
2394#endif 2393#endif
2395 2394 case TCP_USER_TIMEOUT:
2395 /* Cap the max timeout in ms TCP will retry/retrans
2396 * before giving up and aborting (ETIMEDOUT) a connection.
2397 */
2398 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2399 break;
2396 default: 2400 default:
2397 err = -ENOPROTOOPT; 2401 err = -ENOPROTOOPT;
2398 break; 2402 break;
@@ -2611,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2611 case TCP_THIN_DUPACK: 2615 case TCP_THIN_DUPACK:
2612 val = tp->thin_dupack; 2616 val = tp->thin_dupack;
2613 break; 2617 break;
2618
2619 case TCP_USER_TIMEOUT:
2620 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2621 break;
2614 default: 2622 default:
2615 return -ENOPROTOOPT; 2623 return -ENOPROTOOPT;
2616 } 2624 }
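Note: TCP_USER_TIMEOUT caps, in milliseconds, how long transmitted data may remain unacknowledged before the kernel aborts the connection with ETIMEDOUT; a value of 0 leaves the default retransmission policy in place. A minimal sketch (the 30-second value is illustrative):

	unsigned int tmo_ms = 30000;
	if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		       &tmo_ms, sizeof(tmo_ms)) < 0)
		perror("TCP_USER_TIMEOUT");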
@@ -2646,7 +2654,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2646EXPORT_SYMBOL(compat_tcp_getsockopt); 2654EXPORT_SYMBOL(compat_tcp_getsockopt);
2647#endif 2655#endif
2648 2656
2649struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2657struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2650{ 2658{
2651 struct sk_buff *segs = ERR_PTR(-EINVAL); 2659 struct sk_buff *segs = ERR_PTR(-EINVAL);
2652 struct tcphdr *th; 2660 struct tcphdr *th;
@@ -3212,7 +3220,7 @@ __setup("thash_entries=", set_thash_entries);
3212void __init tcp_init(void) 3220void __init tcp_init(void)
3213{ 3221{
3214 struct sk_buff *skb = NULL; 3222 struct sk_buff *skb = NULL;
3215 unsigned long nr_pages, limit; 3223 unsigned long limit;
3216 int i, max_share, cnt; 3224 int i, max_share, cnt;
3217 unsigned long jiffy = jiffies; 3225 unsigned long jiffy = jiffies;
3218 3226
@@ -3269,13 +3277,7 @@ void __init tcp_init(void)
3269 sysctl_tcp_max_orphans = cnt / 2; 3277 sysctl_tcp_max_orphans = cnt / 2;
3270 sysctl_max_syn_backlog = max(128, cnt / 256); 3278 sysctl_max_syn_backlog = max(128, cnt / 256);
3271 3279
3272 /* Set the pressure threshold to be a fraction of global memory that 3280 limit = nr_free_buffer_pages() / 8;
3273 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
3274 * memory, with a floor of 128 pages.
3275 */
3276 nr_pages = totalram_pages - totalhigh_pages;
3277 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
3278 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
3279 limit = max(limit, 128UL); 3281 limit = max(limit, 128UL);
3280 sysctl_tcp_mem[0] = limit / 4 * 3; 3282 sysctl_tcp_mem[0] = limit / 4 * 3;
3281 sysctl_tcp_mem[1] = limit; 3283 sysctl_tcp_mem[1] = limit;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 3b53fd1af23f..6187eb4d1dcf 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
209} 209}
210 210
211 211
212static struct tcp_congestion_ops bictcp = { 212static struct tcp_congestion_ops bictcp __read_mostly = {
213 .init = bictcp_init, 213 .init = bictcp_init,
214 .ssthresh = bictcp_recalc_ssthresh, 214 .ssthresh = bictcp_recalc_ssthresh,
215 .cong_avoid = bictcp_cong_avoid, 215 .cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 71d5f2f29fa6..f376b05cca81 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -39,7 +39,7 @@
39 39
40/* Number of delay samples for detecting the increase of delay */ 40/* Number of delay samples for detecting the increase of delay */
41#define HYSTART_MIN_SAMPLES 8 41#define HYSTART_MIN_SAMPLES 8
42#define HYSTART_DELAY_MIN (2U<<3) 42#define HYSTART_DELAY_MIN (4U<<3)
43#define HYSTART_DELAY_MAX (16U<<3) 43#define HYSTART_DELAY_MAX (16U<<3)
44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
45 45
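Note: these thresholds are kept in the same fixed-point unit as delay_min, msec << 3 (see the updated struct bictcp comment below), so this change raises the minimum HyStart delay threshold from 2U<<3 = 16 (2 ms) to 4U<<3 = 32 (4 ms), while the maximum stays at 16 ms.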
@@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1;
52static int hystart __read_mostly = 1; 52static int hystart __read_mostly = 1;
53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; 53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
54static int hystart_low_window __read_mostly = 16; 54static int hystart_low_window __read_mostly = 16;
55static int hystart_ack_delta __read_mostly = 2;
55 56
56static u32 cube_rtt_scale __read_mostly; 57static u32 cube_rtt_scale __read_mostly;
57static u32 beta_scale __read_mostly; 58static u32 beta_scale __read_mostly;
@@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
75 " 1: packet-train 2: delay 3: both packet-train and delay"); 76 " 1: packet-train 2: delay 3: both packet-train and delay");
76module_param(hystart_low_window, int, 0644); 77module_param(hystart_low_window, int, 0644);
77MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); 78MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
79module_param(hystart_ack_delta, int, 0644);
80MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
78 81
79/* BIC TCP Parameters */ 82/* BIC TCP Parameters */
80struct bictcp { 83struct bictcp {
@@ -85,17 +88,18 @@ struct bictcp {
85 u32 last_time; /* time when updated last_cwnd */ 88 u32 last_time; /* time when updated last_cwnd */
86 u32 bic_origin_point;/* origin point of bic function */ 89 u32 bic_origin_point;/* origin point of bic function */
87 u32 bic_K; /* time to origin point from the beginning of the current epoch */ 90 u32 bic_K; /* time to origin point from the beginning of the current epoch */
88 u32 delay_min; /* min delay */ 91 u32 delay_min; /* min delay (msec << 3) */
89 u32 epoch_start; /* beginning of an epoch */ 92 u32 epoch_start; /* beginning of an epoch */
90 u32 ack_cnt; /* number of acks */ 93 u32 ack_cnt; /* number of acks */
91 u32 tcp_cwnd; /* estimated tcp cwnd */ 94 u32 tcp_cwnd; /* estimated tcp cwnd */
92#define ACK_RATIO_SHIFT 4 95#define ACK_RATIO_SHIFT 4
96#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
93 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ 97 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
94 u8 sample_cnt; /* number of samples to decide curr_rtt */ 98 u8 sample_cnt; /* number of samples to decide curr_rtt */
95 u8 found; /* the exit point is found? */ 99 u8 found; /* the exit point is found? */
96 u32 round_start; /* beginning of each round */ 100 u32 round_start; /* beginning of each round */
97 u32 end_seq; /* end_seq of the round */ 101 u32 end_seq; /* end_seq of the round */
98 u32 last_jiffies; /* last time when the ACK spacing is close */ 102 u32 last_ack; /* last time when the ACK spacing is close */
99 u32 curr_rtt; /* the minimum rtt of current round */ 103 u32 curr_rtt; /* the minimum rtt of current round */
100}; 104};
101 105
@@ -116,12 +120,21 @@ static inline void bictcp_reset(struct bictcp *ca)
116 ca->found = 0; 120 ca->found = 0;
117} 121}
118 122
123static inline u32 bictcp_clock(void)
124{
125#if HZ < 1000
126 return ktime_to_ms(ktime_get_real());
127#else
128 return jiffies_to_msecs(jiffies);
129#endif
130}
131
119static inline void bictcp_hystart_reset(struct sock *sk) 132static inline void bictcp_hystart_reset(struct sock *sk)
120{ 133{
121 struct tcp_sock *tp = tcp_sk(sk); 134 struct tcp_sock *tp = tcp_sk(sk);
122 struct bictcp *ca = inet_csk_ca(sk); 135 struct bictcp *ca = inet_csk_ca(sk);
123 136
124 ca->round_start = ca->last_jiffies = jiffies; 137 ca->round_start = ca->last_ack = bictcp_clock();
125 ca->end_seq = tp->snd_nxt; 138 ca->end_seq = tp->snd_nxt;
126 ca->curr_rtt = 0; 139 ca->curr_rtt = 0;
127 ca->sample_cnt = 0; 140 ca->sample_cnt = 0;
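Note: bictcp_clock() gives HyStart a millisecond clock even on HZ=100 or HZ=250 kernels, where one jiffy is 10 ms or 4 ms and therefore too coarse to compare against the 4-16 ms delay thresholds above; on such kernels it reads ktime_get_real() instead of jiffies, and cubictcp_register() (later in this diff) sets TCP_CONG_RTT_STAMP so that per-ACK RTT samples are available.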
@@ -236,8 +249,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
236 */ 249 */
237 250
238 /* change the unit from HZ to bictcp_HZ */ 251 /* change the unit from HZ to bictcp_HZ */
239 t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) 252 t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3)
240 << BICTCP_HZ) / HZ; 253 - ca->epoch_start) << BICTCP_HZ) / HZ;
241 254
242 if (t < ca->bic_K) /* t - K */ 255 if (t < ca->bic_K) /* t - K */
243 offs = ca->bic_K - t; 256 offs = ca->bic_K - t;
@@ -258,6 +271,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
258 ca->cnt = 100 * cwnd; /* very small increment*/ 271 ca->cnt = 100 * cwnd; /* very small increment*/
259 } 272 }
260 273
274 /*
275 * The initial growth of cubic function may be too conservative
276 * when the available bandwidth is still unknown.
277 */
278 if (ca->loss_cwnd == 0 && ca->cnt > 20)
279 ca->cnt = 20; /* increase cwnd 5% per RTT */
280
261 /* TCP Friendly */ 281 /* TCP Friendly */
262 if (tcp_friendliness) { 282 if (tcp_friendliness) {
263 u32 scale = beta_scale; 283 u32 scale = beta_scale;
@@ -339,12 +359,12 @@ static void hystart_update(struct sock *sk, u32 delay)
339 struct bictcp *ca = inet_csk_ca(sk); 359 struct bictcp *ca = inet_csk_ca(sk);
340 360
341 if (!(ca->found & hystart_detect)) { 361 if (!(ca->found & hystart_detect)) {
342 u32 curr_jiffies = jiffies; 362 u32 now = bictcp_clock();
343 363
344 /* first detection parameter - ack-train detection */ 364 /* first detection parameter - ack-train detection */
345 if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { 365 if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
346 ca->last_jiffies = curr_jiffies; 366 ca->last_ack = now;
347 if (curr_jiffies - ca->round_start >= ca->delay_min>>4) 367 if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
348 ca->found |= HYSTART_ACK_TRAIN; 368 ca->found |= HYSTART_ACK_TRAIN;
349 } 369 }
350 370
@@ -379,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
379 u32 delay; 399 u32 delay;
380 400
381 if (icsk->icsk_ca_state == TCP_CA_Open) { 401 if (icsk->icsk_ca_state == TCP_CA_Open) {
382 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 402 u32 ratio = ca->delayed_ack;
383 ca->delayed_ack += cnt; 403
404 ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
405 ratio += cnt;
406
407 ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT);
384 } 408 }
385 409
386 /* Some calls are for duplicates without timetamps */ 410 /* Some calls are for duplicates without timetamps */
@@ -391,7 +415,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
391 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) 415 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
392 return; 416 return;
393 417
394 delay = usecs_to_jiffies(rtt_us) << 3; 418 delay = (rtt_us << 3) / USEC_PER_MSEC;
395 if (delay == 0) 419 if (delay == 0)
396 delay = 1; 420 delay = 1;
397 421
@@ -405,7 +429,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
405 hystart_update(sk, delay); 429 hystart_update(sk, delay);
406} 430}
407 431
408static struct tcp_congestion_ops cubictcp = { 432static struct tcp_congestion_ops cubictcp __read_mostly = {
409 .init = bictcp_init, 433 .init = bictcp_init,
410 .ssthresh = bictcp_recalc_ssthresh, 434 .ssthresh = bictcp_recalc_ssthresh,
411 .cong_avoid = bictcp_cong_avoid, 435 .cong_avoid = bictcp_cong_avoid,
@@ -447,6 +471,10 @@ static int __init cubictcp_register(void)
447 /* divide by bic_scale and by constant Srtt (100ms) */ 471 /* divide by bic_scale and by constant Srtt (100ms) */
448 do_div(cube_factor, bic_scale * 10); 472 do_div(cube_factor, bic_scale * 10);
449 473
474 /* hystart needs ms clock resolution */
475 if (hystart && HZ < 1000)
476 cubictcp.flags |= TCP_CONG_RTT_STAMP;
477
450 return tcp_register_congestion_control(&cubictcp); 478 return tcp_register_congestion_control(&cubictcp);
451} 479}
452 480
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b6caaf75bb9..30f27f6b3655 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
158} 158}
159 159
160 160
161static struct tcp_congestion_ops tcp_highspeed = { 161static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
162 .init = hstcp_init, 162 .init = hstcp_init,
163 .ssthresh = hstcp_ssthresh, 163 .ssthresh = hstcp_ssthresh,
164 .cong_avoid = hstcp_cong_avoid, 164 .cong_avoid = hstcp_cong_avoid,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 7c94a4955416..c1a8175361e8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
284 } 284 }
285} 285}
286 286
287static struct tcp_congestion_ops htcp = { 287static struct tcp_congestion_ops htcp __read_mostly = {
288 .init = htcp_init, 288 .init = htcp_init,
289 .ssthresh = htcp_recalc_ssthresh, 289 .ssthresh = htcp_recalc_ssthresh,
290 .cong_avoid = htcp_cong_avoid, 290 .cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 377bc9349371..fe3ecf484b44 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); 162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
163} 163}
164 164
165static struct tcp_congestion_ops tcp_hybla = { 165static struct tcp_congestion_ops tcp_hybla __read_mostly = {
166 .init = hybla_init, 166 .init = hybla_init,
167 .ssthresh = tcp_reno_ssthresh, 167 .ssthresh = tcp_reno_ssthresh,
168 .min_cwnd = tcp_reno_min_cwnd, 168 .min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1eba160b72dc..813b43a76fec 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -6,7 +6,7 @@
6 * The algorithm is described in: 6 * The algorithm is described in:
7 * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm 7 * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
8 * for High-Speed Networks" 8 * for High-Speed Networks"
9 * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf 9 * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
10 * 10 *
11 * Implemented from description in paper and ns-2 simulation. 11 * Implemented from description in paper and ns-2 simulation.
12 * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org> 12 * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
@@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
322 } 322 }
323} 323}
324 324
325static struct tcp_congestion_ops tcp_illinois = { 325static struct tcp_congestion_ops tcp_illinois __read_mostly = {
326 .flags = TCP_CONG_RTT_STAMP, 326 .flags = TCP_CONG_RTT_STAMP,
327 .init = tcp_illinois_init, 327 .init = tcp_illinois_init,
328 .ssthresh = tcp_illinois_ssthresh, 328 .ssthresh = tcp_illinois_ssthresh,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b55f60f6fcbe..bef9f04c22ba 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); 182 icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
183} 183}
184 184
185void tcp_enter_quickack_mode(struct sock *sk) 185static void tcp_enter_quickack_mode(struct sock *sk)
186{ 186{
187 struct inet_connection_sock *icsk = inet_csk(sk); 187 struct inet_connection_sock *icsk = inet_csk(sk);
188 tcp_incr_quickack(sk); 188 tcp_incr_quickack(sk);
@@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + 259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
260 sizeof(struct sk_buff); 260 sizeof(struct sk_buff);
261 261
262 if (sk->sk_sndbuf < 3 * sndmem) 262 if (sk->sk_sndbuf < 3 * sndmem) {
263 sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); 263 sk->sk_sndbuf = 3 * sndmem;
264 if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
265 sk->sk_sndbuf = sysctl_tcp_wmem[2];
266 }
264} 267}
265 268
266/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 269/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
396 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 399 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
397 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 400 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
398 !tcp_memory_pressure && 401 !tcp_memory_pressure &&
399 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { 402 atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
400 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 403 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
401 sysctl_tcp_rmem[2]); 404 sysctl_tcp_rmem[2]);
402 } 405 }
@@ -428,10 +431,10 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
428 * 431 *
429 * The algorithm for RTT estimation w/o timestamps is based on 432 * The algorithm for RTT estimation w/o timestamps is based on
430 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. 433 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
431 * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> 434 * <http://public.lanl.gov/radiant/pubs.html#DRS>
432 * 435 *
433 * More detail on this code can be found at 436 * More detail on this code can be found at
434 * <http://www.psc.edu/~jheffner/senior_thesis.ps>, 437 * <http://staff.psc.edu/jheffner/>,
435 * though this reference is out of date. A new paper 438 * though this reference is out of date. A new paper
436 * is pending. 439 * is pending.
437 */ 440 */
@@ -731,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
731 * Reset our results. 734 * Reset our results.
732 */ 735 */
733 if (!(dst_metric_locked(dst, RTAX_RTT))) 736 if (!(dst_metric_locked(dst, RTAX_RTT)))
734 dst->metrics[RTAX_RTT - 1] = 0; 737 dst_metric_set(dst, RTAX_RTT, 0);
735 return; 738 return;
736 } 739 }
737 740
@@ -773,57 +776,48 @@ void tcp_update_metrics(struct sock *sk)
773 if (dst_metric(dst, RTAX_SSTHRESH) && 776 if (dst_metric(dst, RTAX_SSTHRESH) &&
774 !dst_metric_locked(dst, RTAX_SSTHRESH) && 777 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
775 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) 778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
776 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; 779 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
777 if (!dst_metric_locked(dst, RTAX_CWND) && 780 if (!dst_metric_locked(dst, RTAX_CWND) &&
778 tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
779 dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd; 782 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
780 } else if (tp->snd_cwnd > tp->snd_ssthresh && 783 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
781 icsk->icsk_ca_state == TCP_CA_Open) { 784 icsk->icsk_ca_state == TCP_CA_Open) {
782 /* Cong. avoidance phase, cwnd is reliable. */ 785 /* Cong. avoidance phase, cwnd is reliable. */
783 if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 786 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
784 dst->metrics[RTAX_SSTHRESH-1] = 787 dst_metric_set(dst, RTAX_SSTHRESH,
785 max(tp->snd_cwnd >> 1, tp->snd_ssthresh); 788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
786 if (!dst_metric_locked(dst, RTAX_CWND)) 789 if (!dst_metric_locked(dst, RTAX_CWND))
787 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1; 790 dst_metric_set(dst, RTAX_CWND,
791 (dst_metric(dst, RTAX_CWND) +
792 tp->snd_cwnd) >> 1);
788 } else { 793 } else {
789 /* Else slow start did not finish, cwnd is non-sense, 794 /* Else slow start did not finish, cwnd is non-sense,
790 ssthresh may be also invalid. 795 ssthresh may be also invalid.
791 */ 796 */
792 if (!dst_metric_locked(dst, RTAX_CWND)) 797 if (!dst_metric_locked(dst, RTAX_CWND))
793 dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1; 798 dst_metric_set(dst, RTAX_CWND,
799 (dst_metric(dst, RTAX_CWND) +
800 tp->snd_ssthresh) >> 1);
794 if (dst_metric(dst, RTAX_SSTHRESH) && 801 if (dst_metric(dst, RTAX_SSTHRESH) &&
795 !dst_metric_locked(dst, RTAX_SSTHRESH) && 802 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
796 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) 803 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
797 dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; 804 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
798 } 805 }
799 806
800 if (!dst_metric_locked(dst, RTAX_REORDERING)) { 807 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
801 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && 808 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
802 tp->reordering != sysctl_tcp_reordering) 809 tp->reordering != sysctl_tcp_reordering)
803 dst->metrics[RTAX_REORDERING-1] = tp->reordering; 810 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
804 } 811 }
805 } 812 }
806} 813}
807 814
808/* Numbers are taken from RFC3390.
809 *
810 * John Heffner states:
811 *
812 * The RFC specifies a window of no more than 4380 bytes
813 * unless 2*MSS > 4380. Reading the pseudocode in the RFC
814 * is a bit misleading because they use a clamp at 4380 bytes
815 * rather than use a multiplier in the relevant range.
816 */
817__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) 815__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
818{ 816{
819 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
820 818
821 if (!cwnd) { 819 if (!cwnd)
822 if (tp->mss_cache > 1460) 820 cwnd = TCP_INIT_CWND;
823 cwnd = 2;
824 else
825 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
826 }
827 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
828} 822}
829 823
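Note: with the RFC 3390 sizing logic above removed, the initial congestion window no longer depends on the MSS (the old 4380-byte clamp yielding 2, 3 or 4 segments); it is simply TCP_INIT_CWND, still clamped by snd_cwnd_clamp. TCP_INIT_CWND is defined in the headers, outside this hunk, as 10 segments in this kernel generation (the IW10 change), if memory serves.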
@@ -922,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
922 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 916 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
923 } 917 }
924 tcp_set_rto(sk); 918 tcp_set_rto(sk);
925 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) 919 if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
926 goto reset;
927
928cwnd:
929 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
930 tp->snd_cwnd_stamp = tcp_time_stamp;
931 return;
932
933reset: 920reset:
934 /* Play conservative. If timestamps are not 921 /* Play conservative. If timestamps are not
935 * supported, TCP will fail to recalculate correct 922 * supported, TCP will fail to recalculate correct
936 * rtt, if initial rto is too small. FORGET ALL AND RESET! 923 * rtt, if initial rto is too small. FORGET ALL AND RESET!
937 */ 924 */
938 if (!tp->rx_opt.saw_tstamp && tp->srtt) { 925 if (!tp->rx_opt.saw_tstamp && tp->srtt) {
939 tp->srtt = 0; 926 tp->srtt = 0;
940 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; 927 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
941 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 928 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
929 }
942 } 930 }
943 goto cwnd; 931 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
932 tp->snd_cwnd_stamp = tcp_time_stamp;
944} 933}
945 934
946static void tcp_update_reordering(struct sock *sk, const int metric, 935static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -1233,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1233 } 1222 }
1234 1223
1235 /* D-SACK for already forgotten data... Do dumb counting. */ 1224 /* D-SACK for already forgotten data... Do dumb counting. */
1236 if (dup_sack && 1225 if (dup_sack && tp->undo_marker && tp->undo_retrans &&
1237 !after(end_seq_0, prior_snd_una) && 1226 !after(end_seq_0, prior_snd_una) &&
1238 after(end_seq_0, tp->undo_marker)) 1227 after(end_seq_0, tp->undo_marker))
1239 tp->undo_retrans--; 1228 tp->undo_retrans--;
@@ -1310,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1310 1299
1311 /* Account D-SACK for retransmitted packet. */ 1300 /* Account D-SACK for retransmitted packet. */
1312 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1301 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1313 if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) 1302 if (tp->undo_marker && tp->undo_retrans &&
1303 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1314 tp->undo_retrans--; 1304 tp->undo_retrans--;
1315 if (sacked & TCPCB_SACKED_ACKED) 1305 if (sacked & TCPCB_SACKED_ACKED)
1316 state->reord = min(fack_count, state->reord); 1306 state->reord = min(fack_count, state->reord);
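Both D-SACK hunks above add the same guard: a D-SACKed retransmission is only counted against undo_retrans while undo bookkeeping is actually live (undo_marker set and undo_retrans non-zero), so the counter cannot be driven negative after a prior undo already cleared it. A plain sketch of the guard, with the helper name purely illustrative:

/* Decrement the undo counter only while an undo is being tracked. */
static void account_dsack_retrans_sketch(unsigned int *undo_marker,
                                         int *undo_retrans)
{
        if (*undo_marker && *undo_retrans)
                (*undo_retrans)--;
}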
@@ -2314,7 +2304,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2314 2304
2315static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) 2305static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2316{ 2306{
2317 return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); 2307 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2318} 2308}
2319 2309
2320static inline int tcp_head_timedout(struct sock *sk) 2310static inline int tcp_head_timedout(struct sock *sk)
@@ -2508,7 +2498,7 @@ static void tcp_timeout_skbs(struct sock *sk)
2508/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2498/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2509 * is against sacked "cnt", otherwise it's against facked "cnt" 2499 * is against sacked "cnt", otherwise it's against facked "cnt"
2510 */ 2500 */
2511static void tcp_mark_head_lost(struct sock *sk, int packets) 2501static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2512{ 2502{
2513 struct tcp_sock *tp = tcp_sk(sk); 2503 struct tcp_sock *tp = tcp_sk(sk);
2514 struct sk_buff *skb; 2504 struct sk_buff *skb;
@@ -2516,13 +2506,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2516 int err; 2506 int err;
2517 unsigned int mss; 2507 unsigned int mss;
2518 2508
2519 if (packets == 0)
2520 return;
2521
2522 WARN_ON(packets > tp->packets_out); 2509 WARN_ON(packets > tp->packets_out);
2523 if (tp->lost_skb_hint) { 2510 if (tp->lost_skb_hint) {
2524 skb = tp->lost_skb_hint; 2511 skb = tp->lost_skb_hint;
2525 cnt = tp->lost_cnt_hint; 2512 cnt = tp->lost_cnt_hint;
2513 /* Head already handled? */
2514 if (mark_head && skb != tcp_write_queue_head(sk))
2515 return;
2526 } else { 2516 } else {
2527 skb = tcp_write_queue_head(sk); 2517 skb = tcp_write_queue_head(sk);
2528 cnt = 0; 2518 cnt = 0;
@@ -2557,6 +2547,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2557 } 2547 }
2558 2548
2559 tcp_skb_mark_lost(tp, skb); 2549 tcp_skb_mark_lost(tp, skb);
2550
2551 if (mark_head)
2552 break;
2560 } 2553 }
2561 tcp_verify_left_out(tp); 2554 tcp_verify_left_out(tp);
2562} 2555}
@@ -2568,17 +2561,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2568 struct tcp_sock *tp = tcp_sk(sk); 2561 struct tcp_sock *tp = tcp_sk(sk);
2569 2562
2570 if (tcp_is_reno(tp)) { 2563 if (tcp_is_reno(tp)) {
2571 tcp_mark_head_lost(sk, 1); 2564 tcp_mark_head_lost(sk, 1, 1);
2572 } else if (tcp_is_fack(tp)) { 2565 } else if (tcp_is_fack(tp)) {
2573 int lost = tp->fackets_out - tp->reordering; 2566 int lost = tp->fackets_out - tp->reordering;
2574 if (lost <= 0) 2567 if (lost <= 0)
2575 lost = 1; 2568 lost = 1;
2576 tcp_mark_head_lost(sk, lost); 2569 tcp_mark_head_lost(sk, lost, 0);
2577 } else { 2570 } else {
2578 int sacked_upto = tp->sacked_out - tp->reordering; 2571 int sacked_upto = tp->sacked_out - tp->reordering;
2579 if (sacked_upto < fast_rexmit) 2572 if (sacked_upto >= 0)
2580 sacked_upto = fast_rexmit; 2573 tcp_mark_head_lost(sk, sacked_upto, 0);
2581 tcp_mark_head_lost(sk, sacked_upto); 2574 else if (fast_rexmit)
2575 tcp_mark_head_lost(sk, 1, 1);
2582 } 2576 }
2583 2577
2584 tcp_timeout_skbs(sk); 2578 tcp_timeout_skbs(sk);
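tcp_mark_head_lost() gains a mark_head flag: when set, the walk stops after the first (head) segment, and it also bails out early if the loss hint no longer points at the queue head. tcp_update_scoreboard() uses it so that, in the non-FACK case, a fast retransmit marks exactly one segment until sacked_out exceeds the reordering threshold. A plain-integer sketch of that decision (kernel types omitted):

/* Decide how many segments to mark lost under the new non-FACK rule. */
static int segments_to_mark_sketch(int sacked_out, int reordering,
                                   int fast_rexmit, int *head_only)
{
        int sacked_upto = sacked_out - reordering;

        *head_only = 0;
        if (sacked_upto >= 0)
                return sacked_upto;
        if (fast_rexmit) {
                *head_only = 1;
                return 1;
        }
        return 0;       /* nothing to mark yet */
}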
@@ -2665,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2665#define DBGUNDO(x...) do { } while (0) 2659#define DBGUNDO(x...) do { } while (0)
2666#endif 2660#endif
2667 2661
2668static void tcp_undo_cwr(struct sock *sk, const int undo) 2662static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2669{ 2663{
2670 struct tcp_sock *tp = tcp_sk(sk); 2664 struct tcp_sock *tp = tcp_sk(sk);
2671 2665
@@ -2677,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
2677 else 2671 else
2678 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); 2672 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
2679 2673
2680 if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { 2674 if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
2681 tp->snd_ssthresh = tp->prior_ssthresh; 2675 tp->snd_ssthresh = tp->prior_ssthresh;
2682 TCP_ECN_withdraw_cwr(tp); 2676 TCP_ECN_withdraw_cwr(tp);
2683 } 2677 }
2684 } else { 2678 } else {
2685 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); 2679 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
2686 } 2680 }
2687 tcp_moderate_cwnd(tp);
2688 tp->snd_cwnd_stamp = tcp_time_stamp; 2681 tp->snd_cwnd_stamp = tcp_time_stamp;
2689} 2682}
2690 2683
@@ -2705,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk)
2705 * or our original transmission succeeded. 2698 * or our original transmission succeeded.
2706 */ 2699 */
2707 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); 2700 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2708 tcp_undo_cwr(sk, 1); 2701 tcp_undo_cwr(sk, true);
2709 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) 2702 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2710 mib_idx = LINUX_MIB_TCPLOSSUNDO; 2703 mib_idx = LINUX_MIB_TCPLOSSUNDO;
2711 else 2704 else
@@ -2732,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk)
2732 2725
2733 if (tp->undo_marker && !tp->undo_retrans) { 2726 if (tp->undo_marker && !tp->undo_retrans) {
2734 DBGUNDO(sk, "D-SACK"); 2727 DBGUNDO(sk, "D-SACK");
2735 tcp_undo_cwr(sk, 1); 2728 tcp_undo_cwr(sk, true);
2736 tp->undo_marker = 0; 2729 tp->undo_marker = 0;
2737 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); 2730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2738 } 2731 }
@@ -2785,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2785 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); 2778 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2786 2779
2787 DBGUNDO(sk, "Hoe"); 2780 DBGUNDO(sk, "Hoe");
2788 tcp_undo_cwr(sk, 0); 2781 tcp_undo_cwr(sk, false);
2789 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); 2782 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2790 2783
2791 /* So... Do not make Hoe's retransmit yet. 2784 /* So... Do not make Hoe's retransmit yet.
@@ -2814,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk)
2814 2807
2815 DBGUNDO(sk, "partial loss"); 2808 DBGUNDO(sk, "partial loss");
2816 tp->lost_out = 0; 2809 tp->lost_out = 0;
2817 tcp_undo_cwr(sk, 1); 2810 tcp_undo_cwr(sk, true);
2818 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); 2811 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2819 inet_csk(sk)->icsk_retransmits = 0; 2812 inet_csk(sk)->icsk_retransmits = 0;
2820 tp->undo_marker = 0; 2813 tp->undo_marker = 0;
@@ -2828,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk)
2828static inline void tcp_complete_cwr(struct sock *sk) 2821static inline void tcp_complete_cwr(struct sock *sk)
2829{ 2822{
2830 struct tcp_sock *tp = tcp_sk(sk); 2823 struct tcp_sock *tp = tcp_sk(sk);
2831 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2824 /* Do not moderate cwnd if it's already undone in cwr or recovery */
2832 tp->snd_cwnd_stamp = tcp_time_stamp; 2825 if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
2826 tp->snd_cwnd = tp->snd_ssthresh;
2827 tp->snd_cwnd_stamp = tcp_time_stamp;
2828 }
2833 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2829 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2834} 2830}
2835 2831
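Two related changes meet here: tcp_undo_cwr() now takes a bool saying whether ssthresh should be restored as well and no longer moderates cwnd after an undo, while tcp_complete_cwr() only clips cwnd back to ssthresh when undo bookkeeping is still active, so a window that was just restored is not immediately cut again. A plain-integer sketch of the new CWR-completion rule:

/* Moderate cwnd at CWR completion only if an undo is still pending
 * and cwnd actually sits above ssthresh. */
static unsigned int complete_cwr_sketch(unsigned int snd_cwnd,
                                        unsigned int snd_ssthresh,
                                        unsigned int undo_marker)
{
        if (undo_marker && snd_cwnd > snd_ssthresh)
                snd_cwnd = snd_ssthresh;
        return snd_cwnd;
}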
@@ -2887,7 +2883,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
2887 icsk->icsk_mtup.probe_size; 2883 icsk->icsk_mtup.probe_size;
2888 tp->snd_cwnd_cnt = 0; 2884 tp->snd_cwnd_cnt = 0;
2889 tp->snd_cwnd_stamp = tcp_time_stamp; 2885 tp->snd_cwnd_stamp = tcp_time_stamp;
2890 tp->rcv_ssthresh = tcp_current_ssthresh(sk); 2886 tp->snd_ssthresh = tcp_current_ssthresh(sk);
2891 2887
2892 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; 2888 icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2893 icsk->icsk_mtup.probe_size = 0; 2889 icsk->icsk_mtup.probe_size = 0;
@@ -2984,7 +2980,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2984 before(tp->snd_una, tp->high_seq) && 2980 before(tp->snd_una, tp->high_seq) &&
2985 icsk->icsk_ca_state != TCP_CA_Open && 2981 icsk->icsk_ca_state != TCP_CA_Open &&
2986 tp->fackets_out > tp->reordering) { 2982 tp->fackets_out > tp->reordering) {
2987 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); 2983 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
2988 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); 2984 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
2989 } 2985 }
2990 2986
@@ -3356,7 +3352,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3356 net_invalid_timestamp())) 3352 net_invalid_timestamp()))
3357 rtt_us = ktime_us_delta(ktime_get_real(), 3353 rtt_us = ktime_us_delta(ktime_get_real(),
3358 last_ackt); 3354 last_ackt);
3359 else if (ca_seq_rtt > 0) 3355 else if (ca_seq_rtt >= 0)
3360 rtt_us = jiffies_to_usecs(ca_seq_rtt); 3356 rtt_us = jiffies_to_usecs(ca_seq_rtt);
3361 } 3357 }
3362 3358
@@ -3412,8 +3408,8 @@ static void tcp_ack_probe(struct sock *sk)
3412 3408
3413static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) 3409static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3414{ 3410{
3415 return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3411 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3416 inet_csk(sk)->icsk_ca_state != TCP_CA_Open); 3412 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3417} 3413}
3418 3414
3419static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3415static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3430,9 +3426,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
3430 const u32 ack, const u32 ack_seq, 3426 const u32 ack, const u32 ack_seq,
3431 const u32 nwin) 3427 const u32 nwin)
3432{ 3428{
3433 return (after(ack, tp->snd_una) || 3429 return after(ack, tp->snd_una) ||
3434 after(ack_seq, tp->snd_wl1) || 3430 after(ack_seq, tp->snd_wl1) ||
3435 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); 3431 (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
3436} 3432}
3437 3433
3438/* Update our send window. 3434/* Update our send window.
@@ -3500,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3500 if (flag & FLAG_ECE) 3496 if (flag & FLAG_ECE)
3501 tcp_ratehalving_spur_to_response(sk); 3497 tcp_ratehalving_spur_to_response(sk);
3502 else 3498 else
3503 tcp_undo_cwr(sk, 1); 3499 tcp_undo_cwr(sk, true);
3504} 3500}
3505 3501
3506/* F-RTO spurious RTO detection algorithm (RFC4138) 3502/* F-RTO spurious RTO detection algorithm (RFC4138)
@@ -4406,7 +4402,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4406 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { 4402 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
4407 tp->ucopy.len -= chunk; 4403 tp->ucopy.len -= chunk;
4408 tp->copied_seq += chunk; 4404 tp->copied_seq += chunk;
4409 eaten = (chunk == skb->len && !th->fin); 4405 eaten = (chunk == skb->len);
4410 tcp_rcv_space_adjust(sk); 4406 tcp_rcv_space_adjust(sk);
4411 } 4407 }
4412 local_bh_disable(); 4408 local_bh_disable();
@@ -4870,7 +4866,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
4870 return 0; 4866 return 0;
4871 4867
4872 /* If we are under soft global TCP memory pressure, do not expand. */ 4868 /* If we are under soft global TCP memory pressure, do not expand. */
4873 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) 4869 if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
4874 return 0; 4870 return 0;
4875 4871
4876 /* If we filled the congestion window, do not expand. */ 4872 /* If we filled the congestion window, do not expand. */
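The switch to atomic_long_read() reflects tcp_memory_allocated being widened to an atomic_long_t, so the global page counter cannot wrap a 32-bit atomic on machines with very large amounts of TCP memory. A minimal sketch of the pressure test with the widened type; the limit is passed in here rather than read from sysctl_tcp_mem:

#include <linux/atomic.h>

static atomic_long_t example_memory_allocated = ATOMIC_LONG_INIT(0);

/* Soft memory-pressure check against the widened counter. */
static int over_soft_limit_sketch(long soft_limit)
{
        return atomic_long_read(&example_memory_allocated) >= soft_limit;
}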
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 020766292bb0..708dc203b034 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -146,13 +146,15 @@ EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146/* This will initiate an outgoing connection. */ 146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{ 148{
149 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
149 struct inet_sock *inet = inet_sk(sk); 150 struct inet_sock *inet = inet_sk(sk);
150 struct tcp_sock *tp = tcp_sk(sk); 151 struct tcp_sock *tp = tcp_sk(sk);
151 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 152 __be16 orig_sport, orig_dport;
152 struct rtable *rt;
153 __be32 daddr, nexthop; 153 __be32 daddr, nexthop;
154 int tmp; 154 struct flowi4 *fl4;
155 struct rtable *rt;
155 int err; 156 int err;
157 struct ip_options_rcu *inet_opt;
156 158
157 if (addr_len < sizeof(struct sockaddr_in)) 159 if (addr_len < sizeof(struct sockaddr_in))
158 return -EINVAL; 160 return -EINVAL;
@@ -161,20 +163,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
161 return -EAFNOSUPPORT; 163 return -EAFNOSUPPORT;
162 164
163 nexthop = daddr = usin->sin_addr.s_addr; 165 nexthop = daddr = usin->sin_addr.s_addr;
164 if (inet->opt && inet->opt->srr) { 166 inet_opt = rcu_dereference_protected(inet->inet_opt,
167 sock_owned_by_user(sk));
168 if (inet_opt && inet_opt->opt.srr) {
165 if (!daddr) 169 if (!daddr)
166 return -EINVAL; 170 return -EINVAL;
167 nexthop = inet->opt->faddr; 171 nexthop = inet_opt->opt.faddr;
168 } 172 }
169 173
170 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, 174 orig_sport = inet->inet_sport;
171 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 175 orig_dport = usin->sin_port;
172 IPPROTO_TCP, 176 fl4 = &inet->cork.fl.u.ip4;
173 inet->inet_sport, usin->sin_port, sk, 1); 177 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
174 if (tmp < 0) { 178 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 if (tmp == -ENETUNREACH) 179 IPPROTO_TCP,
180 orig_sport, orig_dport, sk, true);
181 if (IS_ERR(rt)) {
182 err = PTR_ERR(rt);
183 if (err == -ENETUNREACH)
176 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 184 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 return tmp; 185 return err;
178 } 186 }
179 187
180 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 188 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
@@ -182,11 +190,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
182 return -ENETUNREACH; 190 return -ENETUNREACH;
183 } 191 }
184 192
185 if (!inet->opt || !inet->opt->srr) 193 if (!inet_opt || !inet_opt->opt.srr)
186 daddr = rt->rt_dst; 194 daddr = fl4->daddr;
187 195
188 if (!inet->inet_saddr) 196 if (!inet->inet_saddr)
189 inet->inet_saddr = rt->rt_src; 197 inet->inet_saddr = fl4->saddr;
190 inet->inet_rcv_saddr = inet->inet_saddr; 198 inet->inet_rcv_saddr = inet->inet_saddr;
191 199
192 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { 200 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
@@ -197,8 +205,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
197 } 205 }
198 206
199 if (tcp_death_row.sysctl_tw_recycle && 207 if (tcp_death_row.sysctl_tw_recycle &&
200 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { 208 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
201 struct inet_peer *peer = rt_get_peer(rt); 209 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
202 /* 210 /*
203 * VJ's idea. We save last timestamp seen from 211 * VJ's idea. We save last timestamp seen from
204 * the destination in peer table, when entering state 212 * the destination in peer table, when entering state
@@ -218,8 +226,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
218 inet->inet_daddr = daddr; 226 inet->inet_daddr = daddr;
219 227
220 inet_csk(sk)->icsk_ext_hdr_len = 0; 228 inet_csk(sk)->icsk_ext_hdr_len = 0;
221 if (inet->opt) 229 if (inet_opt)
222 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; 230 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
223 231
224 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 232 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
225 233
@@ -233,11 +241,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
233 if (err) 241 if (err)
234 goto failure; 242 goto failure;
235 243
236 err = ip_route_newports(&rt, IPPROTO_TCP, 244 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
237 inet->inet_sport, inet->inet_dport, sk); 245 inet->inet_sport, inet->inet_dport, sk);
238 if (err) 246 if (IS_ERR(rt)) {
247 err = PTR_ERR(rt);
248 rt = NULL;
239 goto failure; 249 goto failure;
240 250 }
241 /* OK, now commit destination to socket. */ 251 /* OK, now commit destination to socket. */
242 sk->sk_gso_type = SKB_GSO_TCPV4; 252 sk->sk_gso_type = SKB_GSO_TCPV4;
243 sk_setup_caps(sk, &rt->dst); 253 sk_setup_caps(sk, &rt->dst);
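ip_route_connect() and ip_route_newports() now return the route directly (or an ERR_PTR-encoded errno) and take the flow keys in a struct flowi4 owned by the socket, instead of filling a struct rtable ** and returning an int. A small sketch of the calling convention these hunks rely on, with the lookup itself stubbed out:

#include <linux/err.h>

struct rtable;  /* opaque in this sketch; defined in include/net/route.h */

/* Consume an ERR_PTR-style return: one pointer carries either the
 * object or a negative errno. */
static int use_route_sketch(struct rtable *rt)
{
        if (IS_ERR(rt))
                return PTR_ERR(rt);     /* e.g. -ENETUNREACH */
        /* ... commit the route to the socket ... */
        return 0;
}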
@@ -273,7 +283,7 @@ EXPORT_SYMBOL(tcp_v4_connect);
273/* 283/*
274 * This routine does path mtu discovery as defined in RFC1191. 284 * This routine does path mtu discovery as defined in RFC1191.
275 */ 285 */
276static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) 286static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
277{ 287{
278 struct dst_entry *dst; 288 struct dst_entry *dst;
279 struct inet_sock *inet = inet_sk(sk); 289 struct inet_sock *inet = inet_sk(sk);
@@ -335,7 +345,7 @@ static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
335 345
336void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) 346void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
337{ 347{
338 struct iphdr *iph = (struct iphdr *)icmp_skb->data; 348 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
339 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); 349 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
340 struct inet_connection_sock *icsk; 350 struct inet_connection_sock *icsk;
341 struct tcp_sock *tp; 351 struct tcp_sock *tp;
@@ -415,6 +425,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
415 !icsk->icsk_backoff) 425 !icsk->icsk_backoff)
416 break; 426 break;
417 427
428 if (sock_owned_by_user(sk))
429 break;
430
418 icsk->icsk_backoff--; 431 icsk->icsk_backoff--;
419 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << 432 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
420 icsk->icsk_backoff; 433 icsk->icsk_backoff;
@@ -429,11 +442,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
429 if (remaining) { 442 if (remaining) {
430 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 443 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
431 remaining, TCP_RTO_MAX); 444 remaining, TCP_RTO_MAX);
432 } else if (sock_owned_by_user(sk)) {
433 /* RTO revert clocked out retransmission,
434 * but socket is locked. Will defer. */
435 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
436 HZ/20, TCP_RTO_MAX);
437 } else { 445 } else {
438 /* RTO revert clocked out retransmission. 446 /* RTO revert clocked out retransmission.
439 * Will retransmit now */ 447 * Will retransmit now */
@@ -643,7 +651,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
643 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; 651 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
644 652
645 net = dev_net(skb_dst(skb)->dev); 653 net = dev_net(skb_dst(skb)->dev);
646 ip_send_reply(net->ipv4.tcp_sock, skb, 654 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
647 &arg, arg.iov[0].iov_len); 655 &arg, arg.iov[0].iov_len);
648 656
649 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 657 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -718,7 +726,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
718 if (oif) 726 if (oif)
719 arg.bound_dev_if = oif; 727 arg.bound_dev_if = oif;
720 728
721 ip_send_reply(net->ipv4.tcp_sock, skb, 729 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
722 &arg, arg.iov[0].iov_len); 730 &arg, arg.iov[0].iov_len);
723 731
724 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 732 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -761,11 +769,12 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
761 struct request_values *rvp) 769 struct request_values *rvp)
762{ 770{
763 const struct inet_request_sock *ireq = inet_rsk(req); 771 const struct inet_request_sock *ireq = inet_rsk(req);
772 struct flowi4 fl4;
764 int err = -1; 773 int err = -1;
765 struct sk_buff * skb; 774 struct sk_buff * skb;
766 775
767 /* First, grab a route. */ 776 /* First, grab a route. */
768 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) 777 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
769 return -1; 778 return -1;
770 779
771 skb = tcp_make_synack(sk, dst, req, rvp); 780 skb = tcp_make_synack(sk, dst, req, rvp);
@@ -816,17 +825,18 @@ static void syn_flood_warning(const struct sk_buff *skb)
816/* 825/*
817 * Save and compile IPv4 options into the request_sock if needed. 826 * Save and compile IPv4 options into the request_sock if needed.
818 */ 827 */
819static struct ip_options *tcp_v4_save_options(struct sock *sk, 828static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
820 struct sk_buff *skb) 829 struct sk_buff *skb)
821{ 830{
822 struct ip_options *opt = &(IPCB(skb)->opt); 831 const struct ip_options *opt = &(IPCB(skb)->opt);
823 struct ip_options *dopt = NULL; 832 struct ip_options_rcu *dopt = NULL;
824 833
825 if (opt && opt->optlen) { 834 if (opt && opt->optlen) {
826 int opt_size = optlength(opt); 835 int opt_size = sizeof(*dopt) + opt->optlen;
836
827 dopt = kmalloc(opt_size, GFP_ATOMIC); 837 dopt = kmalloc(opt_size, GFP_ATOMIC);
828 if (dopt) { 838 if (dopt) {
829 if (ip_options_echo(dopt, skb)) { 839 if (ip_options_echo(&dopt->opt, skb)) {
830 kfree(dopt); 840 kfree(dopt);
831 dopt = NULL; 841 dopt = NULL;
832 } 842 }
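Saved IP options now travel as a struct ip_options_rcu, i.e. the classic options block prefixed with an RCU head so fast-path readers can use them without holding the socket lock; that is why the allocation reserves sizeof(*dopt) + opt->optlen instead of the old optlength() of a bare struct ip_options. The precise layout is not visible in this hunk, but the assumed shape is roughly:

/* Assumed shape of the wrapper used above (the real definition lives
 * in include/net/inet_sock.h). */
struct ip_options_rcu_sketch {
        struct rcu_head   rcu;  /* deferred free once readers drain */
        struct ip_options opt;  /* opt.optlen bytes of option data follow */
};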
@@ -1212,12 +1222,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1212}; 1222};
1213#endif 1223#endif
1214 1224
1215static struct timewait_sock_ops tcp_timewait_sock_ops = {
1216 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1217 .twsk_unique = tcp_twsk_unique,
1218 .twsk_destructor= tcp_twsk_destructor,
1219};
1220
1221int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1225int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1222{ 1226{
1223 struct tcp_extend_values tmp_ext; 1227 struct tcp_extend_values tmp_ext;
@@ -1335,6 +1339,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1335 req->cookie_ts = tmp_opt.tstamp_ok; 1339 req->cookie_ts = tmp_opt.tstamp_ok;
1336 } else if (!isn) { 1340 } else if (!isn) {
1337 struct inet_peer *peer = NULL; 1341 struct inet_peer *peer = NULL;
1342 struct flowi4 fl4;
1338 1343
1339 /* VJ's idea. We save last timestamp seen 1344 /* VJ's idea. We save last timestamp seen
1340 * from the destination in peer table, when entering 1345 * from the destination in peer table, when entering
@@ -1347,9 +1352,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1347 */ 1352 */
1348 if (tmp_opt.saw_tstamp && 1353 if (tmp_opt.saw_tstamp &&
1349 tcp_death_row.sysctl_tw_recycle && 1354 tcp_death_row.sysctl_tw_recycle &&
1350 (dst = inet_csk_route_req(sk, req)) != NULL && 1355 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1351 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1356 fl4.daddr == saddr &&
1352 peer->v4daddr == saddr) { 1357 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1353 inet_peer_refcheck(peer); 1358 inet_peer_refcheck(peer);
1354 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1359 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1355 (s32)(peer->tcp_ts - req->ts_recent) > 1360 (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1413,19 +1418,16 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1413#ifdef CONFIG_TCP_MD5SIG 1418#ifdef CONFIG_TCP_MD5SIG
1414 struct tcp_md5sig_key *key; 1419 struct tcp_md5sig_key *key;
1415#endif 1420#endif
1421 struct ip_options_rcu *inet_opt;
1416 1422
1417 if (sk_acceptq_is_full(sk)) 1423 if (sk_acceptq_is_full(sk))
1418 goto exit_overflow; 1424 goto exit_overflow;
1419 1425
1420 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1421 goto exit;
1422
1423 newsk = tcp_create_openreq_child(sk, req, skb); 1426 newsk = tcp_create_openreq_child(sk, req, skb);
1424 if (!newsk) 1427 if (!newsk)
1425 goto exit; 1428 goto exit_nonewsk;
1426 1429
1427 newsk->sk_gso_type = SKB_GSO_TCPV4; 1430 newsk->sk_gso_type = SKB_GSO_TCPV4;
1428 sk_setup_caps(newsk, dst);
1429 1431
1430 newtp = tcp_sk(newsk); 1432 newtp = tcp_sk(newsk);
1431 newinet = inet_sk(newsk); 1433 newinet = inet_sk(newsk);
@@ -1433,18 +1435,24 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1433 newinet->inet_daddr = ireq->rmt_addr; 1435 newinet->inet_daddr = ireq->rmt_addr;
1434 newinet->inet_rcv_saddr = ireq->loc_addr; 1436 newinet->inet_rcv_saddr = ireq->loc_addr;
1435 newinet->inet_saddr = ireq->loc_addr; 1437 newinet->inet_saddr = ireq->loc_addr;
1436 newinet->opt = ireq->opt; 1438 inet_opt = ireq->opt;
1439 rcu_assign_pointer(newinet->inet_opt, inet_opt);
1437 ireq->opt = NULL; 1440 ireq->opt = NULL;
1438 newinet->mc_index = inet_iif(skb); 1441 newinet->mc_index = inet_iif(skb);
1439 newinet->mc_ttl = ip_hdr(skb)->ttl; 1442 newinet->mc_ttl = ip_hdr(skb)->ttl;
1440 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1443 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1441 if (newinet->opt) 1444 if (inet_opt)
1442 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; 1445 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1443 newinet->inet_id = newtp->write_seq ^ jiffies; 1446 newinet->inet_id = newtp->write_seq ^ jiffies;
1444 1447
1448 if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1449 goto put_and_exit;
1450
1451 sk_setup_caps(newsk, dst);
1452
1445 tcp_mtup_init(newsk); 1453 tcp_mtup_init(newsk);
1446 tcp_sync_mss(newsk, dst_mtu(dst)); 1454 tcp_sync_mss(newsk, dst_mtu(dst));
1447 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1455 newtp->advmss = dst_metric_advmss(dst);
1448 if (tcp_sk(sk)->rx_opt.user_mss && 1456 if (tcp_sk(sk)->rx_opt.user_mss &&
1449 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) 1457 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1450 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1458 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
@@ -1469,17 +1477,22 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1469 } 1477 }
1470#endif 1478#endif
1471 1479
1480 if (__inet_inherit_port(sk, newsk) < 0)
1481 goto put_and_exit;
1472 __inet_hash_nolisten(newsk, NULL); 1482 __inet_hash_nolisten(newsk, NULL);
1473 __inet_inherit_port(sk, newsk);
1474 1483
1475 return newsk; 1484 return newsk;
1476 1485
1477exit_overflow: 1486exit_overflow:
1478 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 1487 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488exit_nonewsk:
1489 dst_release(dst);
1479exit: 1490exit:
1480 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1491 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1481 dst_release(dst);
1482 return NULL; 1492 return NULL;
1493put_and_exit:
1494 sock_put(newsk);
1495 goto exit;
1483} 1496}
1484EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1497EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1485 1498
@@ -1560,12 +1573,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1560 1573
1561 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1574 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562 sock_rps_save_rxhash(sk, skb->rxhash); 1575 sock_rps_save_rxhash(sk, skb->rxhash);
1563 TCP_CHECK_TIMER(sk);
1564 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1576 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1565 rsk = sk; 1577 rsk = sk;
1566 goto reset; 1578 goto reset;
1567 } 1579 }
1568 TCP_CHECK_TIMER(sk);
1569 return 0; 1580 return 0;
1570 } 1581 }
1571 1582
@@ -1578,6 +1589,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1578 goto discard; 1589 goto discard;
1579 1590
1580 if (nsk != sk) { 1591 if (nsk != sk) {
1592 sock_rps_save_rxhash(nsk, skb->rxhash);
1581 if (tcp_child_process(sk, nsk, skb)) { 1593 if (tcp_child_process(sk, nsk, skb)) {
1582 rsk = nsk; 1594 rsk = nsk;
1583 goto reset; 1595 goto reset;
@@ -1587,13 +1599,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1587 } else 1599 } else
1588 sock_rps_save_rxhash(sk, skb->rxhash); 1600 sock_rps_save_rxhash(sk, skb->rxhash);
1589 1601
1590
1591 TCP_CHECK_TIMER(sk);
1592 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1602 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1593 rsk = sk; 1603 rsk = sk;
1594 goto reset; 1604 goto reset;
1595 } 1605 }
1596 TCP_CHECK_TIMER(sk);
1597 return 0; 1606 return 0;
1598 1607
1599reset: 1608reset:
@@ -1761,64 +1770,41 @@ do_time_wait:
1761 goto discard_it; 1770 goto discard_it;
1762} 1771}
1763 1772
1764/* VJ's idea. Save last timestamp seen from this destination 1773struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1765 * and hold it at least for normal timewait interval to use for duplicate
1766 * segment detection in subsequent connections, before they enter synchronized
1767 * state.
1768 */
1769
1770int tcp_v4_remember_stamp(struct sock *sk)
1771{ 1774{
1775 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1772 struct inet_sock *inet = inet_sk(sk); 1776 struct inet_sock *inet = inet_sk(sk);
1773 struct tcp_sock *tp = tcp_sk(sk); 1777 struct inet_peer *peer;
1774 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1775 struct inet_peer *peer = NULL;
1776 int release_it = 0;
1777 1778
1778 if (!rt || rt->rt_dst != inet->inet_daddr) { 1779 if (!rt ||
1779 peer = inet_getpeer(inet->inet_daddr, 1); 1780 inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1780 release_it = 1; 1781 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1782 *release_it = true;
1781 } else { 1783 } else {
1782 if (!rt->peer) 1784 if (!rt->peer)
1783 rt_bind_peer(rt, 1); 1785 rt_bind_peer(rt, inet->inet_daddr, 1);
1784 peer = rt->peer; 1786 peer = rt->peer;
1787 *release_it = false;
1785 } 1788 }
1786 1789
1787 if (peer) { 1790 return peer;
1788 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1789 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1790 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1791 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1792 peer->tcp_ts = tp->rx_opt.ts_recent;
1793 }
1794 if (release_it)
1795 inet_putpeer(peer);
1796 return 1;
1797 }
1798
1799 return 0;
1800} 1791}
1801EXPORT_SYMBOL(tcp_v4_remember_stamp); 1792EXPORT_SYMBOL(tcp_v4_get_peer);
1802 1793
1803int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) 1794void *tcp_v4_tw_get_peer(struct sock *sk)
1804{ 1795{
1805 struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); 1796 struct inet_timewait_sock *tw = inet_twsk(sk);
1806
1807 if (peer) {
1808 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1809
1810 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1811 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1812 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1813 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1814 peer->tcp_ts = tcptw->tw_ts_recent;
1815 }
1816 inet_putpeer(peer);
1817 return 1;
1818 }
1819 1797
1820 return 0; 1798 return inet_getpeer_v4(tw->tw_daddr, 1);
1821} 1799}
1800EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1801
1802static struct timewait_sock_ops tcp_timewait_sock_ops = {
1803 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1804 .twsk_unique = tcp_twsk_unique,
1805 .twsk_destructor= tcp_twsk_destructor,
1806 .twsk_getpeer = tcp_v4_tw_get_peer,
1807};
1822 1808
1823const struct inet_connection_sock_af_ops ipv4_specific = { 1809const struct inet_connection_sock_af_ops ipv4_specific = {
1824 .queue_xmit = ip_queue_xmit, 1810 .queue_xmit = ip_queue_xmit,
@@ -1826,7 +1812,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
1826 .rebuild_header = inet_sk_rebuild_header, 1812 .rebuild_header = inet_sk_rebuild_header,
1827 .conn_request = tcp_v4_conn_request, 1813 .conn_request = tcp_v4_conn_request,
1828 .syn_recv_sock = tcp_v4_syn_recv_sock, 1814 .syn_recv_sock = tcp_v4_syn_recv_sock,
1829 .remember_stamp = tcp_v4_remember_stamp, 1815 .get_peer = tcp_v4_get_peer,
1830 .net_header_len = sizeof(struct iphdr), 1816 .net_header_len = sizeof(struct iphdr),
1831 .setsockopt = ip_setsockopt, 1817 .setsockopt = ip_setsockopt,
1832 .getsockopt = ip_getsockopt, 1818 .getsockopt = ip_getsockopt,
@@ -2022,13 +2008,12 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
2022 } 2008 }
2023 req = req->dl_next; 2009 req = req->dl_next;
2024 } 2010 }
2025 st->offset = 0;
2026 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) 2011 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2027 break; 2012 break;
2028get_req: 2013get_req:
2029 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; 2014 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2030 } 2015 }
2031 sk = sk_next(st->syn_wait_sk); 2016 sk = sk_nulls_next(st->syn_wait_sk);
2032 st->state = TCP_SEQ_STATE_LISTENING; 2017 st->state = TCP_SEQ_STATE_LISTENING;
2033 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2018 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2034 } else { 2019 } else {
@@ -2037,11 +2022,13 @@ get_req:
2037 if (reqsk_queue_len(&icsk->icsk_accept_queue)) 2022 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2038 goto start_req; 2023 goto start_req;
2039 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 2024 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2040 sk = sk_next(sk); 2025 sk = sk_nulls_next(sk);
2041 } 2026 }
2042get_sk: 2027get_sk:
2043 sk_nulls_for_each_from(sk, node) { 2028 sk_nulls_for_each_from(sk, node) {
2044 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { 2029 if (!net_eq(sock_net(sk), net))
2030 continue;
2031 if (sk->sk_family == st->family) {
2045 cur = sk; 2032 cur = sk;
2046 goto out; 2033 goto out;
2047 } 2034 }
@@ -2385,7 +2372,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
2385 int ttd = req->expires - jiffies; 2372 int ttd = req->expires - jiffies;
2386 2373
2387 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2374 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2388 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", 2375 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2389 i, 2376 i,
2390 ireq->loc_addr, 2377 ireq->loc_addr,
2391 ntohs(inet_sk(sk)->inet_sport), 2378 ntohs(inet_sk(sk)->inet_sport),
@@ -2440,7 +2427,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2440 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); 2427 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2441 2428
2442 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " 2429 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2443 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", 2430 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2444 i, src, srcp, dest, destp, sk->sk_state, 2431 i, src, srcp, dest, destp, sk->sk_state,
2445 tp->write_seq - tp->snd_una, 2432 tp->write_seq - tp->snd_una,
2446 rx_queue, 2433 rx_queue,
@@ -2475,7 +2462,7 @@ static void get_timewait4_sock(struct inet_timewait_sock *tw,
2475 srcp = ntohs(tw->tw_sport); 2462 srcp = ntohs(tw->tw_sport);
2476 2463
2477 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2464 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2478 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", 2465 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2479 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2466 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2480 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, 2467 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2481 atomic_read(&tw->tw_refcnt), tw, len); 2468 atomic_read(&tw->tw_refcnt), tw, len);
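The %p to %pK conversions in the /proc/net/tcp formatters let socket addresses be hidden from unprivileged readers: %pK honours the kptr_restrict sysctl and prints zeros when the reader is not allowed to see kernel pointers. A tiny usage sketch:

#include <linux/seq_file.h>

/* Emit the socket address with %pK so it can be censored via
 * kptr_restrict for unprivileged readers of the seq file. */
static void show_sock_sketch(struct seq_file *f, const void *sk)
{
        seq_printf(f, "sock at %pK\n", sk);
}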
@@ -2553,7 +2540,7 @@ void tcp4_proc_exit(void)
2553 2540
2554struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2541struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2555{ 2542{
2556 struct iphdr *iph = skb_gro_network_header(skb); 2543 const struct iphdr *iph = skb_gro_network_header(skb);
2557 2544
2558 switch (skb->ip_summed) { 2545 switch (skb->ip_summed) {
2559 case CHECKSUM_COMPLETE: 2546 case CHECKSUM_COMPLETE:
@@ -2571,11 +2558,10 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2571 2558
2572 return tcp_gro_receive(head, skb); 2559 return tcp_gro_receive(head, skb);
2573} 2560}
2574EXPORT_SYMBOL(tcp4_gro_receive);
2575 2561
2576int tcp4_gro_complete(struct sk_buff *skb) 2562int tcp4_gro_complete(struct sk_buff *skb)
2577{ 2563{
2578 struct iphdr *iph = ip_hdr(skb); 2564 const struct iphdr *iph = ip_hdr(skb);
2579 struct tcphdr *th = tcp_hdr(skb); 2565 struct tcphdr *th = tcp_hdr(skb);
2580 2566
2581 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), 2567 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
@@ -2584,7 +2570,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
2584 2570
2585 return tcp_gro_complete(skb); 2571 return tcp_gro_complete(skb);
2586} 2572}
2587EXPORT_SYMBOL(tcp4_gro_complete);
2588 2573
2589struct proto tcp_prot = { 2574struct proto tcp_prot = {
2590 .name = "TCP", 2575 .name = "TCP",
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index de870377fbba..72f7218b03f5 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -12,7 +12,7 @@
12 * within cong_avoid. 12 * within cong_avoid.
13 * o Error correcting in remote HZ, therefore remote HZ will be keeped 13 * o Error correcting in remote HZ, therefore remote HZ will be keeped
14 * on checking and updating. 14 * on checking and updating.
15 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne 15 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
16 * OWD have a similar meaning as RTT. Also correct the buggy formular. 16 * OWD have a similar meaning as RTT. Also correct the buggy formular.
17 * o Handle reaction for Early Congestion Indication (ECI) within 17 * o Handle reaction for Early Congestion Indication (ECI) within
18 * pkts_acked, as mentioned within pseudo code. 18 * pkts_acked, as mentioned within pseudo code.
@@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
313 lp->last_drop = tcp_time_stamp; 313 lp->last_drop = tcp_time_stamp;
314} 314}
315 315
316static struct tcp_congestion_ops tcp_lp = { 316static struct tcp_congestion_ops tcp_lp __read_mostly = {
317 .flags = TCP_CONG_RTT_STAMP, 317 .flags = TCP_CONG_RTT_STAMP,
318 .init = tcp_lp_init, 318 .init = tcp_lp_init,
319 .ssthresh = tcp_reno_ssthresh, 319 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85cb..80b1f80759ab 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,13 +49,63 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static int tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return 1;
76 }
77
78 return 0;
79}
80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return 1;
98 }
99 return 0;
100}
101
52static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
53{ 103{
54 if (seq == s_win) 104 if (seq == s_win)
55 return 1; 105 return 1;
56 if (after(end_seq, s_win) && before(seq, e_win)) 106 if (after(end_seq, s_win) && before(seq, e_win))
57 return 1; 107 return 1;
58 return (seq == e_win && seq == end_seq); 108 return seq == e_win && seq == end_seq;
59} 109}
60 110
61/* 111/*
@@ -149,14 +199,9 @@ kill_with_rst:
149 tcptw->tw_ts_recent = tmp_opt.rcv_tsval; 199 tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
150 } 200 }
151 201
152 /* I am shamed, but failed to make it more elegant. 202 if (tcp_death_row.sysctl_tw_recycle &&
153 * Yes, it is direct reference to IP, which is impossible 203 tcptw->tw_ts_recent_stamp &&
154 * to generalize to IPv6. Taking into account that IPv6 204 tcp_tw_remember_stamp(tw))
155 * do not understand recycling in any case, it not
156 * a big problem in practice. --ANK */
157 if (tw->tw_family == AF_INET &&
158 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
159 tcp_v4_tw_remember_stamp(tw))
160 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, 205 inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
161 TCP_TIMEWAIT_LEN); 206 TCP_TIMEWAIT_LEN);
162 else 207 else
@@ -274,7 +319,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
274 int recycle_ok = 0; 319 int recycle_ok = 0;
275 320
276 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
277 recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
278 323
279 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 324 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
280 tw = inet_twsk_alloc(sk, state); 325 tw = inet_twsk_alloc(sk, state);
@@ -347,7 +392,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
347 * socket up. We've got bigger problems than 392 * socket up. We've got bigger problems than
348 * non-graceful socket closings. 393 * non-graceful socket closings.
349 */ 394 */
350 LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); 395 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
351 } 396 }
352 397
353 tcp_update_metrics(sk); 398 tcp_update_metrics(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd8458588..882e0b0964d0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -55,7 +55,7 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
55int sysctl_tcp_tso_win_divisor __read_mostly = 3; 55int sysctl_tcp_tso_win_divisor __read_mostly = 3;
56 56
57int sysctl_tcp_mtu_probing __read_mostly = 0; 57int sysctl_tcp_mtu_probing __read_mostly = 0;
58int sysctl_tcp_base_mss __read_mostly = 512; 58int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
59 59
60/* By default, RFC2861 behavior. */ 60/* By default, RFC2861 behavior. */
61int sysctl_tcp_slow_start_after_idle __read_mostly = 1; 61int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
@@ -73,7 +73,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
73 tcp_advance_send_head(sk, skb); 73 tcp_advance_send_head(sk, skb);
74 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 74 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
75 75
76 /* Don't override Nagle indefinately with F-RTO */ 76 /* Don't override Nagle indefinitely with F-RTO */
77 if (tp->frto_counter == 2) 77 if (tp->frto_counter == 2)
78 tp->frto_counter = 3; 78 tp->frto_counter = 3;
79 79
@@ -119,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk)
119 struct dst_entry *dst = __sk_dst_get(sk); 119 struct dst_entry *dst = __sk_dst_get(sk);
120 int mss = tp->advmss; 120 int mss = tp->advmss;
121 121
122 if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { 122 if (dst) {
123 mss = dst_metric(dst, RTAX_ADVMSS); 123 unsigned int metric = dst_metric_advmss(dst);
124 tp->advmss = mss; 124
125 if (metric < mss) {
126 mss = metric;
127 tp->advmss = mss;
128 }
125 } 129 }
126 130
127 return (__u16)mss; 131 return (__u16)mss;
@@ -224,24 +228,22 @@ void tcp_select_initial_window(int __space, __u32 mss,
224 } 228 }
225 } 229 }
226 230
227 /* Set initial window to value enough for senders, 231 /* Set initial window to a value enough for senders starting with
228 * following RFC2414. Senders, not following this RFC, 232 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
229 * will be satisfied with 2. 233 * a limit on the initial window when mss is larger than 1460.
230 */ 234 */
231 if (mss > (1 << *rcv_wscale)) { 235 if (mss > (1 << *rcv_wscale)) {
232 int init_cwnd = 4; 236 int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
233 if (mss > 1460 * 3) 237 if (mss > 1460)
234 init_cwnd = 2; 238 init_cwnd =
235 else if (mss > 1460) 239 max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
236 init_cwnd = 3;
237 /* when initializing use the value from init_rcv_wnd 240 /* when initializing use the value from init_rcv_wnd
238 * rather than the default from above 241 * rather than the default from above
239 */ 242 */
240 if (init_rcv_wnd && 243 if (init_rcv_wnd)
241 (*rcv_wnd > init_rcv_wnd * mss)) 244 *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
242 *rcv_wnd = init_rcv_wnd * mss; 245 else
243 else if (*rcv_wnd > init_cwnd * mss) 246 *rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
244 *rcv_wnd = init_cwnd * mss;
245 } 247 }
246 248
247 /* Set the clamp no higher than max representable value */ 249 /* Set the clamp no higher than max representable value */
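The receive side now starts from TCP_DEFAULT_INIT_RCVWND segments (to match senders that use a larger initial congestion window) and scales that down when the MSS exceeds 1460 bytes so the advertised window stays bounded in bytes, with a floor of two segments. A plain-arithmetic sketch, assuming TCP_DEFAULT_INIT_RCVWND is 10:

/* Initial receive window, in segments, for a given MSS. */
static unsigned int init_rcvwnd_segs_sketch(unsigned int mss)
{
        unsigned int init_cwnd = 10;    /* assumed TCP_DEFAULT_INIT_RCVWND */

        if (mss > 1460) {
                init_cwnd = (1460 * 10) / mss;
                if (init_cwnd < 2)
                        init_cwnd = 2;
        }
        return init_cwnd;
}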
@@ -392,27 +394,30 @@ struct tcp_out_options {
392 */ 394 */
393static u8 tcp_cookie_size_check(u8 desired) 395static u8 tcp_cookie_size_check(u8 desired)
394{ 396{
395 if (desired > 0) { 397 int cookie_size;
398
399 if (desired > 0)
396 /* previously specified */ 400 /* previously specified */
397 return desired; 401 return desired;
398 } 402
399 if (sysctl_tcp_cookie_size <= 0) { 403 cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
404 if (cookie_size <= 0)
400 /* no default specified */ 405 /* no default specified */
401 return 0; 406 return 0;
402 } 407
403 if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { 408 if (cookie_size <= TCP_COOKIE_MIN)
404 /* value too small, specify minimum */ 409 /* value too small, specify minimum */
405 return TCP_COOKIE_MIN; 410 return TCP_COOKIE_MIN;
406 } 411
407 if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { 412 if (cookie_size >= TCP_COOKIE_MAX)
408 /* value too large, specify maximum */ 413 /* value too large, specify maximum */
409 return TCP_COOKIE_MAX; 414 return TCP_COOKIE_MAX;
410 } 415
411 if (0x1 & sysctl_tcp_cookie_size) { 416 if (cookie_size & 1)
412 /* 8-bit multiple, illegal, fix it */ 417 /* 8-bit multiple, illegal, fix it */
413 return (u8)(sysctl_tcp_cookie_size + 0x1); 418 cookie_size++;
414 } 419
415 return (u8)sysctl_tcp_cookie_size; 420 return (u8)cookie_size;
416} 421}
417 422
418/* Write previously computed TCP options to the packet. 423/* Write previously computed TCP options to the packet.
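tcp_cookie_size_check() now snapshots sysctl_tcp_cookie_size once with ACCESS_ONCE() and performs every range check on the local copy; the sysctl can be rewritten from /proc at any moment, and re-reading it between checks could let an out-of-range value slip through (the same idiom is applied to sysctl_tcp_tso_win_divisor further down). A minimal sketch of the pattern, with an illustrative knob and bounds:

static int example_sysctl_knob;  /* may be updated concurrently via /proc */

/* Read the knob exactly once, then clamp the stable local copy. */
static int clamp_knob_sketch(void)
{
        int v = ACCESS_ONCE(example_sysctl_knob);

        if (v < 8)
                return 8;
        if (v > 128)
                return 128;
        return v;
}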
@@ -828,8 +833,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
828 &md5); 833 &md5);
829 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); 834 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
830 835
831 if (tcp_packets_in_flight(tp) == 0) 836 if (tcp_packets_in_flight(tp) == 0) {
832 tcp_ca_event(sk, CA_EVENT_TX_START); 837 tcp_ca_event(sk, CA_EVENT_TX_START);
838 skb->ooo_okay = 1;
839 } else
840 skb->ooo_okay = 0;
833 841
834 skb_push(skb, tcp_header_size); 842 skb_push(skb, tcp_header_size);
835 skb_reset_transport_header(skb); 843 skb_reset_transport_header(skb);
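The new ooo_okay bit is set only while nothing is in flight; the likely intent (not spelled out in this hunk) is that this is exactly when the transmit path may pick a different hardware queue for the flow without risking out-of-order delivery, and the bit is cleared again once packets are outstanding. A trivial sketch of the condition being recorded:

/* May this flow hop to another tx queue right now? Only when the
 * pipe is empty -- that is what ooo_okay records on the skb. */
static int may_change_tx_queue_sketch(unsigned int packets_in_flight)
{
        return packets_in_flight == 0;
}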
@@ -891,7 +899,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
891 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, 899 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
892 tcp_skb_pcount(skb)); 900 tcp_skb_pcount(skb));
893 901
894 err = icsk->icsk_af_ops->queue_xmit(skb); 902 err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl);
895 if (likely(err <= 0)) 903 if (likely(err <= 0))
896 return err; 904 return err;
897 905
@@ -995,7 +1003,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
995 int nlen; 1003 int nlen;
996 u8 flags; 1004 u8 flags;
997 1005
998 BUG_ON(len > skb->len); 1006 if (WARN_ON(len > skb->len))
1007 return -EINVAL;
999 1008
1000 nsize = skb_headlen(skb) - len; 1009 nsize = skb_headlen(skb) - len;
1001 if (nsize < 0) 1010 if (nsize < 0)
@@ -1342,7 +1351,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
1342 return 0; 1351 return 0;
1343} 1352}
1344 1353
1345/* Intialize TSO state of a skb. 1354/* Initialize TSO state of a skb.
1346 * This must be invoked the first time we consider transmitting 1355 * This must be invoked the first time we consider transmitting
1347 * SKB onto the wire. 1356 * SKB onto the wire.
1348 */ 1357 */
@@ -1376,9 +1385,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
1376 const struct sk_buff *skb, 1385 const struct sk_buff *skb,
1377 unsigned mss_now, int nonagle) 1386 unsigned mss_now, int nonagle)
1378{ 1387{
1379 return (skb->len < mss_now && 1388 return skb->len < mss_now &&
1380 ((nonagle & TCP_NAGLE_CORK) || 1389 ((nonagle & TCP_NAGLE_CORK) ||
1381 (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); 1390 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1382} 1391}
1383 1392
1384/* Return non-zero if the Nagle test allows this packet to be 1393/* Return non-zero if the Nagle test allows this packet to be
@@ -1449,10 +1458,10 @@ int tcp_may_send_now(struct sock *sk)
1449 struct tcp_sock *tp = tcp_sk(sk); 1458 struct tcp_sock *tp = tcp_sk(sk);
1450 struct sk_buff *skb = tcp_send_head(sk); 1459 struct sk_buff *skb = tcp_send_head(sk);
1451 1460
1452 return (skb && 1461 return skb &&
1453 tcp_snd_test(sk, skb, tcp_current_mss(sk), 1462 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1454 (tcp_skb_is_last(sk, skb) ? 1463 (tcp_skb_is_last(sk, skb) ?
1455 tp->nonagle : TCP_NAGLE_PUSH))); 1464 tp->nonagle : TCP_NAGLE_PUSH));
1456} 1465}
1457 1466
1458/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet 1467/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1519,6 +1528,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1519 struct tcp_sock *tp = tcp_sk(sk); 1528 struct tcp_sock *tp = tcp_sk(sk);
1520 const struct inet_connection_sock *icsk = inet_csk(sk); 1529 const struct inet_connection_sock *icsk = inet_csk(sk);
1521 u32 send_win, cong_win, limit, in_flight; 1530 u32 send_win, cong_win, limit, in_flight;
1531 int win_divisor;
1522 1532
1523 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) 1533 if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
1524 goto send_now; 1534 goto send_now;
@@ -1550,13 +1560,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1550 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) 1560 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1551 goto send_now; 1561 goto send_now;
1552 1562
1553 if (sysctl_tcp_tso_win_divisor) { 1563 win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
1564 if (win_divisor) {
1554 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1565 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1555 1566
1556 /* If at least some fraction of a window is available, 1567 /* If at least some fraction of a window is available,
1557 * just use it. 1568 * just use it.
1558 */ 1569 */
1559 chunk /= sysctl_tcp_tso_win_divisor; 1570 chunk /= win_divisor;
1560 if (limit >= chunk) 1571 if (limit >= chunk)
1561 goto send_now; 1572 goto send_now;
1562 } else { 1573 } else {
@@ -2152,7 +2163,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2152 if (!tp->retrans_stamp) 2163 if (!tp->retrans_stamp)
2153 tp->retrans_stamp = TCP_SKB_CB(skb)->when; 2164 tp->retrans_stamp = TCP_SKB_CB(skb)->when;
2154 2165
2155 tp->undo_retrans++; 2166 tp->undo_retrans += tcp_skb_pcount(skb);
2156 2167
2157 /* snd_nxt is stored to detect loss of retransmitted segment, 2168 /* snd_nxt is stored to detect loss of retransmitted segment,
2158 * see tcp_input.c tcp_sacktag_write_queue(). 2169 * see tcp_input.c tcp_sacktag_write_queue().
@@ -2421,7 +2432,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2421 2432
2422 skb_dst_set(skb, dst_clone(dst)); 2433 skb_dst_set(skb, dst_clone(dst));
2423 2434
2424 mss = dst_metric(dst, RTAX_ADVMSS); 2435 mss = dst_metric_advmss(dst);
2425 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2436 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2426 mss = tp->rx_opt.user_mss; 2437 mss = tp->rx_opt.user_mss;
2427 2438
@@ -2429,6 +2440,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2429 __u8 rcv_wscale; 2440 __u8 rcv_wscale;
2430 /* Set this up on the first call only */ 2441 /* Set this up on the first call only */
2431 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2442 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2443
2444 /* limit the window selection if the user enforce a smaller rx buffer */
2445 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2446 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
2447 req->window_clamp = tcp_full_space(sk);
2448
2432 /* tcp_full_space because it is guaranteed to be the first packet */ 2449 /* tcp_full_space because it is guaranteed to be the first packet */
2433 tcp_select_initial_window(tcp_full_space(sk), 2450 tcp_select_initial_window(tcp_full_space(sk),
2434 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2451 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2549,12 +2566,17 @@ static void tcp_connect_init(struct sock *sk)
2549 2566
2550 if (!tp->window_clamp) 2567 if (!tp->window_clamp)
2551 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2568 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2552 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 2569 tp->advmss = dst_metric_advmss(dst);
2553 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) 2570 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2554 tp->advmss = tp->rx_opt.user_mss; 2571 tp->advmss = tp->rx_opt.user_mss;
2555 2572
2556 tcp_initialize_rcv_mss(sk); 2573 tcp_initialize_rcv_mss(sk);
2557 2574
2575 /* limit the window selection if the user enforce a smaller rx buffer */
2576 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
2577 (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
2578 tp->window_clamp = tcp_full_space(sk);
2579
2558 tcp_select_initial_window(tcp_full_space(sk), 2580 tcp_select_initial_window(tcp_full_space(sk),
2559 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 2581 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
2560 &tp->rcv_wnd, 2582 &tp->rcv_wnd,
@@ -2587,6 +2609,7 @@ int tcp_connect(struct sock *sk)
2587{ 2609{
2588 struct tcp_sock *tp = tcp_sk(sk); 2610 struct tcp_sock *tp = tcp_sk(sk);
2589 struct sk_buff *buff; 2611 struct sk_buff *buff;
2612 int err;
2590 2613
2591 tcp_connect_init(sk); 2614 tcp_connect_init(sk);
2592 2615
@@ -2609,7 +2632,9 @@ int tcp_connect(struct sock *sk)
2609 sk->sk_wmem_queued += buff->truesize; 2632 sk->sk_wmem_queued += buff->truesize;
2610 sk_mem_charge(sk, buff->truesize); 2633 sk_mem_charge(sk, buff->truesize);
2611 tp->packets_out += tcp_skb_pcount(buff); 2634 tp->packets_out += tcp_skb_pcount(buff);
2612 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); 2635 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
2636 if (err == -ECONNREFUSED)
2637 return err;
2613 2638
2614 /* We change tp->snd_nxt after the tcp_transmit_skb() call 2639 /* We change tp->snd_nxt after the tcp_transmit_skb() call
2615 * in order to make this packet get counted in tcpOutSegs. 2640 * in order to make this packet get counted in tcpOutSegs.
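
The tcp_output.c hunks above do three things: fetch the advertised MSS through dst_metric_advmss(), clamp the offered receive window to tcp_full_space(sk) when userspace has locked the receive buffer, and let tcp_connect() propagate -ECONNREFUSED from transmitting the SYN. A minimal userspace sketch of the condition the window clamp reacts to; the address, port and buffer size are illustrative, and setting SO_RCVBUF is what marks the socket with SOCK_RCVBUF_LOCK:

/* Sketch only: pin the receive buffer before connect().  setsockopt(SO_RCVBUF)
 * sets sk->sk_userlocks |= SOCK_RCVBUF_LOCK, which is what the new
 * window_clamp checks in tcp_connect_init() and tcp_make_synack() test for.
 * With the tcp_connect() change, a locally detected -ECONNREFUSED while
 * sending the SYN is reported instead of being ignored.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int rcvbuf = 16 * 1024;		/* deliberately small, illustrative */
	struct sockaddr_in dst;

	if (fd < 0)
		return 1;
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");
	close(fd);
	return 0;
}
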
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e8..85ee7eb7e38e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -154,7 +154,7 @@ static int tcpprobe_sprint(char *tbuf, int n)
154 struct timespec tv 154 struct timespec tv
155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); 155 = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
156 156
157 return snprintf(tbuf, n, 157 return scnprintf(tbuf, n,
158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n", 158 "%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
159 (unsigned long) tv.tv_sec, 159 (unsigned long) tv.tv_sec,
160 (unsigned long) tv.tv_nsec, 160 (unsigned long) tv.tv_nsec,
@@ -174,7 +174,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
174 return -EINVAL; 174 return -EINVAL;
175 175
176 while (cnt < len) { 176 while (cnt < len) {
177 char tbuf[128]; 177 char tbuf[164];
178 int width; 178 int width;
179 179
180 /* Wait for data in buffer */ 180 /* Wait for data in buffer */
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
214 .owner = THIS_MODULE, 214 .owner = THIS_MODULE,
215 .open = tcpprobe_open, 215 .open = tcpprobe_open,
216 .read = tcpprobe_read, 216 .read = tcpprobe_read,
217 .llseek = noop_llseek,
217}; 218};
218 219
219static __init int tcpprobe_init(void) 220static __init int tcpprobe_init(void)
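
The tcp_probe.c changes are defensive: tcpprobe_read() uses the return value of tcpprobe_sprint() as a copy width, and snprintf() reports the length the output would have had even when it is truncated, while the kernel's scnprintf() reports what was actually stored; the line buffer also grows to 164 bytes and noop_llseek is wired up. A small userspace model of the difference, where my_scnprintf is an illustrative stand-in for the kernel helper:

/* Sketch only: snprintf() can return more than the buffer holds, scnprintf()
 * cannot, so using the return value as a copy width is only safe with the
 * latter.  my_scnprintf mirrors the clamping done in lib/vsprintf.c.
 */
#include <stdarg.h>
#include <stdio.h>

static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int i;

	va_start(args, fmt);
	i = vsnprintf(buf, size, fmt, args);
	va_end(args);
	if (i >= (int)size)		/* clamp to what actually fits */
		i = size ? (int)size - 1 : 0;
	return i;
}

int main(void)
{
	char tbuf[16];
	int want = snprintf(tbuf, sizeof(tbuf), "%s",
			    "a string longer than sixteen bytes");
	int got  = my_scnprintf(tbuf, sizeof(tbuf), "%s",
			    "a string longer than sixteen bytes");

	printf("snprintf reported %d, scnprintf-style reported %d, buffer holds %zu\n",
	       want, got, sizeof(tbuf));
	return 0;
}
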
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index a76513779e2b..8ce55b8aaec8 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
35} 35}
36 36
37 37
38static struct tcp_congestion_ops tcp_scalable = { 38static struct tcp_congestion_ops tcp_scalable __read_mostly = {
39 .ssthresh = tcp_scalable_ssthresh, 39 .ssthresh = tcp_scalable_ssthresh,
40 .cong_avoid = tcp_scalable_cong_avoid, 40 .cong_avoid = tcp_scalable_cong_avoid,
41 .min_cwnd = tcp_reno_min_cwnd, 41 .min_cwnd = tcp_reno_min_cwnd,
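
tcp_scalable is one of several objects in this commit (the Vegas, Veno, Westwood and YeAH congestion ops, the tunnel4 and xfrm tunnel handler lists, udp_table) that gain the __read_mostly annotation: they are written once at registration time and read on every packet, so they are moved into the .data..read_mostly section to avoid sharing cache lines with frequently written data. A hedged userspace model of the annotation, assuming the usual <linux/cache.h> definition:

/* Sketch only: place a write-once, read-often structure in its own section.
 * The struct and callback are made up; only the attribute mirrors the kernel.
 */
#include <stdio.h>

#define __read_mostly __attribute__((__section__(".data..read_mostly")))

struct tcp_congestion_ops {
	unsigned int (*ssthresh)(void *sk);
	const char *name;
};

static unsigned int dummy_ssthresh(void *sk) { (void)sk; return 2; }

static struct tcp_congestion_ops tcp_example __read_mostly = {
	.ssthresh = dummy_ssthresh,
	.name = "example",
};

int main(void)
{
	printf("%s: ssthresh=%u\n", tcp_example.name, tcp_example.ssthresh(NULL));
	return 0;
}
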
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74c54b30600f..ecd44b0c45f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -140,10 +140,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
140 */ 140 */
141static bool retransmits_timed_out(struct sock *sk, 141static bool retransmits_timed_out(struct sock *sk,
142 unsigned int boundary, 142 unsigned int boundary,
143 unsigned int timeout,
143 bool syn_set) 144 bool syn_set)
144{ 145{
145 unsigned int timeout, linear_backoff_thresh; 146 unsigned int linear_backoff_thresh, start_ts;
146 unsigned int start_ts;
147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; 147 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
148 148
149 if (!inet_csk(sk)->icsk_retransmits) 149 if (!inet_csk(sk)->icsk_retransmits)
@@ -154,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
154 else 154 else
155 start_ts = tcp_sk(sk)->retrans_stamp; 155 start_ts = tcp_sk(sk)->retrans_stamp;
156 156
157 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); 157 if (likely(timeout == 0)) {
158 158 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
159 if (boundary <= linear_backoff_thresh)
160 timeout = ((2 << boundary) - 1) * rto_base;
161 else
162 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
163 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
164 159
160 if (boundary <= linear_backoff_thresh)
161 timeout = ((2 << boundary) - 1) * rto_base;
162 else
163 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
164 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
165 }
165 return (tcp_time_stamp - start_ts) >= timeout; 166 return (tcp_time_stamp - start_ts) >= timeout;
166} 167}
167 168
@@ -178,7 +179,7 @@ static int tcp_write_timeout(struct sock *sk)
178 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; 179 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
179 syn_set = 1; 180 syn_set = 1;
180 } else { 181 } else {
181 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { 182 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
182 /* Black hole detection */ 183 /* Black hole detection */
183 tcp_mtu_probing(icsk, sk); 184 tcp_mtu_probing(icsk, sk);
184 185
@@ -191,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
191 192
192 retry_until = tcp_orphan_retries(sk, alive); 193 retry_until = tcp_orphan_retries(sk, alive);
193 do_reset = alive || 194 do_reset = alive ||
194 !retransmits_timed_out(sk, retry_until, 0); 195 !retransmits_timed_out(sk, retry_until, 0, 0);
195 196
196 if (tcp_out_of_resources(sk, do_reset)) 197 if (tcp_out_of_resources(sk, do_reset))
197 return 1; 198 return 1;
198 } 199 }
199 } 200 }
200 201
201 if (retransmits_timed_out(sk, retry_until, syn_set)) { 202 if (retransmits_timed_out(sk, retry_until,
203 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
202 /* Has it gone just too far? */ 204 /* Has it gone just too far? */
203 tcp_write_err(sk); 205 tcp_write_err(sk);
204 return 1; 206 return 1;
@@ -257,7 +259,6 @@ static void tcp_delack_timer(unsigned long data)
257 tcp_send_ack(sk); 259 tcp_send_ack(sk);
258 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); 260 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
259 } 261 }
260 TCP_CHECK_TIMER(sk);
261 262
262out: 263out:
263 if (tcp_memory_pressure) 264 if (tcp_memory_pressure)
@@ -365,18 +366,19 @@ void tcp_retransmit_timer(struct sock *sk)
365 if (icsk->icsk_retransmits == 0) { 366 if (icsk->icsk_retransmits == 0) {
366 int mib_idx; 367 int mib_idx;
367 368
368 if (icsk->icsk_ca_state == TCP_CA_Disorder) { 369 if (icsk->icsk_ca_state == TCP_CA_Recovery) {
369 if (tcp_is_sack(tp))
370 mib_idx = LINUX_MIB_TCPSACKFAILURES;
371 else
372 mib_idx = LINUX_MIB_TCPRENOFAILURES;
373 } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
374 if (tcp_is_sack(tp)) 370 if (tcp_is_sack(tp))
375 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; 371 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
376 else 372 else
377 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; 373 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
378 } else if (icsk->icsk_ca_state == TCP_CA_Loss) { 374 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
379 mib_idx = LINUX_MIB_TCPLOSSFAILURES; 375 mib_idx = LINUX_MIB_TCPLOSSFAILURES;
376 } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
377 tp->sacked_out) {
378 if (tcp_is_sack(tp))
379 mib_idx = LINUX_MIB_TCPSACKFAILURES;
380 else
381 mib_idx = LINUX_MIB_TCPRENOFAILURES;
380 } else { 382 } else {
381 mib_idx = LINUX_MIB_TCPTIMEOUTS; 383 mib_idx = LINUX_MIB_TCPTIMEOUTS;
382 } 384 }
@@ -440,7 +442,7 @@ out_reset_timer:
440 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); 442 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
441 } 443 }
442 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); 444 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
443 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) 445 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
444 __sk_dst_reset(sk); 446 __sk_dst_reset(sk);
445 447
446out:; 448out:;
@@ -478,7 +480,6 @@ static void tcp_write_timer(unsigned long data)
478 tcp_probe_timer(sk); 480 tcp_probe_timer(sk);
479 break; 481 break;
480 } 482 }
481 TCP_CHECK_TIMER(sk);
482 483
483out: 484out:
484 sk_mem_reclaim(sk); 485 sk_mem_reclaim(sk);
@@ -560,7 +561,14 @@ static void tcp_keepalive_timer (unsigned long data)
560 elapsed = keepalive_time_elapsed(tp); 561 elapsed = keepalive_time_elapsed(tp);
561 562
562 if (elapsed >= keepalive_time_when(tp)) { 563 if (elapsed >= keepalive_time_when(tp)) {
563 if (icsk->icsk_probes_out >= keepalive_probes(tp)) { 564 /* If the TCP_USER_TIMEOUT option is enabled, use that
565 * to determine when to timeout instead.
566 */
567 if ((icsk->icsk_user_timeout != 0 &&
568 elapsed >= icsk->icsk_user_timeout &&
569 icsk->icsk_probes_out > 0) ||
570 (icsk->icsk_user_timeout == 0 &&
571 icsk->icsk_probes_out >= keepalive_probes(tp))) {
564 tcp_send_active_reset(sk, GFP_ATOMIC); 572 tcp_send_active_reset(sk, GFP_ATOMIC);
565 tcp_write_err(sk); 573 tcp_write_err(sk);
566 goto out; 574 goto out;
@@ -579,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data)
579 elapsed = keepalive_time_when(tp) - elapsed; 587 elapsed = keepalive_time_when(tp) - elapsed;
580 } 588 }
581 589
582 TCP_CHECK_TIMER(sk);
583 sk_mem_reclaim(sk); 590 sk_mem_reclaim(sk);
584 591
585resched: 592resched:
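
Two behaviours interact in the tcp_timer.c hunks: retransmits_timed_out() now takes an explicit timeout so that TCP_USER_TIMEOUT (icsk_user_timeout) can override the retry-count bound at the final check, and the keepalive timer honours the same option when deciding to reset the connection. When no user timeout is given, the old conversion from a retransmit-count boundary to a wall-clock limit still applies; the standalone model below mirrors that computation (HZ, TCP_RTO_MIN and TCP_RTO_MAX are the usual kernel values, and with HZ set to 1000 the printed jiffies read as milliseconds):

/* Sketch only: model of the timeout == 0 branch of retransmits_timed_out(). */
#include <stdio.h>

#define HZ		1000
#define TCP_RTO_MIN	(HZ / 5)	/* 200 ms */
#define TCP_RTO_MAX	(120 * HZ)	/* 120 s  */

static unsigned int ilog2(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static unsigned int boundary_to_timeout(unsigned int boundary, unsigned int rto_base)
{
	unsigned int linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);

	if (boundary <= linear_backoff_thresh)
		return ((2 << boundary) - 1) * rto_base;
	return ((2 << linear_backoff_thresh) - 1) * rto_base +
	       (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
}

int main(void)
{
	unsigned int boundary;

	for (boundary = 1; boundary <= 15; boundary++)
		printf("boundary %2u -> timeout %6u ms\n",
		       boundary, boundary_to_timeout(boundary, TCP_RTO_MIN));
	return 0;
}

Applications opt in to the override with setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &msec, sizeof(msec)), which is what fills icsk_user_timeout.
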
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c6743eec9b7d..80fa2bfd7ede 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
304} 304}
305EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 305EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
306 306
307static struct tcp_congestion_ops tcp_vegas = { 307static struct tcp_congestion_ops tcp_vegas __read_mostly = {
308 .flags = TCP_CONG_RTT_STAMP, 308 .flags = TCP_CONG_RTT_STAMP,
309 .init = tcp_vegas_init, 309 .init = tcp_vegas_init,
310 .ssthresh = tcp_reno_ssthresh, 310 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index b612acf76183..ac43cd747bce 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -6,7 +6,7 @@
6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." 6 * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
7 * IEEE Journal on Selected Areas in Communication, 7 * IEEE Journal on Selected Areas in Communication,
8 * Feb. 2003. 8 * Feb. 2003.
9 * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf 9 * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/mm.h>
@@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
201 return max(tp->snd_cwnd >> 1U, 2U); 201 return max(tp->snd_cwnd >> 1U, 2U);
202} 202}
203 203
204static struct tcp_congestion_ops tcp_veno = { 204static struct tcp_congestion_ops tcp_veno __read_mostly = {
205 .flags = TCP_CONG_RTT_STAMP, 205 .flags = TCP_CONG_RTT_STAMP,
206 .init = tcp_veno_init, 206 .init = tcp_veno_init,
207 .ssthresh = tcp_veno_ssthresh, 207 .ssthresh = tcp_veno_ssthresh,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..1b91bf48e277 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
80 */ 80 */
81static inline u32 westwood_do_filter(u32 a, u32 b) 81static inline u32 westwood_do_filter(u32 a, u32 b)
82{ 82{
83 return (((7 * a) + b) >> 3); 83 return ((7 * a) + b) >> 3;
84} 84}
85 85
86static void westwood_filter(struct westwood *w, u32 delta) 86static void westwood_filter(struct westwood *w, u32 delta)
@@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
272} 272}
273 273
274 274
275static struct tcp_congestion_ops tcp_westwood = { 275static struct tcp_congestion_ops tcp_westwood __read_mostly = {
276 .init = tcp_westwood_init, 276 .init = tcp_westwood_init,
277 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
278 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
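
The tcp_westwood.c change to westwood_do_filter() only drops redundant parentheses; the helper itself is a 7/8 exponentially weighted moving average used to smooth bandwidth samples. A small standalone model of the filter, with made-up sample values:

/* Sketch only: the Westwood bandwidth filter, ((7 * a) + b) >> 3. */
#include <stdio.h>

static unsigned int westwood_do_filter(unsigned int a, unsigned int b)
{
	return ((7 * a) + b) >> 3;
}

int main(void)
{
	unsigned int est = 0;
	unsigned int samples[] = { 800, 900, 1000, 400, 1000, 1000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		est = westwood_do_filter(est, samples[i]);
		printf("sample %4u -> estimate %4u\n", samples[i], est);
	}
	return 0;
}
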
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a0f240358892..05c3b6f0e8e1 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -20,7 +20,7 @@
20#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss 20#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
21#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion 21#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
22#define TCP_YEAH_PHY 8 //lin maximum delta from base 22#define TCP_YEAH_PHY 8 //lin maximum delta from base
23#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss 23#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss
24#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count 24#define TCP_YEAH_ZETA 50 //lin minimum number of state switchs to reset reno_count
25 25
26#define TCP_SCALABLE_AI_CNT 100U 26#define TCP_SCALABLE_AI_CNT 100U
@@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
225 return tp->snd_cwnd - reduction; 225 return tp->snd_cwnd - reduction;
226} 226}
227 227
228static struct tcp_congestion_ops tcp_yeah = { 228static struct tcp_congestion_ops tcp_yeah __read_mostly = {
229 .flags = TCP_CONG_RTT_STAMP, 229 .flags = TCP_CONG_RTT_STAMP,
230 .init = tcp_yeah_init, 230 .init = tcp_yeah_init,
231 .ssthresh = tcp_yeah_ssthresh, 231 .ssthresh = tcp_yeah_ssthresh,
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 59186ca7808a..ac3b3ee4b07c 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,32 +14,37 @@
14#include <net/protocol.h> 14#include <net/protocol.h>
15#include <net/xfrm.h> 15#include <net/xfrm.h>
16 16
17static struct xfrm_tunnel *tunnel4_handlers; 17static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
18static struct xfrm_tunnel *tunnel64_handlers; 18static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
19static DEFINE_MUTEX(tunnel4_mutex); 19static DEFINE_MUTEX(tunnel4_mutex);
20 20
21static inline struct xfrm_tunnel **fam_handlers(unsigned short family) 21static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
22{ 22{
23 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; 23 return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
24} 24}
25 25
26int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) 26int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
27{ 27{
28 struct xfrm_tunnel **pprev; 28 struct xfrm_tunnel __rcu **pprev;
29 struct xfrm_tunnel *t;
30
29 int ret = -EEXIST; 31 int ret = -EEXIST;
30 int priority = handler->priority; 32 int priority = handler->priority;
31 33
32 mutex_lock(&tunnel4_mutex); 34 mutex_lock(&tunnel4_mutex);
33 35
34 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 36 for (pprev = fam_handlers(family);
35 if ((*pprev)->priority > priority) 37 (t = rcu_dereference_protected(*pprev,
38 lockdep_is_held(&tunnel4_mutex))) != NULL;
39 pprev = &t->next) {
40 if (t->priority > priority)
36 break; 41 break;
37 if ((*pprev)->priority == priority) 42 if (t->priority == priority)
38 goto err; 43 goto err;
39 } 44 }
40 45
41 handler->next = *pprev; 46 handler->next = *pprev;
42 *pprev = handler; 47 rcu_assign_pointer(*pprev, handler);
43 48
44 ret = 0; 49 ret = 0;
45 50
@@ -52,13 +57,17 @@ EXPORT_SYMBOL(xfrm4_tunnel_register);
52 57
53int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) 58int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
54{ 59{
55 struct xfrm_tunnel **pprev; 60 struct xfrm_tunnel __rcu **pprev;
61 struct xfrm_tunnel *t;
56 int ret = -ENOENT; 62 int ret = -ENOENT;
57 63
58 mutex_lock(&tunnel4_mutex); 64 mutex_lock(&tunnel4_mutex);
59 65
60 for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { 66 for (pprev = fam_handlers(family);
61 if (*pprev == handler) { 67 (t = rcu_dereference_protected(*pprev,
68 lockdep_is_held(&tunnel4_mutex))) != NULL;
69 pprev = &t->next) {
70 if (t == handler) {
62 *pprev = handler->next; 71 *pprev = handler->next;
63 ret = 0; 72 ret = 0;
64 break; 73 break;
@@ -73,6 +82,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
73} 82}
74EXPORT_SYMBOL(xfrm4_tunnel_deregister); 83EXPORT_SYMBOL(xfrm4_tunnel_deregister);
75 84
85#define for_each_tunnel_rcu(head, handler) \
86 for (handler = rcu_dereference(head); \
87 handler != NULL; \
88 handler = rcu_dereference(handler->next)) \
89
76static int tunnel4_rcv(struct sk_buff *skb) 90static int tunnel4_rcv(struct sk_buff *skb)
77{ 91{
78 struct xfrm_tunnel *handler; 92 struct xfrm_tunnel *handler;
@@ -80,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
80 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 94 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
81 goto drop; 95 goto drop;
82 96
83 for (handler = tunnel4_handlers; handler; handler = handler->next) 97 for_each_tunnel_rcu(tunnel4_handlers, handler)
84 if (!handler->handler(skb)) 98 if (!handler->handler(skb))
85 return 0; 99 return 0;
86 100
@@ -99,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
99 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 113 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
100 goto drop; 114 goto drop;
101 115
102 for (handler = tunnel64_handlers; handler; handler = handler->next) 116 for_each_tunnel_rcu(tunnel64_handlers, handler)
103 if (!handler->handler(skb)) 117 if (!handler->handler(skb))
104 return 0; 118 return 0;
105 119
@@ -115,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
115{ 129{
116 struct xfrm_tunnel *handler; 130 struct xfrm_tunnel *handler;
117 131
118 for (handler = tunnel4_handlers; handler; handler = handler->next) 132 for_each_tunnel_rcu(tunnel4_handlers, handler)
119 if (!handler->err_handler(skb, info)) 133 if (!handler->err_handler(skb, info))
120 break; 134 break;
121} 135}
@@ -125,7 +139,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
125{ 139{
126 struct xfrm_tunnel *handler; 140 struct xfrm_tunnel *handler;
127 141
128 for (handler = tunnel64_handlers; handler; handler = handler->next) 142 for_each_tunnel_rcu(tunnel64_handlers, handler)
129 if (!handler->err_handler(skb, info)) 143 if (!handler->err_handler(skb, info))
130 break; 144 break;
131} 145}
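
The tunnel4.c conversion makes the handler lists RCU-safe: writers still serialise on tunnel4_mutex and walk the list with rcu_dereference_protected(), publishing new entries with rcu_assign_pointer(), while the receive and error paths iterate locklessly through for_each_tunnel_rcu(). A userspace sketch of the same pattern; the rcu_* macros here are simplified stand-ins with no memory barriers, only to show the shape of the registration and traversal code:

/* Sketch only: priority-ordered handler list, mutex for writers, lockless
 * readers.  The rcu_* macros are placeholders for the kernel primitives.
 */
#include <stdio.h>
#include <pthread.h>

#define rcu_assign_pointer(p, v)	((p) = (v))
#define rcu_dereference(p)		(p)
#define rcu_dereference_protected(p)	(p)

struct tunnel {
	struct tunnel *next;
	int priority;
	const char *name;
};

static struct tunnel *tunnel4_handlers;
static pthread_mutex_t tunnel4_mutex = PTHREAD_MUTEX_INITIALIZER;

#define for_each_tunnel_rcu(head, h) \
	for (h = rcu_dereference(head); h != NULL; h = rcu_dereference(h->next))

static int tunnel4_register(struct tunnel *handler)
{
	struct tunnel **pprev, *t;
	int ret = -1;			/* -EEXIST in the kernel */

	pthread_mutex_lock(&tunnel4_mutex);
	for (pprev = &tunnel4_handlers;
	     (t = rcu_dereference_protected(*pprev)) != NULL;
	     pprev = &t->next) {
		if (t->priority > handler->priority)
			break;
		if (t->priority == handler->priority)
			goto err;
	}
	handler->next = *pprev;
	rcu_assign_pointer(*pprev, handler);
	ret = 0;
err:
	pthread_mutex_unlock(&tunnel4_mutex);
	return ret;
}

int main(void)
{
	struct tunnel a = { .priority = 2, .name = "xfrm" };
	struct tunnel b = { .priority = 1, .name = "ipip" };
	struct tunnel *h;

	tunnel4_register(&a);
	tunnel4_register(&b);
	for_each_tunnel_rcu(tunnel4_handlers, h)
		printf("%s (priority %d)\n", h->name, h->priority);
	return 0;
}
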
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fb23c2e63b52..198f75b7bdd3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -110,7 +110,7 @@
110struct udp_table udp_table __read_mostly; 110struct udp_table udp_table __read_mostly;
111EXPORT_SYMBOL(udp_table); 111EXPORT_SYMBOL(udp_table);
112 112
113int sysctl_udp_mem[3] __read_mostly; 113long sysctl_udp_mem[3] __read_mostly;
114EXPORT_SYMBOL(sysctl_udp_mem); 114EXPORT_SYMBOL(sysctl_udp_mem);
115 115
116int sysctl_udp_rmem_min __read_mostly; 116int sysctl_udp_rmem_min __read_mostly;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
119int sysctl_udp_wmem_min __read_mostly; 119int sysctl_udp_wmem_min __read_mostly;
120EXPORT_SYMBOL(sysctl_udp_wmem_min); 120EXPORT_SYMBOL(sysctl_udp_wmem_min);
121 121
122atomic_t udp_memory_allocated; 122atomic_long_t udp_memory_allocated;
123EXPORT_SYMBOL(udp_memory_allocated); 123EXPORT_SYMBOL(udp_memory_allocated);
124 124
125#define MAX_UDP_PORTS 65536 125#define MAX_UDP_PORTS 65536
@@ -189,7 +189,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
189 * @sk: socket struct in question 189 * @sk: socket struct in question
190 * @snum: port number to look up 190 * @snum: port number to look up
191 * @saddr_comp: AF-dependent comparison of bound local IP addresses 191 * @saddr_comp: AF-dependent comparison of bound local IP addresses
192 * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, 192 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
193 * with NULL address 193 * with NULL address
194 */ 194 */
195int udp_lib_get_port(struct sock *sk, unsigned short snum, 195int udp_lib_get_port(struct sock *sk, unsigned short snum,
@@ -430,7 +430,7 @@ begin:
430 430
431 if (result) { 431 if (result) {
432exact_match: 432exact_match:
433 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 433 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
434 result = NULL; 434 result = NULL;
435 else if (unlikely(compute_score2(result, net, saddr, sport, 435 else if (unlikely(compute_score2(result, net, saddr, sport,
436 daddr, hnum, dif) < badness)) { 436 daddr, hnum, dif) < badness)) {
@@ -500,7 +500,7 @@ begin:
500 goto begin; 500 goto begin;
501 501
502 if (result) { 502 if (result) {
503 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) 503 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
504 result = NULL; 504 result = NULL;
505 else if (unlikely(compute_score(result, net, saddr, hnum, sport, 505 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
506 daddr, dport, dif) < badness)) { 506 daddr, dport, dif) < badness)) {
@@ -578,7 +578,7 @@ found:
578void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) 578void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
579{ 579{
580 struct inet_sock *inet; 580 struct inet_sock *inet;
581 struct iphdr *iph = (struct iphdr *)skb->data; 581 const struct iphdr *iph = (const struct iphdr *)skb->data;
582 struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); 582 struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
583 const int type = icmp_hdr(skb)->type; 583 const int type = icmp_hdr(skb)->type;
584 const int code = icmp_hdr(skb)->code; 584 const int code = icmp_hdr(skb)->code;
@@ -663,75 +663,71 @@ void udp_flush_pending_frames(struct sock *sk)
663EXPORT_SYMBOL(udp_flush_pending_frames); 663EXPORT_SYMBOL(udp_flush_pending_frames);
664 664
665/** 665/**
666 * udp4_hwcsum_outgoing - handle outgoing HW checksumming 666 * udp4_hwcsum - handle outgoing HW checksumming
667 * @sk: socket we are sending on
668 * @skb: sk_buff containing the filled-in UDP header 667 * @skb: sk_buff containing the filled-in UDP header
669 * (checksum field must be zeroed out) 668 * (checksum field must be zeroed out)
669 * @src: source IP address
670 * @dst: destination IP address
670 */ 671 */
671static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, 672static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
672 __be32 src, __be32 dst, int len)
673{ 673{
674 unsigned int offset;
675 struct udphdr *uh = udp_hdr(skb); 674 struct udphdr *uh = udp_hdr(skb);
675 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
676 int offset = skb_transport_offset(skb);
677 int len = skb->len - offset;
678 int hlen = len;
676 __wsum csum = 0; 679 __wsum csum = 0;
677 680
678 if (skb_queue_len(&sk->sk_write_queue) == 1) { 681 if (!frags) {
679 /* 682 /*
680 * Only one fragment on the socket. 683 * Only one fragment on the socket.
681 */ 684 */
682 skb->csum_start = skb_transport_header(skb) - skb->head; 685 skb->csum_start = skb_transport_header(skb) - skb->head;
683 skb->csum_offset = offsetof(struct udphdr, check); 686 skb->csum_offset = offsetof(struct udphdr, check);
684 uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); 687 uh->check = ~csum_tcpudp_magic(src, dst, len,
688 IPPROTO_UDP, 0);
685 } else { 689 } else {
686 /* 690 /*
687 * HW-checksum won't work as there are two or more 691 * HW-checksum won't work as there are two or more
688 * fragments on the socket so that all csums of sk_buffs 692 * fragments on the socket so that all csums of sk_buffs
689 * should be together 693 * should be together
690 */ 694 */
691 offset = skb_transport_offset(skb); 695 do {
692 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); 696 csum = csum_add(csum, frags->csum);
697 hlen -= frags->len;
698 } while ((frags = frags->next));
693 699
700 csum = skb_checksum(skb, offset, hlen, csum);
694 skb->ip_summed = CHECKSUM_NONE; 701 skb->ip_summed = CHECKSUM_NONE;
695 702
696 skb_queue_walk(&sk->sk_write_queue, skb) {
697 csum = csum_add(csum, skb->csum);
698 }
699
700 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); 703 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
701 if (uh->check == 0) 704 if (uh->check == 0)
702 uh->check = CSUM_MANGLED_0; 705 uh->check = CSUM_MANGLED_0;
703 } 706 }
704} 707}
705 708
706/* 709static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
707 * Push out all pending data as one UDP datagram. Socket is locked.
708 */
709static int udp_push_pending_frames(struct sock *sk)
710{ 710{
711 struct udp_sock *up = udp_sk(sk); 711 struct sock *sk = skb->sk;
712 struct inet_sock *inet = inet_sk(sk); 712 struct inet_sock *inet = inet_sk(sk);
713 struct flowi *fl = &inet->cork.fl;
714 struct sk_buff *skb;
715 struct udphdr *uh; 713 struct udphdr *uh;
716 int err = 0; 714 int err = 0;
717 int is_udplite = IS_UDPLITE(sk); 715 int is_udplite = IS_UDPLITE(sk);
716 int offset = skb_transport_offset(skb);
717 int len = skb->len - offset;
718 __wsum csum = 0; 718 __wsum csum = 0;
719 719
720 /* Grab the skbuff where UDP header space exists. */
721 if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
722 goto out;
723
724 /* 720 /*
725 * Create a UDP header 721 * Create a UDP header
726 */ 722 */
727 uh = udp_hdr(skb); 723 uh = udp_hdr(skb);
728 uh->source = fl->fl_ip_sport; 724 uh->source = inet->inet_sport;
729 uh->dest = fl->fl_ip_dport; 725 uh->dest = fl4->fl4_dport;
730 uh->len = htons(up->len); 726 uh->len = htons(len);
731 uh->check = 0; 727 uh->check = 0;
732 728
733 if (is_udplite) /* UDP-Lite */ 729 if (is_udplite) /* UDP-Lite */
734 csum = udplite_csum_outgoing(sk, skb); 730 csum = udplite_csum(skb);
735 731
736 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ 732 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
737 733
@@ -740,20 +736,20 @@ static int udp_push_pending_frames(struct sock *sk)
740 736
741 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ 737 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
742 738
743 udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); 739 udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
744 goto send; 740 goto send;
745 741
746 } else /* `normal' UDP */ 742 } else
747 csum = udp_csum_outgoing(sk, skb); 743 csum = udp_csum(skb);
748 744
749 /* add protocol-dependent pseudo-header */ 745 /* add protocol-dependent pseudo-header */
750 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, 746 uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
751 sk->sk_protocol, csum); 747 sk->sk_protocol, csum);
752 if (uh->check == 0) 748 if (uh->check == 0)
753 uh->check = CSUM_MANGLED_0; 749 uh->check = CSUM_MANGLED_0;
754 750
755send: 751send:
756 err = ip_push_pending_frames(sk); 752 err = ip_send_skb(skb);
757 if (err) { 753 if (err) {
758 if (err == -ENOBUFS && !inet->recverr) { 754 if (err == -ENOBUFS && !inet->recverr) {
759 UDP_INC_STATS_USER(sock_net(sk), 755 UDP_INC_STATS_USER(sock_net(sk),
@@ -763,6 +759,26 @@ send:
763 } else 759 } else
764 UDP_INC_STATS_USER(sock_net(sk), 760 UDP_INC_STATS_USER(sock_net(sk),
765 UDP_MIB_OUTDATAGRAMS, is_udplite); 761 UDP_MIB_OUTDATAGRAMS, is_udplite);
762 return err;
763}
764
765/*
766 * Push out all pending data as one UDP datagram. Socket is locked.
767 */
768static int udp_push_pending_frames(struct sock *sk)
769{
770 struct udp_sock *up = udp_sk(sk);
771 struct inet_sock *inet = inet_sk(sk);
772 struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
773 struct sk_buff *skb;
774 int err = 0;
775
776 skb = ip_finish_skb(sk, fl4);
777 if (!skb)
778 goto out;
779
780 err = udp_send_skb(skb, fl4);
781
766out: 782out:
767 up->len = 0; 783 up->len = 0;
768 up->pending = 0; 784 up->pending = 0;
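
udp4_hwcsum() above handles datagrams built as a frag_list: the fragments' partial checksums are combined with csum_add(), the remaining linear part is covered by skb_checksum(), and the total is folded into the pseudo-header checksum. The property this relies on, that summing per fragment and summing in one pass give the same one's-complement result, is shown by the sketch below (fragments are kept even-length for simplicity; the kernel also copes with odd offsets):

/* Sketch only: per-fragment vs one-pass Internet checksum over the same data. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t csum_partial(const uint8_t *data, size_t len, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)data[i] << 8 | data[i + 1];
	if (len & 1)
		sum += (uint32_t)data[len - 1] << 8;
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	const char *frag1 = "01234567";	/* even-length fragments, illustrative */
	const char *frag2 = "abcdefgh";
	char whole[32];

	snprintf(whole, sizeof(whole), "%s%s", frag1, frag2);

	uint32_t per_frag = csum_partial((const uint8_t *)frag2, strlen(frag2),
			    csum_partial((const uint8_t *)frag1, strlen(frag1), 0));
	uint32_t one_pass = csum_partial((const uint8_t *)whole, strlen(whole), 0);

	printf("per-fragment fold 0x%04x, one-pass fold 0x%04x\n",
	       csum_fold(per_frag), csum_fold(one_pass));
	return 0;
}
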
@@ -774,6 +790,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
774{ 790{
775 struct inet_sock *inet = inet_sk(sk); 791 struct inet_sock *inet = inet_sk(sk);
776 struct udp_sock *up = udp_sk(sk); 792 struct udp_sock *up = udp_sk(sk);
793 struct flowi4 fl4_stack;
794 struct flowi4 *fl4;
777 int ulen = len; 795 int ulen = len;
778 struct ipcm_cookie ipc; 796 struct ipcm_cookie ipc;
779 struct rtable *rt = NULL; 797 struct rtable *rt = NULL;
@@ -785,6 +803,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
785 int err, is_udplite = IS_UDPLITE(sk); 803 int err, is_udplite = IS_UDPLITE(sk);
786 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; 804 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
787 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); 805 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
806 struct sk_buff *skb;
807 struct ip_options_data opt_copy;
788 808
789 if (len > 0xFFFF) 809 if (len > 0xFFFF)
790 return -EMSGSIZE; 810 return -EMSGSIZE;
@@ -797,8 +817,11 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
797 return -EOPNOTSUPP; 817 return -EOPNOTSUPP;
798 818
799 ipc.opt = NULL; 819 ipc.opt = NULL;
800 ipc.shtx.flags = 0; 820 ipc.tx_flags = 0;
801 821
822 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
823
824 fl4 = &inet->cork.fl.u.ip4;
802 if (up->pending) { 825 if (up->pending) {
803 /* 826 /*
804 * There are pending frames. 827 * There are pending frames.
@@ -845,7 +868,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
845 ipc.addr = inet->inet_saddr; 868 ipc.addr = inet->inet_saddr;
846 869
847 ipc.oif = sk->sk_bound_dev_if; 870 ipc.oif = sk->sk_bound_dev_if;
848 err = sock_tx_timestamp(msg, sk, &ipc.shtx); 871 err = sock_tx_timestamp(sk, &ipc.tx_flags);
849 if (err) 872 if (err)
850 return err; 873 return err;
851 if (msg->msg_controllen) { 874 if (msg->msg_controllen) {
@@ -856,22 +879,32 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
856 free = 1; 879 free = 1;
857 connected = 0; 880 connected = 0;
858 } 881 }
859 if (!ipc.opt) 882 if (!ipc.opt) {
860 ipc.opt = inet->opt; 883 struct ip_options_rcu *inet_opt;
884
885 rcu_read_lock();
886 inet_opt = rcu_dereference(inet->inet_opt);
887 if (inet_opt) {
888 memcpy(&opt_copy, inet_opt,
889 sizeof(*inet_opt) + inet_opt->opt.optlen);
890 ipc.opt = &opt_copy.opt;
891 }
892 rcu_read_unlock();
893 }
861 894
862 saddr = ipc.addr; 895 saddr = ipc.addr;
863 ipc.addr = faddr = daddr; 896 ipc.addr = faddr = daddr;
864 897
865 if (ipc.opt && ipc.opt->srr) { 898 if (ipc.opt && ipc.opt->opt.srr) {
866 if (!daddr) 899 if (!daddr)
867 return -EINVAL; 900 return -EINVAL;
868 faddr = ipc.opt->faddr; 901 faddr = ipc.opt->opt.faddr;
869 connected = 0; 902 connected = 0;
870 } 903 }
871 tos = RT_TOS(inet->tos); 904 tos = RT_TOS(inet->tos);
872 if (sock_flag(sk, SOCK_LOCALROUTE) || 905 if (sock_flag(sk, SOCK_LOCALROUTE) ||
873 (msg->msg_flags & MSG_DONTROUTE) || 906 (msg->msg_flags & MSG_DONTROUTE) ||
874 (ipc.opt && ipc.opt->is_strictroute)) { 907 (ipc.opt && ipc.opt->opt.is_strictroute)) {
875 tos |= RTO_ONLINK; 908 tos |= RTO_ONLINK;
876 connected = 0; 909 connected = 0;
877 } 910 }
@@ -888,22 +921,19 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
888 rt = (struct rtable *)sk_dst_check(sk, 0); 921 rt = (struct rtable *)sk_dst_check(sk, 0);
889 922
890 if (rt == NULL) { 923 if (rt == NULL) {
891 struct flowi fl = { .oif = ipc.oif,
892 .mark = sk->sk_mark,
893 .nl_u = { .ip4_u =
894 { .daddr = faddr,
895 .saddr = saddr,
896 .tos = tos } },
897 .proto = sk->sk_protocol,
898 .flags = inet_sk_flowi_flags(sk),
899 .uli_u = { .ports =
900 { .sport = inet->inet_sport,
901 .dport = dport } } };
902 struct net *net = sock_net(sk); 924 struct net *net = sock_net(sk);
903 925
904 security_sk_classify_flow(sk, &fl); 926 fl4 = &fl4_stack;
905 err = ip_route_output_flow(net, &rt, &fl, sk, 1); 927 flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
906 if (err) { 928 RT_SCOPE_UNIVERSE, sk->sk_protocol,
929 inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP,
930 faddr, saddr, dport, inet->inet_sport);
931
932 security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
933 rt = ip_route_output_flow(net, fl4, sk);
934 if (IS_ERR(rt)) {
935 err = PTR_ERR(rt);
936 rt = NULL;
907 if (err == -ENETUNREACH) 937 if (err == -ENETUNREACH)
908 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 938 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
909 goto out; 939 goto out;
@@ -921,9 +951,20 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
921 goto do_confirm; 951 goto do_confirm;
922back_from_confirm: 952back_from_confirm:
923 953
924 saddr = rt->rt_src; 954 saddr = fl4->saddr;
925 if (!ipc.addr) 955 if (!ipc.addr)
926 daddr = ipc.addr = rt->rt_dst; 956 daddr = ipc.addr = fl4->daddr;
957
958 /* Lockless fast path for the non-corking case. */
959 if (!corkreq) {
960 skb = ip_make_skb(sk, fl4, getfrag, msg->msg_iov, ulen,
961 sizeof(struct udphdr), &ipc, &rt,
962 msg->msg_flags);
963 err = PTR_ERR(skb);
964 if (skb && !IS_ERR(skb))
965 err = udp_send_skb(skb, fl4);
966 goto out;
967 }
927 968
928 lock_sock(sk); 969 lock_sock(sk);
929 if (unlikely(up->pending)) { 970 if (unlikely(up->pending)) {
@@ -938,18 +979,18 @@ back_from_confirm:
938 /* 979 /*
939 * Now cork the socket to pend data. 980 * Now cork the socket to pend data.
940 */ 981 */
941 inet->cork.fl.fl4_dst = daddr; 982 fl4 = &inet->cork.fl.u.ip4;
942 inet->cork.fl.fl_ip_dport = dport; 983 fl4->daddr = daddr;
943 inet->cork.fl.fl4_src = saddr; 984 fl4->saddr = saddr;
944 inet->cork.fl.fl_ip_sport = inet->inet_sport; 985 fl4->fl4_dport = dport;
986 fl4->fl4_sport = inet->inet_sport;
945 up->pending = AF_INET; 987 up->pending = AF_INET;
946 988
947do_append_data: 989do_append_data:
948 up->len += ulen; 990 up->len += ulen;
949 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; 991 err = ip_append_data(sk, fl4, getfrag, msg->msg_iov, ulen,
950 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, 992 sizeof(struct udphdr), &ipc, &rt,
951 sizeof(struct udphdr), &ipc, &rt, 993 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
952 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
953 if (err) 994 if (err)
954 udp_flush_pending_frames(sk); 995 udp_flush_pending_frames(sk);
955 else if (!corkreq) 996 else if (!corkreq)
@@ -989,6 +1030,7 @@ EXPORT_SYMBOL(udp_sendmsg);
989int udp_sendpage(struct sock *sk, struct page *page, int offset, 1030int udp_sendpage(struct sock *sk, struct page *page, int offset,
990 size_t size, int flags) 1031 size_t size, int flags)
991{ 1032{
1033 struct inet_sock *inet = inet_sk(sk);
992 struct udp_sock *up = udp_sk(sk); 1034 struct udp_sock *up = udp_sk(sk);
993 int ret; 1035 int ret;
994 1036
@@ -1013,7 +1055,8 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
1013 return -EINVAL; 1055 return -EINVAL;
1014 } 1056 }
1015 1057
1016 ret = ip_append_page(sk, page, offset, size, flags); 1058 ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
1059 page, offset, size, flags);
1017 if (ret == -EOPNOTSUPP) { 1060 if (ret == -EOPNOTSUPP) {
1018 release_sock(sk); 1061 release_sock(sk);
1019 return sock_no_sendpage(sk->sk_socket, page, offset, 1062 return sock_no_sendpage(sk->sk_socket, page, offset,
@@ -1206,6 +1249,9 @@ csum_copy_err:
1206 1249
1207 if (noblock) 1250 if (noblock)
1208 return -EAGAIN; 1251 return -EAGAIN;
1252
1253 /* starting over for a new packet */
1254 msg->msg_flags &= ~MSG_TRUNC;
1209 goto try_again; 1255 goto try_again;
1210} 1256}
1211 1257
@@ -1413,7 +1459,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1413 } 1459 }
1414 } 1460 }
1415 1461
1416 if (sk->sk_filter) { 1462 if (rcu_dereference_raw(sk->sk_filter)) {
1417 if (udp_lib_checksum_complete(skb)) 1463 if (udp_lib_checksum_complete(skb))
1418 goto drop; 1464 goto drop;
1419 } 1465 }
@@ -1899,6 +1945,7 @@ struct proto udp_prot = {
1899 .compat_setsockopt = compat_udp_setsockopt, 1945 .compat_setsockopt = compat_udp_setsockopt,
1900 .compat_getsockopt = compat_udp_getsockopt, 1946 .compat_getsockopt = compat_udp_getsockopt,
1901#endif 1947#endif
1948 .clear_sk = sk_prot_clear_portaddr_nulls,
1902}; 1949};
1903EXPORT_SYMBOL(udp_prot); 1950EXPORT_SYMBOL(udp_prot);
1904 1951
@@ -2046,7 +2093,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2046 __u16 srcp = ntohs(inet->inet_sport); 2093 __u16 srcp = ntohs(inet->inet_sport);
2047 2094
2048 seq_printf(f, "%5d: %08X:%04X %08X:%04X" 2095 seq_printf(f, "%5d: %08X:%04X %08X:%04X"
2049 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", 2096 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d%n",
2050 bucket, src, srcp, dest, destp, sp->sk_state, 2097 bucket, src, srcp, dest, destp, sp->sk_state,
2051 sk_wmem_alloc_get(sp), 2098 sk_wmem_alloc_get(sp),
2052 sk_rmem_alloc_get(sp), 2099 sk_rmem_alloc_get(sp),
@@ -2162,16 +2209,10 @@ void __init udp_table_init(struct udp_table *table, const char *name)
2162 2209
2163void __init udp_init(void) 2210void __init udp_init(void)
2164{ 2211{
2165 unsigned long nr_pages, limit; 2212 unsigned long limit;
2166 2213
2167 udp_table_init(&udp_table, "UDP"); 2214 udp_table_init(&udp_table, "UDP");
2168 /* Set the pressure threshold up by the same strategy of TCP. It is a 2215 limit = nr_free_buffer_pages() / 8;
2169 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
2170 * toward zero with the amount of memory, with a floor of 128 pages.
2171 */
2172 nr_pages = totalram_pages - totalhigh_pages;
2173 limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2174 limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2175 limit = max(limit, 128UL); 2216 limit = max(limit, 128UL);
2176 sysctl_udp_mem[0] = limit / 4 * 3; 2217 sysctl_udp_mem[0] = limit / 4 * 3;
2177 sysctl_udp_mem[1] = limit; 2218 sysctl_udp_mem[1] = limit;
@@ -2200,7 +2241,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2200 return 0; 2241 return 0;
2201} 2242}
2202 2243
2203struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) 2244struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2204{ 2245{
2205 struct sk_buff *segs = ERR_PTR(-EINVAL); 2246 struct sk_buff *segs = ERR_PTR(-EINVAL);
2206 unsigned int mss; 2247 unsigned int mss;
@@ -2228,7 +2269,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
2228 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot 2269 /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
2229 * do checksum of UDP packets sent as multiple IP fragments. 2270 * do checksum of UDP packets sent as multiple IP fragments.
2230 */ 2271 */
2231 offset = skb->csum_start - skb_headroom(skb); 2272 offset = skb_checksum_start_offset(skb);
2232 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2273 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2233 offset += skb->csum_offset; 2274 offset += skb->csum_offset;
2234 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2275 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
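
The udp_sendmsg() rework above splits transmission into udp_send_skb() and adds a lockless fast path: when the socket is not corked and MSG_MORE is not set, the datagram is built in one go with ip_make_skb() and sent without taking the socket lock, while corked sockets keep the ip_append_data()/udp_push_pending_frames() path. From userspace the two paths look roughly like this; the destination is illustrative and deliberately unreachable:

/* Sketch only: an uncorked sendto() takes the new fast path, UDP_CORK keeps
 * the corked path where frames are appended under the socket lock.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef UDP_CORK
#define UDP_CORK 1			/* from linux/udp.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst;
	int on = 1, off = 0;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);	/* discard */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

	/* uncorked datagram: one skb, lockless fast path */
	sendto(fd, "fast", 4, 0, (struct sockaddr *)&dst, sizeof(dst));

	/* corked datagram: appended under the socket lock and pushed out as
	 * one packet when the cork is removed */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	sendto(fd, "slow ", 5, 0, (struct sockaddr *)&dst, sizeof(dst));
	sendto(fd, "path",  4, 0, (struct sockaddr *)&dst, sizeof(dst));
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));

	close(fd);
	return 0;
}
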
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ab76aa928fa9..aee9963f7f5a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -57,6 +57,7 @@ struct proto udplite_prot = {
57 .compat_setsockopt = compat_udp_setsockopt, 57 .compat_setsockopt = compat_udp_setsockopt,
58 .compat_getsockopt = compat_udp_getsockopt, 58 .compat_getsockopt = compat_udp_getsockopt,
59#endif 59#endif
60 .clear_sk = sk_prot_clear_portaddr_nulls,
60}; 61};
61EXPORT_SYMBOL(udplite_prot); 62EXPORT_SYMBOL(udplite_prot);
62 63
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 6f368413eb0e..534972e114ac 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -56,7 +56,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 56 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
57 ip_select_ident(top_iph, dst->child, NULL); 57 ip_select_ident(top_iph, dst->child, NULL);
58 58
59 top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); 59 top_iph->ttl = ip4_dst_hoplimit(dst->child);
60 60
61 top_iph->saddr = x->props.saddr.a4; 61 top_iph->saddr = x->props.saddr.a4;
62 top_iph->daddr = x->id.daddr.a4; 62 top_iph->daddr = x->id.daddr.a4;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 571aa96a175c..327a617d594c 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -32,7 +32,12 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
32 dst = skb_dst(skb); 32 dst = skb_dst(skb);
33 mtu = dst_mtu(dst); 33 mtu = dst_mtu(dst);
34 if (skb->len > mtu) { 34 if (skb->len > mtu) {
35 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 35 if (skb->sk)
36 ip_local_error(skb->sk, EMSGSIZE, ip_hdr(skb)->daddr,
37 inet_sk(skb->sk)->inet_dport, mtu);
38 else
39 icmp_send(skb, ICMP_DEST_UNREACH,
40 ICMP_FRAG_NEEDED, htonl(mtu));
36 ret = -EMSGSIZE; 41 ret = -EMSGSIZE;
37 } 42 }
38out: 43out:
@@ -69,7 +74,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
69} 74}
70EXPORT_SYMBOL(xfrm4_prepare_output); 75EXPORT_SYMBOL(xfrm4_prepare_output);
71 76
72static int xfrm4_output_finish(struct sk_buff *skb) 77int xfrm4_output_finish(struct sk_buff *skb)
73{ 78{
74#ifdef CONFIG_NETFILTER 79#ifdef CONFIG_NETFILTER
75 if (!skb_dst(skb)->xfrm) { 80 if (!skb_dst(skb)->xfrm) {
@@ -86,7 +91,11 @@ static int xfrm4_output_finish(struct sk_buff *skb)
86 91
87int xfrm4_output(struct sk_buff *skb) 92int xfrm4_output(struct sk_buff *skb)
88{ 93{
94 struct dst_entry *dst = skb_dst(skb);
95 struct xfrm_state *x = dst->xfrm;
96
89 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, 97 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
90 NULL, skb_dst(skb)->dev, xfrm4_output_finish, 98 NULL, dst->dev,
99 x->outer_mode->afinfo->output_finish,
91 !(IPCB(skb)->flags & IPSKB_REROUTED)); 100 !(IPCB(skb)->flags & IPSKB_REROUTED));
92} 101}
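
The xfrm4_output.c change reports an over-MTU, locally generated packet back to the sending socket with ip_local_error() (EMSGSIZE) instead of sending an ICMP fragmentation-needed message to ourselves; the report is delivered through the socket error queue when IP_RECVERR is enabled. The sketch below only shows the retrieval side of that mechanism and assumes an IPsec tunnel that actually shrinks the path MTU is already configured; on a plain socket the send may simply succeed or fragment:

/* Sketch only: drain the IP error queue where ip_local_error() queues the
 * EMSGSIZE report.  Destination and datagram size are illustrative.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <linux/errqueue.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int on = 1;
	struct sockaddr_in dst;
	char payload[2000] = { 0 };
	char cbuf[256];
	struct msghdr msg;
	struct cmsghdr *cm;

	setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	sendto(fd, payload, sizeof(payload), 0,
	       (struct sockaddr *)&dst, sizeof(dst));

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);
	if (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) >= 0) {
		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
			if (cm->cmsg_level == IPPROTO_IP &&
			    cm->cmsg_type == IP_RECVERR) {
				struct sock_extended_err *ee =
					(struct sock_extended_err *)CMSG_DATA(cm);
				printf("queued errno %d, mtu hint %u\n",
				       ee->ee_errno, ee->ee_info);
			}
		}
	}
	close(fd);
	return 0;
}
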
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a580349f0b8a..981e43eaf704 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -11,57 +11,60 @@
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/inetdevice.h> 13#include <linux/inetdevice.h>
14#include <linux/if_tunnel.h>
14#include <net/dst.h> 15#include <net/dst.h>
15#include <net/xfrm.h> 16#include <net/xfrm.h>
16#include <net/ip.h> 17#include <net/ip.h>
17 18
18static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
19 20
20static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, 21static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
21 xfrm_address_t *saddr, 22 int tos,
22 xfrm_address_t *daddr) 23 const xfrm_address_t *saddr,
24 const xfrm_address_t *daddr)
23{ 25{
24 struct flowi fl = {
25 .nl_u = {
26 .ip4_u = {
27 .tos = tos,
28 .daddr = daddr->a4,
29 },
30 },
31 };
32 struct dst_entry *dst;
33 struct rtable *rt; 26 struct rtable *rt;
34 int err;
35 27
28 memset(fl4, 0, sizeof(*fl4));
29 fl4->daddr = daddr->a4;
30 fl4->flowi4_tos = tos;
36 if (saddr) 31 if (saddr)
37 fl.fl4_src = saddr->a4; 32 fl4->saddr = saddr->a4;
33
34 rt = __ip_route_output_key(net, fl4);
35 if (!IS_ERR(rt))
36 return &rt->dst;
38 37
39 err = __ip_route_output_key(net, &rt, &fl); 38 return ERR_CAST(rt);
40 dst = &rt->dst; 39}
41 if (err) 40
42 dst = ERR_PTR(err); 41static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
43 return dst; 42 const xfrm_address_t *saddr,
43 const xfrm_address_t *daddr)
44{
45 struct flowi4 fl4;
46
47 return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr);
44} 48}
45 49
46static int xfrm4_get_saddr(struct net *net, 50static int xfrm4_get_saddr(struct net *net,
47 xfrm_address_t *saddr, xfrm_address_t *daddr) 51 xfrm_address_t *saddr, xfrm_address_t *daddr)
48{ 52{
49 struct dst_entry *dst; 53 struct dst_entry *dst;
50 struct rtable *rt; 54 struct flowi4 fl4;
51 55
52 dst = xfrm4_dst_lookup(net, 0, NULL, daddr); 56 dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr);
53 if (IS_ERR(dst)) 57 if (IS_ERR(dst))
54 return -EHOSTUNREACH; 58 return -EHOSTUNREACH;
55 59
56 rt = (struct rtable *)dst; 60 saddr->a4 = fl4.saddr;
57 saddr->a4 = rt->rt_src;
58 dst_release(dst); 61 dst_release(dst);
59 return 0; 62 return 0;
60} 63}
61 64
62static int xfrm4_get_tos(struct flowi *fl) 65static int xfrm4_get_tos(const struct flowi *fl)
63{ 66{
64 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ 67 return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
65} 68}
66 69
67static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 70static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -71,19 +74,22 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
71} 74}
72 75
73static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, 76static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
74 struct flowi *fl) 77 const struct flowi *fl)
75{ 78{
76 struct rtable *rt = (struct rtable *)xdst->route; 79 struct rtable *rt = (struct rtable *)xdst->route;
80 const struct flowi4 *fl4 = &fl->u.ip4;
77 81
78 xdst->u.rt.fl = *fl; 82 rt->rt_key_dst = fl4->daddr;
83 rt->rt_key_src = fl4->saddr;
84 rt->rt_key_tos = fl4->flowi4_tos;
85 rt->rt_route_iif = fl4->flowi4_iif;
86 rt->rt_iif = fl4->flowi4_iif;
87 rt->rt_oif = fl4->flowi4_oif;
88 rt->rt_mark = fl4->flowi4_mark;
79 89
80 xdst->u.dst.dev = dev; 90 xdst->u.dst.dev = dev;
81 dev_hold(dev); 91 dev_hold(dev);
82 92
83 xdst->u.rt.idev = in_dev_get(dev);
84 if (!xdst->u.rt.idev)
85 return -ENODEV;
86
87 xdst->u.rt.peer = rt->peer; 93 xdst->u.rt.peer = rt->peer;
88 if (rt->peer) 94 if (rt->peer)
89 atomic_inc(&rt->peer->refcnt); 95 atomic_inc(&rt->peer->refcnt);
@@ -104,11 +110,12 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
104static void 110static void
105_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) 111_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
106{ 112{
107 struct iphdr *iph = ip_hdr(skb); 113 const struct iphdr *iph = ip_hdr(skb);
108 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 114 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
115 struct flowi4 *fl4 = &fl->u.ip4;
109 116
110 memset(fl, 0, sizeof(struct flowi)); 117 memset(fl4, 0, sizeof(struct flowi4));
111 fl->mark = skb->mark; 118 fl4->flowi4_mark = skb->mark;
112 119
113 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 120 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
114 switch (iph->protocol) { 121 switch (iph->protocol) {
@@ -121,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
121 pskb_may_pull(skb, xprth + 4 - skb->data)) { 128 pskb_may_pull(skb, xprth + 4 - skb->data)) {
122 __be16 *ports = (__be16 *)xprth; 129 __be16 *ports = (__be16 *)xprth;
123 130
124 fl->fl_ip_sport = ports[!!reverse]; 131 fl4->fl4_sport = ports[!!reverse];
125 fl->fl_ip_dport = ports[!reverse]; 132 fl4->fl4_dport = ports[!reverse];
126 } 133 }
127 break; 134 break;
128 135
@@ -130,8 +137,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
130 if (pskb_may_pull(skb, xprth + 2 - skb->data)) { 137 if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
131 u8 *icmp = xprth; 138 u8 *icmp = xprth;
132 139
133 fl->fl_icmp_type = icmp[0]; 140 fl4->fl4_icmp_type = icmp[0];
134 fl->fl_icmp_code = icmp[1]; 141 fl4->fl4_icmp_code = icmp[1];
135 } 142 }
136 break; 143 break;
137 144
@@ -139,7 +146,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
139 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 146 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
140 __be32 *ehdr = (__be32 *)xprth; 147 __be32 *ehdr = (__be32 *)xprth;
141 148
142 fl->fl_ipsec_spi = ehdr[0]; 149 fl4->fl4_ipsec_spi = ehdr[0];
143 } 150 }
144 break; 151 break;
145 152
@@ -147,7 +154,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
147 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 154 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
148 __be32 *ah_hdr = (__be32*)xprth; 155 __be32 *ah_hdr = (__be32*)xprth;
149 156
150 fl->fl_ipsec_spi = ah_hdr[1]; 157 fl4->fl4_ipsec_spi = ah_hdr[1];
151 } 158 }
152 break; 159 break;
153 160
@@ -155,18 +162,32 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
155 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 162 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
156 __be16 *ipcomp_hdr = (__be16 *)xprth; 163 __be16 *ipcomp_hdr = (__be16 *)xprth;
157 164
158 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 165 fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
166 }
167 break;
168
169 case IPPROTO_GRE:
170 if (pskb_may_pull(skb, xprth + 12 - skb->data)) {
171 __be16 *greflags = (__be16 *)xprth;
172 __be32 *gre_hdr = (__be32 *)xprth;
173
174 if (greflags[0] & GRE_KEY) {
175 if (greflags[0] & GRE_CSUM)
176 gre_hdr++;
177 fl4->fl4_gre_key = gre_hdr[1];
178 }
159 } 179 }
160 break; 180 break;
181
161 default: 182 default:
162 fl->fl_ipsec_spi = 0; 183 fl4->fl4_ipsec_spi = 0;
163 break; 184 break;
164 } 185 }
165 } 186 }
166 fl->proto = iph->protocol; 187 fl4->flowi4_proto = iph->protocol;
167 fl->fl4_dst = reverse ? iph->saddr : iph->daddr; 188 fl4->daddr = reverse ? iph->saddr : iph->daddr;
168 fl->fl4_src = reverse ? iph->daddr : iph->saddr; 189 fl4->saddr = reverse ? iph->daddr : iph->saddr;
169 fl->fl4_tos = iph->tos; 190 fl4->flowi4_tos = iph->tos;
170} 191}
171 192
172static inline int xfrm4_garbage_collect(struct dst_ops *ops) 193static inline int xfrm4_garbage_collect(struct dst_ops *ops)
@@ -174,7 +195,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
174 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); 195 struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
175 196
176 xfrm4_policy_afinfo.garbage_collect(net); 197 xfrm4_policy_afinfo.garbage_collect(net);
177 return (atomic_read(&ops->entries) > ops->gc_thresh * 2); 198 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
178} 199}
179 200
180static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) 201static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -189,37 +210,20 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
189{ 210{
190 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 211 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
191 212
192 if (likely(xdst->u.rt.idev)) 213 dst_destroy_metrics_generic(dst);
193 in_dev_put(xdst->u.rt.idev); 214
194 if (likely(xdst->u.rt.peer)) 215 if (likely(xdst->u.rt.peer))
195 inet_putpeer(xdst->u.rt.peer); 216 inet_putpeer(xdst->u.rt.peer);
217
196 xfrm_dst_destroy(xdst); 218 xfrm_dst_destroy(xdst);
197} 219}
198 220
199static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 221static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
200 int unregister) 222 int unregister)
201{ 223{
202 struct xfrm_dst *xdst;
203
204 if (!unregister) 224 if (!unregister)
205 return; 225 return;
206 226
207 xdst = (struct xfrm_dst *)dst;
208 if (xdst->u.rt.idev->dev == dev) {
209 struct in_device *loopback_idev =
210 in_dev_get(dev_net(dev)->loopback_dev);
211 BUG_ON(!loopback_idev);
212
213 do {
214 in_dev_put(xdst->u.rt.idev);
215 xdst->u.rt.idev = loopback_idev;
216 in_dev_hold(loopback_idev);
217 xdst = (struct xfrm_dst *)xdst->u.dst.child;
218 } while (xdst->u.dst.xfrm);
219
220 __in_dev_put(loopback_idev);
221 }
222
223 xfrm_dst_ifdown(dst, dev); 227 xfrm_dst_ifdown(dst, dev);
224} 228}
225 229
@@ -228,11 +232,11 @@ static struct dst_ops xfrm4_dst_ops = {
228 .protocol = cpu_to_be16(ETH_P_IP), 232 .protocol = cpu_to_be16(ETH_P_IP),
229 .gc = xfrm4_garbage_collect, 233 .gc = xfrm4_garbage_collect,
230 .update_pmtu = xfrm4_update_pmtu, 234 .update_pmtu = xfrm4_update_pmtu,
235 .cow_metrics = dst_cow_metrics_generic,
231 .destroy = xfrm4_dst_destroy, 236 .destroy = xfrm4_dst_destroy,
232 .ifdown = xfrm4_dst_ifdown, 237 .ifdown = xfrm4_dst_ifdown,
233 .local_out = __ip_local_out, 238 .local_out = __ip_local_out,
234 .gc_thresh = 1024, 239 .gc_thresh = 1024,
235 .entries = ATOMIC_INIT(0),
236}; 240};
237 241
238static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { 242static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -244,6 +248,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
244 .get_tos = xfrm4_get_tos, 248 .get_tos = xfrm4_get_tos,
245 .init_path = xfrm4_init_path, 249 .init_path = xfrm4_init_path,
246 .fill_dst = xfrm4_fill_dst, 250 .fill_dst = xfrm4_fill_dst,
251 .blackhole_route = ipv4_blackhole_route,
247}; 252};
248 253
249#ifdef CONFIG_SYSCTL 254#ifdef CONFIG_SYSCTL
@@ -288,6 +293,7 @@ void __init xfrm4_init(int rt_max_size)
288 * and start cleaning when were 1/2 full 293 * and start cleaning when were 1/2 full
289 */ 294 */
290 xfrm4_dst_ops.gc_thresh = rt_max_size/2; 295 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
296 dst_entries_init(&xfrm4_dst_ops);
291 297
292 xfrm4_state_init(); 298 xfrm4_state_init();
293 xfrm4_policy_init(); 299 xfrm4_policy_init();
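
Besides the flowi to flowi4 conversion, _decode_session4() learns to key IPsec policy lookups on the GRE key: when the GRE flags word has GRE_KEY set, the key (skipping the optional checksum word when GRE_CSUM is also set) is copied into fl4->fl4_gre_key. A standalone model of that parsing step; the flag values mirror linux/if_tunnel.h and the sample header is made up:

/* Sketch only: pull the key out of a GRE header the way the new
 * IPPROTO_GRE case does, keeping it in network byte order.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define GRE_CSUM	htons(0x8000)
#define GRE_KEY		htons(0x2000)

static int gre_extract_key(const uint8_t *xprth, uint32_t *key)
{
	uint16_t greflags;
	const uint8_t *p = xprth;

	memcpy(&greflags, xprth, sizeof(greflags));
	if (!(greflags & GRE_KEY))
		return 0;
	p += 4;				/* flags + protocol */
	if (greflags & GRE_CSUM)
		p += 4;			/* checksum + reserved */
	memcpy(key, p, sizeof(*key));
	return 1;
}

int main(void)
{
	/* 12 bytes, as much as the kernel pulls: key flag set, key 0xdeadbeef */
	uint8_t hdr[12] = { 0x20, 0x00, 0x08, 0x00,
			    0xde, 0xad, 0xbe, 0xef };
	uint32_t key;

	if (gre_extract_key(hdr, &key))
		printf("gre key 0x%08x\n", ntohl(key));
	return 0;
}
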
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 47947624eccc..d9ac0a0058b5 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) 24__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
25{ 25{
26 sel->daddr.a4 = fl->fl4_dst; 26 const struct flowi4 *fl4 = &fl->u.ip4;
27 sel->saddr.a4 = fl->fl4_src; 27
28 sel->dport = xfrm_flowi_dport(fl); 28 sel->daddr.a4 = fl4->daddr;
29 sel->saddr.a4 = fl4->saddr;
30 sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
29 sel->dport_mask = htons(0xffff); 31 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl); 32 sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
31 sel->sport_mask = htons(0xffff); 33 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET; 34 sel->family = AF_INET;
33 sel->prefixlen_d = 32; 35 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32; 36 sel->prefixlen_s = 32;
35 sel->proto = fl->proto; 37 sel->proto = fl4->flowi4_proto;
36 sel->ifindex = fl->oif; 38 sel->ifindex = fl4->flowi4_oif;
37} 39}
38 40
39static void 41static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, 42xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr) 43 const xfrm_address_t *daddr, const xfrm_address_t *saddr)
42{ 44{
43 x->id = tmpl->id; 45 x->id = tmpl->id;
44 if (x->id.daddr.a4 == 0) 46 if (x->id.daddr.a4 == 0)
@@ -53,7 +55,7 @@ xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
53 55
54int xfrm4_extract_header(struct sk_buff *skb) 56int xfrm4_extract_header(struct sk_buff *skb)
55{ 57{
56 struct iphdr *iph = ip_hdr(skb); 58 const struct iphdr *iph = ip_hdr(skb);
57 59
58 XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); 60 XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
59 XFRM_MODE_SKB_CB(skb)->id = iph->id; 61 XFRM_MODE_SKB_CB(skb)->id = iph->id;
@@ -76,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
76 .init_tempsel = __xfrm4_init_tempsel, 78 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop, 79 .init_temprop = xfrm4_init_temprop,
78 .output = xfrm4_output, 80 .output = xfrm4_output,
81 .output_finish = xfrm4_output_finish,
79 .extract_input = xfrm4_extract_input, 82 .extract_input = xfrm4_extract_input,
80 .extract_output = xfrm4_extract_output, 83 .extract_output = xfrm4_extract_output,
81 .transport_finish = xfrm4_transport_finish, 84 .transport_finish = xfrm4_transport_finish,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
58 return -ENOENT; 58 return -ENOENT;
59} 59}
60 60
61static struct xfrm_tunnel xfrm_tunnel_handler = { 61static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
62 .handler = xfrm_tunnel_rcv, 62 .handler = xfrm_tunnel_rcv,
63 .err_handler = xfrm_tunnel_err, 63 .err_handler = xfrm_tunnel_err,
64 .priority = 2, 64 .priority = 2,
65}; 65};
66 66
67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
68static struct xfrm_tunnel xfrm64_tunnel_handler = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
69 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
70 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
71 .priority = 2, 71 .priority = 2,