Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 42
-rw-r--r--  net/ipv4/Makefile | 4
-rw-r--r--  net/ipv4/af_inet.c | 62
-rw-r--r--  net/ipv4/ah4.c | 27
-rw-r--r--  net/ipv4/arp.c | 36
-rw-r--r--  net/ipv4/datagram.c | 11
-rw-r--r--  net/ipv4/devinet.c | 140
-rw-r--r--  net/ipv4/esp4.c | 104
-rw-r--r--  net/ipv4/fib_frontend.c | 213
-rw-r--r--  net/ipv4/fib_hash.c | 1133
-rw-r--r--  net/ipv4/fib_lookup.h | 13
-rw-r--r--  net/ipv4/fib_rules.c | 25
-rw-r--r--  net/ipv4/fib_semantics.c | 258
-rw-r--r--  net/ipv4/fib_trie.c | 282
-rw-r--r--  net/ipv4/icmp.c | 240
-rw-r--r--  net/ipv4/igmp.c | 45
-rw-r--r--  net/ipv4/inet_connection_sock.c | 27
-rw-r--r--  net/ipv4/inet_diag.c | 2
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 2
-rw-r--r--  net/ipv4/inetpeer.c | 150
-rw-r--r--  net/ipv4/ip_gre.c | 57
-rw-r--r--  net/ipv4/ip_input.c | 2
-rw-r--r--  net/ipv4/ip_options.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 345
-rw-r--r--  net/ipv4/ipip.c | 41
-rw-r--r--  net/ipv4/ipmr.c | 155
-rw-r--r--  net/ipv4/netfilter.c | 36
-rw-r--r--  net/ipv4/netfilter/Kconfig | 13
-rw-r--r--  net/ipv4/netfilter/Makefile | 1
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 5
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c | 6
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 9
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 12
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 3
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 134
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 33
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 9
-rw-r--r--  net/ipv4/raw.c | 59
-rw-r--r--  net/ipv4/route.c | 1190
-rw-r--r--  net/ipv4/syncookies.c | 25
-rw-r--r--  net/ipv4/tcp.c | 20
-rw-r--r--  net/ipv4/tcp_bic.c | 2
-rw-r--r--  net/ipv4/tcp_cubic.c | 47
-rw-r--r--  net/ipv4/tcp_highspeed.c | 2
-rw-r--r--  net/ipv4/tcp_htcp.c | 2
-rw-r--r--  net/ipv4/tcp_hybla.c | 2
-rw-r--r--  net/ipv4/tcp_illinois.c | 2
-rw-r--r--  net/ipv4/tcp_input.c | 33
-rw-r--r--  net/ipv4/tcp_ipv4.c | 38
-rw-r--r--  net/ipv4/tcp_lp.c | 2
-rw-r--r--  net/ipv4/tcp_output.c | 2
-rw-r--r--  net/ipv4/tcp_scalable.c | 2
-rw-r--r--  net/ipv4/tcp_timer.c | 3
-rw-r--r--  net/ipv4/tcp_vegas.c | 2
-rw-r--r--  net/ipv4/tcp_veno.c | 2
-rw-r--r--  net/ipv4/tcp_westwood.c | 2
-rw-r--r--  net/ipv4/tcp_yeah.c | 2
-rw-r--r--  net/ipv4/udp.c | 139
-rw-r--r--  net/ipv4/xfrm4_policy.c | 74
-rw-r--r--  net/ipv4/xfrm4_state.c | 20
64 files changed, 2271 insertions(+), 3120 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d1..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
-choice
-	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
-	depends on IP_ADVANCED_ROUTER
-	default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algorithm.
-	  This improves lookup performance if you have a large
-	  number of routes.
-
-	  LC-trie is a longest matching prefix lookup algorithm which
-	  performs better than FIB_HASH for large routing tables.
-	  But, it consumes more memory and is more complex.
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
-	  June 1999
-
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
-	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
 config IP_FIB_TRIE_STATS
 	bool "FIB TRIE statistics"
-	depends on IP_FIB_TRIE
+	depends on IP_ADVANCED_ROUTER
 	---help---
 	  Keep track of statistics on structure of FIB TRIE table.
 	  Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
 	  handled by the klogd daemon which is responsible for kernel messages
 	  ("man klogd").
 
+config IP_ROUTE_CLASSID
+	bool
+
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
 	help
@@ -657,4 +624,3 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a75..0dc772d0d125 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     fib_frontend.o fib_semantics.o \
+	     fib_frontend.o fib_semantics.o fib_trie.o \
 	     inet_fragment.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2b61107df6c..807d83c02ef6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -880,6 +880,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 }
 EXPORT_SYMBOL(inet_ioctl);
 
+#ifdef CONFIG_COMPAT
+int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	int err = -ENOIOCTLCMD;
+
+	if (sk->sk_prot->compat_ioctl)
+		err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
+
+	return err;
+}
+#endif
+
 const struct proto_ops inet_stream_ops = {
 	.family		   = PF_INET,
 	.owner		   = THIS_MODULE,
@@ -903,6 +916,7 @@ const struct proto_ops inet_stream_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl	   = inet_compat_ioctl,
 #endif
 };
 EXPORT_SYMBOL(inet_stream_ops);
@@ -929,6 +943,7 @@ const struct proto_ops inet_dgram_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl	   = inet_compat_ioctl,
 #endif
 };
 EXPORT_SYMBOL(inet_dgram_ops);
@@ -959,6 +974,7 @@ static const struct proto_ops inet_sockraw_ops = {
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
+	.compat_ioctl	   = inet_compat_ioctl,
 #endif
 };
 
@@ -1085,23 +1101,20 @@ int sysctl_ip_dynaddr __read_mostly;
 static int inet_sk_reselect_saddr(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int err;
-	struct rtable *rt;
 	__be32 old_saddr = inet->inet_saddr;
-	__be32 new_saddr;
 	__be32 daddr = inet->inet_daddr;
+	struct rtable *rt;
+	__be32 new_saddr;
 
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
 
 	/* Query new route. */
-	err = ip_route_connect(&rt, daddr, 0,
-			       RT_CONN_FLAGS(sk),
-			       sk->sk_bound_dev_if,
-			       sk->sk_protocol,
-			       inet->inet_sport, inet->inet_dport, sk, 0);
-	if (err)
-		return err;
+	rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk),
+			      sk->sk_bound_dev_if, sk->sk_protocol,
+			      inet->inet_sport, inet->inet_dport, sk, false);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
 
 	sk_setup_caps(sk, &rt->dst);
 
@@ -1144,25 +1157,16 @@ int inet_sk_rebuild_header(struct sock *sk)
 	daddr = inet->inet_daddr;
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
-{
-	struct flowi fl = {
-		.oif = sk->sk_bound_dev_if,
-		.mark = sk->sk_mark,
-		.fl4_dst = daddr,
-		.fl4_src = inet->inet_saddr,
-		.fl4_tos = RT_CONN_FLAGS(sk),
-		.proto = sk->sk_protocol,
-		.flags = inet_sk_flowi_flags(sk),
-		.fl_ip_sport = inet->inet_sport,
-		.fl_ip_dport = inet->inet_dport,
-	};
-
-	security_sk_classify_flow(sk, &fl);
-	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
-	if (!err)
+	rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr,
+				   inet->inet_dport, inet->inet_sport,
+				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_bound_dev_if);
+	if (!IS_ERR(rt)) {
+		err = 0;
 		sk_setup_caps(sk, &rt->dst);
-	else {
+	} else {
+		err = PTR_ERR(rt);
+
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
 		/*
@@ -1215,7 +1219,7 @@ out:
 	return err;
 }
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct iphdr *iph;
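A note on the pattern running through this file: ip_route_connect() and ip_route_output_ports() now return the route itself, encoding failure in the pointer value rather than through a separate int plus an output parameter. A minimal standalone sketch of that idiom follows; the ERR_PTR/IS_ERR/PTR_ERR helpers imitate linux/err.h, and route_lookup() with its error value is invented for illustration.

#include <stdio.h>

#define MAX_ERRNO 4095

/* Imitations of the linux/err.h helpers: a small negative errno is
 * smuggled inside an otherwise impossible pointer value. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct rtable { int ifindex; };		/* stand-in for the kernel type */

/* Hypothetical lookup: fails with -101 (ENETUNREACH) for address 0. */
static struct rtable *route_lookup(unsigned int daddr)
{
	static struct rtable rt = { .ifindex = 2 };

	if (daddr == 0)
		return ERR_PTR(-101);
	return &rt;
}

int main(void)
{
	struct rtable *rt = route_lookup(0);

	if (IS_ERR(rt))			/* one return carries both outcomes */
		printf("lookup failed: %ld\n", PTR_ERR(rt));

	rt = route_lookup(0x0a000001);
	if (!IS_ERR(rt))
		printf("route via ifindex %d\n", rt->ifindex);
	return 0;
}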
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 86961bec70ab..4286fd3cc0e2 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
-	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
-	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
-	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
-		goto out;
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
 
 	if (!pskb_may_pull(skb, ah_hlen))
 		goto out;
@@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x)
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
-					  ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	x->data = ahp;
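The AH hunks above only switch the rounding granularity of the header length when XFRM_STATE_ALIGN4 is set (IPv4 AH needs only 32-bit alignment). A tiny standalone sketch of the power-of-two rounding that the XFRM_ALIGN4/XFRM_ALIGN8 macros perform; ALIGN_TO and the byte counts here are illustrative, not the kernel macros.

#include <stdio.h>

/* Round len up to a power-of-two boundary, the shape of the
 * XFRM_ALIGN4/XFRM_ALIGN8 macros used above. */
#define ALIGN_TO(len, a) (((len) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int raw = 12 + 16;	/* e.g. AH header + a 16-byte ICV */

	printf("%u bytes -> align4: %u, align8: %u\n",
	       raw, ALIGN_TO(raw, 4u), ALIGN_TO(raw, 8u));
	/* prints: 28 bytes -> align4: 28, align8: 32 */
	return 0;
}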
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 04c8b69fd426..090d273d7865 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -433,14 +433,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
 
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
-	struct flowi fl = { .fl4_dst = sip,
-			    .fl4_src = tip };
 	struct rtable *rt;
 	int flag = 0;
 	/*unsigned long now; */
 	struct net *net = dev_net(dev);
 
-	if (ip_route_output_key(net, &rt, &fl) < 0)
+	rt = ip_route_output(net, sip, tip, 0, 0);
+	if (IS_ERR(rt))
 		return 1;
 	if (rt->dst.dev != dev) {
 		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
@@ -1017,14 +1016,13 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
 		IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
 		return 0;
 	}
-	if (__in_dev_get_rcu(dev)) {
-		IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on);
+	if (__in_dev_get_rtnl(dev)) {
+		IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
 		return 0;
 	}
 	return -ENXIO;
 }
 
-/* must be called with rcu_read_lock() */
 static int arp_req_set_public(struct net *net, struct arpreq *r,
 			      struct net_device *dev)
 {
@@ -1062,12 +1060,10 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .fl4_dst = ip,
-				    .fl4_tos = RTO_ONLINK };
-		struct rtable *rt;
-		err = ip_route_output_key(net, &rt, &fl);
-		if (err != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
@@ -1178,7 +1174,6 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
 static int arp_req_delete(struct net *net, struct arpreq *r,
 			  struct net_device *dev)
 {
-	int err;
 	__be32 ip;
 
 	if (r->arp_flags & ATF_PUBL)
@@ -1186,12 +1181,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .fl4_dst = ip,
-				    .fl4_tos = RTO_ONLINK };
-		struct rtable *rt;
-		err = ip_route_output_key(net, &rt, &fl);
-		if (err != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
@@ -1233,10 +1225,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	if (!(r.arp_flags & ATF_NETMASK))
 		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
 							   htonl(0xFFFFFFFFUL);
-	rcu_read_lock();
+	rtnl_lock();
 	if (r.arp_dev[0]) {
 		err = -ENODEV;
-		dev = dev_get_by_name_rcu(net, r.arp_dev);
+		dev = __dev_get_by_name(net, r.arp_dev);
 		if (dev == NULL)
 			goto out;
 
@@ -1263,7 +1255,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		break;
 	}
 out:
-	rcu_read_unlock();
+	rtnl_unlock();
 	if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
 		err = -EFAULT;
 	return err;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 174be6caa5c8..85bd24ca4f6d 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -46,11 +46,12 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		if (!saddr)
 			saddr = inet->mc_addr;
 	}
-	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
+	rt = ip_route_connect(usin->sin_addr.s_addr, saddr,
 			      RT_CONN_FLAGS(sk), oif,
 			      sk->sk_protocol,
-			      inet->inet_sport, usin->sin_port, sk, 1);
-	if (err) {
+			      inet->inet_sport, usin->sin_port, sk, true);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 		return err;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 748cb5b337bd..5345b0bee6df 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -63,6 +64,8 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 
+#include "fib_lookup.h"
+
 static struct ipv4_devconf ipv4_devconf = {
 	.data = {
 		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
@@ -92,6 +95,85 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]		= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value.  So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE	256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+
+	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
+		(IN4_ADDR_HSIZE - 1));
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
+
+	spin_lock(&inet_addr_hash_lock);
+	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+	spin_lock(&inet_addr_hash_lock);
+	hlist_del_init_rcu(&ifa->hash);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	unsigned int hash = inet_addr_hash(net, addr);
+	struct net_device *result = NULL;
+	struct in_ifaddr *ifa;
+	struct hlist_node *node;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+		struct net_device *dev = ifa->ifa_dev->dev;
+
+		if (!net_eq(dev_net(dev), net))
+			continue;
+		if (ifa->ifa_local == addr) {
+			result = dev;
+			break;
+		}
+	}
+	if (!result) {
+		struct flowi4 fl4 = { .daddr = addr };
+		struct fib_result res = { 0 };
+		struct fib_table *local;
+
+		/* Fallback to FIB local table so that communication
+		 * over loopback subnets work.
+		 */
+		local = fib_get_table(net, RT_TABLE_LOCAL);
+		if (local &&
+		    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+		    res.type == RTN_LOCAL)
+			result = FIB_RES_DEV(res);
+	}
+	if (result && devref)
+		dev_hold(result);
+	rcu_read_unlock();
+	return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
 static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +347,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	}
 
 	if (!do_promote) {
+		inet_hash_remove(ifa);
 		*ifap1 = ifa->ifa_next;
 
 		rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 	}
 
+	/* On promotion all secondaries from subnet are changing
+	 * the primary IP, we must remove all their routes silently
+	 * and later to add them back with new prefsrc. Do this
+	 * while all addresses are on the device list.
+	 */
+	for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa))
+			fib_del_ifaddr(ifa, ifa1);
+	}
+
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
+	inet_hash_remove(ifa1);
 
 	/* 3. Announce address deletion */
 
@@ -296,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
+		struct in_ifaddr *next_sec = promote->ifa_next;
 
 		if (prev_prom) {
 			prev_prom->ifa_next = promote->ifa_next;
@@ -307,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
-		for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
+		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
 			if (ifa1->ifa_mask != ifa->ifa_mask ||
 			    !inet_ifa_match(ifa1->ifa_address, ifa))
 				continue;
@@ -368,6 +464,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	ifa->ifa_next = *ifap;
 	*ifap = ifa;
 
+	inet_hash_insert(dev_net(in_dev->dev), ifa);
+
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
@@ -521,6 +619,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
 	if (tb[IFA_ADDRESS] == NULL)
 		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
 
+	INIT_HLIST_NODE(&ifa->hash);
 	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
 	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
 	ifa->ifa_flags = ifm->ifa_flags;
@@ -670,7 +769,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		     ifap = &ifa->ifa_next) {
 			if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
 			    sin_orig.sin_addr.s_addr ==
-							ifa->ifa_address) {
+							ifa->ifa_local) {
 				break; /* found */
 			}
 		}
@@ -728,6 +827,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (!ifa) {
 			ret = -ENOBUFS;
 			ifa = inet_alloc_ifa();
+			INIT_HLIST_NODE(&ifa->hash);
 			if (!ifa)
 				break;
 			if (colon)
@@ -1030,6 +1130,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu)
 	return mtu >= 68;
 }
 
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+					struct in_device *in_dev)
+
+{
+	struct in_ifaddr *ifa = in_dev->ifa_list;
+
+	if (!ifa)
+		return;
+
+	arp_send(ARPOP_REQUEST, ETH_P_ARP,
+		 ifa->ifa_local, dev,
+		 ifa->ifa_local, NULL,
+		 dev->dev_addr, NULL);
+}
+
 /* Called only under RTNL semaphore */
 
 static int inetdev_event(struct notifier_block *this, unsigned long event,
@@ -1069,6 +1184,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		struct in_ifaddr *ifa = inet_alloc_ifa();
 
 		if (ifa) {
+			INIT_HLIST_NODE(&ifa->hash);
 			ifa->ifa_local =
 				ifa->ifa_address = htonl(INADDR_LOOPBACK);
 			ifa->ifa_prefixlen = 8;
@@ -1082,18 +1198,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		}
 		ip_mc_up(in_dev);
 		/* fall through */
-	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_CHANGEADDR:
+		if (!IN_DEV_ARP_NOTIFY(in_dev))
+			break;
+		/* fall through */
+	case NETDEV_NOTIFY_PEERS:
 		/* Send gratuitous ARP to notify of link change */
-		if (IN_DEV_ARP_NOTIFY(in_dev)) {
-			struct in_ifaddr *ifa = in_dev->ifa_list;
-
-			if (ifa)
-				arp_send(ARPOP_REQUEST, ETH_P_ARP,
-					 ifa->ifa_address, dev,
-					 ifa->ifa_address, NULL,
-					 dev->dev_addr, NULL);
-		}
+		inetdev_send_gratuitous_arp(dev, in_dev);
 		break;
 	case NETDEV_DOWN:
 		ip_mc_down(in_dev);
@@ -1710,6 +1821,11 @@ static struct rtnl_af_ops inet_af_ops = {
 
 void __init devinet_init(void)
 {
+	int i;
+
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
 	register_pernet_subsys(&devinet_ops);
 
 	register_gifconf(PF_INET, inet_gifconf);
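The per-namespace address hash introduced above puts a simple byte-fold on the fast path of __ip_dev_find(), with the FIB local table only as a fallback. A standalone sketch of the bucket function's shape; SEED stands in for the kernel's hash_ptr(net, 8) mix-in, and the addresses and constant are illustrative.

#include <stdio.h>
#include <stdint.h>

#define IN4_ADDR_HSIZE 256

/* Fold all four address bytes into one bucket index, following the
 * shape of inet_addr_hash(); SEED stands in for hash_ptr(net, 8). */
static unsigned int addr_hash(uint32_t addr)
{
	const uint32_t SEED = 0x9e;	/* illustrative constant */
	uint32_t val = addr ^ SEED;

	return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
	       (IN4_ADDR_HSIZE - 1);
}

int main(void)
{
	printf("10.0.0.1 -> bucket %u\n", addr_hash(0x0a000001));
	printf("10.0.0.2 -> bucket %u\n", addr_hash(0x0a000002));
	return 0;
}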
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index e42a905180f0..03f994bcf7de 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -33,11 +33,14 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
  *
  * TODO: Use spare space in skb for this where possible.
  */
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
 {
 	unsigned int len;
 
-	len = crypto_aead_ivsize(aead);
+	len = seqhilen;
+
+	len += crypto_aead_ivsize(aead);
+
 	if (len) {
 		len += crypto_aead_alignmask(aead) &
 		       ~(crypto_tfm_ctx_alignment() - 1);
@@ -52,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
 	return kmalloc(len, GFP_ATOMIC);
 }
 
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp)
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+	return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
 {
 	return crypto_aead_ivsize(aead) ?
-	       PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp;
+	       PTR_ALIGN((u8 *)tmp + seqhilen,
+			 crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
 }
 
 static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -122,6 +130,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int plen;
 	int tfclen;
 	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
 
 	/* skb is pure payload to encrypt */
 
@@ -151,14 +163,25 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto error;
 	nfrags = err;
 
-	tmp = esp_alloc_tmp(aead, nfrags + 1);
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
 	if (!tmp)
 		goto error;
 
-	iv = esp_tmp_iv(aead, tmp);
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
 	req = esp_tmp_givreq(aead, iv);
 	asg = esp_givreq_sg(aead, req);
-	sg = asg + 1;
+	sg = asg + sglists;
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
@@ -215,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	}
 
 	esph->spi = x->id.spi;
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
 		     clen + alen);
-	sg_init_one(asg, esph, sizeof(*esph));
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
 
 	aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
 	aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
-	aead_givcrypt_set_assoc(req, asg, sizeof(*esph));
+	aead_givcrypt_set_assoc(req, asg, assoclen);
 	aead_givcrypt_set_giv(req, esph->enc_data,
-			      XFRM_SKB_CB(skb)->seq.output);
+			      XFRM_SKB_CB(skb)->seq.output.low);
 
 	ESP_SKB_CB(skb)->tmp = tmp;
 	err = crypto_aead_givencrypt(req);
@@ -346,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct sk_buff *trailer;
 	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
 	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
 	void *tmp;
 	u8 *iv;
 	struct scatterlist *sg;
@@ -362,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out;
 	nfrags = err;
 
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
 	err = -ENOMEM;
-	tmp = esp_alloc_tmp(aead, nfrags + 1);
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
 	if (!tmp)
 		goto out;
 
 	ESP_SKB_CB(skb)->tmp = tmp;
-	iv = esp_tmp_iv(aead, tmp);
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
 	req = esp_tmp_req(aead, iv);
 	asg = esp_req_sg(aead, req);
-	sg = asg + 1;
+	sg = asg + sglists;
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -382,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
-	sg_init_one(asg, esph, sizeof(*esph));
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
 
 	aead_request_set_callback(req, 0, esp_input_done, skb);
 	aead_request_set_crypt(req, sg, sg, elen, iv);
-	aead_request_set_assoc(req, asg, sizeof(*esph));
+	aead_request_set_assoc(req, asg, assoclen);
 
 	err = crypto_aead_decrypt(req);
 	if (err == -EINPROGRESS)
@@ -500,10 +554,20 @@ static int esp_init_authenc(struct xfrm_state *x)
 		goto error;
 
 	err = -ENAMETOOLONG;
-	if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)",
-		     x->aalg ? x->aalg->alg_name : "digest_null",
-		     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
-		goto error;
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authencesn(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	} else {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authenc(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	}
 
 	aead = crypto_alloc_aead(authenc_name, 0, 0);
 	err = PTR_ERR(aead);
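With extended sequence numbers, the data the AEAD authenticates grows from the plain 8-byte ESP header (SPI plus low sequence bits) to SPI, high sequence bits, and low sequence bits in three scatterlist entries. A sketch of the length bookkeeping mirrored from the hunks above, as plain arithmetic; the variable names follow esp_output() but nothing here is kernel code.

#include <stdio.h>

int main(void)
{
	const int esph_len = 8;	/* SPI (4 bytes) + low 32 seq bits (4) */
	int esn;

	for (esn = 0; esn <= 1; esn++) {
		int assoclen = esph_len;  /* AEAD associated-data length */
		int sglists = 1;          /* scatterlist entries for it  */
		int seqhilen = 0;         /* high 32 bits of the seq no  */

		if (esn) {                /* the XFRM_STATE_ESN branch   */
			sglists += 2;     /* spi | seqhi | seq_no        */
			seqhilen += 4;    /* one extra __be32            */
			assoclen += seqhilen;
		}
		printf("esn=%d: assoclen=%d sglists=%d seqhilen=%d\n",
		       esn, assoclen, sglists, seqhilen);
	}
	return 0;
}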
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd43a878..f116ce8f1b46 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
 
-	local_table = fib_hash_table(RT_TABLE_LOCAL);
+	local_table = fib_trie_table(RT_TABLE_LOCAL);
 	if (local_table == NULL)
 		return -ENOMEM;
 
-	main_table  = fib_hash_table(RT_TABLE_MAIN);
+	main_table  = fib_trie_table(RT_TABLE_MAIN);
 	if (main_table == NULL)
 		goto fail;
 
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	if (tb)
 		return tb;
 
-	tb = fib_hash_table(id);
+	tb = fib_trie_table(id);
 	if (!tb)
 		return NULL;
 	h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
-void fib_select_default(struct net *net,
-			const struct flowi *flp, struct fib_result *res)
-{
-	struct fib_table *tb;
-	int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
-		return;
-	table = res->r->table;
-#endif
-	tb = fib_get_table(net, table);
-	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-		fib_table_select_default(tb, flp, res);
-}
-
 static void fib_flush(struct net *net)
 {
 	int flushed = 0;
@@ -147,46 +132,6 @@ static void fib_flush(struct net *net)
 	rt_cache_flush(net, -1);
 }
 
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
-	struct flowi fl = {
-		.fl4_dst = addr,
-	};
-	struct fib_result res = { 0 };
-	struct net_device *dev = NULL;
-	struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
-	rcu_read_lock();
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!local_table ||
-	    fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	if (res.type != RTN_LOCAL)
-		goto out;
-	dev = FIB_RES_DEV(res);
-
-	if (dev && devref)
-		dev_hold(dev);
-out:
-	rcu_read_unlock();
-	return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
 /*
  * Find address type as if only "dev" was present in the system. If
  * on_dev is NULL then all interfaces are taken into consideration.
@@ -195,7 +140,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 					    const struct net_device *dev,
 					    __be32 addr)
 {
-	struct flowi		fl = { .fl4_dst = addr };
+	struct flowi4		fl4 = { .daddr = addr };
 	struct fib_result	res;
 	unsigned ret = RTN_BROADCAST;
 	struct fib_table *local_table;
@@ -213,7 +158,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 	if (local_table) {
 		ret = RTN_UNICAST;
 		rcu_read_lock();
-		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
+		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
 		}
@@ -248,19 +193,21 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			u32 *itag, u32 mark)
 {
 	struct in_device *in_dev;
-	struct flowi fl = {
-		.fl4_dst = src,
-		.fl4_src = dst,
-		.fl4_tos = tos,
-		.mark = mark,
-		.iif = oif
-	};
+	struct flowi4 fl4;
 	struct fib_result res;
 	int no_addr, rpf, accept_local;
 	bool dev_match;
 	int ret;
 	struct net *net;
 
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = oif;
+	fl4.flowi4_mark = mark;
+	fl4.daddr = src;
+	fl4.saddr = dst;
+	fl4.flowi4_tos = tos;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
 	no_addr = rpf = accept_local = 0;
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
@@ -268,20 +215,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		rpf = IN_DEV_RPFILTER(in_dev);
 		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
 		if (mark && !IN_DEV_SRC_VMARK(in_dev))
-			fl.mark = 0;
+			fl4.flowi4_mark = 0;
 	}
 
 	if (in_dev == NULL)
 		goto e_inval;
 
 	net = dev_net(dev);
-	if (fib_lookup(net, &fl, &res))
+	if (fib_lookup(net, &fl4, &res))
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
 		if (res.type != RTN_LOCAL || !accept_local)
 			goto e_inval;
 	}
-	*spec_dst = FIB_RES_PREFSRC(res);
+	*spec_dst = FIB_RES_PREFSRC(net, res);
 	fib_combine_itag(itag, &res);
 	dev_match = false;
 
@@ -306,12 +253,12 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		goto last_resort;
 	if (rpf == 1)
 		goto e_rpf;
-	fl.oif = dev->ifindex;
+	fl4.flowi4_oif = dev->ifindex;
 
 	ret = 0;
-	if (fib_lookup(net, &fl, &res) == 0) {
+	if (fib_lookup(net, &fl4, &res) == 0) {
 		if (res.type == RTN_UNICAST) {
-			*spec_dst = FIB_RES_PREFSRC(res);
+			*spec_dst = FIB_RES_PREFSRC(net, res);
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 		}
 	}
@@ -775,12 +722,17 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 	}
 }
 
-static void fib_del_ifaddr(struct in_ifaddr *ifa)
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
 	struct net_device *dev = in_dev->dev;
 	struct in_ifaddr *ifa1;
-	struct in_ifaddr *prim = ifa;
+	struct in_ifaddr *prim = ifa, *prim1 = NULL;
 	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
 	__be32 any = ifa->ifa_address & ifa->ifa_mask;
 #define LOCAL_OK	1
@@ -788,17 +740,26 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 #define BRD0_OK		4
 #define BRD1_OK		8
 	unsigned ok = 0;
+	int subnet = 0;		/* Primary network */
+	int gone = 1;		/* Address is missing */
+	int same_prefsrc = 0;	/* Another primary with same IP */
 
-	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
-		fib_magic(RTM_DELROUTE,
-			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
-			  any, ifa->ifa_prefixlen, prim);
-	else {
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
 		if (prim == NULL) {
 			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
 			return;
 		}
+		if (iprim && iprim != prim) {
+			printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
+			return;
+		}
+	} else if (!ipv4_is_zeronet(any) &&
+		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+		fib_magic(RTM_DELROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  any, ifa->ifa_prefixlen, prim);
+		subnet = 1;
 	}
 
 	/* Deletion is more complicated than add.
@@ -808,6 +769,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	 */
 
 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+		if (ifa1 == ifa) {
+			/* promotion, keep the IP */
+			gone = 0;
+			continue;
+		}
+		/* Ignore IFAs from our subnet */
+		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, iprim))
+			continue;
+
+		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
+		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+			/* Another address from our subnet? */
+			if (ifa1->ifa_mask == prim->ifa_mask &&
+			    inet_ifa_match(ifa1->ifa_address, prim))
+				prim1 = prim;
+			else {
+				/* We reached the secondaries, so
+				 * same_prefsrc should be determined.
+				 */
+				if (!same_prefsrc)
+					continue;
+				/* Search new prim1 if ifa1 is not
+				 * using the current prim1
+				 */
+				if (!prim1 ||
+				    ifa1->ifa_mask != prim1->ifa_mask ||
+				    !inet_ifa_match(ifa1->ifa_address, prim1))
+					prim1 = inet_ifa_byprefix(in_dev,
+							ifa1->ifa_address,
+							ifa1->ifa_mask);
+				if (!prim1)
+					continue;
+				if (prim1->ifa_local != prim->ifa_local)
+					continue;
+			}
+		} else {
+			if (prim->ifa_local != ifa1->ifa_local)
+				continue;
+			prim1 = ifa1;
+			if (prim != prim1)
+				same_prefsrc = 1;
+		}
 		if (ifa->ifa_local == ifa1->ifa_local)
 			ok |= LOCAL_OK;
 		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
@@ -816,19 +820,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 			ok |= BRD1_OK;
 		if (any == ifa1->ifa_broadcast)
 			ok |= BRD0_OK;
+		/* primary has network specific broadcasts */
+		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+			if (!ipv4_is_zeronet(any1)) {
+				if (ifa->ifa_broadcast == brd1 ||
+				    ifa->ifa_broadcast == any1)
+					ok |= BRD_OK;
+				if (brd == brd1 || brd == any1)
+					ok |= BRD1_OK;
+				if (any == brd1 || any == any1)
+					ok |= BRD0_OK;
+			}
+		}
 	}
 
 	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
-	if (!(ok & BRD1_OK))
-		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
-	if (!(ok & BRD0_OK))
-		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+	if (subnet && ifa->ifa_prefixlen < 31) {
+		if (!(ok & BRD1_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+		if (!(ok & BRD0_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+	}
 	if (!(ok & LOCAL_OK)) {
 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
 
 		/* Check, that this local address finally disappeared. */
-		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+		if (gone &&
+		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
 			/* And the last, but not the least thing.
 			 * We must flush stray FIB entries.
 			 *
@@ -849,11 +871,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 {
 
 	struct fib_result res;
-	struct flowi fl = {
-		.mark = frn->fl_mark,
-		.fl4_dst = frn->fl_addr,
-		.fl4_tos = frn->fl_tos,
-		.fl4_scope = frn->fl_scope,
+	struct flowi4 fl4 = {
+		.flowi4_mark = frn->fl_mark,
+		.daddr = frn->fl_addr,
+		.flowi4_tos = frn->fl_tos,
+		.flowi4_scope = frn->fl_scope,
 	};
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -866,7 +888,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 
 	frn->tb_id = tb->tb_id;
 	rcu_read_lock();
-	frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
+	frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 
 	if (!frn->err) {
 		frn->prefixlen = res.prefixlen;
@@ -938,6 +960,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 {
 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
 	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
 
 	switch (event) {
 	case NETDEV_UP:
@@ -945,10 +968,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		fib_sync_up(dev);
 #endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
 		rt_cache_flush(dev_net(dev), -1);
 		break;
 	case NETDEV_DOWN:
-		fib_del_ifaddr(ifa);
+		fib_del_ifaddr(ifa, NULL);
+		atomic_inc(&net->ipv4.dev_addr_genid);
 		if (ifa->ifa_dev->ifa_list == NULL) {
 			/* Last address was deleted from this interface.
 			 * Disable IP.
@@ -966,6 +991,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 {
 	struct net_device *dev = ptr;
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
 		fib_disable_ip(dev, 2, -1);
@@ -983,6 +1009,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		fib_sync_up(dev);
 #endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
 		rt_cache_flush(dev_net(dev), -1);
 		break;
 	case NETDEV_DOWN:
@@ -1101,5 +1128,5 @@ void __init ip_fib_init(void)
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-	fib_hash_init();
+	fib_trie_init();
 }
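From here on the hash-based lookup engine is gone and fib_trie.c serves every table. As a rough illustration of longest-prefix matching, the problem the LC-trie solves, here is a plain one-bit-per-level binary trie in standalone C; it is not the path- and level-compressed structure fib_trie.c actually implements, and the routes and next-hop strings are invented.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct node {
	struct node *child[2];
	const char *nexthop;	/* non-NULL when a prefix ends here */
};

static struct node *node_new(void)
{
	return calloc(1, sizeof(struct node));
}

/* Insert prefix/plen, one address bit per level, MSB first. */
static void insert(struct node *root, uint32_t prefix, int plen,
		   const char *nexthop)
{
	struct node *n = root;
	int i;

	for (i = 0; i < plen; i++) {
		int bit = (prefix >> (31 - i)) & 1;

		if (!n->child[bit])
			n->child[bit] = node_new();
		n = n->child[bit];
	}
	n->nexthop = nexthop;
}

/* Longest-prefix match: remember the deepest route seen on the walk. */
static const char *lookup(struct node *root, uint32_t addr)
{
	const char *best = NULL;
	struct node *n = root;
	int i;

	for (i = 0; i < 32 && n; i++) {
		if (n->nexthop)
			best = n->nexthop;
		n = n->child[(addr >> (31 - i)) & 1];
	}
	if (n && n->nexthop)
		best = n->nexthop;
	return best;
}

int main(void)
{
	struct node *root = node_new();

	insert(root, 0x0a000000, 8, "via eth0");	/* 10.0.0.0/8  */
	insert(root, 0x0a010000, 16, "via eth1");	/* 10.1.0.0/16 */
	printf("10.1.2.3 -> %s\n", lookup(root, 0x0a010203));	/* eth1 */
	printf("10.9.9.9 -> %s\n", lookup(root, 0x0a090909));	/* eth0 */
	return 0;
}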
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b21..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/string.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/errno.h>
26#include <linux/in.h>
27#include <linux/inet.h>
28#include <linux/inetdevice.h>
29#include <linux/netdevice.h>
30#include <linux/if_arp.h>
31#include <linux/proc_fs.h>
32#include <linux/skbuff.h>
33#include <linux/netlink.h>
34#include <linux/init.h>
35#include <linux/slab.h>
36
37#include <net/net_namespace.h>
38#include <net/ip.h>
39#include <net/protocol.h>
40#include <net/route.h>
41#include <net/tcp.h>
42#include <net/sock.h>
43#include <net/ip_fib.h>
44
45#include "fib_lookup.h"
46
47static struct kmem_cache *fn_hash_kmem __read_mostly;
48static struct kmem_cache *fn_alias_kmem __read_mostly;
49
50struct fib_node {
51 struct hlist_node fn_hash;
52 struct list_head fn_alias;
53 __be32 fn_key;
54 struct fib_alias fn_embedded_alias;
55};
56
57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
58
59struct fn_zone {
60 struct fn_zone __rcu *fz_next; /* Next not empty zone */
61 struct hlist_head __rcu *fz_hash; /* Hash table pointer */
62 seqlock_t fz_lock;
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64
65 u8 fz_order; /* Zone order (0..32) */
66 u8 fz_revorder; /* 32 - fz_order */
67 __be32 fz_mask; /* inet_make_mask(order) */
68#define FZ_MASK(fz) ((fz)->fz_mask)
69
70 struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
71
72 int fz_nent; /* Number of entries */
73 int fz_divisor; /* Hash size (mask+1) */
74};
75
76struct fn_hash {
77 struct fn_zone *fn_zones[33];
78 struct fn_zone __rcu *fn_zone_list;
79};
80
81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
82{
83 u32 h = ntohl(key) >> fz->fz_revorder;
84 h ^= (h>>20);
85 h ^= (h>>10);
86 h ^= (h>>5);
87 h &= fz->fz_hashmask;
88 return h;
89}
90
91static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
92{
93 return dst & FZ_MASK(fz);
94}
95
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE)
105 return kzalloc(size, GFP_KERNEL);
106
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109}
110
111/* The fib hash lock must be held when this is called. */
112static inline void fn_rebuild_zone(struct fn_zone *fz,
113 struct hlist_head *old_ht,
114 int old_divisor)
115{
116 int i;
117
118 for (i = 0; i < old_divisor; i++) {
119 struct hlist_node *node, *n;
120 struct fib_node *f;
121
122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
123 struct hlist_head *new_head;
124
125 hlist_del_rcu(&f->fn_hash);
126
127 new_head = rcu_dereference_protected(fz->fz_hash, 1) +
128 fn_hash(f->fn_key, fz);
129 hlist_add_head_rcu(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 new_divisor = old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case EMBEDDED_HASH_SIZE:
154 new_divisor *= EMBEDDED_HASH_SIZE;
155 break;
156 case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
157 new_divisor *= (EMBEDDED_HASH_SIZE/2);
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
161 printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
172 fz->fz_order, old_divisor);
173#endif
174
175 ht = fz_hash_alloc(new_divisor);
176
177 if (ht) {
178 struct fn_zone nfz;
179
180 memcpy(&nfz, fz, sizeof(nfz));
181
182 write_seqlock_bh(&fz->fz_lock);
183 old_ht = rcu_dereference_protected(fz->fz_hash, 1);
184 RCU_INIT_POINTER(nfz.fz_hash, ht);
185 nfz.fz_hashmask = new_hashmask;
186 nfz.fz_divisor = new_divisor;
187 fn_rebuild_zone(&nfz, old_ht, old_divisor);
188 fib_hash_genid++;
189 rcu_assign_pointer(fz->fz_hash, ht);
190 fz->fz_hashmask = new_hashmask;
191 fz->fz_divisor = new_divisor;
192 write_sequnlock_bh(&fz->fz_lock);
193
194 if (old_ht != fz->fz_embedded_hash) {
195 synchronize_rcu();
196 fz_hash_free(old_ht, old_divisor);
197 }
198 }
199}
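
Note the growth schedule in the switch above: a zone starts at EMBEDDED_HASH_SIZE buckets, jumps to EMBEDDED_HASH_SIZE squared, then to half of EMBEDDED_HASH_SIZE cubed, and doubles from there; the rebuilt table is published with rcu_assign_pointer() inside the zone seqlock so lockless readers can detect the swap. The schedule itself, assuming EMBEDDED_HASH_SIZE == 16 (a 64-byte cache line of 8-byte hlist_heads):

/* Divisor growth schedule from fn_rehash_zone(), assuming
 * EMBEDDED_HASH_SIZE == 16. */
#include <stdio.h>

#define EMBEDDED_HASH_SIZE 16

int main(void)
{
	int divisor = EMBEDDED_HASH_SIZE;

	for (int step = 0; step < 6; step++) {
		printf("step %d: %d buckets\n", step, divisor);
		if (divisor == EMBEDDED_HASH_SIZE)
			divisor *= EMBEDDED_HASH_SIZE;		/* 16 -> 256 */
		else if (divisor == EMBEDDED_HASH_SIZE * EMBEDDED_HASH_SIZE)
			divisor *= EMBEDDED_HASH_SIZE / 2;	/* 256 -> 2048 */
		else
			divisor <<= 1;				/* 2048 -> 4096 ... */
	}
	return 0;
}
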
200
201static void fn_free_node_rcu(struct rcu_head *head)
202{
203 struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
204
205 kmem_cache_free(fn_hash_kmem, f);
206}
207
208static inline void fn_free_node(struct fib_node *f)
209{
210 call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
211}
212
213static void fn_free_alias_rcu(struct rcu_head *head)
214{
215 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
216
217 kmem_cache_free(fn_alias_kmem, fa);
218}
219
220static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
221{
222 fib_release_info(fa->fa_info);
223 if (fa == &f->fn_embedded_alias)
224 fa->fa_info = NULL;
225 else
226 call_rcu(&fa->rcu, fn_free_alias_rcu);
227}
228
229static struct fn_zone *
230fn_new_zone(struct fn_hash *table, int z)
231{
232 int i;
233 struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
234 if (!fz)
235 return NULL;
236
237 seqlock_init(&fz->fz_lock);
238 fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
239 fz->fz_hashmask = fz->fz_divisor - 1;
240 RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
241 fz->fz_order = z;
242 fz->fz_revorder = 32 - z;
243 fz->fz_mask = inet_make_mask(z);
244
245 /* Find the first not empty zone with more specific mask */
246 for (i = z + 1; i <= 32; i++)
247 if (table->fn_zones[i])
248 break;
249 if (i > 32) {
250 /* No more specific masks, we are the first. */
251 rcu_assign_pointer(fz->fz_next,
252 rtnl_dereference(table->fn_zone_list));
253 rcu_assign_pointer(table->fn_zone_list, fz);
254 } else {
255 rcu_assign_pointer(fz->fz_next,
256 rtnl_dereference(table->fn_zones[i]->fz_next));
257 rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
258 }
259 table->fn_zones[z] = fz;
260 fib_hash_genid++;
261 return fz;
262}
263
264int fib_table_lookup(struct fib_table *tb,
265 const struct flowi *flp, struct fib_result *res,
266 int fib_flags)
267{
268 int err;
269 struct fn_zone *fz;
270 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
271
272 rcu_read_lock();
273 for (fz = rcu_dereference(t->fn_zone_list);
274 fz != NULL;
275 fz = rcu_dereference(fz->fz_next)) {
276 struct hlist_head *head;
277 struct hlist_node *node;
278 struct fib_node *f;
279 __be32 k;
280 unsigned int seq;
281
282 do {
283 seq = read_seqbegin(&fz->fz_lock);
284 k = fz_key(flp->fl4_dst, fz);
285
286 head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
287 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
288 if (f->fn_key != k)
289 continue;
290
291 err = fib_semantic_match(&f->fn_alias,
292 flp, res,
293 fz->fz_order, fib_flags);
294 if (err <= 0)
295 goto out;
296 }
297 } while (read_seqretry(&fz->fz_lock, seq));
298 }
299 err = 1;
300out:
301 rcu_read_unlock();
302 return err;
303}
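
fib_table_lookup() above is a longest-prefix match by construction: fn_zone_list is ordered from the most specific non-empty mask down to /0, so the first zone whose masked key matches wins, and each probe retries if the zone's seqlock reports a concurrent rehash. A single-threaded sketch of the zone walk, with two hard-coded zones and no RCU, seqlock, or alias lists:

/* Userspace sketch of the zone-ordered longest-prefix match. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct zone { unsigned int order; uint32_t key; const char *route; };

/* Ordered most specific first, as fn_zone_list is. */
static const struct zone zones[] = {
	{ 24, 0xc0000200, "192.0.2.0/24 dev eth0" },	/* 192.0.2.0 */
	{  0, 0x00000000, "default via 198.51.100.1" },
};

static const char *lookup(uint32_t dst)
{
	for (size_t i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
		uint32_t mask = zones[i].order ?
				~0u << (32 - zones[i].order) : 0;

		if ((dst & mask) == zones[i].key)	/* fz_key() match */
			return zones[i].route;		/* first zone hit wins */
	}
	return "unreachable";
}

int main(void)
{
	printf("%s\n", lookup(ntohl(inet_addr("192.0.2.9"))));
	printf("%s\n", lookup(ntohl(inet_addr("203.0.113.5"))));
	return 0;
}
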
304
305void fib_table_select_default(struct fib_table *tb,
306 const struct flowi *flp, struct fib_result *res)
307{
308 int order, last_idx;
309 struct hlist_node *node;
310 struct fib_node *f;
311 struct fib_info *fi = NULL;
312 struct fib_info *last_resort;
313 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
314 struct fn_zone *fz = t->fn_zones[0];
315 struct hlist_head *head;
316
317 if (fz == NULL)
318 return;
319
320 last_idx = -1;
321 last_resort = NULL;
322 order = -1;
323
324 rcu_read_lock();
325 head = rcu_dereference(fz->fz_hash);
326 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
327 struct fib_alias *fa;
328
329 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
330 struct fib_info *next_fi = fa->fa_info;
331
332 if (fa->fa_scope != res->scope ||
333 fa->fa_type != RTN_UNICAST)
334 continue;
335
336 if (next_fi->fib_priority > res->fi->fib_priority)
337 break;
338 if (!next_fi->fib_nh[0].nh_gw ||
339 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
340 continue;
341
342 fib_alias_accessed(fa);
343
344 if (fi == NULL) {
345 if (next_fi != res->fi)
346 break;
347 } else if (!fib_detect_death(fi, order, &last_resort,
348 &last_idx, tb->tb_default)) {
349 fib_result_assign(res, fi);
350 tb->tb_default = order;
351 goto out;
352 }
353 fi = next_fi;
354 order++;
355 }
356 }
357
358 if (order <= 0 || fi == NULL) {
359 tb->tb_default = -1;
360 goto out;
361 }
362
363 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
364 tb->tb_default)) {
365 fib_result_assign(res, fi);
366 tb->tb_default = order;
367 goto out;
368 }
369
370 if (last_idx >= 0)
371 fib_result_assign(res, last_resort);
372 tb->tb_default = last_idx;
373out:
374 rcu_read_unlock();
375}
376
377/* Insert node F to FZ. */
378static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
379{
380 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
381
382 hlist_add_head_rcu(&f->fn_hash, head);
383}
384
385/* Return the node in FZ matching KEY. */
386static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
387{
388 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
389 struct hlist_node *node;
390 struct fib_node *f;
391
392 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
393 if (f->fn_key == key)
394 return f;
395 }
396
397 return NULL;
398}
399
400
401static struct fib_alias *fib_fast_alloc(struct fib_node *f)
402{
403 struct fib_alias *fa = &f->fn_embedded_alias;
404
405 if (fa->fa_info != NULL)
406 fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
407 return fa;
408}
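
fib_fast_alloc() is an embedded-first allocator: every fib_node carries one inline fib_alias (fn_embedded_alias), so the common one-route-per-prefix case costs no extra allocation and only additional aliases fall back to the slab. The idiom in miniature, with hypothetical types and the heap standing in for the slab:

/* Sketch of the embedded-first allocation idiom in fib_fast_alloc(). */
#include <stdio.h>
#include <stdlib.h>

struct alias { int used; };
struct node { struct alias embedded; };

static struct alias *fast_alloc(struct node *n)
{
	if (!n->embedded.used) {		/* inline slot free: no allocation */
		n->embedded.used = 1;
		return &n->embedded;
	}
	return calloc(1, sizeof(struct alias));	/* extra aliases spill out */
}

int main(void)
{
	struct node n = { { 0 } };
	struct alias *a = fast_alloc(&n);
	struct alias *b = fast_alloc(&n);

	printf("a embedded: %d, b embedded: %d\n",
	       a == &n.embedded, b == &n.embedded);
	free(b);
	return 0;
}
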
409
410/* Caller must hold RTNL. */
411int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
412{
413 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
414 struct fib_node *new_f = NULL;
415 struct fib_node *f;
416 struct fib_alias *fa, *new_fa;
417 struct fn_zone *fz;
418 struct fib_info *fi;
419 u8 tos = cfg->fc_tos;
420 __be32 key;
421 int err;
422
423 if (cfg->fc_dst_len > 32)
424 return -EINVAL;
425
426 fz = table->fn_zones[cfg->fc_dst_len];
427 if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
428 return -ENOBUFS;
429
430 key = 0;
431 if (cfg->fc_dst) {
432 if (cfg->fc_dst & ~FZ_MASK(fz))
433 return -EINVAL;
434 key = fz_key(cfg->fc_dst, fz);
435 }
436
437 fi = fib_create_info(cfg);
438 if (IS_ERR(fi))
439 return PTR_ERR(fi);
440
441 if (fz->fz_nent > (fz->fz_divisor<<1) &&
442 fz->fz_divisor < FZ_MAX_DIVISOR &&
443 (cfg->fc_dst_len == 32 ||
444 (1 << cfg->fc_dst_len) > fz->fz_divisor))
445 fn_rehash_zone(fz);
446
447 f = fib_find_node(fz, key);
448
449 if (!f)
450 fa = NULL;
451 else
452 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
453
454 /* Now fa, if non-NULL, points to the first fib alias
455 * with the same keys [prefix,tos,priority], if such key already
456 * exists or to the node before which we will insert new one.
457 *
458 * If fa is NULL, we will need to allocate a new one and
459 * insert to the head of f.
460 *
461 * If f is NULL, no fib node matched the destination key
462 * and we need to allocate a new one of those as well.
463 */
464
465 if (fa && fa->fa_tos == tos &&
466 fa->fa_info->fib_priority == fi->fib_priority) {
467 struct fib_alias *fa_first, *fa_match;
468
469 err = -EEXIST;
470 if (cfg->fc_nlflags & NLM_F_EXCL)
471 goto out;
472
473 /* We have 2 goals:
474 * 1. Find exact match for type, scope, fib_info to avoid
475 * duplicate routes
476 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
477 */
478 fa_match = NULL;
479 fa_first = fa;
480 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
481 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
482 if (fa->fa_tos != tos)
483 break;
484 if (fa->fa_info->fib_priority != fi->fib_priority)
485 break;
486 if (fa->fa_type == cfg->fc_type &&
487 fa->fa_scope == cfg->fc_scope &&
488 fa->fa_info == fi) {
489 fa_match = fa;
490 break;
491 }
492 }
493
494 if (cfg->fc_nlflags & NLM_F_REPLACE) {
495 u8 state;
496
497 fa = fa_first;
498 if (fa_match) {
499 if (fa == fa_match)
500 err = 0;
501 goto out;
502 }
503 err = -ENOBUFS;
504 new_fa = fib_fast_alloc(f);
505 if (new_fa == NULL)
506 goto out;
507
508 new_fa->fa_tos = fa->fa_tos;
509 new_fa->fa_info = fi;
510 new_fa->fa_type = cfg->fc_type;
511 new_fa->fa_scope = cfg->fc_scope;
512 state = fa->fa_state;
513 new_fa->fa_state = state & ~FA_S_ACCESSED;
514 fib_hash_genid++;
515 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
516
517 fn_free_alias(fa, f);
518 if (state & FA_S_ACCESSED)
519 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
520 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
521 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
522 return 0;
523 }
524
525 /* Error if we find a perfect match which
526 * uses the same scope, type, and nexthop
527 * information.
528 */
529 if (fa_match)
530 goto out;
531
532 if (!(cfg->fc_nlflags & NLM_F_APPEND))
533 fa = fa_first;
534 }
535
536 err = -ENOENT;
537 if (!(cfg->fc_nlflags & NLM_F_CREATE))
538 goto out;
539
540 err = -ENOBUFS;
541
542 if (!f) {
543 new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
544 if (new_f == NULL)
545 goto out;
546
547 INIT_HLIST_NODE(&new_f->fn_hash);
548 INIT_LIST_HEAD(&new_f->fn_alias);
549 new_f->fn_key = key;
550 f = new_f;
551 }
552
553 new_fa = fib_fast_alloc(f);
554 if (new_fa == NULL)
555 goto out;
556
557 new_fa->fa_info = fi;
558 new_fa->fa_tos = tos;
559 new_fa->fa_type = cfg->fc_type;
560 new_fa->fa_scope = cfg->fc_scope;
561 new_fa->fa_state = 0;
562
563 /*
564 * Insert new entry to the list.
565 */
566
567 if (new_f)
568 fib_insert_node(fz, new_f);
569 list_add_tail_rcu(&new_fa->fa_list,
570 (fa ? &fa->fa_list : &f->fn_alias));
571 fib_hash_genid++;
572
573 if (new_f)
574 fz->fz_nent++;
575 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
576
577 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
578 &cfg->fc_nlinfo, 0);
579 return 0;
580
581out:
582 if (new_f)
583 kmem_cache_free(fn_hash_kmem, new_f);
584 fib_release_info(fi);
585 return err;
586}
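
The netlink flag handling in fib_table_insert() is dense; condensed, the rules are: NLM_F_EXCL rejects any existing key, NLM_F_REPLACE swaps the first matching alias in place, an exact type/scope/nexthop duplicate is -EEXIST even without NLM_F_EXCL, and a new alias may only be added under NLM_F_CREATE (with NLM_F_APPEND choosing the insertion point among equal keys). A decision sketch using the real NLM_F_* values; the helper and its arguments are hypothetical:

/* Condensed sketch of the NLM_F_* decisions in fib_table_insert(). */
#include <stdio.h>
#include <errno.h>

#define NLM_F_REPLACE 0x100
#define NLM_F_EXCL    0x200
#define NLM_F_CREATE  0x400
#define NLM_F_APPEND  0x800

static int decide(int flags, int key_exists, int perfect_match)
{
	if (key_exists) {
		if (flags & NLM_F_EXCL)
			return -EEXIST;
		if (flags & NLM_F_REPLACE)
			return 0;		/* replace first alias in place */
		if (perfect_match)
			return -EEXIST;		/* same type/scope/nexthop */
	}
	if (!(flags & NLM_F_CREATE))
		return -ENOENT;			/* nothing to update, may not create */
	return 1;				/* insert a new alias */
}

int main(void)
{
	printf("%d %d %d\n",
	       decide(NLM_F_CREATE | NLM_F_EXCL, 1, 0),	/* -EEXIST */
	       decide(NLM_F_REPLACE, 1, 0),		/* 0 */
	       decide(NLM_F_CREATE, 0, 0));		/* 1 */
	return 0;
}
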
587
588int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
589{
590 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
591 struct fib_node *f;
592 struct fib_alias *fa, *fa_to_delete;
593 struct fn_zone *fz;
594 __be32 key;
595
596 if (cfg->fc_dst_len > 32)
597 return -EINVAL;
598
599 if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
600 return -ESRCH;
601
602 key = 0;
603 if (cfg->fc_dst) {
604 if (cfg->fc_dst & ~FZ_MASK(fz))
605 return -EINVAL;
606 key = fz_key(cfg->fc_dst, fz);
607 }
608
609 f = fib_find_node(fz, key);
610
611 if (!f)
612 fa = NULL;
613 else
614 fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
615 if (!fa)
616 return -ESRCH;
617
618 fa_to_delete = NULL;
619 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
620 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
621 struct fib_info *fi = fa->fa_info;
622
623 if (fa->fa_tos != cfg->fc_tos)
624 break;
625
626 if ((!cfg->fc_type ||
627 fa->fa_type == cfg->fc_type) &&
628 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
629 fa->fa_scope == cfg->fc_scope) &&
630 (!cfg->fc_protocol ||
631 fi->fib_protocol == cfg->fc_protocol) &&
632 fib_nh_match(cfg, fi) == 0) {
633 fa_to_delete = fa;
634 break;
635 }
636 }
637
638 if (fa_to_delete) {
639 int kill_fn;
640
641 fa = fa_to_delete;
642 rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
643 tb->tb_id, &cfg->fc_nlinfo, 0);
644
645 kill_fn = 0;
646 list_del_rcu(&fa->fa_list);
647 if (list_empty(&f->fn_alias)) {
648 hlist_del_rcu(&f->fn_hash);
649 kill_fn = 1;
650 }
651 fib_hash_genid++;
652
653 if (fa->fa_state & FA_S_ACCESSED)
654 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
655 fn_free_alias(fa, f);
656 if (kill_fn) {
657 fn_free_node(f);
658 fz->fz_nent--;
659 }
660
661 return 0;
662 }
663 return -ESRCH;
664}
665
666static int fn_flush_list(struct fn_zone *fz, int idx)
667{
668 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
669 struct hlist_node *node, *n;
670 struct fib_node *f;
671 int found = 0;
672
673 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
674 struct fib_alias *fa, *fa_node;
675 int kill_f;
676
677 kill_f = 0;
678 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
679 struct fib_info *fi = fa->fa_info;
680
681 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
682 list_del_rcu(&fa->fa_list);
683 if (list_empty(&f->fn_alias)) {
684 hlist_del_rcu(&f->fn_hash);
685 kill_f = 1;
686 }
687 fib_hash_genid++;
688
689 fn_free_alias(fa, f);
690 found++;
691 }
692 }
693 if (kill_f) {
694 fn_free_node(f);
695 fz->fz_nent--;
696 }
697 }
698 return found;
699}
700
701/* caller must hold RTNL. */
702int fib_table_flush(struct fib_table *tb)
703{
704 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
705 struct fn_zone *fz;
706 int found = 0;
707
708 for (fz = rtnl_dereference(table->fn_zone_list);
709 fz != NULL;
710 fz = rtnl_dereference(fz->fz_next)) {
711 int i;
712
713 for (i = fz->fz_divisor - 1; i >= 0; i--)
714 found += fn_flush_list(fz, i);
715 }
716 return found;
717}
718
719void fib_free_table(struct fib_table *tb)
720{
721 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
722 struct fn_zone *fz, *next;
723
724 next = table->fn_zone_list;
725 while (next != NULL) {
726 fz = next;
727 next = fz->fz_next;
728
729 if (fz->fz_hash != fz->fz_embedded_hash)
730 fz_hash_free(fz->fz_hash, fz->fz_divisor);
731
732 kfree(fz);
733 }
734
735 kfree(tb);
736}
737
738static inline int
739fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
740 struct fib_table *tb,
741 struct fn_zone *fz,
742 struct hlist_head *head)
743{
744 struct hlist_node *node;
745 struct fib_node *f;
746 int i, s_i;
747
748 s_i = cb->args[4];
749 i = 0;
750 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
751 struct fib_alias *fa;
752
753 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
754 if (i < s_i)
755 goto next;
756
757 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
758 cb->nlh->nlmsg_seq,
759 RTM_NEWROUTE,
760 tb->tb_id,
761 fa->fa_type,
762 fa->fa_scope,
763 f->fn_key,
764 fz->fz_order,
765 fa->fa_tos,
766 fa->fa_info,
767 NLM_F_MULTI) < 0) {
768 cb->args[4] = i;
769 return -1;
770 }
771next:
772 i++;
773 }
774 }
775 cb->args[4] = i;
776 return skb->len;
777}
778
779static inline int
780fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
781 struct fib_table *tb,
782 struct fn_zone *fz)
783{
784 int h, s_h;
785 struct hlist_head *head = rcu_dereference(fz->fz_hash);
786
787 if (head == NULL)
788 return skb->len;
789 s_h = cb->args[3];
790 for (h = s_h; h < fz->fz_divisor; h++) {
791 if (hlist_empty(head + h))
792 continue;
793 if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
794 cb->args[3] = h;
795 return -1;
796 }
797 memset(&cb->args[4], 0,
798 sizeof(cb->args) - 4*sizeof(cb->args[0]));
799 }
800 cb->args[3] = h;
801 return skb->len;
802}
803
804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
805 struct netlink_callback *cb)
806{
807 int m = 0, s_m;
808 struct fn_zone *fz;
809 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
810
811 s_m = cb->args[2];
812 rcu_read_lock();
813 for (fz = rcu_dereference(table->fn_zone_list);
814 fz != NULL;
815 fz = rcu_dereference(fz->fz_next), m++) {
816 if (m < s_m)
817 continue;
818 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
819 cb->args[2] = m;
820 rcu_read_unlock();
821 return -1;
822 }
823 memset(&cb->args[3], 0,
824 sizeof(cb->args) - 3*sizeof(cb->args[0]));
825 }
826 rcu_read_unlock();
827 cb->args[2] = m;
828 return skb->len;
829}
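
The dump path is resumable: cb->args[2], [3] and [4] record the zone, bucket and entry reached when the skb filled, the function returns early, and the next netlink callback restarts from that cursor, zeroing the deeper indexes whenever an outer level advances (the memset calls above). The pattern in miniature, with a fixed budget standing in for a full sk_buff:

/* Sketch of the resumable three-level dump cursor (cb->args[2..4]). */
#include <stdio.h>

#define ZONES 2
#define BUCKETS 3
#define ENTRIES 2

static long args[5];	/* args[2]=zone, args[3]=bucket, args[4]=entry */

static int dump_batch(int budget)
{
	for (long z = args[2]; z < ZONES; z++) {
		for (long b = args[3]; b < BUCKETS; b++) {
			for (long e = args[4]; e < ENTRIES; e++) {
				if (budget-- == 0) {	/* "skb full": save cursor */
					args[2] = z; args[3] = b; args[4] = e;
					return -1;
				}
				printf("zone %ld bucket %ld entry %ld\n", z, b, e);
			}
			args[4] = 0;	/* inner cursor restarts per bucket */
		}
		args[3] = 0;		/* and per zone */
	}
	return 0;
}

int main(void)
{
	while (dump_batch(4) < 0)
		printf("-- resume --\n");
	return 0;
}
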
830
831void __init fib_hash_init(void)
832{
833 fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
834 0, SLAB_PANIC, NULL);
835
836 fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
837 0, SLAB_PANIC, NULL);
838
839}
840
841struct fib_table *fib_hash_table(u32 id)
842{
843 struct fib_table *tb;
844
845 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
846 GFP_KERNEL);
847 if (tb == NULL)
848 return NULL;
849
850 tb->tb_id = id;
851 tb->tb_default = -1;
852
853 memset(tb->tb_data, 0, sizeof(struct fn_hash));
854 return tb;
855}
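
fib_hash_table() allocates the generic fib_table and the engine-private struct fn_hash in one block: tb_data is trailing storage that the FIB core carries around but never interprets. A sketch of the idiom with hypothetical types:

/* Sketch of the trailing tb_data idiom in fib_hash_table(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct table {
	unsigned int id;
	int def;
	unsigned char data[];		/* engine-private area (tb_data) */
};

struct engine { int zones[33]; };	/* one slot per prefix length 0..32 */

int main(void)
{
	struct table *tb = malloc(sizeof(*tb) + sizeof(struct engine));

	if (!tb)
		return 1;
	tb->id = 254;			/* RT_TABLE_MAIN */
	tb->def = -1;
	memset(tb->data, 0, sizeof(struct engine));

	struct engine *e = (struct engine *)tb->data;

	printf("table %u, engine at %p, %zu zone slots\n",
	       tb->id, (void *)e, sizeof(e->zones) / sizeof(e->zones[0]));
	free(tb);
	return 0;
}
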
856
857/* ------------------------------------------------------------------------ */
858#ifdef CONFIG_PROC_FS
859
860struct fib_iter_state {
861 struct seq_net_private p;
862 struct fn_zone *zone;
863 int bucket;
864 struct hlist_head *hash_head;
865 struct fib_node *fn;
866 struct fib_alias *fa;
867 loff_t pos;
868 unsigned int genid;
869 int valid;
870};
871
872static struct fib_alias *fib_get_first(struct seq_file *seq)
873{
874 struct fib_iter_state *iter = seq->private;
875 struct fib_table *main_table;
876 struct fn_hash *table;
877
878 main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
879 table = (struct fn_hash *)main_table->tb_data;
880
881 iter->bucket = 0;
882 iter->hash_head = NULL;
883 iter->fn = NULL;
884 iter->fa = NULL;
885 iter->pos = 0;
886 iter->genid = fib_hash_genid;
887 iter->valid = 1;
888
889 for (iter->zone = rcu_dereference(table->fn_zone_list);
890 iter->zone != NULL;
891 iter->zone = rcu_dereference(iter->zone->fz_next)) {
892 int maxslot;
893
894 if (!iter->zone->fz_nent)
895 continue;
896
897 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
898 maxslot = iter->zone->fz_divisor;
899
900 for (iter->bucket = 0; iter->bucket < maxslot;
901 ++iter->bucket, ++iter->hash_head) {
902 struct hlist_node *node;
903 struct fib_node *fn;
904
905 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
906 struct fib_alias *fa;
907
908 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
909 iter->fn = fn;
910 iter->fa = fa;
911 goto out;
912 }
913 }
914 }
915 }
916out:
917 return iter->fa;
918}
919
920static struct fib_alias *fib_get_next(struct seq_file *seq)
921{
922 struct fib_iter_state *iter = seq->private;
923 struct fib_node *fn;
924 struct fib_alias *fa;
925
926 /* Advance FA, if any. */
927 fn = iter->fn;
928 fa = iter->fa;
929 if (fa) {
930 BUG_ON(!fn);
931 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
932 iter->fa = fa;
933 goto out;
934 }
935 }
936
937 fa = iter->fa = NULL;
938
939 /* Advance FN. */
940 if (fn) {
941 struct hlist_node *node = &fn->fn_hash;
942 hlist_for_each_entry_continue(fn, node, fn_hash) {
943 iter->fn = fn;
944
945 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
946 iter->fa = fa;
947 goto out;
948 }
949 }
950 }
951
952 fn = iter->fn = NULL;
953
954 /* Advance hash chain. */
955 if (!iter->zone)
956 goto out;
957
958 for (;;) {
959 struct hlist_node *node;
960 int maxslot;
961
962 maxslot = iter->zone->fz_divisor;
963
964 while (++iter->bucket < maxslot) {
965 iter->hash_head++;
966
967 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
968 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
969 iter->fn = fn;
970 iter->fa = fa;
971 goto out;
972 }
973 }
974 }
975
976 iter->zone = rcu_dereference(iter->zone->fz_next);
977
978 if (!iter->zone)
979 goto out;
980
981 iter->bucket = 0;
982 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
983
984 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
985 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
986 iter->fn = fn;
987 iter->fa = fa;
988 goto out;
989 }
990 }
991 }
992out:
993 iter->pos++;
994 return fa;
995}
996
997static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
998{
999 struct fib_iter_state *iter = seq->private;
1000 struct fib_alias *fa;
1001
1002 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
1003 fa = iter->fa;
1004 pos -= iter->pos;
1005 } else
1006 fa = fib_get_first(seq);
1007
1008 if (fa)
1009 while (pos && (fa = fib_get_next(seq)))
1010 --pos;
1011 return pos ? NULL : fa;
1012}
1013
1014static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
1015 __acquires(RCU)
1016{
1017 void *v = NULL;
1018
1019 rcu_read_lock();
1020 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
1021 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1022 return v;
1023}
1024
1025static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1026{
1027 ++*pos;
1028 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
1029}
1030
1031static void fib_seq_stop(struct seq_file *seq, void *v)
1032 __releases(RCU)
1033{
1034 rcu_read_unlock();
1035}
1036
1037static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
1038{
1039 static const unsigned type2flags[RTN_MAX + 1] = {
1040 [7] = RTF_REJECT,
1041 [8] = RTF_REJECT,
1042 };
1043 unsigned flags = type2flags[type];
1044
1045 if (fi && fi->fib_nh->nh_gw)
1046 flags |= RTF_GATEWAY;
1047 if (mask == htonl(0xFFFFFFFF))
1048 flags |= RTF_HOST;
1049 flags |= RTF_UP;
1050 return flags;
1051}
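
The bare [7] and [8] designators in type2flags are RTN_UNREACHABLE and RTN_PROHIBIT from the rtnetlink route-type enum; both render as RTF_REJECT in /proc/net/route. Spelled out:

/* The anonymous [7]/[8] indexes above, named. */
#include <stdio.h>

enum { RTN_UNSPEC, RTN_UNICAST, RTN_LOCAL, RTN_BROADCAST, RTN_ANYCAST,
       RTN_MULTICAST, RTN_BLACKHOLE, RTN_UNREACHABLE, RTN_PROHIBIT };

int main(void)
{
	printf("RTN_UNREACHABLE=%d RTN_PROHIBIT=%d\n",
	       RTN_UNREACHABLE, RTN_PROHIBIT);
	return 0;
}
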
1052
1053/*
1054 * This outputs /proc/net/route.
1055 *
1056 * It always works in backward compatibility mode.
1057 * The format of the file is not supposed to be changed.
1058 */
1059static int fib_seq_show(struct seq_file *seq, void *v)
1060{
1061 struct fib_iter_state *iter;
1062 int len;
1063 __be32 prefix, mask;
1064 unsigned flags;
1065 struct fib_node *f;
1066 struct fib_alias *fa;
1067 struct fib_info *fi;
1068
1069 if (v == SEQ_START_TOKEN) {
1070 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1071 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1072 "\tWindow\tIRTT");
1073 goto out;
1074 }
1075
1076 iter = seq->private;
1077 f = iter->fn;
1078 fa = iter->fa;
1079 fi = fa->fa_info;
1080 prefix = f->fn_key;
1081 mask = FZ_MASK(iter->zone);
1082 flags = fib_flag_trans(fa->fa_type, mask, fi);
1083 if (fi)
1084 seq_printf(seq,
1085 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1086 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1087 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1088 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1089 fi->fib_window,
1090 fi->fib_rtt >> 3, &len);
1091 else
1092 seq_printf(seq,
1093 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1094 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
1095
1096 seq_printf(seq, "%*s\n", 127 - len, "");
1097out:
1098 return 0;
1099}
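
Each /proc/net/route record is padded to exactly 127 characters plus newline, which is why fib_seq_show() threads a %n through seq_printf() and then emits "%*s\n" with the remainder. The same trick in plain C, a sketch with made-up field values:

/* Sketch of the fixed-width /proc/net/route line: %n records the
 * length written, then "%*s" pads the record to 127 characters. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[160];
	int len = 0;

	snprintf(line, sizeof(line),
	         "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
	         "eth0", 0xc0000200u, 0u, 0x0001u, 0, 0u, 0, 0xffffff00u,
	         0, 0u, 0u, &len);
	snprintf(line + len, sizeof(line) - len, "%*s", 127 - len, "");

	printf("record length: %zu (should be 127)\n", strlen(line));
	return 0;
}
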
1100
1101static const struct seq_operations fib_seq_ops = {
1102 .start = fib_seq_start,
1103 .next = fib_seq_next,
1104 .stop = fib_seq_stop,
1105 .show = fib_seq_show,
1106};
1107
1108static int fib_seq_open(struct inode *inode, struct file *file)
1109{
1110 return seq_open_net(inode, file, &fib_seq_ops,
1111 sizeof(struct fib_iter_state));
1112}
1113
1114static const struct file_operations fib_seq_fops = {
1115 .owner = THIS_MODULE,
1116 .open = fib_seq_open,
1117 .read = seq_read,
1118 .llseek = seq_lseek,
1119 .release = seq_release_net,
1120};
1121
1122int __net_init fib_proc_init(struct net *net)
1123{
1124 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
1125 return -ENOMEM;
1126 return 0;
1127}
1128
1129void __net_exit fib_proc_exit(struct net *net)
1130{
1131 proc_net_remove(net, "route");
1132}
1133#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec651..af0f14aba169 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -10,7 +10,6 @@ struct fib_alias {
10 struct fib_info *fa_info; 10 struct fib_info *fa_info;
11 u8 fa_tos; 11 u8 fa_tos;
12 u8 fa_type; 12 u8 fa_type;
13 u8 fa_scope;
14 u8 fa_state; 13 u8 fa_state;
15 struct rcu_head rcu; 14 struct rcu_head rcu;
16}; 15};
@@ -25,14 +24,11 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
25} 24}
26 25
27/* Exported by fib_semantics.c */ 26/* Exported by fib_semantics.c */
28extern int fib_semantic_match(struct list_head *head,
29 const struct flowi *flp,
30 struct fib_result *res, int prefixlen, int fib_flags);
31extern void fib_release_info(struct fib_info *); 27extern void fib_release_info(struct fib_info *);
32extern struct fib_info *fib_create_info(struct fib_config *cfg); 28extern struct fib_info *fib_create_info(struct fib_config *cfg);
33extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 29extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
34extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 30extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
35 u32 tb_id, u8 type, u8 scope, __be32 dst, 31 u32 tb_id, u8 type, __be32 dst,
36 int dst_len, u8 tos, struct fib_info *fi, 32 int dst_len, u8 tos, struct fib_info *fi,
37 unsigned int); 33 unsigned int);
38extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, 34extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
@@ -51,4 +47,11 @@ static inline void fib_result_assign(struct fib_result *res,
51 res->fi = fi; 47 res->fi = fi;
52} 48}
53 49
50struct fib_prop {
51 int error;
52 u8 scope;
53};
54
55extern const struct fib_prop fib_props[RTN_MAX + 1];
56
54#endif /* _FIB_LOOKUP_H */ 57#endif /* _FIB_LOOKUP_H */
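
struct fib_prop and the fib_props export replace the table that was private to fib_semantics.c (and the per-alias fa_scope dropped above): a route type now maps to a default error and scope that the trie code consults directly. A sketch of the shape of that table; the subset of entries shown follows the initializer in fib_semantics.c:

/* Sketch of the fib_props table described by struct fib_prop. */
#include <stdio.h>
#include <errno.h>

struct fib_prop { int error; unsigned char scope; };

enum { SCOPE_UNIVERSE = 0, SCOPE_LINK = 253,
       SCOPE_HOST = 254, SCOPE_NOWHERE = 255 };	/* RT_SCOPE_* values */

static const struct fib_prop props[] = {
	[1] = { 0,             SCOPE_UNIVERSE },	/* RTN_UNICAST     */
	[2] = { 0,             SCOPE_HOST },		/* RTN_LOCAL       */
	[7] = { -EHOSTUNREACH, SCOPE_UNIVERSE },	/* RTN_UNREACHABLE */
	[8] = { -EACCES,       SCOPE_UNIVERSE },	/* RTN_PROHIBIT    */
};

int main(void)
{
	printf("RTN_UNREACHABLE -> error %d\n", props[7].error);
	return 0;
}
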
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7b..a53bb1b5b118 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,19 +41,19 @@ struct fib4_rule {
41 __be32 srcmask; 41 __be32 srcmask;
42 __be32 dst; 42 __be32 dst;
43 __be32 dstmask; 43 __be32 dstmask;
44#ifdef CONFIG_NET_CLS_ROUTE 44#ifdef CONFIG_IP_ROUTE_CLASSID
45 u32 tclassid; 45 u32 tclassid;
46#endif 46#endif
47}; 47};
48 48
49#ifdef CONFIG_NET_CLS_ROUTE 49#ifdef CONFIG_IP_ROUTE_CLASSID
50u32 fib_rules_tclass(struct fib_result *res) 50u32 fib_rules_tclass(const struct fib_result *res)
51{ 51{
52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; 52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
53} 53}
54#endif 54#endif
55 55
56int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) 56int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
57{ 57{
58 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
59 .result = res, 59 .result = res,
@@ -61,7 +61,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
61 }; 61 };
62 int err; 62 int err;
63 63
64 err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); 64 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
65 res->r = arg.rule; 65 res->r = arg.rule;
66 66
67 return err; 67 return err;
@@ -95,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
95 if (!tbl) 95 if (!tbl)
96 goto errout; 96 goto errout;
97 97
98 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); 98 err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
99 if (err > 0) 99 if (err > 0)
100 err = -EAGAIN; 100 err = -EAGAIN;
101errout: 101errout:
@@ -106,14 +106,15 @@ errout:
106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) 106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
107{ 107{
108 struct fib4_rule *r = (struct fib4_rule *) rule; 108 struct fib4_rule *r = (struct fib4_rule *) rule;
109 __be32 daddr = fl->fl4_dst; 109 struct flowi4 *fl4 = &fl->u.ip4;
110 __be32 saddr = fl->fl4_src; 110 __be32 daddr = fl4->daddr;
111 __be32 saddr = fl4->saddr;
111 112
112 if (((saddr ^ r->src) & r->srcmask) || 113 if (((saddr ^ r->src) & r->srcmask) ||
113 ((daddr ^ r->dst) & r->dstmask)) 114 ((daddr ^ r->dst) & r->dstmask))
114 return 0; 115 return 0;
115 116
116 if (r->tos && (r->tos != fl->fl4_tos)) 117 if (r->tos && (r->tos != fl4->flowi4_tos))
117 return 0; 118 return 0;
118 119
119 return 1; 120 return 1;
@@ -165,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
165 if (frh->dst_len) 166 if (frh->dst_len)
166 rule4->dst = nla_get_be32(tb[FRA_DST]); 167 rule4->dst = nla_get_be32(tb[FRA_DST]);
167 168
168#ifdef CONFIG_NET_CLS_ROUTE 169#ifdef CONFIG_IP_ROUTE_CLASSID
169 if (tb[FRA_FLOW]) 170 if (tb[FRA_FLOW])
170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 171 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171#endif 172#endif
@@ -195,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
195 if (frh->tos && (rule4->tos != frh->tos)) 196 if (frh->tos && (rule4->tos != frh->tos))
196 return 0; 197 return 0;
197 198
198#ifdef CONFIG_NET_CLS_ROUTE 199#ifdef CONFIG_IP_ROUTE_CLASSID
199 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) 200 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
200 return 0; 201 return 0;
201#endif 202#endif
@@ -224,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
224 if (rule4->src_len) 225 if (rule4->src_len)
225 NLA_PUT_BE32(skb, FRA_SRC, rule4->src); 226 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
226 227
227#ifdef CONFIG_NET_CLS_ROUTE 228#ifdef CONFIG_IP_ROUTE_CLASSID
228 if (rule4->tclassid) 229 if (rule4->tclassid)
229 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); 230 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
230#endif 231#endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b7..641a5a2a9f9c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
49static DEFINE_SPINLOCK(fib_info_lock); 49static DEFINE_SPINLOCK(fib_info_lock);
50static struct hlist_head *fib_info_hash; 50static struct hlist_head *fib_info_hash;
51static struct hlist_head *fib_info_laddrhash; 51static struct hlist_head *fib_info_laddrhash;
52static unsigned int fib_hash_size; 52static unsigned int fib_info_hash_size;
53static unsigned int fib_info_cnt; 53static unsigned int fib_info_cnt;
54 54
55#define DEVINDEX_HASHBITS 8 55#define DEVINDEX_HASHBITS 8
@@ -90,11 +90,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
90#define endfor_nexthops(fi) } 90#define endfor_nexthops(fi) }
91 91
92 92
93static const struct 93const struct fib_prop fib_props[RTN_MAX + 1] = {
94{
95 int error;
96 u8 scope;
97} fib_props[RTN_MAX + 1] = {
98 [RTN_UNSPEC] = { 94 [RTN_UNSPEC] = {
99 .error = 0, 95 .error = 0,
100 .scope = RT_SCOPE_NOWHERE, 96 .scope = RT_SCOPE_NOWHERE,
@@ -152,6 +148,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
152{ 148{
153 struct fib_info *fi = container_of(head, struct fib_info, rcu); 149 struct fib_info *fi = container_of(head, struct fib_info, rcu);
154 150
151 if (fi->fib_metrics != (u32 *) dst_default_metrics)
152 kfree(fi->fib_metrics);
155 kfree(fi); 153 kfree(fi);
156} 154}
157 155
@@ -200,7 +198,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
200#ifdef CONFIG_IP_ROUTE_MULTIPATH 198#ifdef CONFIG_IP_ROUTE_MULTIPATH
201 nh->nh_weight != onh->nh_weight || 199 nh->nh_weight != onh->nh_weight ||
202#endif 200#endif
203#ifdef CONFIG_NET_CLS_ROUTE 201#ifdef CONFIG_IP_ROUTE_CLASSID
204 nh->nh_tclassid != onh->nh_tclassid || 202 nh->nh_tclassid != onh->nh_tclassid ||
205#endif 203#endif
206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) 204 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,10 +219,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
221 219
222static inline unsigned int fib_info_hashfn(const struct fib_info *fi) 220static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
223{ 221{
224 unsigned int mask = (fib_hash_size - 1); 222 unsigned int mask = (fib_info_hash_size - 1);
225 unsigned int val = fi->fib_nhs; 223 unsigned int val = fi->fib_nhs;
226 224
227 val ^= fi->fib_protocol; 225 val ^= (fi->fib_protocol << 8) | fi->fib_scope;
228 val ^= (__force u32)fi->fib_prefsrc; 226 val ^= (__force u32)fi->fib_prefsrc;
229 val ^= fi->fib_priority; 227 val ^= fi->fib_priority;
230 for_nexthops(fi) { 228 for_nexthops(fi) {
@@ -250,10 +248,11 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
250 if (fi->fib_nhs != nfi->fib_nhs) 248 if (fi->fib_nhs != nfi->fib_nhs)
251 continue; 249 continue;
252 if (nfi->fib_protocol == fi->fib_protocol && 250 if (nfi->fib_protocol == fi->fib_protocol &&
251 nfi->fib_scope == fi->fib_scope &&
253 nfi->fib_prefsrc == fi->fib_prefsrc && 252 nfi->fib_prefsrc == fi->fib_prefsrc &&
254 nfi->fib_priority == fi->fib_priority && 253 nfi->fib_priority == fi->fib_priority &&
255 memcmp(nfi->fib_metrics, fi->fib_metrics, 254 memcmp(nfi->fib_metrics, fi->fib_metrics,
256 sizeof(fi->fib_metrics)) == 0 && 255 sizeof(u32) * RTAX_MAX) == 0 &&
257 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && 256 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
258 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 257 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
259 return fi; 258 return fi;
@@ -330,7 +329,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
330 goto errout; 329 goto errout;
331 330
332 err = fib_dump_info(skb, info->pid, seq, event, tb_id, 331 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
333 fa->fa_type, fa->fa_scope, key, dst_len, 332 fa->fa_type, key, dst_len,
334 fa->fa_tos, fa->fa_info, nlm_flags); 333 fa->fa_tos, fa->fa_info, nlm_flags);
335 if (err < 0) { 334 if (err < 0) {
336 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ 335 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -422,7 +421,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
422 421
423 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 422 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
424 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; 423 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
425#ifdef CONFIG_NET_CLS_ROUTE 424#ifdef CONFIG_IP_ROUTE_CLASSID
426 nla = nla_find(attrs, attrlen, RTA_FLOW); 425 nla = nla_find(attrs, attrlen, RTA_FLOW);
427 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 426 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
428#endif 427#endif
@@ -476,7 +475,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 475 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
477 if (nla && nla_get_be32(nla) != nh->nh_gw) 476 if (nla && nla_get_be32(nla) != nh->nh_gw)
478 return 1; 477 return 1;
479#ifdef CONFIG_NET_CLS_ROUTE 478#ifdef CONFIG_IP_ROUTE_CLASSID
480 nla = nla_find(attrs, attrlen, RTA_FLOW); 479 nla = nla_find(attrs, attrlen, RTA_FLOW);
481 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 480 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
482 return 1; 481 return 1;
@@ -562,16 +561,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
562 } 561 }
563 rcu_read_lock(); 562 rcu_read_lock();
564 { 563 {
565 struct flowi fl = { 564 struct flowi4 fl4 = {
566 .fl4_dst = nh->nh_gw, 565 .daddr = nh->nh_gw,
567 .fl4_scope = cfg->fc_scope + 1, 566 .flowi4_scope = cfg->fc_scope + 1,
568 .oif = nh->nh_oif, 567 .flowi4_oif = nh->nh_oif,
569 }; 568 };
570 569
571 /* It is not necessary, but requires a bit of thinking */ 570 /* It is not necessary, but requires a bit of thinking */
572 if (fl.fl4_scope < RT_SCOPE_LINK) 571 if (fl4.flowi4_scope < RT_SCOPE_LINK)
573 fl.fl4_scope = RT_SCOPE_LINK; 572 fl4.flowi4_scope = RT_SCOPE_LINK;
574 err = fib_lookup(net, &fl, &res); 573 err = fib_lookup(net, &fl4, &res);
575 if (err) { 574 if (err) {
576 rcu_read_unlock(); 575 rcu_read_unlock();
577 return err; 576 return err;
@@ -613,14 +612,14 @@ out:
613 612
614static inline unsigned int fib_laddr_hashfn(__be32 val) 613static inline unsigned int fib_laddr_hashfn(__be32 val)
615{ 614{
616 unsigned int mask = (fib_hash_size - 1); 615 unsigned int mask = (fib_info_hash_size - 1);
617 616
618 return ((__force u32)val ^ 617 return ((__force u32)val ^
619 ((__force u32)val >> 7) ^ 618 ((__force u32)val >> 7) ^
620 ((__force u32)val >> 14)) & mask; 619 ((__force u32)val >> 14)) & mask;
621} 620}
622 621
623static struct hlist_head *fib_hash_alloc(int bytes) 622static struct hlist_head *fib_info_hash_alloc(int bytes)
624{ 623{
625 if (bytes <= PAGE_SIZE) 624 if (bytes <= PAGE_SIZE)
626 return kzalloc(bytes, GFP_KERNEL); 625 return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +629,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
630 get_order(bytes)); 629 get_order(bytes));
631} 630}
632 631
633static void fib_hash_free(struct hlist_head *hash, int bytes) 632static void fib_info_hash_free(struct hlist_head *hash, int bytes)
634{ 633{
635 if (!hash) 634 if (!hash)
636 return; 635 return;
@@ -641,18 +640,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
641 free_pages((unsigned long) hash, get_order(bytes)); 640 free_pages((unsigned long) hash, get_order(bytes));
642} 641}
643 642
644static void fib_hash_move(struct hlist_head *new_info_hash, 643static void fib_info_hash_move(struct hlist_head *new_info_hash,
645 struct hlist_head *new_laddrhash, 644 struct hlist_head *new_laddrhash,
646 unsigned int new_size) 645 unsigned int new_size)
647{ 646{
648 struct hlist_head *old_info_hash, *old_laddrhash; 647 struct hlist_head *old_info_hash, *old_laddrhash;
649 unsigned int old_size = fib_hash_size; 648 unsigned int old_size = fib_info_hash_size;
650 unsigned int i, bytes; 649 unsigned int i, bytes;
651 650
652 spin_lock_bh(&fib_info_lock); 651 spin_lock_bh(&fib_info_lock);
653 old_info_hash = fib_info_hash; 652 old_info_hash = fib_info_hash;
654 old_laddrhash = fib_info_laddrhash; 653 old_laddrhash = fib_info_laddrhash;
655 fib_hash_size = new_size; 654 fib_info_hash_size = new_size;
656 655
657 for (i = 0; i < old_size; i++) { 656 for (i = 0; i < old_size; i++) {
658 struct hlist_head *head = &fib_info_hash[i]; 657 struct hlist_head *head = &fib_info_hash[i];
@@ -693,8 +692,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
693 spin_unlock_bh(&fib_info_lock); 692 spin_unlock_bh(&fib_info_lock);
694 693
695 bytes = old_size * sizeof(struct hlist_head *); 694 bytes = old_size * sizeof(struct hlist_head *);
696 fib_hash_free(old_info_hash, bytes); 695 fib_info_hash_free(old_info_hash, bytes);
697 fib_hash_free(old_laddrhash, bytes); 696 fib_info_hash_free(old_laddrhash, bytes);
697}
698
699__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
700{
701 nh->nh_saddr = inet_select_addr(nh->nh_dev,
702 nh->nh_gw,
703 nh->nh_parent->fib_scope);
704 nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
705
706 return nh->nh_saddr;
698} 707}
699 708
700struct fib_info *fib_create_info(struct fib_config *cfg) 709struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -705,6 +714,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
705 int nhs = 1; 714 int nhs = 1;
706 struct net *net = cfg->fc_nlinfo.nl_net; 715 struct net *net = cfg->fc_nlinfo.nl_net;
707 716
717 if (cfg->fc_type > RTN_MAX)
718 goto err_inval;
719
708 /* Fast check to catch the most weird cases */ 720 /* Fast check to catch the most weird cases */
709 if (fib_props[cfg->fc_type].scope > cfg->fc_scope) 721 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
710 goto err_inval; 722 goto err_inval;
@@ -718,8 +730,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
718#endif 730#endif
719 731
720 err = -ENOBUFS; 732 err = -ENOBUFS;
721 if (fib_info_cnt >= fib_hash_size) { 733 if (fib_info_cnt >= fib_info_hash_size) {
722 unsigned int new_size = fib_hash_size << 1; 734 unsigned int new_size = fib_info_hash_size << 1;
723 struct hlist_head *new_info_hash; 735 struct hlist_head *new_info_hash;
724 struct hlist_head *new_laddrhash; 736 struct hlist_head *new_laddrhash;
725 unsigned int bytes; 737 unsigned int bytes;
@@ -727,25 +739,32 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
727 if (!new_size) 739 if (!new_size)
728 new_size = 1; 740 new_size = 1;
729 bytes = new_size * sizeof(struct hlist_head *); 741 bytes = new_size * sizeof(struct hlist_head *);
730 new_info_hash = fib_hash_alloc(bytes); 742 new_info_hash = fib_info_hash_alloc(bytes);
731 new_laddrhash = fib_hash_alloc(bytes); 743 new_laddrhash = fib_info_hash_alloc(bytes);
732 if (!new_info_hash || !new_laddrhash) { 744 if (!new_info_hash || !new_laddrhash) {
733 fib_hash_free(new_info_hash, bytes); 745 fib_info_hash_free(new_info_hash, bytes);
734 fib_hash_free(new_laddrhash, bytes); 746 fib_info_hash_free(new_laddrhash, bytes);
735 } else 747 } else
736 fib_hash_move(new_info_hash, new_laddrhash, new_size); 748 fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
737 749
738 if (!fib_hash_size) 750 if (!fib_info_hash_size)
739 goto failure; 751 goto failure;
740 } 752 }
741 753
742 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); 754 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
743 if (fi == NULL) 755 if (fi == NULL)
744 goto failure; 756 goto failure;
757 if (cfg->fc_mx) {
758 fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
759 if (!fi->fib_metrics)
760 goto failure;
761 } else
762 fi->fib_metrics = (u32 *) dst_default_metrics;
745 fib_info_cnt++; 763 fib_info_cnt++;
746 764
747 fi->fib_net = hold_net(net); 765 fi->fib_net = hold_net(net);
748 fi->fib_protocol = cfg->fc_protocol; 766 fi->fib_protocol = cfg->fc_protocol;
767 fi->fib_scope = cfg->fc_scope;
749 fi->fib_flags = cfg->fc_flags; 768 fi->fib_flags = cfg->fc_flags;
750 fi->fib_priority = cfg->fc_priority; 769 fi->fib_priority = cfg->fc_priority;
751 fi->fib_prefsrc = cfg->fc_prefsrc; 770 fi->fib_prefsrc = cfg->fc_prefsrc;
@@ -779,7 +798,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
779 goto err_inval; 798 goto err_inval;
780 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) 799 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
781 goto err_inval; 800 goto err_inval;
782#ifdef CONFIG_NET_CLS_ROUTE 801#ifdef CONFIG_IP_ROUTE_CLASSID
783 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) 802 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
784 goto err_inval; 803 goto err_inval;
785#endif 804#endif
@@ -792,7 +811,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
792 nh->nh_oif = cfg->fc_oif; 811 nh->nh_oif = cfg->fc_oif;
793 nh->nh_gw = cfg->fc_gw; 812 nh->nh_gw = cfg->fc_gw;
794 nh->nh_flags = cfg->fc_flags; 813 nh->nh_flags = cfg->fc_flags;
795#ifdef CONFIG_NET_CLS_ROUTE 814#ifdef CONFIG_IP_ROUTE_CLASSID
796 nh->nh_tclassid = cfg->fc_flow; 815 nh->nh_tclassid = cfg->fc_flow;
797#endif 816#endif
798#ifdef CONFIG_IP_ROUTE_MULTIPATH 817#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -804,6 +823,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
804 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) 823 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
805 goto err_inval; 824 goto err_inval;
806 goto link_it; 825 goto link_it;
826 } else {
827 switch (cfg->fc_type) {
828 case RTN_UNICAST:
829 case RTN_LOCAL:
830 case RTN_BROADCAST:
831 case RTN_ANYCAST:
832 case RTN_MULTICAST:
833 break;
834 default:
835 goto err_inval;
836 }
807 } 837 }
808 838
809 if (cfg->fc_scope > RT_SCOPE_HOST) 839 if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -835,6 +865,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
835 goto err_inval; 865 goto err_inval;
836 } 866 }
837 867
868 change_nexthops(fi) {
869 fib_info_update_nh_saddr(net, nexthop_nh);
870 } endfor_nexthops(fi)
871
838link_it: 872link_it:
839 ofi = fib_find_info(fi); 873 ofi = fib_find_info(fi);
840 if (ofi) { 874 if (ofi) {
@@ -880,86 +914,8 @@ failure:
880 return ERR_PTR(err); 914 return ERR_PTR(err);
881} 915}
882 916
883/* Note! fib_semantic_match intentionally uses RCU list functions. */
884int fib_semantic_match(struct list_head *head, const struct flowi *flp,
885 struct fib_result *res, int prefixlen, int fib_flags)
886{
887 struct fib_alias *fa;
888 int nh_sel = 0;
889
890 list_for_each_entry_rcu(fa, head, fa_list) {
891 int err;
892
893 if (fa->fa_tos &&
894 fa->fa_tos != flp->fl4_tos)
895 continue;
896
897 if (fa->fa_scope < flp->fl4_scope)
898 continue;
899
900 fib_alias_accessed(fa);
901
902 err = fib_props[fa->fa_type].error;
903 if (err == 0) {
904 struct fib_info *fi = fa->fa_info;
905
906 if (fi->fib_flags & RTNH_F_DEAD)
907 continue;
908
909 switch (fa->fa_type) {
910 case RTN_UNICAST:
911 case RTN_LOCAL:
912 case RTN_BROADCAST:
913 case RTN_ANYCAST:
914 case RTN_MULTICAST:
915 for_nexthops(fi) {
916 if (nh->nh_flags & RTNH_F_DEAD)
917 continue;
918 if (!flp->oif || flp->oif == nh->nh_oif)
919 break;
920 }
921#ifdef CONFIG_IP_ROUTE_MULTIPATH
922 if (nhsel < fi->fib_nhs) {
923 nh_sel = nhsel;
924 goto out_fill_res;
925 }
926#else
927 if (nhsel < 1)
928 goto out_fill_res;
929#endif
930 endfor_nexthops(fi);
931 continue;
932
933 default:
934 pr_warning("fib_semantic_match bad type %#x\n",
935 fa->fa_type);
936 return -EINVAL;
937 }
938 }
939 return err;
940 }
941 return 1;
942
943out_fill_res:
944 res->prefixlen = prefixlen;
945 res->nh_sel = nh_sel;
946 res->type = fa->fa_type;
947 res->scope = fa->fa_scope;
948 res->fi = fa->fa_info;
949 if (!(fib_flags & FIB_LOOKUP_NOREF))
950 atomic_inc(&res->fi->fib_clntref);
951 return 0;
952}
953
954/* Find appropriate source address to this destination */
955
956__be32 __fib_res_prefsrc(struct fib_result *res)
957{
958 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
959}
960
961int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 917int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
962 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, 918 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
963 struct fib_info *fi, unsigned int flags) 919 struct fib_info *fi, unsigned int flags)
964{ 920{
965 struct nlmsghdr *nlh; 921 struct nlmsghdr *nlh;
@@ -981,7 +937,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
981 NLA_PUT_U32(skb, RTA_TABLE, tb_id); 937 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
982 rtm->rtm_type = type; 938 rtm->rtm_type = type;
983 rtm->rtm_flags = fi->fib_flags; 939 rtm->rtm_flags = fi->fib_flags;
984 rtm->rtm_scope = scope; 940 rtm->rtm_scope = fi->fib_scope;
985 rtm->rtm_protocol = fi->fib_protocol; 941 rtm->rtm_protocol = fi->fib_protocol;
986 942
987 if (rtm->rtm_dst_len) 943 if (rtm->rtm_dst_len)
@@ -1002,7 +958,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1002 958
1003 if (fi->fib_nh->nh_oif) 959 if (fi->fib_nh->nh_oif)
1004 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 960 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
1005#ifdef CONFIG_NET_CLS_ROUTE 961#ifdef CONFIG_IP_ROUTE_CLASSID
1006 if (fi->fib_nh[0].nh_tclassid) 962 if (fi->fib_nh[0].nh_tclassid)
1007 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 963 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1008#endif 964#endif
@@ -1027,7 +983,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1027 983
1028 if (nh->nh_gw) 984 if (nh->nh_gw)
1029 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 985 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1030#ifdef CONFIG_NET_CLS_ROUTE 986#ifdef CONFIG_IP_ROUTE_CLASSID
1031 if (nh->nh_tclassid) 987 if (nh->nh_tclassid)
1032 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 988 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1033#endif 989#endif
@@ -1125,6 +1081,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1125 return ret; 1081 return ret;
1126} 1082}
1127 1083
1084/* Must be invoked inside of an RCU protected region. */
1085void fib_select_default(struct fib_result *res)
1086{
1087 struct fib_info *fi = NULL, *last_resort = NULL;
1088 struct list_head *fa_head = res->fa_head;
1089 struct fib_table *tb = res->table;
1090 int order = -1, last_idx = -1;
1091 struct fib_alias *fa;
1092
1093 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1094 struct fib_info *next_fi = fa->fa_info;
1095
1096 if (next_fi->fib_scope != res->scope ||
1097 fa->fa_type != RTN_UNICAST)
1098 continue;
1099
1100 if (next_fi->fib_priority > res->fi->fib_priority)
1101 break;
1102 if (!next_fi->fib_nh[0].nh_gw ||
1103 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1104 continue;
1105
1106 fib_alias_accessed(fa);
1107
1108 if (fi == NULL) {
1109 if (next_fi != res->fi)
1110 break;
1111 } else if (!fib_detect_death(fi, order, &last_resort,
1112 &last_idx, tb->tb_default)) {
1113 fib_result_assign(res, fi);
1114 tb->tb_default = order;
1115 goto out;
1116 }
1117 fi = next_fi;
1118 order++;
1119 }
1120
1121 if (order <= 0 || fi == NULL) {
1122 tb->tb_default = -1;
1123 goto out;
1124 }
1125
1126 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1127 tb->tb_default)) {
1128 fib_result_assign(res, fi);
1129 tb->tb_default = order;
1130 goto out;
1131 }
1132
1133 if (last_idx >= 0)
1134 fib_result_assign(res, last_resort);
1135 tb->tb_default = last_idx;
1136out:
1137 return;
1138}
1139
1128#ifdef CONFIG_IP_ROUTE_MULTIPATH 1140#ifdef CONFIG_IP_ROUTE_MULTIPATH
1129 1141
1130/* 1142/*
@@ -1189,7 +1201,7 @@ int fib_sync_up(struct net_device *dev)
1189 * The algorithm is suboptimal, but it provides really 1201 * The algorithm is suboptimal, but it provides really
1190 * fair weighted route distribution. 1202 * fair weighted route distribution.
1191 */ 1203 */
1192void fib_select_multipath(const struct flowi *flp, struct fib_result *res) 1204void fib_select_multipath(struct fib_result *res)
1193{ 1205{
1194 struct fib_info *fi = res->fi; 1206 struct fib_info *fi = res->fi;
1195 int w; 1207 int w;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f280348e0fd..b92c86f6e9b3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF) 96#define IS_LEAF(n) (n->parent & T_LEAF)
97 97
98struct node { 98struct rt_trie_node {
99 unsigned long parent; 99 unsigned long parent;
100 t_key key; 100 t_key key;
101}; 101};
@@ -126,7 +126,7 @@ struct tnode {
126 struct work_struct work; 126 struct work_struct work;
127 struct tnode *tnode_free; 127 struct tnode *tnode_free;
128 }; 128 };
129 struct node *child[0]; 129 struct rt_trie_node *child[0];
130}; 130};
131 131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 132#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,16 @@ struct trie_stat {
151}; 151};
152 152
153struct trie { 153struct trie {
154 struct node *trie; 154 struct rt_trie_node *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 155#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 156 struct trie_use_stats stats;
157#endif 157#endif
158}; 158};
159 159
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 160static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 161static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 int wasfull); 162 int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn); 163static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn); 164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn); 165static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */ 166/* tnodes to free after resize(); protected by RTNL */
@@ -177,12 +177,12 @@ static const int sync_pages = 128;
177static struct kmem_cache *fn_alias_kmem __read_mostly; 177static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly; 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 179
180static inline struct tnode *node_parent(struct node *node) 180static inline struct tnode *node_parent(struct rt_trie_node *node)
181{ 181{
182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); 182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183} 183}
184 184
185static inline struct tnode *node_parent_rcu(struct node *node) 185static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
@@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node)
192/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
193 * but that macro() assumes that value is a pointer. 193 * but that macro() assumes that value is a pointer.
194 */ 194 */
195static inline void node_set_parent(struct node *node, struct tnode *ptr) 195static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
196{ 196{
197 smp_wmb(); 197 smp_wmb();
198 node->parent = (unsigned long)ptr | NODE_TYPE(node); 198 node->parent = (unsigned long)ptr | NODE_TYPE(node);
199} 199}
200 200
201static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) 201static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
202{ 202{
203 BUG_ON(i >= 1U << tn->bits); 203 BUG_ON(i >= 1U << tn->bits);
204 204
205 return tn->child[i]; 205 return tn->child[i];
206} 206}
207 207
208static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) 208static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209{ 209{
210 struct node *ret = tnode_get_child(tn, i); 210 struct rt_trie_node *ret = tnode_get_child(tn, i);
211 211
212 return rcu_dereference_rtnl(ret); 212 return rcu_dereference_rtnl(ret);
213} 213}
@@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn)
217 return 1 << tn->bits; 217 return 1 << tn->bits;
218} 218}
219 219
220static inline t_key mask_pfx(t_key k, unsigned short l) 220static inline t_key mask_pfx(t_key k, unsigned int l)
221{ 221{
222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
223} 223}
224 224
225static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 225static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
226{ 226{
227 if (offset < KEYLENGTH) 227 if (offset < KEYLENGTH)
228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
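
tkey_extract_bits() is the trie's child-index primitive: shift the key left to discard the offset bits already consumed by ancestor nodes, then shift right so only the requested bits survive; the result selects one of the tnode's 2^bits children. A sketch with sample values:

/* Sketch of tkey_extract_bits(): take `bits` bits of a 32-bit key
 * starting `offset` bits from the top. */
#include <stdio.h>
#include <stdint.h>

#define KEYLENGTH 32

static uint32_t extract(uint32_t a, unsigned int offset, unsigned int bits)
{
	if (offset < KEYLENGTH)
		return (uint32_t)(a << offset) >> (KEYLENGTH - bits);
	return 0;
}

int main(void)
{
	uint32_t key = 0xc0000209;	/* 192.0.2.9 as a host-order key */

	/* Root tnode with 4 bits at offset 0: child 0xc (binary 1100). */
	printf("slot = %x\n", extract(key, 0, 4));
	/* Next level, 8 bits starting at offset 4: child 0x00. */
	printf("slot = %x\n", extract(key, 4, 8));
	return 0;
}
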
@@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
378{ 378{
379 struct tnode *tn = container_of(head, struct tnode, rcu); 379 struct tnode *tn = container_of(head, struct tnode, rcu);
380 size_t size = sizeof(struct tnode) + 380 size_t size = sizeof(struct tnode) +
381 (sizeof(struct node *) << tn->bits); 381 (sizeof(struct rt_trie_node *) << tn->bits);
382 382
383 if (size <= PAGE_SIZE) 383 if (size <= PAGE_SIZE)
384 kfree(tn); 384 kfree(tn);
@@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn)
402 tn->tnode_free = tnode_free_head; 402 tn->tnode_free = tnode_free_head;
403 tnode_free_head = tn; 403 tnode_free_head = tn;
404 tnode_free_size += sizeof(struct tnode) + 404 tnode_free_size += sizeof(struct tnode) +
405 (sizeof(struct node *) << tn->bits); 405 (sizeof(struct rt_trie_node *) << tn->bits);
406} 406}
407 407
408static void tnode_free_flush(void) 408static void tnode_free_flush(void)
@@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen)
443 443
444static struct tnode *tnode_new(t_key key, int pos, int bits) 444static struct tnode *tnode_new(t_key key, int pos, int bits)
445{ 445{
446 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); 446 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
447 struct tnode *tn = tnode_alloc(sz); 447 struct tnode *tn = tnode_alloc(sz);
448 448
449 if (tn) { 449 if (tn) {
@@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
456 } 456 }
457 457
458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
459 sizeof(struct node) << bits); 459 sizeof(struct rt_trie_node) << bits);
460 return tn; 460 return tn;
461} 461}
462 462
@@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
465 * and no bits are skipped. See discussion in dyntree paper p. 6 465 * and no bits are skipped. See discussion in dyntree paper p. 6
466 */ 466 */
467 467
468static inline int tnode_full(const struct tnode *tn, const struct node *n) 468static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
469{ 469{
470 if (n == NULL || IS_LEAF(n)) 470 if (n == NULL || IS_LEAF(n))
471 return 0; 471 return 0;
@@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
474} 474}
475 475
476static inline void put_child(struct trie *t, struct tnode *tn, int i, 476static inline void put_child(struct trie *t, struct tnode *tn, int i,
477 struct node *n) 477 struct rt_trie_node *n)
478{ 478{
479 tnode_put_child_reorg(tn, i, n, -1); 479 tnode_put_child_reorg(tn, i, n, -1);
480} 480}
@@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
484 * Update the value of full_children and empty_children. 484 * Update the value of full_children and empty_children.
485 */ 485 */
486 486
487static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 487static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
488 int wasfull) 488 int wasfull)
489{ 489{
490 struct node *chi = tn->child[i]; 490 struct rt_trie_node *chi = tn->child[i];
491 int isfull; 491 int isfull;
492 492
493 BUG_ON(i >= 1<<tn->bits); 493 BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
515} 515}
516 516
517#define MAX_WORK 10 517#define MAX_WORK 10
518static struct node *resize(struct trie *t, struct tnode *tn) 518static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
519{ 519{
520 int i; 520 int i;
521 struct tnode *old_tn; 521 struct tnode *old_tn;
@@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
605 605
606 /* Keep root node larger */ 606 /* Keep root node larger */
607 607
608 if (!node_parent((struct node *)tn)) { 608 if (!node_parent((struct rt_trie_node *)tn)) {
609 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
610 halve_threshold_use = halve_threshold_root; 610 halve_threshold_use = halve_threshold_root;
611 } else { 611 } else {
@@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
635 635
636 /* Return if at least one inflate is run */ 636 /* Return if at least one inflate is run */
637 if (max_work != MAX_WORK) 637 if (max_work != MAX_WORK)
638 return (struct node *) tn; 638 return (struct rt_trie_node *) tn;
639 639
640 /* 640 /*
641 * Halve as long as the number of empty children in this 641 * Halve as long as the number of empty children in this
@@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
663 if (tn->empty_children == tnode_child_length(tn) - 1) { 663 if (tn->empty_children == tnode_child_length(tn) - 1) {
664one_child: 664one_child:
665 for (i = 0; i < tnode_child_length(tn); i++) { 665 for (i = 0; i < tnode_child_length(tn); i++) {
666 struct node *n; 666 struct rt_trie_node *n;
667 667
668 n = tn->child[i]; 668 n = tn->child[i];
669 if (!n) 669 if (!n)
@@ -676,7 +676,7 @@ one_child:
676 return n; 676 return n;
677 } 677 }
678 } 678 }
679 return (struct node *) tn; 679 return (struct rt_trie_node *) tn;
680} 680}
681 681
682static struct tnode *inflate(struct trie *t, struct tnode *tn) 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
723 goto nomem; 723 goto nomem;
724 } 724 }
725 725
726 put_child(t, tn, 2*i, (struct node *) left); 726 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
727 put_child(t, tn, 2*i+1, (struct node *) right); 727 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
728 } 728 }
729 } 729 }
730 730
731 for (i = 0; i < olen; i++) { 731 for (i = 0; i < olen; i++) {
732 struct tnode *inode; 732 struct tnode *inode;
733 struct node *node = tnode_get_child(oldtnode, i); 733 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
734 struct tnode *left, *right; 734 struct tnode *left, *right;
735 int size, j; 735 int size, j;
736 736
@@ -825,7 +825,7 @@ nomem:
825static struct tnode *halve(struct trie *t, struct tnode *tn) 825static struct tnode *halve(struct trie *t, struct tnode *tn)
826{ 826{
827 struct tnode *oldtnode = tn; 827 struct tnode *oldtnode = tn;
828 struct node *left, *right; 828 struct rt_trie_node *left, *right;
829 int i; 829 int i;
830 int olen = tnode_child_length(tn); 830 int olen = tnode_child_length(tn);
831 831
@@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
856 if (!newn) 856 if (!newn)
857 goto nomem; 857 goto nomem;
858 858
859 put_child(t, tn, i/2, (struct node *)newn); 859 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
860 } 860 }
861 861
862 } 862 }
@@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key)
958{ 958{
959 int pos; 959 int pos;
960 struct tnode *tn; 960 struct tnode *tn;
961 struct node *n; 961 struct rt_trie_node *n;
962 962
963 pos = 0; 963 pos = 0;
964 n = rcu_dereference_rtnl(t->trie); 964 n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
993 993
994 key = tn->key; 994 key = tn->key;
995 995
996 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 996 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
997 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 997 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
999 tn = (struct tnode *) resize(t, (struct tnode *)tn); 999 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000 1000
1001 tnode_put_child_reorg((struct tnode *)tp, cindex, 1001 tnode_put_child_reorg((struct tnode *)tp, cindex,
1002 (struct node *)tn, wasfull); 1002 (struct rt_trie_node *)tn, wasfull);
1003 1003
1004 tp = node_parent((struct node *) tn); 1004 tp = node_parent((struct rt_trie_node *) tn);
1005 if (!tp) 1005 if (!tp)
1006 rcu_assign_pointer(t->trie, (struct node *)tn); 1006 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1007 1007
1008 tnode_free_flush(); 1008 tnode_free_flush();
1009 if (!tp) 1009 if (!tp)
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1015 if (IS_TNODE(tn)) 1015 if (IS_TNODE(tn))
1016 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1016 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017 1017
1018 rcu_assign_pointer(t->trie, (struct node *)tn); 1018 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1019 tnode_free_flush(); 1019 tnode_free_flush();
1020} 1020}
1021 1021
@@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1025{ 1025{
1026 int pos, newpos; 1026 int pos, newpos;
1027 struct tnode *tp = NULL, *tn = NULL; 1027 struct tnode *tp = NULL, *tn = NULL;
1028 struct node *n; 1028 struct rt_trie_node *n;
1029 struct leaf *l; 1029 struct leaf *l;
1030 int missbit; 1030 int missbit;
1031 struct list_head *fa_head = NULL; 1031 struct list_head *fa_head = NULL;
@@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1111 if (t->trie && n == NULL) { 1111 if (t->trie && n == NULL) {
1112 /* Case 2: n is NULL, and will just insert a new leaf */ 1112 /* Case 2: n is NULL, and will just insert a new leaf */
1113 1113
1114 node_set_parent((struct node *)l, tp); 1114 node_set_parent((struct rt_trie_node *)l, tp);
1115 1115
1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1117 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1117 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1118 } else { 1118 } else {
1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1120 /* 1120 /*
@@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1141 return NULL; 1141 return NULL;
1142 } 1142 }
1143 1143
1144 node_set_parent((struct node *)tn, tp); 1144 node_set_parent((struct rt_trie_node *)tn, tp);
1145 1145
1146 missbit = tkey_extract_bits(key, newpos, 1); 1146 missbit = tkey_extract_bits(key, newpos, 1);
1147 put_child(t, tn, missbit, (struct node *)l); 1147 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1148 put_child(t, tn, 1-missbit, n); 1148 put_child(t, tn, 1-missbit, n);
1149 1149
1150 if (tp) { 1150 if (tp) {
1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1152 put_child(t, (struct tnode *)tp, cindex, 1152 put_child(t, (struct tnode *)tp, cindex,
1153 (struct node *)tn); 1153 (struct rt_trie_node *)tn);
1154 } else { 1154 } else {
1155 rcu_assign_pointer(t->trie, (struct node *)tn); 1155 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1156 tp = tn; 1156 tp = tn;
1157 } 1157 }
1158 } 1158 }
@@ -1245,7 +1245,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1245 if (fa->fa_info->fib_priority != fi->fib_priority) 1245 if (fa->fa_info->fib_priority != fi->fib_priority)
1246 break; 1246 break;
1247 if (fa->fa_type == cfg->fc_type && 1247 if (fa->fa_type == cfg->fc_type &&
1248 fa->fa_scope == cfg->fc_scope &&
1249 fa->fa_info == fi) { 1248 fa->fa_info == fi) {
1250 fa_match = fa; 1249 fa_match = fa;
1251 break; 1250 break;
@@ -1271,7 +1270,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1271 new_fa->fa_tos = fa->fa_tos; 1270 new_fa->fa_tos = fa->fa_tos;
1272 new_fa->fa_info = fi; 1271 new_fa->fa_info = fi;
1273 new_fa->fa_type = cfg->fc_type; 1272 new_fa->fa_type = cfg->fc_type;
1274 new_fa->fa_scope = cfg->fc_scope;
1275 state = fa->fa_state; 1273 state = fa->fa_state;
1276 new_fa->fa_state = state & ~FA_S_ACCESSED; 1274 new_fa->fa_state = state & ~FA_S_ACCESSED;
1277 1275
@@ -1308,7 +1306,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1308 new_fa->fa_info = fi; 1306 new_fa->fa_info = fi;
1309 new_fa->fa_tos = tos; 1307 new_fa->fa_tos = tos;
1310 new_fa->fa_type = cfg->fc_type; 1308 new_fa->fa_type = cfg->fc_type;
1311 new_fa->fa_scope = cfg->fc_scope;
1312 new_fa->fa_state = 0; 1309 new_fa->fa_state = 0;
1313 /* 1310 /*
1314 * Insert new entry to the list. 1311 * Insert new entry to the list.
@@ -1340,8 +1337,8 @@ err:
1340} 1337}
1341 1338
1342/* should be called with rcu_read_lock */ 1339/* should be called with rcu_read_lock */
1343static int check_leaf(struct trie *t, struct leaf *l, 1340static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1344 t_key key, const struct flowi *flp, 1341 t_key key, const struct flowi4 *flp,
1345 struct fib_result *res, int fib_flags) 1342 struct fib_result *res, int fib_flags)
1346{ 1343{
1347 struct leaf_info *li; 1344 struct leaf_info *li;
@@ -1349,40 +1346,75 @@ static int check_leaf(struct trie *t, struct leaf *l,
1349 struct hlist_node *node; 1346 struct hlist_node *node;
1350 1347
1351 hlist_for_each_entry_rcu(li, node, hhead, hlist) { 1348 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1352 int err; 1349 struct fib_alias *fa;
1353 int plen = li->plen; 1350 int plen = li->plen;
1354 __be32 mask = inet_make_mask(plen); 1351 __be32 mask = inet_make_mask(plen);
1355 1352
1356 if (l->key != (key & ntohl(mask))) 1353 if (l->key != (key & ntohl(mask)))
1357 continue; 1354 continue;
1358 1355
1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); 1356 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1357 struct fib_info *fi = fa->fa_info;
1358 int nhsel, err;
1360 1359
1360 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1361 continue;
1362 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1363 continue;
1364 fib_alias_accessed(fa);
1365 err = fib_props[fa->fa_type].error;
1366 if (err) {
1361#ifdef CONFIG_IP_FIB_TRIE_STATS 1367#ifdef CONFIG_IP_FIB_TRIE_STATS
1362 if (err <= 0) 1368 t->stats.semantic_match_passed++;
1363 t->stats.semantic_match_passed++; 1369#endif
1364 else 1370 return err;
1365 t->stats.semantic_match_miss++; 1371 }
1372 if (fi->fib_flags & RTNH_F_DEAD)
1373 continue;
1374 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1375 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1376
1377 if (nh->nh_flags & RTNH_F_DEAD)
1378 continue;
1379 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1380 continue;
1381
1382#ifdef CONFIG_IP_FIB_TRIE_STATS
1383 t->stats.semantic_match_passed++;
1384#endif
1385 res->prefixlen = plen;
1386 res->nh_sel = nhsel;
1387 res->type = fa->fa_type;
1388 res->scope = fa->fa_info->fib_scope;
1389 res->fi = fi;
1390 res->table = tb;
1391 res->fa_head = &li->falh;
1392 if (!(fib_flags & FIB_LOOKUP_NOREF))
1393 atomic_inc(&res->fi->fib_clntref);
1394 return 0;
1395 }
1396 }
1397
1398#ifdef CONFIG_IP_FIB_TRIE_STATS
1399 t->stats.semantic_match_miss++;
1366#endif 1400#endif
1367 if (err <= 0)
1368 return err;
1369 } 1401 }
1370 1402
1371 return 1; 1403 return 1;
1372} 1404}
1373 1405
1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1406int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1375 struct fib_result *res, int fib_flags) 1407 struct fib_result *res, int fib_flags)
1376{ 1408{
1377 struct trie *t = (struct trie *) tb->tb_data; 1409 struct trie *t = (struct trie *) tb->tb_data;
1378 int ret; 1410 int ret;
1379 struct node *n; 1411 struct rt_trie_node *n;
1380 struct tnode *pn; 1412 struct tnode *pn;
1381 int pos, bits; 1413 unsigned int pos, bits;
1382 t_key key = ntohl(flp->fl4_dst); 1414 t_key key = ntohl(flp->daddr);
1383 int chopped_off; 1415 unsigned int chopped_off;
1384 t_key cindex = 0; 1416 t_key cindex = 0;
1385 int current_prefix_length = KEYLENGTH; 1417 unsigned int current_prefix_length = KEYLENGTH;
1386 struct tnode *cn; 1418 struct tnode *cn;
1387 t_key pref_mismatch; 1419 t_key pref_mismatch;
1388 1420
@@ -1398,7 +1430,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1398 1430
1399 /* Just a leaf? */ 1431 /* Just a leaf? */
1400 if (IS_LEAF(n)) { 1432 if (IS_LEAF(n)) {
1401 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1433 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1402 goto found; 1434 goto found;
1403 } 1435 }
1404 1436
@@ -1423,7 +1455,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1423 } 1455 }
1424 1456
1425 if (IS_LEAF(n)) { 1457 if (IS_LEAF(n)) {
1426 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1458 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1427 if (ret > 0) 1459 if (ret > 0)
1428 goto backtrace; 1460 goto backtrace;
1429 goto found; 1461 goto found;
@@ -1541,7 +1573,7 @@ backtrace:
1541 if (chopped_off <= pn->bits) { 1573 if (chopped_off <= pn->bits) {
1542 cindex &= ~(1 << (chopped_off-1)); 1574 cindex &= ~(1 << (chopped_off-1));
1543 } else { 1575 } else {
1544 struct tnode *parent = node_parent_rcu((struct node *) pn); 1576 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1545 if (!parent) 1577 if (!parent)
1546 goto failed; 1578 goto failed;
1547 1579
@@ -1568,7 +1600,7 @@ found:
1568 */ 1600 */
1569static void trie_leaf_remove(struct trie *t, struct leaf *l) 1601static void trie_leaf_remove(struct trie *t, struct leaf *l)
1570{ 1602{
1571 struct tnode *tp = node_parent((struct node *) l); 1603 struct tnode *tp = node_parent((struct rt_trie_node *) l);
1572 1604
1573 pr_debug("entering trie_leaf_remove(%p)\n", l); 1605 pr_debug("entering trie_leaf_remove(%p)\n", l);
1574 1606
@@ -1629,7 +1661,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1629 1661
1630 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && 1662 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1631 (cfg->fc_scope == RT_SCOPE_NOWHERE || 1663 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1632 fa->fa_scope == cfg->fc_scope) && 1664 fa->fa_info->fib_scope == cfg->fc_scope) &&
1665 (!cfg->fc_prefsrc ||
1666 fi->fib_prefsrc == cfg->fc_prefsrc) &&
1633 (!cfg->fc_protocol || 1667 (!cfg->fc_protocol ||
1634 fi->fib_protocol == cfg->fc_protocol) && 1668 fi->fib_protocol == cfg->fc_protocol) &&
1635 fib_nh_match(cfg, fi) == 0) { 1669 fib_nh_match(cfg, fi) == 0) {
@@ -1706,7 +1740,7 @@ static int trie_flush_leaf(struct leaf *l)
1706 * Scan for the next right leaf starting at node p->child[idx] 1740 * Scan for the next right leaf starting at node p->child[idx]
1707 * Since we have a back pointer, no recursion is necessary. 1741 * Since we have a back pointer, no recursion is necessary.
1708 */ 1742 */
1709static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) 1743static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1710{ 1744{
1711 do { 1745 do {
1712 t_key idx; 1746 t_key idx;
@@ -1732,7 +1766,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1732 } 1766 }
1733 1767
1734 /* Node empty, walk back up to parent */ 1768 /* Node empty, walk back up to parent */
1735 c = (struct node *) p; 1769 c = (struct rt_trie_node *) p;
1736 } while ((p = node_parent_rcu(c)) != NULL); 1770 } while ((p = node_parent_rcu(c)) != NULL);
1737 1771
1738 return NULL; /* Root of trie */ 1772 return NULL; /* Root of trie */
@@ -1753,7 +1787,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
1753 1787
1754static struct leaf *trie_nextleaf(struct leaf *l) 1788static struct leaf *trie_nextleaf(struct leaf *l)
1755{ 1789{
1756 struct node *c = (struct node *) l; 1790 struct rt_trie_node *c = (struct rt_trie_node *) l;
1757 struct tnode *p = node_parent_rcu(c); 1791 struct tnode *p = node_parent_rcu(c);
1758 1792
1759 if (!p) 1793 if (!p)
@@ -1802,80 +1836,6 @@ void fib_free_table(struct fib_table *tb)
1802 kfree(tb); 1836 kfree(tb);
1803} 1837}
1804 1838
1805void fib_table_select_default(struct fib_table *tb,
1806 const struct flowi *flp,
1807 struct fib_result *res)
1808{
1809 struct trie *t = (struct trie *) tb->tb_data;
1810 int order, last_idx;
1811 struct fib_info *fi = NULL;
1812 struct fib_info *last_resort;
1813 struct fib_alias *fa = NULL;
1814 struct list_head *fa_head;
1815 struct leaf *l;
1816
1817 last_idx = -1;
1818 last_resort = NULL;
1819 order = -1;
1820
1821 rcu_read_lock();
1822
1823 l = fib_find_node(t, 0);
1824 if (!l)
1825 goto out;
1826
1827 fa_head = get_fa_head(l, 0);
1828 if (!fa_head)
1829 goto out;
1830
1831 if (list_empty(fa_head))
1832 goto out;
1833
1834 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1835 struct fib_info *next_fi = fa->fa_info;
1836
1837 if (fa->fa_scope != res->scope ||
1838 fa->fa_type != RTN_UNICAST)
1839 continue;
1840
1841 if (next_fi->fib_priority > res->fi->fib_priority)
1842 break;
1843 if (!next_fi->fib_nh[0].nh_gw ||
1844 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1845 continue;
1846
1847 fib_alias_accessed(fa);
1848
1849 if (fi == NULL) {
1850 if (next_fi != res->fi)
1851 break;
1852 } else if (!fib_detect_death(fi, order, &last_resort,
1853 &last_idx, tb->tb_default)) {
1854 fib_result_assign(res, fi);
1855 tb->tb_default = order;
1856 goto out;
1857 }
1858 fi = next_fi;
1859 order++;
1860 }
1861 if (order <= 0 || fi == NULL) {
1862 tb->tb_default = -1;
1863 goto out;
1864 }
1865
1866 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1867 tb->tb_default)) {
1868 fib_result_assign(res, fi);
1869 tb->tb_default = order;
1870 goto out;
1871 }
1872 if (last_idx >= 0)
1873 fib_result_assign(res, last_resort);
1874 tb->tb_default = last_idx;
1875out:
1876 rcu_read_unlock();
1877}
1878
1879static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, 1839static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1880 struct fib_table *tb, 1840 struct fib_table *tb,
1881 struct sk_buff *skb, struct netlink_callback *cb) 1841 struct sk_buff *skb, struct netlink_callback *cb)
@@ -1900,7 +1860,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1900 RTM_NEWROUTE, 1860 RTM_NEWROUTE,
1901 tb->tb_id, 1861 tb->tb_id,
1902 fa->fa_type, 1862 fa->fa_type,
1903 fa->fa_scope,
1904 xkey, 1863 xkey,
1905 plen, 1864 plen,
1906 fa->fa_tos, 1865 fa->fa_tos,
@@ -1990,7 +1949,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1990 return skb->len; 1949 return skb->len;
1991} 1950}
1992 1951
1993void __init fib_hash_init(void) 1952void __init fib_trie_init(void)
1994{ 1953{
1995 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1954 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1996 sizeof(struct fib_alias), 1955 sizeof(struct fib_alias),
@@ -2003,8 +1962,7 @@ void __init fib_hash_init(void)
2003} 1962}
2004 1963
2005 1964
2006/* Fix more generic FIB names for init later */ 1965struct fib_table *fib_trie_table(u32 id)
2007struct fib_table *fib_hash_table(u32 id)
2008{ 1966{
2009 struct fib_table *tb; 1967 struct fib_table *tb;
2010 struct trie *t; 1968 struct trie *t;
@@ -2036,7 +1994,7 @@ struct fib_trie_iter {
2036 unsigned int depth; 1994 unsigned int depth;
2037}; 1995};
2038 1996
2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 1997static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2040{ 1998{
2041 struct tnode *tn = iter->tnode; 1999 struct tnode *tn = iter->tnode;
2042 unsigned int cindex = iter->index; 2000 unsigned int cindex = iter->index;
@@ -2050,7 +2008,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2050 iter->tnode, iter->index, iter->depth); 2008 iter->tnode, iter->index, iter->depth);
2051rescan: 2009rescan:
2052 while (cindex < (1<<tn->bits)) { 2010 while (cindex < (1<<tn->bits)) {
2053 struct node *n = tnode_get_child_rcu(tn, cindex); 2011 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2054 2012
2055 if (n) { 2013 if (n) {
2056 if (IS_LEAF(n)) { 2014 if (IS_LEAF(n)) {
@@ -2069,7 +2027,7 @@ rescan:
2069 } 2027 }
2070 2028
2071 /* Current node exhausted, pop back up */ 2029 /* Current node exhausted, pop back up */
2072 p = node_parent_rcu((struct node *)tn); 2030 p = node_parent_rcu((struct rt_trie_node *)tn);
2073 if (p) { 2031 if (p) {
2074 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 2032 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2075 tn = p; 2033 tn = p;
@@ -2081,10 +2039,10 @@ rescan:
2081 return NULL; 2039 return NULL;
2082} 2040}
2083 2041
2084static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2042static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2085 struct trie *t) 2043 struct trie *t)
2086{ 2044{
2087 struct node *n; 2045 struct rt_trie_node *n;
2088 2046
2089 if (!t) 2047 if (!t)
2090 return NULL; 2048 return NULL;
@@ -2108,7 +2066,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2108 2066
2109static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2067static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2110{ 2068{
2111 struct node *n; 2069 struct rt_trie_node *n;
2112 struct fib_trie_iter iter; 2070 struct fib_trie_iter iter;
2113 2071
2114 memset(s, 0, sizeof(*s)); 2072 memset(s, 0, sizeof(*s));
@@ -2181,7 +2139,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2181 seq_putc(seq, '\n'); 2139 seq_putc(seq, '\n');
2182 seq_printf(seq, "\tPointers: %u\n", pointers); 2140 seq_printf(seq, "\tPointers: %u\n", pointers);
2183 2141
2184 bytes += sizeof(struct node *) * pointers; 2142 bytes += sizeof(struct rt_trie_node *) * pointers;
2185 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2143 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2186 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2144 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2187} 2145}
@@ -2262,7 +2220,7 @@ static const struct file_operations fib_triestat_fops = {
2262 .release = single_release_net, 2220 .release = single_release_net,
2263}; 2221};
2264 2222
2265static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2223static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2266{ 2224{
2267 struct fib_trie_iter *iter = seq->private; 2225 struct fib_trie_iter *iter = seq->private;
2268 struct net *net = seq_file_net(seq); 2226 struct net *net = seq_file_net(seq);
@@ -2275,7 +2233,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2275 struct fib_table *tb; 2233 struct fib_table *tb;
2276 2234
2277 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { 2235 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2278 struct node *n; 2236 struct rt_trie_node *n;
2279 2237
2280 for (n = fib_trie_get_first(iter, 2238 for (n = fib_trie_get_first(iter,
2281 (struct trie *) tb->tb_data); 2239 (struct trie *) tb->tb_data);
@@ -2304,7 +2262,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2304 struct fib_table *tb = iter->tb; 2262 struct fib_table *tb = iter->tb;
2305 struct hlist_node *tb_node; 2263 struct hlist_node *tb_node;
2306 unsigned int h; 2264 unsigned int h;
2307 struct node *n; 2265 struct rt_trie_node *n;
2308 2266
2309 ++*pos; 2267 ++*pos;
2310 /* next node in same table */ 2268 /* next node in same table */
@@ -2390,7 +2348,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2390static int fib_trie_seq_show(struct seq_file *seq, void *v) 2348static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{ 2349{
2392 const struct fib_trie_iter *iter = seq->private; 2350 const struct fib_trie_iter *iter = seq->private;
2393 struct node *n = v; 2351 struct rt_trie_node *n = v;
2394 2352
2395 if (!node_parent_rcu(n)) 2353 if (!node_parent_rcu(n))
2396 fib_table_print(seq, iter->tb); 2354 fib_table_print(seq, iter->tb);
@@ -2422,7 +2380,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2422 seq_indent(seq, iter->depth+1); 2380 seq_indent(seq, iter->depth+1);
2423 seq_printf(seq, " /%d %s %s", li->plen, 2381 seq_printf(seq, " /%d %s %s", li->plen,
2424 rtn_scope(buf1, sizeof(buf1), 2382 rtn_scope(buf1, sizeof(buf1),
2425 fa->fa_scope), 2383 fa->fa_info->fib_scope),
2426 rtn_type(buf2, sizeof(buf2), 2384 rtn_type(buf2, sizeof(buf2),
2427 fa->fa_type)); 2385 fa->fa_type));
2428 if (fa->fa_tos) 2386 if (fa->fa_tos)
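
With fib_semantic_match() gone, the check_leaf() hunk above carries the whole per-leaf decision: walk the aliases, filter on TOS and scope, return the route-type error for special route types, and pick the first live nexthop that matches the requested output interface. The selection order can be sketched compactly with simplified stand-in types (not the kernel's fib_alias/fib_info/fib_nh):

#include <stdbool.h>
#include <stddef.h>

struct nh    { bool dead; int oif; };
struct info  { bool dead; int scope; int nh_count; struct nh nh[2]; };
struct alias { int tos; int type_error; const struct info *fi; };

/* Returns 0 with *out set on the first usable nexthop, a negative
 * route-type error (unreachable/prohibit style), or 1 for "no match,
 * keep backtracking up the trie". */
static int semantic_match(const struct alias *fa, size_t n,
			  int want_tos, int want_scope, int want_oif,
			  const struct nh **out)
{
	for (size_t i = 0; i < n; i++) {
		if (fa[i].tos && fa[i].tos != want_tos)
			continue;	/* alias bound to another TOS */
		if (fa[i].fi->scope < want_scope)
			continue;	/* scope must be at least as tight as requested */
		if (fa[i].type_error)
			return fa[i].type_error;
		if (fa[i].fi->dead)
			continue;
		for (int s = 0; s < fa[i].fi->nh_count; s++) {
			const struct nh *nh = &fa[i].fi->nh[s];

			if (nh->dead)
				continue;
			if (want_oif && want_oif != nh->oif)
				continue;
			*out = nh;	/* first fit wins */
			return 0;
		}
	}
	return 1;
}

int main(void)
{
	const struct nh *hit = NULL;
	const struct info fi = { false, 0, 1, { { false, 2 } } };
	const struct alias fa[] = { { 0, 0, &fi } };

	return semantic_match(fa, 1, 0, 0, 2, &hit);	/* 0: nh on oif 2 */
}
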
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea0..a91dc1611081 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
233 * Send an ICMP frame. 233 * Send an ICMP frame.
234 */ 234 */
235 235
236/* 236static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
237 * Check transmit rate limitation for given message.
238 * The rate information is held in the destination cache now.
239 * This function is generic and could be used for other purposes
240 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
241 *
242 * Note that the same dst_entry fields are modified by functions in
243 * route.c too, but these work for packet destinations while xrlim_allow
244 * works for icmp destinations. This means the rate limiting information
245 * for one "ip object" is shared - and these ICMPs are twice limited:
246 * by source and by destination.
247 *
248 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
249 * SHOULD allow setting of rate limits
250 *
251 * Shared between ICMPv4 and ICMPv6.
252 */
253#define XRLIM_BURST_FACTOR 6
254int xrlim_allow(struct dst_entry *dst, int timeout)
255{
256 unsigned long now, token = dst->rate_tokens;
257 int rc = 0;
258
259 now = jiffies;
260 token += now - dst->rate_last;
261 dst->rate_last = now;
262 if (token > XRLIM_BURST_FACTOR * timeout)
263 token = XRLIM_BURST_FACTOR * timeout;
264 if (token >= timeout) {
265 token -= timeout;
266 rc = 1;
267 }
268 dst->rate_tokens = token;
269 return rc;
270}
271EXPORT_SYMBOL(xrlim_allow);
272
273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
274 int type, int code) 237 int type, int code)
275{ 238{
276 struct dst_entry *dst = &rt->dst; 239 struct dst_entry *dst = &rt->dst;
277 int rc = 1; 240 bool rc = true;
278 241
279 if (type > NR_ICMP_TYPES) 242 if (type > NR_ICMP_TYPES)
280 goto out; 243 goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
288 goto out; 251 goto out;
289 252
290 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
291 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
292 rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); 255 if (!rt->peer)
256 rt_bind_peer(rt, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
258 net->ipv4.sysctl_icmp_ratelimit);
259 }
293out: 260out:
294 return rc; 261 return rc;
295} 262}
@@ -386,12 +353,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
386 daddr = icmp_param->replyopts.faddr; 353 daddr = icmp_param->replyopts.faddr;
387 } 354 }
388 { 355 {
389 struct flowi fl = { .fl4_dst= daddr, 356 struct flowi4 fl4 = {
390 .fl4_src = rt->rt_spec_dst, 357 .daddr = daddr,
391 .fl4_tos = RT_TOS(ip_hdr(skb)->tos), 358 .saddr = rt->rt_spec_dst,
392 .proto = IPPROTO_ICMP }; 359 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
393 security_skb_classify_flow(skb, &fl); 360 .flowi4_proto = IPPROTO_ICMP,
394 if (ip_route_output_key(net, &rt, &fl)) 361 };
362 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
363 rt = ip_route_output_key(net, &fl4);
364 if (IS_ERR(rt))
395 goto out_unlock; 365 goto out_unlock;
396 } 366 }
397 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, 367 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
@@ -402,6 +372,97 @@ out_unlock:
402 icmp_xmit_unlock(sk); 372 icmp_xmit_unlock(sk);
403} 373}
404 374
375static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in,
376 struct iphdr *iph,
377 __be32 saddr, u8 tos,
378 int type, int code,
379 struct icmp_bxm *param)
380{
381 struct flowi4 fl4 = {
382 .daddr = (param->replyopts.srr ?
383 param->replyopts.faddr : iph->saddr),
384 .saddr = saddr,
385 .flowi4_tos = RT_TOS(tos),
386 .flowi4_proto = IPPROTO_ICMP,
387 .fl4_icmp_type = type,
388 .fl4_icmp_code = code,
389 };
390 struct rtable *rt, *rt2;
391 int err;
392
393 security_skb_classify_flow(skb_in, flowi4_to_flowi(&fl4));
394 rt = __ip_route_output_key(net, &fl4);
395 if (IS_ERR(rt))
396 return rt;
397
398 /* No need to clone since we're just using its address. */
399 rt2 = rt;
400
401 if (!fl4.saddr)
402 fl4.saddr = rt->rt_src;
403
404 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
405 flowi4_to_flowi(&fl4), NULL, 0);
406 if (!IS_ERR(rt)) {
407 if (rt != rt2)
408 return rt;
409 } else if (PTR_ERR(rt) == -EPERM) {
410 rt = NULL;
411 } else
412 return rt;
413
414 err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4), AF_INET);
415 if (err)
416 goto relookup_failed;
417
418 if (inet_addr_type(net, fl4.saddr) == RTN_LOCAL) {
419 rt2 = __ip_route_output_key(net, &fl4);
420 if (IS_ERR(rt2))
421 err = PTR_ERR(rt2);
422 } else {
423 struct flowi4 fl4_2 = {};
424 unsigned long orefdst;
425
426 fl4_2.daddr = fl4.saddr;
427 rt2 = ip_route_output_key(net, &fl4_2);
428 if (IS_ERR(rt2)) {
429 err = PTR_ERR(rt2);
430 goto relookup_failed;
431 }
432 /* Ugh! */
433 orefdst = skb_in->_skb_refdst; /* save old refdst */
434 err = ip_route_input(skb_in, fl4.daddr, fl4.saddr,
435 RT_TOS(tos), rt2->dst.dev);
436
437 dst_release(&rt2->dst);
438 rt2 = skb_rtable(skb_in);
439 skb_in->_skb_refdst = orefdst; /* restore old refdst */
440 }
441
442 if (err)
443 goto relookup_failed;
444
445 rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
446 flowi4_to_flowi(&fl4), NULL,
447 XFRM_LOOKUP_ICMP);
448 if (!IS_ERR(rt2)) {
449 dst_release(&rt->dst);
450 rt = rt2;
451 } else if (PTR_ERR(rt2) == -EPERM) {
452 if (rt)
453 dst_release(&rt->dst);
454 return rt2;
455 } else {
456 err = PTR_ERR(rt2);
457 goto relookup_failed;
458 }
459 return rt;
460
461relookup_failed:
462 if (rt)
463 return rt;
464 return ERR_PTR(err);
465}
405 466
406/* 467/*
407 * Send an ICMP message in response to a situation 468 * Send an ICMP message in response to a situation
@@ -507,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
507 rcu_read_lock(); 568 rcu_read_lock();
508 if (rt_is_input_route(rt) && 569 if (rt_is_input_route(rt) &&
509 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 570 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
510 dev = dev_get_by_index_rcu(net, rt->fl.iif); 571 dev = dev_get_by_index_rcu(net, rt->rt_iif);
511 572
512 if (dev) 573 if (dev)
513 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 574 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -539,86 +600,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
539 ipc.opt = &icmp_param.replyopts; 600 ipc.opt = &icmp_param.replyopts;
540 ipc.tx_flags = 0; 601 ipc.tx_flags = 0;
541 602
542 { 603 rt = icmp_route_lookup(net, skb_in, iph, saddr, tos,
543 struct flowi fl = { 604 type, code, &icmp_param);
544 .fl4_dst = icmp_param.replyopts.srr ? 605 if (IS_ERR(rt))
545 icmp_param.replyopts.faddr : iph->saddr, 606 goto out_unlock;
546 .fl4_src = saddr,
547 .fl4_tos = RT_TOS(tos),
548 .proto = IPPROTO_ICMP,
549 .fl_icmp_type = type,
550 .fl_icmp_code = code,
551 };
552 int err;
553 struct rtable *rt2;
554
555 security_skb_classify_flow(skb_in, &fl);
556 if (__ip_route_output_key(net, &rt, &fl))
557 goto out_unlock;
558
559 /* No need to clone since we're just using its address. */
560 rt2 = rt;
561
562 if (!fl.nl_u.ip4_u.saddr)
563 fl.nl_u.ip4_u.saddr = rt->rt_src;
564
565 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
566 switch (err) {
567 case 0:
568 if (rt != rt2)
569 goto route_done;
570 break;
571 case -EPERM:
572 rt = NULL;
573 break;
574 default:
575 goto out_unlock;
576 }
577
578 if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
579 goto relookup_failed;
580
581 if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
582 err = __ip_route_output_key(net, &rt2, &fl);
583 else {
584 struct flowi fl2 = {};
585 unsigned long orefdst;
586
587 fl2.fl4_dst = fl.fl4_src;
588 if (ip_route_output_key(net, &rt2, &fl2))
589 goto relookup_failed;
590
591 /* Ugh! */
592 orefdst = skb_in->_skb_refdst; /* save old refdst */
593 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
594 RT_TOS(tos), rt2->dst.dev);
595
596 dst_release(&rt2->dst);
597 rt2 = skb_rtable(skb_in);
598 skb_in->_skb_refdst = orefdst; /* restore old refdst */
599 }
600
601 if (err)
602 goto relookup_failed;
603
604 err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
605 XFRM_LOOKUP_ICMP);
606 switch (err) {
607 case 0:
608 dst_release(&rt->dst);
609 rt = rt2;
610 break;
611 case -EPERM:
612 goto ende;
613 default:
614relookup_failed:
615 if (!rt)
616 goto out_unlock;
617 break;
618 }
619 }
620 607
621route_done:
622 if (!icmpv4_xrlim_allow(net, rt, type, code)) 608 if (!icmpv4_xrlim_allow(net, rt, type, code))
623 goto ende; 609 goto ende;
624 610
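
A pattern worth calling out in the icmp.c conversion: routing lookups no longer return an int and fill a struct rtable ** out-parameter, they return the rtable directly with any error encoded in the pointer itself. The encoding is the kernel's usual ERR_PTR scheme from err.h, sketched here in plain C with a stub route type:

#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errnos occupy the topmost page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct rtable;	/* stand-in for the real routing cache entry */

static struct rtable *route_lookup(unsigned int daddr)
{
	static int dummy;

	if (!daddr)
		return ERR_PTR(-22);	/* -EINVAL */
	return (struct rtable *)&dummy;
}

int main(void)
{
	struct rtable *rt = route_lookup(0);

	if (IS_ERR(rt))
		printf("lookup failed: %ld\n", PTR_ERR(rt));	/* -22 */
	rt = route_lookup(0x7f000001);
	printf("success: %d\n", !IS_ERR(rt));			/* 1 */
	return 0;
}

This is why icmp_route_lookup() and its callers test IS_ERR(rt) where the old code tested a nonzero return value.
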
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e0e77e297de3..1fd3d9ce8398 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -321,14 +321,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
321 } 321 }
322 igmp_skb_size(skb) = size; 322 igmp_skb_size(skb) = size;
323 323
324 { 324 rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0,
325 struct flowi fl = { .oif = dev->ifindex, 325 0, 0,
326 .fl4_dst = IGMPV3_ALL_MCR, 326 IPPROTO_IGMP, 0, dev->ifindex);
327 .proto = IPPROTO_IGMP }; 327 if (IS_ERR(rt)) {
328 if (ip_route_output_key(net, &rt, &fl)) { 328 kfree_skb(skb);
329 kfree_skb(skb); 329 return NULL;
330 return NULL;
331 }
332 } 330 }
333 if (rt->rt_src == 0) { 331 if (rt->rt_src == 0) {
334 kfree_skb(skb); 332 kfree_skb(skb);
@@ -666,13 +664,12 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
666 else 664 else
667 dst = group; 665 dst = group;
668 666
669 { 667 rt = ip_route_output_ports(net, NULL, dst, 0,
670 struct flowi fl = { .oif = dev->ifindex, 668 0, 0,
671 .fl4_dst = dst, 669 IPPROTO_IGMP, 0, dev->ifindex);
672 .proto = IPPROTO_IGMP }; 670 if (IS_ERR(rt))
673 if (ip_route_output_key(net, &rt, &fl)) 671 return -1;
674 return -1; 672
675 }
676 if (rt->rt_src == 0) { 673 if (rt->rt_src == 0) {
677 ip_rt_put(rt); 674 ip_rt_put(rt);
678 return -1; 675 return -1;
@@ -1439,8 +1436,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1439/* RTNL is locked */ 1436/* RTNL is locked */
1440static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1437static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1441{ 1438{
1442 struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
1443 struct rtable *rt;
1444 struct net_device *dev = NULL; 1439 struct net_device *dev = NULL;
1445 struct in_device *idev = NULL; 1440 struct in_device *idev = NULL;
1446 1441
@@ -1454,9 +1449,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1454 return NULL; 1449 return NULL;
1455 } 1450 }
1456 1451
1457 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1452 if (!dev) {
1458 dev = rt->dst.dev; 1453 struct rtable *rt = ip_route_output(net,
1459 ip_rt_put(rt); 1454 imr->imr_multiaddr.s_addr,
1455 0, 0, 0);
1456 if (!IS_ERR(rt)) {
1457 dev = rt->dst.dev;
1458 ip_rt_put(rt);
1459 }
1460 } 1460 }
1461 if (dev) { 1461 if (dev) {
1462 imr->imr_ifindex = dev->ifindex; 1462 imr->imr_ifindex = dev->ifindex;
@@ -2329,13 +2329,13 @@ void ip_mc_drop_socket(struct sock *sk)
2329 rtnl_unlock(); 2329 rtnl_unlock();
2330} 2330}
2331 2331
2332int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) 2332/* called with rcu_read_lock() */
2333int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
2333{ 2334{
2334 struct ip_mc_list *im; 2335 struct ip_mc_list *im;
2335 struct ip_sf_list *psf; 2336 struct ip_sf_list *psf;
2336 int rv = 0; 2337 int rv = 0;
2337 2338
2338 rcu_read_lock();
2339 for_each_pmc_rcu(in_dev, im) { 2339 for_each_pmc_rcu(in_dev, im) {
2340 if (im->multiaddr == mc_addr) 2340 if (im->multiaddr == mc_addr)
2341 break; 2341 break;
@@ -2357,7 +2357,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2357 } else 2357 } else
2358 rv = 1; /* unspecified source; tentatively allow */ 2358 rv = 1; /* unspecified source; tentatively allow */
2359 } 2359 }
2360 rcu_read_unlock();
2361 return rv; 2360 return rv;
2362} 2361}
2363 2362
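
igmp.c illustrates why the flow conversion shrinks call sites: helpers in the ip_route_output_*() family bundle the flowi4 setup and the lookup into one call that returns an rtable or an error pointer. Roughly, as a hedged sketch with local stand-in types rather than the kernel prototypes:

#include <stdio.h>

struct flowi4_s { unsigned int daddr, saddr; int oif; unsigned char proto; };
struct rtable_s { int id; };

/* stand-in for __ip_route_output_key(): resolve a prepared flow */
static struct rtable_s *output_key(struct flowi4_s *fl4)
{
	static struct rtable_s rt = { 42 };
	(void)fl4;
	return &rt;
}

/* shape of a one-call helper: build the flow, then look it up */
static struct rtable_s *output_ports(unsigned int dst, unsigned int src,
				     unsigned char proto, int oif)
{
	struct flowi4_s fl4 = {
		.daddr = dst,
		.saddr = src,
		.proto = proto,
		.oif   = oif,
	};
	return output_key(&fl4);	/* real helpers return rtable or ERR_PTR */
}

int main(void)
{
	struct rtable_s *rt = output_ports(0xe0000016 /* 224.0.0.22 */,
					   0, 2 /* IPPROTO_IGMP */, 1);
	printf("route %d\n", rt->id);
	return 0;
}
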
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 97e5fb765265..6c0b7f4a3d7d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -356,20 +356,23 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
356 struct rtable *rt; 356 struct rtable *rt;
357 const struct inet_request_sock *ireq = inet_rsk(req); 357 const struct inet_request_sock *ireq = inet_rsk(req);
358 struct ip_options *opt = inet_rsk(req)->opt; 358 struct ip_options *opt = inet_rsk(req)->opt;
359 struct flowi fl = { .oif = sk->sk_bound_dev_if, 359 struct flowi4 fl4 = {
360 .mark = sk->sk_mark, 360 .flowi4_oif = sk->sk_bound_dev_if,
361 .fl4_dst = ((opt && opt->srr) ? 361 .flowi4_mark = sk->sk_mark,
362 opt->faddr : ireq->rmt_addr), 362 .daddr = ((opt && opt->srr) ?
363 .fl4_src = ireq->loc_addr, 363 opt->faddr : ireq->rmt_addr),
364 .fl4_tos = RT_CONN_FLAGS(sk), 364 .saddr = ireq->loc_addr,
365 .proto = sk->sk_protocol, 365 .flowi4_tos = RT_CONN_FLAGS(sk),
366 .flags = inet_sk_flowi_flags(sk), 366 .flowi4_proto = sk->sk_protocol,
367 .fl_ip_sport = inet_sk(sk)->inet_sport, 367 .flowi4_flags = inet_sk_flowi_flags(sk),
368 .fl_ip_dport = ireq->rmt_port }; 368 .fl4_sport = inet_sk(sk)->inet_sport,
369 .fl4_dport = ireq->rmt_port,
370 };
369 struct net *net = sock_net(sk); 371 struct net *net = sock_net(sk);
370 372
371 security_req_classify_flow(req, &fl); 373 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
372 if (ip_route_output_flow(net, &rt, &fl, sk, 0)) 374 rt = ip_route_output_flow(net, &fl4, sk);
375 if (IS_ERR(rt))
373 goto no_route; 376 goto no_route;
374 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 377 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
375 goto route_err; 378 goto route_err;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2746c1fa6417..2ada17129fce 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -858,7 +858,7 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
858 nlmsg_len(nlh) < hdrlen) 858 nlmsg_len(nlh) < hdrlen)
859 return -EINVAL; 859 return -EINVAL;
860 860
861 if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) { 861 if (nlh->nlmsg_flags & NLM_F_DUMP) {
862 if (nlmsg_attrlen(nlh, hdrlen)) { 862 if (nlmsg_attrlen(nlh, hdrlen)) {
863 struct nlattr *attr; 863 struct nlattr *attr;
864 864
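
The one-line inet_diag change is subtler than it looks: NLM_F_DUMP is a composite of two netlink flag bits, so the old equality test required both bits to be set, while the new bitwise test fires when either is present, tolerating callers that set only one. Demonstrably (flag values from the netlink uapi):

#include <stdio.h>

#define NLM_F_ROOT	0x100
#define NLM_F_MATCH	0x200
#define NLM_F_DUMP	(NLM_F_ROOT | NLM_F_MATCH)

int main(void)
{
	unsigned int flags = NLM_F_ROOT;	/* e.g. a tool setting one bit */

	printf("old test: %d\n", (flags & NLM_F_DUMP) == NLM_F_DUMP);	/* 0 */
	printf("new test: %d\n", (flags & NLM_F_DUMP) != 0);		/* 1 */
	return 0;
}
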
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c5af909cf701..3c8dfa16614d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -505,7 +505,9 @@ restart:
505 } 505 }
506 506
507 rcu_read_unlock(); 507 rcu_read_unlock();
508 local_bh_disable();
508 inet_twsk_deschedule(tw, twdr); 509 inet_twsk_deschedule(tw, twdr);
510 local_bh_enable();
509 inet_twsk_put(tw); 511 inet_twsk_put(tw);
510 goto restart_rcu; 512 goto restart_rcu;
511 } 513 }
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d9bc85751c74..dd1b20eca1a2 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = {
81 81
82struct inet_peer_base { 82struct inet_peer_base {
83 struct inet_peer __rcu *root; 83 struct inet_peer __rcu *root;
84 spinlock_t lock; 84 seqlock_t lock;
85 int total; 85 int total;
86}; 86};
87 87
88static struct inet_peer_base v4_peers = { 88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu, 89 .root = peer_avl_empty_rcu,
90 .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), 90 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
91 .total = 0, 91 .total = 0,
92}; 92};
93 93
94static struct inet_peer_base v6_peers = { 94static struct inet_peer_base v6_peers = {
95 .root = peer_avl_empty_rcu, 95 .root = peer_avl_empty_rcu,
96 .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), 96 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
97 .total = 0, 97 .total = 0,
98}; 98};
99 99
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
167 int i, n = (a->family == AF_INET ? 1 : 4); 167 int i, n = (a->family == AF_INET ? 1 : 4);
168 168
169 for (i = 0; i < n; i++) { 169 for (i = 0; i < n; i++) {
170 if (a->a6[i] == b->a6[i]) 170 if (a->addr.a6[i] == b->addr.a6[i])
171 continue; 171 continue;
172 if (a->a6[i] < b->a6[i]) 172 if (a->addr.a6[i] < b->addr.a6[i])
173 return -1; 173 return -1;
174 return 1; 174 return 1;
175 } 175 }
@@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a,
177 return 0; 177 return 0;
178} 178}
179 179
180#define rcu_deref_locked(X, BASE) \
181 rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
182
180/* 183/*
181 * Called with local BH disabled and the pool lock held. 184 * Called with local BH disabled and the pool lock held.
182 */ 185 */
@@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a,
187 \ 190 \
188 stackptr = _stack; \ 191 stackptr = _stack; \
189 *stackptr++ = &_base->root; \ 192 *stackptr++ = &_base->root; \
190 for (u = rcu_dereference_protected(_base->root, \ 193 for (u = rcu_deref_locked(_base->root, _base); \
191 lockdep_is_held(&_base->lock)); \
192 u != peer_avl_empty; ) { \ 194 u != peer_avl_empty; ) { \
193 int cmp = addr_compare(_daddr, &u->daddr); \ 195 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \ 196 if (cmp == 0) \
@@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a,
198 else \ 200 else \
199 v = &u->avl_right; \ 201 v = &u->avl_right; \
200 *stackptr++ = v; \ 202 *stackptr++ = v; \
201 u = rcu_dereference_protected(*v, \ 203 u = rcu_deref_locked(*v, _base); \
202 lockdep_is_held(&_base->lock)); \
203 } \ 204 } \
204 u; \ 205 u; \
205}) 206})
206 207
207/* 208/*
208 * Called with rcu_read_lock_bh() 209 * Called with rcu_read_lock()
 209 * Because we hold no lock against a writer, it's quite possible we fall 210 * in an endless loop.
210 * in an endless loop. 211 * in an endless loop.
211 * But every pointer we follow is guaranteed to be valid thanks to RCU. 212 * But every pointer we follow is guaranteed to be valid thanks to RCU.
212 * We exit from this function if number of links exceeds PEER_MAXDEPTH 213 * We exit from this function if number of links exceeds PEER_MAXDEPTH
213 */ 214 */
214static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, 215static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
215 struct inet_peer_base *base) 216 struct inet_peer_base *base)
216{ 217{
217 struct inet_peer *u = rcu_dereference_bh(base->root); 218 struct inet_peer *u = rcu_dereference(base->root);
218 int count = 0; 219 int count = 0;
219 220
220 while (u != peer_avl_empty) { 221 while (u != peer_avl_empty) {
@@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
230 return u; 231 return u;
231 } 232 }
232 if (cmp == -1) 233 if (cmp == -1)
233 u = rcu_dereference_bh(u->avl_left); 234 u = rcu_dereference(u->avl_left);
234 else 235 else
235 u = rcu_dereference_bh(u->avl_right); 236 u = rcu_dereference(u->avl_right);
236 if (unlikely(++count == PEER_MAXDEPTH)) 237 if (unlikely(++count == PEER_MAXDEPTH))
237 break; 238 break;
238 } 239 }
@@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
246 struct inet_peer __rcu **v; \ 247 struct inet_peer __rcu **v; \
247 *stackptr++ = &start->avl_left; \ 248 *stackptr++ = &start->avl_left; \
248 v = &start->avl_left; \ 249 v = &start->avl_left; \
249 for (u = rcu_dereference_protected(*v, \ 250 for (u = rcu_deref_locked(*v, base); \
250 lockdep_is_held(&base->lock)); \
251 u->avl_right != peer_avl_empty_rcu; ) { \ 251 u->avl_right != peer_avl_empty_rcu; ) { \
252 v = &u->avl_right; \ 252 v = &u->avl_right; \
253 *stackptr++ = v; \ 253 *stackptr++ = v; \
254 u = rcu_dereference_protected(*v, \ 254 u = rcu_deref_locked(*v, base); \
255 lockdep_is_held(&base->lock)); \
256 } \ 255 } \
257 u; \ 256 u; \
258}) 257})
@@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
271 270
272 while (stackend > stack) { 271 while (stackend > stack) {
273 nodep = *--stackend; 272 nodep = *--stackend;
274 node = rcu_dereference_protected(*nodep, 273 node = rcu_deref_locked(*nodep, base);
275 lockdep_is_held(&base->lock)); 274 l = rcu_deref_locked(node->avl_left, base);
276 l = rcu_dereference_protected(node->avl_left, 275 r = rcu_deref_locked(node->avl_right, base);
277 lockdep_is_held(&base->lock));
278 r = rcu_dereference_protected(node->avl_right,
279 lockdep_is_held(&base->lock));
280 lh = node_height(l); 276 lh = node_height(l);
281 rh = node_height(r); 277 rh = node_height(r);
282 if (lh > rh + 1) { /* l: RH+2 */ 278 if (lh > rh + 1) { /* l: RH+2 */
283 struct inet_peer *ll, *lr, *lrl, *lrr; 279 struct inet_peer *ll, *lr, *lrl, *lrr;
284 int lrh; 280 int lrh;
285 ll = rcu_dereference_protected(l->avl_left, 281 ll = rcu_deref_locked(l->avl_left, base);
286 lockdep_is_held(&base->lock)); 282 lr = rcu_deref_locked(l->avl_right, base);
287 lr = rcu_dereference_protected(l->avl_right,
288 lockdep_is_held(&base->lock));
289 lrh = node_height(lr); 283 lrh = node_height(lr);
290 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 284 if (lrh <= node_height(ll)) { /* ll: RH+1 */
291 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ 285 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
296 l->avl_height = node->avl_height + 1; 290 l->avl_height = node->avl_height + 1;
297 RCU_INIT_POINTER(*nodep, l); 291 RCU_INIT_POINTER(*nodep, l);
298 } else { /* ll: RH, lr: RH+1 */ 292 } else { /* ll: RH, lr: RH+1 */
299 lrl = rcu_dereference_protected(lr->avl_left, 293 lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
300 lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ 294 lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
301 lrr = rcu_dereference_protected(lr->avl_right,
302 lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
303 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ 295 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
304 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ 296 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
305 node->avl_height = rh + 1; /* node: RH+1 */ 297 node->avl_height = rh + 1; /* node: RH+1 */
@@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
314 } else if (rh > lh + 1) { /* r: LH+2 */ 306 } else if (rh > lh + 1) { /* r: LH+2 */
315 struct inet_peer *rr, *rl, *rlr, *rll; 307 struct inet_peer *rr, *rl, *rlr, *rll;
316 int rlh; 308 int rlh;
317 rr = rcu_dereference_protected(r->avl_right, 309 rr = rcu_deref_locked(r->avl_right, base);
318 lockdep_is_held(&base->lock)); 310 rl = rcu_deref_locked(r->avl_left, base);
319 rl = rcu_dereference_protected(r->avl_left,
320 lockdep_is_held(&base->lock));
321 rlh = node_height(rl); 311 rlh = node_height(rl);
322 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 312 if (rlh <= node_height(rr)) { /* rr: LH+1 */
323 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ 313 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
328 r->avl_height = node->avl_height + 1; 318 r->avl_height = node->avl_height + 1;
329 RCU_INIT_POINTER(*nodep, r); 319 RCU_INIT_POINTER(*nodep, r);
330 } else { /* rr: RH, rl: RH+1 */ 320 } else { /* rr: RH, rl: RH+1 */
331 rlr = rcu_dereference_protected(rl->avl_right, 321 rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
332 lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ 322 rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
333 rll = rcu_dereference_protected(rl->avl_left,
334 lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
335 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ 323 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
336 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ 324 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
337 node->avl_height = lh + 1; /* node: LH+1 */ 325 node->avl_height = lh + 1; /* node: LH+1 */
@@ -372,7 +360,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
372 360
373 do_free = 0; 361 do_free = 0;
374 362
375 spin_lock_bh(&base->lock); 363 write_seqlock_bh(&base->lock);
376 /* Check the reference counter. It was artificially incremented by 1 364 /* Check the reference counter. It was artificially incremented by 1
377 * in cleanup() function to prevent sudden disappearing. If we can 365 * in cleanup() function to prevent sudden disappearing. If we can
378 * atomically (because of lockless readers) take this last reference, 366 * atomically (because of lockless readers) take this last reference,
@@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
392 /* look for a node to insert instead of p */ 380 /* look for a node to insert instead of p */
393 struct inet_peer *t; 381 struct inet_peer *t;
394 t = lookup_rightempty(p, base); 382 t = lookup_rightempty(p, base);
395 BUG_ON(rcu_dereference_protected(*stackptr[-1], 383 BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
396 lockdep_is_held(&base->lock)) != t);
397 **--stackptr = t->avl_left; 384 **--stackptr = t->avl_left;
398 /* t is removed, t->daddr > x->daddr for any 385 /* t is removed, t->daddr > x->daddr for any
399 * x in p->avl_left subtree. 386 * x in p->avl_left subtree.
@@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
409 base->total--; 396 base->total--;
410 do_free = 1; 397 do_free = 1;
411 } 398 }
412 spin_unlock_bh(&base->lock); 399 write_sequnlock_bh(&base->lock);
413 400
414 if (do_free) 401 if (do_free)
415 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 402 call_rcu(&p->rcu, inetpeer_free_rcu);
416 else 403 else
417 /* The node is used again. Decrease the reference counter 404 /* The node is used again. Decrease the reference counter
418 * back. The loop "cleanup -> unlink_from_unused 405 * back. The loop "cleanup -> unlink_from_unused
@@ -475,15 +462,19 @@ static int cleanup_once(unsigned long ttl)
475struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) 462struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
476{ 463{
477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 464 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(AF_INET); 465 struct inet_peer_base *base = family_to_base(daddr->family);
479 struct inet_peer *p; 466 struct inet_peer *p;
467 unsigned int sequence;
468 int invalidated;
480 469
481 /* Look up for the address quickly, lockless. 470 /* Look up for the address quickly, lockless.
482 * Because of a concurrent writer, we might not find an existing entry. 471 * Because of a concurrent writer, we might not find an existing entry.
483 */ 472 */
484 rcu_read_lock_bh(); 473 rcu_read_lock();
485 p = lookup_rcu_bh(daddr, base); 474 sequence = read_seqbegin(&base->lock);
486 rcu_read_unlock_bh(); 475 p = lookup_rcu(daddr, base);
476 invalidated = read_seqretry(&base->lock, sequence);
477 rcu_read_unlock();
487 478
488 if (p) { 479 if (p) {
489 /* The existing node has been found. 480 /* The existing node has been found.
@@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
493 return p; 484 return p;
494 } 485 }
495 486
487 /* If no writer did a change during our lookup, we can return early. */
488 if (!create && !invalidated)
489 return NULL;
490
496 /* retry an exact lookup, taking the lock before. 491 /* retry an exact lookup, taking the lock before.
497 * At least, nodes should be hot in our cache. 492 * At least, nodes should be hot in our cache.
498 */ 493 */
499 spin_lock_bh(&base->lock); 494 write_seqlock_bh(&base->lock);
500 p = lookup(daddr, stack, base); 495 p = lookup(daddr, stack, base);
501 if (p != peer_avl_empty) { 496 if (p != peer_avl_empty) {
502 atomic_inc(&p->refcnt); 497 atomic_inc(&p->refcnt);
503 spin_unlock_bh(&base->lock); 498 write_sequnlock_bh(&base->lock);
504 /* Remove the entry from unused list if it was there. */ 499 /* Remove the entry from unused list if it was there. */
505 unlink_from_unused(p); 500 unlink_from_unused(p);
506 return p; 501 return p;
@@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
510 p->daddr = *daddr; 505 p->daddr = *daddr;
511 atomic_set(&p->refcnt, 1); 506 atomic_set(&p->refcnt, 1);
512 atomic_set(&p->rid, 0); 507 atomic_set(&p->rid, 0);
513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); 508 atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
514 p->tcp_ts_stamp = 0; 509 p->tcp_ts_stamp = 0;
510 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
511 p->rate_tokens = 0;
512 p->rate_last = 0;
513 p->pmtu_expires = 0;
514 p->pmtu_orig = 0;
515 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
515 INIT_LIST_HEAD(&p->unused); 516 INIT_LIST_HEAD(&p->unused);
516 517
517 518
@@ -519,7 +520,7 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
519 link_to_pool(p, base); 520 link_to_pool(p, base);
520 base->total++; 521 base->total++;
521 } 522 }
522 spin_unlock_bh(&base->lock); 523 write_sequnlock_bh(&base->lock);
523 524
524 if (base->total >= inet_peer_threshold) 525 if (base->total >= inet_peer_threshold)
525 /* Remove one less-recently-used entry. */ 526 /* Remove one less-recently-used entry. */
@@ -579,3 +580,44 @@ void inet_putpeer(struct inet_peer *p)
579 local_bh_enable(); 580 local_bh_enable();
580} 581}
581EXPORT_SYMBOL_GPL(inet_putpeer); 582EXPORT_SYMBOL_GPL(inet_putpeer);
583
584/*
585 * Check transmit rate limitation for given message.
586 * The rate information is held in the inet_peer entries now.
587 * This function is generic and could be used for other purposes
588 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
589 *
590 * Note that the same inet_peer fields are modified by functions in
591 * route.c too, but these work for packet destinations while xrlim_allow
592 * works for icmp destinations. This means the rate limiting information
593 * for one "ip object" is shared - and these ICMPs are twice limited:
594 * by source and by destination.
595 *
596 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
597 * SHOULD allow setting of rate limits
598 *
599 * Shared between ICMPv4 and ICMPv6.
600 */
601#define XRLIM_BURST_FACTOR 6
602bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
603{
604 unsigned long now, token;
605 bool rc = false;
606
607 if (!peer)
608 return true;
609
610 token = peer->rate_tokens;
611 now = jiffies;
612 token += now - peer->rate_last;
613 peer->rate_last = now;
614 if (token > XRLIM_BURST_FACTOR * timeout)
615 token = XRLIM_BURST_FACTOR * timeout;
616 if (token >= timeout) {
617 token -= timeout;
618 rc = true;
619 }
620 peer->rate_tokens = token;
621 return rc;
622}
623EXPORT_SYMBOL(inet_peer_xrlim_allow);
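
inet_peer_xrlim_allow() is a plain token-bucket filter: one token accrues per jiffy since rate_last, the bucket is capped at XRLIM_BURST_FACTOR * timeout, and each permitted message costs timeout tokens, i.e. roughly one message per timeout jiffies with a burst of up to six after an idle spell. A hedged sketch of a caller; my_rate_ok() and the sysctl field name are assumptions for illustration, not the exact icmp.c code:

	/* Illustrative only: gate an ICMP-style reply on the peer's bucket. */
	static bool my_rate_ok(struct net *net, struct inet_peer *peer)
	{
		int timeout = net->ipv4.sysctl_icmp_ratelimit;	/* jiffies per message */

		if (!timeout)			/* limiting disabled by sysctl */
			return true;
		return inet_peer_xrlim_allow(peer, timeout);	/* NULL peer => allow */
	}
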
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index eb68a0e34e49..da5941f18c3c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -769,18 +769,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
769 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 769 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
770 } 770 }
771 771
772 { 772 rt = ip_route_output_gre(dev_net(dev), dst, tiph->saddr,
773 struct flowi fl = { 773 tunnel->parms.o_key, RT_TOS(tos),
774 .oif = tunnel->parms.link, 774 tunnel->parms.link);
775 .fl4_dst = dst, 775 if (IS_ERR(rt)) {
776 .fl4_src = tiph->saddr, 776 dev->stats.tx_carrier_errors++;
777 .fl4_tos = RT_TOS(tos), 777 goto tx_error;
778 .fl_gre_key = tunnel->parms.o_key
779 };
780 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
781 dev->stats.tx_carrier_errors++;
782 goto tx_error;
783 }
784 } 778 }
785 tdev = rt->dst.dev; 779 tdev = rt->dst.dev;
786 780
@@ -944,17 +938,13 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
944 /* Guess output device to choose reasonable mtu and needed_headroom */ 938 /* Guess output device to choose reasonable mtu and needed_headroom */
945 939
946 if (iph->daddr) { 940 if (iph->daddr) {
947 struct flowi fl = { 941 struct rtable *rt = ip_route_output_gre(dev_net(dev),
948 .oif = tunnel->parms.link, 942 iph->daddr, iph->saddr,
949 .fl4_dst = iph->daddr, 943 tunnel->parms.o_key,
950 .fl4_src = iph->saddr, 944 RT_TOS(iph->tos),
951 .fl4_tos = RT_TOS(iph->tos), 945 tunnel->parms.link);
952 .proto = IPPROTO_GRE, 946
953 .fl_gre_key = tunnel->parms.o_key 947 if (!IS_ERR(rt)) {
954 };
955 struct rtable *rt;
956
957 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
958 tdev = rt->dst.dev; 948 tdev = rt->dst.dev;
959 ip_rt_put(rt); 949 ip_rt_put(rt);
960 } 950 }
@@ -1206,17 +1196,14 @@ static int ipgre_open(struct net_device *dev)
1206 struct ip_tunnel *t = netdev_priv(dev); 1196 struct ip_tunnel *t = netdev_priv(dev);
1207 1197
1208 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1198 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1209 struct flowi fl = { 1199 struct rtable *rt = ip_route_output_gre(dev_net(dev),
1210 .oif = t->parms.link, 1200 t->parms.iph.daddr,
1211 .fl4_dst = t->parms.iph.daddr, 1201 t->parms.iph.saddr,
1212 .fl4_src = t->parms.iph.saddr, 1202 t->parms.o_key,
1213 .fl4_tos = RT_TOS(t->parms.iph.tos), 1203 RT_TOS(t->parms.iph.tos),
1214 .proto = IPPROTO_GRE, 1204 t->parms.link);
1215 .fl_gre_key = t->parms.o_key 1205
1216 }; 1206 if (IS_ERR(rt))
1217 struct rtable *rt;
1218
1219 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1220 return -EADDRNOTAVAIL; 1207 return -EADDRNOTAVAIL;
1221 dev = rt->dst.dev; 1208 dev = rt->dst.dev;
1222 ip_rt_put(rt); 1209 ip_rt_put(rt);
@@ -1764,4 +1751,4 @@ module_exit(ipgre_fini);
1764MODULE_LICENSE("GPL"); 1751MODULE_LICENSE("GPL");
1765MODULE_ALIAS_RTNL_LINK("gre"); 1752MODULE_ALIAS_RTNL_LINK("gre");
1766MODULE_ALIAS_RTNL_LINK("gretap"); 1753MODULE_ALIAS_RTNL_LINK("gretap");
1767MODULE_ALIAS("gre0"); 1754MODULE_ALIAS_NETDEV("gre0");
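
These ip_gre.c hunks follow the tree-wide route-output API change: instead of filling a caller-supplied struct rtable ** and returning an int, the lookup now returns the rtable itself, with failure encoded via ERR_PTR(). The calling convention, sketched from the ip_route_output_gre() signature implied by the conversions above:

	struct rtable *rt;

	rt = ip_route_output_gre(net, daddr, saddr, gre_key, tos, oif);
	if (IS_ERR(rt))
		return PTR_ERR(rt);	/* the error code lives in the pointer */
	/* ... use rt->dst ... */
	ip_rt_put(rt);			/* drop the route reference when done */
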
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..d7b2b0987a3b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 } 340 }
341 } 341 }
342 342
343#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_IP_ROUTE_CLASSID
344 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
346 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1906fa35860c..28a736f3442f 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -140,11 +140,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
140 } else { 140 } else {
141 dopt->ts_needtime = 0; 141 dopt->ts_needtime = 0;
142 142
143 if (soffset + 8 <= optlen) { 143 if (soffset + 7 <= optlen) {
144 __be32 addr; 144 __be32 addr;
145 145
146 memcpy(&addr, sptr+soffset-1, 4); 146 memcpy(&addr, dptr+soffset-1, 4);
147 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { 147 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
148 dopt->ts_needtime = 1; 148 dopt->ts_needtime = 1;
149 soffset += 8; 149 soffset += 8;
150 } 150 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04c7b3ba6b39..67f241b97649 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -339,25 +339,19 @@ int ip_queue_xmit(struct sk_buff *skb)
339 if(opt && opt->srr) 339 if(opt && opt->srr)
340 daddr = opt->faddr; 340 daddr = opt->faddr;
341 341
 342 { 342 /* If this fails, the retransmit mechanism of the transport layer will
 343 struct flowi fl = { .oif = sk->sk_bound_dev_if, 343 * keep trying until the route appears or the connection times
344 .mark = sk->sk_mark, 344 * itself out.
345 .fl4_dst = daddr, 345 */
346 .fl4_src = inet->inet_saddr, 346 rt = ip_route_output_ports(sock_net(sk), sk,
347 .fl4_tos = RT_CONN_FLAGS(sk), 347 daddr, inet->inet_saddr,
348 .proto = sk->sk_protocol, 348 inet->inet_dport,
349 .flags = inet_sk_flowi_flags(sk), 349 inet->inet_sport,
350 .fl_ip_sport = inet->inet_sport, 350 sk->sk_protocol,
351 .fl_ip_dport = inet->inet_dport }; 351 RT_CONN_FLAGS(sk),
352 352 sk->sk_bound_dev_if);
353 /* If this fails, retransmit mechanism of transport layer will 353 if (IS_ERR(rt))
354 * keep trying until route appears or the connection times 354 goto no_route;
355 * itself out.
356 */
357 security_sk_classify_flow(sk, &fl);
358 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
359 goto no_route;
360 }
361 sk_setup_caps(sk, &rt->dst); 355 sk_setup_caps(sk, &rt->dst);
362 } 356 }
363 skb_dst_set_noref(skb, &rt->dst); 357 skb_dst_set_noref(skb, &rt->dst);
@@ -733,6 +727,7 @@ csum_page(struct page *page, int offset, int copy)
733} 727}
734 728
735static inline int ip_ufo_append_data(struct sock *sk, 729static inline int ip_ufo_append_data(struct sock *sk,
730 struct sk_buff_head *queue,
736 int getfrag(void *from, char *to, int offset, int len, 731 int getfrag(void *from, char *to, int offset, int len,
737 int odd, struct sk_buff *skb), 732 int odd, struct sk_buff *skb),
738 void *from, int length, int hh_len, int fragheaderlen, 733 void *from, int length, int hh_len, int fragheaderlen,
@@ -745,7 +740,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
745 * device, so create one single skb packet containing complete 740 * device, so create one single skb packet containing complete
746 * udp datagram 741 * udp datagram
747 */ 742 */
748 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 743 if ((skb = skb_peek_tail(queue)) == NULL) {
749 skb = sock_alloc_send_skb(sk, 744 skb = sock_alloc_send_skb(sk,
750 hh_len + fragheaderlen + transhdrlen + 20, 745 hh_len + fragheaderlen + transhdrlen + 20,
751 (flags & MSG_DONTWAIT), &err); 746 (flags & MSG_DONTWAIT), &err);
@@ -767,40 +762,28 @@ static inline int ip_ufo_append_data(struct sock *sk,
767 762
768 skb->ip_summed = CHECKSUM_PARTIAL; 763 skb->ip_summed = CHECKSUM_PARTIAL;
769 skb->csum = 0; 764 skb->csum = 0;
770 sk->sk_sndmsg_off = 0;
771 765
772 /* specify the length of each IP datagram fragment */ 766 /* specify the length of each IP datagram fragment */
773 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 767 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
774 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 768 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
775 __skb_queue_tail(&sk->sk_write_queue, skb); 769 __skb_queue_tail(queue, skb);
776 } 770 }
777 771
778 return skb_append_datato_frags(sk, skb, getfrag, from, 772 return skb_append_datato_frags(sk, skb, getfrag, from,
779 (length - transhdrlen)); 773 (length - transhdrlen));
780} 774}
781 775
782/* 776static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
783 * ip_append_data() and ip_append_page() can make one large IP datagram 777 struct inet_cork *cork,
784 * from many pieces of data. Each pieces will be holded on the socket 778 int getfrag(void *from, char *to, int offset,
785 * until ip_push_pending_frames() is called. Each piece can be a page 779 int len, int odd, struct sk_buff *skb),
786 * or non-page data. 780 void *from, int length, int transhdrlen,
787 * 781 unsigned int flags)
 788 * Not only UDP; other transport protocols - e.g. raw sockets - can
 789 * potentially use this interface.
790 *
791 * LATER: length must be adjusted by pad at tail, when it is required.
792 */
793int ip_append_data(struct sock *sk,
794 int getfrag(void *from, char *to, int offset, int len,
795 int odd, struct sk_buff *skb),
796 void *from, int length, int transhdrlen,
797 struct ipcm_cookie *ipc, struct rtable **rtp,
798 unsigned int flags)
799{ 782{
800 struct inet_sock *inet = inet_sk(sk); 783 struct inet_sock *inet = inet_sk(sk);
801 struct sk_buff *skb; 784 struct sk_buff *skb;
802 785
803 struct ip_options *opt = NULL; 786 struct ip_options *opt = cork->opt;
804 int hh_len; 787 int hh_len;
805 int exthdrlen; 788 int exthdrlen;
806 int mtu; 789 int mtu;
@@ -809,58 +792,19 @@ int ip_append_data(struct sock *sk,
809 int offset = 0; 792 int offset = 0;
810 unsigned int maxfraglen, fragheaderlen; 793 unsigned int maxfraglen, fragheaderlen;
811 int csummode = CHECKSUM_NONE; 794 int csummode = CHECKSUM_NONE;
812 struct rtable *rt; 795 struct rtable *rt = (struct rtable *)cork->dst;
813 796
814 if (flags&MSG_PROBE) 797 exthdrlen = transhdrlen ? rt->dst.header_len : 0;
815 return 0; 798 length += exthdrlen;
816 799 transhdrlen += exthdrlen;
817 if (skb_queue_empty(&sk->sk_write_queue)) { 800 mtu = cork->fragsize;
818 /*
819 * setup for corking.
820 */
821 opt = ipc->opt;
822 if (opt) {
823 if (inet->cork.opt == NULL) {
824 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
825 if (unlikely(inet->cork.opt == NULL))
826 return -ENOBUFS;
827 }
828 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
829 inet->cork.flags |= IPCORK_OPT;
830 inet->cork.addr = ipc->addr;
831 }
832 rt = *rtp;
833 if (unlikely(!rt))
834 return -EFAULT;
835 /*
836 * We steal reference to this route, caller should not release it
837 */
838 *rtp = NULL;
839 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
840 rt->dst.dev->mtu :
841 dst_mtu(rt->dst.path);
842 inet->cork.dst = &rt->dst;
843 inet->cork.length = 0;
844 sk->sk_sndmsg_page = NULL;
845 sk->sk_sndmsg_off = 0;
846 exthdrlen = rt->dst.header_len;
847 length += exthdrlen;
848 transhdrlen += exthdrlen;
849 } else {
850 rt = (struct rtable *)inet->cork.dst;
851 if (inet->cork.flags & IPCORK_OPT)
852 opt = inet->cork.opt;
853 801
854 transhdrlen = 0;
855 exthdrlen = 0;
856 mtu = inet->cork.fragsize;
857 }
858 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 802 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
859 803
860 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 804 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
861 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 805 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
862 806
863 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 807 if (cork->length + length > 0xFFFF - fragheaderlen) {
864 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 808 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
865 mtu-exthdrlen); 809 mtu-exthdrlen);
866 return -EMSGSIZE; 810 return -EMSGSIZE;
@@ -876,15 +820,15 @@ int ip_append_data(struct sock *sk,
876 !exthdrlen) 820 !exthdrlen)
877 csummode = CHECKSUM_PARTIAL; 821 csummode = CHECKSUM_PARTIAL;
878 822
879 skb = skb_peek_tail(&sk->sk_write_queue); 823 skb = skb_peek_tail(queue);
880 824
881 inet->cork.length += length; 825 cork->length += length;
882 if (((length > mtu) || (skb && skb_is_gso(skb))) && 826 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
883 (sk->sk_protocol == IPPROTO_UDP) && 827 (sk->sk_protocol == IPPROTO_UDP) &&
884 (rt->dst.dev->features & NETIF_F_UFO)) { 828 (rt->dst.dev->features & NETIF_F_UFO)) {
885 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 829 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
886 fragheaderlen, transhdrlen, mtu, 830 hh_len, fragheaderlen, transhdrlen,
887 flags); 831 mtu, flags);
888 if (err) 832 if (err)
889 goto error; 833 goto error;
890 return 0; 834 return 0;
@@ -961,7 +905,7 @@ alloc_new_skb:
961 else 905 else
962 /* only the initial fragment is 906 /* only the initial fragment is
963 time stamped */ 907 time stamped */
964 ipc->tx_flags = 0; 908 cork->tx_flags = 0;
965 } 909 }
966 if (skb == NULL) 910 if (skb == NULL)
967 goto error; 911 goto error;
@@ -972,7 +916,7 @@ alloc_new_skb:
972 skb->ip_summed = csummode; 916 skb->ip_summed = csummode;
973 skb->csum = 0; 917 skb->csum = 0;
974 skb_reserve(skb, hh_len); 918 skb_reserve(skb, hh_len);
975 skb_shinfo(skb)->tx_flags = ipc->tx_flags; 919 skb_shinfo(skb)->tx_flags = cork->tx_flags;
976 920
977 /* 921 /*
978 * Find where to start putting bytes. 922 * Find where to start putting bytes.
@@ -1009,7 +953,7 @@ alloc_new_skb:
1009 /* 953 /*
1010 * Put the packet on the pending queue. 954 * Put the packet on the pending queue.
1011 */ 955 */
1012 __skb_queue_tail(&sk->sk_write_queue, skb); 956 __skb_queue_tail(queue, skb);
1013 continue; 957 continue;
1014 } 958 }
1015 959
@@ -1029,8 +973,8 @@ alloc_new_skb:
1029 } else { 973 } else {
1030 int i = skb_shinfo(skb)->nr_frags; 974 int i = skb_shinfo(skb)->nr_frags;
1031 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 975 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1032 struct page *page = sk->sk_sndmsg_page; 976 struct page *page = cork->page;
1033 int off = sk->sk_sndmsg_off; 977 int off = cork->off;
1034 unsigned int left; 978 unsigned int left;
1035 979
1036 if (page && (left = PAGE_SIZE - off) > 0) { 980 if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1042,7 +986,7 @@ alloc_new_skb:
1042 goto error; 986 goto error;
1043 } 987 }
1044 get_page(page); 988 get_page(page);
1045 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 989 skb_fill_page_desc(skb, i, page, off, 0);
1046 frag = &skb_shinfo(skb)->frags[i]; 990 frag = &skb_shinfo(skb)->frags[i];
1047 } 991 }
1048 } else if (i < MAX_SKB_FRAGS) { 992 } else if (i < MAX_SKB_FRAGS) {
@@ -1053,8 +997,8 @@ alloc_new_skb:
1053 err = -ENOMEM; 997 err = -ENOMEM;
1054 goto error; 998 goto error;
1055 } 999 }
1056 sk->sk_sndmsg_page = page; 1000 cork->page = page;
1057 sk->sk_sndmsg_off = 0; 1001 cork->off = 0;
1058 1002
1059 skb_fill_page_desc(skb, i, page, 0, 0); 1003 skb_fill_page_desc(skb, i, page, 0, 0);
1060 frag = &skb_shinfo(skb)->frags[i]; 1004 frag = &skb_shinfo(skb)->frags[i];
@@ -1066,7 +1010,7 @@ alloc_new_skb:
1066 err = -EFAULT; 1010 err = -EFAULT;
1067 goto error; 1011 goto error;
1068 } 1012 }
1069 sk->sk_sndmsg_off += copy; 1013 cork->off += copy;
1070 frag->size += copy; 1014 frag->size += copy;
1071 skb->len += copy; 1015 skb->len += copy;
1072 skb->data_len += copy; 1016 skb->data_len += copy;
@@ -1080,11 +1024,87 @@ alloc_new_skb:
1080 return 0; 1024 return 0;
1081 1025
1082error: 1026error:
1083 inet->cork.length -= length; 1027 cork->length -= length;
1084 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1028 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1085 return err; 1029 return err;
1086} 1030}
1087 1031
1032static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1033 struct ipcm_cookie *ipc, struct rtable **rtp)
1034{
1035 struct inet_sock *inet = inet_sk(sk);
1036 struct ip_options *opt;
1037 struct rtable *rt;
1038
1039 /*
1040 * setup for corking.
1041 */
1042 opt = ipc->opt;
1043 if (opt) {
1044 if (cork->opt == NULL) {
1045 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1046 sk->sk_allocation);
1047 if (unlikely(cork->opt == NULL))
1048 return -ENOBUFS;
1049 }
1050 memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1051 cork->flags |= IPCORK_OPT;
1052 cork->addr = ipc->addr;
1053 }
1054 rt = *rtp;
1055 if (unlikely(!rt))
1056 return -EFAULT;
1057 /*
1058 * We steal reference to this route, caller should not release it
1059 */
1060 *rtp = NULL;
1061 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1062 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1063 cork->dst = &rt->dst;
1064 cork->length = 0;
1065 cork->tx_flags = ipc->tx_flags;
1066 cork->page = NULL;
1067 cork->off = 0;
1068
1069 return 0;
1070}
1071
1072/*
1073 * ip_append_data() and ip_append_page() can make one large IP datagram
 1074 * from many pieces of data. Each piece will be held on the socket
1075 * until ip_push_pending_frames() is called. Each piece can be a page
1076 * or non-page data.
1077 *
 1078 * Not only UDP; other transport protocols - e.g. raw sockets - can
 1079 * potentially use this interface.
1080 *
1081 * LATER: length must be adjusted by pad at tail, when it is required.
1082 */
1083int ip_append_data(struct sock *sk,
1084 int getfrag(void *from, char *to, int offset, int len,
1085 int odd, struct sk_buff *skb),
1086 void *from, int length, int transhdrlen,
1087 struct ipcm_cookie *ipc, struct rtable **rtp,
1088 unsigned int flags)
1089{
1090 struct inet_sock *inet = inet_sk(sk);
1091 int err;
1092
1093 if (flags&MSG_PROBE)
1094 return 0;
1095
1096 if (skb_queue_empty(&sk->sk_write_queue)) {
1097 err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1098 if (err)
1099 return err;
1100 } else {
1101 transhdrlen = 0;
1102 }
1103
1104 return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1105 from, length, transhdrlen, flags);
1106}
1107
1088ssize_t ip_append_page(struct sock *sk, struct page *page, 1108ssize_t ip_append_page(struct sock *sk, struct page *page,
1089 int offset, size_t size, int flags) 1109 int offset, size_t size, int flags)
1090{ 1110{
@@ -1228,40 +1248,41 @@ error:
1228 return err; 1248 return err;
1229} 1249}
1230 1250
1231static void ip_cork_release(struct inet_sock *inet) 1251static void ip_cork_release(struct inet_cork *cork)
1232{ 1252{
1233 inet->cork.flags &= ~IPCORK_OPT; 1253 cork->flags &= ~IPCORK_OPT;
1234 kfree(inet->cork.opt); 1254 kfree(cork->opt);
1235 inet->cork.opt = NULL; 1255 cork->opt = NULL;
1236 dst_release(inet->cork.dst); 1256 dst_release(cork->dst);
1237 inet->cork.dst = NULL; 1257 cork->dst = NULL;
1238} 1258}
1239 1259
1240/* 1260/*
 1241 * Combine all pending IP fragments on the socket as one IP datagram 1261 * Combine all pending IP fragments on the socket as one IP datagram
1242 * and push them out. 1262 * and push them out.
1243 */ 1263 */
1244int ip_push_pending_frames(struct sock *sk) 1264struct sk_buff *__ip_make_skb(struct sock *sk,
1265 struct sk_buff_head *queue,
1266 struct inet_cork *cork)
1245{ 1267{
1246 struct sk_buff *skb, *tmp_skb; 1268 struct sk_buff *skb, *tmp_skb;
1247 struct sk_buff **tail_skb; 1269 struct sk_buff **tail_skb;
1248 struct inet_sock *inet = inet_sk(sk); 1270 struct inet_sock *inet = inet_sk(sk);
1249 struct net *net = sock_net(sk); 1271 struct net *net = sock_net(sk);
1250 struct ip_options *opt = NULL; 1272 struct ip_options *opt = NULL;
1251 struct rtable *rt = (struct rtable *)inet->cork.dst; 1273 struct rtable *rt = (struct rtable *)cork->dst;
1252 struct iphdr *iph; 1274 struct iphdr *iph;
1253 __be16 df = 0; 1275 __be16 df = 0;
1254 __u8 ttl; 1276 __u8 ttl;
1255 int err = 0;
1256 1277
1257 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1278 if ((skb = __skb_dequeue(queue)) == NULL)
1258 goto out; 1279 goto out;
1259 tail_skb = &(skb_shinfo(skb)->frag_list); 1280 tail_skb = &(skb_shinfo(skb)->frag_list);
1260 1281
1261 /* move skb->data to ip header from ext header */ 1282 /* move skb->data to ip header from ext header */
1262 if (skb->data < skb_network_header(skb)) 1283 if (skb->data < skb_network_header(skb))
1263 __skb_pull(skb, skb_network_offset(skb)); 1284 __skb_pull(skb, skb_network_offset(skb));
1264 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1285 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1265 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1286 __skb_pull(tmp_skb, skb_network_header_len(skb));
1266 *tail_skb = tmp_skb; 1287 *tail_skb = tmp_skb;
1267 tail_skb = &(tmp_skb->next); 1288 tail_skb = &(tmp_skb->next);
@@ -1287,8 +1308,8 @@ int ip_push_pending_frames(struct sock *sk)
1287 ip_dont_fragment(sk, &rt->dst))) 1308 ip_dont_fragment(sk, &rt->dst)))
1288 df = htons(IP_DF); 1309 df = htons(IP_DF);
1289 1310
1290 if (inet->cork.flags & IPCORK_OPT) 1311 if (cork->flags & IPCORK_OPT)
1291 opt = inet->cork.opt; 1312 opt = cork->opt;
1292 1313
1293 if (rt->rt_type == RTN_MULTICAST) 1314 if (rt->rt_type == RTN_MULTICAST)
1294 ttl = inet->mc_ttl; 1315 ttl = inet->mc_ttl;
@@ -1300,7 +1321,7 @@ int ip_push_pending_frames(struct sock *sk)
1300 iph->ihl = 5; 1321 iph->ihl = 5;
1301 if (opt) { 1322 if (opt) {
1302 iph->ihl += opt->optlen>>2; 1323 iph->ihl += opt->optlen>>2;
1303 ip_options_build(skb, opt, inet->cork.addr, rt, 0); 1324 ip_options_build(skb, opt, cork->addr, rt, 0);
1304 } 1325 }
1305 iph->tos = inet->tos; 1326 iph->tos = inet->tos;
1306 iph->frag_off = df; 1327 iph->frag_off = df;
@@ -1316,44 +1337,95 @@ int ip_push_pending_frames(struct sock *sk)
1316 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1337 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1317 * on dst refcount 1338 * on dst refcount
1318 */ 1339 */
1319 inet->cork.dst = NULL; 1340 cork->dst = NULL;
1320 skb_dst_set(skb, &rt->dst); 1341 skb_dst_set(skb, &rt->dst);
1321 1342
1322 if (iph->protocol == IPPROTO_ICMP) 1343 if (iph->protocol == IPPROTO_ICMP)
1323 icmp_out_count(net, ((struct icmphdr *) 1344 icmp_out_count(net, ((struct icmphdr *)
1324 skb_transport_header(skb))->type); 1345 skb_transport_header(skb))->type);
1325 1346
 1326 /* Netfilter gets the whole, not fragmented skb. */ 1347 ip_cork_release(cork);
1348out:
1349 return skb;
1350}
1351
1352int ip_send_skb(struct sk_buff *skb)
1353{
1354 struct net *net = sock_net(skb->sk);
1355 int err;
1356
1327 err = ip_local_out(skb); 1357 err = ip_local_out(skb);
1328 if (err) { 1358 if (err) {
1329 if (err > 0) 1359 if (err > 0)
1330 err = net_xmit_errno(err); 1360 err = net_xmit_errno(err);
1331 if (err) 1361 if (err)
1332 goto error; 1362 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1333 } 1363 }
1334 1364
1335out:
1336 ip_cork_release(inet);
1337 return err; 1365 return err;
1366}
1338 1367
1339error: 1368int ip_push_pending_frames(struct sock *sk)
1340 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1369{
1341 goto out; 1370 struct sk_buff *skb;
1371
1372 skb = ip_finish_skb(sk);
1373 if (!skb)
1374 return 0;
1375
 1376 /* Netfilter gets the whole, not fragmented skb. */
1377 return ip_send_skb(skb);
1342} 1378}
1343 1379
1344/* 1380/*
1345 * Throw away all pending data on the socket. 1381 * Throw away all pending data on the socket.
1346 */ 1382 */
1347void ip_flush_pending_frames(struct sock *sk) 1383static void __ip_flush_pending_frames(struct sock *sk,
1384 struct sk_buff_head *queue,
1385 struct inet_cork *cork)
1348{ 1386{
1349 struct sk_buff *skb; 1387 struct sk_buff *skb;
1350 1388
1351 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1389 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1352 kfree_skb(skb); 1390 kfree_skb(skb);
1353 1391
1354 ip_cork_release(inet_sk(sk)); 1392 ip_cork_release(cork);
1393}
1394
1395void ip_flush_pending_frames(struct sock *sk)
1396{
1397 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1355} 1398}
1356 1399
1400struct sk_buff *ip_make_skb(struct sock *sk,
1401 int getfrag(void *from, char *to, int offset,
1402 int len, int odd, struct sk_buff *skb),
1403 void *from, int length, int transhdrlen,
1404 struct ipcm_cookie *ipc, struct rtable **rtp,
1405 unsigned int flags)
1406{
1407 struct inet_cork cork = {};
1408 struct sk_buff_head queue;
1409 int err;
1410
1411 if (flags & MSG_PROBE)
1412 return NULL;
1413
1414 __skb_queue_head_init(&queue);
1415
1416 err = ip_setup_cork(sk, &cork, ipc, rtp);
1417 if (err)
1418 return ERR_PTR(err);
1419
1420 err = __ip_append_data(sk, &queue, &cork, getfrag,
1421 from, length, transhdrlen, flags);
1422 if (err) {
1423 __ip_flush_pending_frames(sk, &queue, &cork);
1424 return ERR_PTR(err);
1425 }
1426
1427 return __ip_make_skb(sk, &queue, &cork);
1428}
1357 1429
1358/* 1430/*
1359 * Fetch data from kernel space and fill in checksum if needed. 1431 * Fetch data from kernel space and fill in checksum if needed.
@@ -1402,16 +1474,19 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1402 } 1474 }
1403 1475
1404 { 1476 {
1405 struct flowi fl = { .oif = arg->bound_dev_if, 1477 struct flowi4 fl4 = {
1406 .fl4_dst = daddr, 1478 .flowi4_oif = arg->bound_dev_if,
1407 .fl4_src = rt->rt_spec_dst, 1479 .daddr = daddr,
1408 .fl4_tos = RT_TOS(ip_hdr(skb)->tos), 1480 .saddr = rt->rt_spec_dst,
1409 .fl_ip_sport = tcp_hdr(skb)->dest, 1481 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
1410 .fl_ip_dport = tcp_hdr(skb)->source, 1482 .fl4_sport = tcp_hdr(skb)->dest,
1411 .proto = sk->sk_protocol, 1483 .fl4_dport = tcp_hdr(skb)->source,
1412 .flags = ip_reply_arg_flowi_flags(arg) }; 1484 .flowi4_proto = sk->sk_protocol,
1413 security_skb_classify_flow(skb, &fl); 1485 .flowi4_flags = ip_reply_arg_flowi_flags(arg),
1414 if (ip_route_output_key(sock_net(sk), &rt, &fl)) 1486 };
1487 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1488 rt = ip_route_output_key(sock_net(sk), &fl4);
1489 if (IS_ERR(rt))
1415 return; 1490 return;
1416 } 1491 }
1417 1492
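
The net effect of the ip_output.c refactor is that the cork state and the pending queue become parameters: __ip_append_data() and __ip_make_skb() work on any queue, and ip_make_skb() strings them together so a caller can build a complete datagram on a private queue without corking the socket. A sketch of the resulting one-shot transmit path (caller-side names such as data, len, ipc and rt are abbreviations, and error handling is condensed):

	struct sk_buff *skb;

	/* Cork, append and finalize on a throwaway queue in one call. */
	skb = ip_make_skb(sk, getfrag, data, len, transhdrlen,
			  &ipc, &rt, flags);
	if (IS_ERR(skb))
		return PTR_ERR(skb);	/* append failed; queue already flushed */
	if (skb)			/* NULL means MSG_PROBE: nothing to send */
		err = ip_send_skb(skb);	/* ip_local_out() + OUTDISCARDS on error */
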
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 988f52fba54a..bfc17c5914e7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -460,19 +460,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
460 goto tx_error_icmp; 460 goto tx_error_icmp;
461 } 461 }
462 462
463 { 463 rt = ip_route_output_ports(dev_net(dev), NULL,
464 struct flowi fl = { 464 dst, tiph->saddr,
465 .oif = tunnel->parms.link, 465 0, 0,
466 .fl4_dst = dst, 466 IPPROTO_IPIP, RT_TOS(tos),
467 .fl4_src= tiph->saddr, 467 tunnel->parms.link);
468 .fl4_tos = RT_TOS(tos), 468 if (IS_ERR(rt)) {
469 .proto = IPPROTO_IPIP 469 dev->stats.tx_carrier_errors++;
470 }; 470 goto tx_error_icmp;
471
472 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
473 dev->stats.tx_carrier_errors++;
474 goto tx_error_icmp;
475 }
476 } 471 }
477 tdev = rt->dst.dev; 472 tdev = rt->dst.dev;
478 473
@@ -583,16 +578,14 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
583 iph = &tunnel->parms.iph; 578 iph = &tunnel->parms.iph;
584 579
585 if (iph->daddr) { 580 if (iph->daddr) {
586 struct flowi fl = { 581 struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL,
587 .oif = tunnel->parms.link, 582 iph->daddr, iph->saddr,
588 .fl4_dst = iph->daddr, 583 0, 0,
589 .fl4_src = iph->saddr, 584 IPPROTO_IPIP,
590 .fl4_tos = RT_TOS(iph->tos), 585 RT_TOS(iph->tos),
591 .proto = IPPROTO_IPIP 586 tunnel->parms.link);
592 }; 587
593 struct rtable *rt; 588 if (!IS_ERR(rt)) {
594
595 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
596 tdev = rt->dst.dev; 589 tdev = rt->dst.dev;
597 ip_rt_put(rt); 590 ip_rt_put(rt);
598 } 591 }
@@ -913,4 +906,4 @@ static void __exit ipip_fini(void)
913module_init(ipip_init); 906module_init(ipip_init);
914module_exit(ipip_fini); 907module_exit(ipip_fini);
915MODULE_LICENSE("GPL"); 908MODULE_LICENSE("GPL");
916MODULE_ALIAS("tunl0"); 909MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3f3a9afd73e0..1f62eaeb6de4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
60#include <linux/notifier.h> 60#include <linux/notifier.h>
61#include <linux/if_arp.h> 61#include <linux/if_arp.h>
62#include <linux/netfilter_ipv4.h> 62#include <linux/netfilter_ipv4.h>
63#include <linux/compat.h>
63#include <net/ipip.h> 64#include <net/ipip.h>
64#include <net/checksum.h> 65#include <net/checksum.h>
65#include <net/netlink.h> 66#include <net/netlink.h>
@@ -147,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
147 return NULL; 148 return NULL;
148} 149}
149 150
150static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 151static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
151 struct mr_table **mrt) 152 struct mr_table **mrt)
152{ 153{
153 struct ipmr_result res; 154 struct ipmr_result res;
154 struct fib_lookup_arg arg = { .result = &res, }; 155 struct fib_lookup_arg arg = { .result = &res, };
155 int err; 156 int err;
156 157
157 err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); 158 err = fib_rules_lookup(net->ipv4.mr_rules_ops,
159 flowi4_to_flowi(flp4), 0, &arg);
158 if (err < 0) 160 if (err < 0)
159 return err; 161 return err;
160 *mrt = res.mrt; 162 *mrt = res.mrt;
@@ -282,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
282 return net->ipv4.mrt; 284 return net->ipv4.mrt;
283} 285}
284 286
285static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 287static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
286 struct mr_table **mrt) 288 struct mr_table **mrt)
287{ 289{
288 *mrt = net->ipv4.mrt; 290 *mrt = net->ipv4.mrt;
@@ -434,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
434{ 436{
435 struct net *net = dev_net(dev); 437 struct net *net = dev_net(dev);
436 struct mr_table *mrt; 438 struct mr_table *mrt;
437 struct flowi fl = { 439 struct flowi4 fl4 = {
438 .oif = dev->ifindex, 440 .flowi4_oif = dev->ifindex,
439 .iif = skb->skb_iif, 441 .flowi4_iif = skb->skb_iif,
440 .mark = skb->mark, 442 .flowi4_mark = skb->mark,
441 }; 443 };
442 int err; 444 int err;
443 445
444 err = ipmr_fib_lookup(net, &fl, &mrt); 446 err = ipmr_fib_lookup(net, &fl4, &mrt);
445 if (err < 0) { 447 if (err < 0) {
446 kfree_skb(skb); 448 kfree_skb(skb);
447 return err; 449 return err;
@@ -1434,6 +1436,81 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1434 } 1436 }
1435} 1437}
1436 1438
1439#ifdef CONFIG_COMPAT
1440struct compat_sioc_sg_req {
1441 struct in_addr src;
1442 struct in_addr grp;
1443 compat_ulong_t pktcnt;
1444 compat_ulong_t bytecnt;
1445 compat_ulong_t wrong_if;
1446};
1447
1448struct compat_sioc_vif_req {
1449 vifi_t vifi; /* Which iface */
1450 compat_ulong_t icount;
1451 compat_ulong_t ocount;
1452 compat_ulong_t ibytes;
1453 compat_ulong_t obytes;
1454};
1455
1456int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
1457{
1458 struct compat_sioc_sg_req sr;
1459 struct compat_sioc_vif_req vr;
1460 struct vif_device *vif;
1461 struct mfc_cache *c;
1462 struct net *net = sock_net(sk);
1463 struct mr_table *mrt;
1464
1465 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1466 if (mrt == NULL)
1467 return -ENOENT;
1468
1469 switch (cmd) {
1470 case SIOCGETVIFCNT:
1471 if (copy_from_user(&vr, arg, sizeof(vr)))
1472 return -EFAULT;
1473 if (vr.vifi >= mrt->maxvif)
1474 return -EINVAL;
1475 read_lock(&mrt_lock);
1476 vif = &mrt->vif_table[vr.vifi];
1477 if (VIF_EXISTS(mrt, vr.vifi)) {
1478 vr.icount = vif->pkt_in;
1479 vr.ocount = vif->pkt_out;
1480 vr.ibytes = vif->bytes_in;
1481 vr.obytes = vif->bytes_out;
1482 read_unlock(&mrt_lock);
1483
1484 if (copy_to_user(arg, &vr, sizeof(vr)))
1485 return -EFAULT;
1486 return 0;
1487 }
1488 read_unlock(&mrt_lock);
1489 return -EADDRNOTAVAIL;
1490 case SIOCGETSGCNT:
1491 if (copy_from_user(&sr, arg, sizeof(sr)))
1492 return -EFAULT;
1493
1494 rcu_read_lock();
1495 c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
1496 if (c) {
1497 sr.pktcnt = c->mfc_un.res.pkt;
1498 sr.bytecnt = c->mfc_un.res.bytes;
1499 sr.wrong_if = c->mfc_un.res.wrong_if;
1500 rcu_read_unlock();
1501
1502 if (copy_to_user(arg, &sr, sizeof(sr)))
1503 return -EFAULT;
1504 return 0;
1505 }
1506 rcu_read_unlock();
1507 return -EADDRNOTAVAIL;
1508 default:
1509 return -ENOIOCTLCMD;
1510 }
1511}
1512#endif
1513
1437 1514
1438static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) 1515static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1439{ 1516{
@@ -1535,26 +1612,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1535#endif 1612#endif
1536 1613
1537 if (vif->flags & VIFF_TUNNEL) { 1614 if (vif->flags & VIFF_TUNNEL) {
1538 struct flowi fl = { 1615 rt = ip_route_output_ports(net, NULL,
1539 .oif = vif->link, 1616 vif->remote, vif->local,
1540 .fl4_dst = vif->remote, 1617 0, 0,
1541 .fl4_src = vif->local, 1618 IPPROTO_IPIP,
1542 .fl4_tos = RT_TOS(iph->tos), 1619 RT_TOS(iph->tos), vif->link);
1543 .proto = IPPROTO_IPIP 1620 if (IS_ERR(rt))
1544 };
1545
1546 if (ip_route_output_key(net, &rt, &fl))
1547 goto out_free; 1621 goto out_free;
1548 encap = sizeof(struct iphdr); 1622 encap = sizeof(struct iphdr);
1549 } else { 1623 } else {
1550 struct flowi fl = { 1624 rt = ip_route_output_ports(net, NULL, iph->daddr, 0,
1551 .oif = vif->link, 1625 0, 0,
1552 .fl4_dst = iph->daddr, 1626 IPPROTO_IPIP,
1553 .fl4_tos = RT_TOS(iph->tos), 1627 RT_TOS(iph->tos), vif->link);
1554 .proto = IPPROTO_IPIP 1628 if (IS_ERR(rt))
1555 };
1556
1557 if (ip_route_output_key(net, &rt, &fl))
1558 goto out_free; 1629 goto out_free;
1559 } 1630 }
1560 1631
@@ -1717,6 +1788,24 @@ dont_forward:
1717 return 0; 1788 return 0;
1718} 1789}
1719 1790
1791static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct rtable *rt)
1792{
1793 struct flowi4 fl4 = {
1794 .daddr = rt->rt_key_dst,
1795 .saddr = rt->rt_key_src,
1796 .flowi4_tos = rt->rt_tos,
1797 .flowi4_oif = rt->rt_oif,
1798 .flowi4_iif = rt->rt_iif,
1799 .flowi4_mark = rt->rt_mark,
1800 };
1801 struct mr_table *mrt;
1802 int err;
1803
1804 err = ipmr_fib_lookup(net, &fl4, &mrt);
1805 if (err)
1806 return ERR_PTR(err);
1807 return mrt;
1808}
1720 1809
1721/* 1810/*
1722 * Multicast packets for forwarding arrive here 1811 * Multicast packets for forwarding arrive here
@@ -1729,7 +1818,6 @@ int ip_mr_input(struct sk_buff *skb)
1729 struct net *net = dev_net(skb->dev); 1818 struct net *net = dev_net(skb->dev);
1730 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 1819 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1731 struct mr_table *mrt; 1820 struct mr_table *mrt;
1732 int err;
1733 1821
1734 /* Packet is looped back after forward, it should not be 1822 /* Packet is looped back after forward, it should not be
1735 * forwarded second time, but still can be delivered locally. 1823 * forwarded second time, but still can be delivered locally.
@@ -1737,12 +1825,11 @@ int ip_mr_input(struct sk_buff *skb)
1737 if (IPCB(skb)->flags & IPSKB_FORWARDED) 1825 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1738 goto dont_forward; 1826 goto dont_forward;
1739 1827
1740 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1828 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1741 if (err < 0) { 1829 if (IS_ERR(mrt)) {
1742 kfree_skb(skb); 1830 kfree_skb(skb);
1743 return err; 1831 return PTR_ERR(mrt);
1744 } 1832 }
1745
1746 if (!local) { 1833 if (!local) {
1747 if (IPCB(skb)->opt.router_alert) { 1834 if (IPCB(skb)->opt.router_alert) {
1748 if (ip_call_ra_chain(skb)) 1835 if (ip_call_ra_chain(skb))
@@ -1870,9 +1957,9 @@ int pim_rcv_v1(struct sk_buff *skb)
1870 1957
1871 pim = igmp_hdr(skb); 1958 pim = igmp_hdr(skb);
1872 1959
1873 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1960 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1961 if (IS_ERR(mrt))
1874 goto drop; 1962 goto drop;
1875
1876 if (!mrt->mroute_do_pim || 1963 if (!mrt->mroute_do_pim ||
1877 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1964 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1878 goto drop; 1965 goto drop;
@@ -1902,9 +1989,9 @@ static int pim_rcv(struct sk_buff *skb)
1902 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1989 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1903 goto drop; 1990 goto drop;
1904 1991
1905 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1992 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1993 if (IS_ERR(mrt))
1906 goto drop; 1994 goto drop;
1907
1908 if (__pim_rcv(mrt, skb, sizeof(*pim))) { 1995 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1909drop: 1996drop:
1910 kfree_skb(skb); 1997 kfree_skb(skb);
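
The compat structures above exist because SIOCGETVIFCNT and SIOCGETSGCNT carry unsigned long counters: 4 bytes in a 32-bit task but 8 bytes in an LP64 kernel, so a 64-bit kernel reading the native layout from a 32-bit caller would run 12 bytes past the user buffer. compat_ulong_t pins the copies to the caller's layout; sketched side by side (sizes assume LP64 and ignore extra padding):

	/* Native 64-bit layout: 4 + 4 + 3 * 8 = 32 bytes. */
	struct sioc_sg_req { struct in_addr src, grp;
			     unsigned long pktcnt, bytecnt, wrong_if; };

	/* What a 32-bit caller actually passes: 4 + 4 + 3 * 4 = 20 bytes. */
	struct compat_sioc_sg_req { struct in_addr src, grp;
				    compat_ulong_t pktcnt, bytecnt, wrong_if; };

	/* Hence ipmr_compat_ioctl() copies sizeof(struct compat_sioc_sg_req). */
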
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 994a1f29ebbc..f3c0b549b8e1 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16 struct net *net = dev_net(skb_dst(skb)->dev); 16 struct net *net = dev_net(skb_dst(skb)->dev);
17 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
18 struct rtable *rt; 18 struct rtable *rt;
19 struct flowi fl = {}; 19 struct flowi4 fl4 = {};
20 unsigned long orefdst; 20 unsigned long orefdst;
21 unsigned int hh_len; 21 unsigned int hh_len;
22 unsigned int type; 22 unsigned int type;
@@ -31,14 +31,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
32 */ 32 */
33 if (addr_type == RTN_LOCAL) { 33 if (addr_type == RTN_LOCAL) {
34 fl.fl4_dst = iph->daddr; 34 fl4.daddr = iph->daddr;
35 if (type == RTN_LOCAL) 35 if (type == RTN_LOCAL)
36 fl.fl4_src = iph->saddr; 36 fl4.saddr = iph->saddr;
37 fl.fl4_tos = RT_TOS(iph->tos); 37 fl4.flowi4_tos = RT_TOS(iph->tos);
38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 38 fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
39 fl.mark = skb->mark; 39 fl4.flowi4_mark = skb->mark;
40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 40 fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
41 if (ip_route_output_key(net, &rt, &fl) != 0) 41 rt = ip_route_output_key(net, &fl4);
42 if (IS_ERR(rt))
42 return -1; 43 return -1;
43 44
44 /* Drop old route. */ 45 /* Drop old route. */
@@ -47,8 +48,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
47 } else { 48 } else {
48 /* non-local src, find valid iif to satisfy 49 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */ 50 * rp-filter when calling ip_route_input. */
50 fl.fl4_dst = iph->saddr; 51 fl4.daddr = iph->saddr;
51 if (ip_route_output_key(net, &rt, &fl) != 0) 52 rt = ip_route_output_key(net, &fl4);
53 if (IS_ERR(rt))
52 return -1; 54 return -1;
53 55
54 orefdst = skb->_skb_refdst; 56 orefdst = skb->_skb_refdst;
@@ -66,10 +68,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
66 68
67#ifdef CONFIG_XFRM 69#ifdef CONFIG_XFRM
68 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 70 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
69 xfrm_decode_session(skb, &fl, AF_INET) == 0) { 71 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
70 struct dst_entry *dst = skb_dst(skb); 72 struct dst_entry *dst = skb_dst(skb);
71 skb_dst_set(skb, NULL); 73 skb_dst_set(skb, NULL);
72 if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) 74 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
75 if (IS_ERR(dst))
73 return -1; 76 return -1;
74 skb_dst_set(skb, dst); 77 skb_dst_set(skb, dst);
75 } 78 }
@@ -102,7 +105,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
102 dst = ((struct xfrm_dst *)dst)->route; 105 dst = ((struct xfrm_dst *)dst)->route;
103 dst_hold(dst); 106 dst_hold(dst);
104 107
105 if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) 108 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
109 if (IS_ERR(dst))
106 return -1; 110 return -1;
107 111
108 skb_dst_drop(skb); 112 skb_dst_drop(skb);
@@ -219,7 +223,11 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
219 223
220static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) 224static int nf_ip_route(struct dst_entry **dst, struct flowi *fl)
221{ 225{
222 return ip_route_output_key(&init_net, (struct rtable **)dst, fl); 226 struct rtable *rt = ip_route_output_key(&init_net, &fl->u.ip4);
227 if (IS_ERR(rt))
228 return PTR_ERR(rt);
229 *dst = &rt->dst;
230 return 0;
223} 231}
224 232
225static const struct nf_afinfo nf_ip_afinfo = { 233static const struct nf_afinfo nf_ip_afinfo = {
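
xfrm_lookup() migrates to the same convention in these hunks: it returns the resulting dst_entry (or an ERR_PTR) instead of rewriting a dst_entry ** in place. The updated call shape, as used in ip_route_me_harder() above:

	dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
	if (IS_ERR(dst))
		return -1;		/* failure is encoded in the pointer */
	skb_dst_set(skb, dst);		/* install the transformed route */
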
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5f..1dfc18a03fd4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -64,16 +64,6 @@ config IP_NF_IPTABLES
64if IP_NF_IPTABLES 64if IP_NF_IPTABLES
65 65
66# The matches. 66# The matches.
67config IP_NF_MATCH_ADDRTYPE
68 tristate '"addrtype" address type match support'
69 depends on NETFILTER_ADVANCED
70 help
71 This option allows you to match what routing thinks of an address,
72 eg. UNICAST, LOCAL, BROADCAST, ...
73
74 If you want to compile it as a module, say M here and read
75 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
76
77config IP_NF_MATCH_AH 67config IP_NF_MATCH_AH
78 tristate '"ah" match support' 68 tristate '"ah" match support'
79 depends on NETFILTER_ADVANCED 69 depends on NETFILTER_ADVANCED
@@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT
206 196
207config NF_NAT_SNMP_BASIC 197config NF_NAT_SNMP_BASIC
208 tristate "Basic SNMP-ALG support" 198 tristate "Basic SNMP-ALG support"
209 depends on NF_NAT 199 depends on NF_CONNTRACK_SNMP && NF_NAT
210 depends on NETFILTER_ADVANCED 200 depends on NETFILTER_ADVANCED
201 default NF_NAT && NF_CONNTRACK_SNMP
211 ---help--- 202 ---help---
212 203
213 This module implements an Application Layer Gateway (ALG) for 204 This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 19eb59d01037..dca2082ec683 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 49
50# matches 50# matches
51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
54 53
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed95..4b5d457c2d76 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
867 newinfo->initial_entries = 0; 867 newinfo->initial_entries = 0;
868 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 868 loc_cpu_entry = info->entries[raw_smp_processor_id()];
869 xt_compat_init_offsets(NFPROTO_ARP, info->number);
869 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 870 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
870 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 871 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
871 if (ret != 0) 872 if (ret != 0)
@@ -1065,6 +1066,7 @@ static int do_replace(struct net *net, const void __user *user,
1065 /* overflow check */ 1066 /* overflow check */
1066 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1067 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1067 return -ENOMEM; 1068 return -ENOMEM;
1069 tmp.name[sizeof(tmp.name)-1] = 0;
1068 1070
1069 newinfo = xt_alloc_table_info(tmp.size); 1071 newinfo = xt_alloc_table_info(tmp.size);
1070 if (!newinfo) 1072 if (!newinfo)
@@ -1333,6 +1335,7 @@ static int translate_compat_table(const char *name,
1333 duprintf("translate_compat_table: size %u\n", info->size); 1335 duprintf("translate_compat_table: size %u\n", info->size);
1334 j = 0; 1336 j = 0;
1335 xt_compat_lock(NFPROTO_ARP); 1337 xt_compat_lock(NFPROTO_ARP);
1338 xt_compat_init_offsets(NFPROTO_ARP, number);
1336 /* Walk through entries, checking offsets. */ 1339 /* Walk through entries, checking offsets. */
1337 xt_entry_foreach(iter0, entry0, total_size) { 1340 xt_entry_foreach(iter0, entry0, total_size) {
1338 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1341 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1486,6 +1489,7 @@ static int compat_do_replace(struct net *net, void __user *user,
1486 return -ENOMEM; 1489 return -ENOMEM;
1487 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1490 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1488 return -ENOMEM; 1491 return -ENOMEM;
1492 tmp.name[sizeof(tmp.name)-1] = 0;
1489 1493
1490 newinfo = xt_alloc_table_info(tmp.size); 1494 newinfo = xt_alloc_table_info(tmp.size);
1491 if (!newinfo) 1495 if (!newinfo)
@@ -1738,6 +1742,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1738 ret = -EFAULT; 1742 ret = -EFAULT;
1739 break; 1743 break;
1740 } 1744 }
1745 rev.name[sizeof(rev.name)-1] = 0;
1741 1746
1742 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, 1747 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
1743 rev.revision, 1, &ret), 1748 rev.revision, 1, &ret),
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index b8ddcc480ed9..a5e52a9f0a12 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par)
60 60
61 if (mangle->flags & ~ARPT_MANGLE_MASK || 61 if (mangle->flags & ~ARPT_MANGLE_MASK ||
62 !(mangle->flags & ARPT_MANGLE_MASK)) 62 !(mangle->flags & ARPT_MANGLE_MASK))
63 return false; 63 return -EINVAL;
64 64
65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && 65 if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
66 mangle->target != XT_CONTINUE) 66 mangle->target != XT_CONTINUE)
67 return false; 67 return -EINVAL;
68 return true; 68 return 0;
69} 69}
70 70
71static struct xt_target arpt_mangle_reg __read_mostly = { 71static struct xt_target arpt_mangle_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013dc..ffcea0d1678e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -387,7 +387,7 @@ ipt_do_table(struct sk_buff *skb,
387 verdict = (unsigned)(-v) - 1; 387 verdict = (unsigned)(-v) - 1;
388 break; 388 break;
389 } 389 }
390 if (*stackptr == 0) { 390 if (*stackptr <= origptr) {
391 e = get_entry(table_base, 391 e = get_entry(table_base,
392 private->underflow[hook]); 392 private->underflow[hook]);
393 pr_debug("Underflow (this is normal) " 393 pr_debug("Underflow (this is normal) "
@@ -427,10 +427,10 @@ ipt_do_table(struct sk_buff *skb,
427 /* Verdict */ 427 /* Verdict */
428 break; 428 break;
429 } while (!acpar.hotdrop); 429 } while (!acpar.hotdrop);
430 xt_info_rdunlock_bh();
431 pr_debug("Exiting %s; resetting sp from %u to %u\n", 430 pr_debug("Exiting %s; resetting sp from %u to %u\n",
432 __func__, *stackptr, origptr); 431 __func__, *stackptr, origptr);
433 *stackptr = origptr; 432 *stackptr = origptr;
433 xt_info_rdunlock_bh();
434#ifdef DEBUG_ALLOW_ALL 434#ifdef DEBUG_ALLOW_ALL
435 return NF_ACCEPT; 435 return NF_ACCEPT;
436#else 436#else
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1064 newinfo->initial_entries = 0; 1064 newinfo->initial_entries = 0;
1065 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1065 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1066 xt_compat_init_offsets(AF_INET, info->number);
1066 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1067 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1067 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1068 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1068 if (ret != 0) 1069 if (ret != 0)
@@ -1261,6 +1262,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
1261 /* overflow check */ 1262 /* overflow check */
1262 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1263 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1263 return -ENOMEM; 1264 return -ENOMEM;
1265 tmp.name[sizeof(tmp.name)-1] = 0;
1264 1266
1265 newinfo = xt_alloc_table_info(tmp.size); 1267 newinfo = xt_alloc_table_info(tmp.size);
1266 if (!newinfo) 1268 if (!newinfo)
@@ -1664,6 +1666,7 @@ translate_compat_table(struct net *net,
1664 duprintf("translate_compat_table: size %u\n", info->size); 1666 duprintf("translate_compat_table: size %u\n", info->size);
1665 j = 0; 1667 j = 0;
1666 xt_compat_lock(AF_INET); 1668 xt_compat_lock(AF_INET);
1669 xt_compat_init_offsets(AF_INET, number);
1667 /* Walk through entries, checking offsets. */ 1670 /* Walk through entries, checking offsets. */
1668 xt_entry_foreach(iter0, entry0, total_size) { 1671 xt_entry_foreach(iter0, entry0, total_size) {
1669 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1672 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1805,6 +1808,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1805 return -ENOMEM; 1808 return -ENOMEM;
1806 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1809 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1807 return -ENOMEM; 1810 return -ENOMEM;
1811 tmp.name[sizeof(tmp.name)-1] = 0;
1808 1812
1809 newinfo = xt_alloc_table_info(tmp.size); 1813 newinfo = xt_alloc_table_info(tmp.size);
1810 if (!newinfo) 1814 if (!newinfo)
@@ -2034,6 +2038,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2034 ret = -EFAULT; 2038 ret = -EFAULT;
2035 break; 2039 break;
2036 } 2040 }
2041 rev.name[sizeof(rev.name)-1] = 0;
2037 2042
2038 if (cmd == IPT_SO_GET_REVISION_TARGET) 2043 if (cmd == IPT_SO_GET_REVISION_TARGET)
2039 target = 1; 2044 target = 1;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a4897655..d609ac3cb9a4 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
300 * that the ->target() function isn't called after ->destroy() */ 300 * that the ->target() function isn't called after ->destroy() */
301 301
302 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
303 if (ct == NULL) { 303 if (ct == NULL)
304 pr_info("no conntrack!\n");
305 /* FIXME: need to drop invalid ones, since replies
306 * to outgoing connections of other nodes will be
307 * marked as INVALID */
308 return NF_DROP; 304 return NF_DROP;
309 }
310 305
311 /* special case: ICMP error handling. conntrack distinguishes between 306 /* special case: ICMP error handling. conntrack distinguishes between
312 * error messages (RELATED) and information requests (see below) */ 307 * error messages (RELATED) and information requests (see below) */
@@ -669,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
669 char buffer[PROC_WRITELEN+1]; 664 char buffer[PROC_WRITELEN+1];
670 unsigned long nodenum; 665 unsigned long nodenum;
671 666
672 if (copy_from_user(buffer, input, PROC_WRITELEN)) 667 if (size > PROC_WRITELEN)
668 return -EIO;
669 if (copy_from_user(buffer, input, size))
673 return -EFAULT; 670 return -EFAULT;
671 buffer[size] = 0;
674 672
675 if (*buffer == '+') { 673 if (*buffer == '+') {
676 nodenum = simple_strtoul(buffer+1, NULL, 10); 674 nodenum = simple_strtoul(buffer+1, NULL, 10);
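
The clusterip_proc_write() change is the canonical bounded proc-write pattern: refuse anything larger than the buffer, copy only what the user actually wrote, and NUL-terminate before handing the bytes to a string parser. The pattern in isolation, using the PROC_WRITELEN bound from the hunk:

	char buffer[PROC_WRITELEN + 1];

	if (size > PROC_WRITELEN)		/* never copy more than fits */
		return -EIO;
	if (copy_from_user(buffer, input, size))
		return -EFAULT;
	buffer[size] = 0;			/* simple_strtoul() needs a terminator */
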
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e9..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
442 } 442 }
443#endif 443#endif
444 444
445 /* MAC logging for input path only. */ 445 if (in != NULL)
446 if (in && !out)
447 dump_mac_header(m, loginfo, skb); 446 dump_mac_header(m, loginfo, skb);
448 447
449 dump_packet(m, loginfo, skb, 0); 448 dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
deleted file mode 100644
index db8bff0fb86d..000000000000
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * iptables module to match inet_addr_type() of an ip.
3 *
4 * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
5 * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/netdevice.h>
16#include <linux/ip.h>
17#include <net/route.h>
18
19#include <linux/netfilter_ipv4/ipt_addrtype.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24MODULE_DESCRIPTION("Xtables: address type match for IPv4");
25
26static inline bool match_type(struct net *net, const struct net_device *dev,
27 __be32 addr, u_int16_t mask)
28{
29 return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
30}
31
32static bool
33addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
34{
35 struct net *net = dev_net(par->in ? par->in : par->out);
36 const struct ipt_addrtype_info *info = par->matchinfo;
37 const struct iphdr *iph = ip_hdr(skb);
38 bool ret = true;
39
40 if (info->source)
41 ret &= match_type(net, NULL, iph->saddr, info->source) ^
42 info->invert_source;
43 if (info->dest)
44 ret &= match_type(net, NULL, iph->daddr, info->dest) ^
45 info->invert_dest;
46
47 return ret;
48}
49
50static bool
51addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
52{
53 struct net *net = dev_net(par->in ? par->in : par->out);
54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
55 const struct iphdr *iph = ip_hdr(skb);
56 const struct net_device *dev = NULL;
57 bool ret = true;
58
59 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
60 dev = par->in;
61 else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
62 dev = par->out;
63
64 if (info->source)
65 ret &= match_type(net, dev, iph->saddr, info->source) ^
66 (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
67 if (ret && info->dest)
68 ret &= match_type(net, dev, iph->daddr, info->dest) ^
69 !!(info->flags & IPT_ADDRTYPE_INVERT_DEST);
70 return ret;
71}
72
73static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
74{
75 struct ipt_addrtype_info_v1 *info = par->matchinfo;
76
77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
79 pr_info("both incoming and outgoing "
80 "interface limitation cannot be selected\n");
81 return -EINVAL;
82 }
83
84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
85 (1 << NF_INET_LOCAL_IN)) &&
86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
87 pr_info("output interface limitation "
88 "not valid in PREROUTING and INPUT\n");
89 return -EINVAL;
90 }
91
92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
93 (1 << NF_INET_LOCAL_OUT)) &&
94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
95 pr_info("input interface limitation "
96 "not valid in POSTROUTING and OUTPUT\n");
97 return -EINVAL;
98 }
99
100 return 0;
101}
102
103static struct xt_match addrtype_mt_reg[] __read_mostly = {
104 {
105 .name = "addrtype",
106 .family = NFPROTO_IPV4,
107 .match = addrtype_mt_v0,
108 .matchsize = sizeof(struct ipt_addrtype_info),
109 .me = THIS_MODULE
110 },
111 {
112 .name = "addrtype",
113 .family = NFPROTO_IPV4,
114 .revision = 1,
115 .match = addrtype_mt_v1,
116 .checkentry = addrtype_mt_checkentry_v1,
117 .matchsize = sizeof(struct ipt_addrtype_info_v1),
118 .me = THIS_MODULE
119 }
120};
121
122static int __init addrtype_mt_init(void)
123{
124 return xt_register_matches(addrtype_mt_reg,
125 ARRAY_SIZE(addrtype_mt_reg));
126}
127
128static void __exit addrtype_mt_exit(void)
129{
130 xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
131}
132
133module_init(addrtype_mt_init);
134module_exit(addrtype_mt_exit);
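
The file above is removed whole; the addrtype match was presumably converted into a shared xtables module outside net/ipv4. Its core test is a one-bit-per-address-type bitmask, optionally inverted with XOR, as in match_type() above. A minimal userspace sketch of that test; the enum values are illustrative, not the kernel's RTN_* numbering:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum rt_type { RTN_UNICAST = 1, RTN_LOCAL = 2, RTN_BROADCAST = 3 };

static bool match_type(uint16_t mask, enum rt_type type)
{
	/* each address type owns one bit of the configured mask */
	return !!(mask & (1 << type));
}

int main(void)
{
	uint16_t mask = (1 << RTN_LOCAL) | (1 << RTN_BROADCAST);
	bool invert = false;

	/* a "--src-type LOCAL,BROADCAST" style test on a unicast source */
	printf("matches: %d\n", match_type(mask, RTN_UNICAST) ^ invert);
	return 0;
}
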
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
61 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
62 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN) {
64 iph = ip_hdr(skb); 64 iph = ip_hdr(skb);
65 65
66 if (iph->saddr != saddr || 66 if (iph->saddr != saddr ||
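
The iptable_mangle hunk drops NF_QUEUE from the exclusion list, so a queued-then-reinjected packet is also rerouted when the table changed a routing-relevant field. A minimal userspace sketch of the comparison that guards the reroute; the types are simplified stand-ins for the skb/iphdr fields saved before ipt_do_table() runs:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ip_fields {
	uint32_t saddr;
	uint32_t daddr;
	uint8_t  tos;
	uint32_t mark;	/* skb->mark in the kernel */
};

static bool needs_reroute(const struct ip_fields *before,
			  const struct ip_fields *after)
{
	/* any change to source, destination, TOS or mark can select a
	 * different route, so the route lookup must be redone */
	return before->saddr != after->saddr ||
	       before->daddr != after->daddr ||
	       before->tos   != after->tos   ||
	       before->mark  != after->mark;
}

int main(void)
{
	struct ip_fields a = { 1, 2, 0, 0 }, b = a;

	b.mark = 7;	/* mangle target rewrote the mark */
	printf("reroute: %d\n", needs_reroute(&a, &b));
	return 0;
}
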
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26a..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
20#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
21#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
22#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h>
23 24
24struct ct_iter_state { 25struct ct_iter_state {
25 struct seq_net_private p; 26 struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
35 for (st->bucket = 0; 36 for (st->bucket = 0;
36 st->bucket < net->ct.htable_size; 37 st->bucket < net->ct.htable_size;
37 st->bucket++) { 38 st->bucket++) {
38 n = rcu_dereference(net->ct.hash[st->bucket].first); 39 n = rcu_dereference(
40 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
39 if (!is_a_nulls(n)) 41 if (!is_a_nulls(n))
40 return n; 42 return n;
41 } 43 }
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
48 struct net *net = seq_file_net(seq); 50 struct net *net = seq_file_net(seq);
49 struct ct_iter_state *st = seq->private; 51 struct ct_iter_state *st = seq->private;
50 52
51 head = rcu_dereference(head->next); 53 head = rcu_dereference(hlist_nulls_next_rcu(head));
52 while (is_a_nulls(head)) { 54 while (is_a_nulls(head)) {
53 if (likely(get_nulls_value(head) == st->bucket)) { 55 if (likely(get_nulls_value(head) == st->bucket)) {
54 if (++st->bucket >= net->ct.htable_size) 56 if (++st->bucket >= net->ct.htable_size)
55 return NULL; 57 return NULL;
56 } 58 }
57 head = rcu_dereference(net->ct.hash[st->bucket].first); 59 head = rcu_dereference(
60 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
58 } 61 }
59 return head; 62 return head;
60} 63}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
217 struct hlist_node *n; 220 struct hlist_node *n;
218 221
219 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 222 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
220 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 223 n = rcu_dereference(
224 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
221 if (n) 225 if (n)
222 return n; 226 return n;
223 } 227 }
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
230 struct net *net = seq_file_net(seq); 234 struct net *net = seq_file_net(seq);
231 struct ct_expect_iter_state *st = seq->private; 235 struct ct_expect_iter_state *st = seq->private;
232 236
233 head = rcu_dereference(head->next); 237 head = rcu_dereference(hlist_next_rcu(head));
234 while (head == NULL) { 238 while (head == NULL) {
235 if (++st->bucket >= nf_ct_expect_hsize) 239 if (++st->bucket >= nf_ct_expect_hsize)
236 return NULL; 240 return NULL;
237 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 241 head = rcu_dereference(
242 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
238 } 243 }
239 return head; 244 return head;
240} 245}
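
These conversions replace raw ->first/->next loads with the hlist_nulls_first_rcu()/hlist_nulls_next_rcu() and hlist_first_rcu()/hlist_next_rcu() accessors, so the RCU dereferences are properly typed and checkable. The underlying "nulls" trick is worth spelling out: the list terminator encodes a value (the bucket index) rather than plain NULL, letting a lockless reader detect that an entry it was traversing got moved to another chain. A userspace sketch of the encoding, mirroring is_a_nulls()/get_nulls_value() used above:

#include <stdint.h>
#include <stdio.h>

static inline int is_a_nulls(const void *p)
{
	return (uintptr_t)p & 1;	/* low bit tags a nulls marker */
}

static inline unsigned long get_nulls_value(const void *p)
{
	return (uintptr_t)p >> 1;
}

static inline void *make_nulls(unsigned long value)
{
	return (void *)(uintptr_t)((value << 1) | 1);
}

int main(void)
{
	unsigned long bucket = 7;
	void *end = make_nulls(bucket);

	if (is_a_nulls(end) && get_nulls_value(end) == bucket)
		printf("reader ended on bucket %lu as expected\n", bucket);
	return 0;
}
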
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df0..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret; 47 int res;
48 48
49 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
50 ret = nf_ct_expect_related(exp); 50 res = nf_ct_expect_related(exp);
51 if (ret == 0) 51 if (res == 0)
52 break; 52 break;
53 else if (ret != -EBUSY) { 53 else if (res != -EBUSY) {
54 port = 0; 54 port = 0;
55 break; 55 break;
56 } 56 }
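
Beyond the ret-to-res rename, which avoids shadowing the function-scope ret, the loop's shape is the standard NAT port-probing pattern: keep the requested port if possible, walk upward on -EBUSY, abort on any other error. A userspace sketch with try_reserve() as a hypothetical stand-in for nf_ct_expect_related():

#include <errno.h>
#include <stdio.h>

static int try_reserve(unsigned short port)
{
	return port < 1030 ? -EBUSY : 0;	/* stand-in for nf_ct_expect_related() */
}

static unsigned short pick_port(unsigned short wanted)
{
	unsigned short port;

	for (port = wanted; port != 0; port++) {	/* wrap to 0 = give up */
		int res = try_reserve(port);

		if (res == 0)
			return port;
		if (res != -EBUSY)
			return 0;	/* hard failure, stop probing */
	}
	return 0;
}

int main(void)
{
	printf("got port %u\n", pick_port(1024));
	return 0;
}
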
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a71..21bcf471b25a 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
221 manips not an issue. */ 221 manips not an issue. */
222 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
224 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { 224 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) {
227 *tuple = *orig_tuple;
228 return;
229 }
230 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
231 range)) {
225 pr_debug("get_unique_tuple: Found current src map\n"); 232 pr_debug("get_unique_tuple: Found current src map\n");
226 if (!nf_nat_used_tuple(tuple, ct)) 233 if (!nf_nat_used_tuple(tuple, ct))
227 return; 234 return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
266 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
267 struct nf_conntrack_tuple curr_tuple, new_tuple; 274 struct nf_conntrack_tuple curr_tuple, new_tuple;
268 struct nf_conn_nat *nat; 275 struct nf_conn_nat *nat;
269 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
270 276
271 /* nat helper or nfctnetlink also setup binding */ 277 /* nat helper or nfctnetlink also setup binding */
272 nat = nfct_nat(ct); 278 nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
306 ct->status |= IPS_DST_NAT; 312 ct->status |= IPS_DST_NAT;
307 } 313 }
308 314
309 /* Place in source hash if this is the first time. */ 315 if (maniptype == IP_NAT_MANIP_SRC) {
310 if (have_to_hash) {
311 unsigned int srchash; 316 unsigned int srchash;
312 317
313 srchash = hash_by_src(net, nf_ct_zone(ct), 318 srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
323 328
324 /* It's done. */ 329 /* It's done. */
325 if (maniptype == IP_NAT_MANIP_DST) 330 if (maniptype == IP_NAT_MANIP_DST)
326 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); 331 ct->status |= IPS_DST_NAT_DONE;
327 else 332 else
328 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 333 ct->status |= IPS_SRC_NAT_DONE;
329 334
330 return NF_ACCEPT; 335 return NF_ACCEPT;
331} 336}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
502 int ret = 0; 507 int ret = 0;
503 508
504 spin_lock_bh(&nf_nat_lock); 509 spin_lock_bh(&nf_nat_lock);
505 if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { 510 if (rcu_dereference_protected(
511 nf_nat_protos[proto->protonum],
512 lockdep_is_held(&nf_nat_lock)
513 ) != &nf_nat_unknown_protocol) {
506 ret = -EBUSY; 514 ret = -EBUSY;
507 goto out; 515 goto out;
508 } 516 }
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
532 if (nat == NULL || nat->ct == NULL) 540 if (nat == NULL || nat->ct == NULL)
533 return; 541 return;
534 542
535 NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); 543 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
536 544
537 spin_lock_bh(&nf_nat_lock); 545 spin_lock_bh(&nf_nat_lock);
538 hlist_del_rcu(&nat->bysource); 546 hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
545 struct nf_conn_nat *old_nat = old; 553 struct nf_conn_nat *old_nat = old;
546 struct nf_conn *ct = old_nat->ct; 554 struct nf_conn *ct = old_nat->ct;
547 555
548 if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) 556 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
549 return; 557 return;
550 558
551 spin_lock_bh(&nf_nat_lock); 559 spin_lock_bh(&nf_nat_lock);
552 new_nat->ct = ct;
553 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); 560 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
554 spin_unlock_bh(&nf_nat_lock); 561 spin_unlock_bh(&nf_nat_lock);
555} 562}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
679{ 686{
680 /* Leave them the same for the moment. */ 687 /* Leave them the same for the moment. */
681 net->ipv4.nat_htable_size = net->ct.htable_size; 688 net->ipv4.nat_htable_size = net->ct.htable_size;
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 689 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
683 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 690 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 691 return -ENOMEM;
686 return 0; 692 return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
702{ 708{
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 709 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 710 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 711 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
706 net->ipv4.nat_htable_size);
707} 712}
708 713
709static struct pernet_operations nf_nat_net_ops = { 714static struct pernet_operations nf_nat_net_ops = {
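
The nf_nat_core changes do three things: get_unique_tuple() now prefers the untranslated tuple outright when it already lies in the requested range and is unused; the bysource hash is maintained only for SRC manipulations, with the DONE bits set by plain stores under the lock instead of set_bit(); and reads of nf_nat_protos[] taken under nf_nat_lock are annotated via rcu_dereference_protected()/lockdep_is_held(). A userspace sketch of the new tuple-selection order, with all helpers as hypothetical stand-ins:

#include <stdbool.h>
#include <stdio.h>

struct tuple { unsigned int src, dst; };

static bool in_range(const struct tuple *t)      { return t->src >= 100; }
static bool tuple_used(const struct tuple *t)    { (void)t; return false; }
static bool find_existing_map(struct tuple *out) { (void)out; return false; }

static void pick_tuple(const struct tuple *orig, struct tuple *out)
{
	if (in_range(orig) && !tuple_used(orig)) {
		*out = *orig;		/* cheapest: no rewrite at all */
		return;
	}
	if (find_existing_map(out) && !tuple_used(out))
		return;			/* reuse a prior source mapping */

	*out = *orig;
	out->src = 100;			/* stand-in for a range search */
}

int main(void)
{
	struct tuple orig = { .src = 123, .dst = 80 }, picked;

	pick_tuple(&orig, &picked);
	printf("src %u -> %u\n", orig.src, picked.src);
	return 0;
}
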
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
54#include <net/netfilter/nf_conntrack_expect.h> 54#include <net/netfilter/nf_conntrack_expect.h>
55#include <net/netfilter/nf_conntrack_helper.h> 55#include <net/netfilter/nf_conntrack_helper.h>
56#include <net/netfilter/nf_nat_helper.h> 56#include <net/netfilter/nf_nat_helper.h>
57#include <linux/netfilter/nf_conntrack_snmp.h>
57 58
58MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
59MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
1310{ 1311{
1311 int ret = 0; 1312 int ret = 0;
1312 1313
1313 ret = nf_conntrack_helper_register(&snmp_helper); 1314 BUG_ON(nf_nat_snmp_hook != NULL);
1314 if (ret < 0) 1315 rcu_assign_pointer(nf_nat_snmp_hook, help);
1315 return ret; 1316
1316 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1317 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) { 1318 if (ret < 0) {
1318 nf_conntrack_helper_unregister(&snmp_helper); 1319 nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
1323 1324
1324static void __exit nf_nat_snmp_basic_fini(void) 1325static void __exit nf_nat_snmp_basic_fini(void)
1325{ 1326{
1326 nf_conntrack_helper_unregister(&snmp_helper); 1327 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1327 nf_conntrack_helper_unregister(&snmp_trap_helper); 1328 nf_conntrack_helper_unregister(&snmp_trap_helper);
1328} 1329}
1329 1330
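
Instead of registering snmp_helper directly, the module now publishes its handler through the shared nf_nat_snmp_hook pointer and clears it on exit, so the generic SNMP conntrack helper calls into NAT only while this module is loaded. A userspace sketch of the pattern, using C11 release/acquire atomics as stand-ins for rcu_assign_pointer()/rcu_dereference():

#include <stdatomic.h>
#include <stdio.h>

typedef int (*snmp_hook_t)(int pkt);

static _Atomic(snmp_hook_t) nat_snmp_hook;	/* NULL while unloaded */

static int help(int pkt) { return pkt + 1; }	/* the module's handler */

static void module_init_hook(void)
{
	/* release ordering plays the role of rcu_assign_pointer() */
	atomic_store_explicit(&nat_snmp_hook, help, memory_order_release);
}

static void module_exit_hook(void)
{
	atomic_store_explicit(&nat_snmp_hook, NULL, memory_order_release);
}

int main(void)
{
	module_init_hook();

	snmp_hook_t fn = atomic_load_explicit(&nat_snmp_hook,
					      memory_order_acquire);
	if (fn)
		printf("hook says %d\n", fn(41));

	module_exit_hook();
	return 0;
}
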
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 95481fee8bdb..7317bdf1d457 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -31,6 +31,7 @@
31#ifdef CONFIG_XFRM 31#ifdef CONFIG_XFRM
32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) 32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
33{ 33{
34 struct flowi4 *fl4 = &fl->u.ip4;
34 const struct nf_conn *ct; 35 const struct nf_conn *ct;
35 const struct nf_conntrack_tuple *t; 36 const struct nf_conntrack_tuple *t;
36 enum ip_conntrack_info ctinfo; 37 enum ip_conntrack_info ctinfo;
@@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
49 statusbit = IPS_SRC_NAT; 50 statusbit = IPS_SRC_NAT;
50 51
51 if (ct->status & statusbit) { 52 if (ct->status & statusbit) {
52 fl->fl4_dst = t->dst.u3.ip; 53 fl4->daddr = t->dst.u3.ip;
53 if (t->dst.protonum == IPPROTO_TCP || 54 if (t->dst.protonum == IPPROTO_TCP ||
54 t->dst.protonum == IPPROTO_UDP || 55 t->dst.protonum == IPPROTO_UDP ||
55 t->dst.protonum == IPPROTO_UDPLITE || 56 t->dst.protonum == IPPROTO_UDPLITE ||
56 t->dst.protonum == IPPROTO_DCCP || 57 t->dst.protonum == IPPROTO_DCCP ||
57 t->dst.protonum == IPPROTO_SCTP) 58 t->dst.protonum == IPPROTO_SCTP)
58 fl->fl_ip_dport = t->dst.u.tcp.port; 59 fl4->fl4_dport = t->dst.u.tcp.port;
59 } 60 }
60 61
61 statusbit ^= IPS_NAT_MASK; 62 statusbit ^= IPS_NAT_MASK;
62 63
63 if (ct->status & statusbit) { 64 if (ct->status & statusbit) {
64 fl->fl4_src = t->src.u3.ip; 65 fl4->saddr = t->src.u3.ip;
65 if (t->dst.protonum == IPPROTO_TCP || 66 if (t->dst.protonum == IPPROTO_TCP ||
66 t->dst.protonum == IPPROTO_UDP || 67 t->dst.protonum == IPPROTO_UDP ||
67 t->dst.protonum == IPPROTO_UDPLITE || 68 t->dst.protonum == IPPROTO_UDPLITE ||
68 t->dst.protonum == IPPROTO_DCCP || 69 t->dst.protonum == IPPROTO_DCCP ||
69 t->dst.protonum == IPPROTO_SCTP) 70 t->dst.protonum == IPPROTO_SCTP)
70 fl->fl_ip_sport = t->src.u.tcp.port; 71 fl4->fl4_sport = t->src.u.tcp.port;
71 } 72 }
72} 73}
73#endif 74#endif
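
This is a mechanical conversion to the per-family flow key: nat_decode_session() now writes through fl->u.ip4, and the flat fl4_dst/fl4_src/fl_ip_dport/fl_ip_sport names become daddr/saddr/fl4_dport/fl4_sport. A simplified sketch of the mapping, with types reduced to plain integers:

#include <stdint.h>
#include <stdio.h>

struct flowi4_sketch {
	uint32_t daddr;		/* was fl->fl4_dst */
	uint32_t saddr;		/* was fl->fl4_src */
	uint16_t fl4_dport;	/* was fl->fl_ip_dport */
	uint16_t fl4_sport;	/* was fl->fl_ip_sport */
	uint8_t  flowi4_proto;	/* was fl->proto */
	uint8_t  flowi4_tos;	/* was fl->fl4_tos */
};

/* Undo a DNAT in the flow key, as the reply-direction branch above does. */
static void undo_dnat(struct flowi4_sketch *fl4,
		      uint32_t orig_daddr, uint16_t orig_dport)
{
	fl4->daddr = orig_daddr;
	fl4->fl4_dport = orig_dport;
}

int main(void)
{
	struct flowi4_sketch fl4 = { .daddr = 2, .fl4_dport = 8080 };

	undo_dnat(&fl4, 1, 80);
	printf("daddr %u dport %u\n", fl4.daddr, fl4.fl4_dport);
	return 0;
}
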
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a3d5ab786e81..2d3c72e5bbbf 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -76,6 +76,7 @@
76#include <linux/seq_file.h> 76#include <linux/seq_file.h>
77#include <linux/netfilter.h> 77#include <linux/netfilter.h>
78#include <linux/netfilter_ipv4.h> 78#include <linux/netfilter_ipv4.h>
79#include <linux/compat.h>
79 80
80static struct raw_hashinfo raw_v4_hashinfo = { 81static struct raw_hashinfo raw_v4_hashinfo = {
81 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), 82 .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -401,7 +402,7 @@ error:
401 return err; 402 return err;
402} 403}
403 404
404static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) 405static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
405{ 406{
406 struct iovec *iov; 407 struct iovec *iov;
407 u8 __user *type = NULL; 408 u8 __user *type = NULL;
@@ -417,7 +418,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
417 if (!iov) 418 if (!iov)
418 continue; 419 continue;
419 420
420 switch (fl->proto) { 421 switch (fl4->flowi4_proto) {
421 case IPPROTO_ICMP: 422 case IPPROTO_ICMP:
422 /* check if one-byte field is readable or not. */ 423 /* check if one-byte field is readable or not. */
423 if (iov->iov_base && iov->iov_len < 1) 424 if (iov->iov_base && iov->iov_len < 1)
@@ -432,8 +433,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
432 code = iov->iov_base; 433 code = iov->iov_base;
433 434
434 if (type && code) { 435 if (type && code) {
435 if (get_user(fl->fl_icmp_type, type) || 436 if (get_user(fl4->fl4_icmp_type, type) ||
436 get_user(fl->fl_icmp_code, code)) 437 get_user(fl4->fl4_icmp_code, code))
437 return -EFAULT; 438 return -EFAULT;
438 probed = 1; 439 probed = 1;
439 } 440 }
@@ -547,25 +548,31 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
547 } 548 }
548 549
549 { 550 {
550 struct flowi fl = { .oif = ipc.oif, 551 struct flowi4 fl4 = {
551 .mark = sk->sk_mark, 552 .flowi4_oif = ipc.oif,
552 .fl4_dst = daddr, 553 .flowi4_mark = sk->sk_mark,
553 .fl4_src = saddr, 554 .daddr = daddr,
554 .fl4_tos = tos, 555 .saddr = saddr,
555 .proto = inet->hdrincl ? IPPROTO_RAW : 556 .flowi4_tos = tos,
556 sk->sk_protocol, 557 .flowi4_proto = (inet->hdrincl ?
557 }; 558 IPPROTO_RAW :
559 sk->sk_protocol),
560 .flowi4_flags = FLOWI_FLAG_CAN_SLEEP,
561 };
558 if (!inet->hdrincl) { 562 if (!inet->hdrincl) {
559 err = raw_probe_proto_opt(&fl, msg); 563 err = raw_probe_proto_opt(&fl4, msg);
560 if (err) 564 if (err)
561 goto done; 565 goto done;
562 } 566 }
563 567
564 security_sk_classify_flow(sk, &fl); 568 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
565 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); 569 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
570 if (IS_ERR(rt)) {
571 err = PTR_ERR(rt);
572 rt = NULL;
573 goto done;
574 }
566 } 575 }
567 if (err)
568 goto done;
569 576
570 err = -EACCES; 577 err = -EACCES;
571 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) 578 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
@@ -838,6 +845,23 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
838 } 845 }
839} 846}
840 847
848#ifdef CONFIG_COMPAT
849static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
850{
851 switch (cmd) {
852 case SIOCOUTQ:
853 case SIOCINQ:
854 return -ENOIOCTLCMD;
855 default:
856#ifdef CONFIG_IP_MROUTE
857 return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg));
858#else
859 return -ENOIOCTLCMD;
860#endif
861 }
862}
863#endif
864
841struct proto raw_prot = { 865struct proto raw_prot = {
842 .name = "RAW", 866 .name = "RAW",
843 .owner = THIS_MODULE, 867 .owner = THIS_MODULE,
@@ -860,6 +884,7 @@ struct proto raw_prot = {
860#ifdef CONFIG_COMPAT 884#ifdef CONFIG_COMPAT
861 .compat_setsockopt = compat_raw_setsockopt, 885 .compat_setsockopt = compat_raw_setsockopt,
862 .compat_getsockopt = compat_raw_getsockopt, 886 .compat_getsockopt = compat_raw_getsockopt,
887 .compat_ioctl = compat_raw_ioctl,
863#endif 888#endif
864}; 889};
865 890
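
The raw_sendmsg() hunk adopts the error-pointer convention: ip_route_output_flow() now returns the rtable itself, with failures encoded as a small negative errno in the pointer value, so the old **rp output parameter and the trailing if (err) check disappear. A minimal userspace sketch of ERR_PTR()/IS_ERR()/PTR_ERR(), with route_lookup() as an illustrative stand-in:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	/* the top 4095 addresses are reserved for errno values */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *route_lookup(int fail)
{
	if (fail)
		return ERR_PTR(-ENETUNREACH);
	return malloc(16);	/* stand-in for a struct rtable */
}

int main(void)
{
	void *rt = route_lookup(1);

	if (IS_ERR(rt)) {
		printf("lookup failed: %ld\n", PTR_ERR(rt));
		return 1;
	}
	free(rt);
	return 0;
}
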
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 351dc4e85242..4b0c81180804 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,8 +109,8 @@
109#include <linux/sysctl.h> 109#include <linux/sysctl.h>
110#endif 110#endif
111 111
112#define RT_FL_TOS(oldflp) \ 112#define RT_FL_TOS(oldflp4) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) 113 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 114
115#define IP_MAX_MTU 0xFFF0 115#define IP_MAX_MTU 0xFFF0
116 116
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/* 134/*
138 * Interface to generic destination cache. 135 * Interface to generic destination cache.
139 */ 136 */
@@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152{ 149{
153} 150}
154 151
152static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153{
154 struct rtable *rt = (struct rtable *) dst;
155 struct inet_peer *peer;
156 u32 *p = NULL;
157
158 if (!rt->peer)
159 rt_bind_peer(rt, 1);
160
161 peer = rt->peer;
162 if (peer) {
163 u32 *old_p = __DST_METRICS_PTR(old);
164 unsigned long prev, new;
165
166 p = peer->metrics;
167 if (inet_metrics_new(peer))
168 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170 new = (unsigned long) p;
171 prev = cmpxchg(&dst->_metrics, old, new);
172
173 if (prev != old) {
174 p = __DST_METRICS_PTR(prev);
175 if (prev & DST_METRICS_READ_ONLY)
176 p = NULL;
177 } else {
178 if (rt->fi) {
179 fib_info_put(rt->fi);
180 rt->fi = NULL;
181 }
182 }
183 }
184 return p;
185}
186
155static struct dst_ops ipv4_dst_ops = { 187static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET, 188 .family = AF_INET,
157 .protocol = cpu_to_be16(ETH_P_IP), 189 .protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = {
159 .check = ipv4_dst_check, 191 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss, 192 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu, 193 .default_mtu = ipv4_default_mtu,
194 .cow_metrics = ipv4_cow_metrics,
162 .destroy = ipv4_dst_destroy, 195 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown, 196 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice, 197 .negative_advice = ipv4_negative_advice,
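
ipv4_cow_metrics() is the new copy-on-write hook for route metrics: when a writable metric is needed, it copies the shared read-only array into the bound inet_peer's storage and installs the new pointer with a single cmpxchg on dst->_metrics, adopting the winner's copy if another CPU raced ahead (the kernel additionally tags read-only pointers with a flag bit, omitted here). A userspace sketch of the install step, with a GCC builtin standing in for cmpxchg():

#include <stdio.h>
#include <string.h>

#define NMETRICS 16

static unsigned int default_metrics[NMETRICS];	/* shared, treated read-only */
static unsigned int *metrics_ptr;		/* dst->_metrics stand-in */

static unsigned int *cow_metrics(unsigned int *private_copy)
{
	unsigned int *old = default_metrics, *prev;

	memcpy(private_copy, default_metrics, sizeof(default_metrics));

	prev = __sync_val_compare_and_swap(&metrics_ptr, old, private_copy);
	if (prev != old)	/* lost the race: use the winner's copy */
		return prev;
	return private_copy;
}

int main(void)
{
	unsigned int mine[NMETRICS];
	unsigned int *p;

	metrics_ptr = default_metrics;
	p = cow_metrics(mine);
	p[0] = 1500;		/* now safely writable */
	printf("installed %s copy\n", p == mine ? "our" : "another");
	return 0;
}
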
@@ -171,7 +204,7 @@ static struct dst_ops ipv4_dst_ops = {
171 204
172const __u8 ip_tos2prio[16] = { 205const __u8 ip_tos2prio[16] = {
173 TC_PRIO_BESTEFFORT, 206 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(FILLER), 207 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BESTEFFORT, 208 TC_PRIO_BESTEFFORT,
176 ECN_OR_COST(BESTEFFORT), 209 ECN_OR_COST(BESTEFFORT),
177 TC_PRIO_BULK, 210 TC_PRIO_BULK,
@@ -391,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
391 dst_metric(&r->dst, RTAX_WINDOW), 424 dst_metric(&r->dst, RTAX_WINDOW),
392 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 425 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
393 dst_metric(&r->dst, RTAX_RTTVAR)), 426 dst_metric(&r->dst, RTAX_RTTVAR)),
394 r->fl.fl4_tos, 427 r->rt_tos,
395 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, 428 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
396 r->dst.hh ? (r->dst.hh->hh_output == 429 r->dst.hh ? (r->dst.hh->hh_output ==
397 dev_queue_xmit) : 0, 430 dev_queue_xmit) : 0,
@@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
514 .release = seq_release, 547 .release = seq_release,
515}; 548};
516 549
517#ifdef CONFIG_NET_CLS_ROUTE 550#ifdef CONFIG_IP_ROUTE_CLASSID
518static int rt_acct_proc_show(struct seq_file *m, void *v) 551static int rt_acct_proc_show(struct seq_file *m, void *v)
519{ 552{
520 struct ip_rt_acct *dst, *src; 553 struct ip_rt_acct *dst, *src;
@@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
567 if (!pde) 600 if (!pde)
568 goto err2; 601 goto err2;
569 602
570#ifdef CONFIG_NET_CLS_ROUTE 603#ifdef CONFIG_IP_ROUTE_CLASSID
571 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 604 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
572 if (!pde) 605 if (!pde)
573 goto err3; 606 goto err3;
574#endif 607#endif
575 return 0; 608 return 0;
576 609
577#ifdef CONFIG_NET_CLS_ROUTE 610#ifdef CONFIG_IP_ROUTE_CLASSID
578err3: 611err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat); 612 remove_proc_entry("rt_cache", net->proc_net_stat);
580#endif 613#endif
@@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{ 621{
589 remove_proc_entry("rt_cache", net->proc_net_stat); 622 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net); 623 remove_proc_entry("rt_cache", net->proc_net);
591#ifdef CONFIG_NET_CLS_ROUTE 624#ifdef CONFIG_IP_ROUTE_CLASSID
592 remove_proc_entry("rt_acct", net->proc_net); 625 remove_proc_entry("rt_acct", net->proc_net);
593#endif 626#endif
594} 627}
@@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
632static inline int rt_valuable(struct rtable *rth) 665static inline int rt_valuable(struct rtable *rth)
633{ 666{
634 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 667 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
635 rth->dst.expires; 668 (rth->peer && rth->peer->pmtu_expires);
636} 669}
637 670
638static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 671static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
643 if (atomic_read(&rth->dst.__refcnt)) 676 if (atomic_read(&rth->dst.__refcnt))
644 goto out; 677 goto out;
645 678
646 ret = 1;
647 if (rth->dst.expires &&
648 time_after_eq(jiffies, rth->dst.expires))
649 goto out;
650
651 age = jiffies - rth->dst.lastuse; 679 age = jiffies - rth->dst.lastuse;
652 ret = 0;
653 if ((age <= tmo1 && !rt_fast_clean(rth)) || 680 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
654 (age <= tmo2 && rt_valuable(rth))) 681 (age <= tmo2 && rt_valuable(rth)))
655 goto out; 682 goto out;
@@ -684,22 +711,22 @@ static inline bool rt_caching(const struct net *net)
684 net->ipv4.sysctl_rt_cache_rebuild_count; 711 net->ipv4.sysctl_rt_cache_rebuild_count;
685} 712}
686 713
687static inline bool compare_hash_inputs(const struct flowi *fl1, 714static inline bool compare_hash_inputs(const struct rtable *rt1,
688 const struct flowi *fl2) 715 const struct rtable *rt2)
689{ 716{
690 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | 717 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
691 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | 718 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
692 (fl1->iif ^ fl2->iif)) == 0); 719 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
693} 720}
694 721
695static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 722static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
696{ 723{
697 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | 724 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
698 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | 725 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
699 (fl1->mark ^ fl2->mark) | 726 (rt1->rt_mark ^ rt2->rt_mark) |
700 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) | 727 (rt1->rt_tos ^ rt2->rt_tos) |
701 (fl1->oif ^ fl2->oif) | 728 (rt1->rt_oif ^ rt2->rt_oif) |
702 (fl1->iif ^ fl2->iif)) == 0; 729 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
703} 730}
704 731
705static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 732static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
@@ -786,104 +813,13 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
786 const struct rtable *aux = head; 813 const struct rtable *aux = head;
787 814
788 while (aux != rth) { 815 while (aux != rth) {
789 if (compare_hash_inputs(&aux->fl, &rth->fl)) 816 if (compare_hash_inputs(aux, rth))
790 return 0; 817 return 0;
791 aux = rcu_dereference_protected(aux->dst.rt_next, 1); 818 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
792 } 819 }
793 return ONE; 820 return ONE;
794} 821}
795 822
796static void rt_check_expire(void)
797{
798 static unsigned int rover;
799 unsigned int i = rover, goal;
800 struct rtable *rth;
801 struct rtable __rcu **rthp;
802 unsigned long samples = 0;
803 unsigned long sum = 0, sum2 = 0;
804 unsigned long delta;
805 u64 mult;
806
807 delta = jiffies - expires_ljiffies;
808 expires_ljiffies = jiffies;
809 mult = ((u64)delta) << rt_hash_log;
810 if (ip_rt_gc_timeout > 1)
811 do_div(mult, ip_rt_gc_timeout);
812 goal = (unsigned int)mult;
813 if (goal > rt_hash_mask)
814 goal = rt_hash_mask + 1;
815 for (; goal > 0; goal--) {
816 unsigned long tmo = ip_rt_gc_timeout;
817 unsigned long length;
818
819 i = (i + 1) & rt_hash_mask;
820 rthp = &rt_hash_table[i].chain;
821
822 if (need_resched())
823 cond_resched();
824
825 samples++;
826
827 if (rcu_dereference_raw(*rthp) == NULL)
828 continue;
829 length = 0;
830 spin_lock_bh(rt_hash_lock_addr(i));
831 while ((rth = rcu_dereference_protected(*rthp,
832 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
833 prefetch(rth->dst.rt_next);
834 if (rt_is_expired(rth)) {
835 *rthp = rth->dst.rt_next;
836 rt_free(rth);
837 continue;
838 }
839 if (rth->dst.expires) {
840 /* Entry is expired even if it is in use */
841 if (time_before_eq(jiffies, rth->dst.expires)) {
842nofree:
843 tmo >>= 1;
844 rthp = &rth->dst.rt_next;
845 /*
846 * We only count entries on
847 * a chain with equal hash inputs once
848 * so that entries for different QOS
849 * levels, and other non-hash input
850 * attributes don't unfairly skew
851 * the length computation
852 */
853 length += has_noalias(rt_hash_table[i].chain, rth);
854 continue;
855 }
856 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
857 goto nofree;
858
859 /* Cleanup aged off entries. */
860 *rthp = rth->dst.rt_next;
861 rt_free(rth);
862 }
863 spin_unlock_bh(rt_hash_lock_addr(i));
864 sum += length;
865 sum2 += length*length;
866 }
867 if (samples) {
868 unsigned long avg = sum / samples;
869 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
870 rt_chain_length_max = max_t(unsigned long,
871 ip_rt_gc_elasticity,
872 (avg + 4*sd) >> FRACT_BITS);
873 }
874 rover = i;
875}
876
877/*
878 * rt_worker_func() is run in process context.
879 * we call rt_check_expire() to scan part of the hash table
880 */
881static void rt_worker_func(struct work_struct *work)
882{
883 rt_check_expire();
884 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885}
886
887/* 823/*
 888 * Perturbation of rt_genid by a small quantity [1..256] 824
 889 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate() 825
@@ -1078,8 +1014,8 @@ static int slow_chain_length(const struct rtable *head)
1078 return length >> FRACT_BITS; 1014 return length >> FRACT_BITS;
1079} 1015}
1080 1016
1081static int rt_intern_hash(unsigned hash, struct rtable *rt, 1017static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1082 struct rtable **rp, struct sk_buff *skb, int ifindex) 1018 struct sk_buff *skb, int ifindex)
1083{ 1019{
1084 struct rtable *rth, *cand; 1020 struct rtable *rth, *cand;
1085 struct rtable __rcu **rthp, **candp; 1021 struct rtable __rcu **rthp, **candp;
@@ -1120,7 +1056,7 @@ restart:
1120 printk(KERN_WARNING 1056 printk(KERN_WARNING
1121 "Neighbour table failure & not caching routes.\n"); 1057 "Neighbour table failure & not caching routes.\n");
1122 ip_rt_put(rt); 1058 ip_rt_put(rt);
1123 return err; 1059 return ERR_PTR(err);
1124 } 1060 }
1125 } 1061 }
1126 1062
@@ -1137,7 +1073,7 @@ restart:
1137 rt_free(rth); 1073 rt_free(rth);
1138 continue; 1074 continue;
1139 } 1075 }
1140 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1076 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1141 /* Put it first */ 1077 /* Put it first */
1142 *rthp = rth->dst.rt_next; 1078 *rthp = rth->dst.rt_next;
1143 /* 1079 /*
@@ -1157,11 +1093,9 @@ restart:
1157 spin_unlock_bh(rt_hash_lock_addr(hash)); 1093 spin_unlock_bh(rt_hash_lock_addr(hash));
1158 1094
1159 rt_drop(rt); 1095 rt_drop(rt);
1160 if (rp) 1096 if (skb)
1161 *rp = rth;
1162 else
1163 skb_dst_set(skb, &rth->dst); 1097 skb_dst_set(skb, &rth->dst);
1164 return 0; 1098 return rth;
1165 } 1099 }
1166 1100
1167 if (!atomic_read(&rth->dst.__refcnt)) { 1101 if (!atomic_read(&rth->dst.__refcnt)) {
@@ -1202,7 +1136,7 @@ restart:
1202 rt_emergency_hash_rebuild(net); 1136 rt_emergency_hash_rebuild(net);
1203 spin_unlock_bh(rt_hash_lock_addr(hash)); 1137 spin_unlock_bh(rt_hash_lock_addr(hash));
1204 1138
1205 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1139 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1206 ifindex, rt_genid(net)); 1140 ifindex, rt_genid(net));
1207 goto restart; 1141 goto restart;
1208 } 1142 }
@@ -1218,7 +1152,7 @@ restart:
1218 1152
1219 if (err != -ENOBUFS) { 1153 if (err != -ENOBUFS) {
1220 rt_drop(rt); 1154 rt_drop(rt);
1221 return err; 1155 return ERR_PTR(err);
1222 } 1156 }
1223 1157
1224 /* Neighbour tables are full and nothing 1158 /* Neighbour tables are full and nothing
@@ -1239,7 +1173,7 @@ restart:
1239 if (net_ratelimit()) 1173 if (net_ratelimit())
1240 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); 1174 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1241 rt_drop(rt); 1175 rt_drop(rt);
1242 return -ENOBUFS; 1176 return ERR_PTR(-ENOBUFS);
1243 } 1177 }
1244 } 1178 }
1245 1179
@@ -1265,11 +1199,16 @@ restart:
1265 spin_unlock_bh(rt_hash_lock_addr(hash)); 1199 spin_unlock_bh(rt_hash_lock_addr(hash));
1266 1200
1267skip_hashing: 1201skip_hashing:
1268 if (rp) 1202 if (skb)
1269 *rp = rt;
1270 else
1271 skb_dst_set(skb, &rt->dst); 1203 skb_dst_set(skb, &rt->dst);
1272 return 0; 1204 return rt;
1205}
1206
1207static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1208
1209static u32 rt_peer_genid(void)
1210{
1211 return atomic_read(&__rt_peer_genid);
1273} 1212}
1274 1213
1275void rt_bind_peer(struct rtable *rt, int create) 1214void rt_bind_peer(struct rtable *rt, int create)
@@ -1280,6 +1219,8 @@ void rt_bind_peer(struct rtable *rt, int create)
1280 1219
1281 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1220 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1282 inet_putpeer(peer); 1221 inet_putpeer(peer);
1222 else
1223 rt->rt_peer_genid = rt_peer_genid();
1283} 1224}
1284 1225
1285/* 1226/*
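
rt_peer_genid() introduces a global generation counter: it is bumped whenever any peer learns new data (a redirect or a PMTU, as the later hunks show), each rtable snapshots the value when it binds a peer, and a stale snapshot forces revalidation in ipv4_dst_check() further below. A userspace sketch of the scheme:

#include <stdbool.h>
#include <stdio.h>

static int peer_genid;			/* __rt_peer_genid */

struct cached_route {
	int genid;			/* rt->rt_peer_genid */
	unsigned int mtu;
};

static void peer_learned_something(void)
{
	__sync_fetch_and_add(&peer_genid, 1);	/* atomic_inc() */
}

static bool route_check(struct cached_route *rt, unsigned int learned_mtu)
{
	if (rt->genid != peer_genid) {
		rt->mtu = learned_mtu;		/* re-pull peer state */
		rt->genid = peer_genid;
	}
	return true;				/* entry stays usable */
}

int main(void)
{
	struct cached_route rt = { .genid = 0, .mtu = 1500 };

	peer_learned_something();		/* e.g. a PMTU update */
	route_check(&rt, 1400);
	printf("mtu now %u\n", rt.mtu);
	return 0;
}
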
@@ -1349,13 +1290,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
1349void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1290void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1350 __be32 saddr, struct net_device *dev) 1291 __be32 saddr, struct net_device *dev)
1351{ 1292{
1352 int i, k;
1353 struct in_device *in_dev = __in_dev_get_rcu(dev); 1293 struct in_device *in_dev = __in_dev_get_rcu(dev);
1354 struct rtable *rth; 1294 struct inet_peer *peer;
1355 struct rtable __rcu **rthp;
1356 __be32 skeys[2] = { saddr, 0 };
1357 int ikeys[2] = { dev->ifindex, 0 };
1358 struct netevent_redirect netevent;
1359 struct net *net; 1295 struct net *net;
1360 1296
1361 if (!in_dev) 1297 if (!in_dev)
@@ -1367,9 +1303,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1367 ipv4_is_zeronet(new_gw)) 1303 ipv4_is_zeronet(new_gw))
1368 goto reject_redirect; 1304 goto reject_redirect;
1369 1305
1370 if (!rt_caching(net))
1371 goto reject_redirect;
1372
1373 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1306 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1374 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1307 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1375 goto reject_redirect; 1308 goto reject_redirect;
@@ -1380,91 +1313,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1380 goto reject_redirect; 1313 goto reject_redirect;
1381 } 1314 }
1382 1315
1383 for (i = 0; i < 2; i++) { 1316 peer = inet_getpeer_v4(daddr, 1);
1384 for (k = 0; k < 2; k++) { 1317 if (peer) {
1385 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1318 peer->redirect_learned.a4 = new_gw;
1386 rt_genid(net));
1387
1388 rthp = &rt_hash_table[hash].chain;
1389
1390 while ((rth = rcu_dereference(*rthp)) != NULL) {
1391 struct rtable *rt;
1392
1393 if (rth->fl.fl4_dst != daddr ||
1394 rth->fl.fl4_src != skeys[i] ||
1395 rth->fl.oif != ikeys[k] ||
1396 rt_is_input_route(rth) ||
1397 rt_is_expired(rth) ||
1398 !net_eq(dev_net(rth->dst.dev), net)) {
1399 rthp = &rth->dst.rt_next;
1400 continue;
1401 }
1402
1403 if (rth->rt_dst != daddr ||
1404 rth->rt_src != saddr ||
1405 rth->dst.error ||
1406 rth->rt_gateway != old_gw ||
1407 rth->dst.dev != dev)
1408 break;
1409
1410 dst_hold(&rth->dst);
1411
1412 rt = dst_alloc(&ipv4_dst_ops);
1413 if (rt == NULL) {
1414 ip_rt_put(rth);
1415 return;
1416 }
1417
1418 /* Copy all the information. */
1419 *rt = *rth;
1420 rt->dst.__use = 1;
1421 atomic_set(&rt->dst.__refcnt, 1);
1422 rt->dst.child = NULL;
1423 if (rt->dst.dev)
1424 dev_hold(rt->dst.dev);
1425 rt->dst.obsolete = -1;
1426 rt->dst.lastuse = jiffies;
1427 rt->dst.path = &rt->dst;
1428 rt->dst.neighbour = NULL;
1429 rt->dst.hh = NULL;
1430#ifdef CONFIG_XFRM
1431 rt->dst.xfrm = NULL;
1432#endif
1433 rt->rt_genid = rt_genid(net);
1434 rt->rt_flags |= RTCF_REDIRECTED;
1435
1436 /* Gateway is different ... */
1437 rt->rt_gateway = new_gw;
1438
1439 /* Redirect received -> path was valid */
1440 dst_confirm(&rth->dst);
1441
1442 if (rt->peer)
1443 atomic_inc(&rt->peer->refcnt);
1444
1445 if (arp_bind_neighbour(&rt->dst) ||
1446 !(rt->dst.neighbour->nud_state &
1447 NUD_VALID)) {
1448 if (rt->dst.neighbour)
1449 neigh_event_send(rt->dst.neighbour, NULL);
1450 ip_rt_put(rth);
1451 rt_drop(rt);
1452 goto do_next;
1453 }
1454 1319
1455 netevent.old = &rth->dst; 1320 inet_putpeer(peer);
1456 netevent.new = &rt->dst;
1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1458 &netevent);
1459 1321
1460 rt_del(hash, rth); 1322 atomic_inc(&__rt_peer_genid);
1461 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462 ip_rt_put(rt);
1463 goto do_next;
1464 }
1465 do_next:
1466 ;
1467 }
1468 } 1323 }
1469 return; 1324 return;
1470 1325
@@ -1488,18 +1343,24 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1488 if (dst->obsolete > 0) { 1343 if (dst->obsolete > 0) {
1489 ip_rt_put(rt); 1344 ip_rt_put(rt);
1490 ret = NULL; 1345 ret = NULL;
1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1346 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1492 (rt->dst.expires && 1347 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1493 time_after_eq(jiffies, rt->dst.expires))) { 1348 rt->rt_oif,
1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495 rt->fl.oif,
1496 rt_genid(dev_net(dst->dev))); 1349 rt_genid(dev_net(dst->dev)));
1497#if RT_CACHE_DEBUG >= 1 1350#if RT_CACHE_DEBUG >= 1
1498 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", 1351 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1499 &rt->rt_dst, rt->fl.fl4_tos); 1352 &rt->rt_dst, rt->rt_tos);
1500#endif 1353#endif
1501 rt_del(hash, rt); 1354 rt_del(hash, rt);
1502 ret = NULL; 1355 ret = NULL;
1356 } else if (rt->peer &&
1357 rt->peer->pmtu_expires &&
1358 time_after_eq(jiffies, rt->peer->pmtu_expires)) {
1359 unsigned long orig = rt->peer->pmtu_expires;
1360
1361 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1362 dst_metric_set(dst, RTAX_MTU,
1363 rt->peer->pmtu_orig);
1503 } 1364 }
1504 } 1365 }
1505 return ret; 1366 return ret;
@@ -1525,6 +1386,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1525{ 1386{
1526 struct rtable *rt = skb_rtable(skb); 1387 struct rtable *rt = skb_rtable(skb);
1527 struct in_device *in_dev; 1388 struct in_device *in_dev;
1389 struct inet_peer *peer;
1528 int log_martians; 1390 int log_martians;
1529 1391
1530 rcu_read_lock(); 1392 rcu_read_lock();
@@ -1536,33 +1398,41 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1536 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1398 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537 rcu_read_unlock(); 1399 rcu_read_unlock();
1538 1400
1401 if (!rt->peer)
1402 rt_bind_peer(rt, 1);
1403 peer = rt->peer;
1404 if (!peer) {
1405 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1406 return;
1407 }
1408
1539 /* No redirected packets during ip_rt_redirect_silence; 1409 /* No redirected packets during ip_rt_redirect_silence;
1540 * reset the algorithm. 1410 * reset the algorithm.
1541 */ 1411 */
1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) 1412 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1543 rt->dst.rate_tokens = 0; 1413 peer->rate_tokens = 0;
1544 1414
1545 /* Too many ignored redirects; do not send anything 1415 /* Too many ignored redirects; do not send anything
1546 * set dst.rate_last to the last seen redirected packet. 1416 * set dst.rate_last to the last seen redirected packet.
1547 */ 1417 */
1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) { 1418 if (peer->rate_tokens >= ip_rt_redirect_number) {
1549 rt->dst.rate_last = jiffies; 1419 peer->rate_last = jiffies;
1550 return; 1420 return;
1551 } 1421 }
1552 1422
1553 /* Check for load limit; set rate_last to the latest sent 1423 /* Check for load limit; set rate_last to the latest sent
1554 * redirect. 1424 * redirect.
1555 */ 1425 */
1556 if (rt->dst.rate_tokens == 0 || 1426 if (peer->rate_tokens == 0 ||
1557 time_after(jiffies, 1427 time_after(jiffies,
1558 (rt->dst.rate_last + 1428 (peer->rate_last +
1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) { 1429 (ip_rt_redirect_load << peer->rate_tokens)))) {
1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1430 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 rt->dst.rate_last = jiffies; 1431 peer->rate_last = jiffies;
1562 ++rt->dst.rate_tokens; 1432 ++peer->rate_tokens;
1563#ifdef CONFIG_IP_ROUTE_VERBOSE 1433#ifdef CONFIG_IP_ROUTE_VERBOSE
1564 if (log_martians && 1434 if (log_martians &&
1565 rt->dst.rate_tokens == ip_rt_redirect_number && 1435 peer->rate_tokens == ip_rt_redirect_number &&
1566 net_ratelimit()) 1436 net_ratelimit())
1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1437 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568 &rt->rt_src, rt->rt_iif, 1438 &rt->rt_src, rt->rt_iif,
@@ -1574,7 +1444,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1574static int ip_error(struct sk_buff *skb) 1444static int ip_error(struct sk_buff *skb)
1575{ 1445{
1576 struct rtable *rt = skb_rtable(skb); 1446 struct rtable *rt = skb_rtable(skb);
1447 struct inet_peer *peer;
1577 unsigned long now; 1448 unsigned long now;
1449 bool send;
1578 int code; 1450 int code;
1579 1451
1580 switch (rt->dst.error) { 1452 switch (rt->dst.error) {
@@ -1594,15 +1466,24 @@ static int ip_error(struct sk_buff *skb)
1594 break; 1466 break;
1595 } 1467 }
1596 1468
1597 now = jiffies; 1469 if (!rt->peer)
1598 rt->dst.rate_tokens += now - rt->dst.rate_last; 1470 rt_bind_peer(rt, 1);
1599 if (rt->dst.rate_tokens > ip_rt_error_burst) 1471 peer = rt->peer;
1600 rt->dst.rate_tokens = ip_rt_error_burst; 1472
1601 rt->dst.rate_last = now; 1473 send = true;
1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) { 1474 if (peer) {
1603 rt->dst.rate_tokens -= ip_rt_error_cost; 1475 now = jiffies;
1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1476 peer->rate_tokens += now - peer->rate_last;
1477 if (peer->rate_tokens > ip_rt_error_burst)
1478 peer->rate_tokens = ip_rt_error_burst;
1479 peer->rate_last = now;
1480 if (peer->rate_tokens >= ip_rt_error_cost)
1481 peer->rate_tokens -= ip_rt_error_cost;
1482 else
1483 send = false;
1605 } 1484 }
1485 if (send)
1486 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1606 1487
1607out: kfree_skb(skb); 1488out: kfree_skb(skb);
1608 return 0; 1489 return 0;
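
ip_error() now rate-limits ICMP_DEST_UNREACH per destination through the peer's token bucket: elapsed time adds tokens up to ip_rt_error_burst, each ICMP costs ip_rt_error_cost, and with no peer the send is unconditional. The same rate_tokens/rate_last pair drives ip_rt_send_redirect() above. A userspace sketch with the sysctl values hard-coded for illustration:

#include <stdbool.h>
#include <stdio.h>

#define ERROR_BURST (5 * 100)	/* ip_rt_error_burst, in "jiffies" */
#define ERROR_COST  100		/* ip_rt_error_cost */

struct peer_rate {
	unsigned long rate_tokens;
	unsigned long rate_last;
};

static bool may_send_icmp(struct peer_rate *p, unsigned long now)
{
	p->rate_tokens += now - p->rate_last;	/* accrue with time */
	if (p->rate_tokens > ERROR_BURST)
		p->rate_tokens = ERROR_BURST;	/* cap the burst */
	p->rate_last = now;

	if (p->rate_tokens >= ERROR_COST) {
		p->rate_tokens -= ERROR_COST;	/* spend for one ICMP */
		return true;
	}
	return false;
}

int main(void)
{
	struct peer_rate p = { 0, 0 };
	unsigned long t;

	for (t = 0; t < 1000; t += 50)
		printf("t=%lu send=%d\n", t, may_send_icmp(&p, t));
	return 0;
}
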
@@ -1630,88 +1511,140 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1630 unsigned short new_mtu, 1511 unsigned short new_mtu,
1631 struct net_device *dev) 1512 struct net_device *dev)
1632{ 1513{
1633 int i, k;
1634 unsigned short old_mtu = ntohs(iph->tot_len); 1514 unsigned short old_mtu = ntohs(iph->tot_len);
1635 struct rtable *rth;
1636 int ikeys[2] = { dev->ifindex, 0 };
1637 __be32 skeys[2] = { iph->saddr, 0, };
1638 __be32 daddr = iph->daddr;
1639 unsigned short est_mtu = 0; 1515 unsigned short est_mtu = 0;
1516 struct inet_peer *peer;
1640 1517
1641 for (k = 0; k < 2; k++) { 1518 peer = inet_getpeer_v4(iph->daddr, 1);
1642 for (i = 0; i < 2; i++) { 1519 if (peer) {
1643 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1520 unsigned short mtu = new_mtu;
1644 rt_genid(net));
1645
1646 rcu_read_lock();
1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1648 rth = rcu_dereference(rth->dst.rt_next)) {
1649 unsigned short mtu = new_mtu;
1650
1651 if (rth->fl.fl4_dst != daddr ||
1652 rth->fl.fl4_src != skeys[i] ||
1653 rth->rt_dst != daddr ||
1654 rth->rt_src != iph->saddr ||
1655 rth->fl.oif != ikeys[k] ||
1656 rt_is_input_route(rth) ||
1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658 !net_eq(dev_net(rth->dst.dev), net) ||
1659 rt_is_expired(rth))
1660 continue;
1661 1521
1662 if (new_mtu < 68 || new_mtu >= old_mtu) { 1522 if (new_mtu < 68 || new_mtu >= old_mtu) {
1523 /* BSD 4.2 derived systems incorrectly adjust
1524 * tot_len by the IP header length, and report
1525 * a zero MTU in the ICMP message.
1526 */
1527 if (mtu == 0 &&
1528 old_mtu >= 68 + (iph->ihl << 2))
1529 old_mtu -= iph->ihl << 2;
1530 mtu = guess_mtu(old_mtu);
1531 }
1663 1532
1664 /* BSD 4.2 compatibility hack :-( */ 1533 if (mtu < ip_rt_min_pmtu)
1665 if (mtu == 0 && 1534 mtu = ip_rt_min_pmtu;
1666 old_mtu >= dst_mtu(&rth->dst) && 1535 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1667 old_mtu >= 68 + (iph->ihl << 2)) 1536 unsigned long pmtu_expires;
1668 old_mtu -= iph->ihl << 2;
1669 1537
1670 mtu = guess_mtu(old_mtu); 1538 pmtu_expires = jiffies + ip_rt_mtu_expires;
1671 } 1539 if (!pmtu_expires)
1672 if (mtu <= dst_mtu(&rth->dst)) { 1540 pmtu_expires = 1UL;
1673 if (mtu < dst_mtu(&rth->dst)) { 1541
1674 dst_confirm(&rth->dst); 1542 est_mtu = mtu;
1675 if (mtu < ip_rt_min_pmtu) { 1543 peer->pmtu_learned = mtu;
1676 u32 lock = dst_metric(&rth->dst, 1544 peer->pmtu_expires = pmtu_expires;
1677 RTAX_LOCK);
1678 mtu = ip_rt_min_pmtu;
1679 lock |= (1 << RTAX_MTU);
1680 dst_metric_set(&rth->dst, RTAX_LOCK,
1681 lock);
1682 }
1683 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1684 dst_set_expires(&rth->dst,
1685 ip_rt_mtu_expires);
1686 }
1687 est_mtu = mtu;
1688 }
1689 }
1690 rcu_read_unlock();
1691 } 1545 }
1546
1547 inet_putpeer(peer);
1548
1549 atomic_inc(&__rt_peer_genid);
1692 } 1550 }
1693 return est_mtu ? : new_mtu; 1551 return est_mtu ? : new_mtu;
1694} 1552}
1695 1553
1554static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1555{
1556 unsigned long expires = peer->pmtu_expires;
1557
1558 if (time_before(jiffies, expires)) {
1559 u32 orig_dst_mtu = dst_mtu(dst);
1560 if (peer->pmtu_learned < orig_dst_mtu) {
1561 if (!peer->pmtu_orig)
1562 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1563 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1564 }
1565 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1566 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1567}
1568
1696static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1569static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1697{ 1570{
1698 if (dst_mtu(dst) > mtu && mtu >= 68 && 1571 struct rtable *rt = (struct rtable *) dst;
1699 !(dst_metric_locked(dst, RTAX_MTU))) { 1572 struct inet_peer *peer;
1700 if (mtu < ip_rt_min_pmtu) { 1573
1701 u32 lock = dst_metric(dst, RTAX_LOCK); 1574 dst_confirm(dst);
1575
1576 if (!rt->peer)
1577 rt_bind_peer(rt, 1);
1578 peer = rt->peer;
1579 if (peer) {
1580 if (mtu < ip_rt_min_pmtu)
1702 mtu = ip_rt_min_pmtu; 1581 mtu = ip_rt_min_pmtu;
1703 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU)); 1582 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1583 unsigned long pmtu_expires;
1584
1585 pmtu_expires = jiffies + ip_rt_mtu_expires;
1586 if (!pmtu_expires)
1587 pmtu_expires = 1UL;
1588
1589 peer->pmtu_learned = mtu;
1590 peer->pmtu_expires = pmtu_expires;
1591
1592 atomic_inc(&__rt_peer_genid);
1593 rt->rt_peer_genid = rt_peer_genid();
1704 } 1594 }
1705 dst_metric_set(dst, RTAX_MTU, mtu); 1595 check_peer_pmtu(dst, peer);
1706 dst_set_expires(dst, ip_rt_mtu_expires); 1596 }
1707 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1597}
1598
1599static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1600{
1601 struct rtable *rt = (struct rtable *) dst;
1602 __be32 orig_gw = rt->rt_gateway;
1603
1604 dst_confirm(&rt->dst);
1605
1606 neigh_release(rt->dst.neighbour);
1607 rt->dst.neighbour = NULL;
1608
1609 rt->rt_gateway = peer->redirect_learned.a4;
1610 if (arp_bind_neighbour(&rt->dst) ||
1611 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1612 if (rt->dst.neighbour)
1613 neigh_event_send(rt->dst.neighbour, NULL);
1614 rt->rt_gateway = orig_gw;
1615 return -EAGAIN;
1616 } else {
1617 rt->rt_flags |= RTCF_REDIRECTED;
1618 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1619 rt->dst.neighbour);
1708 } 1620 }
1621 return 0;
1709} 1622}
1710 1623
1711static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1624static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1712{ 1625{
1713 if (rt_is_expired((struct rtable *)dst)) 1626 struct rtable *rt = (struct rtable *) dst;
1627
1628 if (rt_is_expired(rt))
1714 return NULL; 1629 return NULL;
1630 if (rt->rt_peer_genid != rt_peer_genid()) {
1631 struct inet_peer *peer;
1632
1633 if (!rt->peer)
1634 rt_bind_peer(rt, 0);
1635
1636 peer = rt->peer;
1637 if (peer && peer->pmtu_expires)
1638 check_peer_pmtu(dst, peer);
1639
1640 if (peer && peer->redirect_learned.a4 &&
1641 peer->redirect_learned.a4 != rt->rt_gateway) {
1642 if (check_peer_redir(dst, peer))
1643 return NULL;
1644 }
1645
1646 rt->rt_peer_genid = rt_peer_genid();
1647 }
1715 return dst; 1648 return dst;
1716} 1649}
1717 1650
@@ -1720,6 +1653,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1720 struct rtable *rt = (struct rtable *) dst; 1653 struct rtable *rt = (struct rtable *) dst;
1721 struct inet_peer *peer = rt->peer; 1654 struct inet_peer *peer = rt->peer;
1722 1655
1656 if (rt->fi) {
1657 fib_info_put(rt->fi);
1658 rt->fi = NULL;
1659 }
1723 if (peer) { 1660 if (peer) {
1724 rt->peer = NULL; 1661 rt->peer = NULL;
1725 inet_putpeer(peer); 1662 inet_putpeer(peer);
@@ -1734,8 +1671,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
1734 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1671 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1735 1672
1736 rt = skb_rtable(skb); 1673 rt = skb_rtable(skb);
1737 if (rt) 1674 if (rt &&
1738 dst_set_expires(&rt->dst, 0); 1675 rt->peer &&
1676 rt->peer->pmtu_expires) {
1677 unsigned long orig = rt->peer->pmtu_expires;
1678
1679 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1680 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1681 }
1739} 1682}
1740 1683
1741static int ip_rt_bug(struct sk_buff *skb) 1684static int ip_rt_bug(struct sk_buff *skb)
@@ -1764,9 +1707,18 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1764 if (rt_is_output_route(rt)) 1707 if (rt_is_output_route(rt))
1765 src = rt->rt_src; 1708 src = rt->rt_src;
1766 else { 1709 else {
1710 struct flowi4 fl4 = {
1711 .daddr = rt->rt_key_dst,
1712 .saddr = rt->rt_key_src,
1713 .flowi4_tos = rt->rt_tos,
1714 .flowi4_oif = rt->rt_oif,
1715 .flowi4_iif = rt->rt_iif,
1716 .flowi4_mark = rt->rt_mark,
1717 };
1718
1767 rcu_read_lock(); 1719 rcu_read_lock();
1768 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) 1720 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1769 src = FIB_RES_PREFSRC(res); 1721 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1770 else 1722 else
1771 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1723 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1772 RT_SCOPE_UNIVERSE); 1724 RT_SCOPE_UNIVERSE);
@@ -1775,7 +1727,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1775 memcpy(addr, &src, 4); 1727 memcpy(addr, &src, 4);
1776} 1728}
1777 1729
1778#ifdef CONFIG_NET_CLS_ROUTE 1730#ifdef CONFIG_IP_ROUTE_CLASSID
1779static void set_class_tag(struct rtable *rt, u32 tag) 1731static void set_class_tag(struct rtable *rt, u32 tag)
1780{ 1732{
1781 if (!(rt->dst.tclassid & 0xFFFF)) 1733 if (!(rt->dst.tclassid & 0xFFFF))
@@ -1815,17 +1767,54 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1815 return mtu; 1767 return mtu;
1816} 1768}
1817 1769
1818static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1770static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4,
1771 struct fib_info *fi)
1772{
1773 struct inet_peer *peer;
1774 int create = 0;
1775
1776 /* If a peer entry exists for this destination, we must hook
1777 * it up in order to get at cached metrics.
1778 */
1779 if (oldflp4 && (oldflp4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1780 create = 1;
1781
1782 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1783 if (peer) {
1784 rt->rt_peer_genid = rt_peer_genid();
1785 if (inet_metrics_new(peer))
1786 memcpy(peer->metrics, fi->fib_metrics,
1787 sizeof(u32) * RTAX_MAX);
1788 dst_init_metrics(&rt->dst, peer->metrics, false);
1789
1790 if (peer->pmtu_expires)
1791 check_peer_pmtu(&rt->dst, peer);
1792 if (peer->redirect_learned.a4 &&
1793 peer->redirect_learned.a4 != rt->rt_gateway) {
1794 rt->rt_gateway = peer->redirect_learned.a4;
1795 rt->rt_flags |= RTCF_REDIRECTED;
1796 }
1797 } else {
1798 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1799 rt->fi = fi;
1800 atomic_inc(&fi->fib_clntref);
1801 }
1802 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1803 }
1804}
1805
1806static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4,
1807 const struct fib_result *res,
1808 struct fib_info *fi, u16 type, u32 itag)
1819{ 1809{
1820 struct dst_entry *dst = &rt->dst; 1810 struct dst_entry *dst = &rt->dst;
1821 struct fib_info *fi = res->fi;
1822 1811
1823 if (fi) { 1812 if (fi) {
1824 if (FIB_RES_GW(*res) && 1813 if (FIB_RES_GW(*res) &&
1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1814 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826 rt->rt_gateway = FIB_RES_GW(*res); 1815 rt->rt_gateway = FIB_RES_GW(*res);
1827 dst_import_metrics(dst, fi->fib_metrics); 1816 rt_init_metrics(rt, oldflp4, fi);
1828#ifdef CONFIG_NET_CLS_ROUTE 1817#ifdef CONFIG_IP_ROUTE_CLASSID
1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1818 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1830#endif 1819#endif
1831 } 1820 }
@@ -1835,13 +1824,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) 1824 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); 1825 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1837 1826
1838#ifdef CONFIG_NET_CLS_ROUTE 1827#ifdef CONFIG_IP_ROUTE_CLASSID
1839#ifdef CONFIG_IP_MULTIPLE_TABLES 1828#ifdef CONFIG_IP_MULTIPLE_TABLES
1840 set_class_tag(rt, fib_rules_tclass(res)); 1829 set_class_tag(rt, fib_rules_tclass(res));
1841#endif 1830#endif
1842 set_class_tag(rt, itag); 1831 set_class_tag(rt, itag);
1843#endif 1832#endif
1844 rt->rt_type = res->type; 1833 rt->rt_type = type;
1834}
1835
1836static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
1837{
1838 struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1);
1839 if (rt) {
1840 rt->dst.obsolete = -1;
1841
1842 rt->dst.flags = DST_HOST |
1843 (nopolicy ? DST_NOPOLICY : 0) |
1844 (noxfrm ? DST_NOXFRM : 0);
1845 }
1846 return rt;
1845} 1847}
1846 1848
1847/* called in rcu_read_lock() section */ 1849/* called in rcu_read_lock() section */
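
rt_init_metrics() decides where a new route's metrics live: in the peer's shared array (seeded from fib_metrics on first use) when a peer exists or is pre-created for FLOWI_FLAG_PRECOW_METRICS, otherwise borrowed read-only from the fib_info with a reference held. rt_dst_alloc(), meanwhile, folds the DST_HOST/DST_NOPOLICY/DST_NOXFRM setup that each caller used to open-code into one helper, as the converted call sites below show. A userspace sketch of that constructor pattern:

#include <stdio.h>
#include <stdlib.h>

#define DST_HOST     0x01
#define DST_NOPOLICY 0x02
#define DST_NOXFRM   0x04

struct rt_sketch {
	int obsolete;
	unsigned int flags;
};

static struct rt_sketch *rt_dst_alloc(int nopolicy, int noxfrm)
{
	struct rt_sketch *rt = calloc(1, sizeof(*rt));

	if (rt) {
		rt->obsolete = -1;	/* common init done once, here */
		rt->flags = DST_HOST |
			    (nopolicy ? DST_NOPOLICY : 0) |
			    (noxfrm ? DST_NOXFRM : 0);
	}
	return rt;
}

int main(void)
{
	struct rt_sketch *rt = rt_dst_alloc(1, 0);

	if (rt)
		printf("flags %#x\n", rt->flags);
	free(rt);
	return 0;
}
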
@@ -1874,31 +1876,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1874 if (err < 0) 1876 if (err < 0)
1875 goto e_err; 1877 goto e_err;
1876 } 1878 }
1877 rth = dst_alloc(&ipv4_dst_ops); 1879 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1878 if (!rth) 1880 if (!rth)
1879 goto e_nobufs; 1881 goto e_nobufs;
1880 1882
1881 rth->dst.output = ip_rt_bug; 1883 rth->dst.output = ip_rt_bug;
1882 rth->dst.obsolete = -1;
1883 1884
1884 atomic_set(&rth->dst.__refcnt, 1); 1885 rth->rt_key_dst = daddr;
1885 rth->dst.flags= DST_HOST;
1886 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1887 rth->dst.flags |= DST_NOPOLICY;
1888 rth->fl.fl4_dst = daddr;
1889 rth->rt_dst = daddr; 1886 rth->rt_dst = daddr;
1890 rth->fl.fl4_tos = tos; 1887 rth->rt_tos = tos;
1891 rth->fl.mark = skb->mark; 1888 rth->rt_mark = skb->mark;
1892 rth->fl.fl4_src = saddr; 1889 rth->rt_key_src = saddr;
1893 rth->rt_src = saddr; 1890 rth->rt_src = saddr;
1894#ifdef CONFIG_NET_CLS_ROUTE 1891#ifdef CONFIG_IP_ROUTE_CLASSID
1895 rth->dst.tclassid = itag; 1892 rth->dst.tclassid = itag;
1896#endif 1893#endif
1897 rth->rt_iif = 1894 rth->rt_iif = dev->ifindex;
1898 rth->fl.iif = dev->ifindex;
1899 rth->dst.dev = init_net.loopback_dev; 1895 rth->dst.dev = init_net.loopback_dev;
1900 dev_hold(rth->dst.dev); 1896 dev_hold(rth->dst.dev);
1901 rth->fl.oif = 0; 1897 rth->rt_oif = 0;
1902 rth->rt_gateway = daddr; 1898 rth->rt_gateway = daddr;
1903 rth->rt_spec_dst= spec_dst; 1899 rth->rt_spec_dst= spec_dst;
1904 rth->rt_genid = rt_genid(dev_net(dev)); 1900 rth->rt_genid = rt_genid(dev_net(dev));
@@ -1916,7 +1912,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1916 RT_CACHE_STAT_INC(in_slow_mc); 1912 RT_CACHE_STAT_INC(in_slow_mc);
1917 1913
1918 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1914 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1919 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); 1915 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1916 err = 0;
1917 if (IS_ERR(rth))
1918 err = PTR_ERR(rth);
1920 1919
1921e_nobufs: 1920e_nobufs:
1922 return -ENOBUFS; 1921 return -ENOBUFS;
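
rt_intern_hash() now reports failure through the returned pointer instead of an int, so the caller above converts with IS_ERR()/PTR_ERR(). A minimal userspace re-creation of that convention (the real macros live in <linux/err.h>; the lookup function is a made-up stand-in):

    #include <errno.h>
    #include <stdio.h>

    /* Simplified copies of the <linux/err.h> helpers: errno values are
     * encoded in the top 4095 addresses, which no valid pointer uses. */
    #define MAX_ERRNO 4095
    static void *ERR_PTR(long error) { return (void *)error; }
    static long PTR_ERR(const void *ptr) { return (long)ptr; }
    static int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    /* A lookup that reports failure in the pointer itself, the way
     * rt_intern_hash() does after this patch. */
    static void *lookup(int ok)
    {
        static int obj = 42;
        return ok ? (void *)&obj : ERR_PTR(-ENOBUFS);
    }

    int main(void)
    {
        void *p = lookup(0);
        int err = 0;

        if (IS_ERR(p))
            err = PTR_ERR(p);   /* -ENOBUFS, as in ip_route_input_mc() */
        printf("err = %d\n", err);
        return 0;
    }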
@@ -1959,7 +1958,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1959 1958
1960/* called in rcu_read_lock() section */ 1959/* called in rcu_read_lock() section */
1961static int __mkroute_input(struct sk_buff *skb, 1960static int __mkroute_input(struct sk_buff *skb,
1962 struct fib_result *res, 1961 const struct fib_result *res,
1963 struct in_device *in_dev, 1962 struct in_device *in_dev,
1964 __be32 daddr, __be32 saddr, u32 tos, 1963 __be32 daddr, __be32 saddr, u32 tos,
1965 struct rtable **result) 1964 struct rtable **result)
@@ -2013,39 +2012,31 @@ static int __mkroute_input(struct sk_buff *skb,
2013 } 2012 }
2014 } 2013 }
2015 2014
2016 2015 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2017 rth = dst_alloc(&ipv4_dst_ops); 2016 IN_DEV_CONF_GET(out_dev, NOXFRM));
2018 if (!rth) { 2017 if (!rth) {
2019 err = -ENOBUFS; 2018 err = -ENOBUFS;
2020 goto cleanup; 2019 goto cleanup;
2021 } 2020 }
2022 2021
2023 atomic_set(&rth->dst.__refcnt, 1); 2022 rth->rt_key_dst = daddr;
2024 rth->dst.flags= DST_HOST;
2025 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026 rth->dst.flags |= DST_NOPOLICY;
2027 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028 rth->dst.flags |= DST_NOXFRM;
2029 rth->fl.fl4_dst = daddr;
2030 rth->rt_dst = daddr; 2023 rth->rt_dst = daddr;
2031 rth->fl.fl4_tos = tos; 2024 rth->rt_tos = tos;
2032 rth->fl.mark = skb->mark; 2025 rth->rt_mark = skb->mark;
2033 rth->fl.fl4_src = saddr; 2026 rth->rt_key_src = saddr;
2034 rth->rt_src = saddr; 2027 rth->rt_src = saddr;
2035 rth->rt_gateway = daddr; 2028 rth->rt_gateway = daddr;
2036 rth->rt_iif = 2029 rth->rt_iif = in_dev->dev->ifindex;
2037 rth->fl.iif = in_dev->dev->ifindex;
2038 rth->dst.dev = (out_dev)->dev; 2030 rth->dst.dev = (out_dev)->dev;
2039 dev_hold(rth->dst.dev); 2031 dev_hold(rth->dst.dev);
2040 rth->fl.oif = 0; 2032 rth->rt_oif = 0;
2041 rth->rt_spec_dst= spec_dst; 2033 rth->rt_spec_dst= spec_dst;
2042 2034
2043 rth->dst.obsolete = -1;
2044 rth->dst.input = ip_forward; 2035 rth->dst.input = ip_forward;
2045 rth->dst.output = ip_output; 2036 rth->dst.output = ip_output;
2046 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2037 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2047 2038
2048 rt_set_nexthop(rth, res, itag); 2039 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2049 2040
2050 rth->rt_flags = flags; 2041 rth->rt_flags = flags;
2051 2042
@@ -2057,7 +2048,7 @@ static int __mkroute_input(struct sk_buff *skb,
2057 2048
2058static int ip_mkroute_input(struct sk_buff *skb, 2049static int ip_mkroute_input(struct sk_buff *skb,
2059 struct fib_result *res, 2050 struct fib_result *res,
2060 const struct flowi *fl, 2051 const struct flowi4 *fl4,
2061 struct in_device *in_dev, 2052 struct in_device *in_dev,
2062 __be32 daddr, __be32 saddr, u32 tos) 2053 __be32 daddr, __be32 saddr, u32 tos)
2063{ 2054{
@@ -2066,8 +2057,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
2066 unsigned hash; 2057 unsigned hash;
2067 2058
2068#ifdef CONFIG_IP_ROUTE_MULTIPATH 2059#ifdef CONFIG_IP_ROUTE_MULTIPATH
2069 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) 2060 if (res->fi && res->fi->fib_nhs > 1)
2070 fib_select_multipath(fl, res); 2061 fib_select_multipath(res);
2071#endif 2062#endif
2072 2063
2073 /* create a routing cache entry */ 2064 /* create a routing cache entry */
@@ -2076,9 +2067,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
2076 return err; 2067 return err;
2077 2068
2078 /* put it into the cache */ 2069 /* put it into the cache */
2079 hash = rt_hash(daddr, saddr, fl->iif, 2070 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2080 rt_genid(dev_net(rth->dst.dev))); 2071 rt_genid(dev_net(rth->dst.dev)));
2081 return rt_intern_hash(hash, rth, NULL, skb, fl->iif); 2072 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2073 if (IS_ERR(rth))
2074 return PTR_ERR(rth);
2075 return 0;
2082} 2076}
2083 2077
2084/* 2078/*
@@ -2097,12 +2091,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2097{ 2091{
2098 struct fib_result res; 2092 struct fib_result res;
2099 struct in_device *in_dev = __in_dev_get_rcu(dev); 2093 struct in_device *in_dev = __in_dev_get_rcu(dev);
2100 struct flowi fl = { .fl4_dst = daddr, 2094 struct flowi4 fl4;
2101 .fl4_src = saddr,
2102 .fl4_tos = tos,
2103 .fl4_scope = RT_SCOPE_UNIVERSE,
2104 .mark = skb->mark,
2105 .iif = dev->ifindex };
2106 unsigned flags = 0; 2095 unsigned flags = 0;
2107 u32 itag = 0; 2096 u32 itag = 0;
2108 struct rtable * rth; 2097 struct rtable * rth;
@@ -2139,7 +2128,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2139 /* 2128 /*
2140 * Now we are ready to route packet. 2129 * Now we are ready to route packet.
2141 */ 2130 */
2142 err = fib_lookup(net, &fl, &res); 2131 fl4.flowi4_oif = 0;
2132 fl4.flowi4_iif = dev->ifindex;
2133 fl4.flowi4_mark = skb->mark;
2134 fl4.flowi4_tos = tos;
2135 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2136 fl4.daddr = daddr;
2137 fl4.saddr = saddr;
2138 err = fib_lookup(net, &fl4, &res);
2143 if (err != 0) { 2139 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2140 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2141 goto e_hostunreach;
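
The lookup key is now filled field by field on an on-stack struct flowi4; members not assigned stay uninitialized, which is safe only because fib_lookup() reads just these fields. A designated initializer would zero the rest as well; a standalone sketch using a toy stand-in for struct flowi4 (field names as in this patch, types illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-in for the kernel's struct flowi4 -- only the fields
     * this hunk touches, with made-up types and widths. */
    struct flowi4 {
        int      flowi4_oif, flowi4_iif;
        uint32_t flowi4_mark;
        uint8_t  flowi4_tos, flowi4_scope;
        uint32_t daddr, saddr;
    };

    int main(void)
    {
        /* Members not named are zeroed, so flowi4_oif = 0 comes free. */
        struct flowi4 fl4 = {
            .flowi4_iif   = 2,
            .flowi4_mark  = 0,
            .flowi4_tos   = 0x10,
            .flowi4_scope = 0,          /* RT_SCOPE_UNIVERSE */
            .daddr        = 0x0a000001, /* 10.0.0.1 */
            .saddr        = 0x0a000002, /* 10.0.0.2 */
        };

        printf("oif=%d tos=%#x\n", fl4.flowi4_oif, (unsigned)fl4.flowi4_tos);
        return 0;
    }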
@@ -2168,7 +2164,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2168 if (res.type != RTN_UNICAST) 2164 if (res.type != RTN_UNICAST)
2169 goto martian_destination; 2165 goto martian_destination;
2170 2166
2171 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2167 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2172out: return err; 2168out: return err;
2173 2169
2174brd_input: 2170brd_input:
@@ -2190,29 +2186,23 @@ brd_input:
2190 RT_CACHE_STAT_INC(in_brd); 2186 RT_CACHE_STAT_INC(in_brd);
2191 2187
2192local_input: 2188local_input:
2193 rth = dst_alloc(&ipv4_dst_ops); 2189 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2194 if (!rth) 2190 if (!rth)
2195 goto e_nobufs; 2191 goto e_nobufs;
2196 2192
2197 rth->dst.output= ip_rt_bug; 2193 rth->dst.output= ip_rt_bug;
2198 rth->dst.obsolete = -1;
2199 rth->rt_genid = rt_genid(net); 2194 rth->rt_genid = rt_genid(net);
2200 2195
2201 atomic_set(&rth->dst.__refcnt, 1); 2196 rth->rt_key_dst = daddr;
2202 rth->dst.flags= DST_HOST;
2203 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2204 rth->dst.flags |= DST_NOPOLICY;
2205 rth->fl.fl4_dst = daddr;
2206 rth->rt_dst = daddr; 2197 rth->rt_dst = daddr;
2207 rth->fl.fl4_tos = tos; 2198 rth->rt_tos = tos;
2208 rth->fl.mark = skb->mark; 2199 rth->rt_mark = skb->mark;
2209 rth->fl.fl4_src = saddr; 2200 rth->rt_key_src = saddr;
2210 rth->rt_src = saddr; 2201 rth->rt_src = saddr;
2211#ifdef CONFIG_NET_CLS_ROUTE 2202#ifdef CONFIG_IP_ROUTE_CLASSID
2212 rth->dst.tclassid = itag; 2203 rth->dst.tclassid = itag;
2213#endif 2204#endif
2214 rth->rt_iif = 2205 rth->rt_iif = dev->ifindex;
2215 rth->fl.iif = dev->ifindex;
2216 rth->dst.dev = net->loopback_dev; 2206 rth->dst.dev = net->loopback_dev;
2217 dev_hold(rth->dst.dev); 2207 dev_hold(rth->dst.dev);
2218 rth->rt_gateway = daddr; 2208 rth->rt_gateway = daddr;
@@ -2225,8 +2215,11 @@ local_input:
2225 rth->rt_flags &= ~RTCF_LOCAL; 2215 rth->rt_flags &= ~RTCF_LOCAL;
2226 } 2216 }
2227 rth->rt_type = res.type; 2217 rth->rt_type = res.type;
2228 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2218 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2229 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2219 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2220 err = 0;
2221 if (IS_ERR(rth))
2222 err = PTR_ERR(rth);
2230 goto out; 2223 goto out;
2231 2224
2232no_route: 2225no_route:
@@ -2288,12 +2281,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2288 2281
2289 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2282 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2290 rth = rcu_dereference(rth->dst.rt_next)) { 2283 rth = rcu_dereference(rth->dst.rt_next)) {
2291 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | 2284 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2292 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | 2285 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2293 (rth->fl.iif ^ iif) | 2286 (rth->rt_iif ^ iif) |
2294 rth->fl.oif | 2287 rth->rt_oif |
2295 (rth->fl.fl4_tos ^ tos)) == 0 && 2288 (rth->rt_tos ^ tos)) == 0 &&
2296 rth->fl.mark == skb->mark && 2289 rth->rt_mark == skb->mark &&
2297 net_eq(dev_net(rth->dst.dev), net) && 2290 net_eq(dev_net(rth->dst.dev), net) &&
2298 !rt_is_expired(rth)) { 2291 !rt_is_expired(rth)) {
2299 if (noref) { 2292 if (noref) {
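
The cache compare above XORs each renamed key field (rt_key_dst, rt_key_src, rt_iif, rt_oif, rt_tos) against the packet and ORs the deltas together, so a single test against zero covers every field without branches. A standalone demonstration of the idiom (values illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Zero iff every field pair matches; one compare, no branches. */
    static int keys_match(uint32_t dst1, uint32_t dst2,
                          uint32_t src1, uint32_t src2,
                          int iif1, int iif2,
                          uint8_t tos1, uint8_t tos2)
    {
        return ((dst1 ^ dst2) |
                (src1 ^ src2) |
                (uint32_t)(iif1 ^ iif2) |
                (uint32_t)(tos1 ^ tos2)) == 0;
    }

    int main(void)
    {
        printf("%d\n", keys_match(1, 1, 2, 2, 3, 3, 0x10, 0x10)); /* 1 */
        printf("%d\n", keys_match(1, 1, 2, 2, 3, 4, 0x10, 0x10)); /* 0 */
        return 0;
    }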
@@ -2326,8 +2319,8 @@ skip_cache:
2326 struct in_device *in_dev = __in_dev_get_rcu(dev); 2319 struct in_device *in_dev = __in_dev_get_rcu(dev);
2327 2320
2328 if (in_dev) { 2321 if (in_dev) {
2329 int our = ip_check_mc(in_dev, daddr, saddr, 2322 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2330 ip_hdr(skb)->protocol); 2323 ip_hdr(skb)->protocol);
2331 if (our 2324 if (our
2332#ifdef CONFIG_IP_MROUTE 2325#ifdef CONFIG_IP_MROUTE
2333 || 2326 ||
@@ -2351,98 +2344,91 @@ skip_cache:
2351EXPORT_SYMBOL(ip_route_input_common); 2344EXPORT_SYMBOL(ip_route_input_common);
2352 2345
2353/* called with rcu_read_lock() */ 2346/* called with rcu_read_lock() */
2354static int __mkroute_output(struct rtable **result, 2347static struct rtable *__mkroute_output(const struct fib_result *res,
2355 struct fib_result *res, 2348 const struct flowi4 *fl4,
2356 const struct flowi *fl, 2349 const struct flowi4 *oldflp4,
2357 const struct flowi *oldflp, 2350 struct net_device *dev_out,
2358 struct net_device *dev_out, 2351 unsigned int flags)
2359 unsigned flags)
2360{ 2352{
2361 struct rtable *rth; 2353 struct fib_info *fi = res->fi;
2354 u32 tos = RT_FL_TOS(oldflp4);
2362 struct in_device *in_dev; 2355 struct in_device *in_dev;
2363 u32 tos = RT_FL_TOS(oldflp); 2356 u16 type = res->type;
2357 struct rtable *rth;
2364 2358
2365 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) 2359 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2366 return -EINVAL; 2360 return ERR_PTR(-EINVAL);
2367 2361
2368 if (ipv4_is_lbcast(fl->fl4_dst)) 2362 if (ipv4_is_lbcast(fl4->daddr))
2369 res->type = RTN_BROADCAST; 2363 type = RTN_BROADCAST;
2370 else if (ipv4_is_multicast(fl->fl4_dst)) 2364 else if (ipv4_is_multicast(fl4->daddr))
2371 res->type = RTN_MULTICAST; 2365 type = RTN_MULTICAST;
2372 else if (ipv4_is_zeronet(fl->fl4_dst)) 2366 else if (ipv4_is_zeronet(fl4->daddr))
2373 return -EINVAL; 2367 return ERR_PTR(-EINVAL);
2374 2368
2375 if (dev_out->flags & IFF_LOOPBACK) 2369 if (dev_out->flags & IFF_LOOPBACK)
2376 flags |= RTCF_LOCAL; 2370 flags |= RTCF_LOCAL;
2377 2371
2378 in_dev = __in_dev_get_rcu(dev_out); 2372 in_dev = __in_dev_get_rcu(dev_out);
2379 if (!in_dev) 2373 if (!in_dev)
2380 return -EINVAL; 2374 return ERR_PTR(-EINVAL);
2381 2375
2382 if (res->type == RTN_BROADCAST) { 2376 if (type == RTN_BROADCAST) {
2383 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2377 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384 res->fi = NULL; 2378 fi = NULL;
2385 } else if (res->type == RTN_MULTICAST) { 2379 } else if (type == RTN_MULTICAST) {
2386 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2380 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2387 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2381 if (!ip_check_mc_rcu(in_dev, oldflp4->daddr, oldflp4->saddr,
2388 oldflp->proto)) 2382 oldflp4->flowi4_proto))
2389 flags &= ~RTCF_LOCAL; 2383 flags &= ~RTCF_LOCAL;
 2390 /* If a multicast route does not exist, use the 2384 /* If a multicast route does not exist, use the
 2391 * default one, but do not set a gateway in this case. 2385 * default one, but do not set a gateway in this case.
 2392 * Yes, it is a hack. 2386 * Yes, it is a hack.
2393 */ 2387 */
2394 if (res->fi && res->prefixlen < 4) 2388 if (fi && res->prefixlen < 4)
2395 res->fi = NULL; 2389 fi = NULL;
2396 } 2390 }
2397 2391
2398 2392 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2399 rth = dst_alloc(&ipv4_dst_ops); 2393 IN_DEV_CONF_GET(in_dev, NOXFRM));
2400 if (!rth) 2394 if (!rth)
2401 return -ENOBUFS; 2395 return ERR_PTR(-ENOBUFS);
2402 2396
2403 atomic_set(&rth->dst.__refcnt, 1); 2397 rth->rt_key_dst = oldflp4->daddr;
2404 rth->dst.flags= DST_HOST; 2398 rth->rt_tos = tos;
2405 if (IN_DEV_CONF_GET(in_dev, NOXFRM)) 2399 rth->rt_key_src = oldflp4->saddr;
2406 rth->dst.flags |= DST_NOXFRM; 2400 rth->rt_oif = oldflp4->flowi4_oif;
2407 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2401 rth->rt_mark = oldflp4->flowi4_mark;
2408 rth->dst.flags |= DST_NOPOLICY; 2402 rth->rt_dst = fl4->daddr;
2409 2403 rth->rt_src = fl4->saddr;
2410 rth->fl.fl4_dst = oldflp->fl4_dst; 2404 rth->rt_iif = 0;
2411 rth->fl.fl4_tos = tos;
2412 rth->fl.fl4_src = oldflp->fl4_src;
2413 rth->fl.oif = oldflp->oif;
2414 rth->fl.mark = oldflp->mark;
2415 rth->rt_dst = fl->fl4_dst;
2416 rth->rt_src = fl->fl4_src;
2417 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
 2418 /* get references to the devices that are to be held by the routing 2405 /* get references to the devices that are to be held by the routing
2419 cache entry */ 2406 cache entry */
2420 rth->dst.dev = dev_out; 2407 rth->dst.dev = dev_out;
2421 dev_hold(dev_out); 2408 dev_hold(dev_out);
2422 rth->rt_gateway = fl->fl4_dst; 2409 rth->rt_gateway = fl4->daddr;
2423 rth->rt_spec_dst= fl->fl4_src; 2410 rth->rt_spec_dst= fl4->saddr;
2424 2411
2425 rth->dst.output=ip_output; 2412 rth->dst.output=ip_output;
2426 rth->dst.obsolete = -1;
2427 rth->rt_genid = rt_genid(dev_net(dev_out)); 2413 rth->rt_genid = rt_genid(dev_net(dev_out));
2428 2414
2429 RT_CACHE_STAT_INC(out_slow_tot); 2415 RT_CACHE_STAT_INC(out_slow_tot);
2430 2416
2431 if (flags & RTCF_LOCAL) { 2417 if (flags & RTCF_LOCAL) {
2432 rth->dst.input = ip_local_deliver; 2418 rth->dst.input = ip_local_deliver;
2433 rth->rt_spec_dst = fl->fl4_dst; 2419 rth->rt_spec_dst = fl4->daddr;
2434 } 2420 }
2435 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2421 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2436 rth->rt_spec_dst = fl->fl4_src; 2422 rth->rt_spec_dst = fl4->saddr;
2437 if (flags & RTCF_LOCAL && 2423 if (flags & RTCF_LOCAL &&
2438 !(dev_out->flags & IFF_LOOPBACK)) { 2424 !(dev_out->flags & IFF_LOOPBACK)) {
2439 rth->dst.output = ip_mc_output; 2425 rth->dst.output = ip_mc_output;
2440 RT_CACHE_STAT_INC(out_slow_mc); 2426 RT_CACHE_STAT_INC(out_slow_mc);
2441 } 2427 }
2442#ifdef CONFIG_IP_MROUTE 2428#ifdef CONFIG_IP_MROUTE
2443 if (res->type == RTN_MULTICAST) { 2429 if (type == RTN_MULTICAST) {
2444 if (IN_DEV_MFORWARD(in_dev) && 2430 if (IN_DEV_MFORWARD(in_dev) &&
2445 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2431 !ipv4_is_local_multicast(oldflp4->daddr)) {
2446 rth->dst.input = ip_mr_input; 2432 rth->dst.input = ip_mr_input;
2447 rth->dst.output = ip_mc_output; 2433 rth->dst.output = ip_mc_output;
2448 } 2434 }
@@ -2450,31 +2436,10 @@ static int __mkroute_output(struct rtable **result,
2450#endif 2436#endif
2451 } 2437 }
2452 2438
2453 rt_set_nexthop(rth, res, 0); 2439 rt_set_nexthop(rth, oldflp4, res, fi, type, 0);
2454 2440
2455 rth->rt_flags = flags; 2441 rth->rt_flags = flags;
2456 *result = rth; 2442 return rth;
2457 return 0;
2458}
2459
2460/* called with rcu_read_lock() */
2461static int ip_mkroute_output(struct rtable **rp,
2462 struct fib_result *res,
2463 const struct flowi *fl,
2464 const struct flowi *oldflp,
2465 struct net_device *dev_out,
2466 unsigned flags)
2467{
2468 struct rtable *rth = NULL;
2469 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2470 unsigned hash;
2471 if (err == 0) {
2472 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2473 rt_genid(dev_net(dev_out)));
2474 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2475 }
2476
2477 return err;
2478} 2443}
2479 2444
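
The deleted ip_mkroute_output() wrapper shows the calling-convention change this patch applies throughout: an int status plus a struct rtable ** out-parameter becomes a returned pointer with the error encoded in it. A compact userspace sketch of the two shapes (function names hypothetical; ERR_PTR/IS_ERR open-coded for brevity):

    #include <errno.h>
    #include <stdlib.h>

    struct rtable { int dummy; };

    /* Old shape: int status plus an out-parameter. */
    static int make_route_old(struct rtable **result)
    {
        struct rtable *rt = malloc(sizeof(*rt));
        if (!rt)
            return -ENOBUFS;
        *result = rt;
        return 0;
    }

    /* New shape: the pointer carries both the object and the error. */
    static struct rtable *make_route_new(void)
    {
        struct rtable *rt = malloc(sizeof(*rt));
        if (!rt)
            return (struct rtable *)(long)-ENOBUFS; /* ERR_PTR(-ENOBUFS) */
        return rt;
    }

    int main(void)
    {
        struct rtable *rt = NULL;

        if (make_route_old(&rt) == 0)
            free(rt);

        rt = make_route_new();
        if ((unsigned long)rt < (unsigned long)-4095L) /* !IS_ERR(rt) */
            free(rt);
        return 0;
    }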
2480/* 2445/*
@@ -2482,34 +2447,36 @@ static int ip_mkroute_output(struct rtable **rp,
2482 * called with rcu_read_lock(); 2447 * called with rcu_read_lock();
2483 */ 2448 */
2484 2449
2485static int ip_route_output_slow(struct net *net, struct rtable **rp, 2450static struct rtable *ip_route_output_slow(struct net *net,
2486 const struct flowi *oldflp) 2451 const struct flowi4 *oldflp4)
2487{ 2452{
2488 u32 tos = RT_FL_TOS(oldflp); 2453 u32 tos = RT_FL_TOS(oldflp4);
2489 struct flowi fl = { .fl4_dst = oldflp->fl4_dst, 2454 struct flowi4 fl4;
2490 .fl4_src = oldflp->fl4_src,
2491 .fl4_tos = tos & IPTOS_RT_MASK,
2492 .fl4_scope = ((tos & RTO_ONLINK) ?
2493 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2494 .mark = oldflp->mark,
2495 .iif = net->loopback_dev->ifindex,
2496 .oif = oldflp->oif };
2497 struct fib_result res; 2455 struct fib_result res;
2498 unsigned int flags = 0; 2456 unsigned int flags = 0;
2499 struct net_device *dev_out = NULL; 2457 struct net_device *dev_out = NULL;
2500 int err; 2458 struct rtable *rth;
2501
2502 2459
2503 res.fi = NULL; 2460 res.fi = NULL;
2504#ifdef CONFIG_IP_MULTIPLE_TABLES 2461#ifdef CONFIG_IP_MULTIPLE_TABLES
2505 res.r = NULL; 2462 res.r = NULL;
2506#endif 2463#endif
2507 2464
2508 if (oldflp->fl4_src) { 2465 fl4.flowi4_oif = oldflp4->flowi4_oif;
2509 err = -EINVAL; 2466 fl4.flowi4_iif = net->loopback_dev->ifindex;
2510 if (ipv4_is_multicast(oldflp->fl4_src) || 2467 fl4.flowi4_mark = oldflp4->flowi4_mark;
2511 ipv4_is_lbcast(oldflp->fl4_src) || 2468 fl4.daddr = oldflp4->daddr;
2512 ipv4_is_zeronet(oldflp->fl4_src)) 2469 fl4.saddr = oldflp4->saddr;
2470 fl4.flowi4_tos = tos & IPTOS_RT_MASK;
2471 fl4.flowi4_scope = ((tos & RTO_ONLINK) ?
2472 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2473
2474 rcu_read_lock();
2475 if (oldflp4->saddr) {
2476 rth = ERR_PTR(-EINVAL);
2477 if (ipv4_is_multicast(oldflp4->saddr) ||
2478 ipv4_is_lbcast(oldflp4->saddr) ||
2479 ipv4_is_zeronet(oldflp4->saddr))
2513 goto out; 2480 goto out;
2514 2481
2515 /* I removed check for oif == dev_out->oif here. 2482 /* I removed check for oif == dev_out->oif here.
@@ -2520,11 +2487,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2520 of another iface. --ANK 2487 of another iface. --ANK
2521 */ 2488 */
2522 2489
2523 if (oldflp->oif == 0 && 2490 if (oldflp4->flowi4_oif == 0 &&
2524 (ipv4_is_multicast(oldflp->fl4_dst) || 2491 (ipv4_is_multicast(oldflp4->daddr) ||
2525 ipv4_is_lbcast(oldflp->fl4_dst))) { 2492 ipv4_is_lbcast(oldflp4->daddr))) {
2526 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2493 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2527 dev_out = __ip_dev_find(net, oldflp->fl4_src, false); 2494 dev_out = __ip_dev_find(net, oldflp4->saddr, false);
2528 if (dev_out == NULL) 2495 if (dev_out == NULL)
2529 goto out; 2496 goto out;
2530 2497
@@ -2543,60 +2510,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 2543 Luckily, this hack is a good workaround. 2510 Luckily, this hack is a good workaround.
2544 */ 2511 */
2545 2512
2546 fl.oif = dev_out->ifindex; 2513 fl4.flowi4_oif = dev_out->ifindex;
2547 goto make_route; 2514 goto make_route;
2548 } 2515 }
2549 2516
2550 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2517 if (!(oldflp4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2551 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2518 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2552 if (!__ip_dev_find(net, oldflp->fl4_src, false)) 2519 if (!__ip_dev_find(net, oldflp4->saddr, false))
2553 goto out; 2520 goto out;
2554 } 2521 }
2555 } 2522 }
2556 2523
2557 2524
2558 if (oldflp->oif) { 2525 if (oldflp4->flowi4_oif) {
2559 dev_out = dev_get_by_index_rcu(net, oldflp->oif); 2526 dev_out = dev_get_by_index_rcu(net, oldflp4->flowi4_oif);
2560 err = -ENODEV; 2527 rth = ERR_PTR(-ENODEV);
2561 if (dev_out == NULL) 2528 if (dev_out == NULL)
2562 goto out; 2529 goto out;
2563 2530
2564 /* RACE: Check return value of inet_select_addr instead. */ 2531 /* RACE: Check return value of inet_select_addr instead. */
2565 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2532 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2566 err = -ENETUNREACH; 2533 rth = ERR_PTR(-ENETUNREACH);
2567 goto out; 2534 goto out;
2568 } 2535 }
2569 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2536 if (ipv4_is_local_multicast(oldflp4->daddr) ||
2570 ipv4_is_lbcast(oldflp->fl4_dst)) { 2537 ipv4_is_lbcast(oldflp4->daddr)) {
2571 if (!fl.fl4_src) 2538 if (!fl4.saddr)
2572 fl.fl4_src = inet_select_addr(dev_out, 0, 2539 fl4.saddr = inet_select_addr(dev_out, 0,
2573 RT_SCOPE_LINK); 2540 RT_SCOPE_LINK);
2574 goto make_route; 2541 goto make_route;
2575 } 2542 }
2576 if (!fl.fl4_src) { 2543 if (!fl4.saddr) {
2577 if (ipv4_is_multicast(oldflp->fl4_dst)) 2544 if (ipv4_is_multicast(oldflp4->daddr))
2578 fl.fl4_src = inet_select_addr(dev_out, 0, 2545 fl4.saddr = inet_select_addr(dev_out, 0,
2579 fl.fl4_scope); 2546 fl4.flowi4_scope);
2580 else if (!oldflp->fl4_dst) 2547 else if (!oldflp4->daddr)
2581 fl.fl4_src = inet_select_addr(dev_out, 0, 2548 fl4.saddr = inet_select_addr(dev_out, 0,
2582 RT_SCOPE_HOST); 2549 RT_SCOPE_HOST);
2583 } 2550 }
2584 } 2551 }
2585 2552
2586 if (!fl.fl4_dst) { 2553 if (!fl4.daddr) {
2587 fl.fl4_dst = fl.fl4_src; 2554 fl4.daddr = fl4.saddr;
2588 if (!fl.fl4_dst) 2555 if (!fl4.daddr)
2589 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2556 fl4.daddr = fl4.saddr = htonl(INADDR_LOOPBACK);
2590 dev_out = net->loopback_dev; 2557 dev_out = net->loopback_dev;
2591 fl.oif = net->loopback_dev->ifindex; 2558 fl4.flowi4_oif = net->loopback_dev->ifindex;
2592 res.type = RTN_LOCAL; 2559 res.type = RTN_LOCAL;
2593 flags |= RTCF_LOCAL; 2560 flags |= RTCF_LOCAL;
2594 goto make_route; 2561 goto make_route;
2595 } 2562 }
2596 2563
2597 if (fib_lookup(net, &fl, &res)) { 2564 if (fib_lookup(net, &fl4, &res)) {
2598 res.fi = NULL; 2565 res.fi = NULL;
2599 if (oldflp->oif) { 2566 if (oldflp4->flowi4_oif) {
 2600 /* Apparently, routing tables are wrong. Assume 2567 /* Apparently, routing tables are wrong. Assume
2601 that the destination is on link. 2568 that the destination is on link.
2602 2569
@@ -2615,90 +2582,93 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2615 likely IPv6, but we do not. 2582 likely IPv6, but we do not.
2616 */ 2583 */
2617 2584
2618 if (fl.fl4_src == 0) 2585 if (fl4.saddr == 0)
2619 fl.fl4_src = inet_select_addr(dev_out, 0, 2586 fl4.saddr = inet_select_addr(dev_out, 0,
2620 RT_SCOPE_LINK); 2587 RT_SCOPE_LINK);
2621 res.type = RTN_UNICAST; 2588 res.type = RTN_UNICAST;
2622 goto make_route; 2589 goto make_route;
2623 } 2590 }
2624 err = -ENETUNREACH; 2591 rth = ERR_PTR(-ENETUNREACH);
2625 goto out; 2592 goto out;
2626 } 2593 }
2627 2594
2628 if (res.type == RTN_LOCAL) { 2595 if (res.type == RTN_LOCAL) {
2629 if (!fl.fl4_src) { 2596 if (!fl4.saddr) {
2630 if (res.fi->fib_prefsrc) 2597 if (res.fi->fib_prefsrc)
2631 fl.fl4_src = res.fi->fib_prefsrc; 2598 fl4.saddr = res.fi->fib_prefsrc;
2632 else 2599 else
2633 fl.fl4_src = fl.fl4_dst; 2600 fl4.saddr = fl4.daddr;
2634 } 2601 }
2635 dev_out = net->loopback_dev; 2602 dev_out = net->loopback_dev;
2636 fl.oif = dev_out->ifindex; 2603 fl4.flowi4_oif = dev_out->ifindex;
2637 res.fi = NULL; 2604 res.fi = NULL;
2638 flags |= RTCF_LOCAL; 2605 flags |= RTCF_LOCAL;
2639 goto make_route; 2606 goto make_route;
2640 } 2607 }
2641 2608
2642#ifdef CONFIG_IP_ROUTE_MULTIPATH 2609#ifdef CONFIG_IP_ROUTE_MULTIPATH
2643 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2610 if (res.fi->fib_nhs > 1 && fl4.flowi4_oif == 0)
2644 fib_select_multipath(&fl, &res); 2611 fib_select_multipath(&res);
2645 else 2612 else
2646#endif 2613#endif
2647 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2614 if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif)
2648 fib_select_default(net, &fl, &res); 2615 fib_select_default(&res);
2649 2616
2650 if (!fl.fl4_src) 2617 if (!fl4.saddr)
2651 fl.fl4_src = FIB_RES_PREFSRC(res); 2618 fl4.saddr = FIB_RES_PREFSRC(net, res);
2652 2619
2653 dev_out = FIB_RES_DEV(res); 2620 dev_out = FIB_RES_DEV(res);
2654 fl.oif = dev_out->ifindex; 2621 fl4.flowi4_oif = dev_out->ifindex;
2655 2622
2656 2623
2657make_route: 2624make_route:
2658 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2625 rth = __mkroute_output(&res, &fl4, oldflp4, dev_out, flags);
2626 if (!IS_ERR(rth)) {
2627 unsigned int hash;
2659 2628
2660out: return err; 2629 hash = rt_hash(oldflp4->daddr, oldflp4->saddr, oldflp4->flowi4_oif,
2630 rt_genid(dev_net(dev_out)));
2631 rth = rt_intern_hash(hash, rth, NULL, oldflp4->flowi4_oif);
2632 }
2633
2634out:
2635 rcu_read_unlock();
2636 return rth;
2661} 2637}
2662 2638
2663int __ip_route_output_key(struct net *net, struct rtable **rp, 2639struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4)
2664 const struct flowi *flp)
2665{ 2640{
2666 unsigned int hash;
2667 int res;
2668 struct rtable *rth; 2641 struct rtable *rth;
2642 unsigned int hash;
2669 2643
2670 if (!rt_caching(net)) 2644 if (!rt_caching(net))
2671 goto slow_output; 2645 goto slow_output;
2672 2646
2673 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2647 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2674 2648
2675 rcu_read_lock_bh(); 2649 rcu_read_lock_bh();
2676 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2650 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2677 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2651 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2678 if (rth->fl.fl4_dst == flp->fl4_dst && 2652 if (rth->rt_key_dst == flp4->daddr &&
2679 rth->fl.fl4_src == flp->fl4_src && 2653 rth->rt_key_src == flp4->saddr &&
2680 rt_is_output_route(rth) && 2654 rt_is_output_route(rth) &&
2681 rth->fl.oif == flp->oif && 2655 rth->rt_oif == flp4->flowi4_oif &&
2682 rth->fl.mark == flp->mark && 2656 rth->rt_mark == flp4->flowi4_mark &&
2683 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2657 !((rth->rt_tos ^ flp4->flowi4_tos) &
2684 (IPTOS_RT_MASK | RTO_ONLINK)) && 2658 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2685 net_eq(dev_net(rth->dst.dev), net) && 2659 net_eq(dev_net(rth->dst.dev), net) &&
2686 !rt_is_expired(rth)) { 2660 !rt_is_expired(rth)) {
2687 dst_use(&rth->dst, jiffies); 2661 dst_use(&rth->dst, jiffies);
2688 RT_CACHE_STAT_INC(out_hit); 2662 RT_CACHE_STAT_INC(out_hit);
2689 rcu_read_unlock_bh(); 2663 rcu_read_unlock_bh();
2690 *rp = rth; 2664 return rth;
2691 return 0;
2692 } 2665 }
2693 RT_CACHE_STAT_INC(out_hlist_search); 2666 RT_CACHE_STAT_INC(out_hlist_search);
2694 } 2667 }
2695 rcu_read_unlock_bh(); 2668 rcu_read_unlock_bh();
2696 2669
2697slow_output: 2670slow_output:
2698 rcu_read_lock(); 2671 return ip_route_output_slow(net, flp4);
2699 res = ip_route_output_slow(net, rp, flp);
2700 rcu_read_unlock();
2701 return res;
2702} 2672}
2703EXPORT_SYMBOL_GPL(__ip_route_output_key); 2673EXPORT_SYMBOL_GPL(__ip_route_output_key);
2704 2674
@@ -2707,6 +2677,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
2707 return NULL; 2677 return NULL;
2708} 2678}
2709 2679
2680static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2681{
2682 return 0;
2683}
2684
2710static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2685static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2711{ 2686{
2712} 2687}
@@ -2716,20 +2691,19 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2716 .protocol = cpu_to_be16(ETH_P_IP), 2691 .protocol = cpu_to_be16(ETH_P_IP),
2717 .destroy = ipv4_dst_destroy, 2692 .destroy = ipv4_dst_destroy,
2718 .check = ipv4_blackhole_dst_check, 2693 .check = ipv4_blackhole_dst_check,
2694 .default_mtu = ipv4_blackhole_default_mtu,
2695 .default_advmss = ipv4_default_advmss,
2719 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2696 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2720}; 2697};
2721 2698
2722 2699struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2723static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2724{ 2700{
2725 struct rtable *ort = *rp; 2701 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, 1);
2726 struct rtable *rt = (struct rtable *) 2702 struct rtable *ort = (struct rtable *) dst_orig;
2727 dst_alloc(&ipv4_dst_blackhole_ops);
2728 2703
2729 if (rt) { 2704 if (rt) {
2730 struct dst_entry *new = &rt->dst; 2705 struct dst_entry *new = &rt->dst;
2731 2706
2732 atomic_set(&new->__refcnt, 1);
2733 new->__use = 1; 2707 new->__use = 1;
2734 new->input = dst_discard; 2708 new->input = dst_discard;
2735 new->output = dst_discard; 2709 new->output = dst_discard;
@@ -2739,7 +2713,12 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2739 if (new->dev) 2713 if (new->dev)
2740 dev_hold(new->dev); 2714 dev_hold(new->dev);
2741 2715
2742 rt->fl = ort->fl; 2716 rt->rt_key_dst = ort->rt_key_dst;
2717 rt->rt_key_src = ort->rt_key_src;
2718 rt->rt_tos = ort->rt_tos;
2719 rt->rt_iif = ort->rt_iif;
2720 rt->rt_oif = ort->rt_oif;
2721 rt->rt_mark = ort->rt_mark;
2743 2722
2744 rt->rt_genid = rt_genid(net); 2723 rt->rt_genid = rt_genid(net);
2745 rt->rt_flags = ort->rt_flags; 2724 rt->rt_flags = ort->rt_flags;
@@ -2752,46 +2731,40 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2752 rt->peer = ort->peer; 2731 rt->peer = ort->peer;
2753 if (rt->peer) 2732 if (rt->peer)
2754 atomic_inc(&rt->peer->refcnt); 2733 atomic_inc(&rt->peer->refcnt);
2734 rt->fi = ort->fi;
2735 if (rt->fi)
2736 atomic_inc(&rt->fi->fib_clntref);
2755 2737
2756 dst_free(new); 2738 dst_free(new);
2757 } 2739 }
2758 2740
2759 dst_release(&(*rp)->dst); 2741 dst_release(dst_orig);
2760 *rp = rt; 2742
2761 return rt ? 0 : -ENOMEM; 2743 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2762} 2744}
2763 2745
2764int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2746struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2765 struct sock *sk, int flags) 2747 struct sock *sk)
2766{ 2748{
2767 int err; 2749 struct rtable *rt = __ip_route_output_key(net, flp4);
2768 2750
2769 if ((err = __ip_route_output_key(net, rp, flp)) != 0) 2751 if (IS_ERR(rt))
2770 return err; 2752 return rt;
2771 2753
2772 if (flp->proto) { 2754 if (flp4->flowi4_proto) {
2773 if (!flp->fl4_src) 2755 if (!flp4->saddr)
2774 flp->fl4_src = (*rp)->rt_src; 2756 flp4->saddr = rt->rt_src;
2775 if (!flp->fl4_dst) 2757 if (!flp4->daddr)
2776 flp->fl4_dst = (*rp)->rt_dst; 2758 flp4->daddr = rt->rt_dst;
2777 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, 2759 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2778 flags ? XFRM_LOOKUP_WAIT : 0); 2760 flowi4_to_flowi(flp4),
2779 if (err == -EREMOTE) 2761 sk, 0);
2780 err = ipv4_dst_blackhole(net, rp, flp);
2781
2782 return err;
2783 } 2762 }
2784 2763
2785 return 0; 2764 return rt;
2786} 2765}
2787EXPORT_SYMBOL_GPL(ip_route_output_flow); 2766EXPORT_SYMBOL_GPL(ip_route_output_flow);
2788 2767
2789int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2790{
2791 return ip_route_output_flow(net, rp, flp, NULL, 0);
2792}
2793EXPORT_SYMBOL(ip_route_output_key);
2794
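
With the exported int-returning ip_route_output_key() dropped from this file, callers such as inet_rtm_getroute() below switch to the pointer-returning form. A fragment of the updated calling convention for ip_route_output_flow(), assuming kernel context (not standalone):

    struct rtable *rt;

    rt = ip_route_output_flow(net, &fl4, sk);
    if (IS_ERR(rt))
        return PTR_ERR(rt);
    /* ... use rt ... */
    ip_rt_put(rt);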
2795static int rt_fill_info(struct net *net, 2768static int rt_fill_info(struct net *net,
2796 struct sk_buff *skb, u32 pid, u32 seq, int event, 2769 struct sk_buff *skb, u32 pid, u32 seq, int event,
2797 int nowait, unsigned int flags) 2770 int nowait, unsigned int flags)
@@ -2810,7 +2783,7 @@ static int rt_fill_info(struct net *net,
2810 r->rtm_family = AF_INET; 2783 r->rtm_family = AF_INET;
2811 r->rtm_dst_len = 32; 2784 r->rtm_dst_len = 32;
2812 r->rtm_src_len = 0; 2785 r->rtm_src_len = 0;
2813 r->rtm_tos = rt->fl.fl4_tos; 2786 r->rtm_tos = rt->rt_tos;
2814 r->rtm_table = RT_TABLE_MAIN; 2787 r->rtm_table = RT_TABLE_MAIN;
2815 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2788 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2816 r->rtm_type = rt->rt_type; 2789 r->rtm_type = rt->rt_type;
@@ -2822,19 +2795,19 @@ static int rt_fill_info(struct net *net,
2822 2795
2823 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2796 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2824 2797
2825 if (rt->fl.fl4_src) { 2798 if (rt->rt_key_src) {
2826 r->rtm_src_len = 32; 2799 r->rtm_src_len = 32;
2827 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2800 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2828 } 2801 }
2829 if (rt->dst.dev) 2802 if (rt->dst.dev)
2830 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2803 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2831#ifdef CONFIG_NET_CLS_ROUTE 2804#ifdef CONFIG_IP_ROUTE_CLASSID
2832 if (rt->dst.tclassid) 2805 if (rt->dst.tclassid)
2833 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2806 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2834#endif 2807#endif
2835 if (rt_is_input_route(rt)) 2808 if (rt_is_input_route(rt))
2836 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2809 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2837 else if (rt->rt_src != rt->fl.fl4_src) 2810 else if (rt->rt_src != rt->rt_key_src)
2838 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2811 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2839 2812
2840 if (rt->rt_dst != rt->rt_gateway) 2813 if (rt->rt_dst != rt->rt_gateway)
@@ -2843,11 +2816,12 @@ static int rt_fill_info(struct net *net,
2843 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2816 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2844 goto nla_put_failure; 2817 goto nla_put_failure;
2845 2818
2846 if (rt->fl.mark) 2819 if (rt->rt_mark)
2847 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); 2820 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2848 2821
2849 error = rt->dst.error; 2822 error = rt->dst.error;
2850 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 2823 expires = (rt->peer && rt->peer->pmtu_expires) ?
2824 rt->peer->pmtu_expires - jiffies : 0;
2851 if (rt->peer) { 2825 if (rt->peer) {
2852 inet_peer_refcheck(rt->peer); 2826 inet_peer_refcheck(rt->peer);
2853 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2827 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -2877,7 +2851,7 @@ static int rt_fill_info(struct net *net,
2877 } 2851 }
2878 } else 2852 } else
2879#endif 2853#endif
2880 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2854 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2881 } 2855 }
2882 2856
2883 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2857 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -2951,14 +2925,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2951 if (err == 0 && rt->dst.error) 2925 if (err == 0 && rt->dst.error)
2952 err = -rt->dst.error; 2926 err = -rt->dst.error;
2953 } else { 2927 } else {
2954 struct flowi fl = { 2928 struct flowi4 fl4 = {
2955 .fl4_dst = dst, 2929 .daddr = dst,
2956 .fl4_src = src, 2930 .saddr = src,
2957 .fl4_tos = rtm->rtm_tos, 2931 .flowi4_tos = rtm->rtm_tos,
2958 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, 2932 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2959 .mark = mark, 2933 .flowi4_mark = mark,
2960 }; 2934 };
2961 err = ip_route_output_key(net, &rt, &fl); 2935 rt = ip_route_output_key(net, &fl4);
2936
2937 err = 0;
2938 if (IS_ERR(rt))
2939 err = PTR_ERR(rt);
2962 } 2940 }
2963 2941
2964 if (err) 2942 if (err)
@@ -3241,6 +3219,8 @@ static __net_init int rt_genid_init(struct net *net)
3241{ 3219{
3242 get_random_bytes(&net->ipv4.rt_genid, 3220 get_random_bytes(&net->ipv4.rt_genid,
3243 sizeof(net->ipv4.rt_genid)); 3221 sizeof(net->ipv4.rt_genid));
3222 get_random_bytes(&net->ipv4.dev_addr_genid,
3223 sizeof(net->ipv4.dev_addr_genid));
3244 return 0; 3224 return 0;
3245} 3225}
3246 3226
@@ -3249,9 +3229,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3249}; 3229};
3250 3230
3251 3231
3252#ifdef CONFIG_NET_CLS_ROUTE 3232#ifdef CONFIG_IP_ROUTE_CLASSID
3253struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3233struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3254#endif /* CONFIG_NET_CLS_ROUTE */ 3234#endif /* CONFIG_IP_ROUTE_CLASSID */
3255 3235
3256static __initdata unsigned long rhash_entries; 3236static __initdata unsigned long rhash_entries;
3257static int __init set_rhash_entries(char *str) 3237static int __init set_rhash_entries(char *str)
@@ -3267,7 +3247,7 @@ int __init ip_rt_init(void)
3267{ 3247{
3268 int rc = 0; 3248 int rc = 0;
3269 3249
3270#ifdef CONFIG_NET_CLS_ROUTE 3250#ifdef CONFIG_IP_ROUTE_CLASSID
3271 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3251 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3272 if (!ip_rt_acct) 3252 if (!ip_rt_acct)
3273 panic("IP: failed to allocate ip_rt_acct\n"); 3253 panic("IP: failed to allocate ip_rt_acct\n");
@@ -3304,14 +3284,6 @@ int __init ip_rt_init(void)
3304 devinet_init(); 3284 devinet_init();
3305 ip_fib_init(); 3285 ip_fib_init();
3306 3286
3307 /* All the timers, started at system startup tend
3308 to synchronize. Perturb it a bit.
3309 */
3310 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3311 expires_ljiffies = jiffies;
3312 schedule_delayed_work(&expires_work,
3313 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3314
3315 if (ip_rt_proc_init()) 3287 if (ip_rt_proc_init())
3316 printk(KERN_ERR "Unable to create route proc files\n"); 3288 printk(KERN_ERR "Unable to create route proc files\n");
3317#ifdef CONFIG_XFRM 3289#ifdef CONFIG_XFRM
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 47519205a014..8b44c6d2a79b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -345,17 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
345 * no easy way to do this. 345 * no easy way to do this.
346 */ 346 */
347 { 347 {
348 struct flowi fl = { .mark = sk->sk_mark, 348 struct flowi4 fl4 = {
349 .fl4_dst = ((opt && opt->srr) ? 349 .flowi4_mark = sk->sk_mark,
350 opt->faddr : ireq->rmt_addr), 350 .daddr = ((opt && opt->srr) ?
351 .fl4_src = ireq->loc_addr, 351 opt->faddr : ireq->rmt_addr),
352 .fl4_tos = RT_CONN_FLAGS(sk), 352 .saddr = ireq->loc_addr,
353 .proto = IPPROTO_TCP, 353 .flowi4_tos = RT_CONN_FLAGS(sk),
354 .flags = inet_sk_flowi_flags(sk), 354 .flowi4_proto = IPPROTO_TCP,
355 .fl_ip_sport = th->dest, 355 .flowi4_flags = inet_sk_flowi_flags(sk),
356 .fl_ip_dport = th->source }; 356 .fl4_sport = th->dest,
357 security_req_classify_flow(req, &fl); 357 .fl4_dport = th->source,
358 if (ip_route_output_key(sock_net(sk), &rt, &fl)) { 358 };
359 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
360 rt = ip_route_output_key(sock_net(sk), &fl4);
361 if (IS_ERR(rt)) {
359 reqsk_free(req); 362 reqsk_free(req);
360 goto out; 363 goto out;
361 } 364 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262c..b22d45010545 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
505 else 505 else
506 answ = tp->write_seq - tp->snd_una; 506 answ = tp->write_seq - tp->snd_una;
507 break; 507 break;
508 case SIOCOUTQNSD:
509 if (sk->sk_state == TCP_LISTEN)
510 return -EINVAL;
511
512 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
513 answ = 0;
514 else
515 answ = tp->write_seq - tp->snd_nxt;
516 break;
508 default: 517 default:
509 return -ENOIOCTLCMD; 518 return -ENOIOCTLCMD;
510 } 519 }
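
SIOCOUTQNSD complements SIOCOUTQ: it reports bytes queued but not yet sent (write_seq - snd_nxt) rather than all unacknowledged bytes (write_seq - snd_una). A small userspace probe, assuming a kernel that carries this patch (older kernels fail the ioctl):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <linux/sockios.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int notsent = 0;

        if (fd < 0)
            return 1;
        /* Fails with EINVAL on a listening socket, per the hunk above. */
        if (ioctl(fd, SIOCOUTQNSD, &notsent) < 0)
            perror("SIOCOUTQNSD");
        else
            printf("queued but not yet sent: %d bytes\n", notsent);
        return 0;
    }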
@@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
873 flags); 882 flags);
874 883
875 lock_sock(sk); 884 lock_sock(sk);
876 TCP_CHECK_TIMER(sk);
877 res = do_tcp_sendpages(sk, &page, offset, size, flags); 885 res = do_tcp_sendpages(sk, &page, offset, size, flags);
878 TCP_CHECK_TIMER(sk);
879 release_sock(sk); 886 release_sock(sk);
880 return res; 887 return res;
881} 888}
@@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
916 long timeo; 923 long timeo;
917 924
918 lock_sock(sk); 925 lock_sock(sk);
919 TCP_CHECK_TIMER(sk);
920 926
921 flags = msg->msg_flags; 927 flags = msg->msg_flags;
922 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 928 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -1104,7 +1110,6 @@ wait_for_memory:
1104out: 1110out:
1105 if (copied) 1111 if (copied)
1106 tcp_push(sk, flags, mss_now, tp->nonagle); 1112 tcp_push(sk, flags, mss_now, tp->nonagle);
1107 TCP_CHECK_TIMER(sk);
1108 release_sock(sk); 1113 release_sock(sk);
1109 return copied; 1114 return copied;
1110 1115
@@ -1123,7 +1128,6 @@ do_error:
1123 goto out; 1128 goto out;
1124out_err: 1129out_err:
1125 err = sk_stream_error(sk, flags, err); 1130 err = sk_stream_error(sk, flags, err);
1126 TCP_CHECK_TIMER(sk);
1127 release_sock(sk); 1131 release_sock(sk);
1128 return err; 1132 return err;
1129} 1133}
@@ -1415,8 +1419,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1415 1419
1416 lock_sock(sk); 1420 lock_sock(sk);
1417 1421
1418 TCP_CHECK_TIMER(sk);
1419
1420 err = -ENOTCONN; 1422 err = -ENOTCONN;
1421 if (sk->sk_state == TCP_LISTEN) 1423 if (sk->sk_state == TCP_LISTEN)
1422 goto out; 1424 goto out;
@@ -1767,12 +1769,10 @@ skip_copy:
1767 /* Clean up data we have read: This will do ACK frames. */ 1769 /* Clean up data we have read: This will do ACK frames. */
1768 tcp_cleanup_rbuf(sk, copied); 1770 tcp_cleanup_rbuf(sk, copied);
1769 1771
1770 TCP_CHECK_TIMER(sk);
1771 release_sock(sk); 1772 release_sock(sk);
1772 return copied; 1773 return copied;
1773 1774
1774out: 1775out:
1775 TCP_CHECK_TIMER(sk);
1776 release_sock(sk); 1776 release_sock(sk);
1777 return err; 1777 return err;
1778 1778
@@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2653EXPORT_SYMBOL(compat_tcp_getsockopt); 2653EXPORT_SYMBOL(compat_tcp_getsockopt);
2654#endif 2654#endif
2655 2655
2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2657{ 2657{
2658 struct sk_buff *segs = ERR_PTR(-EINVAL); 2658 struct sk_buff *segs = ERR_PTR(-EINVAL);
2659 struct tcphdr *th; 2659 struct tcphdr *th;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 3b53fd1af23f..6187eb4d1dcf 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
209} 209}
210 210
211 211
212static struct tcp_congestion_ops bictcp = { 212static struct tcp_congestion_ops bictcp __read_mostly = {
213 .init = bictcp_init, 213 .init = bictcp_init,
214 .ssthresh = bictcp_recalc_ssthresh, 214 .ssthresh = bictcp_recalc_ssthresh,
215 .cong_avoid = bictcp_cong_avoid, 215 .cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 71d5f2f29fa6..34340c9c95fa 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -39,7 +39,7 @@
39 39
40/* Number of delay samples for detecting the increase of delay */ 40/* Number of delay samples for detecting the increase of delay */
41#define HYSTART_MIN_SAMPLES 8 41#define HYSTART_MIN_SAMPLES 8
42#define HYSTART_DELAY_MIN (2U<<3) 42#define HYSTART_DELAY_MIN (4U<<3)
43#define HYSTART_DELAY_MAX (16U<<3) 43#define HYSTART_DELAY_MAX (16U<<3)
44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
45 45
@@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1;
52static int hystart __read_mostly = 1; 52static int hystart __read_mostly = 1;
53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; 53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
54static int hystart_low_window __read_mostly = 16; 54static int hystart_low_window __read_mostly = 16;
55static int hystart_ack_delta __read_mostly = 2;
55 56
56static u32 cube_rtt_scale __read_mostly; 57static u32 cube_rtt_scale __read_mostly;
57static u32 beta_scale __read_mostly; 58static u32 beta_scale __read_mostly;
@@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
75 " 1: packet-train 2: delay 3: both packet-train and delay"); 76 " 1: packet-train 2: delay 3: both packet-train and delay");
76module_param(hystart_low_window, int, 0644); 77module_param(hystart_low_window, int, 0644);
77MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); 78MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
79module_param(hystart_ack_delta, int, 0644);
 80MODULE_PARM_DESC(hystart_ack_delta, "spacing between ACKs indicating a train (msecs)");
78 81
79/* BIC TCP Parameters */ 82/* BIC TCP Parameters */
80struct bictcp { 83struct bictcp {
@@ -85,7 +88,7 @@ struct bictcp {
85 u32 last_time; /* time when updated last_cwnd */ 88 u32 last_time; /* time when updated last_cwnd */
86 u32 bic_origin_point;/* origin point of bic function */ 89 u32 bic_origin_point;/* origin point of bic function */
87 u32 bic_K; /* time to origin point from the beginning of the current epoch */ 90 u32 bic_K; /* time to origin point from the beginning of the current epoch */
88 u32 delay_min; /* min delay */ 91 u32 delay_min; /* min delay (msec << 3) */
89 u32 epoch_start; /* beginning of an epoch */ 92 u32 epoch_start; /* beginning of an epoch */
90 u32 ack_cnt; /* number of acks */ 93 u32 ack_cnt; /* number of acks */
91 u32 tcp_cwnd; /* estimated tcp cwnd */ 94 u32 tcp_cwnd; /* estimated tcp cwnd */
@@ -95,7 +98,7 @@ struct bictcp {
95 u8 found; /* the exit point is found? */ 98 u8 found; /* the exit point is found? */
96 u32 round_start; /* beginning of each round */ 99 u32 round_start; /* beginning of each round */
97 u32 end_seq; /* end_seq of the round */ 100 u32 end_seq; /* end_seq of the round */
98 u32 last_jiffies; /* last time when the ACK spacing is close */ 101 u32 last_ack; /* last time when the ACK spacing is close */
99 u32 curr_rtt; /* the minimum rtt of current round */ 102 u32 curr_rtt; /* the minimum rtt of current round */
100}; 103};
101 104
@@ -116,12 +119,21 @@ static inline void bictcp_reset(struct bictcp *ca)
116 ca->found = 0; 119 ca->found = 0;
117} 120}
118 121
122static inline u32 bictcp_clock(void)
123{
124#if HZ < 1000
125 return ktime_to_ms(ktime_get_real());
126#else
127 return jiffies_to_msecs(jiffies);
128#endif
129}
130
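
bictcp_clock() exists because a jiffy is 10 ms at HZ=100, too coarse for Hystart's few-millisecond ack-spacing test, so low-HZ kernels read ktime instead. A userspace analogue of a millisecond clock (monotonic here for simplicity; the kernel path uses ktime_get_real()):

    #include <stdio.h>
    #include <time.h>

    static unsigned int now_ms(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned int)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
    }

    int main(void)
    {
        printf("%u ms\n", now_ms());
        return 0;
    }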
119static inline void bictcp_hystart_reset(struct sock *sk) 131static inline void bictcp_hystart_reset(struct sock *sk)
120{ 132{
121 struct tcp_sock *tp = tcp_sk(sk); 133 struct tcp_sock *tp = tcp_sk(sk);
122 struct bictcp *ca = inet_csk_ca(sk); 134 struct bictcp *ca = inet_csk_ca(sk);
123 135
124 ca->round_start = ca->last_jiffies = jiffies; 136 ca->round_start = ca->last_ack = bictcp_clock();
125 ca->end_seq = tp->snd_nxt; 137 ca->end_seq = tp->snd_nxt;
126 ca->curr_rtt = 0; 138 ca->curr_rtt = 0;
127 ca->sample_cnt = 0; 139 ca->sample_cnt = 0;
@@ -236,8 +248,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
236 */ 248 */
237 249
238 /* change the unit from HZ to bictcp_HZ */ 250 /* change the unit from HZ to bictcp_HZ */
239 t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) 251 t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3)
240 << BICTCP_HZ) / HZ; 252 - ca->epoch_start) << BICTCP_HZ) / HZ;
241 253
242 if (t < ca->bic_K) /* t - K */ 254 if (t < ca->bic_K) /* t - K */
243 offs = ca->bic_K - t; 255 offs = ca->bic_K - t;
@@ -258,6 +270,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
258 ca->cnt = 100 * cwnd; /* very small increment*/ 270 ca->cnt = 100 * cwnd; /* very small increment*/
259 } 271 }
260 272
273 /*
 274 * The initial growth of the cubic function may be too conservative
275 * when the available bandwidth is still unknown.
276 */
277 if (ca->loss_cwnd == 0 && ca->cnt > 20)
278 ca->cnt = 20; /* increase cwnd 5% per RTT */
279
261 /* TCP Friendly */ 280 /* TCP Friendly */
262 if (tcp_friendliness) { 281 if (tcp_friendliness) {
263 u32 scale = beta_scale; 282 u32 scale = beta_scale;
@@ -339,12 +358,12 @@ static void hystart_update(struct sock *sk, u32 delay)
339 struct bictcp *ca = inet_csk_ca(sk); 358 struct bictcp *ca = inet_csk_ca(sk);
340 359
341 if (!(ca->found & hystart_detect)) { 360 if (!(ca->found & hystart_detect)) {
342 u32 curr_jiffies = jiffies; 361 u32 now = bictcp_clock();
343 362
344 /* first detection parameter - ack-train detection */ 363 /* first detection parameter - ack-train detection */
345 if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { 364 if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
346 ca->last_jiffies = curr_jiffies; 365 ca->last_ack = now;
347 if (curr_jiffies - ca->round_start >= ca->delay_min>>4) 366 if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
348 ca->found |= HYSTART_ACK_TRAIN; 367 ca->found |= HYSTART_ACK_TRAIN;
349 } 368 }
350 369
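
The (s32) casts above make the interval tests safe across wraparound of the 32-bit millisecond clock. A standalone check of the signed-difference idiom:

    #include <stdint.h>
    #include <stdio.h>

    /* Signed difference of unsigned timestamps survives wraparound as
     * long as the real interval is under ~2^31 ms. */
    static int within(uint32_t now, uint32_t then, int32_t delta)
    {
        return (int32_t)(now - then) <= delta;
    }

    int main(void)
    {
        uint32_t then = 0xFFFFFFFFu; /* just before the clock wraps */
        uint32_t now  = 0x00000001u; /* 2 ms later, after the wrap */

        printf("%d\n", within(now, then, 2)); /* 1: signed delta is 2 */
        printf("%d\n", now >= then);          /* 0: naive compare misfires */
        return 0;
    }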
@@ -391,7 +410,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
391 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) 410 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
392 return; 411 return;
393 412
394 delay = usecs_to_jiffies(rtt_us) << 3; 413 delay = (rtt_us << 3) / USEC_PER_MSEC;
395 if (delay == 0) 414 if (delay == 0)
396 delay = 1; 415 delay = 1;
397 416
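
delay is kept in 1/8-millisecond fixed point (the "msec << 3" noted in the struct comment above), which is also why HYSTART_DELAY_MIN moves from 2U<<3 to 4U<<3, raising the floor from 2 ms to 4 ms. A quick check of the conversion:

    #include <stdio.h>

    #define USEC_PER_MSEC 1000L

    int main(void)
    {
        long rtt_us = 25000;                        /* 25 ms RTT sample */
        long delay  = (rtt_us << 3) / USEC_PER_MSEC;

        printf("%ld\n", delay);      /* 200 == 25 ms in msec<<3 units */
        printf("%ld\n", delay >> 3); /* back to whole milliseconds: 25 */
        return 0;
    }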
@@ -405,7 +424,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
405 hystart_update(sk, delay); 424 hystart_update(sk, delay);
406} 425}
407 426
408static struct tcp_congestion_ops cubictcp = { 427static struct tcp_congestion_ops cubictcp __read_mostly = {
409 .init = bictcp_init, 428 .init = bictcp_init,
410 .ssthresh = bictcp_recalc_ssthresh, 429 .ssthresh = bictcp_recalc_ssthresh,
411 .cong_avoid = bictcp_cong_avoid, 430 .cong_avoid = bictcp_cong_avoid,
@@ -447,6 +466,10 @@ static int __init cubictcp_register(void)
447 /* divide by bic_scale and by constant Srtt (100ms) */ 466 /* divide by bic_scale and by constant Srtt (100ms) */
448 do_div(cube_factor, bic_scale * 10); 467 do_div(cube_factor, bic_scale * 10);
449 468
469 /* hystart needs ms clock resolution */
470 if (hystart && HZ < 1000)
471 cubictcp.flags |= TCP_CONG_RTT_STAMP;
472
450 return tcp_register_congestion_control(&cubictcp); 473 return tcp_register_congestion_control(&cubictcp);
451} 474}
452 475
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b6caaf75bb9..30f27f6b3655 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
158} 158}
159 159
160 160
161static struct tcp_congestion_ops tcp_highspeed = { 161static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
162 .init = hstcp_init, 162 .init = hstcp_init,
163 .ssthresh = hstcp_ssthresh, 163 .ssthresh = hstcp_ssthresh,
164 .cong_avoid = hstcp_cong_avoid, 164 .cong_avoid = hstcp_cong_avoid,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 7c94a4955416..c1a8175361e8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
284 } 284 }
285} 285}
286 286
287static struct tcp_congestion_ops htcp = { 287static struct tcp_congestion_ops htcp __read_mostly = {
288 .init = htcp_init, 288 .init = htcp_init,
289 .ssthresh = htcp_recalc_ssthresh, 289 .ssthresh = htcp_recalc_ssthresh,
290 .cong_avoid = htcp_cong_avoid, 290 .cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 377bc9349371..fe3ecf484b44 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); 162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
163} 163}
164 164
165static struct tcp_congestion_ops tcp_hybla = { 165static struct tcp_congestion_ops tcp_hybla __read_mostly = {
166 .init = hybla_init, 166 .init = hybla_init,
167 .ssthresh = tcp_reno_ssthresh, 167 .ssthresh = tcp_reno_ssthresh,
168 .min_cwnd = tcp_reno_min_cwnd, 168 .min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 00ca688d8964..813b43a76fec 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
322 } 322 }
323} 323}
324 324
325static struct tcp_congestion_ops tcp_illinois = { 325static struct tcp_congestion_ops tcp_illinois __read_mostly = {
326 .flags = TCP_CONG_RTT_STAMP, 326 .flags = TCP_CONG_RTT_STAMP,
327 .init = tcp_illinois_init, 327 .init = tcp_illinois_init,
328 .ssthresh = tcp_illinois_ssthresh, 328 .ssthresh = tcp_illinois_ssthresh,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2549b29b062d..bef9f04c22ba 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
818 818
819 if (!cwnd) 819 if (!cwnd)
820 cwnd = rfc3390_bytes_to_packets(tp->mss_cache); 820 cwnd = TCP_INIT_CWND;
821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
822} 822}
823 823
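
TCP_INIT_CWND replaces the per-MSS RFC 3390 computation with a fixed initial window (10 segments in this kernel series). A worked comparison against the old default (computed per RFC 3390, min(4*MSS, max(2*MSS, 4380 bytes)); helper name hypothetical):

    #include <stdio.h>

    static int rfc3390_packets(int mss)
    {
        int bytes = 4 * mss < 4380 ? 4 * mss
                                   : (2 * mss > 4380 ? 2 * mss : 4380);
        return bytes / mss;
    }

    int main(void)
    {
        printf("mss 1460: %d segments\n", rfc3390_packets(1460)); /* 3 */
        printf("mss  536: %d segments\n", rfc3390_packets(536));  /* 4 */
        return 0;
    }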
@@ -1222,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1222 } 1222 }
1223 1223
1224 /* D-SACK for already forgotten data... Do dumb counting. */ 1224 /* D-SACK for already forgotten data... Do dumb counting. */
1225 if (dup_sack && 1225 if (dup_sack && tp->undo_marker && tp->undo_retrans &&
1226 !after(end_seq_0, prior_snd_una) && 1226 !after(end_seq_0, prior_snd_una) &&
1227 after(end_seq_0, tp->undo_marker)) 1227 after(end_seq_0, tp->undo_marker))
1228 tp->undo_retrans--; 1228 tp->undo_retrans--;
@@ -1299,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1299 1299
1300 /* Account D-SACK for retransmitted packet. */ 1300 /* Account D-SACK for retransmitted packet. */
1301 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1301 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1302 if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) 1302 if (tp->undo_marker && tp->undo_retrans &&
1303 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1303 tp->undo_retrans--; 1304 tp->undo_retrans--;
1304 if (sacked & TCPCB_SACKED_ACKED) 1305 if (sacked & TCPCB_SACKED_ACKED)
1305 state->reord = min(fack_count, state->reord); 1306 state->reord = min(fack_count, state->reord);
@@ -2658,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2658#define DBGUNDO(x...) do { } while (0) 2659#define DBGUNDO(x...) do { } while (0)
2659#endif 2660#endif
2660 2661
2661static void tcp_undo_cwr(struct sock *sk, const int undo) 2662static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2662{ 2663{
2663 struct tcp_sock *tp = tcp_sk(sk); 2664 struct tcp_sock *tp = tcp_sk(sk);
2664 2665
@@ -2670,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
2670 else 2671 else
2671 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); 2672 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
2672 2673
2673 if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { 2674 if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
2674 tp->snd_ssthresh = tp->prior_ssthresh; 2675 tp->snd_ssthresh = tp->prior_ssthresh;
2675 TCP_ECN_withdraw_cwr(tp); 2676 TCP_ECN_withdraw_cwr(tp);
2676 } 2677 }
2677 } else { 2678 } else {
2678 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); 2679 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
2679 } 2680 }
2680 tcp_moderate_cwnd(tp);
2681 tp->snd_cwnd_stamp = tcp_time_stamp; 2681 tp->snd_cwnd_stamp = tcp_time_stamp;
2682} 2682}
2683 2683
@@ -2698,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk)
2698 * or our original transmission succeeded. 2698 * or our original transmission succeeded.
2699 */ 2699 */
2700 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); 2700 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2701 tcp_undo_cwr(sk, 1); 2701 tcp_undo_cwr(sk, true);
2702 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) 2702 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2703 mib_idx = LINUX_MIB_TCPLOSSUNDO; 2703 mib_idx = LINUX_MIB_TCPLOSSUNDO;
2704 else 2704 else
@@ -2725,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk)
2725 2725
2726 if (tp->undo_marker && !tp->undo_retrans) { 2726 if (tp->undo_marker && !tp->undo_retrans) {
2727 DBGUNDO(sk, "D-SACK"); 2727 DBGUNDO(sk, "D-SACK");
2728 tcp_undo_cwr(sk, 1); 2728 tcp_undo_cwr(sk, true);
2729 tp->undo_marker = 0; 2729 tp->undo_marker = 0;
2730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); 2730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2731 } 2731 }
@@ -2778,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2778 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); 2778 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2779 2779
2780 DBGUNDO(sk, "Hoe"); 2780 DBGUNDO(sk, "Hoe");
2781 tcp_undo_cwr(sk, 0); 2781 tcp_undo_cwr(sk, false);
2782 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); 2782 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2783 2783
2784 /* So... Do not make Hoe's retransmit yet. 2784 /* So... Do not make Hoe's retransmit yet.
@@ -2807,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk)
2807 2807
2808 DBGUNDO(sk, "partial loss"); 2808 DBGUNDO(sk, "partial loss");
2809 tp->lost_out = 0; 2809 tp->lost_out = 0;
2810 tcp_undo_cwr(sk, 1); 2810 tcp_undo_cwr(sk, true);
2811 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); 2811 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2812 inet_csk(sk)->icsk_retransmits = 0; 2812 inet_csk(sk)->icsk_retransmits = 0;
2813 tp->undo_marker = 0; 2813 tp->undo_marker = 0;
@@ -2821,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk)
2821static inline void tcp_complete_cwr(struct sock *sk) 2821static inline void tcp_complete_cwr(struct sock *sk)
2822{ 2822{
2823 struct tcp_sock *tp = tcp_sk(sk); 2823 struct tcp_sock *tp = tcp_sk(sk);
2824 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2824 /* Do not moderate cwnd if it's already undone in cwr or recovery */
2825 tp->snd_cwnd_stamp = tcp_time_stamp; 2825 if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
2826 tp->snd_cwnd = tp->snd_ssthresh;
2827 tp->snd_cwnd_stamp = tcp_time_stamp;
2828 }
2826 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2829 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2827} 2830}
2828 2831
@@ -3349,7 +3352,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3349 net_invalid_timestamp())) 3352 net_invalid_timestamp()))
3350 rtt_us = ktime_us_delta(ktime_get_real(), 3353 rtt_us = ktime_us_delta(ktime_get_real(),
3351 last_ackt); 3354 last_ackt);
3352 else if (ca_seq_rtt > 0) 3355 else if (ca_seq_rtt >= 0)
3353 rtt_us = jiffies_to_usecs(ca_seq_rtt); 3356 rtt_us = jiffies_to_usecs(ca_seq_rtt);
3354 } 3357 }
3355 3358
@@ -3493,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3493 if (flag & FLAG_ECE) 3496 if (flag & FLAG_ECE)
3494 tcp_ratehalving_spur_to_response(sk); 3497 tcp_ratehalving_spur_to_response(sk);
3495 else 3498 else
3496 tcp_undo_cwr(sk, 1); 3499 tcp_undo_cwr(sk, true);
3497} 3500}
3498 3501
3499/* F-RTO spurious RTO detection algorithm (RFC4138) 3502/* F-RTO spurious RTO detection algorithm (RFC4138)
@@ -4399,7 +4402,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4399 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { 4402 if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
4400 tp->ucopy.len -= chunk; 4403 tp->ucopy.len -= chunk;
4401 tp->copied_seq += chunk; 4404 tp->copied_seq += chunk;
4402 eaten = (chunk == skb->len && !th->fin); 4405 eaten = (chunk == skb->len);
4403 tcp_rcv_space_adjust(sk); 4406 tcp_rcv_space_adjust(sk);
4404 } 4407 }
4405 local_bh_disable(); 4408 local_bh_disable();
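
The first hunk above replaces the RFC 3390 MSS-dependent initial window with a fixed TCP_INIT_CWND. A minimal userspace sketch (not kernel code) contrasting the two; the rfc3390 mapping mirrors the helper in include/net/tcp.h of this era, and TCP_INIT_CWND is 10 (the IW10 change):

#include <stdio.h>

/* RFC 3390: min(4*MSS, max(2*MSS, 4380 bytes)), expressed in packets */
static unsigned rfc3390_bytes_to_packets(unsigned smss)
{
	return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
}

#define TCP_INIT_CWND 10

int main(void)
{
	unsigned mss[] = { 536, 1095, 1460, 2190, 4096 };

	for (unsigned i = 0; i < sizeof(mss) / sizeof(mss[0]); i++)
		printf("mss %4u: rfc3390 iw = %u packets, fixed iw = %u packets\n",
		       mss[i], rfc3390_bytes_to_packets(mss[i]), TCP_INIT_CWND);
	return 0;
}

Note how the old scheme gave *fewer* initial packets for larger MSS; the new fixed window decouples the initial burst from the MSS entirely.
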
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 856f68466d49..f7e6c2c2d2bb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -149,9 +149,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct inet_sock *inet = inet_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+	__be16 orig_sport, orig_dport;
 	struct rtable *rt;
 	__be32 daddr, nexthop;
-	int tmp;
 	int err;
 
 	if (addr_len < sizeof(struct sockaddr_in))
@@ -167,14 +167,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		nexthop = inet->opt->faddr;
 	}
 
-	tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
-			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
-			       IPPROTO_TCP,
-			       inet->inet_sport, usin->sin_port, sk, 1);
-	if (tmp < 0) {
-		if (tmp == -ENETUNREACH)
+	orig_sport = inet->inet_sport;
+	orig_dport = usin->sin_port;
+	rt = ip_route_connect(nexthop, inet->inet_saddr,
+			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+			      IPPROTO_TCP,
+			      orig_sport, orig_dport, sk, true);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		if (err == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
-		return tmp;
+		return err;
 	}
 
 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
@@ -233,11 +236,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (err)
 		goto failure;
 
-	err = ip_route_newports(&rt, IPPROTO_TCP,
-				inet->inet_sport, inet->inet_dport, sk);
-	if (err)
+	rt = ip_route_newports(rt, IPPROTO_TCP,
+			       orig_sport, orig_dport,
+			       inet->inet_sport, inet->inet_dport, sk);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;
 		goto failure;
-
+	}
 	/* OK, now commit destination to socket. */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
@@ -1341,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		    tcp_death_row.sysctl_tw_recycle &&
 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
-		    peer->daddr.a4 == saddr) {
+		    peer->daddr.addr.a4 == saddr) {
 			inet_peer_refcheck(peer);
 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
 			    (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1556,12 +1562,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
 		sock_rps_save_rxhash(sk, skb->rxhash);
-		TCP_CHECK_TIMER(sk);
 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
 			rsk = sk;
 			goto reset;
 		}
-		TCP_CHECK_TIMER(sk);
 		return 0;
 	}
 
@@ -1583,13 +1587,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	} else
 		sock_rps_save_rxhash(sk, skb->rxhash);
 
-
-	TCP_CHECK_TIMER(sk);
 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
 		rsk = sk;
 		goto reset;
 	}
-	TCP_CHECK_TIMER(sk);
 	return 0;
 
 reset:
@@ -1994,7 +1995,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 			}
 			req = req->dl_next;
 		}
-		st->offset = 0;
 		if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
 			break;
get_req:
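
The tcp_v4_connect() hunks above convert the routing calls from "int return plus struct rtable ** out-parameter" to returning the route pointer itself, with errors encoded in the pointer. A userspace sketch of the kernel's ERR_PTR/PTR_ERR/IS_ERR convention (include/linux/err.h) that makes this work; route_lookup() is a hypothetical stand-in for ip_route_connect(), not the real API:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Errnos live in the last page of the address space, where no valid
 * pointer can point, so one return value carries both cases. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical lookup standing in for ip_route_connect(). */
static void *route_lookup(int fail)
{
	static int dummy_route;
	return fail ? ERR_PTR(-ENETUNREACH) : &dummy_route;
}

int main(void)
{
	void *rt = route_lookup(1);
	if (IS_ERR(rt))
		printf("lookup failed: err = %ld\n", PTR_ERR(rt));
	rt = route_lookup(0);
	if (!IS_ERR(rt))
		printf("lookup ok: rt = %p\n", rt);
	return 0;
}

This is why the converted callers must reset rt = NULL after PTR_ERR(): the error-encoded pointer is not NULL, and the failure path would otherwise try to release it.
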
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index de870377fbba..656d431c99ad 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
 		lp->last_drop = tcp_time_stamp;
 }
 
-static struct tcp_congestion_ops tcp_lp = {
+static struct tcp_congestion_ops tcp_lp __read_mostly = {
 	.flags = TCP_CONG_RTT_STAMP,
 	.init = tcp_lp_init,
 	.ssthresh = tcp_reno_ssthresh,
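
This and the following congestion-control hunks (tcp_scalable, tcp_vegas, tcp_veno, tcp_westwood, tcp_yeah) all add the same __read_mostly annotation to the ops tables. A sketch of what the annotation does: it places the object in a dedicated linker section so data that is written once at registration but read constantly on the fast path is grouped away from frequently-written data, avoiding false cache-line sharing. The macro mirrors include/linux/cache.h of this era; the struct is a trimmed stand-in, not the real tcp_congestion_ops:

#define __read_mostly __attribute__((__section__(".data..read_mostly")))

struct demo_ops {
	unsigned flags;
	unsigned (*ssthresh)(void *sk);
};

static unsigned demo_ssthresh(void *sk) { (void)sk; return 2; }

/* Written once at module load, then only read on every ACK. */
static struct demo_ops demo __read_mostly = {
	.flags = 0,
	.ssthresh = demo_ssthresh,
};

int main(void) { return demo.ssthresh(0) == 2 ? 0 : 1; }
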
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 406f320336e6..dfa5beb0c1c8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2162,7 +2162,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	if (!tp->retrans_stamp)
 		tp->retrans_stamp = TCP_SKB_CB(skb)->when;
 
-	tp->undo_retrans++;
+	tp->undo_retrans += tcp_skb_pcount(skb);
 
 	/* snd_nxt is stored to detect loss of retransmitted segment,
 	 * see tcp_input.c tcp_sacktag_write_queue().
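
Sketch of why the hunk above adds tcp_skb_pcount(skb) instead of 1: with GSO, one skb on the write queue can carry several MSS-sized segments, and undo_retrans has to count segments, since the D-SACK processing in tcp_input.c decrements it per segment. tcp_skb_pcount() reads the gso_segs estimate; the helper below only illustrates how that count relates payload length to MSS:

#include <stdio.h>

static unsigned pcount(unsigned payload_len, unsigned mss)
{
	return (payload_len + mss - 1) / mss;	/* DIV_ROUND_UP */
}

int main(void)
{
	unsigned undo_retrans = 0;

	/* Retransmit one 4380-byte GSO skb at mss 1460: three segments,
	 * so three D-SACKs may arrive and three decrements must balance. */
	undo_retrans += pcount(4380, 1460);
	printf("undo_retrans after GSO retransmit: %u\n", undo_retrans);
	return 0;
}
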
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index a76513779e2b..8ce55b8aaec8 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
 }
 
 
-static struct tcp_congestion_ops tcp_scalable = {
+static struct tcp_congestion_ops tcp_scalable __read_mostly = {
 	.ssthresh = tcp_scalable_ssthresh,
 	.cong_avoid = tcp_scalable_cong_avoid,
 	.min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74a6aa003657..ecd44b0c45f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data)
 		tcp_send_ack(sk);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
 	}
-	TCP_CHECK_TIMER(sk);
 
out:
 	if (tcp_memory_pressure)
@@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data)
 		tcp_probe_timer(sk);
 		break;
 	}
-	TCP_CHECK_TIMER(sk);
 
out:
 	sk_mem_reclaim(sk);
@@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data)
 		elapsed = keepalive_time_when(tp) - elapsed;
 	}
 
-	TCP_CHECK_TIMER(sk);
 	sk_mem_reclaim(sk);
 
resched:
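
These hunks, together with the tcp_ipv4.c ones above, delete every TCP_CHECK_TIMER() call site. By this point the macro had been defined away to an empty statement, so the calls compiled to nothing. A sketch of that debug-macro pattern; the config switch and helper name below are hypothetical illustrations, not the historical definition:

#include <stdio.h>

#ifdef CONFIG_TCP_TIMER_DEBUG				/* hypothetical switch */
#define TCP_CHECK_TIMER(sk)	tcp_validate_timers(sk)	/* hypothetical helper */
#else
#define TCP_CHECK_TIMER(sk)	do { } while (0)	/* compiles to nothing */
#endif

int main(void)
{
	int sk = 0;

	TCP_CHECK_TIMER(&sk);	/* a no-op: safe to delete at every call site */
	printf("no-op macro removed\n");
	return 0;
}
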
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c6743eec9b7d..80fa2bfd7ede 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
 
-static struct tcp_congestion_ops tcp_vegas = {
+static struct tcp_congestion_ops tcp_vegas __read_mostly = {
 	.flags = TCP_CONG_RTT_STAMP,
 	.init = tcp_vegas_init,
 	.ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 38bc0b52d745..ac43cd747bce 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
 	return max(tp->snd_cwnd >> 1U, 2U);
 }
 
-static struct tcp_congestion_ops tcp_veno = {
+static struct tcp_congestion_ops tcp_veno __read_mostly = {
 	.flags = TCP_CONG_RTT_STAMP,
 	.init = tcp_veno_init,
 	.ssthresh = tcp_veno_ssthresh,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index a534dda5456e..1b91bf48e277 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
 }
 
 
-static struct tcp_congestion_ops tcp_westwood = {
+static struct tcp_congestion_ops tcp_westwood __read_mostly = {
 	.init = tcp_westwood_init,
 	.ssthresh = tcp_reno_ssthresh,
 	.cong_avoid = tcp_reno_cong_avoid,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a0f240358892..dc7f43179c9a 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
 	return tp->snd_cwnd - reduction;
 }
 
-static struct tcp_congestion_ops tcp_yeah = {
+static struct tcp_congestion_ops tcp_yeah __read_mostly = {
 	.flags = TCP_CONG_RTT_STAMP,
 	.init = tcp_yeah_init,
 	.ssthresh = tcp_yeah_ssthresh,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959ee..588f47af5faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -663,75 +663,72 @@ void udp_flush_pending_frames(struct sock *sk)
 EXPORT_SYMBOL(udp_flush_pending_frames);
 
 /**
- * udp4_hwcsum_outgoing  -  handle outgoing HW checksumming
- * @sk:		socket we are sending on
+ * udp4_hwcsum  -  handle outgoing HW checksumming
  * @skb:	sk_buff containing the filled-in UDP header
  *		(checksum field must be zeroed out)
+ * @src:	source IP address
+ * @dst:	destination IP address
  */
-static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
-				 __be32 src, __be32 dst, int len)
+static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
 {
-	unsigned int offset;
 	struct udphdr *uh = udp_hdr(skb);
+	struct sk_buff *frags = skb_shinfo(skb)->frag_list;
+	int offset = skb_transport_offset(skb);
+	int len = skb->len - offset;
+	int hlen = len;
 	__wsum csum = 0;
 
-	if (skb_queue_len(&sk->sk_write_queue) == 1) {
+	if (!frags) {
 		/*
 		 * Only one fragment on the socket.
 		 */
 		skb->csum_start = skb_transport_header(skb) - skb->head;
 		skb->csum_offset = offsetof(struct udphdr, check);
-		uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
+		uh->check = ~csum_tcpudp_magic(src, dst, len,
					       IPPROTO_UDP, 0);
	} else {
 		/*
 		 * HW-checksum won't work as there are two or more
 		 * fragments on the socket so that all csums of sk_buffs
 		 * should be together
 		 */
-		offset = skb_transport_offset(skb);
-		skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+		do {
+			csum = csum_add(csum, frags->csum);
+			hlen -= frags->len;
+		} while ((frags = frags->next));
 
+		csum = skb_checksum(skb, offset, hlen, csum);
 		skb->ip_summed = CHECKSUM_NONE;
 
-		skb_queue_walk(&sk->sk_write_queue, skb) {
-			csum = csum_add(csum, skb->csum);
-		}
-
 		uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
 	}
 }
 
-/*
- * Push out all pending data as one UDP datagram. Socket is locked.
- */
-static int udp_push_pending_frames(struct sock *sk)
+static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport)
 {
-	struct udp_sock  *up = udp_sk(sk);
+	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
-	struct flowi *fl = &inet->cork.fl;
-	struct sk_buff *skb;
 	struct udphdr *uh;
+	struct rtable *rt = (struct rtable *)skb_dst(skb);
 	int err = 0;
 	int is_udplite = IS_UDPLITE(sk);
+	int offset = skb_transport_offset(skb);
+	int len = skb->len - offset;
 	__wsum csum = 0;
 
-	/* Grab the skbuff where UDP header space exists. */
-	if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
-		goto out;
-
 	/*
 	 * Create a UDP header
 	 */
 	uh = udp_hdr(skb);
-	uh->source = fl->fl_ip_sport;
-	uh->dest = fl->fl_ip_dport;
-	uh->len = htons(up->len);
+	uh->source = inet->inet_sport;
+	uh->dest = dport;
+	uh->len = htons(len);
 	uh->check = 0;
 
 	if (is_udplite)					 /*     UDP-Lite      */
-		csum = udplite_csum_outgoing(sk, skb);
+		csum = udplite_csum(skb);
 
 	else if (sk->sk_no_check == UDP_CSUM_NOXMIT) {	 /* UDP csum disabled */
 
@@ -740,20 +737,20 @@ static int udp_push_pending_frames(struct sock *sk)
 
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
 
-		udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len);
+		udp4_hwcsum(skb, rt->rt_src, daddr);
 		goto send;
 
-	} else						 /*   `normal' UDP    */
-		csum = udp_csum_outgoing(sk, skb);
+	} else
+		csum = udp_csum(skb);
 
 	/* add protocol-dependent pseudo-header */
-	uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
+	uh->check = csum_tcpudp_magic(rt->rt_src, daddr, len,
				      sk->sk_protocol, csum);
 	if (uh->check == 0)
 		uh->check = CSUM_MANGLED_0;
 
send:
-	err = ip_push_pending_frames(sk);
+	err = ip_send_skb(skb);
 	if (err) {
 		if (err == -ENOBUFS && !inet->recverr) {
 			UDP_INC_STATS_USER(sock_net(sk),
@@ -763,6 +760,26 @@ send:
 	} else
 		UDP_INC_STATS_USER(sock_net(sk),
 				   UDP_MIB_OUTDATAGRAMS, is_udplite);
+	return err;
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+static int udp_push_pending_frames(struct sock *sk)
+{
+	struct udp_sock *up = udp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
+	struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
+	struct sk_buff *skb;
+	int err = 0;
+
+	skb = ip_finish_skb(sk);
+	if (!skb)
+		goto out;
+
+	err = udp_send_skb(skb, fl4->daddr, fl4->fl4_dport);
+
out:
 	up->len = 0;
 	up->pending = 0;
@@ -774,6 +791,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct udp_sock *up = udp_sk(sk);
+	struct flowi4 *fl4;
 	int ulen = len;
 	struct ipcm_cookie ipc;
 	struct rtable *rt = NULL;
@@ -785,6 +803,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	int err, is_udplite = IS_UDPLITE(sk);
 	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+	struct sk_buff *skb;
 
 	if (len > 0xFFFF)
 		return -EMSGSIZE;
@@ -799,6 +818,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
 
+	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
+
 	if (up->pending) {
 		/*
 		 * There are pending frames.
@@ -888,20 +909,25 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	rt = (struct rtable *)sk_dst_check(sk, 0);
 
 	if (rt == NULL) {
-		struct flowi fl = { .oif = ipc.oif,
-				    .mark = sk->sk_mark,
-				    .fl4_dst = faddr,
-				    .fl4_src = saddr,
-				    .fl4_tos = tos,
-				    .proto = sk->sk_protocol,
-				    .flags = inet_sk_flowi_flags(sk),
-				    .fl_ip_sport = inet->inet_sport,
-				    .fl_ip_dport = dport };
+		struct flowi4 fl4 = {
+			.flowi4_oif = ipc.oif,
+			.flowi4_mark = sk->sk_mark,
+			.daddr = faddr,
+			.saddr = saddr,
+			.flowi4_tos = tos,
+			.flowi4_proto = sk->sk_protocol,
+			.flowi4_flags = (inet_sk_flowi_flags(sk) |
+					 FLOWI_FLAG_CAN_SLEEP),
+			.fl4_sport = inet->inet_sport,
+			.fl4_dport = dport,
+		};
 		struct net *net = sock_net(sk);
 
-		security_sk_classify_flow(sk, &fl);
-		err = ip_route_output_flow(net, &rt, &fl, sk, 1);
-		if (err) {
+		security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+		rt = ip_route_output_flow(net, &fl4, sk);
+		if (IS_ERR(rt)) {
+			err = PTR_ERR(rt);
+			rt = NULL;
 			if (err == -ENETUNREACH)
 				IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 			goto out;
@@ -923,6 +949,17 @@ back_from_confirm:
 	if (!ipc.addr)
 		daddr = ipc.addr = rt->rt_dst;
 
+	/* Lockless fast path for the non-corking case. */
+	if (!corkreq) {
+		skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen,
+				  sizeof(struct udphdr), &ipc, &rt,
+				  msg->msg_flags);
+		err = PTR_ERR(skb);
+		if (skb && !IS_ERR(skb))
+			err = udp_send_skb(skb, daddr, dport);
+		goto out;
+	}
+
 	lock_sock(sk);
 	if (unlikely(up->pending)) {
 		/* The socket is already corked while preparing it. */
@@ -936,15 +973,15 @@ back_from_confirm:
 	/*
 	 *	Now cork the socket to pend data.
 	 */
-	inet->cork.fl.fl4_dst = daddr;
-	inet->cork.fl.fl_ip_dport = dport;
-	inet->cork.fl.fl4_src = saddr;
-	inet->cork.fl.fl_ip_sport = inet->inet_sport;
+	fl4 = &inet->cork.fl.u.ip4;
+	fl4->daddr = daddr;
+	fl4->saddr = saddr;
+	fl4->fl4_dport = dport;
+	fl4->fl4_sport = inet->inet_sport;
 	up->pending = AF_INET;
 
do_append_data:
 	up->len += ulen;
-	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
 	err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
			     sizeof(struct udphdr), &ipc, &rt,
			     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
@@ -2199,7 +2236,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
 	return 0;
 }
 
-struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
+struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	unsigned int mss;
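
The udp4_hwcsum() hunk above folds the per-fragment partial checksums on the frag_list into one sum before adding the pseudo-header. A userspace sketch of why that folding is valid: the Internet checksum is a one's-complement sum, so partial sums over concatenated buffers simply add. csum_partial/csum_fold below mimic the kernel helpers in spirit; the data is invented for the demo:

#include <stdio.h>
#include <stdint.h>

static uint32_t csum_partial(const uint8_t *buf, int len, uint32_t sum)
{
	for (int i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)buf[i] << 8 | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)			/* end-around carry */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t frag1[] = { 0xde, 0xad, 0xbe, 0xef };
	uint8_t frag2[] = { 0x01, 0x02, 0x03, 0x04 };
	uint8_t whole[] = { 0xde, 0xad, 0xbe, 0xef, 0x01, 0x02, 0x03, 0x04 };

	/* Accumulating per-fragment sums equals checksumming the whole. */
	uint32_t sum = csum_partial(frag1, sizeof(frag1), 0);
	sum = csum_partial(frag2, sizeof(frag2), sum);
	printf("fragments: %04x, whole: %04x\n",
	       csum_fold(sum),
	       csum_fold(csum_partial(whole, sizeof(whole), 0)));
	return 0;
}
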
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40addec..13e0e7f659ff 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -19,25 +19,23 @@
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
 static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
-					  xfrm_address_t *saddr,
-					  xfrm_address_t *daddr)
+					  const xfrm_address_t *saddr,
+					  const xfrm_address_t *daddr)
 {
-	struct flowi fl = {
-		.fl4_dst = daddr->a4,
-		.fl4_tos = tos,
+	struct flowi4 fl4 = {
+		.daddr = daddr->a4,
+		.flowi4_tos = tos,
 	};
-	struct dst_entry *dst;
 	struct rtable *rt;
-	int err;
 
 	if (saddr)
-		fl.fl4_src = saddr->a4;
+		fl4.saddr = saddr->a4;
+
+	rt = __ip_route_output_key(net, &fl4);
+	if (!IS_ERR(rt))
+		return &rt->dst;
 
-	err = __ip_route_output_key(net, &rt, &fl);
-	dst = &rt->dst;
-	if (err)
-		dst = ERR_PTR(err);
-	return dst;
+	return ERR_CAST(rt);
 }
 
 static int xfrm4_get_saddr(struct net *net,
@@ -56,9 +54,9 @@ static int xfrm4_get_saddr(struct net *net,
 	return 0;
 }
 
-static int xfrm4_get_tos(struct flowi *fl)
+static int xfrm4_get_tos(const struct flowi *fl)
 {
-	return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
+	return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
 }
 
 static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -68,11 +66,17 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 }
 
 static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
-			  struct flowi *fl)
+			  const struct flowi *fl)
 {
 	struct rtable *rt = (struct rtable *)xdst->route;
+	const struct flowi4 *fl4 = &fl->u.ip4;
 
-	xdst->u.rt.fl = *fl;
+	rt->rt_key_dst = fl4->daddr;
+	rt->rt_key_src = fl4->saddr;
+	rt->rt_tos = fl4->flowi4_tos;
+	rt->rt_iif = fl4->flowi4_iif;
+	rt->rt_oif = fl4->flowi4_oif;
+	rt->rt_mark = fl4->flowi4_mark;
 
 	xdst->u.dst.dev = dev;
 	dev_hold(dev);
@@ -99,9 +103,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 {
 	struct iphdr *iph = ip_hdr(skb);
 	u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
+	struct flowi4 *fl4 = &fl->u.ip4;
 
-	memset(fl, 0, sizeof(struct flowi));
-	fl->mark = skb->mark;
+	memset(fl4, 0, sizeof(struct flowi4));
+	fl4->flowi4_mark = skb->mark;
 
 	if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
 		switch (iph->protocol) {
@@ -114,8 +119,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 			    pskb_may_pull(skb, xprth + 4 - skb->data)) {
 				__be16 *ports = (__be16 *)xprth;
 
-				fl->fl_ip_sport = ports[!!reverse];
-				fl->fl_ip_dport = ports[!reverse];
+				fl4->fl4_sport = ports[!!reverse];
+				fl4->fl4_dport = ports[!reverse];
 			}
 			break;
 
@@ -123,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 			if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
 				u8 *icmp = xprth;
 
-				fl->fl_icmp_type = icmp[0];
-				fl->fl_icmp_code = icmp[1];
+				fl4->fl4_icmp_type = icmp[0];
+				fl4->fl4_icmp_code = icmp[1];
 			}
 			break;
 
@@ -132,7 +137,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
 				__be32 *ehdr = (__be32 *)xprth;
 
-				fl->fl_ipsec_spi = ehdr[0];
+				fl4->fl4_ipsec_spi = ehdr[0];
 			}
 			break;
 
@@ -140,7 +145,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 			if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
 				__be32 *ah_hdr = (__be32*)xprth;
 
-				fl->fl_ipsec_spi = ah_hdr[1];
+				fl4->fl4_ipsec_spi = ah_hdr[1];
 			}
 			break;
 
@@ -148,7 +153,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
 				__be16 *ipcomp_hdr = (__be16 *)xprth;
 
-				fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
+				fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
 			}
 			break;
 
@@ -160,20 +165,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
 				if (greflags[0] & GRE_KEY) {
 					if (greflags[0] & GRE_CSUM)
 						gre_hdr++;
-					fl->fl_gre_key = gre_hdr[1];
+					fl4->fl4_gre_key = gre_hdr[1];
 				}
 			}
 			break;
 
 		default:
-			fl->fl_ipsec_spi = 0;
+			fl4->fl4_ipsec_spi = 0;
 			break;
 		}
 	}
-	fl->proto = iph->protocol;
-	fl->fl4_dst = reverse ? iph->saddr : iph->daddr;
-	fl->fl4_src = reverse ? iph->daddr : iph->saddr;
-	fl->fl4_tos = iph->tos;
+	fl4->flowi4_proto = iph->protocol;
+	fl4->daddr = reverse ? iph->saddr : iph->daddr;
+	fl4->saddr = reverse ? iph->daddr : iph->saddr;
+	fl4->flowi4_tos = iph->tos;
 }
 
 static inline int xfrm4_garbage_collect(struct dst_ops *ops)
@@ -196,8 +201,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
+	dst_destroy_metrics_generic(dst);
+
 	if (likely(xdst->u.rt.peer))
 		inet_putpeer(xdst->u.rt.peer);
+
 	xfrm_dst_destroy(xdst);
 }
 
@@ -215,6 +223,7 @@ static struct dst_ops xfrm4_dst_ops = {
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.gc =			xfrm4_garbage_collect,
 	.update_pmtu =		xfrm4_update_pmtu,
+	.cow_metrics =		dst_cow_metrics_generic,
 	.destroy =		xfrm4_dst_destroy,
 	.ifdown =		xfrm4_dst_ifdown,
 	.local_out =		__ip_local_out,
@@ -230,6 +239,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.get_tos =		xfrm4_get_tos,
 	.init_path =		xfrm4_init_path,
 	.fill_dst =		xfrm4_fill_dst,
+	.blackhole_route =	ipv4_blackhole_route,
 };
 
 #ifdef CONFIG_SYSCTL
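
These hunks (and the udp.c ones above) track the flowi reorganization: the old flat struct flowi with fl4_*/fl_ip_* fields became a container holding per-family flow keys, so IPv4 code fills and reads only the ip4 member. A trimmed userspace sketch of that union pattern; the field names follow the new kernel layout, but the structs are cut down to a few members and the conversion helper only approximates the kernel's container_of()-based flowi4_to_flowi():

#include <stdio.h>
#include <stdint.h>

struct flowi4 {
	int flowi4_oif;
	uint32_t flowi4_mark;
	uint8_t flowi4_tos;
	uint8_t flowi4_proto;
	uint32_t daddr, saddr;		/* __be32 in the kernel */
};

struct flowi6 {
	int flowi6_oif;
	uint8_t daddr[16], saddr[16];
};

struct flowi {
	union {
		struct flowi4 ip4;
		struct flowi6 ip6;
	} u;
};

/* Works because the ip4 key sits at the head of the union; the kernel
 * derives the container pointer with container_of() instead. */
static struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
{
	return (struct flowi *)fl4;
}

int main(void)
{
	struct flowi4 fl4 = { .daddr = 0x0100007f, .flowi4_tos = 0x10 };
	struct flowi *fl = flowi4_to_flowi(&fl4);

	printf("tos via generic flowi: %#x\n", fl->u.ip4.flowi4_tos);
	return 0;
}
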
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 47947624eccc..1717c64628d1 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x)
 }
 
 static void
-__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
 {
-	sel->daddr.a4 = fl->fl4_dst;
-	sel->saddr.a4 = fl->fl4_src;
-	sel->dport = xfrm_flowi_dport(fl);
+	const struct flowi4 *fl4 = &fl->u.ip4;
+
+	sel->daddr.a4 = fl4->daddr;
+	sel->saddr.a4 = fl4->saddr;
+	sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
 	sel->dport_mask = htons(0xffff);
-	sel->sport = xfrm_flowi_sport(fl);
+	sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
 	sel->sport_mask = htons(0xffff);
 	sel->family = AF_INET;
 	sel->prefixlen_d = 32;
 	sel->prefixlen_s = 32;
-	sel->proto = fl->proto;
-	sel->ifindex = fl->oif;
+	sel->proto = fl4->flowi4_proto;
+	sel->ifindex = fl4->flowi4_oif;
 }
 
 static void
-xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
-		   xfrm_address_t *daddr, xfrm_address_t *saddr)
+xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
+		   const xfrm_address_t *daddr, const xfrm_address_t *saddr)
 {
 	x->id = tmpl->id;
 	if (x->id.daddr.a4 == 0)