Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 42
-rw-r--r--  net/ipv4/Makefile | 4
-rw-r--r--  net/ipv4/af_inet.c | 46
-rw-r--r--  net/ipv4/ah4.c | 27
-rw-r--r--  net/ipv4/arp.c | 25
-rw-r--r--  net/ipv4/datagram.c | 11
-rw-r--r--  net/ipv4/devinet.c | 78
-rw-r--r--  net/ipv4/esp4.c | 104
-rw-r--r--  net/ipv4/fib_frontend.c | 105
-rw-r--r--  net/ipv4/fib_hash.c | 1133
-rw-r--r--  net/ipv4/fib_lookup.h | 10
-rw-r--r--  net/ipv4/fib_rules.c | 25
-rw-r--r--  net/ipv4/fib_semantics.c | 257
-rw-r--r--  net/ipv4/fib_trie.c | 272
-rw-r--r--  net/ipv4/icmp.c | 240
-rw-r--r--  net/ipv4/igmp.c | 45
-rw-r--r--  net/ipv4/inet_connection_sock.c | 27
-rw-r--r--  net/ipv4/inetpeer.c | 148
-rw-r--r--  net/ipv4/ip_gre.c | 56
-rw-r--r--  net/ipv4/ip_input.c | 2
-rw-r--r--  net/ipv4/ip_output.c | 345
-rw-r--r--  net/ipv4/ipip.c | 39
-rw-r--r--  net/ipv4/ipmr.c | 79
-rw-r--r--  net/ipv4/netfilter.c | 36
-rw-r--r--  net/ipv4/netfilter/Kconfig | 13
-rw-r--r--  net/ipv4/netfilter/Makefile | 1
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 5
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 5
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 7
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 3
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 134
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 33
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 9
-rw-r--r--  net/ipv4/raw.c | 39
-rw-r--r--  net/ipv4/route.c | 1179
-rw-r--r--  net/ipv4/syncookies.c | 25
-rw-r--r--  net/ipv4/tcp.c | 20
-rw-r--r--  net/ipv4/tcp_bic.c | 2
-rw-r--r--  net/ipv4/tcp_cubic.c | 2
-rw-r--r--  net/ipv4/tcp_highspeed.c | 2
-rw-r--r--  net/ipv4/tcp_htcp.c | 2
-rw-r--r--  net/ipv4/tcp_hybla.c | 2
-rw-r--r--  net/ipv4/tcp_illinois.c | 2
-rw-r--r--  net/ipv4/tcp_input.c | 2
-rw-r--r--  net/ipv4/tcp_ipv4.c | 37
-rw-r--r--  net/ipv4/tcp_lp.c | 2
-rw-r--r--  net/ipv4/tcp_scalable.c | 2
-rw-r--r--  net/ipv4/tcp_timer.c | 3
-rw-r--r--  net/ipv4/tcp_vegas.c | 2
-rw-r--r--  net/ipv4/tcp_veno.c | 2
-rw-r--r--  net/ipv4/tcp_westwood.c | 2
-rw-r--r--  net/ipv4/tcp_yeah.c | 2
-rw-r--r--  net/ipv4/udp.c | 139
-rw-r--r--  net/ipv4/xfrm4_policy.c | 74
-rw-r--r--  net/ipv4/xfrm4_state.c | 20
59 files changed, 1931 insertions, 3033 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d1..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
-choice
-	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
-	depends on IP_ADVANCED_ROUTER
-	default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algorithm.
-	  This improves lookup performance if you have a large
-	  number of routes.
-
-	  LC-trie is a longest matching prefix lookup algorithm which
-	  performs better than FIB_HASH for large routing tables.
-	  But, it consumes more memory and is more complex.
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
-	  June 1999
-
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
-	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
 config IP_FIB_TRIE_STATS
 	bool "FIB TRIE statistics"
-	depends on IP_FIB_TRIE
+	depends on IP_ADVANCED_ROUTER
 	---help---
 	  Keep track of statistics on structure of FIB TRIE table.
 	  Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
 	  handled by the klogd daemon which is responsible for kernel messages
 	  ("man klogd").
 
+config IP_ROUTE_CLASSID
+	bool
+
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
 	help
@@ -657,4 +624,3 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
-
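The help text deleted above describes LC-trie lookup only in prose. As a point of reference, longest-prefix matching — the operation both FIB_HASH and FIB_TRIE implement — can be sketched in userspace with a plain one-bit-per-node binary trie. This toy illustrates the semantics only; the kernel's fib_trie is a level- and path-compressed (LC) variant and shares none of this code:

	/* Toy longest-prefix match with a one-bit-per-node binary trie.
	 * NOT the kernel's level-compressed fib_trie, just a sketch. */
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	struct node {
		struct node *child[2];
		int have_route;		/* a prefix terminates here */
		uint32_t nexthop;	/* illustrative payload */
	};

	static struct node *node_new(void)
	{
		return calloc(1, sizeof(struct node));
	}

	static void trie_insert(struct node *root, uint32_t prefix, int plen,
				uint32_t nexthop)
	{
		struct node *n = root;
		int i;

		for (i = 0; i < plen; i++) {
			int bit = (prefix >> (31 - i)) & 1;

			if (!n->child[bit])
				n->child[bit] = node_new();
			n = n->child[bit];
		}
		n->have_route = 1;
		n->nexthop = nexthop;
	}

	/* Walk toward the address, remembering the deepest match seen. */
	static int trie_lookup(struct node *root, uint32_t addr, uint32_t *nexthop)
	{
		struct node *n = root;
		int found = 0, i;

		for (i = 0; n; i++) {
			if (n->have_route) {
				*nexthop = n->nexthop;	/* deepest match so far */
				found = 1;
			}
			if (i == 32)
				break;
			n = n->child[(addr >> (31 - i)) & 1];
		}
		return found;
	}

	int main(void)
	{
		struct node *root = node_new();
		uint32_t nh;

		trie_insert(root, 0x0A000000, 8, 1);	/* 10.0.0.0/8  -> nh 1 */
		trie_insert(root, 0x0A010000, 16, 2);	/* 10.1.0.0/16 -> nh 2 */
		if (trie_lookup(root, 0x0A010203, &nh))	/* 10.1.2.3 */
			printf("nexthop %u\n", nh);	/* prints 2: the /16 wins */
		return 0;
	}

Compiled standalone, the lookup for 10.1.2.3 returns nexthop 2 because the /16 entry is the longest prefix covering the address; that is the contract FIB_TRIE now satisfies for all configurations.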
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a75..0dc772d0d125 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     fib_frontend.o fib_semantics.o \
+	     fib_frontend.o fib_semantics.o fib_trie.o \
 	     inet_fragment.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 45b89d7bda5a..807d83c02ef6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1101,23 +1101,20 @@ int sysctl_ip_dynaddr __read_mostly;
 static int inet_sk_reselect_saddr(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int err;
-	struct rtable *rt;
 	__be32 old_saddr = inet->inet_saddr;
-	__be32 new_saddr;
 	__be32 daddr = inet->inet_daddr;
+	struct rtable *rt;
+	__be32 new_saddr;
 
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
 
 	/* Query new route. */
-	err = ip_route_connect(&rt, daddr, 0,
-			       RT_CONN_FLAGS(sk),
-			       sk->sk_bound_dev_if,
-			       sk->sk_protocol,
-			       inet->inet_sport, inet->inet_dport, sk, 0);
-	if (err)
-		return err;
+	rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk),
+			      sk->sk_bound_dev_if, sk->sk_protocol,
+			      inet->inet_sport, inet->inet_dport, sk, false);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
 
 	sk_setup_caps(sk, &rt->dst);
 
@@ -1160,25 +1157,16 @@ int inet_sk_rebuild_header(struct sock *sk)
 	daddr = inet->inet_daddr;
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
-{
-	struct flowi fl = {
-		.oif = sk->sk_bound_dev_if,
-		.mark = sk->sk_mark,
-		.fl4_dst = daddr,
-		.fl4_src = inet->inet_saddr,
-		.fl4_tos = RT_CONN_FLAGS(sk),
-		.proto = sk->sk_protocol,
-		.flags = inet_sk_flowi_flags(sk),
-		.fl_ip_sport = inet->inet_sport,
-		.fl_ip_dport = inet->inet_dport,
-	};
-
-	security_sk_classify_flow(sk, &fl);
-	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
-	if (!err)
+	rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr,
+				   inet->inet_dport, inet->inet_sport,
+				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_bound_dev_if);
+	if (!IS_ERR(rt)) {
+		err = 0;
 		sk_setup_caps(sk, &rt->dst);
-	else {
+	} else {
+		err = PTR_ERR(rt);
+
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
 		/*
@@ -1231,7 +1219,7 @@ out:
 	return err;
 }
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct iphdr *iph;
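The conversions above all follow one pattern: routing lookups that used to fill a struct rtable ** output argument and return an int now return the struct rtable * directly, encoding the errno in the pointer itself when the lookup fails. A minimal userspace sketch of the ERR_PTR/IS_ERR/PTR_ERR convention, with stand-in types rather than the kernel's headers:

	/* Sketch of the error-pointer calling convention, not kernel code:
	 * an errno in [-4095, -1] is smuggled inside the returned pointer. */
	#include <stdio.h>

	#define MAX_ERRNO	4095

	static inline void *ERR_PTR(long error)
	{
		return (void *)error;
	}

	static inline long PTR_ERR(const void *ptr)
	{
		return (long)ptr;
	}

	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	struct rtable { int dummy; };

	static struct rtable *lookup_route(int fail)
	{
		static struct rtable rt;

		return fail ? ERR_PTR(-113 /* EHOSTUNREACH */) : &rt;
	}

	int main(void)
	{
		struct rtable *rt = lookup_route(1);

		if (IS_ERR(rt))		/* error case: no output parameter needed */
			printf("error %ld\n", PTR_ERR(rt));
		return 0;
	}

The payoff in the diff is visible in inet_sk_reselect_saddr(): one local variable fewer, and the error path collapses to IS_ERR()/PTR_ERR().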
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 86961bec70ab..4286fd3cc0e2 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
-	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
-	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
-	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
-		goto out;
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
 
 	if (!pskb_may_pull(skb, ah_hlen))
 		goto out;
@@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x)
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
-					  ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	x->data = ahp;
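The new XFRM_STATE_ALIGN4 branches exist because RFC 4302 requires only 32-bit alignment of the AH header for IPv4, while this code historically padded the ICV to a 64-bit boundary. A worked example of the hdrlen arithmetic, assuming a 12-byte fixed AH header and a hypothetical 16-byte truncated ICV (both sizes are illustrative; 16 bytes matches e.g. HMAC-SHA-256-128):

	/* Worked example of the two AH header-length computations above. */
	#include <stdio.h>

	#define ALIGN_TO(len, a)	(((len) + (a) - 1) & ~((a) - 1))
	#define XFRM_ALIGN4(len)	ALIGN_TO(len, 4)
	#define XFRM_ALIGN8(len)	ALIGN_TO(len, 8)

	int main(void)
	{
		int fixed_hdr = 12;	/* AH header without the ICV (assumed) */
		int icv_trunc_len = 16;	/* illustrative truncated ICV size */

		/* ah->hdrlen counts 32-bit words minus 2, per RFC 4302 */
		int hdrlen8 = (XFRM_ALIGN8(fixed_hdr + icv_trunc_len) >> 2) - 2;
		int hdrlen4 = (XFRM_ALIGN4(fixed_hdr + icv_trunc_len) >> 2) - 2;

		/* 12+16=28: ALIGN8 pads to 32 -> hdrlen 6 (32 bytes on the wire),
		 * ALIGN4 keeps 28 -> hdrlen 5 (28 bytes, no padding) */
		printf("ALIGN8: %d, ALIGN4: %d\n", hdrlen8, hdrlen4);
		return 0;
	}

The input path has to accept both encodings for the same reason: ah_hlen reconstructed from the wire may reflect either alignment, depending on what the peer negotiated.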
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7927589813b5..090d273d7865 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -433,14 +433,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
 
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
-	struct flowi fl = { .fl4_dst = sip,
-			    .fl4_src = tip };
 	struct rtable *rt;
 	int flag = 0;
 	/*unsigned long now; */
 	struct net *net = dev_net(dev);
 
-	if (ip_route_output_key(net, &rt, &fl) < 0)
+	rt = ip_route_output(net, sip, tip, 0, 0);
+	if (IS_ERR(rt))
 		return 1;
 	if (rt->dst.dev != dev) {
 		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
@@ -1061,12 +1060,10 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .fl4_dst = ip,
-				    .fl4_tos = RTO_ONLINK };
-		struct rtable *rt;
-		err = ip_route_output_key(net, &rt, &fl);
-		if (err != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
@@ -1177,7 +1174,6 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
 static int arp_req_delete(struct net *net, struct arpreq *r,
 			  struct net_device *dev)
 {
-	int err;
 	__be32 ip;
 
 	if (r->arp_flags & ATF_PUBL)
@@ -1185,12 +1181,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .fl4_dst = ip,
-				    .fl4_tos = RTO_ONLINK };
-		struct rtable *rt;
-		err = ip_route_output_key(net, &rt, &fl);
-		if (err != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 174be6caa5c8..85bd24ca4f6d 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -46,11 +46,12 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		if (!saddr)
 			saddr = inet->mc_addr;
 	}
-	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
-			       RT_CONN_FLAGS(sk), oif,
-			       sk->sk_protocol,
-			       inet->inet_sport, usin->sin_port, sk, 1);
-	if (err) {
+	rt = ip_route_connect(usin->sin_addr.s_addr, saddr,
+			      RT_CONN_FLAGS(sk), oif,
+			      sk->sk_protocol,
+			      inet->inet_sport, usin->sin_port, sk, true);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 		return err;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 036652c8166d..6d85800daeb7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -92,6 +93,71 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]		= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value.  So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE	256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+
+	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
+		(IN4_ADDR_HSIZE - 1));
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
+
+	spin_lock(&inet_addr_hash_lock);
+	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+	spin_lock(&inet_addr_hash_lock);
+	hlist_del_init_rcu(&ifa->hash);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	unsigned int hash = inet_addr_hash(net, addr);
+	struct net_device *result = NULL;
+	struct in_ifaddr *ifa;
+	struct hlist_node *node;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+		struct net_device *dev = ifa->ifa_dev->dev;
+
+		if (!net_eq(dev_net(dev), net))
+			continue;
+		if (ifa->ifa_local == addr) {
+			result = dev;
+			break;
+		}
+	}
+	if (result && devref)
+		dev_hold(result);
+	rcu_read_unlock();
+	return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
 static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +331,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 
 		if (!do_promote) {
+			inet_hash_remove(ifa);
 			*ifap1 = ifa->ifa_next;
 
 			rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -281,6 +348,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
+	inet_hash_remove(ifa1);
 
 	/* 3. Announce address deletion */
 
@@ -368,6 +436,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	ifa->ifa_next = *ifap;
 	*ifap = ifa;
 
+	inet_hash_insert(dev_net(in_dev->dev), ifa);
+
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
@@ -521,6 +591,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
 	if (tb[IFA_ADDRESS] == NULL)
 		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
 
+	INIT_HLIST_NODE(&ifa->hash);
 	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
 	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
 	ifa->ifa_flags = ifm->ifa_flags;
@@ -728,6 +799,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (!ifa) {
 			ret = -ENOBUFS;
 			ifa = inet_alloc_ifa();
+			INIT_HLIST_NODE(&ifa->hash);
 			if (!ifa)
 				break;
 			if (colon)
@@ -1084,6 +1156,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			struct in_ifaddr *ifa = inet_alloc_ifa();
 
 			if (ifa) {
+				INIT_HLIST_NODE(&ifa->hash);
 				ifa->ifa_local =
 				  ifa->ifa_address = htonl(INADDR_LOOPBACK);
 				ifa->ifa_prefixlen = 8;
@@ -1720,6 +1793,11 @@ static struct rtnl_af_ops inet_af_ops = {
 
 void __init devinet_init(void)
 {
+	int i;
+
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
 	register_pernet_subsys(&devinet_ops);
 
 	register_gifconf(PF_INET, inet_gifconf);
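__ip_dev_find() now resolves a source address through the 256-bucket inet_addr_lst hash instead of a FIB lookup in the local table (the old implementation is deleted from fib_frontend.c further down). The bucket index folds all four address bytes, XORed with a per-namespace value, into eight bits. A userspace model of that fold, with hash_ptr(net, 8) replaced by an arbitrary salt since it is kernel-internal:

	/* Userspace model of the 256-bucket address hash added above; the
	 * per-netns hash_ptr(net, 8) mix is replaced by an arbitrary salt. */
	#include <stdio.h>
	#include <stdint.h>

	#define IN4_ADDR_HSIZE	256

	static unsigned int inet_addr_hash_model(uint32_t addr_be, uint32_t net_salt)
	{
		uint32_t val = addr_be ^ net_salt;

		/* Fold all four bytes into one so every byte of the address
		 * influences the bucket index. */
		return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
		       (IN4_ADDR_HSIZE - 1);
	}

	int main(void)
	{
		/* 192.168.1.1 in big-endian byte order, arbitrary salt */
		printf("bucket %u\n", inet_addr_hash_model(0xc0a80101, 0x5e));
		return 0;
	}

Folding rather than truncating matters here: local addresses typically differ only in the low byte, so using the low byte alone would pile a subnet's addresses into adjacent buckets, while the XOR fold spreads them.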
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index e42a905180f0..03f994bcf7de 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -33,11 +33,14 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
  *
  * TODO: Use spare space in skb for this where possible.
  */
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
 {
 	unsigned int len;
 
-	len = crypto_aead_ivsize(aead);
+	len = seqhilen;
+
+	len += crypto_aead_ivsize(aead);
+
 	if (len) {
 		len += crypto_aead_alignmask(aead) &
 		       ~(crypto_tfm_ctx_alignment() - 1);
@@ -52,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
 	return kmalloc(len, GFP_ATOMIC);
 }
 
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp)
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+	return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
 {
 	return crypto_aead_ivsize(aead) ?
-	    PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp;
+	    PTR_ALIGN((u8 *)tmp + seqhilen,
+		      crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
 }
 
 static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -122,6 +130,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int plen;
 	int tfclen;
 	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
 
 	/* skb is pure payload to encrypt */
 
@@ -151,14 +163,25 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto error;
 	nfrags = err;
 
-	tmp = esp_alloc_tmp(aead, nfrags + 1);
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
 	if (!tmp)
 		goto error;
 
-	iv = esp_tmp_iv(aead, tmp);
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
 	req = esp_tmp_givreq(aead, iv);
 	asg = esp_givreq_sg(aead, req);
-	sg = asg + 1;
+	sg = asg + sglists;
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
@@ -215,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	}
 
 	esph->spi = x->id.spi;
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
 		     clen + alen);
-	sg_init_one(asg, esph, sizeof(*esph));
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
 
 	aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
 	aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
-	aead_givcrypt_set_assoc(req, asg, sizeof(*esph));
+	aead_givcrypt_set_assoc(req, asg, assoclen);
 	aead_givcrypt_set_giv(req, esph->enc_data,
-			      XFRM_SKB_CB(skb)->seq.output);
+			      XFRM_SKB_CB(skb)->seq.output.low);
 
 	ESP_SKB_CB(skb)->tmp = tmp;
 	err = crypto_aead_givencrypt(req);
@@ -346,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct sk_buff *trailer;
 	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
 	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
 	void *tmp;
 	u8 *iv;
 	struct scatterlist *sg;
@@ -362,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out;
 	nfrags = err;
 
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
 	err = -ENOMEM;
-	tmp = esp_alloc_tmp(aead, nfrags + 1);
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
 	if (!tmp)
 		goto out;
 
 	ESP_SKB_CB(skb)->tmp = tmp;
-	iv = esp_tmp_iv(aead, tmp);
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
 	req = esp_tmp_req(aead, iv);
 	asg = esp_req_sg(aead, req);
-	sg = asg + 1;
+	sg = asg + sglists;
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -382,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
-	sg_init_one(asg, esph, sizeof(*esph));
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
 
 	aead_request_set_callback(req, 0, esp_input_done, skb);
 	aead_request_set_crypt(req, sg, sg, elen, iv);
-	aead_request_set_assoc(req, asg, sizeof(*esph));
+	aead_request_set_assoc(req, asg, assoclen);
 
 	err = crypto_aead_decrypt(req);
 	if (err == -EINPROGRESS)
@@ -500,10 +554,20 @@ static int esp_init_authenc(struct xfrm_state *x)
 		goto error;
 
 	err = -ENAMETOOLONG;
-	if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)",
-	    x->aalg ? x->aalg->alg_name : "digest_null",
-	    x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
-		goto error;
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authencesn(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	} else {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authenc(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	}
 
 	aead = crypto_alloc_aead(authenc_name, 0, 0);
 	err = PTR_ERR(aead);
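With extended sequence numbers (XFRM_STATE_ESN), the data authenticated alongside the payload is no longer the contiguous {SPI, seq_no} pair at the start of the ESP header: the high 32 bits of the 64-bit sequence counter are never transmitted, so they live only in the tmp buffer, and the three-entry scatterlist splices SPI, seq_hi, and seq_no together for the "authencesn" transform. A sketch that lays out the same 12 associated bytes contiguously, with illustrative values:

	/* What the three-entry scatterlist above presents to the
	 * authenticator under ESN: SPI, then the high 32 bits of the
	 * sequence number (tmp buffer only), then the low 32 bits from
	 * the packet.  Values are illustrative. */
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <arpa/inet.h>

	int main(void)
	{
		uint32_t spi    = htonl(0x12345678);
		uint64_t seq    = 0x0000000100000005ULL;	/* 64-bit ESN counter */
		uint32_t seq_hi = htonl((uint32_t)(seq >> 32));	/* not on the wire */
		uint32_t seq_lo = htonl((uint32_t)seq);		/* in the ESP header */
		uint8_t assoc[12];

		/* assoclen = sizeof(spi) + seqhilen + sizeof(seq_no) = 12 */
		memcpy(assoc,     &spi,    4);
		memcpy(assoc + 4, &seq_hi, 4);
		memcpy(assoc + 8, &seq_lo, 4);
		printf("assoclen=%zu\n", sizeof(assoc));
		return 0;
	}

Because seq_hi is authenticated but never sent, a receiver that has drifted on the high half of the counter computes a different ICV and drops the packet, which is exactly the anti-replay property ESN is after.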
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd43a878..a373a259253c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
 
-	local_table = fib_hash_table(RT_TABLE_LOCAL);
+	local_table = fib_trie_table(RT_TABLE_LOCAL);
 	if (local_table == NULL)
 		return -ENOMEM;
 
-	main_table  = fib_hash_table(RT_TABLE_MAIN);
+	main_table  = fib_trie_table(RT_TABLE_MAIN);
 	if (main_table == NULL)
 		goto fail;
 
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	if (tb)
 		return tb;
 
-	tb = fib_hash_table(id);
+	tb = fib_trie_table(id);
 	if (!tb)
 		return NULL;
 	h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
-void fib_select_default(struct net *net,
-			const struct flowi *flp, struct fib_result *res)
-{
-	struct fib_table *tb;
-	int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
-		return;
-	table = res->r->table;
-#endif
-	tb = fib_get_table(net, table);
-	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-		fib_table_select_default(tb, flp, res);
-}
-
 static void fib_flush(struct net *net)
 {
 	int flushed = 0;
@@ -147,46 +132,6 @@ static void fib_flush(struct net *net)
 	rt_cache_flush(net, -1);
 }
 
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
-	struct flowi fl = {
-		.fl4_dst = addr,
-	};
-	struct fib_result res = { 0 };
-	struct net_device *dev = NULL;
-	struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
-	rcu_read_lock();
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!local_table ||
-	    fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	if (res.type != RTN_LOCAL)
-		goto out;
-	dev = FIB_RES_DEV(res);
-
-	if (dev && devref)
-		dev_hold(dev);
-out:
-	rcu_read_unlock();
-	return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
 /*
  * Find address type as if only "dev" was present in the system. If
  * on_dev is NULL then all interfaces are taken into consideration.
@@ -195,7 +140,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 					    const struct net_device *dev,
 					    __be32 addr)
 {
-	struct flowi		fl = { .fl4_dst = addr };
+	struct flowi4		fl4 = { .daddr = addr };
 	struct fib_result	res;
 	unsigned ret = RTN_BROADCAST;
 	struct fib_table *local_table;
@@ -213,7 +158,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 	if (local_table) {
 		ret = RTN_UNICAST;
 		rcu_read_lock();
-		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
+		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
 		}
@@ -248,19 +193,21 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			u32 *itag, u32 mark)
 {
 	struct in_device *in_dev;
-	struct flowi fl = {
-		.fl4_dst = src,
-		.fl4_src = dst,
-		.fl4_tos = tos,
-		.mark = mark,
-		.iif = oif
-	};
+	struct flowi4 fl4;
 	struct fib_result res;
 	int no_addr, rpf, accept_local;
 	bool dev_match;
 	int ret;
 	struct net *net;
 
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = oif;
+	fl4.flowi4_mark = mark;
+	fl4.daddr = src;
+	fl4.saddr = dst;
+	fl4.flowi4_tos = tos;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
 	no_addr = rpf = accept_local = 0;
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
@@ -268,14 +215,14 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		rpf = IN_DEV_RPFILTER(in_dev);
 		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
 		if (mark && !IN_DEV_SRC_VMARK(in_dev))
-			fl.mark = 0;
+			fl4.flowi4_mark = 0;
 	}
 
 	if (in_dev == NULL)
 		goto e_inval;
 
 	net = dev_net(dev);
-	if (fib_lookup(net, &fl, &res))
+	if (fib_lookup(net, &fl4, &res))
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
 		if (res.type != RTN_LOCAL || !accept_local)
@@ -306,10 +253,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		goto last_resort;
 	if (rpf == 1)
 		goto e_rpf;
-	fl.oif = dev->ifindex;
+	fl4.flowi4_oif = dev->ifindex;
 
 	ret = 0;
-	if (fib_lookup(net, &fl, &res) == 0) {
+	if (fib_lookup(net, &fl4, &res) == 0) {
 		if (res.type == RTN_UNICAST) {
 			*spec_dst = FIB_RES_PREFSRC(res);
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
@@ -849,11 +796,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 {
 
 	struct fib_result       res;
-	struct flowi            fl  = {
-		.mark = frn->fl_mark,
-		.fl4_dst = frn->fl_addr,
-		.fl4_tos = frn->fl_tos,
-		.fl4_scope = frn->fl_scope,
+	struct flowi4           fl4 = {
+		.flowi4_mark = frn->fl_mark,
+		.daddr = frn->fl_addr,
+		.flowi4_tos = frn->fl_tos,
+		.flowi4_scope = frn->fl_scope,
 	};
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -866,7 +813,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 
 	frn->tb_id = tb->tb_id;
 	rcu_read_lock();
-	frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
+	frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 
 	if (!frn->err) {
 		frn->prefixlen = res.prefixlen;
@@ -945,10 +892,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		fib_sync_up(dev);
 #endif
+		fib_update_nh_saddrs(dev);
 		rt_cache_flush(dev_net(dev), -1);
 		break;
 	case NETDEV_DOWN:
 		fib_del_ifaddr(ifa);
+		fib_update_nh_saddrs(dev);
 		if (ifa->ifa_dev->ifa_list == NULL) {
 			/* Last address was deleted from this interface.
 			 * Disable IP.
@@ -1101,5 +1050,5 @@ void __init ip_fib_init(void)
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-	fib_hash_init();
+	fib_trie_init();
 }
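fib_validate_source() above fills its on-stack struct flowi4 member by member rather than with a designated initializer. The two forms are equivalent only if every member consulted later is covered: a designated initializer zeroes all unmentioned members, while a bare declaration leaves them as stack garbage. A sketch with a stand-in struct, not the kernel's definition (RT_SCOPE_UNIVERSE is 0 in the kernel):

	/* Why every flowi4 member is assigned explicitly: stand-in types. */
	#include <stdio.h>
	#include <stdint.h>

	struct flowi4_model {
		int	 flowi4_oif, flowi4_iif;
		uint32_t flowi4_mark;
		uint32_t daddr, saddr;
		uint8_t	 flowi4_tos, flowi4_scope;
	};

	int main(void)
	{
		/* designated initializer: unmentioned members become zero */
		struct flowi4_model a = { .daddr = 0x0a000001 };

		/* plain declaration: every member used later must be assigned */
		struct flowi4_model b;
		b.flowi4_oif = 0;
		b.flowi4_iif = 1;
		b.flowi4_mark = 0;
		b.daddr = 0x0a000001;
		b.saddr = 0;
		b.flowi4_tos = 0;
		b.flowi4_scope = 0;	/* RT_SCOPE_UNIVERSE in the kernel */

		printf("%u %u\n", a.flowi4_scope, b.flowi4_scope);
		return 0;
	}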
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b21..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/string.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/errno.h>
26#include <linux/in.h>
27#include <linux/inet.h>
28#include <linux/inetdevice.h>
29#include <linux/netdevice.h>
30#include <linux/if_arp.h>
31#include <linux/proc_fs.h>
32#include <linux/skbuff.h>
33#include <linux/netlink.h>
34#include <linux/init.h>
35#include <linux/slab.h>
36
37#include <net/net_namespace.h>
38#include <net/ip.h>
39#include <net/protocol.h>
40#include <net/route.h>
41#include <net/tcp.h>
42#include <net/sock.h>
43#include <net/ip_fib.h>
44
45#include "fib_lookup.h"
46
47static struct kmem_cache *fn_hash_kmem __read_mostly;
48static struct kmem_cache *fn_alias_kmem __read_mostly;
49
50struct fib_node {
51 struct hlist_node fn_hash;
52 struct list_head fn_alias;
53 __be32 fn_key;
54 struct fib_alias fn_embedded_alias;
55};
56
57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
58
59struct fn_zone {
60 struct fn_zone __rcu *fz_next; /* Next not empty zone */
61 struct hlist_head __rcu *fz_hash; /* Hash table pointer */
62 seqlock_t fz_lock;
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64
65 u8 fz_order; /* Zone order (0..32) */
66 u8 fz_revorder; /* 32 - fz_order */
67 __be32 fz_mask; /* inet_make_mask(order) */
68#define FZ_MASK(fz) ((fz)->fz_mask)
69
70 struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
71
72 int fz_nent; /* Number of entries */
73 int fz_divisor; /* Hash size (mask+1) */
74};
75
76struct fn_hash {
77 struct fn_zone *fn_zones[33];
78 struct fn_zone __rcu *fn_zone_list;
79};
80
81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
82{
83 u32 h = ntohl(key) >> fz->fz_revorder;
84 h ^= (h>>20);
85 h ^= (h>>10);
86 h ^= (h>>5);
87 h &= fz->fz_hashmask;
88 return h;
89}
90
91static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
92{
93 return dst & FZ_MASK(fz);
94}
95
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE)
105 return kzalloc(size, GFP_KERNEL);
106
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109}
110
111/* The fib hash lock must be held when this is called. */
112static inline void fn_rebuild_zone(struct fn_zone *fz,
113 struct hlist_head *old_ht,
114 int old_divisor)
115{
116 int i;
117
118 for (i = 0; i < old_divisor; i++) {
119 struct hlist_node *node, *n;
120 struct fib_node *f;
121
122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
123 struct hlist_head *new_head;
124
125 hlist_del_rcu(&f->fn_hash);
126
127 new_head = rcu_dereference_protected(fz->fz_hash, 1) +
128 fn_hash(f->fn_key, fz);
129 hlist_add_head_rcu(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 new_divisor = old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case EMBEDDED_HASH_SIZE:
154 new_divisor *= EMBEDDED_HASH_SIZE;
155 break;
156 case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
157 new_divisor *= (EMBEDDED_HASH_SIZE/2);
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
161 printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
172 fz->fz_order, old_divisor);
173#endif
174
175 ht = fz_hash_alloc(new_divisor);
176
177 if (ht) {
178 struct fn_zone nfz;
179
180 memcpy(&nfz, fz, sizeof(nfz));
181
182 write_seqlock_bh(&fz->fz_lock);
183 old_ht = rcu_dereference_protected(fz->fz_hash, 1);
184 RCU_INIT_POINTER(nfz.fz_hash, ht);
185 nfz.fz_hashmask = new_hashmask;
186 nfz.fz_divisor = new_divisor;
187 fn_rebuild_zone(&nfz, old_ht, old_divisor);
188 fib_hash_genid++;
189 rcu_assign_pointer(fz->fz_hash, ht);
190 fz->fz_hashmask = new_hashmask;
191 fz->fz_divisor = new_divisor;
192 write_sequnlock_bh(&fz->fz_lock);
193
194 if (old_ht != fz->fz_embedded_hash) {
195 synchronize_rcu();
196 fz_hash_free(old_ht, old_divisor);
197 }
198 }
199}
200
201static void fn_free_node_rcu(struct rcu_head *head)
202{
203 struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
204
205 kmem_cache_free(fn_hash_kmem, f);
206}
207
208static inline void fn_free_node(struct fib_node *f)
209{
210 call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
211}
212
213static void fn_free_alias_rcu(struct rcu_head *head)
214{
215 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
216
217 kmem_cache_free(fn_alias_kmem, fa);
218}
219
220static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
221{
222 fib_release_info(fa->fa_info);
223 if (fa == &f->fn_embedded_alias)
224 fa->fa_info = NULL;
225 else
226 call_rcu(&fa->rcu, fn_free_alias_rcu);
227}
228
229static struct fn_zone *
230fn_new_zone(struct fn_hash *table, int z)
231{
232 int i;
233 struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
234 if (!fz)
235 return NULL;
236
237 seqlock_init(&fz->fz_lock);
238 fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
239 fz->fz_hashmask = fz->fz_divisor - 1;
240 RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
241 fz->fz_order = z;
242 fz->fz_revorder = 32 - z;
243 fz->fz_mask = inet_make_mask(z);
244
245 /* Find the first not empty zone with more specific mask */
246 for (i = z + 1; i <= 32; i++)
247 if (table->fn_zones[i])
248 break;
249 if (i > 32) {
250 /* No more specific masks, we are the first. */
251 rcu_assign_pointer(fz->fz_next,
252 rtnl_dereference(table->fn_zone_list));
253 rcu_assign_pointer(table->fn_zone_list, fz);
254 } else {
255 rcu_assign_pointer(fz->fz_next,
256 rtnl_dereference(table->fn_zones[i]->fz_next));
257 rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
258 }
259 table->fn_zones[z] = fz;
260 fib_hash_genid++;
261 return fz;
262}
263
264int fib_table_lookup(struct fib_table *tb,
265 const struct flowi *flp, struct fib_result *res,
266 int fib_flags)
267{
268 int err;
269 struct fn_zone *fz;
270 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
271
272 rcu_read_lock();
273 for (fz = rcu_dereference(t->fn_zone_list);
274 fz != NULL;
275 fz = rcu_dereference(fz->fz_next)) {
276 struct hlist_head *head;
277 struct hlist_node *node;
278 struct fib_node *f;
279 __be32 k;
280 unsigned int seq;
281
282 do {
283 seq = read_seqbegin(&fz->fz_lock);
284 k = fz_key(flp->fl4_dst, fz);
285
286 head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
287 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
288 if (f->fn_key != k)
289 continue;
290
291 err = fib_semantic_match(&f->fn_alias,
292 flp, res,
293 fz->fz_order, fib_flags);
294 if (err <= 0)
295 goto out;
296 }
297 } while (read_seqretry(&fz->fz_lock, seq));
298 }
299 err = 1;
300out:
301 rcu_read_unlock();
302 return err;
303}
304
305void fib_table_select_default(struct fib_table *tb,
306 const struct flowi *flp, struct fib_result *res)
307{
308 int order, last_idx;
309 struct hlist_node *node;
310 struct fib_node *f;
311 struct fib_info *fi = NULL;
312 struct fib_info *last_resort;
313 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
314 struct fn_zone *fz = t->fn_zones[0];
315 struct hlist_head *head;
316
317 if (fz == NULL)
318 return;
319
320 last_idx = -1;
321 last_resort = NULL;
322 order = -1;
323
324 rcu_read_lock();
325 head = rcu_dereference(fz->fz_hash);
326 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
327 struct fib_alias *fa;
328
329 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
330 struct fib_info *next_fi = fa->fa_info;
331
332 if (fa->fa_scope != res->scope ||
333 fa->fa_type != RTN_UNICAST)
334 continue;
335
336 if (next_fi->fib_priority > res->fi->fib_priority)
337 break;
338 if (!next_fi->fib_nh[0].nh_gw ||
339 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
340 continue;
341
342 fib_alias_accessed(fa);
343
344 if (fi == NULL) {
345 if (next_fi != res->fi)
346 break;
347 } else if (!fib_detect_death(fi, order, &last_resort,
348 &last_idx, tb->tb_default)) {
349 fib_result_assign(res, fi);
350 tb->tb_default = order;
351 goto out;
352 }
353 fi = next_fi;
354 order++;
355 }
356 }
357
358 if (order <= 0 || fi == NULL) {
359 tb->tb_default = -1;
360 goto out;
361 }
362
363 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
364 tb->tb_default)) {
365 fib_result_assign(res, fi);
366 tb->tb_default = order;
367 goto out;
368 }
369
370 if (last_idx >= 0)
371 fib_result_assign(res, last_resort);
372 tb->tb_default = last_idx;
373out:
374 rcu_read_unlock();
375}
376
377/* Insert node F to FZ. */
378static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
379{
380 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
381
382 hlist_add_head_rcu(&f->fn_hash, head);
383}
384
385/* Return the node in FZ matching KEY. */
386static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
387{
388 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
389 struct hlist_node *node;
390 struct fib_node *f;
391
392 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
393 if (f->fn_key == key)
394 return f;
395 }
396
397 return NULL;
398}
399
400
401static struct fib_alias *fib_fast_alloc(struct fib_node *f)
402{
403 struct fib_alias *fa = &f->fn_embedded_alias;
404
405 if (fa->fa_info != NULL)
406 fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
407 return fa;
408}
409
410/* Caller must hold RTNL. */
411int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
412{
413 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
414 struct fib_node *new_f = NULL;
415 struct fib_node *f;
416 struct fib_alias *fa, *new_fa;
417 struct fn_zone *fz;
418 struct fib_info *fi;
419 u8 tos = cfg->fc_tos;
420 __be32 key;
421 int err;
422
423 if (cfg->fc_dst_len > 32)
424 return -EINVAL;
425
426 fz = table->fn_zones[cfg->fc_dst_len];
427 if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
428 return -ENOBUFS;
429
430 key = 0;
431 if (cfg->fc_dst) {
432 if (cfg->fc_dst & ~FZ_MASK(fz))
433 return -EINVAL;
434 key = fz_key(cfg->fc_dst, fz);
435 }
436
437 fi = fib_create_info(cfg);
438 if (IS_ERR(fi))
439 return PTR_ERR(fi);
440
441 if (fz->fz_nent > (fz->fz_divisor<<1) &&
442 fz->fz_divisor < FZ_MAX_DIVISOR &&
443 (cfg->fc_dst_len == 32 ||
444 (1 << cfg->fc_dst_len) > fz->fz_divisor))
445 fn_rehash_zone(fz);
446
447 f = fib_find_node(fz, key);
448
449 if (!f)
450 fa = NULL;
451 else
452 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
453
454 /* Now fa, if non-NULL, points to the first fib alias
455 * with the same keys [prefix,tos,priority], if such key already
456 * exists or to the node before which we will insert new one.
457 *
458 * If fa is NULL, we will need to allocate a new one and
459 * insert to the head of f.
460 *
461 * If f is NULL, no fib node matched the destination key
462 * and we need to allocate a new one of those as well.
463 */
464
465 if (fa && fa->fa_tos == tos &&
466 fa->fa_info->fib_priority == fi->fib_priority) {
467 struct fib_alias *fa_first, *fa_match;
468
469 err = -EEXIST;
470 if (cfg->fc_nlflags & NLM_F_EXCL)
471 goto out;
472
473 /* We have 2 goals:
474 * 1. Find exact match for type, scope, fib_info to avoid
475 * duplicate routes
476 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
477 */
478 fa_match = NULL;
479 fa_first = fa;
480 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
481 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
482 if (fa->fa_tos != tos)
483 break;
484 if (fa->fa_info->fib_priority != fi->fib_priority)
485 break;
486 if (fa->fa_type == cfg->fc_type &&
487 fa->fa_scope == cfg->fc_scope &&
488 fa->fa_info == fi) {
489 fa_match = fa;
490 break;
491 }
492 }
493
494 if (cfg->fc_nlflags & NLM_F_REPLACE) {
495 u8 state;
496
497 fa = fa_first;
498 if (fa_match) {
499 if (fa == fa_match)
500 err = 0;
501 goto out;
502 }
503 err = -ENOBUFS;
504 new_fa = fib_fast_alloc(f);
505 if (new_fa == NULL)
506 goto out;
507
508 new_fa->fa_tos = fa->fa_tos;
509 new_fa->fa_info = fi;
510 new_fa->fa_type = cfg->fc_type;
511 new_fa->fa_scope = cfg->fc_scope;
512 state = fa->fa_state;
513 new_fa->fa_state = state & ~FA_S_ACCESSED;
514 fib_hash_genid++;
515 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
516
517 fn_free_alias(fa, f);
518 if (state & FA_S_ACCESSED)
519 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
520 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
521 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
522 return 0;
523 }
524
525 /* Error if we find a perfect match which
526 * uses the same scope, type, and nexthop
527 * information.
528 */
529 if (fa_match)
530 goto out;
531
532 if (!(cfg->fc_nlflags & NLM_F_APPEND))
533 fa = fa_first;
534 }
535
536 err = -ENOENT;
537 if (!(cfg->fc_nlflags & NLM_F_CREATE))
538 goto out;
539
540 err = -ENOBUFS;
541
542 if (!f) {
543 new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
544 if (new_f == NULL)
545 goto out;
546
547 INIT_HLIST_NODE(&new_f->fn_hash);
548 INIT_LIST_HEAD(&new_f->fn_alias);
549 new_f->fn_key = key;
550 f = new_f;
551 }
552
553 new_fa = fib_fast_alloc(f);
554 if (new_fa == NULL)
555 goto out;
556
557 new_fa->fa_info = fi;
558 new_fa->fa_tos = tos;
559 new_fa->fa_type = cfg->fc_type;
560 new_fa->fa_scope = cfg->fc_scope;
561 new_fa->fa_state = 0;
562
563 /*
564 * Insert new entry to the list.
565 */
566
567 if (new_f)
568 fib_insert_node(fz, new_f);
569 list_add_tail_rcu(&new_fa->fa_list,
570 (fa ? &fa->fa_list : &f->fn_alias));
571 fib_hash_genid++;
572
573 if (new_f)
574 fz->fz_nent++;
575 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
576
577 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
578 &cfg->fc_nlinfo, 0);
579 return 0;
580
581out:
582 if (new_f)
583 kmem_cache_free(fn_hash_kmem, new_f);
584 fib_release_info(fi);
585 return err;
586}
587
588int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
589{
590 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
591 struct fib_node *f;
592 struct fib_alias *fa, *fa_to_delete;
593 struct fn_zone *fz;
594 __be32 key;
595
596 if (cfg->fc_dst_len > 32)
597 return -EINVAL;
598
599 if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
600 return -ESRCH;
601
602 key = 0;
603 if (cfg->fc_dst) {
604 if (cfg->fc_dst & ~FZ_MASK(fz))
605 return -EINVAL;
606 key = fz_key(cfg->fc_dst, fz);
607 }
608
609 f = fib_find_node(fz, key);
610
611 if (!f)
612 fa = NULL;
613 else
614 fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
615 if (!fa)
616 return -ESRCH;
617
618 fa_to_delete = NULL;
619 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
620 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
621 struct fib_info *fi = fa->fa_info;
622
623 if (fa->fa_tos != cfg->fc_tos)
624 break;
625
626 if ((!cfg->fc_type ||
627 fa->fa_type == cfg->fc_type) &&
628 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
629 fa->fa_scope == cfg->fc_scope) &&
630 (!cfg->fc_protocol ||
631 fi->fib_protocol == cfg->fc_protocol) &&
632 fib_nh_match(cfg, fi) == 0) {
633 fa_to_delete = fa;
634 break;
635 }
636 }
637
638 if (fa_to_delete) {
639 int kill_fn;
640
641 fa = fa_to_delete;
642 rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
643 tb->tb_id, &cfg->fc_nlinfo, 0);
644
645 kill_fn = 0;
646 list_del_rcu(&fa->fa_list);
647 if (list_empty(&f->fn_alias)) {
648 hlist_del_rcu(&f->fn_hash);
649 kill_fn = 1;
650 }
651 fib_hash_genid++;
652
653 if (fa->fa_state & FA_S_ACCESSED)
654 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
655 fn_free_alias(fa, f);
656 if (kill_fn) {
657 fn_free_node(f);
658 fz->fz_nent--;
659 }
660
661 return 0;
662 }
663 return -ESRCH;
664}
665
666static int fn_flush_list(struct fn_zone *fz, int idx)
667{
668 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
669 struct hlist_node *node, *n;
670 struct fib_node *f;
671 int found = 0;
672
673 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
674 struct fib_alias *fa, *fa_node;
675 int kill_f;
676
677 kill_f = 0;
678 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
679 struct fib_info *fi = fa->fa_info;
680
681 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
682 list_del_rcu(&fa->fa_list);
683 if (list_empty(&f->fn_alias)) {
684 hlist_del_rcu(&f->fn_hash);
685 kill_f = 1;
686 }
687 fib_hash_genid++;
688
689 fn_free_alias(fa, f);
690 found++;
691 }
692 }
693 if (kill_f) {
694 fn_free_node(f);
695 fz->fz_nent--;
696 }
697 }
698 return found;
699}
700
701/* caller must hold RTNL. */
702int fib_table_flush(struct fib_table *tb)
703{
704 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
705 struct fn_zone *fz;
706 int found = 0;
707
708 for (fz = rtnl_dereference(table->fn_zone_list);
709 fz != NULL;
710 fz = rtnl_dereference(fz->fz_next)) {
711 int i;
712
713 for (i = fz->fz_divisor - 1; i >= 0; i--)
714 found += fn_flush_list(fz, i);
715 }
716 return found;
717}
718
719void fib_free_table(struct fib_table *tb)
720{
721 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
722 struct fn_zone *fz, *next;
723
724 next = table->fn_zone_list;
725 while (next != NULL) {
726 fz = next;
727 next = fz->fz_next;
728
729 if (fz->fz_hash != fz->fz_embedded_hash)
730 fz_hash_free(fz->fz_hash, fz->fz_divisor);
731
732 kfree(fz);
733 }
734
735 kfree(tb);
736}
737
738static inline int
739fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
740 struct fib_table *tb,
741 struct fn_zone *fz,
742 struct hlist_head *head)
743{
744 struct hlist_node *node;
745 struct fib_node *f;
746 int i, s_i;
747
748 s_i = cb->args[4];
749 i = 0;
750 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
751 struct fib_alias *fa;
752
753 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
754 if (i < s_i)
755 goto next;
756
757 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
758 cb->nlh->nlmsg_seq,
759 RTM_NEWROUTE,
760 tb->tb_id,
761 fa->fa_type,
762 fa->fa_scope,
763 f->fn_key,
764 fz->fz_order,
765 fa->fa_tos,
766 fa->fa_info,
767 NLM_F_MULTI) < 0) {
768 cb->args[4] = i;
769 return -1;
770 }
771next:
772 i++;
773 }
774 }
775 cb->args[4] = i;
776 return skb->len;
777}
778
779static inline int
780fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
781 struct fib_table *tb,
782 struct fn_zone *fz)
783{
784 int h, s_h;
785 struct hlist_head *head = rcu_dereference(fz->fz_hash);
786
787 if (head == NULL)
788 return skb->len;
789 s_h = cb->args[3];
790 for (h = s_h; h < fz->fz_divisor; h++) {
791 if (hlist_empty(head + h))
792 continue;
793 if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
794 cb->args[3] = h;
795 return -1;
796 }
797 memset(&cb->args[4], 0,
798 sizeof(cb->args) - 4*sizeof(cb->args[0]));
799 }
800 cb->args[3] = h;
801 return skb->len;
802}
803
804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
805 struct netlink_callback *cb)
806{
807 int m = 0, s_m;
808 struct fn_zone *fz;
809 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
810
811 s_m = cb->args[2];
812 rcu_read_lock();
813 for (fz = rcu_dereference(table->fn_zone_list);
814 fz != NULL;
815 fz = rcu_dereference(fz->fz_next), m++) {
816 if (m < s_m)
817 continue;
818 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
819 cb->args[2] = m;
820 rcu_read_unlock();
821 return -1;
822 }
823 memset(&cb->args[3], 0,
824 sizeof(cb->args) - 3*sizeof(cb->args[0]));
825 }
826 rcu_read_unlock();
827 cb->args[2] = m;
828 return skb->len;
829}
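
The dump above resumes across netlink skb fills through cb->args: args[2] holds the zone index, args[3] the bucket within the zone, and args[4] the alias index within the bucket; whenever one level finishes an entry it zeroes the state of all deeper levels. A minimal sketch of that resume convention (dump_one_entry and NENTRIES are hypothetical names, not part of this code):

	static int dump_level(struct sk_buff *skb, struct netlink_callback *cb)
	{
		int i, s_i = cb->args[2];

		for (i = s_i; i < NENTRIES; i++) {
			if (dump_one_entry(skb, cb) < 0) {
				cb->args[2] = i;	/* resume here on the next call */
				return -1;		/* skb full */
			}
			/* entry complete: deeper levels restart from zero */
			memset(&cb->args[3], 0,
			       sizeof(cb->args) - 3 * sizeof(cb->args[0]));
		}
		cb->args[2] = i;
		return skb->len;
	}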
830
831void __init fib_hash_init(void)
832{
833 fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
834 0, SLAB_PANIC, NULL);
835
836 fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
837 0, SLAB_PANIC, NULL);
838
839}
840
841struct fib_table *fib_hash_table(u32 id)
842{
843 struct fib_table *tb;
844
845 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
846 GFP_KERNEL);
847 if (tb == NULL)
848 return NULL;
849
850 tb->tb_id = id;
851 tb->tb_default = -1;
852
853 memset(tb->tb_data, 0, sizeof(struct fn_hash));
854 return tb;
855}
856
857/* ------------------------------------------------------------------------ */
858#ifdef CONFIG_PROC_FS
859
860struct fib_iter_state {
861 struct seq_net_private p;
862 struct fn_zone *zone;
863 int bucket;
864 struct hlist_head *hash_head;
865 struct fib_node *fn;
866 struct fib_alias *fa;
867 loff_t pos;
868 unsigned int genid;
869 int valid;
870};
871
872static struct fib_alias *fib_get_first(struct seq_file *seq)
873{
874 struct fib_iter_state *iter = seq->private;
875 struct fib_table *main_table;
876 struct fn_hash *table;
877
878 main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
879 table = (struct fn_hash *)main_table->tb_data;
880
881 iter->bucket = 0;
882 iter->hash_head = NULL;
883 iter->fn = NULL;
884 iter->fa = NULL;
885 iter->pos = 0;
886 iter->genid = fib_hash_genid;
887 iter->valid = 1;
888
889 for (iter->zone = rcu_dereference(table->fn_zone_list);
890 iter->zone != NULL;
891 iter->zone = rcu_dereference(iter->zone->fz_next)) {
892 int maxslot;
893
894 if (!iter->zone->fz_nent)
895 continue;
896
897 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
898 maxslot = iter->zone->fz_divisor;
899
900 for (iter->bucket = 0; iter->bucket < maxslot;
901 ++iter->bucket, ++iter->hash_head) {
902 struct hlist_node *node;
903 struct fib_node *fn;
904
905 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
906 struct fib_alias *fa;
907
908 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
909 iter->fn = fn;
910 iter->fa = fa;
911 goto out;
912 }
913 }
914 }
915 }
916out:
917 return iter->fa;
918}
919
920static struct fib_alias *fib_get_next(struct seq_file *seq)
921{
922 struct fib_iter_state *iter = seq->private;
923 struct fib_node *fn;
924 struct fib_alias *fa;
925
926 /* Advance FA, if any. */
927 fn = iter->fn;
928 fa = iter->fa;
929 if (fa) {
930 BUG_ON(!fn);
931 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
932 iter->fa = fa;
933 goto out;
934 }
935 }
936
937 fa = iter->fa = NULL;
938
939 /* Advance FN. */
940 if (fn) {
941 struct hlist_node *node = &fn->fn_hash;
942 hlist_for_each_entry_continue(fn, node, fn_hash) {
943 iter->fn = fn;
944
945 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
946 iter->fa = fa;
947 goto out;
948 }
949 }
950 }
951
952 fn = iter->fn = NULL;
953
954 /* Advance hash chain. */
955 if (!iter->zone)
956 goto out;
957
958 for (;;) {
959 struct hlist_node *node;
960 int maxslot;
961
962 maxslot = iter->zone->fz_divisor;
963
964 while (++iter->bucket < maxslot) {
965 iter->hash_head++;
966
967 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
968 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
969 iter->fn = fn;
970 iter->fa = fa;
971 goto out;
972 }
973 }
974 }
975
976 iter->zone = rcu_dereference(iter->zone->fz_next);
977
978 if (!iter->zone)
979 goto out;
980
981 iter->bucket = 0;
982 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
983
984 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
985 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
986 iter->fn = fn;
987 iter->fa = fa;
988 goto out;
989 }
990 }
991 }
992out:
993 iter->pos++;
994 return fa;
995}
996
997static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
998{
999 struct fib_iter_state *iter = seq->private;
1000 struct fib_alias *fa;
1001
1002 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
1003 fa = iter->fa;
1004 pos -= iter->pos;
1005 } else
1006 fa = fib_get_first(seq);
1007
1008 if (fa)
1009 while (pos && (fa = fib_get_next(seq)))
1010 --pos;
1011 return pos ? NULL : fa;
1012}
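
fib_get_first() rescans the whole table from the start, so fib_get_idx() caches the last position and reuses it when the reader only moves forward. The validity rule, restated as a helper (a sketch; the field names are from the code above, the helper itself is not):

	static bool iter_cache_valid(const struct fib_iter_state *iter, loff_t pos)
	{
		return iter->valid &&			/* cache was ever filled */
		       pos >= iter->pos &&		/* can only walk forward */
		       iter->genid == fib_hash_genid;	/* no insert/delete since */
	}

Every mutation path above (the fn_hash delete path, fn_flush_list) bumps fib_hash_genid, so any table change invalidates the cached position.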
1013
1014static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
1015 __acquires(RCU)
1016{
1017 void *v = NULL;
1018
1019 rcu_read_lock();
1020 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
1021 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1022 return v;
1023}
1024
1025static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1026{
1027 ++*pos;
1028 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
1029}
1030
1031static void fib_seq_stop(struct seq_file *seq, void *v)
1032 __releases(RCU)
1033{
1034 rcu_read_unlock();
1035}
1036
1037static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
1038{
1039 static const unsigned type2flags[RTN_MAX + 1] = {
1040 [7] = RTF_REJECT,
1041 [8] = RTF_REJECT,
1042 };
1043 unsigned flags = type2flags[type];
1044
1045 if (fi && fi->fib_nh->nh_gw)
1046 flags |= RTF_GATEWAY;
1047 if (mask == htonl(0xFFFFFFFF))
1048 flags |= RTF_HOST;
1049 flags |= RTF_UP;
1050 return flags;
1051}
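
The bare [7] and [8] initializers in type2flags are RTN_UNREACHABLE and RTN_PROHIBIT from <linux/rtnetlink.h>; both route types surface as rejects in the legacy flag format. An equivalent, self-documenting form:

	static const unsigned type2flags[RTN_MAX + 1] = {
		[RTN_UNREACHABLE] = RTF_REJECT,	/* == 7 */
		[RTN_PROHIBIT]    = RTF_REJECT,	/* == 8 */
	};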
1052
1053/*
1054 * This outputs /proc/net/route.
1055 *
1056 * It always works in backward compatibility mode.
1057 * The format of the file is not supposed to be changed.
1058 */
1059static int fib_seq_show(struct seq_file *seq, void *v)
1060{
1061 struct fib_iter_state *iter;
1062 int len;
1063 __be32 prefix, mask;
1064 unsigned flags;
1065 struct fib_node *f;
1066 struct fib_alias *fa;
1067 struct fib_info *fi;
1068
1069 if (v == SEQ_START_TOKEN) {
1070 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1071 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1072 "\tWindow\tIRTT");
1073 goto out;
1074 }
1075
1076 iter = seq->private;
1077 f = iter->fn;
1078 fa = iter->fa;
1079 fi = fa->fa_info;
1080 prefix = f->fn_key;
1081 mask = FZ_MASK(iter->zone);
1082 flags = fib_flag_trans(fa->fa_type, mask, fi);
1083 if (fi)
1084 seq_printf(seq,
1085 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1086 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1087 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1088 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1089 fi->fib_window,
1090 fi->fib_rtt >> 3, &len);
1091 else
1092 seq_printf(seq,
1093 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1094 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
1095
1096 seq_printf(seq, "%*s\n", 127 - len, "");
1097out:
1098 return 0;
1099}
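
The addresses and mask are printed as %08X of __be32 values, i.e. hex in network byte order, so they read byte-reversed on a little-endian host. A hypothetical default route via 192.168.1.1 on eth0 would come out as:

	Iface	Destination	Gateway 	Flags	RefCnt	Use	Metric	Mask		MTU	Window	IRTT
	eth0	00000000	0101A8C0	0003	0	0	0	00000000	0	0	0

where Flags 0003 is RTF_UP | RTF_GATEWAY.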
1100
1101static const struct seq_operations fib_seq_ops = {
1102 .start = fib_seq_start,
1103 .next = fib_seq_next,
1104 .stop = fib_seq_stop,
1105 .show = fib_seq_show,
1106};
1107
1108static int fib_seq_open(struct inode *inode, struct file *file)
1109{
1110 return seq_open_net(inode, file, &fib_seq_ops,
1111 sizeof(struct fib_iter_state));
1112}
1113
1114static const struct file_operations fib_seq_fops = {
1115 .owner = THIS_MODULE,
1116 .open = fib_seq_open,
1117 .read = seq_read,
1118 .llseek = seq_lseek,
1119 .release = seq_release_net,
1120};
1121
1122int __net_init fib_proc_init(struct net *net)
1123{
1124 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
1125 return -ENOMEM;
1126 return 0;
1127}
1128
1129void __net_exit fib_proc_exit(struct net *net)
1130{
1131 proc_net_remove(net, "route");
1132}
1133#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec651..4ec323875a02 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -25,9 +25,6 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
25} 25}
26 26
27/* Exported by fib_semantics.c */ 27/* Exported by fib_semantics.c */
28extern int fib_semantic_match(struct list_head *head,
29 const struct flowi *flp,
30 struct fib_result *res, int prefixlen, int fib_flags);
31extern void fib_release_info(struct fib_info *); 28extern void fib_release_info(struct fib_info *);
32extern struct fib_info *fib_create_info(struct fib_config *cfg); 29extern struct fib_info *fib_create_info(struct fib_config *cfg);
33extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 30extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
@@ -51,4 +48,11 @@ static inline void fib_result_assign(struct fib_result *res,
51 res->fi = fi; 48 res->fi = fi;
52} 49}
53 50
51struct fib_prop {
52 int error;
53 u8 scope;
54};
55
56extern const struct fib_prop fib_props[RTN_MAX + 1];
57
54#endif /* _FIB_LOOKUP_H */ 58#endif /* _FIB_LOOKUP_H */
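Making fib_props visible here lets the trie lookup (see the check_leaf() rewrite in fib_trie.c below) map a route type straight to its semantic result instead of calling fib_semantic_match(). Illustrative use, with the error values as defined in the fib_semantics.c table; the removed fib_semantic_match() applied the same rule:

	/*
	 * fib_props[RTN_UNICAST].error     == 0
	 * fib_props[RTN_UNREACHABLE].error == -EHOSTUNREACH
	 * fib_props[RTN_PROHIBIT].error    == -EACCES
	 * fib_props[RTN_BLACKHOLE].error   == -EINVAL
	 */
	int err = fib_props[fa->fa_type].error;
	if (err)
		return err;	/* reject-type route: fail the lookup with its error */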
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7b..a53bb1b5b118 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,19 +41,19 @@ struct fib4_rule {
41 __be32 srcmask; 41 __be32 srcmask;
42 __be32 dst; 42 __be32 dst;
43 __be32 dstmask; 43 __be32 dstmask;
44#ifdef CONFIG_NET_CLS_ROUTE 44#ifdef CONFIG_IP_ROUTE_CLASSID
45 u32 tclassid; 45 u32 tclassid;
46#endif 46#endif
47}; 47};
48 48
49#ifdef CONFIG_NET_CLS_ROUTE 49#ifdef CONFIG_IP_ROUTE_CLASSID
50u32 fib_rules_tclass(struct fib_result *res) 50u32 fib_rules_tclass(const struct fib_result *res)
51{ 51{
52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; 52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
53} 53}
54#endif 54#endif
55 55
56int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) 56int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
57{ 57{
58 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
59 .result = res, 59 .result = res,
@@ -61,7 +61,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
61 }; 61 };
62 int err; 62 int err;
63 63
64 err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); 64 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
65 res->r = arg.rule; 65 res->r = arg.rule;
66 66
67 return err; 67 return err;
@@ -95,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
95 if (!tbl) 95 if (!tbl)
96 goto errout; 96 goto errout;
97 97
98 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); 98 err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
99 if (err > 0) 99 if (err > 0)
100 err = -EAGAIN; 100 err = -EAGAIN;
101errout: 101errout:
@@ -106,14 +106,15 @@ errout:
106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) 106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
107{ 107{
108 struct fib4_rule *r = (struct fib4_rule *) rule; 108 struct fib4_rule *r = (struct fib4_rule *) rule;
109 __be32 daddr = fl->fl4_dst; 109 struct flowi4 *fl4 = &fl->u.ip4;
110 __be32 saddr = fl->fl4_src; 110 __be32 daddr = fl4->daddr;
111 __be32 saddr = fl4->saddr;
111 112
112 if (((saddr ^ r->src) & r->srcmask) || 113 if (((saddr ^ r->src) & r->srcmask) ||
113 ((daddr ^ r->dst) & r->dstmask)) 114 ((daddr ^ r->dst) & r->dstmask))
114 return 0; 115 return 0;
115 116
116 if (r->tos && (r->tos != fl->fl4_tos)) 117 if (r->tos && (r->tos != fl4->flowi4_tos))
117 return 0; 118 return 0;
118 119
119 return 1; 120 return 1;
@@ -165,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
165 if (frh->dst_len) 166 if (frh->dst_len)
166 rule4->dst = nla_get_be32(tb[FRA_DST]); 167 rule4->dst = nla_get_be32(tb[FRA_DST]);
167 168
168#ifdef CONFIG_NET_CLS_ROUTE 169#ifdef CONFIG_IP_ROUTE_CLASSID
169 if (tb[FRA_FLOW]) 170 if (tb[FRA_FLOW])
170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 171 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171#endif 172#endif
@@ -195,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
195 if (frh->tos && (rule4->tos != frh->tos)) 196 if (frh->tos && (rule4->tos != frh->tos))
196 return 0; 197 return 0;
197 198
198#ifdef CONFIG_NET_CLS_ROUTE 199#ifdef CONFIG_IP_ROUTE_CLASSID
199 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) 200 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
200 return 0; 201 return 0;
201#endif 202#endif
@@ -224,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
224 if (rule4->src_len) 225 if (rule4->src_len)
225 NLA_PUT_BE32(skb, FRA_SRC, rule4->src); 226 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
226 227
227#ifdef CONFIG_NET_CLS_ROUTE 228#ifdef CONFIG_IP_ROUTE_CLASSID
228 if (rule4->tclassid) 229 if (rule4->tclassid)
229 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); 230 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
230#endif 231#endif
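This file shows the mechanical part of the flowi -> flowi4 conversion: IPv4 flow keys move into struct flowi4, reachable from a generic struct flowi as fl->u.ip4. The renames visible in these hunks, collected for reference, together with the initializer style used by the fib_check_nh() hunk below:

	/*
	 *	fl->fl4_dst	->	fl4->daddr
	 *	fl->fl4_src	->	fl4->saddr
	 *	fl->fl4_tos	->	fl4->flowi4_tos
	 *	fl->fl4_scope	->	fl4->flowi4_scope
	 *	fl->oif		->	fl4->flowi4_oif
	 */
	struct flowi4 fl4 = {
		.daddr		= nh->nh_gw,
		.flowi4_scope	= cfg->fc_scope + 1,
		.flowi4_oif	= nh->nh_oif,
	};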
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b7..622ac4c95026 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
49static DEFINE_SPINLOCK(fib_info_lock); 49static DEFINE_SPINLOCK(fib_info_lock);
50static struct hlist_head *fib_info_hash; 50static struct hlist_head *fib_info_hash;
51static struct hlist_head *fib_info_laddrhash; 51static struct hlist_head *fib_info_laddrhash;
52static unsigned int fib_hash_size; 52static unsigned int fib_info_hash_size;
53static unsigned int fib_info_cnt; 53static unsigned int fib_info_cnt;
54 54
55#define DEVINDEX_HASHBITS 8 55#define DEVINDEX_HASHBITS 8
@@ -90,11 +90,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
90#define endfor_nexthops(fi) } 90#define endfor_nexthops(fi) }
91 91
92 92
93static const struct 93const struct fib_prop fib_props[RTN_MAX + 1] = {
94{
95 int error;
96 u8 scope;
97} fib_props[RTN_MAX + 1] = {
98 [RTN_UNSPEC] = { 94 [RTN_UNSPEC] = {
99 .error = 0, 95 .error = 0,
100 .scope = RT_SCOPE_NOWHERE, 96 .scope = RT_SCOPE_NOWHERE,
@@ -152,6 +148,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
152{ 148{
153 struct fib_info *fi = container_of(head, struct fib_info, rcu); 149 struct fib_info *fi = container_of(head, struct fib_info, rcu);
154 150
151 if (fi->fib_metrics != (u32 *) dst_default_metrics)
152 kfree(fi->fib_metrics);
155 kfree(fi); 153 kfree(fi);
156} 154}
157 155
@@ -200,7 +198,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
200#ifdef CONFIG_IP_ROUTE_MULTIPATH 198#ifdef CONFIG_IP_ROUTE_MULTIPATH
201 nh->nh_weight != onh->nh_weight || 199 nh->nh_weight != onh->nh_weight ||
202#endif 200#endif
203#ifdef CONFIG_NET_CLS_ROUTE 201#ifdef CONFIG_IP_ROUTE_CLASSID
204 nh->nh_tclassid != onh->nh_tclassid || 202 nh->nh_tclassid != onh->nh_tclassid ||
205#endif 203#endif
206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) 204 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,7 +219,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
221 219
222static inline unsigned int fib_info_hashfn(const struct fib_info *fi) 220static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
223{ 221{
224 unsigned int mask = (fib_hash_size - 1); 222 unsigned int mask = (fib_info_hash_size - 1);
225 unsigned int val = fi->fib_nhs; 223 unsigned int val = fi->fib_nhs;
226 224
227 val ^= fi->fib_protocol; 225 val ^= fi->fib_protocol;
@@ -422,7 +420,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
422 420
423 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 421 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
424 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; 422 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
425#ifdef CONFIG_NET_CLS_ROUTE 423#ifdef CONFIG_IP_ROUTE_CLASSID
426 nla = nla_find(attrs, attrlen, RTA_FLOW); 424 nla = nla_find(attrs, attrlen, RTA_FLOW);
427 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 425 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
428#endif 426#endif
@@ -476,7 +474,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 474 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
477 if (nla && nla_get_be32(nla) != nh->nh_gw) 475 if (nla && nla_get_be32(nla) != nh->nh_gw)
478 return 1; 476 return 1;
479#ifdef CONFIG_NET_CLS_ROUTE 477#ifdef CONFIG_IP_ROUTE_CLASSID
480 nla = nla_find(attrs, attrlen, RTA_FLOW); 478 nla = nla_find(attrs, attrlen, RTA_FLOW);
481 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 479 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
482 return 1; 480 return 1;
@@ -562,16 +560,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
562 } 560 }
563 rcu_read_lock(); 561 rcu_read_lock();
564 { 562 {
565 struct flowi fl = { 563 struct flowi4 fl4 = {
566 .fl4_dst = nh->nh_gw, 564 .daddr = nh->nh_gw,
567 .fl4_scope = cfg->fc_scope + 1, 565 .flowi4_scope = cfg->fc_scope + 1,
568 .oif = nh->nh_oif, 566 .flowi4_oif = nh->nh_oif,
569 }; 567 };
570 568
571 /* It is not necessary, but requires a bit of thinking */ 569 /* It is not necessary, but requires a bit of thinking */
572 if (fl.fl4_scope < RT_SCOPE_LINK) 570 if (fl4.flowi4_scope < RT_SCOPE_LINK)
573 fl.fl4_scope = RT_SCOPE_LINK; 571 fl4.flowi4_scope = RT_SCOPE_LINK;
574 err = fib_lookup(net, &fl, &res); 572 err = fib_lookup(net, &fl4, &res);
575 if (err) { 573 if (err) {
576 rcu_read_unlock(); 574 rcu_read_unlock();
577 return err; 575 return err;
@@ -613,14 +611,14 @@ out:
613 611
614static inline unsigned int fib_laddr_hashfn(__be32 val) 612static inline unsigned int fib_laddr_hashfn(__be32 val)
615{ 613{
616 unsigned int mask = (fib_hash_size - 1); 614 unsigned int mask = (fib_info_hash_size - 1);
617 615
618 return ((__force u32)val ^ 616 return ((__force u32)val ^
619 ((__force u32)val >> 7) ^ 617 ((__force u32)val >> 7) ^
620 ((__force u32)val >> 14)) & mask; 618 ((__force u32)val >> 14)) & mask;
621} 619}
622 620
623static struct hlist_head *fib_hash_alloc(int bytes) 621static struct hlist_head *fib_info_hash_alloc(int bytes)
624{ 622{
625 if (bytes <= PAGE_SIZE) 623 if (bytes <= PAGE_SIZE)
626 return kzalloc(bytes, GFP_KERNEL); 624 return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +628,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
630 get_order(bytes)); 628 get_order(bytes));
631} 629}
632 630
633static void fib_hash_free(struct hlist_head *hash, int bytes) 631static void fib_info_hash_free(struct hlist_head *hash, int bytes)
634{ 632{
635 if (!hash) 633 if (!hash)
636 return; 634 return;
@@ -641,18 +639,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
641 free_pages((unsigned long) hash, get_order(bytes)); 639 free_pages((unsigned long) hash, get_order(bytes));
642} 640}
643 641
644static void fib_hash_move(struct hlist_head *new_info_hash, 642static void fib_info_hash_move(struct hlist_head *new_info_hash,
645 struct hlist_head *new_laddrhash, 643 struct hlist_head *new_laddrhash,
646 unsigned int new_size) 644 unsigned int new_size)
647{ 645{
648 struct hlist_head *old_info_hash, *old_laddrhash; 646 struct hlist_head *old_info_hash, *old_laddrhash;
649 unsigned int old_size = fib_hash_size; 647 unsigned int old_size = fib_info_hash_size;
650 unsigned int i, bytes; 648 unsigned int i, bytes;
651 649
652 spin_lock_bh(&fib_info_lock); 650 spin_lock_bh(&fib_info_lock);
653 old_info_hash = fib_info_hash; 651 old_info_hash = fib_info_hash;
654 old_laddrhash = fib_info_laddrhash; 652 old_laddrhash = fib_info_laddrhash;
655 fib_hash_size = new_size; 653 fib_info_hash_size = new_size;
656 654
657 for (i = 0; i < old_size; i++) { 655 for (i = 0; i < old_size; i++) {
658 struct hlist_head *head = &fib_info_hash[i]; 656 struct hlist_head *head = &fib_info_hash[i];
@@ -693,8 +691,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
693 spin_unlock_bh(&fib_info_lock); 691 spin_unlock_bh(&fib_info_lock);
694 692
695 bytes = old_size * sizeof(struct hlist_head *); 693 bytes = old_size * sizeof(struct hlist_head *);
696 fib_hash_free(old_info_hash, bytes); 694 fib_info_hash_free(old_info_hash, bytes);
697 fib_hash_free(old_laddrhash, bytes); 695 fib_info_hash_free(old_laddrhash, bytes);
698} 696}
699 697
700struct fib_info *fib_create_info(struct fib_config *cfg) 698struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -705,6 +703,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
705 int nhs = 1; 703 int nhs = 1;
706 struct net *net = cfg->fc_nlinfo.nl_net; 704 struct net *net = cfg->fc_nlinfo.nl_net;
707 705
706 if (cfg->fc_type > RTN_MAX)
707 goto err_inval;
708
708 /* Fast check to catch the most weird cases */ 709 /* Fast check to catch the most weird cases */
709 if (fib_props[cfg->fc_type].scope > cfg->fc_scope) 710 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
710 goto err_inval; 711 goto err_inval;
@@ -718,8 +719,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
718#endif 719#endif
719 720
720 err = -ENOBUFS; 721 err = -ENOBUFS;
721 if (fib_info_cnt >= fib_hash_size) { 722 if (fib_info_cnt >= fib_info_hash_size) {
722 unsigned int new_size = fib_hash_size << 1; 723 unsigned int new_size = fib_info_hash_size << 1;
723 struct hlist_head *new_info_hash; 724 struct hlist_head *new_info_hash;
724 struct hlist_head *new_laddrhash; 725 struct hlist_head *new_laddrhash;
725 unsigned int bytes; 726 unsigned int bytes;
@@ -727,21 +728,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
727 if (!new_size) 728 if (!new_size)
728 new_size = 1; 729 new_size = 1;
729 bytes = new_size * sizeof(struct hlist_head *); 730 bytes = new_size * sizeof(struct hlist_head *);
730 new_info_hash = fib_hash_alloc(bytes); 731 new_info_hash = fib_info_hash_alloc(bytes);
731 new_laddrhash = fib_hash_alloc(bytes); 732 new_laddrhash = fib_info_hash_alloc(bytes);
732 if (!new_info_hash || !new_laddrhash) { 733 if (!new_info_hash || !new_laddrhash) {
733 fib_hash_free(new_info_hash, bytes); 734 fib_info_hash_free(new_info_hash, bytes);
734 fib_hash_free(new_laddrhash, bytes); 735 fib_info_hash_free(new_laddrhash, bytes);
735 } else 736 } else
736 fib_hash_move(new_info_hash, new_laddrhash, new_size); 737 fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
737 738
738 if (!fib_hash_size) 739 if (!fib_info_hash_size)
739 goto failure; 740 goto failure;
740 } 741 }
741 742
742 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); 743 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
743 if (fi == NULL) 744 if (fi == NULL)
744 goto failure; 745 goto failure;
746 if (cfg->fc_mx) {
747 fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
748 if (!fi->fib_metrics)
749 goto failure;
750 } else
751 fi->fib_metrics = (u32 *) dst_default_metrics;
745 fib_info_cnt++; 752 fib_info_cnt++;
746 753
747 fi->fib_net = hold_net(net); 754 fi->fib_net = hold_net(net);
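
Routes configured without RTA_METRICS (cfg->fc_mx unset) no longer carry a private, all-zero u32[RTAX_MAX] array; they share the read-only dst_default_metrics instead, and only routes with explicit metrics allocate. The free path earlier in this file mirrors the rule:

	/* only privately allocated metric arrays may be freed */
	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);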
@@ -779,7 +786,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
779 goto err_inval; 786 goto err_inval;
780 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) 787 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
781 goto err_inval; 788 goto err_inval;
782#ifdef CONFIG_NET_CLS_ROUTE 789#ifdef CONFIG_IP_ROUTE_CLASSID
783 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) 790 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
784 goto err_inval; 791 goto err_inval;
785#endif 792#endif
@@ -792,7 +799,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
792 nh->nh_oif = cfg->fc_oif; 799 nh->nh_oif = cfg->fc_oif;
793 nh->nh_gw = cfg->fc_gw; 800 nh->nh_gw = cfg->fc_gw;
794 nh->nh_flags = cfg->fc_flags; 801 nh->nh_flags = cfg->fc_flags;
795#ifdef CONFIG_NET_CLS_ROUTE 802#ifdef CONFIG_IP_ROUTE_CLASSID
796 nh->nh_tclassid = cfg->fc_flow; 803 nh->nh_tclassid = cfg->fc_flow;
797#endif 804#endif
798#ifdef CONFIG_IP_ROUTE_MULTIPATH 805#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -804,6 +811,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
804 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) 811 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
805 goto err_inval; 812 goto err_inval;
806 goto link_it; 813 goto link_it;
814 } else {
815 switch (cfg->fc_type) {
816 case RTN_UNICAST:
817 case RTN_LOCAL:
818 case RTN_BROADCAST:
819 case RTN_ANYCAST:
820 case RTN_MULTICAST:
821 break;
822 default:
823 goto err_inval;
824 }
807 } 825 }
808 826
809 if (cfg->fc_scope > RT_SCOPE_HOST) 827 if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -835,6 +853,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
835 goto err_inval; 853 goto err_inval;
836 } 854 }
837 855
856 change_nexthops(fi) {
857 nexthop_nh->nh_cfg_scope = cfg->fc_scope;
858 nexthop_nh->nh_saddr = inet_select_addr(nexthop_nh->nh_dev,
859 nexthop_nh->nh_gw,
860 nexthop_nh->nh_cfg_scope);
861 } endfor_nexthops(fi)
862
838link_it: 863link_it:
839 ofi = fib_find_info(fi); 864 ofi = fib_find_info(fi);
840 if (ofi) { 865 if (ofi) {
@@ -880,84 +905,6 @@ failure:
880 return ERR_PTR(err); 905 return ERR_PTR(err);
881} 906}
882 907
883/* Note! fib_semantic_match intentionally uses RCU list functions. */
884int fib_semantic_match(struct list_head *head, const struct flowi *flp,
885 struct fib_result *res, int prefixlen, int fib_flags)
886{
887 struct fib_alias *fa;
888 int nh_sel = 0;
889
890 list_for_each_entry_rcu(fa, head, fa_list) {
891 int err;
892
893 if (fa->fa_tos &&
894 fa->fa_tos != flp->fl4_tos)
895 continue;
896
897 if (fa->fa_scope < flp->fl4_scope)
898 continue;
899
900 fib_alias_accessed(fa);
901
902 err = fib_props[fa->fa_type].error;
903 if (err == 0) {
904 struct fib_info *fi = fa->fa_info;
905
906 if (fi->fib_flags & RTNH_F_DEAD)
907 continue;
908
909 switch (fa->fa_type) {
910 case RTN_UNICAST:
911 case RTN_LOCAL:
912 case RTN_BROADCAST:
913 case RTN_ANYCAST:
914 case RTN_MULTICAST:
915 for_nexthops(fi) {
916 if (nh->nh_flags & RTNH_F_DEAD)
917 continue;
918 if (!flp->oif || flp->oif == nh->nh_oif)
919 break;
920 }
921#ifdef CONFIG_IP_ROUTE_MULTIPATH
922 if (nhsel < fi->fib_nhs) {
923 nh_sel = nhsel;
924 goto out_fill_res;
925 }
926#else
927 if (nhsel < 1)
928 goto out_fill_res;
929#endif
930 endfor_nexthops(fi);
931 continue;
932
933 default:
934 pr_warning("fib_semantic_match bad type %#x\n",
935 fa->fa_type);
936 return -EINVAL;
937 }
938 }
939 return err;
940 }
941 return 1;
942
943out_fill_res:
944 res->prefixlen = prefixlen;
945 res->nh_sel = nh_sel;
946 res->type = fa->fa_type;
947 res->scope = fa->fa_scope;
948 res->fi = fa->fa_info;
949 if (!(fib_flags & FIB_LOOKUP_NOREF))
950 atomic_inc(&res->fi->fib_clntref);
951 return 0;
952}
953
954/* Find appropriate source address to this destination */
955
956__be32 __fib_res_prefsrc(struct fib_result *res)
957{
958 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
959}
960
961int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 908int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
962 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, 909 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
963 struct fib_info *fi, unsigned int flags) 910 struct fib_info *fi, unsigned int flags)
@@ -1002,7 +949,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1002 949
1003 if (fi->fib_nh->nh_oif) 950 if (fi->fib_nh->nh_oif)
1004 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 951 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
1005#ifdef CONFIG_NET_CLS_ROUTE 952#ifdef CONFIG_IP_ROUTE_CLASSID
1006 if (fi->fib_nh[0].nh_tclassid) 953 if (fi->fib_nh[0].nh_tclassid)
1007 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 954 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1008#endif 955#endif
@@ -1027,7 +974,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1027 974
1028 if (nh->nh_gw) 975 if (nh->nh_gw)
1029 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 976 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1030#ifdef CONFIG_NET_CLS_ROUTE 977#ifdef CONFIG_IP_ROUTE_CLASSID
1031 if (nh->nh_tclassid) 978 if (nh->nh_tclassid)
1032 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 979 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1033#endif 980#endif
@@ -1125,6 +1072,80 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1125 return ret; 1072 return ret;
1126} 1073}
1127 1074
1075/* Must be invoked inside of an RCU protected region. */
1076void fib_select_default(struct fib_result *res)
1077{
1078 struct fib_info *fi = NULL, *last_resort = NULL;
1079 struct list_head *fa_head = res->fa_head;
1080 struct fib_table *tb = res->table;
1081 int order = -1, last_idx = -1;
1082 struct fib_alias *fa;
1083
1084 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1085 struct fib_info *next_fi = fa->fa_info;
1086
1087 if (fa->fa_scope != res->scope ||
1088 fa->fa_type != RTN_UNICAST)
1089 continue;
1090
1091 if (next_fi->fib_priority > res->fi->fib_priority)
1092 break;
1093 if (!next_fi->fib_nh[0].nh_gw ||
1094 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1095 continue;
1096
1097 fib_alias_accessed(fa);
1098
1099 if (fi == NULL) {
1100 if (next_fi != res->fi)
1101 break;
1102 } else if (!fib_detect_death(fi, order, &last_resort,
1103 &last_idx, tb->tb_default)) {
1104 fib_result_assign(res, fi);
1105 tb->tb_default = order;
1106 goto out;
1107 }
1108 fi = next_fi;
1109 order++;
1110 }
1111
1112 if (order <= 0 || fi == NULL) {
1113 tb->tb_default = -1;
1114 goto out;
1115 }
1116
1117 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1118 tb->tb_default)) {
1119 fib_result_assign(res, fi);
1120 tb->tb_default = order;
1121 goto out;
1122 }
1123
1124 if (last_idx >= 0)
1125 fib_result_assign(res, last_resort);
1126 tb->tb_default = last_idx;
1127out:
1128 return;
1129}
1130
1131void fib_update_nh_saddrs(struct net_device *dev)
1132{
1133 struct hlist_head *head;
1134 struct hlist_node *node;
1135 struct fib_nh *nh;
1136 unsigned int hash;
1137
1138 hash = fib_devindex_hashfn(dev->ifindex);
1139 head = &fib_info_devhash[hash];
1140 hlist_for_each_entry(nh, node, head, nh_hash) {
1141 if (nh->nh_dev != dev)
1142 continue;
1143 nh->nh_saddr = inet_select_addr(nh->nh_dev,
1144 nh->nh_gw,
1145 nh->nh_cfg_scope);
1146 }
1147}
1148
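fib_update_nh_saddrs() recomputes the nh_saddr cached at route-creation time (see the change_nexthops() hunk above) for every nexthop on one device, using the ifindex hash to find them quickly. A hypothetical caller sketch; the actual call site is not part of this hunk:

	static int fib_inetaddr_event(struct notifier_block *nb,
				      unsigned long event, void *ptr)
	{
		struct in_ifaddr *ifa = ptr;

		/* a device address changed: refresh cached source addresses */
		fib_update_nh_saddrs(ifa->ifa_dev->dev);
		return NOTIFY_DONE;
	}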
1128#ifdef CONFIG_IP_ROUTE_MULTIPATH 1149#ifdef CONFIG_IP_ROUTE_MULTIPATH
1129 1150
1130/* 1151/*
@@ -1189,7 +1210,7 @@ int fib_sync_up(struct net_device *dev)
1189 * The algorithm is suboptimal, but it provides really 1210 * The algorithm is suboptimal, but it provides really
1190 * fair weighted route distribution. 1211 * fair weighted route distribution.
1191 */ 1212 */
1192void fib_select_multipath(const struct flowi *flp, struct fib_result *res) 1213void fib_select_multipath(struct fib_result *res)
1193{ 1214{
1194 struct fib_info *fi = res->fi; 1215 struct fib_info *fi = res->fi;
1195 int w; 1216 int w;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f280348e0fd..3d28a35c2e1a 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF) 96#define IS_LEAF(n) (n->parent & T_LEAF)
97 97
98struct node { 98struct rt_trie_node {
99 unsigned long parent; 99 unsigned long parent;
100 t_key key; 100 t_key key;
101}; 101};
@@ -126,7 +126,7 @@ struct tnode {
126 struct work_struct work; 126 struct work_struct work;
127 struct tnode *tnode_free; 127 struct tnode *tnode_free;
128 }; 128 };
129 struct node *child[0]; 129 struct rt_trie_node *child[0];
130}; 130};
131 131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 132#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,16 @@ struct trie_stat {
151}; 151};
152 152
153struct trie { 153struct trie {
154 struct node *trie; 154 struct rt_trie_node *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 155#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 156 struct trie_use_stats stats;
157#endif 157#endif
158}; 158};
159 159
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 160static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 161static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 int wasfull); 162 int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn); 163static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn); 164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn); 165static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */ 166/* tnodes to free after resize(); protected by RTNL */
@@ -177,12 +177,12 @@ static const int sync_pages = 128;
177static struct kmem_cache *fn_alias_kmem __read_mostly; 177static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly; 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 179
180static inline struct tnode *node_parent(struct node *node) 180static inline struct tnode *node_parent(struct rt_trie_node *node)
181{ 181{
182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); 182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183} 183}
184 184
185static inline struct tnode *node_parent_rcu(struct node *node) 185static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
@@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node)
192/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
193 * but that macro() assumes that value is a pointer. 193 * but that macro() assumes that value is a pointer.
194 */ 194 */
195static inline void node_set_parent(struct node *node, struct tnode *ptr) 195static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
196{ 196{
197 smp_wmb(); 197 smp_wmb();
198 node->parent = (unsigned long)ptr | NODE_TYPE(node); 198 node->parent = (unsigned long)ptr | NODE_TYPE(node);
199} 199}
200 200
201static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) 201static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
202{ 202{
203 BUG_ON(i >= 1U << tn->bits); 203 BUG_ON(i >= 1U << tn->bits);
204 204
205 return tn->child[i]; 205 return tn->child[i];
206} 206}
207 207
208static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) 208static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209{ 209{
210 struct node *ret = tnode_get_child(tn, i); 210 struct rt_trie_node *ret = tnode_get_child(tn, i);
211 211
212 return rcu_dereference_rtnl(ret); 212 return rcu_dereference_rtnl(ret);
213} 213}
@@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn)
217 return 1 << tn->bits; 217 return 1 << tn->bits;
218} 218}
219 219
220static inline t_key mask_pfx(t_key k, unsigned short l) 220static inline t_key mask_pfx(t_key k, unsigned int l)
221{ 221{
222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
223} 223}
224 224
225static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 225static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
226{ 226{
227 if (offset < KEYLENGTH) 227 if (offset < KEYLENGTH)
228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
378{ 378{
379 struct tnode *tn = container_of(head, struct tnode, rcu); 379 struct tnode *tn = container_of(head, struct tnode, rcu);
380 size_t size = sizeof(struct tnode) + 380 size_t size = sizeof(struct tnode) +
381 (sizeof(struct node *) << tn->bits); 381 (sizeof(struct rt_trie_node *) << tn->bits);
382 382
383 if (size <= PAGE_SIZE) 383 if (size <= PAGE_SIZE)
384 kfree(tn); 384 kfree(tn);
@@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn)
402 tn->tnode_free = tnode_free_head; 402 tn->tnode_free = tnode_free_head;
403 tnode_free_head = tn; 403 tnode_free_head = tn;
404 tnode_free_size += sizeof(struct tnode) + 404 tnode_free_size += sizeof(struct tnode) +
405 (sizeof(struct node *) << tn->bits); 405 (sizeof(struct rt_trie_node *) << tn->bits);
406} 406}
407 407
408static void tnode_free_flush(void) 408static void tnode_free_flush(void)
@@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen)
443 443
444static struct tnode *tnode_new(t_key key, int pos, int bits) 444static struct tnode *tnode_new(t_key key, int pos, int bits)
445{ 445{
446 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); 446 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
447 struct tnode *tn = tnode_alloc(sz); 447 struct tnode *tn = tnode_alloc(sz);
448 448
449 if (tn) { 449 if (tn) {
@@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
456 } 456 }
457 457
458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
459 sizeof(struct node) << bits); 459 sizeof(struct rt_trie_node) << bits);
460 return tn; 460 return tn;
461} 461}
462 462
@@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
465 * and no bits are skipped. See discussion in dyntree paper p. 6 465 * and no bits are skipped. See discussion in dyntree paper p. 6
466 */ 466 */
467 467
468static inline int tnode_full(const struct tnode *tn, const struct node *n) 468static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
469{ 469{
470 if (n == NULL || IS_LEAF(n)) 470 if (n == NULL || IS_LEAF(n))
471 return 0; 471 return 0;
@@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
474} 474}
475 475
476static inline void put_child(struct trie *t, struct tnode *tn, int i, 476static inline void put_child(struct trie *t, struct tnode *tn, int i,
477 struct node *n) 477 struct rt_trie_node *n)
478{ 478{
479 tnode_put_child_reorg(tn, i, n, -1); 479 tnode_put_child_reorg(tn, i, n, -1);
480} 480}
@@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
484 * Update the value of full_children and empty_children. 484 * Update the value of full_children and empty_children.
485 */ 485 */
486 486
487static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 487static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
488 int wasfull) 488 int wasfull)
489{ 489{
490 struct node *chi = tn->child[i]; 490 struct rt_trie_node *chi = tn->child[i];
491 int isfull; 491 int isfull;
492 492
493 BUG_ON(i >= 1<<tn->bits); 493 BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
515} 515}
516 516
517#define MAX_WORK 10 517#define MAX_WORK 10
518static struct node *resize(struct trie *t, struct tnode *tn) 518static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
519{ 519{
520 int i; 520 int i;
521 struct tnode *old_tn; 521 struct tnode *old_tn;
@@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
605 605
606 /* Keep root node larger */ 606 /* Keep root node larger */
607 607
608 if (!node_parent((struct node *)tn)) { 608 if (!node_parent((struct rt_trie_node *)tn)) {
609 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
610 halve_threshold_use = halve_threshold_root; 610 halve_threshold_use = halve_threshold_root;
611 } else { 611 } else {
@@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
635 635
636 /* Return if at least one inflate is run */ 636 /* Return if at least one inflate is run */
637 if (max_work != MAX_WORK) 637 if (max_work != MAX_WORK)
638 return (struct node *) tn; 638 return (struct rt_trie_node *) tn;
639 639
640 /* 640 /*
641 * Halve as long as the number of empty children in this 641 * Halve as long as the number of empty children in this
@@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
663 if (tn->empty_children == tnode_child_length(tn) - 1) { 663 if (tn->empty_children == tnode_child_length(tn) - 1) {
664one_child: 664one_child:
665 for (i = 0; i < tnode_child_length(tn); i++) { 665 for (i = 0; i < tnode_child_length(tn); i++) {
666 struct node *n; 666 struct rt_trie_node *n;
667 667
668 n = tn->child[i]; 668 n = tn->child[i];
669 if (!n) 669 if (!n)
@@ -676,7 +676,7 @@ one_child:
676 return n; 676 return n;
677 } 677 }
678 } 678 }
679 return (struct node *) tn; 679 return (struct rt_trie_node *) tn;
680} 680}
681 681
682static struct tnode *inflate(struct trie *t, struct tnode *tn) 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
723 goto nomem; 723 goto nomem;
724 } 724 }
725 725
726 put_child(t, tn, 2*i, (struct node *) left); 726 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
727 put_child(t, tn, 2*i+1, (struct node *) right); 727 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
728 } 728 }
729 } 729 }
730 730
731 for (i = 0; i < olen; i++) { 731 for (i = 0; i < olen; i++) {
732 struct tnode *inode; 732 struct tnode *inode;
733 struct node *node = tnode_get_child(oldtnode, i); 733 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
734 struct tnode *left, *right; 734 struct tnode *left, *right;
735 int size, j; 735 int size, j;
736 736
@@ -825,7 +825,7 @@ nomem:
825static struct tnode *halve(struct trie *t, struct tnode *tn) 825static struct tnode *halve(struct trie *t, struct tnode *tn)
826{ 826{
827 struct tnode *oldtnode = tn; 827 struct tnode *oldtnode = tn;
828 struct node *left, *right; 828 struct rt_trie_node *left, *right;
829 int i; 829 int i;
830 int olen = tnode_child_length(tn); 830 int olen = tnode_child_length(tn);
831 831
@@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
856 if (!newn) 856 if (!newn)
857 goto nomem; 857 goto nomem;
858 858
859 put_child(t, tn, i/2, (struct node *)newn); 859 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
860 } 860 }
861 861
862 } 862 }
@@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key)
958{ 958{
959 int pos; 959 int pos;
960 struct tnode *tn; 960 struct tnode *tn;
961 struct node *n; 961 struct rt_trie_node *n;
962 962
963 pos = 0; 963 pos = 0;
964 n = rcu_dereference_rtnl(t->trie); 964 n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
993 993
994 key = tn->key; 994 key = tn->key;
995 995
996 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 996 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
997 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 997 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
999 tn = (struct tnode *) resize(t, (struct tnode *)tn); 999 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000 1000
1001 tnode_put_child_reorg((struct tnode *)tp, cindex, 1001 tnode_put_child_reorg((struct tnode *)tp, cindex,
1002 (struct node *)tn, wasfull); 1002 (struct rt_trie_node *)tn, wasfull);
1003 1003
1004 tp = node_parent((struct node *) tn); 1004 tp = node_parent((struct rt_trie_node *) tn);
1005 if (!tp) 1005 if (!tp)
1006 rcu_assign_pointer(t->trie, (struct node *)tn); 1006 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1007 1007
1008 tnode_free_flush(); 1008 tnode_free_flush();
1009 if (!tp) 1009 if (!tp)
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1015 if (IS_TNODE(tn)) 1015 if (IS_TNODE(tn))
1016 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1016 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017 1017
1018 rcu_assign_pointer(t->trie, (struct node *)tn); 1018 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1019 tnode_free_flush(); 1019 tnode_free_flush();
1020} 1020}
1021 1021
@@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1025{ 1025{
1026 int pos, newpos; 1026 int pos, newpos;
1027 struct tnode *tp = NULL, *tn = NULL; 1027 struct tnode *tp = NULL, *tn = NULL;
1028 struct node *n; 1028 struct rt_trie_node *n;
1029 struct leaf *l; 1029 struct leaf *l;
1030 int missbit; 1030 int missbit;
1031 struct list_head *fa_head = NULL; 1031 struct list_head *fa_head = NULL;
@@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1111 if (t->trie && n == NULL) { 1111 if (t->trie && n == NULL) {
1112 /* Case 2: n is NULL, and will just insert a new leaf */ 1112 /* Case 2: n is NULL, and will just insert a new leaf */
1113 1113
1114 node_set_parent((struct node *)l, tp); 1114 node_set_parent((struct rt_trie_node *)l, tp);
1115 1115
1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1117 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1117 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1118 } else { 1118 } else {
1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1120 /* 1120 /*
@@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1141 return NULL; 1141 return NULL;
1142 } 1142 }
1143 1143
1144 node_set_parent((struct node *)tn, tp); 1144 node_set_parent((struct rt_trie_node *)tn, tp);
1145 1145
1146 missbit = tkey_extract_bits(key, newpos, 1); 1146 missbit = tkey_extract_bits(key, newpos, 1);
1147 put_child(t, tn, missbit, (struct node *)l); 1147 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1148 put_child(t, tn, 1-missbit, n); 1148 put_child(t, tn, 1-missbit, n);
1149 1149
1150 if (tp) { 1150 if (tp) {
1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1152 put_child(t, (struct tnode *)tp, cindex, 1152 put_child(t, (struct tnode *)tp, cindex,
1153 (struct node *)tn); 1153 (struct rt_trie_node *)tn);
1154 } else { 1154 } else {
1155 rcu_assign_pointer(t->trie, (struct node *)tn); 1155 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1156 tp = tn; 1156 tp = tn;
1157 } 1157 }
1158 } 1158 }
@@ -1340,8 +1340,8 @@ err:
1340} 1340}
1341 1341
1342/* should be called with rcu_read_lock */ 1342/* should be called with rcu_read_lock */
1343static int check_leaf(struct trie *t, struct leaf *l, 1343static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1344 t_key key, const struct flowi *flp, 1344 t_key key, const struct flowi4 *flp,
1345 struct fib_result *res, int fib_flags) 1345 struct fib_result *res, int fib_flags)
1346{ 1346{
1347 struct leaf_info *li; 1347 struct leaf_info *li;
@@ -1349,40 +1349,75 @@ static int check_leaf(struct trie *t, struct leaf *l,
1349 struct hlist_node *node; 1349 struct hlist_node *node;
1350 1350
1351 hlist_for_each_entry_rcu(li, node, hhead, hlist) { 1351 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1352 int err; 1352 struct fib_alias *fa;
1353 int plen = li->plen; 1353 int plen = li->plen;
1354 __be32 mask = inet_make_mask(plen); 1354 __be32 mask = inet_make_mask(plen);
1355 1355
1356 if (l->key != (key & ntohl(mask))) 1356 if (l->key != (key & ntohl(mask)))
1357 continue; 1357 continue;
1358 1358
1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); 1359 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1360 struct fib_info *fi = fa->fa_info;
1361 int nhsel, err;
1360 1362
1363 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1364 continue;
1365 if (fa->fa_scope < flp->flowi4_scope)
1366 continue;
1367 fib_alias_accessed(fa);
1368 err = fib_props[fa->fa_type].error;
1369 if (err) {
1361#ifdef CONFIG_IP_FIB_TRIE_STATS 1370#ifdef CONFIG_IP_FIB_TRIE_STATS
1362 if (err <= 0) 1371 t->stats.semantic_match_miss++;
1363 t->stats.semantic_match_passed++; 1372#endif
1364 else 1373 return 1;
1365 t->stats.semantic_match_miss++; 1374 }
1375 if (fi->fib_flags & RTNH_F_DEAD)
1376 continue;
1377 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1378 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1379
1380 if (nh->nh_flags & RTNH_F_DEAD)
1381 continue;
1382 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1383 continue;
1384
1385#ifdef CONFIG_IP_FIB_TRIE_STATS
1386 t->stats.semantic_match_passed++;
1387#endif
1388 res->prefixlen = plen;
1389 res->nh_sel = nhsel;
1390 res->type = fa->fa_type;
1391 res->scope = fa->fa_scope;
1392 res->fi = fi;
1393 res->table = tb;
1394 res->fa_head = &li->falh;
1395 if (!(fib_flags & FIB_LOOKUP_NOREF))
1396 atomic_inc(&res->fi->fib_clntref);
1397 return 0;
1398 }
1399 }
1400
1401#ifdef CONFIG_IP_FIB_TRIE_STATS
1402 t->stats.semantic_match_miss++;
1366#endif 1403#endif
1367 if (err <= 0)
1368 return err;
1369 } 1404 }
1370 1405
1371 return 1; 1406 return 1;
1372} 1407}
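
Besides the match itself, the rewritten check_leaf() records res->table and res->fa_head, which the relocated fib_select_default() (fib_semantics.c hunk above) consumes without a second lookup. A usage sketch under that assumption:

	rcu_read_lock();
	if (fib_lookup(net, &fl4, &res) == 0 &&
	    res.prefixlen == 0 && res.type == RTN_UNICAST)
		fib_select_default(&res);	/* may reassign res.fi */
	rcu_read_unlock();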
1373 1408
1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1409int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1375 struct fib_result *res, int fib_flags) 1410 struct fib_result *res, int fib_flags)
1376{ 1411{
1377 struct trie *t = (struct trie *) tb->tb_data; 1412 struct trie *t = (struct trie *) tb->tb_data;
1378 int ret; 1413 int ret;
1379 struct node *n; 1414 struct rt_trie_node *n;
1380 struct tnode *pn; 1415 struct tnode *pn;
1381 int pos, bits; 1416 unsigned int pos, bits;
1382 t_key key = ntohl(flp->fl4_dst); 1417 t_key key = ntohl(flp->daddr);
1383 int chopped_off; 1418 unsigned int chopped_off;
1384 t_key cindex = 0; 1419 t_key cindex = 0;
1385 int current_prefix_length = KEYLENGTH; 1420 unsigned int current_prefix_length = KEYLENGTH;
1386 struct tnode *cn; 1421 struct tnode *cn;
1387 t_key pref_mismatch; 1422 t_key pref_mismatch;
1388 1423
@@ -1398,7 +1433,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1398 1433
1399 /* Just a leaf? */ 1434 /* Just a leaf? */
1400 if (IS_LEAF(n)) { 1435 if (IS_LEAF(n)) {
1401 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1436 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1402 goto found; 1437 goto found;
1403 } 1438 }
1404 1439
@@ -1423,7 +1458,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1423 } 1458 }
1424 1459
1425 if (IS_LEAF(n)) { 1460 if (IS_LEAF(n)) {
1426 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1461 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1427 if (ret > 0) 1462 if (ret > 0)
1428 goto backtrace; 1463 goto backtrace;
1429 goto found; 1464 goto found;
@@ -1541,7 +1576,7 @@ backtrace:
1541 if (chopped_off <= pn->bits) { 1576 if (chopped_off <= pn->bits) {
1542 cindex &= ~(1 << (chopped_off-1)); 1577 cindex &= ~(1 << (chopped_off-1));
1543 } else { 1578 } else {
1544 struct tnode *parent = node_parent_rcu((struct node *) pn); 1579 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1545 if (!parent) 1580 if (!parent)
1546 goto failed; 1581 goto failed;
1547 1582
@@ -1568,7 +1603,7 @@ found:
1568 */ 1603 */
1569static void trie_leaf_remove(struct trie *t, struct leaf *l) 1604static void trie_leaf_remove(struct trie *t, struct leaf *l)
1570{ 1605{
1571 struct tnode *tp = node_parent((struct node *) l); 1606 struct tnode *tp = node_parent((struct rt_trie_node *) l);
1572 1607
1573 pr_debug("entering trie_leaf_remove(%p)\n", l); 1608 pr_debug("entering trie_leaf_remove(%p)\n", l);
1574 1609
@@ -1706,7 +1741,7 @@ static int trie_flush_leaf(struct leaf *l)
1706 * Scan for the next right leaf starting at node p->child[idx] 1741 * Scan for the next right leaf starting at node p->child[idx]
1707 * Since we have back pointer, no recursion necessary. 1742 * Since we have back pointer, no recursion necessary.
1708 */ 1743 */
1709static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) 1744static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1710{ 1745{
1711 do { 1746 do {
1712 t_key idx; 1747 t_key idx;
@@ -1732,7 +1767,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1732 } 1767 }
1733 1768
1734 /* Node empty, walk back up to parent */ 1769 /* Node empty, walk back up to parent */
1735 c = (struct node *) p; 1770 c = (struct rt_trie_node *) p;
1736 } while ((p = node_parent_rcu(c)) != NULL); 1771 } while ((p = node_parent_rcu(c)) != NULL);
1737 1772
1738 return NULL; /* Root of trie */ 1773 return NULL; /* Root of trie */
@@ -1753,7 +1788,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
1753 1788
1754static struct leaf *trie_nextleaf(struct leaf *l) 1789static struct leaf *trie_nextleaf(struct leaf *l)
1755{ 1790{
1756 struct node *c = (struct node *) l; 1791 struct rt_trie_node *c = (struct rt_trie_node *) l;
1757 struct tnode *p = node_parent_rcu(c); 1792 struct tnode *p = node_parent_rcu(c);
1758 1793
1759 if (!p) 1794 if (!p)
@@ -1802,80 +1837,6 @@ void fib_free_table(struct fib_table *tb)
1802 kfree(tb); 1837 kfree(tb);
1803} 1838}
1804 1839
1805void fib_table_select_default(struct fib_table *tb,
1806 const struct flowi *flp,
1807 struct fib_result *res)
1808{
1809 struct trie *t = (struct trie *) tb->tb_data;
1810 int order, last_idx;
1811 struct fib_info *fi = NULL;
1812 struct fib_info *last_resort;
1813 struct fib_alias *fa = NULL;
1814 struct list_head *fa_head;
1815 struct leaf *l;
1816
1817 last_idx = -1;
1818 last_resort = NULL;
1819 order = -1;
1820
1821 rcu_read_lock();
1822
1823 l = fib_find_node(t, 0);
1824 if (!l)
1825 goto out;
1826
1827 fa_head = get_fa_head(l, 0);
1828 if (!fa_head)
1829 goto out;
1830
1831 if (list_empty(fa_head))
1832 goto out;
1833
1834 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1835 struct fib_info *next_fi = fa->fa_info;
1836
1837 if (fa->fa_scope != res->scope ||
1838 fa->fa_type != RTN_UNICAST)
1839 continue;
1840
1841 if (next_fi->fib_priority > res->fi->fib_priority)
1842 break;
1843 if (!next_fi->fib_nh[0].nh_gw ||
1844 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1845 continue;
1846
1847 fib_alias_accessed(fa);
1848
1849 if (fi == NULL) {
1850 if (next_fi != res->fi)
1851 break;
1852 } else if (!fib_detect_death(fi, order, &last_resort,
1853 &last_idx, tb->tb_default)) {
1854 fib_result_assign(res, fi);
1855 tb->tb_default = order;
1856 goto out;
1857 }
1858 fi = next_fi;
1859 order++;
1860 }
1861 if (order <= 0 || fi == NULL) {
1862 tb->tb_default = -1;
1863 goto out;
1864 }
1865
1866 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1867 tb->tb_default)) {
1868 fib_result_assign(res, fi);
1869 tb->tb_default = order;
1870 goto out;
1871 }
1872 if (last_idx >= 0)
1873 fib_result_assign(res, last_resort);
1874 tb->tb_default = last_idx;
1875out:
1876 rcu_read_unlock();
1877}
1878
1879static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, 1840static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1880 struct fib_table *tb, 1841 struct fib_table *tb,
1881 struct sk_buff *skb, struct netlink_callback *cb) 1842 struct sk_buff *skb, struct netlink_callback *cb)
@@ -1990,7 +1951,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1990 return skb->len; 1951 return skb->len;
1991} 1952}
1992 1953
1993void __init fib_hash_init(void) 1954void __init fib_trie_init(void)
1994{ 1955{
1995 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1956 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1996 sizeof(struct fib_alias), 1957 sizeof(struct fib_alias),
@@ -2003,8 +1964,7 @@ void __init fib_hash_init(void)
2003} 1964}
2004 1965
2005 1966
2006/* Fix more generic FIB names for init later */ 1967struct fib_table *fib_trie_table(u32 id)
2007struct fib_table *fib_hash_table(u32 id)
2008{ 1968{
2009 struct fib_table *tb; 1969 struct fib_table *tb;
2010 struct trie *t; 1970 struct trie *t;
@@ -2036,7 +1996,7 @@ struct fib_trie_iter {
2036 unsigned int depth; 1996 unsigned int depth;
2037}; 1997};
2038 1998
2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 1999static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2040{ 2000{
2041 struct tnode *tn = iter->tnode; 2001 struct tnode *tn = iter->tnode;
2042 unsigned int cindex = iter->index; 2002 unsigned int cindex = iter->index;
@@ -2050,7 +2010,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2050 iter->tnode, iter->index, iter->depth); 2010 iter->tnode, iter->index, iter->depth);
2051rescan: 2011rescan:
2052 while (cindex < (1<<tn->bits)) { 2012 while (cindex < (1<<tn->bits)) {
2053 struct node *n = tnode_get_child_rcu(tn, cindex); 2013 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2054 2014
2055 if (n) { 2015 if (n) {
2056 if (IS_LEAF(n)) { 2016 if (IS_LEAF(n)) {
@@ -2069,7 +2029,7 @@ rescan:
2069 } 2029 }
2070 2030
2071 /* Current node exhausted, pop back up */ 2031 /* Current node exhausted, pop back up */
2072 p = node_parent_rcu((struct node *)tn); 2032 p = node_parent_rcu((struct rt_trie_node *)tn);
2073 if (p) { 2033 if (p) {
2074 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 2034 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2075 tn = p; 2035 tn = p;
@@ -2081,10 +2041,10 @@ rescan:
2081 return NULL; 2041 return NULL;
2082} 2042}
2083 2043
2084static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2044static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2085 struct trie *t) 2045 struct trie *t)
2086{ 2046{
2087 struct node *n; 2047 struct rt_trie_node *n;
2088 2048
2089 if (!t) 2049 if (!t)
2090 return NULL; 2050 return NULL;
@@ -2108,7 +2068,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2108 2068
2109static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2069static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2110{ 2070{
2111 struct node *n; 2071 struct rt_trie_node *n;
2112 struct fib_trie_iter iter; 2072 struct fib_trie_iter iter;
2113 2073
2114 memset(s, 0, sizeof(*s)); 2074 memset(s, 0, sizeof(*s));
@@ -2181,7 +2141,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2181 seq_putc(seq, '\n'); 2141 seq_putc(seq, '\n');
2182 seq_printf(seq, "\tPointers: %u\n", pointers); 2142 seq_printf(seq, "\tPointers: %u\n", pointers);
2183 2143
2184 bytes += sizeof(struct node *) * pointers; 2144 bytes += sizeof(struct rt_trie_node *) * pointers;
2185 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2145 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2186 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2146 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2187} 2147}
@@ -2262,7 +2222,7 @@ static const struct file_operations fib_triestat_fops = {
2262 .release = single_release_net, 2222 .release = single_release_net,
2263}; 2223};
2264 2224
2265static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2225static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2266{ 2226{
2267 struct fib_trie_iter *iter = seq->private; 2227 struct fib_trie_iter *iter = seq->private;
2268 struct net *net = seq_file_net(seq); 2228 struct net *net = seq_file_net(seq);
@@ -2275,7 +2235,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2275 struct fib_table *tb; 2235 struct fib_table *tb;
2276 2236
2277 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { 2237 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2278 struct node *n; 2238 struct rt_trie_node *n;
2279 2239
2280 for (n = fib_trie_get_first(iter, 2240 for (n = fib_trie_get_first(iter,
2281 (struct trie *) tb->tb_data); 2241 (struct trie *) tb->tb_data);
@@ -2304,7 +2264,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2304 struct fib_table *tb = iter->tb; 2264 struct fib_table *tb = iter->tb;
2305 struct hlist_node *tb_node; 2265 struct hlist_node *tb_node;
2306 unsigned int h; 2266 unsigned int h;
2307 struct node *n; 2267 struct rt_trie_node *n;
2308 2268
2309 ++*pos; 2269 ++*pos;
2310 /* next node in same table */ 2270 /* next node in same table */
@@ -2390,7 +2350,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2390static int fib_trie_seq_show(struct seq_file *seq, void *v) 2350static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{ 2351{
2392 const struct fib_trie_iter *iter = seq->private; 2352 const struct fib_trie_iter *iter = seq->private;
2393 struct node *n = v; 2353 struct rt_trie_node *n = v;
2394 2354
2395 if (!node_parent_rcu(n)) 2355 if (!node_parent_rcu(n))
2396 fib_table_print(seq, iter->tb); 2356 fib_table_print(seq, iter->tb);
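The fib_trie.c hunks above are almost entirely mechanical: struct node becomes struct rt_trie_node (disambiguating it from the many other "node" types in the tree), fib_hash_init() and fib_hash_table() become fib_trie_init() and fib_trie_table() now that the trie is the only remaining FIB lookup algorithm, and fib_table_select_default() moves out of this file. The traversal logic itself is unchanged: scan a tnode's children, descend to the first leaf, and climb back to the parent once a node is exhausted. A minimal self-contained model of that walk, with toy types and a fixed fan-out of 4 standing in for the kernel's tnode/rt_trie_node and key-bit child indexing (an illustration only, not the kernel code):

    #include <stdio.h>

    #define FANOUT 4

    /* Toy stand-in for rt_trie_node/tnode: a leaf carries a key, an
     * internal node carries children plus a parent back-pointer. */
    struct tnode {
            struct tnode *parent;
            int is_leaf;
            unsigned long key;              /* leaves only */
            struct tnode *child[FANOUT];    /* internal nodes only */
    };

    static struct tnode *first_leaf(struct tnode *n)
    {
            int i;

            if (!n)
                    return NULL;
            if (n->is_leaf)
                    return n;
            for (i = 0; i < FANOUT; i++) {
                    struct tnode *l = first_leaf(n->child[i]);
                    if (l)
                            return l;
            }
            return NULL;
    }

    /* Same shape as leaf_walk_rcu()/trie_nextleaf() above: try the
     * remaining siblings, descend into the first subtree holding a
     * leaf, and pop back up when a node is exhausted.  The kernel
     * derives the sibling index from key bits; we simply search. */
    static struct tnode *next_leaf(struct tnode *l)
    {
            struct tnode *c = l, *p = l->parent;

            while (p) {
                    int i = 0;

                    while (p->child[i] != c)        /* index of c in p */
                            i++;
                    for (i++; i < FANOUT; i++) {
                            struct tnode *leaf = first_leaf(p->child[i]);
                            if (leaf)
                                    return leaf;
                    }
                    c = p;                  /* node empty, walk back up */
                    p = p->parent;
            }
            return NULL;                    /* root of trie */
    }

    int main(void)
    {
            struct tnode l1 = { .is_leaf = 1, .key = 1 };
            struct tnode l2 = { .is_leaf = 1, .key = 2 };
            struct tnode root = { .child = { &l1, &l2 } };
            struct tnode *n;

            l1.parent = l2.parent = &root;
            for (n = first_leaf(&root); n; n = next_leaf(n))
                    printf("leaf %lu\n", n->key);
            return 0;
    }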
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea0..a91dc1611081 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
233 * Send an ICMP frame. 233 * Send an ICMP frame.
234 */ 234 */
235 235
236/* 236static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
237 * Check transmit rate limitation for given message.
238 * The rate information is held in the destination cache now.
239 * This function is generic and could be used for other purposes
240 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
241 *
242 * Note that the same dst_entry fields are modified by functions in
243 * route.c too, but these work for packet destinations while xrlim_allow
244 * works for icmp destinations. This means the rate limiting information
245 * for one "ip object" is shared - and these ICMPs are twice limited:
246 * by source and by destination.
247 *
248 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
249 * SHOULD allow setting of rate limits
250 *
251 * Shared between ICMPv4 and ICMPv6.
252 */
253#define XRLIM_BURST_FACTOR 6
254int xrlim_allow(struct dst_entry *dst, int timeout)
255{
256 unsigned long now, token = dst->rate_tokens;
257 int rc = 0;
258
259 now = jiffies;
260 token += now - dst->rate_last;
261 dst->rate_last = now;
262 if (token > XRLIM_BURST_FACTOR * timeout)
263 token = XRLIM_BURST_FACTOR * timeout;
264 if (token >= timeout) {
265 token -= timeout;
266 rc = 1;
267 }
268 dst->rate_tokens = token;
269 return rc;
270}
271EXPORT_SYMBOL(xrlim_allow);
272
273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
274 int type, int code) 237 int type, int code)
275{ 238{
276 struct dst_entry *dst = &rt->dst; 239 struct dst_entry *dst = &rt->dst;
277 int rc = 1; 240 bool rc = true;
278 241
279 if (type > NR_ICMP_TYPES) 242 if (type > NR_ICMP_TYPES)
280 goto out; 243 goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
288 goto out; 251 goto out;
289 252
290 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
291 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
292 rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); 255 if (!rt->peer)
256 rt_bind_peer(rt, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
258 net->ipv4.sysctl_icmp_ratelimit);
259 }
293out: 260out:
294 return rc; 261 return rc;
295} 262}
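With this hunk, ICMP rate limiting no longer keeps its token state on the dst_entry: icmpv4_xrlim_allow() binds an inet_peer to the route on demand (rt_bind_peer()) and defers to inet_peer_xrlim_allow(), whose body is added in the inetpeer.c hunk further down in this patch. A rough self-contained model of that token bucket, with an abstract tick counter standing in for jiffies:

    #include <stdbool.h>
    #include <stdio.h>

    #define XRLIM_BURST_FACTOR 6

    struct peer {
            unsigned long rate_tokens;
            unsigned long rate_last;        /* tick of the last check */
    };

    /* Credit one token per elapsed tick, cap the bucket at a burst of
     * 6 * timeout, and spend 'timeout' tokens per allowed message. */
    static bool xrlim_allow(struct peer *p, unsigned long now,
                            unsigned long timeout)
    {
            unsigned long token = p->rate_tokens + (now - p->rate_last);
            bool ok = false;

            p->rate_last = now;
            if (token > XRLIM_BURST_FACTOR * timeout)
                    token = XRLIM_BURST_FACTOR * timeout;
            if (token >= timeout) {
                    token -= timeout;
                    ok = true;
            }
            p->rate_tokens = token;
            return ok;
    }

    int main(void)
    {
            struct peer p = { XRLIM_BURST_FACTOR * 10, 0 };
            unsigned long t;

            /* With timeout 10: six back-to-back sends drain the burst,
             * then roughly one send is allowed per ten ticks. */
            for (t = 0; t < 20; t++)
                    printf("tick %2lu: %s\n", t,
                           xrlim_allow(&p, t, 10) ? "send" : "drop");
            return 0;
    }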
@@ -386,12 +353,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
386 daddr = icmp_param->replyopts.faddr; 353 daddr = icmp_param->replyopts.faddr;
387 } 354 }
388 { 355 {
389 struct flowi fl = { .fl4_dst= daddr, 356 struct flowi4 fl4 = {
390 .fl4_src = rt->rt_spec_dst, 357 .daddr = daddr,
391 .fl4_tos = RT_TOS(ip_hdr(skb)->tos), 358 .saddr = rt->rt_spec_dst,
392 .proto = IPPROTO_ICMP }; 359 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
393 security_skb_classify_flow(skb, &fl); 360 .flowi4_proto = IPPROTO_ICMP,
394 if (ip_route_output_key(net, &rt, &fl)) 361 };
362 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
363 rt = ip_route_output_key(net, &fl4);
364 if (IS_ERR(rt))
395 goto out_unlock; 365 goto out_unlock;
396 } 366 }
397 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, 367 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
@@ -402,6 +372,97 @@ out_unlock:
402 icmp_xmit_unlock(sk); 372 icmp_xmit_unlock(sk);
403} 373}
404 374
375static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in,
376 struct iphdr *iph,
377 __be32 saddr, u8 tos,
378 int type, int code,
379 struct icmp_bxm *param)
380{
381 struct flowi4 fl4 = {
382 .daddr = (param->replyopts.srr ?
383 param->replyopts.faddr : iph->saddr),
384 .saddr = saddr,
385 .flowi4_tos = RT_TOS(tos),
386 .flowi4_proto = IPPROTO_ICMP,
387 .fl4_icmp_type = type,
388 .fl4_icmp_code = code,
389 };
390 struct rtable *rt, *rt2;
391 int err;
392
393 security_skb_classify_flow(skb_in, flowi4_to_flowi(&fl4));
394 rt = __ip_route_output_key(net, &fl4);
395 if (IS_ERR(rt))
396 return rt;
397
398 /* No need to clone since we're just using its address. */
399 rt2 = rt;
400
401 if (!fl4.saddr)
402 fl4.saddr = rt->rt_src;
403
404 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
405 flowi4_to_flowi(&fl4), NULL, 0);
406 if (!IS_ERR(rt)) {
407 if (rt != rt2)
408 return rt;
409 } else if (PTR_ERR(rt) == -EPERM) {
410 rt = NULL;
411 } else
412 return rt;
413
414 err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4), AF_INET);
415 if (err)
416 goto relookup_failed;
417
418 if (inet_addr_type(net, fl4.saddr) == RTN_LOCAL) {
419 rt2 = __ip_route_output_key(net, &fl4);
420 if (IS_ERR(rt2))
421 err = PTR_ERR(rt2);
422 } else {
423 struct flowi4 fl4_2 = {};
424 unsigned long orefdst;
425
426 fl4_2.daddr = fl4.saddr;
427 rt2 = ip_route_output_key(net, &fl4_2);
428 if (IS_ERR(rt2)) {
429 err = PTR_ERR(rt2);
430 goto relookup_failed;
431 }
432 /* Ugh! */
433 orefdst = skb_in->_skb_refdst; /* save old refdst */
434 err = ip_route_input(skb_in, fl4.daddr, fl4.saddr,
435 RT_TOS(tos), rt2->dst.dev);
436
437 dst_release(&rt2->dst);
438 rt2 = skb_rtable(skb_in);
439 skb_in->_skb_refdst = orefdst; /* restore old refdst */
440 }
441
442 if (err)
443 goto relookup_failed;
444
445 rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
446 flowi4_to_flowi(&fl4), NULL,
447 XFRM_LOOKUP_ICMP);
448 if (!IS_ERR(rt2)) {
449 dst_release(&rt->dst);
450 rt = rt2;
451 } else if (PTR_ERR(rt2) == -EPERM) {
452 if (rt)
453 dst_release(&rt->dst);
454 return rt2;
455 } else {
456 err = PTR_ERR(rt2);
457 goto relookup_failed;
458 }
459 return rt;
460
461relookup_failed:
462 if (rt)
463 return rt;
464 return ERR_PTR(err);
465}
405 466
406/* 467/*
407 * Send an ICMP message in response to a situation 468 * Send an ICMP message in response to a situation
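The factored-out icmp_route_lookup() also shows the error-handling convention this series converges on: route lookups return the struct rtable * directly and encode failure in the pointer via ERR_PTR(), recovered with IS_ERR()/PTR_ERR(), instead of returning an int and filling an output parameter. A self-contained model of the idiom (the three helpers mirror what linux/err.h provides in the kernel; these re-definitions and route_lookup() exist only for illustration):

    #include <stdio.h>
    #include <errno.h>

    /* Userspace mirror of linux/err.h: errno values are folded into the
     * top MAX_ERRNO addresses of the pointer range, so one return value
     * is either a valid pointer or an encoded error, never both. */
    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    struct rtable { int dummy; };
    static struct rtable a_route;

    /* Hypothetical lookup in the style of the converted
     * __ip_route_output_key() above. */
    static struct rtable *route_lookup(int fail)
    {
            return fail ? ERR_PTR(-ENETUNREACH) : &a_route;
    }

    int main(void)
    {
            struct rtable *rt = route_lookup(1);

            if (IS_ERR(rt))         /* the caller pattern used in this diff */
                    printf("lookup failed: %ld\n", PTR_ERR(rt));

            rt = route_lookup(0);
            if (!IS_ERR(rt))
                    printf("lookup ok\n");
            return 0;
    }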
@@ -507,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
507 rcu_read_lock(); 568 rcu_read_lock();
508 if (rt_is_input_route(rt) && 569 if (rt_is_input_route(rt) &&
509 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 570 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
510 dev = dev_get_by_index_rcu(net, rt->fl.iif); 571 dev = dev_get_by_index_rcu(net, rt->rt_iif);
511 572
512 if (dev) 573 if (dev)
513 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 574 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -539,86 +600,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
539 ipc.opt = &icmp_param.replyopts; 600 ipc.opt = &icmp_param.replyopts;
540 ipc.tx_flags = 0; 601 ipc.tx_flags = 0;
541 602
542 { 603 rt = icmp_route_lookup(net, skb_in, iph, saddr, tos,
543 struct flowi fl = { 604 type, code, &icmp_param);
544 .fl4_dst = icmp_param.replyopts.srr ? 605 if (IS_ERR(rt))
545 icmp_param.replyopts.faddr : iph->saddr, 606 goto out_unlock;
546 .fl4_src = saddr,
547 .fl4_tos = RT_TOS(tos),
548 .proto = IPPROTO_ICMP,
549 .fl_icmp_type = type,
550 .fl_icmp_code = code,
551 };
552 int err;
553 struct rtable *rt2;
554
555 security_skb_classify_flow(skb_in, &fl);
556 if (__ip_route_output_key(net, &rt, &fl))
557 goto out_unlock;
558
559 /* No need to clone since we're just using its address. */
560 rt2 = rt;
561
562 if (!fl.nl_u.ip4_u.saddr)
563 fl.nl_u.ip4_u.saddr = rt->rt_src;
564
565 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
566 switch (err) {
567 case 0:
568 if (rt != rt2)
569 goto route_done;
570 break;
571 case -EPERM:
572 rt = NULL;
573 break;
574 default:
575 goto out_unlock;
576 }
577
578 if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
579 goto relookup_failed;
580
581 if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
582 err = __ip_route_output_key(net, &rt2, &fl);
583 else {
584 struct flowi fl2 = {};
585 unsigned long orefdst;
586
587 fl2.fl4_dst = fl.fl4_src;
588 if (ip_route_output_key(net, &rt2, &fl2))
589 goto relookup_failed;
590
591 /* Ugh! */
592 orefdst = skb_in->_skb_refdst; /* save old refdst */
593 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
594 RT_TOS(tos), rt2->dst.dev);
595
596 dst_release(&rt2->dst);
597 rt2 = skb_rtable(skb_in);
598 skb_in->_skb_refdst = orefdst; /* restore old refdst */
599 }
600
601 if (err)
602 goto relookup_failed;
603
604 err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
605 XFRM_LOOKUP_ICMP);
606 switch (err) {
607 case 0:
608 dst_release(&rt->dst);
609 rt = rt2;
610 break;
611 case -EPERM:
612 goto ende;
613 default:
614relookup_failed:
615 if (!rt)
616 goto out_unlock;
617 break;
618 }
619 }
620 607
621route_done:
622 if (!icmpv4_xrlim_allow(net, rt, type, code)) 608 if (!icmpv4_xrlim_allow(net, rt, type, code))
623 goto ende; 609 goto ende;
624 610
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e0e77e297de3..1fd3d9ce8398 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -321,14 +321,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
321 } 321 }
322 igmp_skb_size(skb) = size; 322 igmp_skb_size(skb) = size;
323 323
324 { 324 rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0,
325 struct flowi fl = { .oif = dev->ifindex, 325 0, 0,
326 .fl4_dst = IGMPV3_ALL_MCR, 326 IPPROTO_IGMP, 0, dev->ifindex);
327 .proto = IPPROTO_IGMP }; 327 if (IS_ERR(rt)) {
328 if (ip_route_output_key(net, &rt, &fl)) { 328 kfree_skb(skb);
329 kfree_skb(skb); 329 return NULL;
330 return NULL;
331 }
332 } 330 }
333 if (rt->rt_src == 0) { 331 if (rt->rt_src == 0) {
334 kfree_skb(skb); 332 kfree_skb(skb);
@@ -666,13 +664,12 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
666 else 664 else
667 dst = group; 665 dst = group;
668 666
669 { 667 rt = ip_route_output_ports(net, NULL, dst, 0,
670 struct flowi fl = { .oif = dev->ifindex, 668 0, 0,
671 .fl4_dst = dst, 669 IPPROTO_IGMP, 0, dev->ifindex);
672 .proto = IPPROTO_IGMP }; 670 if (IS_ERR(rt))
673 if (ip_route_output_key(net, &rt, &fl)) 671 return -1;
674 return -1; 672
675 }
676 if (rt->rt_src == 0) { 673 if (rt->rt_src == 0) {
677 ip_rt_put(rt); 674 ip_rt_put(rt);
678 return -1; 675 return -1;
@@ -1439,8 +1436,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1439/* RTNL is locked */ 1436/* RTNL is locked */
1440static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1437static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1441{ 1438{
1442 struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
1443 struct rtable *rt;
1444 struct net_device *dev = NULL; 1439 struct net_device *dev = NULL;
1445 struct in_device *idev = NULL; 1440 struct in_device *idev = NULL;
1446 1441
@@ -1454,9 +1449,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1454 return NULL; 1449 return NULL;
1455 } 1450 }
1456 1451
1457 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1452 if (!dev) {
1458 dev = rt->dst.dev; 1453 struct rtable *rt = ip_route_output(net,
1459 ip_rt_put(rt); 1454 imr->imr_multiaddr.s_addr,
1455 0, 0, 0);
1456 if (!IS_ERR(rt)) {
1457 dev = rt->dst.dev;
1458 ip_rt_put(rt);
1459 }
1460 } 1460 }
1461 if (dev) { 1461 if (dev) {
1462 imr->imr_ifindex = dev->ifindex; 1462 imr->imr_ifindex = dev->ifindex;
@@ -2329,13 +2329,13 @@ void ip_mc_drop_socket(struct sock *sk)
2329 rtnl_unlock(); 2329 rtnl_unlock();
2330} 2330}
2331 2331
2332int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) 2332/* called with rcu_read_lock() */
2333int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
2333{ 2334{
2334 struct ip_mc_list *im; 2335 struct ip_mc_list *im;
2335 struct ip_sf_list *psf; 2336 struct ip_sf_list *psf;
2336 int rv = 0; 2337 int rv = 0;
2337 2338
2338 rcu_read_lock();
2339 for_each_pmc_rcu(in_dev, im) { 2339 for_each_pmc_rcu(in_dev, im) {
2340 if (im->multiaddr == mc_addr) 2340 if (im->multiaddr == mc_addr)
2341 break; 2341 break;
@@ -2357,7 +2357,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2357 } else 2357 } else
2358 rv = 1; /* unspecified source; tentatively allow */ 2358 rv = 1; /* unspecified source; tentatively allow */
2359 } 2359 }
2360 rcu_read_unlock();
2361 return rv; 2360 return rv;
2362} 2361}
2363 2362
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 97e5fb765265..6c0b7f4a3d7d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -356,20 +356,23 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
356 struct rtable *rt; 356 struct rtable *rt;
357 const struct inet_request_sock *ireq = inet_rsk(req); 357 const struct inet_request_sock *ireq = inet_rsk(req);
358 struct ip_options *opt = inet_rsk(req)->opt; 358 struct ip_options *opt = inet_rsk(req)->opt;
359 struct flowi fl = { .oif = sk->sk_bound_dev_if, 359 struct flowi4 fl4 = {
360 .mark = sk->sk_mark, 360 .flowi4_oif = sk->sk_bound_dev_if,
361 .fl4_dst = ((opt && opt->srr) ? 361 .flowi4_mark = sk->sk_mark,
362 opt->faddr : ireq->rmt_addr), 362 .daddr = ((opt && opt->srr) ?
363 .fl4_src = ireq->loc_addr, 363 opt->faddr : ireq->rmt_addr),
364 .fl4_tos = RT_CONN_FLAGS(sk), 364 .saddr = ireq->loc_addr,
365 .proto = sk->sk_protocol, 365 .flowi4_tos = RT_CONN_FLAGS(sk),
366 .flags = inet_sk_flowi_flags(sk), 366 .flowi4_proto = sk->sk_protocol,
367 .fl_ip_sport = inet_sk(sk)->inet_sport, 367 .flowi4_flags = inet_sk_flowi_flags(sk),
368 .fl_ip_dport = ireq->rmt_port }; 368 .fl4_sport = inet_sk(sk)->inet_sport,
369 .fl4_dport = ireq->rmt_port,
370 };
369 struct net *net = sock_net(sk); 371 struct net *net = sock_net(sk);
370 372
371 security_req_classify_flow(req, &fl); 373 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
372 if (ip_route_output_flow(net, &rt, &fl, sk, 0)) 374 rt = ip_route_output_flow(net, &fl4, sk);
375 if (IS_ERR(rt))
373 goto no_route; 376 goto no_route;
374 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 377 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
375 goto route_err; 378 goto route_err;
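This single hunk is a compact summary of the tree-wide flowi to flowi4 conversion: the untyped struct flowi with its fl4_*/fl_ip_* members becomes a typed struct flowi4, generic members gain a flowi4_ prefix, the addresses become plain daddr/saddr, and code that still needs the generic key converts with flowi4_to_flowi(). The field mapping below is taken from the hunks in this patch; the fragment is kernel-style, and remote_addr/local_addr are placeholder values, not real identifiers:

    /* old struct flowi member        new struct flowi4 member
     * .oif                      ->   .flowi4_oif
     * .mark                     ->   .flowi4_mark
     * .fl4_dst / .fl4_src       ->   .daddr / .saddr
     * .fl4_tos                  ->   .flowi4_tos
     * .proto                    ->   .flowi4_proto
     * .flags                    ->   .flowi4_flags
     * .fl_ip_sport / .fl_ip_dport -> .fl4_sport / .fl4_dport
     */
    struct flowi4 fl4 = {
            .flowi4_oif   = sk->sk_bound_dev_if,
            .flowi4_mark  = sk->sk_mark,
            .daddr        = remote_addr,
            .saddr        = local_addr,
            .flowi4_proto = sk->sk_protocol,
    };
    security_req_classify_flow(req, flowi4_to_flowi(&fl4));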
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index a96e65674ac3..dd1b20eca1a2 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = {
81 81
82struct inet_peer_base { 82struct inet_peer_base {
83 struct inet_peer __rcu *root; 83 struct inet_peer __rcu *root;
84 spinlock_t lock; 84 seqlock_t lock;
85 int total; 85 int total;
86}; 86};
87 87
88static struct inet_peer_base v4_peers = { 88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu, 89 .root = peer_avl_empty_rcu,
90 .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), 90 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
91 .total = 0, 91 .total = 0,
92}; 92};
93 93
94static struct inet_peer_base v6_peers = { 94static struct inet_peer_base v6_peers = {
95 .root = peer_avl_empty_rcu, 95 .root = peer_avl_empty_rcu,
96 .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), 96 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
97 .total = 0, 97 .total = 0,
98}; 98};
99 99
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
167 int i, n = (a->family == AF_INET ? 1 : 4); 167 int i, n = (a->family == AF_INET ? 1 : 4);
168 168
169 for (i = 0; i < n; i++) { 169 for (i = 0; i < n; i++) {
170 if (a->a6[i] == b->a6[i]) 170 if (a->addr.a6[i] == b->addr.a6[i])
171 continue; 171 continue;
172 if (a->a6[i] < b->a6[i]) 172 if (a->addr.a6[i] < b->addr.a6[i])
173 return -1; 173 return -1;
174 return 1; 174 return 1;
175 } 175 }
@@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a,
177 return 0; 177 return 0;
178} 178}
179 179
180#define rcu_deref_locked(X, BASE) \
181 rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
182
180/* 183/*
181 * Called with local BH disabled and the pool lock held. 184 * Called with local BH disabled and the pool lock held.
182 */ 185 */
@@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a,
187 \ 190 \
188 stackptr = _stack; \ 191 stackptr = _stack; \
189 *stackptr++ = &_base->root; \ 192 *stackptr++ = &_base->root; \
190 for (u = rcu_dereference_protected(_base->root, \ 193 for (u = rcu_deref_locked(_base->root, _base); \
191 lockdep_is_held(&_base->lock)); \
192 u != peer_avl_empty; ) { \ 194 u != peer_avl_empty; ) { \
193 int cmp = addr_compare(_daddr, &u->daddr); \ 195 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \ 196 if (cmp == 0) \
@@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a,
198 else \ 200 else \
199 v = &u->avl_right; \ 201 v = &u->avl_right; \
200 *stackptr++ = v; \ 202 *stackptr++ = v; \
201 u = rcu_dereference_protected(*v, \ 203 u = rcu_deref_locked(*v, _base); \
202 lockdep_is_held(&_base->lock)); \
203 } \ 204 } \
204 u; \ 205 u; \
205}) 206})
206 207
207/* 208/*
208 * Called with rcu_read_lock_bh() 209 * Called with rcu_read_lock()
209 * Because we hold no lock against a writer, its quite possible we fall 210 * Because we hold no lock against a writer, its quite possible we fall
210 * in an endless loop. 211 * in an endless loop.
211 * But every pointer we follow is guaranteed to be valid thanks to RCU. 212 * But every pointer we follow is guaranteed to be valid thanks to RCU.
212 * We exit from this function if number of links exceeds PEER_MAXDEPTH 213 * We exit from this function if number of links exceeds PEER_MAXDEPTH
213 */ 214 */
214static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, 215static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
215 struct inet_peer_base *base) 216 struct inet_peer_base *base)
216{ 217{
217 struct inet_peer *u = rcu_dereference_bh(base->root); 218 struct inet_peer *u = rcu_dereference(base->root);
218 int count = 0; 219 int count = 0;
219 220
220 while (u != peer_avl_empty) { 221 while (u != peer_avl_empty) {
@@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
230 return u; 231 return u;
231 } 232 }
232 if (cmp == -1) 233 if (cmp == -1)
233 u = rcu_dereference_bh(u->avl_left); 234 u = rcu_dereference(u->avl_left);
234 else 235 else
235 u = rcu_dereference_bh(u->avl_right); 236 u = rcu_dereference(u->avl_right);
236 if (unlikely(++count == PEER_MAXDEPTH)) 237 if (unlikely(++count == PEER_MAXDEPTH))
237 break; 238 break;
238 } 239 }
@@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
246 struct inet_peer __rcu **v; \ 247 struct inet_peer __rcu **v; \
247 *stackptr++ = &start->avl_left; \ 248 *stackptr++ = &start->avl_left; \
248 v = &start->avl_left; \ 249 v = &start->avl_left; \
249 for (u = rcu_dereference_protected(*v, \ 250 for (u = rcu_deref_locked(*v, base); \
250 lockdep_is_held(&base->lock)); \
251 u->avl_right != peer_avl_empty_rcu; ) { \ 251 u->avl_right != peer_avl_empty_rcu; ) { \
252 v = &u->avl_right; \ 252 v = &u->avl_right; \
253 *stackptr++ = v; \ 253 *stackptr++ = v; \
254 u = rcu_dereference_protected(*v, \ 254 u = rcu_deref_locked(*v, base); \
255 lockdep_is_held(&base->lock)); \
256 } \ 255 } \
257 u; \ 256 u; \
258}) 257})
@@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
271 270
272 while (stackend > stack) { 271 while (stackend > stack) {
273 nodep = *--stackend; 272 nodep = *--stackend;
274 node = rcu_dereference_protected(*nodep, 273 node = rcu_deref_locked(*nodep, base);
275 lockdep_is_held(&base->lock)); 274 l = rcu_deref_locked(node->avl_left, base);
276 l = rcu_dereference_protected(node->avl_left, 275 r = rcu_deref_locked(node->avl_right, base);
277 lockdep_is_held(&base->lock));
278 r = rcu_dereference_protected(node->avl_right,
279 lockdep_is_held(&base->lock));
280 lh = node_height(l); 276 lh = node_height(l);
281 rh = node_height(r); 277 rh = node_height(r);
282 if (lh > rh + 1) { /* l: RH+2 */ 278 if (lh > rh + 1) { /* l: RH+2 */
283 struct inet_peer *ll, *lr, *lrl, *lrr; 279 struct inet_peer *ll, *lr, *lrl, *lrr;
284 int lrh; 280 int lrh;
285 ll = rcu_dereference_protected(l->avl_left, 281 ll = rcu_deref_locked(l->avl_left, base);
286 lockdep_is_held(&base->lock)); 282 lr = rcu_deref_locked(l->avl_right, base);
287 lr = rcu_dereference_protected(l->avl_right,
288 lockdep_is_held(&base->lock));
289 lrh = node_height(lr); 283 lrh = node_height(lr);
290 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 284 if (lrh <= node_height(ll)) { /* ll: RH+1 */
291 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ 285 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
296 l->avl_height = node->avl_height + 1; 290 l->avl_height = node->avl_height + 1;
297 RCU_INIT_POINTER(*nodep, l); 291 RCU_INIT_POINTER(*nodep, l);
298 } else { /* ll: RH, lr: RH+1 */ 292 } else { /* ll: RH, lr: RH+1 */
299 lrl = rcu_dereference_protected(lr->avl_left, 293 lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
300 lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ 294 lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
301 lrr = rcu_dereference_protected(lr->avl_right,
302 lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
303 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ 295 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
304 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ 296 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
305 node->avl_height = rh + 1; /* node: RH+1 */ 297 node->avl_height = rh + 1; /* node: RH+1 */
@@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
314 } else if (rh > lh + 1) { /* r: LH+2 */ 306 } else if (rh > lh + 1) { /* r: LH+2 */
315 struct inet_peer *rr, *rl, *rlr, *rll; 307 struct inet_peer *rr, *rl, *rlr, *rll;
316 int rlh; 308 int rlh;
317 rr = rcu_dereference_protected(r->avl_right, 309 rr = rcu_deref_locked(r->avl_right, base);
318 lockdep_is_held(&base->lock)); 310 rl = rcu_deref_locked(r->avl_left, base);
319 rl = rcu_dereference_protected(r->avl_left,
320 lockdep_is_held(&base->lock));
321 rlh = node_height(rl); 311 rlh = node_height(rl);
322 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 312 if (rlh <= node_height(rr)) { /* rr: LH+1 */
323 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ 313 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
328 r->avl_height = node->avl_height + 1; 318 r->avl_height = node->avl_height + 1;
329 RCU_INIT_POINTER(*nodep, r); 319 RCU_INIT_POINTER(*nodep, r);
330 } else { /* rr: RH, rl: RH+1 */ 320 } else { /* rr: RH, rl: RH+1 */
331 rlr = rcu_dereference_protected(rl->avl_right, 321 rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
332 lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ 322 rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
333 rll = rcu_dereference_protected(rl->avl_left,
334 lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
335 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ 323 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
336 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ 324 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
337 node->avl_height = lh + 1; /* node: LH+1 */ 325 node->avl_height = lh + 1; /* node: LH+1 */
@@ -372,7 +360,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
372 360
373 do_free = 0; 361 do_free = 0;
374 362
375 spin_lock_bh(&base->lock); 363 write_seqlock_bh(&base->lock);
376 /* Check the reference counter. It was artificially incremented by 1 364 /* Check the reference counter. It was artificially incremented by 1
377 * in cleanup() function to prevent sudden disappearing. If we can 365 * in cleanup() function to prevent sudden disappearing. If we can
378 * atomically (because of lockless readers) take this last reference, 366 * atomically (because of lockless readers) take this last reference,
@@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
392 /* look for a node to insert instead of p */ 380 /* look for a node to insert instead of p */
393 struct inet_peer *t; 381 struct inet_peer *t;
394 t = lookup_rightempty(p, base); 382 t = lookup_rightempty(p, base);
395 BUG_ON(rcu_dereference_protected(*stackptr[-1], 383 BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
396 lockdep_is_held(&base->lock)) != t);
397 **--stackptr = t->avl_left; 384 **--stackptr = t->avl_left;
398 /* t is removed, t->daddr > x->daddr for any 385 /* t is removed, t->daddr > x->daddr for any
399 * x in p->avl_left subtree. 386 * x in p->avl_left subtree.
@@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
409 base->total--; 396 base->total--;
410 do_free = 1; 397 do_free = 1;
411 } 398 }
412 spin_unlock_bh(&base->lock); 399 write_sequnlock_bh(&base->lock);
413 400
414 if (do_free) 401 if (do_free)
415 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 402 call_rcu(&p->rcu, inetpeer_free_rcu);
416 else 403 else
417 /* The node is used again. Decrease the reference counter 404 /* The node is used again. Decrease the reference counter
418 * back. The loop "cleanup -> unlink_from_unused 405 * back. The loop "cleanup -> unlink_from_unused
@@ -477,13 +464,17 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 464 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(daddr->family); 465 struct inet_peer_base *base = family_to_base(daddr->family);
479 struct inet_peer *p; 466 struct inet_peer *p;
467 unsigned int sequence;
468 int invalidated;
480 469
481 /* Look up for the address quickly, lockless. 470 /* Look up for the address quickly, lockless.
482 * Because of a concurrent writer, we might not find an existing entry. 471 * Because of a concurrent writer, we might not find an existing entry.
483 */ 472 */
484 rcu_read_lock_bh(); 473 rcu_read_lock();
485 p = lookup_rcu_bh(daddr, base); 474 sequence = read_seqbegin(&base->lock);
486 rcu_read_unlock_bh(); 475 p = lookup_rcu(daddr, base);
476 invalidated = read_seqretry(&base->lock, sequence);
477 rcu_read_unlock();
487 478
488 if (p) { 479 if (p) {
489 /* The existing node has been found. 480 /* The existing node has been found.
@@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
493 return p; 484 return p;
494 } 485 }
495 486
487 /* If no writer did a change during our lookup, we can return early. */
488 if (!create && !invalidated)
489 return NULL;
490
496 /* retry an exact lookup, taking the lock before. 491 /* retry an exact lookup, taking the lock before.
497 * At least, nodes should be hot in our cache. 492 * At least, nodes should be hot in our cache.
498 */ 493 */
499 spin_lock_bh(&base->lock); 494 write_seqlock_bh(&base->lock);
500 p = lookup(daddr, stack, base); 495 p = lookup(daddr, stack, base);
501 if (p != peer_avl_empty) { 496 if (p != peer_avl_empty) {
502 atomic_inc(&p->refcnt); 497 atomic_inc(&p->refcnt);
503 spin_unlock_bh(&base->lock); 498 write_sequnlock_bh(&base->lock);
504 /* Remove the entry from unused list if it was there. */ 499 /* Remove the entry from unused list if it was there. */
505 unlink_from_unused(p); 500 unlink_from_unused(p);
506 return p; 501 return p;
@@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
510 p->daddr = *daddr; 505 p->daddr = *daddr;
511 atomic_set(&p->refcnt, 1); 506 atomic_set(&p->refcnt, 1);
512 atomic_set(&p->rid, 0); 507 atomic_set(&p->rid, 0);
513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); 508 atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
514 p->tcp_ts_stamp = 0; 509 p->tcp_ts_stamp = 0;
510 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
511 p->rate_tokens = 0;
512 p->rate_last = 0;
513 p->pmtu_expires = 0;
514 p->pmtu_orig = 0;
515 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
515 INIT_LIST_HEAD(&p->unused); 516 INIT_LIST_HEAD(&p->unused);
516 517
517 518
@@ -519,7 +520,7 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
519 link_to_pool(p, base); 520 link_to_pool(p, base);
520 base->total++; 521 base->total++;
521 } 522 }
522 spin_unlock_bh(&base->lock); 523 write_sequnlock_bh(&base->lock);
523 524
524 if (base->total >= inet_peer_threshold) 525 if (base->total >= inet_peer_threshold)
525 /* Remove one less-recently-used entry. */ 526 /* Remove one less-recently-used entry. */
@@ -579,3 +580,44 @@ void inet_putpeer(struct inet_peer *p)
579 local_bh_enable(); 580 local_bh_enable();
580} 581}
581EXPORT_SYMBOL_GPL(inet_putpeer); 582EXPORT_SYMBOL_GPL(inet_putpeer);
583
584/*
585 * Check transmit rate limitation for given message.
586 * The rate information is held in the inet_peer entries now.
587 * This function is generic and could be used for other purposes
588 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
589 *
590 * Note that the same inet_peer fields are modified by functions in
591 * route.c too, but these work for packet destinations while xrlim_allow
592 * works for icmp destinations. This means the rate limiting information
593 * for one "ip object" is shared - and these ICMPs are twice limited:
594 * by source and by destination.
595 *
596 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
597 * SHOULD allow setting of rate limits
598 *
599 * Shared between ICMPv4 and ICMPv6.
600 */
601#define XRLIM_BURST_FACTOR 6
602bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
603{
604 unsigned long now, token;
605 bool rc = false;
606
607 if (!peer)
608 return true;
609
610 token = peer->rate_tokens;
611 now = jiffies;
612 token += now - peer->rate_last;
613 peer->rate_last = now;
614 if (token > XRLIM_BURST_FACTOR * timeout)
615 token = XRLIM_BURST_FACTOR * timeout;
616 if (token >= timeout) {
617 token -= timeout;
618 rc = true;
619 }
620 peer->rate_tokens = token;
621 return rc;
622}
623EXPORT_SYMBOL(inet_peer_xrlim_allow);
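The locking change in this file is the interesting part: the per-family base lock becomes a seqlock, so inet_getpeer() can attempt a pure RCU lookup first and use read_seqretry() to learn whether a concurrent writer invalidated the walk; only on interference, or when an entry must be created, does it fall back to the write side. A compact userspace model of that reader/retry protocol using C11 atomics (the kernel's seqlock_t also embeds the writer lock and lockdep annotations, and the memory ordering here is simplified for illustration):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_uint seq;        /* even: quiescent, odd: writer active */
    static int shared_value;

    static unsigned read_seqbegin(void)
    {
            unsigned s;

            /* Spin while a writer is mid-update (odd sequence). */
            while ((s = atomic_load_explicit(&seq, memory_order_acquire)) & 1)
                    ;
            return s;
    }

    static bool read_seqretry(unsigned start)
    {
            atomic_thread_fence(memory_order_acquire);
            return atomic_load_explicit(&seq, memory_order_relaxed) != start;
    }

    static void write_begin(void)
    {
            atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);
    }

    static void write_end(void)
    {
            atomic_fetch_add_explicit(&seq, 1, memory_order_release);
    }

    /* Reader pattern matching the new inet_getpeer(): the result is only
     * trusted when no writer interleaved, like the 'invalidated' flag in
     * the hunk above. */
    static int read_value(bool *invalidated)
    {
            unsigned s = read_seqbegin();
            int v = shared_value;

            *invalidated = read_seqretry(s);
            return v;
    }

    int main(void)
    {
            bool stale;

            write_begin();
            shared_value = 42;
            write_end();
            printf("read %d (stale: %d)\n", read_value(&stale), stale);
            return 0;
    }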
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d1d0e2c256fc..da5941f18c3c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -769,19 +769,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
769 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 769 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
770 } 770 }
771 771
772 { 772 rt = ip_route_output_gre(dev_net(dev), dst, tiph->saddr,
773 struct flowi fl = { 773 tunnel->parms.o_key, RT_TOS(tos),
774 .oif = tunnel->parms.link, 774 tunnel->parms.link);
775 .fl4_dst = dst, 775 if (IS_ERR(rt)) {
776 .fl4_src = tiph->saddr, 776 dev->stats.tx_carrier_errors++;
777 .fl4_tos = RT_TOS(tos), 777 goto tx_error;
778 .proto = IPPROTO_GRE,
779 .fl_gre_key = tunnel->parms.o_key
780 };
781 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
782 dev->stats.tx_carrier_errors++;
783 goto tx_error;
784 }
785 } 778 }
786 tdev = rt->dst.dev; 779 tdev = rt->dst.dev;
787 780
@@ -945,17 +938,13 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
945 /* Guess output device to choose reasonable mtu and needed_headroom */ 938 /* Guess output device to choose reasonable mtu and needed_headroom */
946 939
947 if (iph->daddr) { 940 if (iph->daddr) {
948 struct flowi fl = { 941 struct rtable *rt = ip_route_output_gre(dev_net(dev),
949 .oif = tunnel->parms.link, 942 iph->daddr, iph->saddr,
950 .fl4_dst = iph->daddr, 943 tunnel->parms.o_key,
951 .fl4_src = iph->saddr, 944 RT_TOS(iph->tos),
952 .fl4_tos = RT_TOS(iph->tos), 945 tunnel->parms.link);
953 .proto = IPPROTO_GRE, 946
954 .fl_gre_key = tunnel->parms.o_key 947 if (!IS_ERR(rt)) {
955 };
956 struct rtable *rt;
957
958 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
959 tdev = rt->dst.dev; 948 tdev = rt->dst.dev;
960 ip_rt_put(rt); 949 ip_rt_put(rt);
961 } 950 }
@@ -1207,17 +1196,14 @@ static int ipgre_open(struct net_device *dev)
1207 struct ip_tunnel *t = netdev_priv(dev); 1196 struct ip_tunnel *t = netdev_priv(dev);
1208 1197
1209 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1198 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1210 struct flowi fl = { 1199 struct rtable *rt = ip_route_output_gre(dev_net(dev),
1211 .oif = t->parms.link, 1200 t->parms.iph.daddr,
1212 .fl4_dst = t->parms.iph.daddr, 1201 t->parms.iph.saddr,
1213 .fl4_src = t->parms.iph.saddr, 1202 t->parms.o_key,
1214 .fl4_tos = RT_TOS(t->parms.iph.tos), 1203 RT_TOS(t->parms.iph.tos),
1215 .proto = IPPROTO_GRE, 1204 t->parms.link);
1216 .fl_gre_key = t->parms.o_key 1205
1217 }; 1206 if (IS_ERR(rt))
1218 struct rtable *rt;
1219
1220 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1221 return -EADDRNOTAVAIL; 1207 return -EADDRNOTAVAIL;
1222 dev = rt->dst.dev; 1208 dev = rt->dst.dev;
1223 ip_rt_put(rt); 1209 ip_rt_put(rt);
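The GRE conversion is the same pattern again, but through a dedicated ip_route_output_gre() helper because a GRE flow also carries the tunnel key; from the three converted call sites its parameters appear to be (net, daddr, saddr, gre key, tos, output ifindex). A one-call sketch of the shape shared by the xmit, bind_dev and open paths above (kernel-style, not standalone):

    rt = ip_route_output_gre(dev_net(dev),
                             tiph->daddr, tiph->saddr,
                             tunnel->parms.o_key,  /* GRE key rides in the flow */
                             RT_TOS(tiph->tos),
                             tunnel->parms.link);
    if (IS_ERR(rt))
            return -EADDRNOTAVAIL;                 /* or count a tx error */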
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..d7b2b0987a3b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 } 340 }
341 } 341 }
342 342
343#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_IP_ROUTE_CLASSID
344 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
346 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04c7b3ba6b39..67f241b97649 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -339,25 +339,19 @@ int ip_queue_xmit(struct sk_buff *skb)
339 if(opt && opt->srr) 339 if(opt && opt->srr)
340 daddr = opt->faddr; 340 daddr = opt->faddr;
341 341
342 { 342 /* If this fails, retransmit mechanism of transport layer will
343 struct flowi fl = { .oif = sk->sk_bound_dev_if, 343 * keep trying until route appears or the connection times
344 .mark = sk->sk_mark, 344 * itself out.
345 .fl4_dst = daddr, 345 */
346 .fl4_src = inet->inet_saddr, 346 rt = ip_route_output_ports(sock_net(sk), sk,
347 .fl4_tos = RT_CONN_FLAGS(sk), 347 daddr, inet->inet_saddr,
348 .proto = sk->sk_protocol, 348 inet->inet_dport,
349 .flags = inet_sk_flowi_flags(sk), 349 inet->inet_sport,
350 .fl_ip_sport = inet->inet_sport, 350 sk->sk_protocol,
351 .fl_ip_dport = inet->inet_dport }; 351 RT_CONN_FLAGS(sk),
352 352 sk->sk_bound_dev_if);
353 /* If this fails, retransmit mechanism of transport layer will 353 if (IS_ERR(rt))
354 * keep trying until route appears or the connection times 354 goto no_route;
355 * itself out.
356 */
357 security_sk_classify_flow(sk, &fl);
358 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
359 goto no_route;
360 }
361 sk_setup_caps(sk, &rt->dst); 355 sk_setup_caps(sk, &rt->dst);
362 } 356 }
363 skb_dst_set_noref(skb, &rt->dst); 357 skb_dst_set_noref(skb, &rt->dst);
@@ -733,6 +727,7 @@ csum_page(struct page *page, int offset, int copy)
733} 727}
734 728
735static inline int ip_ufo_append_data(struct sock *sk, 729static inline int ip_ufo_append_data(struct sock *sk,
730 struct sk_buff_head *queue,
736 int getfrag(void *from, char *to, int offset, int len, 731 int getfrag(void *from, char *to, int offset, int len,
737 int odd, struct sk_buff *skb), 732 int odd, struct sk_buff *skb),
738 void *from, int length, int hh_len, int fragheaderlen, 733 void *from, int length, int hh_len, int fragheaderlen,
@@ -745,7 +740,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
745 * device, so create one single skb packet containing complete 740 * device, so create one single skb packet containing complete
746 * udp datagram 741 * udp datagram
747 */ 742 */
748 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 743 if ((skb = skb_peek_tail(queue)) == NULL) {
749 skb = sock_alloc_send_skb(sk, 744 skb = sock_alloc_send_skb(sk,
750 hh_len + fragheaderlen + transhdrlen + 20, 745 hh_len + fragheaderlen + transhdrlen + 20,
751 (flags & MSG_DONTWAIT), &err); 746 (flags & MSG_DONTWAIT), &err);
@@ -767,40 +762,28 @@ static inline int ip_ufo_append_data(struct sock *sk,
767 762
768 skb->ip_summed = CHECKSUM_PARTIAL; 763 skb->ip_summed = CHECKSUM_PARTIAL;
769 skb->csum = 0; 764 skb->csum = 0;
770 sk->sk_sndmsg_off = 0;
771 765
772 /* specify the length of each IP datagram fragment */ 766 /* specify the length of each IP datagram fragment */
773 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 767 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
774 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 768 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
775 __skb_queue_tail(&sk->sk_write_queue, skb); 769 __skb_queue_tail(queue, skb);
776 } 770 }
777 771
778 return skb_append_datato_frags(sk, skb, getfrag, from, 772 return skb_append_datato_frags(sk, skb, getfrag, from,
779 (length - transhdrlen)); 773 (length - transhdrlen));
780} 774}
781 775
782/* 776static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
783 * ip_append_data() and ip_append_page() can make one large IP datagram 777 struct inet_cork *cork,
784 * from many pieces of data. Each piece will be held on the socket 778 int getfrag(void *from, char *to, int offset,
785 * until ip_push_pending_frames() is called. Each piece can be a page 779 int len, int odd, struct sk_buff *skb),
786 * or non-page data. 780 void *from, int length, int transhdrlen,
787 * 781 unsigned int flags)
788 * Not only UDP, other transport protocols - e.g. raw sockets - can use
789 * this interface potentially.
790 *
791 * LATER: length must be adjusted by pad at tail, when it is required.
792 */
793int ip_append_data(struct sock *sk,
794 int getfrag(void *from, char *to, int offset, int len,
795 int odd, struct sk_buff *skb),
796 void *from, int length, int transhdrlen,
797 struct ipcm_cookie *ipc, struct rtable **rtp,
798 unsigned int flags)
799{ 782{
800 struct inet_sock *inet = inet_sk(sk); 783 struct inet_sock *inet = inet_sk(sk);
801 struct sk_buff *skb; 784 struct sk_buff *skb;
802 785
803 struct ip_options *opt = NULL; 786 struct ip_options *opt = cork->opt;
804 int hh_len; 787 int hh_len;
805 int exthdrlen; 788 int exthdrlen;
806 int mtu; 789 int mtu;
@@ -809,58 +792,19 @@ int ip_append_data(struct sock *sk,
809 int offset = 0; 792 int offset = 0;
810 unsigned int maxfraglen, fragheaderlen; 793 unsigned int maxfraglen, fragheaderlen;
811 int csummode = CHECKSUM_NONE; 794 int csummode = CHECKSUM_NONE;
812 struct rtable *rt; 795 struct rtable *rt = (struct rtable *)cork->dst;
813 796
814 if (flags&MSG_PROBE) 797 exthdrlen = transhdrlen ? rt->dst.header_len : 0;
815 return 0; 798 length += exthdrlen;
816 799 transhdrlen += exthdrlen;
817 if (skb_queue_empty(&sk->sk_write_queue)) { 800 mtu = cork->fragsize;
818 /*
819 * setup for corking.
820 */
821 opt = ipc->opt;
822 if (opt) {
823 if (inet->cork.opt == NULL) {
824 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
825 if (unlikely(inet->cork.opt == NULL))
826 return -ENOBUFS;
827 }
828 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
829 inet->cork.flags |= IPCORK_OPT;
830 inet->cork.addr = ipc->addr;
831 }
832 rt = *rtp;
833 if (unlikely(!rt))
834 return -EFAULT;
835 /*
836 * We steal the reference to this route; the caller should not release it
837 */
838 *rtp = NULL;
839 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
840 rt->dst.dev->mtu :
841 dst_mtu(rt->dst.path);
842 inet->cork.dst = &rt->dst;
843 inet->cork.length = 0;
844 sk->sk_sndmsg_page = NULL;
845 sk->sk_sndmsg_off = 0;
846 exthdrlen = rt->dst.header_len;
847 length += exthdrlen;
848 transhdrlen += exthdrlen;
849 } else {
850 rt = (struct rtable *)inet->cork.dst;
851 if (inet->cork.flags & IPCORK_OPT)
852 opt = inet->cork.opt;
853 801
854 transhdrlen = 0;
855 exthdrlen = 0;
856 mtu = inet->cork.fragsize;
857 }
858 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 802 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
859 803
860 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 804 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
861 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 805 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
862 806
863 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 807 if (cork->length + length > 0xFFFF - fragheaderlen) {
864 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 808 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
865 mtu-exthdrlen); 809 mtu-exthdrlen);
866 return -EMSGSIZE; 810 return -EMSGSIZE;
@@ -876,15 +820,15 @@ int ip_append_data(struct sock *sk,
876 !exthdrlen) 820 !exthdrlen)
877 csummode = CHECKSUM_PARTIAL; 821 csummode = CHECKSUM_PARTIAL;
878 822
879 skb = skb_peek_tail(&sk->sk_write_queue); 823 skb = skb_peek_tail(queue);
880 824
881 inet->cork.length += length; 825 cork->length += length;
882 if (((length > mtu) || (skb && skb_is_gso(skb))) && 826 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
883 (sk->sk_protocol == IPPROTO_UDP) && 827 (sk->sk_protocol == IPPROTO_UDP) &&
884 (rt->dst.dev->features & NETIF_F_UFO)) { 828 (rt->dst.dev->features & NETIF_F_UFO)) {
885 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 829 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
886 fragheaderlen, transhdrlen, mtu, 830 hh_len, fragheaderlen, transhdrlen,
887 flags); 831 mtu, flags);
888 if (err) 832 if (err)
889 goto error; 833 goto error;
890 return 0; 834 return 0;
@@ -961,7 +905,7 @@ alloc_new_skb:
961 else 905 else
962 /* only the initial fragment is 906 /* only the initial fragment is
963 time stamped */ 907 time stamped */
964 ipc->tx_flags = 0; 908 cork->tx_flags = 0;
965 } 909 }
966 if (skb == NULL) 910 if (skb == NULL)
967 goto error; 911 goto error;
@@ -972,7 +916,7 @@ alloc_new_skb:
972 skb->ip_summed = csummode; 916 skb->ip_summed = csummode;
973 skb->csum = 0; 917 skb->csum = 0;
974 skb_reserve(skb, hh_len); 918 skb_reserve(skb, hh_len);
975 skb_shinfo(skb)->tx_flags = ipc->tx_flags; 919 skb_shinfo(skb)->tx_flags = cork->tx_flags;
976 920
977 /* 921 /*
978 * Find where to start putting bytes. 922 * Find where to start putting bytes.
@@ -1009,7 +953,7 @@ alloc_new_skb:
1009 /* 953 /*
1010 * Put the packet on the pending queue. 954 * Put the packet on the pending queue.
1011 */ 955 */
1012 __skb_queue_tail(&sk->sk_write_queue, skb); 956 __skb_queue_tail(queue, skb);
1013 continue; 957 continue;
1014 } 958 }
1015 959
@@ -1029,8 +973,8 @@ alloc_new_skb:
1029 } else { 973 } else {
1030 int i = skb_shinfo(skb)->nr_frags; 974 int i = skb_shinfo(skb)->nr_frags;
1031 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 975 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1032 struct page *page = sk->sk_sndmsg_page; 976 struct page *page = cork->page;
1033 int off = sk->sk_sndmsg_off; 977 int off = cork->off;
1034 unsigned int left; 978 unsigned int left;
1035 979
1036 if (page && (left = PAGE_SIZE - off) > 0) { 980 if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1042,7 +986,7 @@ alloc_new_skb:
1042 goto error; 986 goto error;
1043 } 987 }
1044 get_page(page); 988 get_page(page);
1045 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 989 skb_fill_page_desc(skb, i, page, off, 0);
1046 frag = &skb_shinfo(skb)->frags[i]; 990 frag = &skb_shinfo(skb)->frags[i];
1047 } 991 }
1048 } else if (i < MAX_SKB_FRAGS) { 992 } else if (i < MAX_SKB_FRAGS) {
@@ -1053,8 +997,8 @@ alloc_new_skb:
1053 err = -ENOMEM; 997 err = -ENOMEM;
1054 goto error; 998 goto error;
1055 } 999 }
1056 sk->sk_sndmsg_page = page; 1000 cork->page = page;
1057 sk->sk_sndmsg_off = 0; 1001 cork->off = 0;
1058 1002
1059 skb_fill_page_desc(skb, i, page, 0, 0); 1003 skb_fill_page_desc(skb, i, page, 0, 0);
1060 frag = &skb_shinfo(skb)->frags[i]; 1004 frag = &skb_shinfo(skb)->frags[i];
@@ -1066,7 +1010,7 @@ alloc_new_skb:
1066 err = -EFAULT; 1010 err = -EFAULT;
1067 goto error; 1011 goto error;
1068 } 1012 }
1069 sk->sk_sndmsg_off += copy; 1013 cork->off += copy;
1070 frag->size += copy; 1014 frag->size += copy;
1071 skb->len += copy; 1015 skb->len += copy;
1072 skb->data_len += copy; 1016 skb->data_len += copy;
@@ -1080,11 +1024,87 @@ alloc_new_skb:
1080 return 0; 1024 return 0;
1081 1025
1082error: 1026error:
1083 inet->cork.length -= length; 1027 cork->length -= length;
1084 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1028 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1085 return err; 1029 return err;
1086} 1030}
1087 1031
1032static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1033 struct ipcm_cookie *ipc, struct rtable **rtp)
1034{
1035 struct inet_sock *inet = inet_sk(sk);
1036 struct ip_options *opt;
1037 struct rtable *rt;
1038
1039 /*
1040 * setup for corking.
1041 */
1042 opt = ipc->opt;
1043 if (opt) {
1044 if (cork->opt == NULL) {
1045 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1046 sk->sk_allocation);
1047 if (unlikely(cork->opt == NULL))
1048 return -ENOBUFS;
1049 }
1050 memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1051 cork->flags |= IPCORK_OPT;
1052 cork->addr = ipc->addr;
1053 }
1054 rt = *rtp;
1055 if (unlikely(!rt))
1056 return -EFAULT;
1057 /*
1058 * We steal the reference to this route; the caller should not release it
1059 */
1060 *rtp = NULL;
1061 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1062 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1063 cork->dst = &rt->dst;
1064 cork->length = 0;
1065 cork->tx_flags = ipc->tx_flags;
1066 cork->page = NULL;
1067 cork->off = 0;
1068
1069 return 0;
1070}
1071
1072/*
1073 * ip_append_data() and ip_append_page() can make one large IP datagram
1074 * from many pieces of data. Each piece will be held on the socket
1075 * until ip_push_pending_frames() is called. Each piece can be a page
1076 * or non-page data.
1077 *
1078 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1079 * this interface potentially.
1080 *
1081 * LATER: length must be adjusted by pad at tail, when it is required.
1082 */
1083int ip_append_data(struct sock *sk,
1084 int getfrag(void *from, char *to, int offset, int len,
1085 int odd, struct sk_buff *skb),
1086 void *from, int length, int transhdrlen,
1087 struct ipcm_cookie *ipc, struct rtable **rtp,
1088 unsigned int flags)
1089{
1090 struct inet_sock *inet = inet_sk(sk);
1091 int err;
1092
1093 if (flags&MSG_PROBE)
1094 return 0;
1095
1096 if (skb_queue_empty(&sk->sk_write_queue)) {
1097 err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1098 if (err)
1099 return err;
1100 } else {
1101 transhdrlen = 0;
1102 }
1103
1104 return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1105 from, length, transhdrlen, flags);
1106}
1107
1088ssize_t ip_append_page(struct sock *sk, struct page *page, 1108ssize_t ip_append_page(struct sock *sk, struct page *page,
1089 int offset, size_t size, int flags) 1109 int offset, size_t size, int flags)
1090{ 1110{
@@ -1228,40 +1248,41 @@ error:
1228 return err; 1248 return err;
1229} 1249}
1230 1250
1231static void ip_cork_release(struct inet_sock *inet) 1251static void ip_cork_release(struct inet_cork *cork)
1232{ 1252{
1233 inet->cork.flags &= ~IPCORK_OPT; 1253 cork->flags &= ~IPCORK_OPT;
1234 kfree(inet->cork.opt); 1254 kfree(cork->opt);
1235 inet->cork.opt = NULL; 1255 cork->opt = NULL;
1236 dst_release(inet->cork.dst); 1256 dst_release(cork->dst);
1237 inet->cork.dst = NULL; 1257 cork->dst = NULL;
1238} 1258}
1239 1259
1240/* 1260/*
1241 * Combine all pending IP fragments on the socket as one IP datagram 1261 * Combine all pending IP fragments on the socket as one IP datagram
1242 * and push them out. 1262 * and push them out.
1243 */ 1263 */
1244int ip_push_pending_frames(struct sock *sk) 1264struct sk_buff *__ip_make_skb(struct sock *sk,
1265 struct sk_buff_head *queue,
1266 struct inet_cork *cork)
1245{ 1267{
1246 struct sk_buff *skb, *tmp_skb; 1268 struct sk_buff *skb, *tmp_skb;
1247 struct sk_buff **tail_skb; 1269 struct sk_buff **tail_skb;
1248 struct inet_sock *inet = inet_sk(sk); 1270 struct inet_sock *inet = inet_sk(sk);
1249 struct net *net = sock_net(sk); 1271 struct net *net = sock_net(sk);
1250 struct ip_options *opt = NULL; 1272 struct ip_options *opt = NULL;
1251 struct rtable *rt = (struct rtable *)inet->cork.dst; 1273 struct rtable *rt = (struct rtable *)cork->dst;
1252 struct iphdr *iph; 1274 struct iphdr *iph;
1253 __be16 df = 0; 1275 __be16 df = 0;
1254 __u8 ttl; 1276 __u8 ttl;
1255 int err = 0;
1256 1277
1257 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1278 if ((skb = __skb_dequeue(queue)) == NULL)
1258 goto out; 1279 goto out;
1259 tail_skb = &(skb_shinfo(skb)->frag_list); 1280 tail_skb = &(skb_shinfo(skb)->frag_list);
1260 1281
1261 /* move skb->data to ip header from ext header */ 1282 /* move skb->data to ip header from ext header */
1262 if (skb->data < skb_network_header(skb)) 1283 if (skb->data < skb_network_header(skb))
1263 __skb_pull(skb, skb_network_offset(skb)); 1284 __skb_pull(skb, skb_network_offset(skb));
1264 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1285 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1265 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1286 __skb_pull(tmp_skb, skb_network_header_len(skb));
1266 *tail_skb = tmp_skb; 1287 *tail_skb = tmp_skb;
1267 tail_skb = &(tmp_skb->next); 1288 tail_skb = &(tmp_skb->next);
@@ -1287,8 +1308,8 @@ int ip_push_pending_frames(struct sock *sk)
1287 ip_dont_fragment(sk, &rt->dst))) 1308 ip_dont_fragment(sk, &rt->dst)))
1288 df = htons(IP_DF); 1309 df = htons(IP_DF);
1289 1310
1290 if (inet->cork.flags & IPCORK_OPT) 1311 if (cork->flags & IPCORK_OPT)
1291 opt = inet->cork.opt; 1312 opt = cork->opt;
1292 1313
1293 if (rt->rt_type == RTN_MULTICAST) 1314 if (rt->rt_type == RTN_MULTICAST)
1294 ttl = inet->mc_ttl; 1315 ttl = inet->mc_ttl;
@@ -1300,7 +1321,7 @@ int ip_push_pending_frames(struct sock *sk)
1300 iph->ihl = 5; 1321 iph->ihl = 5;
1301 if (opt) { 1322 if (opt) {
1302 iph->ihl += opt->optlen>>2; 1323 iph->ihl += opt->optlen>>2;
1303 ip_options_build(skb, opt, inet->cork.addr, rt, 0); 1324 ip_options_build(skb, opt, cork->addr, rt, 0);
1304 } 1325 }
1305 iph->tos = inet->tos; 1326 iph->tos = inet->tos;
1306 iph->frag_off = df; 1327 iph->frag_off = df;
@@ -1316,44 +1337,95 @@ int ip_push_pending_frames(struct sock *sk)
1316 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1337 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1317 * on dst refcount 1338 * on dst refcount
1318 */ 1339 */
1319 inet->cork.dst = NULL; 1340 cork->dst = NULL;
1320 skb_dst_set(skb, &rt->dst); 1341 skb_dst_set(skb, &rt->dst);
1321 1342
1322 if (iph->protocol == IPPROTO_ICMP) 1343 if (iph->protocol == IPPROTO_ICMP)
1323 icmp_out_count(net, ((struct icmphdr *) 1344 icmp_out_count(net, ((struct icmphdr *)
1324 skb_transport_header(skb))->type); 1345 skb_transport_header(skb))->type);
1325 1346
1326 /* Netfilter gets the whole, not yet fragmented skb. */ 1347 ip_cork_release(cork);
1348out:
1349 return skb;
1350}
1351
1352int ip_send_skb(struct sk_buff *skb)
1353{
1354 struct net *net = sock_net(skb->sk);
1355 int err;
1356
1327 err = ip_local_out(skb); 1357 err = ip_local_out(skb);
1328 if (err) { 1358 if (err) {
1329 if (err > 0) 1359 if (err > 0)
1330 err = net_xmit_errno(err); 1360 err = net_xmit_errno(err);
1331 if (err) 1361 if (err)
1332 goto error; 1362 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1333 } 1363 }
1334 1364
1335out:
1336 ip_cork_release(inet);
1337 return err; 1365 return err;
1366}
1338 1367
1339error: 1368int ip_push_pending_frames(struct sock *sk)
1340 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1369{
1341 goto out; 1370 struct sk_buff *skb;
1371
1372 skb = ip_finish_skb(sk);
1373 if (!skb)
1374 return 0;
1375
1376	/* Netfilter gets the whole, unfragmented skb. */
1377 return ip_send_skb(skb);
1342} 1378}
1343 1379
1344/* 1380/*
1345 * Throw away all pending data on the socket. 1381 * Throw away all pending data on the socket.
1346 */ 1382 */
1347void ip_flush_pending_frames(struct sock *sk) 1383static void __ip_flush_pending_frames(struct sock *sk,
1384 struct sk_buff_head *queue,
1385 struct inet_cork *cork)
1348{ 1386{
1349 struct sk_buff *skb; 1387 struct sk_buff *skb;
1350 1388
1351 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1389 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1352 kfree_skb(skb); 1390 kfree_skb(skb);
1353 1391
1354 ip_cork_release(inet_sk(sk)); 1392 ip_cork_release(cork);
1393}
1394
1395void ip_flush_pending_frames(struct sock *sk)
1396{
1397 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1355} 1398}
1356 1399
1400struct sk_buff *ip_make_skb(struct sock *sk,
1401 int getfrag(void *from, char *to, int offset,
1402 int len, int odd, struct sk_buff *skb),
1403 void *from, int length, int transhdrlen,
1404 struct ipcm_cookie *ipc, struct rtable **rtp,
1405 unsigned int flags)
1406{
1407 struct inet_cork cork = {};
1408 struct sk_buff_head queue;
1409 int err;
1410
1411 if (flags & MSG_PROBE)
1412 return NULL;
1413
1414 __skb_queue_head_init(&queue);
1415
1416 err = ip_setup_cork(sk, &cork, ipc, rtp);
1417 if (err)
1418 return ERR_PTR(err);
1419
1420 err = __ip_append_data(sk, &queue, &cork, getfrag,
1421 from, length, transhdrlen, flags);
1422 if (err) {
1423 __ip_flush_pending_frames(sk, &queue, &cork);
1424 return ERR_PTR(err);
1425 }
1426
1427 return __ip_make_skb(sk, &queue, &cork);
1428}
1357 1429
1358/* 1430/*
1359 * Fetch data from kernel space and fill in checksum if needed. 1431 * Fetch data from kernel space and fill in checksum if needed.
@@ -1402,16 +1474,19 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1402 } 1474 }
1403 1475
1404 { 1476 {
1405 struct flowi fl = { .oif = arg->bound_dev_if, 1477 struct flowi4 fl4 = {
1406 .fl4_dst = daddr, 1478 .flowi4_oif = arg->bound_dev_if,
1407 .fl4_src = rt->rt_spec_dst, 1479 .daddr = daddr,
1408 .fl4_tos = RT_TOS(ip_hdr(skb)->tos), 1480 .saddr = rt->rt_spec_dst,
1409 .fl_ip_sport = tcp_hdr(skb)->dest, 1481 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
1410 .fl_ip_dport = tcp_hdr(skb)->source, 1482 .fl4_sport = tcp_hdr(skb)->dest,
1411 .proto = sk->sk_protocol, 1483 .fl4_dport = tcp_hdr(skb)->source,
1412 .flags = ip_reply_arg_flowi_flags(arg) }; 1484 .flowi4_proto = sk->sk_protocol,
1413 security_skb_classify_flow(skb, &fl); 1485 .flowi4_flags = ip_reply_arg_flowi_flags(arg),
1414 if (ip_route_output_key(sock_net(sk), &rt, &fl)) 1486 };
1487 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1488 rt = ip_route_output_key(sock_net(sk), &fl4);
1489 if (IS_ERR(rt))
1415 return; 1490 return;
1416 } 1491 }
1417 1492
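The ip_output.c changes above split the old ip_push_pending_frames() into __ip_make_skb() and ip_send_skb(), and add ip_make_skb(), which corks data onto a private queue with an on-stack struct inet_cork instead of sk->sk_write_queue. Failures now come back encoded in the returned pointer via ERR_PTR(), the same convention the routing lookups elsewhere in this patch adopt. A minimal user-space sketch of that error-pointer idiom (the helpers mirror the kernel's ERR_PTR/IS_ERR/PTR_ERR; -22 stands in for -EINVAL):

#include <stdio.h>

/* Sketch of the kernel's error-pointer encoding: small negative errno
 * values map into the top page of the address space, so one pointer
 * return can carry either a valid object or an error code. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
        return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
        return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* A lookup in the style of the converted routing calls: the result
 * pointer doubles as the error channel. */
static int *find_slot(int key)
{
        static int table[4] = { 10, 20, 30, 40 };

        if (key < 0 || key >= 4)
                return ERR_PTR(-22);            /* -EINVAL */
        return &table[key];
}

int main(void)
{
        int *slot = find_slot(7);

        if (IS_ERR(slot))
                printf("lookup failed: %ld\n", PTR_ERR(slot));
        else
                printf("value: %d\n", *slot);
        return 0;
}

This is also why nf_ip_route() in the net/ipv4/netfilter.c hunk further down cannot simply forward a return value: it unpacks the pointer back into the int-plus-output-parameter convention that struct nf_afinfo still expects.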
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index a5f58e7cbb26..bfc17c5914e7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -460,19 +460,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
460 goto tx_error_icmp; 460 goto tx_error_icmp;
461 } 461 }
462 462
463 { 463 rt = ip_route_output_ports(dev_net(dev), NULL,
464 struct flowi fl = { 464 dst, tiph->saddr,
465 .oif = tunnel->parms.link, 465 0, 0,
466 .fl4_dst = dst, 466 IPPROTO_IPIP, RT_TOS(tos),
467 .fl4_src= tiph->saddr, 467 tunnel->parms.link);
468 .fl4_tos = RT_TOS(tos), 468 if (IS_ERR(rt)) {
469 .proto = IPPROTO_IPIP 469 dev->stats.tx_carrier_errors++;
470 }; 470 goto tx_error_icmp;
471
472 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
473 dev->stats.tx_carrier_errors++;
474 goto tx_error_icmp;
475 }
476 } 471 }
477 tdev = rt->dst.dev; 472 tdev = rt->dst.dev;
478 473
@@ -583,16 +578,14 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
583 iph = &tunnel->parms.iph; 578 iph = &tunnel->parms.iph;
584 579
585 if (iph->daddr) { 580 if (iph->daddr) {
586 struct flowi fl = { 581 struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL,
587 .oif = tunnel->parms.link, 582 iph->daddr, iph->saddr,
588 .fl4_dst = iph->daddr, 583 0, 0,
589 .fl4_src = iph->saddr, 584 IPPROTO_IPIP,
590 .fl4_tos = RT_TOS(iph->tos), 585 RT_TOS(iph->tos),
591 .proto = IPPROTO_IPIP 586 tunnel->parms.link);
592 }; 587
593 struct rtable *rt; 588 if (!IS_ERR(rt)) {
594
595 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
596 tdev = rt->dst.dev; 589 tdev = rt->dst.dev;
597 ip_rt_put(rt); 590 ip_rt_put(rt);
598 } 591 }
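Both ipip.c hunks above drop an open-coded struct flowi in favour of the ip_route_output_ports() helper. A plausible reconstruction of such a helper — assuming it does nothing more than bundle the parameters into a struct flowi4 and defer to ip_route_output_flow(), the call raw.c uses later in this patch — is sketched below; treat the exact signature as an approximation, not the header's definition:

/* Hypothetical reconstruction: gather the common output-route
 * parameters into a flowi4 and do a single lookup. */
static inline struct rtable *ip_route_output_ports(struct net *net,
                                                   struct sock *sk,
                                                   __be32 daddr, __be32 saddr,
                                                   __be16 dport, __be16 sport,
                                                   __u8 proto, __u8 tos, int oif)
{
        struct flowi4 fl4 = {
                .flowi4_oif   = oif,
                .flowi4_tos   = tos,
                .flowi4_proto = proto,
                .daddr        = daddr,
                .saddr        = saddr,
                .fl4_dport    = dport,
                .fl4_sport    = sport,
        };

        return ip_route_output_flow(net, &fl4, sk);
}

The payoff is visible in the hunks above: nine lines of structure setup collapse into one call, and an IS_ERR() check replaces the old int return.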
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8b65a12654e7..1f62eaeb6de4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -148,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
148 return NULL; 148 return NULL;
149} 149}
150 150
151static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 151static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
152 struct mr_table **mrt) 152 struct mr_table **mrt)
153{ 153{
154 struct ipmr_result res; 154 struct ipmr_result res;
155 struct fib_lookup_arg arg = { .result = &res, }; 155 struct fib_lookup_arg arg = { .result = &res, };
156 int err; 156 int err;
157 157
158 err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); 158 err = fib_rules_lookup(net->ipv4.mr_rules_ops,
159 flowi4_to_flowi(flp4), 0, &arg);
159 if (err < 0) 160 if (err < 0)
160 return err; 161 return err;
161 *mrt = res.mrt; 162 *mrt = res.mrt;
@@ -283,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
283 return net->ipv4.mrt; 284 return net->ipv4.mrt;
284} 285}
285 286
286static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 287static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
287 struct mr_table **mrt) 288 struct mr_table **mrt)
288{ 289{
289 *mrt = net->ipv4.mrt; 290 *mrt = net->ipv4.mrt;
@@ -435,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
435{ 436{
436 struct net *net = dev_net(dev); 437 struct net *net = dev_net(dev);
437 struct mr_table *mrt; 438 struct mr_table *mrt;
438 struct flowi fl = { 439 struct flowi4 fl4 = {
439 .oif = dev->ifindex, 440 .flowi4_oif = dev->ifindex,
440 .iif = skb->skb_iif, 441 .flowi4_iif = skb->skb_iif,
441 .mark = skb->mark, 442 .flowi4_mark = skb->mark,
442 }; 443 };
443 int err; 444 int err;
444 445
445 err = ipmr_fib_lookup(net, &fl, &mrt); 446 err = ipmr_fib_lookup(net, &fl4, &mrt);
446 if (err < 0) { 447 if (err < 0) {
447 kfree_skb(skb); 448 kfree_skb(skb);
448 return err; 449 return err;
@@ -1611,26 +1612,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1611#endif 1612#endif
1612 1613
1613 if (vif->flags & VIFF_TUNNEL) { 1614 if (vif->flags & VIFF_TUNNEL) {
1614 struct flowi fl = { 1615 rt = ip_route_output_ports(net, NULL,
1615 .oif = vif->link, 1616 vif->remote, vif->local,
1616 .fl4_dst = vif->remote, 1617 0, 0,
1617 .fl4_src = vif->local, 1618 IPPROTO_IPIP,
1618 .fl4_tos = RT_TOS(iph->tos), 1619 RT_TOS(iph->tos), vif->link);
1619 .proto = IPPROTO_IPIP 1620 if (IS_ERR(rt))
1620 };
1621
1622 if (ip_route_output_key(net, &rt, &fl))
1623 goto out_free; 1621 goto out_free;
1624 encap = sizeof(struct iphdr); 1622 encap = sizeof(struct iphdr);
1625 } else { 1623 } else {
1626 struct flowi fl = { 1624 rt = ip_route_output_ports(net, NULL, iph->daddr, 0,
1627 .oif = vif->link, 1625 0, 0,
1628 .fl4_dst = iph->daddr, 1626 IPPROTO_IPIP,
1629 .fl4_tos = RT_TOS(iph->tos), 1627 RT_TOS(iph->tos), vif->link);
1630 .proto = IPPROTO_IPIP 1628 if (IS_ERR(rt))
1631 };
1632
1633 if (ip_route_output_key(net, &rt, &fl))
1634 goto out_free; 1629 goto out_free;
1635 } 1630 }
1636 1631
@@ -1793,6 +1788,24 @@ dont_forward:
1793 return 0; 1788 return 0;
1794} 1789}
1795 1790
1791static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct rtable *rt)
1792{
1793 struct flowi4 fl4 = {
1794 .daddr = rt->rt_key_dst,
1795 .saddr = rt->rt_key_src,
1796 .flowi4_tos = rt->rt_tos,
1797 .flowi4_oif = rt->rt_oif,
1798 .flowi4_iif = rt->rt_iif,
1799 .flowi4_mark = rt->rt_mark,
1800 };
1801 struct mr_table *mrt;
1802 int err;
1803
1804 err = ipmr_fib_lookup(net, &fl4, &mrt);
1805 if (err)
1806 return ERR_PTR(err);
1807 return mrt;
1808}
1796 1809
1797/* 1810/*
1798 * Multicast packets for forwarding arrive here 1811 * Multicast packets for forwarding arrive here
@@ -1805,7 +1818,6 @@ int ip_mr_input(struct sk_buff *skb)
1805 struct net *net = dev_net(skb->dev); 1818 struct net *net = dev_net(skb->dev);
1806 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 1819 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1807 struct mr_table *mrt; 1820 struct mr_table *mrt;
1808 int err;
1809 1821
1810 /* Packet is looped back after forward, it should not be 1822 /* Packet is looped back after forward, it should not be
1811 * forwarded second time, but still can be delivered locally. 1823 * forwarded second time, but still can be delivered locally.
@@ -1813,12 +1825,11 @@ int ip_mr_input(struct sk_buff *skb)
1813 if (IPCB(skb)->flags & IPSKB_FORWARDED) 1825 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1814 goto dont_forward; 1826 goto dont_forward;
1815 1827
1816 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1828 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1817 if (err < 0) { 1829 if (IS_ERR(mrt)) {
1818 kfree_skb(skb); 1830 kfree_skb(skb);
1819 return err; 1831 return PTR_ERR(mrt);
1820 } 1832 }
1821
1822 if (!local) { 1833 if (!local) {
1823 if (IPCB(skb)->opt.router_alert) { 1834 if (IPCB(skb)->opt.router_alert) {
1824 if (ip_call_ra_chain(skb)) 1835 if (ip_call_ra_chain(skb))
@@ -1946,9 +1957,9 @@ int pim_rcv_v1(struct sk_buff *skb)
1946 1957
1947 pim = igmp_hdr(skb); 1958 pim = igmp_hdr(skb);
1948 1959
1949 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1960 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1961 if (IS_ERR(mrt))
1950 goto drop; 1962 goto drop;
1951
1952 if (!mrt->mroute_do_pim || 1963 if (!mrt->mroute_do_pim ||
1953 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1964 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1954 goto drop; 1965 goto drop;
@@ -1978,9 +1989,9 @@ static int pim_rcv(struct sk_buff *skb)
1978 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1989 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1979 goto drop; 1990 goto drop;
1980 1991
1981 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1992 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1993 if (IS_ERR(mrt))
1982 goto drop; 1994 goto drop;
1983
1984 if (__pim_rcv(mrt, skb, sizeof(*pim))) { 1995 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1985drop: 1996drop:
1986 kfree_skb(skb); 1997 kfree_skb(skb);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 994a1f29ebbc..f3c0b549b8e1 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16 struct net *net = dev_net(skb_dst(skb)->dev); 16 struct net *net = dev_net(skb_dst(skb)->dev);
17 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
18 struct rtable *rt; 18 struct rtable *rt;
19 struct flowi fl = {}; 19 struct flowi4 fl4 = {};
20 unsigned long orefdst; 20 unsigned long orefdst;
21 unsigned int hh_len; 21 unsigned int hh_len;
22 unsigned int type; 22 unsigned int type;
@@ -31,14 +31,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
32 */ 32 */
33 if (addr_type == RTN_LOCAL) { 33 if (addr_type == RTN_LOCAL) {
34 fl.fl4_dst = iph->daddr; 34 fl4.daddr = iph->daddr;
35 if (type == RTN_LOCAL) 35 if (type == RTN_LOCAL)
36 fl.fl4_src = iph->saddr; 36 fl4.saddr = iph->saddr;
37 fl.fl4_tos = RT_TOS(iph->tos); 37 fl4.flowi4_tos = RT_TOS(iph->tos);
38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 38 fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
39 fl.mark = skb->mark; 39 fl4.flowi4_mark = skb->mark;
40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 40 fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
41 if (ip_route_output_key(net, &rt, &fl) != 0) 41 rt = ip_route_output_key(net, &fl4);
42 if (IS_ERR(rt))
42 return -1; 43 return -1;
43 44
44 /* Drop old route. */ 45 /* Drop old route. */
@@ -47,8 +48,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
47 } else { 48 } else {
48 /* non-local src, find valid iif to satisfy 49 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */ 50 * rp-filter when calling ip_route_input. */
50 fl.fl4_dst = iph->saddr; 51 fl4.daddr = iph->saddr;
51 if (ip_route_output_key(net, &rt, &fl) != 0) 52 rt = ip_route_output_key(net, &fl4);
53 if (IS_ERR(rt))
52 return -1; 54 return -1;
53 55
54 orefdst = skb->_skb_refdst; 56 orefdst = skb->_skb_refdst;
@@ -66,10 +68,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
66 68
67#ifdef CONFIG_XFRM 69#ifdef CONFIG_XFRM
68 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 70 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
69 xfrm_decode_session(skb, &fl, AF_INET) == 0) { 71 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
70 struct dst_entry *dst = skb_dst(skb); 72 struct dst_entry *dst = skb_dst(skb);
71 skb_dst_set(skb, NULL); 73 skb_dst_set(skb, NULL);
72 if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) 74 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
75 if (IS_ERR(dst))
73 return -1; 76 return -1;
74 skb_dst_set(skb, dst); 77 skb_dst_set(skb, dst);
75 } 78 }
@@ -102,7 +105,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
102 dst = ((struct xfrm_dst *)dst)->route; 105 dst = ((struct xfrm_dst *)dst)->route;
103 dst_hold(dst); 106 dst_hold(dst);
104 107
105 if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) 108 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
109 if (IS_ERR(dst))
106 return -1; 110 return -1;
107 111
108 skb_dst_drop(skb); 112 skb_dst_drop(skb);
@@ -219,7 +223,11 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
219 223
220static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) 224static int nf_ip_route(struct dst_entry **dst, struct flowi *fl)
221{ 225{
222 return ip_route_output_key(&init_net, (struct rtable **)dst, fl); 226 struct rtable *rt = ip_route_output_key(&init_net, &fl->u.ip4);
227 if (IS_ERR(rt))
228 return PTR_ERR(rt);
229 *dst = &rt->dst;
230 return 0;
223} 231}
224 232
225static const struct nf_afinfo nf_ip_afinfo = { 233static const struct nf_afinfo nf_ip_afinfo = {
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5f..1dfc18a03fd4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -64,16 +64,6 @@ config IP_NF_IPTABLES
64if IP_NF_IPTABLES 64if IP_NF_IPTABLES
65 65
66# The matches. 66# The matches.
67config IP_NF_MATCH_ADDRTYPE
68 tristate '"addrtype" address type match support'
69 depends on NETFILTER_ADVANCED
70 help
71 This option allows you to match what routing thinks of an address,
72 eg. UNICAST, LOCAL, BROADCAST, ...
73
74 If you want to compile it as a module, say M here and read
75 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
76
77config IP_NF_MATCH_AH 67config IP_NF_MATCH_AH
78 tristate '"ah" match support' 68 tristate '"ah" match support'
79 depends on NETFILTER_ADVANCED 69 depends on NETFILTER_ADVANCED
@@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT
206 196
207config NF_NAT_SNMP_BASIC 197config NF_NAT_SNMP_BASIC
208 tristate "Basic SNMP-ALG support" 198 tristate "Basic SNMP-ALG support"
209 depends on NF_NAT 199 depends on NF_CONNTRACK_SNMP && NF_NAT
210 depends on NETFILTER_ADVANCED 200 depends on NETFILTER_ADVANCED
201 default NF_NAT && NF_CONNTRACK_SNMP
211 ---help--- 202 ---help---
212 203
213 This module implements an Application Layer Gateway (ALG) for 204 This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 19eb59d01037..dca2082ec683 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 49
50# matches 50# matches
51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
54 53
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed95..4b5d457c2d76 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
867 newinfo->initial_entries = 0; 867 newinfo->initial_entries = 0;
868 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 868 loc_cpu_entry = info->entries[raw_smp_processor_id()];
869 xt_compat_init_offsets(NFPROTO_ARP, info->number);
869 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 870 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
870 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 871 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
871 if (ret != 0) 872 if (ret != 0)
@@ -1065,6 +1066,7 @@ static int do_replace(struct net *net, const void __user *user,
1065 /* overflow check */ 1066 /* overflow check */
1066 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1067 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1067 return -ENOMEM; 1068 return -ENOMEM;
1069 tmp.name[sizeof(tmp.name)-1] = 0;
1068 1070
1069 newinfo = xt_alloc_table_info(tmp.size); 1071 newinfo = xt_alloc_table_info(tmp.size);
1070 if (!newinfo) 1072 if (!newinfo)
@@ -1333,6 +1335,7 @@ static int translate_compat_table(const char *name,
1333 duprintf("translate_compat_table: size %u\n", info->size); 1335 duprintf("translate_compat_table: size %u\n", info->size);
1334 j = 0; 1336 j = 0;
1335 xt_compat_lock(NFPROTO_ARP); 1337 xt_compat_lock(NFPROTO_ARP);
1338 xt_compat_init_offsets(NFPROTO_ARP, number);
1336 /* Walk through entries, checking offsets. */ 1339 /* Walk through entries, checking offsets. */
1337 xt_entry_foreach(iter0, entry0, total_size) { 1340 xt_entry_foreach(iter0, entry0, total_size) {
1338 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1341 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1486,6 +1489,7 @@ static int compat_do_replace(struct net *net, void __user *user,
1486 return -ENOMEM; 1489 return -ENOMEM;
1487 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1490 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1488 return -ENOMEM; 1491 return -ENOMEM;
1492 tmp.name[sizeof(tmp.name)-1] = 0;
1489 1493
1490 newinfo = xt_alloc_table_info(tmp.size); 1494 newinfo = xt_alloc_table_info(tmp.size);
1491 if (!newinfo) 1495 if (!newinfo)
@@ -1738,6 +1742,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1738 ret = -EFAULT; 1742 ret = -EFAULT;
1739 break; 1743 break;
1740 } 1744 }
1745 rev.name[sizeof(rev.name)-1] = 0;
1741 1746
1742 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, 1747 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
1743 rev.revision, 1, &ret), 1748 rev.revision, 1, &ret),
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013dc..b09ed0d080f9 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1064 newinfo->initial_entries = 0; 1064 newinfo->initial_entries = 0;
1065 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1065 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1066 xt_compat_init_offsets(AF_INET, info->number);
1066 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1067 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1067 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1068 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1068 if (ret != 0) 1069 if (ret != 0)
@@ -1261,6 +1262,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
1261 /* overflow check */ 1262 /* overflow check */
1262 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1263 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1263 return -ENOMEM; 1264 return -ENOMEM;
1265 tmp.name[sizeof(tmp.name)-1] = 0;
1264 1266
1265 newinfo = xt_alloc_table_info(tmp.size); 1267 newinfo = xt_alloc_table_info(tmp.size);
1266 if (!newinfo) 1268 if (!newinfo)
@@ -1664,6 +1666,7 @@ translate_compat_table(struct net *net,
1664 duprintf("translate_compat_table: size %u\n", info->size); 1666 duprintf("translate_compat_table: size %u\n", info->size);
1665 j = 0; 1667 j = 0;
1666 xt_compat_lock(AF_INET); 1668 xt_compat_lock(AF_INET);
1669 xt_compat_init_offsets(AF_INET, number);
1667 /* Walk through entries, checking offsets. */ 1670 /* Walk through entries, checking offsets. */
1668 xt_entry_foreach(iter0, entry0, total_size) { 1671 xt_entry_foreach(iter0, entry0, total_size) {
1669 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1672 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1805,6 +1808,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1805 return -ENOMEM; 1808 return -ENOMEM;
1806 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1809 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1807 return -ENOMEM; 1810 return -ENOMEM;
1811 tmp.name[sizeof(tmp.name)-1] = 0;
1808 1812
1809 newinfo = xt_alloc_table_info(tmp.size); 1813 newinfo = xt_alloc_table_info(tmp.size);
1810 if (!newinfo) 1814 if (!newinfo)
@@ -2034,6 +2038,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2034 ret = -EFAULT; 2038 ret = -EFAULT;
2035 break; 2039 break;
2036 } 2040 }
2041 rev.name[sizeof(rev.name)-1] = 0;
2037 2042
2038 if (cmd == IPT_SO_GET_REVISION_TARGET) 2043 if (cmd == IPT_SO_GET_REVISION_TARGET)
2039 target = 1; 2044 target = 1;
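The recurring one-liner tmp.name[sizeof(tmp.name)-1] = 0; (and its rev.name twin) in the arp_tables.c and ip_tables.c hunks above forces NUL termination on fixed-size names copied from user space before they are used as C strings. A self-contained sketch of the bug class being closed; the structure and helper below are illustrative stand-ins, not the xtables definitions:

#include <stdio.h>
#include <string.h>

struct replace_req {
        char name[8];   /* fixed size, copied verbatim from user space */
};

static void lookup_table(const struct replace_req *req)
{
        /* strlen() and printf("%s") walk until a NUL byte; without the
         * forced terminator they would read past the end of name[]. */
        printf("looking up \"%s\" (len %zu)\n", req->name, strlen(req->name));
}

int main(void)
{
        struct replace_req tmp;

        /* Simulate copy_from_user() of an unterminated 8-byte name. */
        memcpy(tmp.name, "AAAAAAAA", sizeof(tmp.name));

        tmp.name[sizeof(tmp.name) - 1] = 0;     /* the fix from the patch */
        lookup_table(&tmp);
        return 0;
}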
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a4897655..403ca57f6011 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
300 * that the ->target() function isn't called after ->destroy() */ 300 * that the ->target() function isn't called after ->destroy() */
301 301
302 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
303 if (ct == NULL) { 303 if (ct == NULL)
304 pr_info("no conntrack!\n");
305 /* FIXME: need to drop invalid ones, since replies
306 * to outgoing connections of other nodes will be
307 * marked as INVALID */
308 return NF_DROP; 304 return NF_DROP;
309 }
310 305
311 /* special case: ICMP error handling. conntrack distinguishes between 306 /* special case: ICMP error handling. conntrack distinguishes between
312 * error messages (RELATED) and information requests (see below) */ 307 * error messages (RELATED) and information requests (see below) */
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e9..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
442 } 442 }
443#endif 443#endif
444 444
445 /* MAC logging for input path only. */ 445 if (in != NULL)
446 if (in && !out)
447 dump_mac_header(m, loginfo, skb); 446 dump_mac_header(m, loginfo, skb);
448 447
449 dump_packet(m, loginfo, skb, 0); 448 dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
deleted file mode 100644
index db8bff0fb86d..000000000000
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * iptables module to match inet_addr_type() of an ip.
3 *
4 * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
5 * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/netdevice.h>
16#include <linux/ip.h>
17#include <net/route.h>
18
19#include <linux/netfilter_ipv4/ipt_addrtype.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24MODULE_DESCRIPTION("Xtables: address type match for IPv4");
25
26static inline bool match_type(struct net *net, const struct net_device *dev,
27 __be32 addr, u_int16_t mask)
28{
29 return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
30}
31
32static bool
33addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
34{
35 struct net *net = dev_net(par->in ? par->in : par->out);
36 const struct ipt_addrtype_info *info = par->matchinfo;
37 const struct iphdr *iph = ip_hdr(skb);
38 bool ret = true;
39
40 if (info->source)
41 ret &= match_type(net, NULL, iph->saddr, info->source) ^
42 info->invert_source;
43 if (info->dest)
44 ret &= match_type(net, NULL, iph->daddr, info->dest) ^
45 info->invert_dest;
46
47 return ret;
48}
49
50static bool
51addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
52{
53 struct net *net = dev_net(par->in ? par->in : par->out);
54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
55 const struct iphdr *iph = ip_hdr(skb);
56 const struct net_device *dev = NULL;
57 bool ret = true;
58
59 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
60 dev = par->in;
61 else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
62 dev = par->out;
63
64 if (info->source)
65 ret &= match_type(net, dev, iph->saddr, info->source) ^
66 (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
67 if (ret && info->dest)
68 ret &= match_type(net, dev, iph->daddr, info->dest) ^
69 !!(info->flags & IPT_ADDRTYPE_INVERT_DEST);
70 return ret;
71}
72
73static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
74{
75 struct ipt_addrtype_info_v1 *info = par->matchinfo;
76
77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
79 pr_info("both incoming and outgoing "
80 "interface limitation cannot be selected\n");
81 return -EINVAL;
82 }
83
84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
85 (1 << NF_INET_LOCAL_IN)) &&
86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
87 pr_info("output interface limitation "
88 "not valid in PREROUTING and INPUT\n");
89 return -EINVAL;
90 }
91
92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
93 (1 << NF_INET_LOCAL_OUT)) &&
94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
95 pr_info("input interface limitation "
96 "not valid in POSTROUTING and OUTPUT\n");
97 return -EINVAL;
98 }
99
100 return 0;
101}
102
103static struct xt_match addrtype_mt_reg[] __read_mostly = {
104 {
105 .name = "addrtype",
106 .family = NFPROTO_IPV4,
107 .match = addrtype_mt_v0,
108 .matchsize = sizeof(struct ipt_addrtype_info),
109 .me = THIS_MODULE
110 },
111 {
112 .name = "addrtype",
113 .family = NFPROTO_IPV4,
114 .revision = 1,
115 .match = addrtype_mt_v1,
116 .checkentry = addrtype_mt_checkentry_v1,
117 .matchsize = sizeof(struct ipt_addrtype_info_v1),
118 .me = THIS_MODULE
119 }
120};
121
122static int __init addrtype_mt_init(void)
123{
124 return xt_register_matches(addrtype_mt_reg,
125 ARRAY_SIZE(addrtype_mt_reg));
126}
127
128static void __exit addrtype_mt_exit(void)
129{
130 xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
131}
132
133module_init(addrtype_mt_init);
134module_exit(addrtype_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
61 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
62 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN) {
64 iph = ip_hdr(skb); 64 iph = ip_hdr(skb);
65 65
66 if (iph->saddr != saddr || 66 if (iph->saddr != saddr ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26a..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
20#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
21#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
22#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h>
23 24
24struct ct_iter_state { 25struct ct_iter_state {
25 struct seq_net_private p; 26 struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
35 for (st->bucket = 0; 36 for (st->bucket = 0;
36 st->bucket < net->ct.htable_size; 37 st->bucket < net->ct.htable_size;
37 st->bucket++) { 38 st->bucket++) {
38 n = rcu_dereference(net->ct.hash[st->bucket].first); 39 n = rcu_dereference(
40 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
39 if (!is_a_nulls(n)) 41 if (!is_a_nulls(n))
40 return n; 42 return n;
41 } 43 }
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
48 struct net *net = seq_file_net(seq); 50 struct net *net = seq_file_net(seq);
49 struct ct_iter_state *st = seq->private; 51 struct ct_iter_state *st = seq->private;
50 52
51 head = rcu_dereference(head->next); 53 head = rcu_dereference(hlist_nulls_next_rcu(head));
52 while (is_a_nulls(head)) { 54 while (is_a_nulls(head)) {
53 if (likely(get_nulls_value(head) == st->bucket)) { 55 if (likely(get_nulls_value(head) == st->bucket)) {
54 if (++st->bucket >= net->ct.htable_size) 56 if (++st->bucket >= net->ct.htable_size)
55 return NULL; 57 return NULL;
56 } 58 }
57 head = rcu_dereference(net->ct.hash[st->bucket].first); 59 head = rcu_dereference(
60 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
58 } 61 }
59 return head; 62 return head;
60} 63}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
217 struct hlist_node *n; 220 struct hlist_node *n;
218 221
219 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 222 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
220 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 223 n = rcu_dereference(
224 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
221 if (n) 225 if (n)
222 return n; 226 return n;
223 } 227 }
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
230 struct net *net = seq_file_net(seq); 234 struct net *net = seq_file_net(seq);
231 struct ct_expect_iter_state *st = seq->private; 235 struct ct_expect_iter_state *st = seq->private;
232 236
233 head = rcu_dereference(head->next); 237 head = rcu_dereference(hlist_next_rcu(head));
234 while (head == NULL) { 238 while (head == NULL) {
235 if (++st->bucket >= nf_ct_expect_hsize) 239 if (++st->bucket >= nf_ct_expect_hsize)
236 return NULL; 240 return NULL;
237 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 241 head = rcu_dereference(
242 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
238 } 243 }
239 return head; 244 return head;
240} 245}
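The /proc iterator hunks above replace open-coded ->first and ->next dereferences with the typed hlist_nulls_first_rcu()/hlist_nulls_next_rcu() accessors, which is why linux/rculist_nulls.h is now included. The iteration shape itself — follow a chain, and when it ends, resume at the next non-empty hash bucket — can be shown standalone; plain singly linked lists stand in here for the RCU nulls lists, and the nulls-marker bucket check is omitted:

#include <stdio.h>

struct node { int val; struct node *next; };

#define NBUCKETS 3
static struct node *hash[NBUCKETS];

/* Same shape as ct_get_first()/ct_get_next(): continue from the
 * current node, falling through to the first entry of the next
 * populated bucket when the chain runs out. */
static struct node *walk_next(struct node *cur, int *bucket)
{
        if (cur && cur->next)
                return cur->next;
        while (++*bucket < NBUCKETS)
                if (hash[*bucket])
                        return hash[*bucket];
        return NULL;
}

int main(void)
{
        struct node n1 = { 1, NULL }, n2 = { 2, &n1 }, n3 = { 3, NULL };
        int bucket = -1;
        struct node *n;

        hash[0] = &n2;  /* bucket 0: 2 -> 1 */
        hash[2] = &n3;  /* bucket 2: 3      */

        for (n = walk_next(NULL, &bucket); n; n = walk_next(n, &bucket))
                printf("bucket %d: %d\n", bucket, n->val);
        return 0;
}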
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df0..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret; 47 int res;
48 48
49 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
50 ret = nf_ct_expect_related(exp); 50 res = nf_ct_expect_related(exp);
51 if (ret == 0) 51 if (res == 0)
52 break; 52 break;
53 else if (ret != -EBUSY) { 53 else if (res != -EBUSY) {
54 port = 0; 54 port = 0;
55 break; 55 break;
56 } 56 }
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a71..21bcf471b25a 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
221 manips not an issue. */ 221 manips not an issue. */
222 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
224 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { 224 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) {
227 *tuple = *orig_tuple;
228 return;
229 }
230 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
231 range)) {
225 pr_debug("get_unique_tuple: Found current src map\n"); 232 pr_debug("get_unique_tuple: Found current src map\n");
226 if (!nf_nat_used_tuple(tuple, ct)) 233 if (!nf_nat_used_tuple(tuple, ct))
227 return; 234 return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
266 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
267 struct nf_conntrack_tuple curr_tuple, new_tuple; 274 struct nf_conntrack_tuple curr_tuple, new_tuple;
268 struct nf_conn_nat *nat; 275 struct nf_conn_nat *nat;
269 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
270 276
271 /* nat helper or nfctnetlink also setup binding */ 277 /* nat helper or nfctnetlink also setup binding */
272 nat = nfct_nat(ct); 278 nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
306 ct->status |= IPS_DST_NAT; 312 ct->status |= IPS_DST_NAT;
307 } 313 }
308 314
309 /* Place in source hash if this is the first time. */ 315 if (maniptype == IP_NAT_MANIP_SRC) {
310 if (have_to_hash) {
311 unsigned int srchash; 316 unsigned int srchash;
312 317
313 srchash = hash_by_src(net, nf_ct_zone(ct), 318 srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
323 328
324 /* It's done. */ 329 /* It's done. */
325 if (maniptype == IP_NAT_MANIP_DST) 330 if (maniptype == IP_NAT_MANIP_DST)
326 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); 331 ct->status |= IPS_DST_NAT_DONE;
327 else 332 else
328 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 333 ct->status |= IPS_SRC_NAT_DONE;
329 334
330 return NF_ACCEPT; 335 return NF_ACCEPT;
331} 336}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
502 int ret = 0; 507 int ret = 0;
503 508
504 spin_lock_bh(&nf_nat_lock); 509 spin_lock_bh(&nf_nat_lock);
505 if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { 510 if (rcu_dereference_protected(
511 nf_nat_protos[proto->protonum],
512 lockdep_is_held(&nf_nat_lock)
513 ) != &nf_nat_unknown_protocol) {
506 ret = -EBUSY; 514 ret = -EBUSY;
507 goto out; 515 goto out;
508 } 516 }
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
532 if (nat == NULL || nat->ct == NULL) 540 if (nat == NULL || nat->ct == NULL)
533 return; 541 return;
534 542
535 NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); 543 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
536 544
537 spin_lock_bh(&nf_nat_lock); 545 spin_lock_bh(&nf_nat_lock);
538 hlist_del_rcu(&nat->bysource); 546 hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
545 struct nf_conn_nat *old_nat = old; 553 struct nf_conn_nat *old_nat = old;
546 struct nf_conn *ct = old_nat->ct; 554 struct nf_conn *ct = old_nat->ct;
547 555
548 if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) 556 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
549 return; 557 return;
550 558
551 spin_lock_bh(&nf_nat_lock); 559 spin_lock_bh(&nf_nat_lock);
552 new_nat->ct = ct;
553 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); 560 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
554 spin_unlock_bh(&nf_nat_lock); 561 spin_unlock_bh(&nf_nat_lock);
555} 562}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
679{ 686{
680 /* Leave them the same for the moment. */ 687 /* Leave them the same for the moment. */
681 net->ipv4.nat_htable_size = net->ct.htable_size; 688 net->ipv4.nat_htable_size = net->ct.htable_size;
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 689 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
683 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 690 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 691 return -ENOMEM;
686 return 0; 692 return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
702{ 708{
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 709 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 710 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 711 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
706 net->ipv4.nat_htable_size);
707} 712}
708 713
709static struct pernet_operations nf_nat_net_ops = { 714static struct pernet_operations nf_nat_net_ops = {
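In the nf_nat_core.c hunk above, get_unique_tuple() gains a fast path: when the original tuple already lies inside the requested NAT range and is unused, it is kept as-is, so existing flows are not remapped needlessly. A small runnable sketch of that decision order with stand-in predicates (every name below is hypothetical, not the conntrack API):

#include <stdbool.h>
#include <stdio.h>

struct tuple { int src_port; };
struct range { int lo, hi; };

static bool in_range(const struct tuple *t, const struct range *r)
{
        return t->src_port >= r->lo && t->src_port <= r->hi;
}

/* Stand-in for nf_nat_used_tuple(): pretend odd ports are taken. */
static bool tuple_used(const struct tuple *t)
{
        return t->src_port % 2;
}

/* Mirrors the patched ordering: keep the original mapping when it is
 * both in range and free, and only then fall back to remapping. */
static struct tuple pick_tuple(const struct tuple *orig,
                               const struct range *range)
{
        struct tuple t = *orig;

        if (in_range(orig, range) && !tuple_used(orig))
                return t;                       /* identity mapping */

        for (t.src_port = range->lo; t.src_port <= range->hi; t.src_port++)
                if (!tuple_used(&t))
                        return t;               /* first free port */
        return *orig;                           /* give up: clash */
}

int main(void)
{
        struct range r = { 1000, 1003 };
        struct tuple orig = { 1001 };           /* in range, but "used" */
        struct tuple got = pick_tuple(&orig, &r);

        printf("mapped %d -> %d\n", orig.src_port, got.src_port);
        return 0;
}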
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
54#include <net/netfilter/nf_conntrack_expect.h> 54#include <net/netfilter/nf_conntrack_expect.h>
55#include <net/netfilter/nf_conntrack_helper.h> 55#include <net/netfilter/nf_conntrack_helper.h>
56#include <net/netfilter/nf_nat_helper.h> 56#include <net/netfilter/nf_nat_helper.h>
57#include <linux/netfilter/nf_conntrack_snmp.h>
57 58
58MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
59MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
1310{ 1311{
1311 int ret = 0; 1312 int ret = 0;
1312 1313
1313 ret = nf_conntrack_helper_register(&snmp_helper); 1314 BUG_ON(nf_nat_snmp_hook != NULL);
1314 if (ret < 0) 1315 rcu_assign_pointer(nf_nat_snmp_hook, help);
1315 return ret; 1316
1316 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1317 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) { 1318 if (ret < 0) {
1318 nf_conntrack_helper_unregister(&snmp_helper); 1319 nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
1323 1324
1324static void __exit nf_nat_snmp_basic_fini(void) 1325static void __exit nf_nat_snmp_basic_fini(void)
1325{ 1326{
1326 nf_conntrack_helper_unregister(&snmp_helper); 1327 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1327 nf_conntrack_helper_unregister(&snmp_trap_helper); 1328 nf_conntrack_helper_unregister(&snmp_trap_helper);
1328} 1329}
1329 1330
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 95481fee8bdb..7317bdf1d457 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -31,6 +31,7 @@
31#ifdef CONFIG_XFRM 31#ifdef CONFIG_XFRM
32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) 32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
33{ 33{
34 struct flowi4 *fl4 = &fl->u.ip4;
34 const struct nf_conn *ct; 35 const struct nf_conn *ct;
35 const struct nf_conntrack_tuple *t; 36 const struct nf_conntrack_tuple *t;
36 enum ip_conntrack_info ctinfo; 37 enum ip_conntrack_info ctinfo;
@@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
49 statusbit = IPS_SRC_NAT; 50 statusbit = IPS_SRC_NAT;
50 51
51 if (ct->status & statusbit) { 52 if (ct->status & statusbit) {
52 fl->fl4_dst = t->dst.u3.ip; 53 fl4->daddr = t->dst.u3.ip;
53 if (t->dst.protonum == IPPROTO_TCP || 54 if (t->dst.protonum == IPPROTO_TCP ||
54 t->dst.protonum == IPPROTO_UDP || 55 t->dst.protonum == IPPROTO_UDP ||
55 t->dst.protonum == IPPROTO_UDPLITE || 56 t->dst.protonum == IPPROTO_UDPLITE ||
56 t->dst.protonum == IPPROTO_DCCP || 57 t->dst.protonum == IPPROTO_DCCP ||
57 t->dst.protonum == IPPROTO_SCTP) 58 t->dst.protonum == IPPROTO_SCTP)
58 fl->fl_ip_dport = t->dst.u.tcp.port; 59 fl4->fl4_dport = t->dst.u.tcp.port;
59 } 60 }
60 61
61 statusbit ^= IPS_NAT_MASK; 62 statusbit ^= IPS_NAT_MASK;
62 63
63 if (ct->status & statusbit) { 64 if (ct->status & statusbit) {
64 fl->fl4_src = t->src.u3.ip; 65 fl4->saddr = t->src.u3.ip;
65 if (t->dst.protonum == IPPROTO_TCP || 66 if (t->dst.protonum == IPPROTO_TCP ||
66 t->dst.protonum == IPPROTO_UDP || 67 t->dst.protonum == IPPROTO_UDP ||
67 t->dst.protonum == IPPROTO_UDPLITE || 68 t->dst.protonum == IPPROTO_UDPLITE ||
68 t->dst.protonum == IPPROTO_DCCP || 69 t->dst.protonum == IPPROTO_DCCP ||
69 t->dst.protonum == IPPROTO_SCTP) 70 t->dst.protonum == IPPROTO_SCTP)
70 fl->fl_ip_sport = t->src.u.tcp.port; 71 fl4->fl4_sport = t->src.u.tcp.port;
71 } 72 }
72} 73}
73#endif 74#endif
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6390ba299b3d..e837ffd3edc3 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -402,7 +402,7 @@ error:
402 return err; 402 return err;
403} 403}
404 404
405static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) 405static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
406{ 406{
407 struct iovec *iov; 407 struct iovec *iov;
408 u8 __user *type = NULL; 408 u8 __user *type = NULL;
@@ -418,7 +418,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
418 if (!iov) 418 if (!iov)
419 continue; 419 continue;
420 420
421 switch (fl->proto) { 421 switch (fl4->flowi4_proto) {
422 case IPPROTO_ICMP: 422 case IPPROTO_ICMP:
423 /* check if one-byte field is readable or not. */ 423 /* check if one-byte field is readable or not. */
424 if (iov->iov_base && iov->iov_len < 1) 424 if (iov->iov_base && iov->iov_len < 1)
@@ -433,8 +433,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
433 code = iov->iov_base; 433 code = iov->iov_base;
434 434
435 if (type && code) { 435 if (type && code) {
436 if (get_user(fl->fl_icmp_type, type) || 436 if (get_user(fl4->fl4_icmp_type, type) ||
437 get_user(fl->fl_icmp_code, code)) 437 get_user(fl4->fl4_icmp_code, code))
438 return -EFAULT; 438 return -EFAULT;
439 probed = 1; 439 probed = 1;
440 } 440 }
@@ -548,25 +548,30 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
548 } 548 }
549 549
550 { 550 {
551 struct flowi fl = { .oif = ipc.oif, 551 struct flowi4 fl4 = {
552 .mark = sk->sk_mark, 552 .flowi4_oif = ipc.oif,
553 .fl4_dst = daddr, 553 .flowi4_mark = sk->sk_mark,
554 .fl4_src = saddr, 554 .daddr = daddr,
555 .fl4_tos = tos, 555 .saddr = saddr,
556 .proto = inet->hdrincl ? IPPROTO_RAW : 556 .flowi4_tos = tos,
557 sk->sk_protocol, 557 .flowi4_proto = (inet->hdrincl ?
558 }; 558 IPPROTO_RAW :
559 sk->sk_protocol),
560 .flowi4_flags = FLOWI_FLAG_CAN_SLEEP,
561 };
559 if (!inet->hdrincl) { 562 if (!inet->hdrincl) {
560 err = raw_probe_proto_opt(&fl, msg); 563 err = raw_probe_proto_opt(&fl4, msg);
561 if (err) 564 if (err)
562 goto done; 565 goto done;
563 } 566 }
564 567
565 security_sk_classify_flow(sk, &fl); 568 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
566 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); 569 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
570 if (IS_ERR(rt)) {
571 err = PTR_ERR(rt);
572 goto done;
573 }
567 } 574 }
568 if (err)
569 goto done;
570 575
571 err = -EACCES; 576 err = -EACCES;
572 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) 577 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
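raw_sendmsg() above is typical of this patch's flow-key conversion: the IPv4 key is built as a struct flowi4 with designated initializers, and family-agnostic consumers such as the security hook receive it through flowi4_to_flowi(). A self-contained sketch of the layout trick that makes the conversion cheap, assuming flowi4 sits first in a union inside flowi — consistent with the fl->u.ip4 access in the nf_nat_standalone.c hunk earlier:

#include <stdio.h>

/* Family-specific keys... */
struct flowi4 { unsigned int daddr, saddr; };
struct flowi6 { unsigned int daddr6[4]; };

/* ...wrapped in a common container for family-agnostic consumers. */
struct flowi {
        union {
                struct flowi4 ip4;
                struct flowi6 ip6;
        } u;
};

/* With flowi4 as the first member, the v4 key and its container share
 * an address, so the "conversion" is only a cast. */
static struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
{
        return (struct flowi *)fl4;
}

static void classify_flow(const struct flowi *fl)
{
        printf("daddr=%#x\n", fl->u.ip4.daddr);
}

int main(void)
{
        struct flowi4 fl4 = { .daddr = 0x0a000001, .saddr = 0x0a000002 };

        classify_flow(flowi4_to_flowi(&fl4));
        return 0;
}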
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ed6603c2f6d..209989cf7d1b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,8 +109,8 @@
109#include <linux/sysctl.h> 109#include <linux/sysctl.h>
110#endif 110#endif
111 111
112#define RT_FL_TOS(oldflp) \ 112#define RT_FL_TOS(oldflp4) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) 113 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 114
115#define IP_MAX_MTU 0xFFF0 115#define IP_MAX_MTU 0xFFF0
116 116
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/* 134/*
138 * Interface to generic destination cache. 135 * Interface to generic destination cache.
139 */ 136 */
@@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152{ 149{
153} 150}
154 151
152static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153{
154 struct rtable *rt = (struct rtable *) dst;
155 struct inet_peer *peer;
156 u32 *p = NULL;
157
158 if (!rt->peer)
159 rt_bind_peer(rt, 1);
160
161 peer = rt->peer;
162 if (peer) {
163 u32 *old_p = __DST_METRICS_PTR(old);
164 unsigned long prev, new;
165
166 p = peer->metrics;
167 if (inet_metrics_new(peer))
168 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170 new = (unsigned long) p;
171 prev = cmpxchg(&dst->_metrics, old, new);
172
173 if (prev != old) {
174 p = __DST_METRICS_PTR(prev);
175 if (prev & DST_METRICS_READ_ONLY)
176 p = NULL;
177 } else {
178 if (rt->fi) {
179 fib_info_put(rt->fi);
180 rt->fi = NULL;
181 }
182 }
183 }
184 return p;
185}
186
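ipv4_cow_metrics() above gives routes copy-on-write metrics: a writer copies the read-only defaults into the bound inet_peer's array and publishes the copy with cmpxchg(), so racing writers converge on a single writable array. Below is the same publish-once pattern in runnable user-space form, using GCC's __sync_val_compare_and_swap() and a low tag bit modeled after DST_METRICS_READ_ONLY:

#include <stdio.h>
#include <string.h>

#define N_METRICS     4
#define READ_ONLY_BIT 0x1UL     /* low bit is free: the array is aligned */

static unsigned int default_metrics[N_METRICS] = { 1, 2, 3, 4 };
static unsigned long metrics_word;      /* tagged pointer, set in main() */

static unsigned int *cow_metrics(unsigned int *private_copy)
{
        unsigned long old = metrics_word;
        unsigned long prev;

        if (!(old & READ_ONLY_BIT))     /* already writable */
                return (unsigned int *)old;

        memcpy(private_copy, (void *)(old & ~READ_ONLY_BIT),
               sizeof(default_metrics));

        prev = __sync_val_compare_and_swap(&metrics_word, old,
                                           (unsigned long)private_copy);
        if (prev != old)                /* another writer won the race */
                return (prev & READ_ONLY_BIT) ? NULL
                                              : (unsigned int *)prev;
        return private_copy;
}

int main(void)
{
        unsigned int copy[N_METRICS];
        unsigned int *p;

        metrics_word = (unsigned long)default_metrics | READ_ONLY_BIT;

        p = cow_metrics(copy);
        if (p) {
                p[0] = 42;      /* safe: private, writable storage */
                printf("metric[0] = %u\n", p[0]);
        }
        return 0;
}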
155static struct dst_ops ipv4_dst_ops = { 187static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET, 188 .family = AF_INET,
157 .protocol = cpu_to_be16(ETH_P_IP), 189 .protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = {
159 .check = ipv4_dst_check, 191 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss, 192 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu, 193 .default_mtu = ipv4_default_mtu,
194 .cow_metrics = ipv4_cow_metrics,
162 .destroy = ipv4_dst_destroy, 195 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown, 196 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice, 197 .negative_advice = ipv4_negative_advice,
@@ -391,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
391 dst_metric(&r->dst, RTAX_WINDOW), 424 dst_metric(&r->dst, RTAX_WINDOW),
392 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 425 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
393 dst_metric(&r->dst, RTAX_RTTVAR)), 426 dst_metric(&r->dst, RTAX_RTTVAR)),
394 r->fl.fl4_tos, 427 r->rt_tos,
395 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, 428 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
396 r->dst.hh ? (r->dst.hh->hh_output == 429 r->dst.hh ? (r->dst.hh->hh_output ==
397 dev_queue_xmit) : 0, 430 dev_queue_xmit) : 0,
@@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
514 .release = seq_release, 547 .release = seq_release,
515}; 548};
516 549
517#ifdef CONFIG_NET_CLS_ROUTE 550#ifdef CONFIG_IP_ROUTE_CLASSID
518static int rt_acct_proc_show(struct seq_file *m, void *v) 551static int rt_acct_proc_show(struct seq_file *m, void *v)
519{ 552{
520 struct ip_rt_acct *dst, *src; 553 struct ip_rt_acct *dst, *src;
@@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
567 if (!pde) 600 if (!pde)
568 goto err2; 601 goto err2;
569 602
570#ifdef CONFIG_NET_CLS_ROUTE 603#ifdef CONFIG_IP_ROUTE_CLASSID
571 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 604 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
572 if (!pde) 605 if (!pde)
573 goto err3; 606 goto err3;
574#endif 607#endif
575 return 0; 608 return 0;
576 609
577#ifdef CONFIG_NET_CLS_ROUTE 610#ifdef CONFIG_IP_ROUTE_CLASSID
578err3: 611err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat); 612 remove_proc_entry("rt_cache", net->proc_net_stat);
580#endif 613#endif
@@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{ 621{
589 remove_proc_entry("rt_cache", net->proc_net_stat); 622 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net); 623 remove_proc_entry("rt_cache", net->proc_net);
591#ifdef CONFIG_NET_CLS_ROUTE 624#ifdef CONFIG_IP_ROUTE_CLASSID
592 remove_proc_entry("rt_acct", net->proc_net); 625 remove_proc_entry("rt_acct", net->proc_net);
593#endif 626#endif
594} 627}
@@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
632static inline int rt_valuable(struct rtable *rth) 665static inline int rt_valuable(struct rtable *rth)
633{ 666{
634 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 667 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
635 rth->dst.expires; 668 (rth->peer && rth->peer->pmtu_expires);
636} 669}
637 670
638static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 671static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
643 if (atomic_read(&rth->dst.__refcnt)) 676 if (atomic_read(&rth->dst.__refcnt))
644 goto out; 677 goto out;
645 678
646 ret = 1;
647 if (rth->dst.expires &&
648 time_after_eq(jiffies, rth->dst.expires))
649 goto out;
650
651 age = jiffies - rth->dst.lastuse; 679 age = jiffies - rth->dst.lastuse;
652 ret = 0;
653 if ((age <= tmo1 && !rt_fast_clean(rth)) || 680 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
654 (age <= tmo2 && rt_valuable(rth))) 681 (age <= tmo2 && rt_valuable(rth)))
655 goto out; 682 goto out;
@@ -684,22 +711,22 @@ static inline bool rt_caching(const struct net *net)
684 net->ipv4.sysctl_rt_cache_rebuild_count; 711 net->ipv4.sysctl_rt_cache_rebuild_count;
685} 712}
686 713
687static inline bool compare_hash_inputs(const struct flowi *fl1, 714static inline bool compare_hash_inputs(const struct rtable *rt1,
688 const struct flowi *fl2) 715 const struct rtable *rt2)
689{ 716{
690 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | 717 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
691 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | 718 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
692 (fl1->iif ^ fl2->iif)) == 0); 719 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
693} 720}
694 721
695static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 722static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
696{ 723{
697 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | 724 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
698 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | 725 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
699 (fl1->mark ^ fl2->mark) | 726 (rt1->rt_mark ^ rt2->rt_mark) |
700 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) | 727 (rt1->rt_tos ^ rt2->rt_tos) |
701 (fl1->oif ^ fl2->oif) | 728 (rt1->rt_oif ^ rt2->rt_oif) |
702 (fl1->iif ^ fl2->iif)) == 0; 729 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
703} 730}
704 731
705static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 732static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
@@ -786,104 +813,13 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
786 const struct rtable *aux = head; 813 const struct rtable *aux = head;
787 814
788 while (aux != rth) { 815 while (aux != rth) {
789 if (compare_hash_inputs(&aux->fl, &rth->fl)) 816 if (compare_hash_inputs(aux, rth))
790 return 0; 817 return 0;
791 aux = rcu_dereference_protected(aux->dst.rt_next, 1); 818 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
792 } 819 }
793 return ONE; 820 return ONE;
794} 821}
795 822
796static void rt_check_expire(void)
797{
798 static unsigned int rover;
799 unsigned int i = rover, goal;
800 struct rtable *rth;
801 struct rtable __rcu **rthp;
802 unsigned long samples = 0;
803 unsigned long sum = 0, sum2 = 0;
804 unsigned long delta;
805 u64 mult;
806
807 delta = jiffies - expires_ljiffies;
808 expires_ljiffies = jiffies;
809 mult = ((u64)delta) << rt_hash_log;
810 if (ip_rt_gc_timeout > 1)
811 do_div(mult, ip_rt_gc_timeout);
812 goal = (unsigned int)mult;
813 if (goal > rt_hash_mask)
814 goal = rt_hash_mask + 1;
815 for (; goal > 0; goal--) {
816 unsigned long tmo = ip_rt_gc_timeout;
817 unsigned long length;
818
819 i = (i + 1) & rt_hash_mask;
820 rthp = &rt_hash_table[i].chain;
821
822 if (need_resched())
823 cond_resched();
824
825 samples++;
826
827 if (rcu_dereference_raw(*rthp) == NULL)
828 continue;
829 length = 0;
830 spin_lock_bh(rt_hash_lock_addr(i));
831 while ((rth = rcu_dereference_protected(*rthp,
832 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
833 prefetch(rth->dst.rt_next);
834 if (rt_is_expired(rth)) {
835 *rthp = rth->dst.rt_next;
836 rt_free(rth);
837 continue;
838 }
839 if (rth->dst.expires) {
840 /* Entry is expired even if it is in use */
841 if (time_before_eq(jiffies, rth->dst.expires)) {
842nofree:
843 tmo >>= 1;
844 rthp = &rth->dst.rt_next;
845 /*
846 * We only count entries on
847 * a chain with equal hash inputs once
848 * so that entries for different QOS
849 * levels, and other non-hash input
850 * attributes don't unfairly skew
851 * the length computation
852 */
853 length += has_noalias(rt_hash_table[i].chain, rth);
854 continue;
855 }
856 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
857 goto nofree;
858
859 /* Cleanup aged off entries. */
860 *rthp = rth->dst.rt_next;
861 rt_free(rth);
862 }
863 spin_unlock_bh(rt_hash_lock_addr(i));
864 sum += length;
865 sum2 += length*length;
866 }
867 if (samples) {
868 unsigned long avg = sum / samples;
869 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
870 rt_chain_length_max = max_t(unsigned long,
871 ip_rt_gc_elasticity,
872 (avg + 4*sd) >> FRACT_BITS);
873 }
874 rover = i;
875}
876
877/*
878 * rt_worker_func() is run in process context.
879 * we call rt_check_expire() to scan part of the hash table
880 */
881static void rt_worker_func(struct work_struct *work)
882{
883 rt_check_expire();
884 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885}
886
887/* 823/*
888 * Perturbation of rt_genid by a small quantity [1..256] 824
889 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate() 825
@@ -1078,8 +1014,8 @@ static int slow_chain_length(const struct rtable *head)
1078 return length >> FRACT_BITS; 1014 return length >> FRACT_BITS;
1079} 1015}
1080 1016
1081static int rt_intern_hash(unsigned hash, struct rtable *rt, 1017static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1082 struct rtable **rp, struct sk_buff *skb, int ifindex) 1018 struct sk_buff *skb, int ifindex)
1083{ 1019{
1084 struct rtable *rth, *cand; 1020 struct rtable *rth, *cand;
1085 struct rtable __rcu **rthp, **candp; 1021 struct rtable __rcu **rthp, **candp;
@@ -1120,7 +1056,7 @@ restart:
1120 printk(KERN_WARNING 1056 printk(KERN_WARNING
1121 "Neighbour table failure & not caching routes.\n"); 1057 "Neighbour table failure & not caching routes.\n");
1122 ip_rt_put(rt); 1058 ip_rt_put(rt);
1123 return err; 1059 return ERR_PTR(err);
1124 } 1060 }
1125 } 1061 }
1126 1062
@@ -1137,7 +1073,7 @@ restart:
1137 rt_free(rth); 1073 rt_free(rth);
1138 continue; 1074 continue;
1139 } 1075 }
1140 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1076 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1141 /* Put it first */ 1077 /* Put it first */
1142 *rthp = rth->dst.rt_next; 1078 *rthp = rth->dst.rt_next;
1143 /* 1079 /*
@@ -1157,11 +1093,9 @@ restart:
1157 spin_unlock_bh(rt_hash_lock_addr(hash)); 1093 spin_unlock_bh(rt_hash_lock_addr(hash));
1158 1094
1159 rt_drop(rt); 1095 rt_drop(rt);
1160 if (rp)
1096 if (skb)
1161 *rp = rth;
1162 else
1163 skb_dst_set(skb, &rth->dst); 1097 skb_dst_set(skb, &rth->dst);
1164 return 0;
1098 return rth;
1165 } 1099 }
1166 1100
1167 if (!atomic_read(&rth->dst.__refcnt)) { 1101 if (!atomic_read(&rth->dst.__refcnt)) {
@@ -1202,7 +1136,7 @@ restart:
1202 rt_emergency_hash_rebuild(net); 1136 rt_emergency_hash_rebuild(net);
1203 spin_unlock_bh(rt_hash_lock_addr(hash)); 1137 spin_unlock_bh(rt_hash_lock_addr(hash));
1204 1138
1205 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1139 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1206 ifindex, rt_genid(net)); 1140 ifindex, rt_genid(net));
1207 goto restart; 1141 goto restart;
1208 } 1142 }
@@ -1218,7 +1152,7 @@ restart:
1218 1152
1219 if (err != -ENOBUFS) { 1153 if (err != -ENOBUFS) {
1220 rt_drop(rt); 1154 rt_drop(rt);
1221 return err;
1155 return ERR_PTR(err);
1222 } 1156 }
1223 1157
1224 /* Neighbour tables are full and nothing 1158 /* Neighbour tables are full and nothing
@@ -1239,7 +1173,7 @@ restart:
1239 if (net_ratelimit()) 1173 if (net_ratelimit())
1240 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); 1174 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1241 rt_drop(rt); 1175 rt_drop(rt);
1242 return -ENOBUFS;
1176 return ERR_PTR(-ENOBUFS);
1243 } 1177 }
1244 } 1178 }
1245 1179
@@ -1265,11 +1199,16 @@ restart:
1265 spin_unlock_bh(rt_hash_lock_addr(hash)); 1199 spin_unlock_bh(rt_hash_lock_addr(hash));
1266 1200
1267skip_hashing: 1201skip_hashing:
1268 if (rp)
1202 if (skb)
1269 *rp = rt;
1270 else
1271 skb_dst_set(skb, &rt->dst); 1203 skb_dst_set(skb, &rt->dst);
1272 return 0;
1204 return rt;
1205}
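
From here on rt_intern_hash() returns either a valid rtable or an errno encoded with ERR_PTR(), so callers test IS_ERR()/PTR_ERR() instead of a separate int return. A self-contained sketch of that convention; thing_create() and thing_use() are hypothetical, only ERR_PTR/IS_ERR/PTR_ERR are real kernel APIs:

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct thing { int val; };

static struct thing *thing_create(int val)
{
	struct thing *t = kmalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return ERR_PTR(-ENOMEM);	/* encode the errno in the pointer */
	t->val = val;
	return t;
}

static int thing_use(void)
{
	struct thing *t = thing_create(42);

	if (IS_ERR(t))
		return PTR_ERR(t);		/* decode it back into an int */
	kfree(t);
	return 0;
}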
1206
1207static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1208
1209static u32 rt_peer_genid(void)
1210{
1211 return atomic_read(&__rt_peer_genid);
1273} 1212}
1274 1213
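
The new __rt_peer_genid counter is lazy bulk invalidation: bumping one atomic marks every cached route whose recorded genid is older as stale, without walking the cache. A userspace model of the idea, assuming C11 atomics; all names here are illustrative:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint cache_genid;

struct cached {
	unsigned int genid;	/* snapshot taken when the entry was filled */
	int data;
};

static void invalidate_all(void)
{
	atomic_fetch_add(&cache_genid, 1);	/* O(1); touches no entries */
}

static bool cached_is_fresh(const struct cached *c)
{
	/* stale entries are revalidated lazily, on their next lookup */
	return c->genid == atomic_load(&cache_genid);
}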
1275void rt_bind_peer(struct rtable *rt, int create) 1214void rt_bind_peer(struct rtable *rt, int create)
@@ -1280,6 +1219,8 @@ void rt_bind_peer(struct rtable *rt, int create)
1280 1219
1281 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1220 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1282 inet_putpeer(peer); 1221 inet_putpeer(peer);
1222 else
1223 rt->rt_peer_genid = rt_peer_genid();
1283} 1224}
1284 1225
1285/* 1226/*
@@ -1349,13 +1290,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
1349void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1290void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1350 __be32 saddr, struct net_device *dev) 1291 __be32 saddr, struct net_device *dev)
1351{ 1292{
1352 int i, k;
1353 struct in_device *in_dev = __in_dev_get_rcu(dev); 1293 struct in_device *in_dev = __in_dev_get_rcu(dev);
1354 struct rtable *rth;
1294 struct inet_peer *peer;
1355 struct rtable __rcu **rthp;
1356 __be32 skeys[2] = { saddr, 0 };
1357 int ikeys[2] = { dev->ifindex, 0 };
1358 struct netevent_redirect netevent;
1359 struct net *net; 1295 struct net *net;
1360 1296
1361 if (!in_dev) 1297 if (!in_dev)
@@ -1367,9 +1303,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1367 ipv4_is_zeronet(new_gw)) 1303 ipv4_is_zeronet(new_gw))
1368 goto reject_redirect; 1304 goto reject_redirect;
1369 1305
1370 if (!rt_caching(net))
1371 goto reject_redirect;
1372
1373 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1306 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1374 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1307 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1375 goto reject_redirect; 1308 goto reject_redirect;
@@ -1380,91 +1313,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1380 goto reject_redirect; 1313 goto reject_redirect;
1381 } 1314 }
1382 1315
1383 for (i = 0; i < 2; i++) {
1384 for (k = 0; k < 2; k++) {
1385 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1316 peer = inet_getpeer_v4(daddr, 1);
1317 if (peer) {
1318 peer->redirect_learned.a4 = new_gw;
1386 rt_genid(net));
1387
1388 rthp = &rt_hash_table[hash].chain;
1389
1390 while ((rth = rcu_dereference(*rthp)) != NULL) {
1391 struct rtable *rt;
1392
1393 if (rth->fl.fl4_dst != daddr ||
1394 rth->fl.fl4_src != skeys[i] ||
1395 rth->fl.oif != ikeys[k] ||
1396 rt_is_input_route(rth) ||
1397 rt_is_expired(rth) ||
1398 !net_eq(dev_net(rth->dst.dev), net)) {
1399 rthp = &rth->dst.rt_next;
1400 continue;
1401 }
1402
1403 if (rth->rt_dst != daddr ||
1404 rth->rt_src != saddr ||
1405 rth->dst.error ||
1406 rth->rt_gateway != old_gw ||
1407 rth->dst.dev != dev)
1408 break;
1409
1410 dst_hold(&rth->dst);
1411
1412 rt = dst_alloc(&ipv4_dst_ops);
1413 if (rt == NULL) {
1414 ip_rt_put(rth);
1415 return;
1416 }
1417
1418 /* Copy all the information. */
1419 *rt = *rth;
1420 rt->dst.__use = 1;
1421 atomic_set(&rt->dst.__refcnt, 1);
1422 rt->dst.child = NULL;
1423 if (rt->dst.dev)
1424 dev_hold(rt->dst.dev);
1425 rt->dst.obsolete = -1;
1426 rt->dst.lastuse = jiffies;
1427 rt->dst.path = &rt->dst;
1428 rt->dst.neighbour = NULL;
1429 rt->dst.hh = NULL;
1430#ifdef CONFIG_XFRM
1431 rt->dst.xfrm = NULL;
1432#endif
1433 rt->rt_genid = rt_genid(net);
1434 rt->rt_flags |= RTCF_REDIRECTED;
1435
1436 /* Gateway is different ... */
1437 rt->rt_gateway = new_gw;
1438
1439 /* Redirect received -> path was valid */
1440 dst_confirm(&rth->dst);
1441
1442 if (rt->peer)
1443 atomic_inc(&rt->peer->refcnt);
1444
1445 if (arp_bind_neighbour(&rt->dst) ||
1446 !(rt->dst.neighbour->nud_state &
1447 NUD_VALID)) {
1448 if (rt->dst.neighbour)
1449 neigh_event_send(rt->dst.neighbour, NULL);
1450 ip_rt_put(rth);
1451 rt_drop(rt);
1452 goto do_next;
1453 }
1454 1319
1455 netevent.old = &rth->dst;
1320 inet_putpeer(peer);
1456 netevent.new = &rt->dst;
1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1458 &netevent);
1459 1321
1460 rt_del(hash, rth);
1322 atomic_inc(&__rt_peer_genid);
1461 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462 ip_rt_put(rt);
1463 goto do_next;
1464 }
1465 do_next:
1466 ;
1467 }
1468 } 1323 }
1469 return; 1324 return;
1470 1325
@@ -1488,18 +1343,24 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1488 if (dst->obsolete > 0) { 1343 if (dst->obsolete > 0) {
1489 ip_rt_put(rt); 1344 ip_rt_put(rt);
1490 ret = NULL; 1345 ret = NULL;
1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1492 (rt->dst.expires &&
1493 time_after_eq(jiffies, rt->dst.expires))) {
1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495 rt->fl.oif,
1496 rt_genid(dev_net(dst->dev)));
1346 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1347 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1348 rt->rt_oif,
1349 rt_genid(dev_net(dst->dev)));
1497#if RT_CACHE_DEBUG >= 1 1350#if RT_CACHE_DEBUG >= 1
1498 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", 1351 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1499 &rt->rt_dst, rt->fl.fl4_tos);
1352 &rt->rt_dst, rt->rt_tos);
1500#endif 1353#endif
1501 rt_del(hash, rt); 1354 rt_del(hash, rt);
1502 ret = NULL; 1355 ret = NULL;
1356 } else if (rt->peer &&
1357 rt->peer->pmtu_expires &&
1358 time_after_eq(jiffies, rt->peer->pmtu_expires)) {
1359 unsigned long orig = rt->peer->pmtu_expires;
1360
1361 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1362 dst_metric_set(dst, RTAX_MTU,
1363 rt->peer->pmtu_orig);
1503 } 1364 }
1504 } 1365 }
1505 return ret; 1366 return ret;
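
The cmpxchg() on peer->pmtu_expires above guarantees the learned PMTU is rolled back to pmtu_orig exactly once even under concurrency: only the CPU that wins the swap from the old expiry to 0 performs the restore. A userspace model using C11 atomics; the struct and field names are illustrative:

#include <stdatomic.h>

struct peer_sketch {
	_Atomic unsigned long pmtu_expires;	/* 0 means "nothing to undo" */
	unsigned int pmtu_orig;
};

static void maybe_rollback(struct peer_sketch *p, unsigned int *mtu)
{
	unsigned long orig = atomic_load(&p->pmtu_expires);

	/* Only the thread that swaps 'orig' to 0 restores the MTU. */
	if (orig && atomic_compare_exchange_strong(&p->pmtu_expires, &orig, 0))
		*mtu = p->pmtu_orig;
}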
@@ -1525,6 +1386,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1525{ 1386{
1526 struct rtable *rt = skb_rtable(skb); 1387 struct rtable *rt = skb_rtable(skb);
1527 struct in_device *in_dev; 1388 struct in_device *in_dev;
1389 struct inet_peer *peer;
1528 int log_martians; 1390 int log_martians;
1529 1391
1530 rcu_read_lock(); 1392 rcu_read_lock();
@@ -1536,33 +1398,41 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1536 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1398 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537 rcu_read_unlock(); 1399 rcu_read_unlock();
1538 1400
1401 if (!rt->peer)
1402 rt_bind_peer(rt, 1);
1403 peer = rt->peer;
1404 if (!peer) {
1405 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1406 return;
1407 }
1408
1539 /* No redirected packets during ip_rt_redirect_silence; 1409 /* No redirected packets during ip_rt_redirect_silence;
1540 * reset the algorithm. 1410 * reset the algorithm.
1541 */ 1411 */
1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1543 rt->dst.rate_tokens = 0;
1412 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1413 peer->rate_tokens = 0;
1544 1414
1545 /* Too many ignored redirects; do not send anything; 1415
1546 * set dst.rate_last to the last seen redirected packet. 1416 * set dst.rate_last to the last seen redirected packet.
1547 */ 1417 */
1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1549 rt->dst.rate_last = jiffies;
1418 if (peer->rate_tokens >= ip_rt_redirect_number) {
1419 peer->rate_last = jiffies;
1550 return; 1420 return;
1551 } 1421 }
1552 1422
1553 /* Check for load limit; set rate_last to the latest sent 1423 /* Check for load limit; set rate_last to the latest sent
1554 * redirect. 1424 * redirect.
1555 */ 1425 */
1556 if (rt->dst.rate_tokens == 0 ||
1426 if (peer->rate_tokens == 0 ||
1557 time_after(jiffies, 1427 time_after(jiffies,
1558 (rt->dst.rate_last +
1428 (peer->rate_last +
1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1429 (ip_rt_redirect_load << peer->rate_tokens)))) {
1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1430 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 rt->dst.rate_last = jiffies;
1431 peer->rate_last = jiffies;
1562 ++rt->dst.rate_tokens;
1432 ++peer->rate_tokens;
1563#ifdef CONFIG_IP_ROUTE_VERBOSE 1433#ifdef CONFIG_IP_ROUTE_VERBOSE
1564 if (log_martians && 1434 if (log_martians &&
1565 rt->dst.rate_tokens == ip_rt_redirect_number &&
1435 peer->rate_tokens == ip_rt_redirect_number &&
1566 net_ratelimit()) 1436 net_ratelimit())
1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1437 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568 &rt->rt_src, rt->rt_iif, 1438 &rt->rt_src, rt->rt_iif,
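
The redirect limiter above is exponential backoff keyed off inet_peer state: the k-th consecutive redirect is only sent after ip_rt_redirect_load << k ticks, and sending stops entirely at ip_rt_redirect_number. A standalone model of that policy (the reset-after-silence step is omitted); all names are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct limiter {
	unsigned long rate_last;	/* when we last sent */
	unsigned int rate_tokens;	/* consecutive unanswered sends */
};

static bool may_send(struct limiter *l, unsigned long now,
		     unsigned long load, unsigned int max_tokens)
{
	if (l->rate_tokens >= max_tokens)
		return false;			/* host ignores us; give up */
	if (l->rate_tokens == 0 ||
	    now > l->rate_last + (load << l->rate_tokens)) {
		l->rate_last = now;
		l->rate_tokens++;		/* double the next wait */
		return true;
	}
	return false;
}

int main(void)
{
	struct limiter l = { 0, 0 };

	for (unsigned long t = 0; t < 100; t += 5)
		printf("t=%3lu %s\n", t, may_send(&l, t, 10, 9) ? "send" : "hold");
	return 0;
}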
@@ -1574,7 +1444,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1574static int ip_error(struct sk_buff *skb) 1444static int ip_error(struct sk_buff *skb)
1575{ 1445{
1576 struct rtable *rt = skb_rtable(skb); 1446 struct rtable *rt = skb_rtable(skb);
1447 struct inet_peer *peer;
1577 unsigned long now; 1448 unsigned long now;
1449 bool send;
1578 int code; 1450 int code;
1579 1451
1580 switch (rt->dst.error) { 1452 switch (rt->dst.error) {
@@ -1594,15 +1466,24 @@ static int ip_error(struct sk_buff *skb)
1594 break; 1466 break;
1595 } 1467 }
1596 1468
1597 now = jiffies;
1598 rt->dst.rate_tokens += now - rt->dst.rate_last;
1599 if (rt->dst.rate_tokens > ip_rt_error_burst)
1600 rt->dst.rate_tokens = ip_rt_error_burst;
1601 rt->dst.rate_last = now;
1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1603 rt->dst.rate_tokens -= ip_rt_error_cost;
1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1469 if (!rt->peer)
1470 rt_bind_peer(rt, 1);
1471 peer = rt->peer;
1472
1473 send = true;
1474 if (peer) {
1475 now = jiffies;
1476 peer->rate_tokens += now - peer->rate_last;
1477 if (peer->rate_tokens > ip_rt_error_burst)
1478 peer->rate_tokens = ip_rt_error_burst;
1479 peer->rate_last = now;
1480 if (peer->rate_tokens >= ip_rt_error_cost)
1481 peer->rate_tokens -= ip_rt_error_cost;
1482 else
1483 send = false;
1605 } 1484 }
1485 if (send)
1486 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1606 1487
1607out: kfree_skb(skb); 1488out: kfree_skb(skb);
1608 return 0; 1489 return 0;
@@ -1630,88 +1511,142 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1630 unsigned short new_mtu, 1511 unsigned short new_mtu,
1631 struct net_device *dev) 1512 struct net_device *dev)
1632{ 1513{
1633 int i, k;
1634 unsigned short old_mtu = ntohs(iph->tot_len); 1514 unsigned short old_mtu = ntohs(iph->tot_len);
1635 struct rtable *rth;
1636 int ikeys[2] = { dev->ifindex, 0 };
1637 __be32 skeys[2] = { iph->saddr, 0, };
1638 __be32 daddr = iph->daddr;
1639 unsigned short est_mtu = 0; 1515 unsigned short est_mtu = 0;
1516 struct inet_peer *peer;
1640 1517
1641 for (k = 0; k < 2; k++) {
1642 for (i = 0; i < 2; i++) {
1643 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1518 peer = inet_getpeer_v4(iph->daddr, 1);
1519 if (peer) {
1520 unsigned short mtu = new_mtu;
1644 rt_genid(net));
1645
1646 rcu_read_lock();
1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1648 rth = rcu_dereference(rth->dst.rt_next)) {
1649 unsigned short mtu = new_mtu;
1650
1651 if (rth->fl.fl4_dst != daddr ||
1652 rth->fl.fl4_src != skeys[i] ||
1653 rth->rt_dst != daddr ||
1654 rth->rt_src != iph->saddr ||
1655 rth->fl.oif != ikeys[k] ||
1656 rt_is_input_route(rth) ||
1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658 !net_eq(dev_net(rth->dst.dev), net) ||
1659 rt_is_expired(rth))
1660 continue;
1661 1521
1662 if (new_mtu < 68 || new_mtu >= old_mtu) { 1522 if (new_mtu < 68 || new_mtu >= old_mtu) {
1523 /* BSD 4.2 derived systems incorrectly adjust
1524 * tot_len by the IP header length, and report
1525 * a zero MTU in the ICMP message.
1526 */
1527 if (mtu == 0 &&
1528 old_mtu >= 68 + (iph->ihl << 2))
1529 old_mtu -= iph->ihl << 2;
1530 mtu = guess_mtu(old_mtu);
1531 }
1663 1532
1664 /* BSD 4.2 compatibility hack :-( */
1665 if (mtu == 0 &&
1666 old_mtu >= dst_mtu(&rth->dst) &&
1667 old_mtu >= 68 + (iph->ihl << 2))
1668 old_mtu -= iph->ihl << 2;
1669
1670 mtu = guess_mtu(old_mtu);
1671 }
1672 if (mtu <= dst_mtu(&rth->dst)) {
1673 if (mtu < dst_mtu(&rth->dst)) {
1674 dst_confirm(&rth->dst);
1675 if (mtu < ip_rt_min_pmtu) {
1676 u32 lock = dst_metric(&rth->dst,
1677 RTAX_LOCK);
1678 mtu = ip_rt_min_pmtu;
1679 lock |= (1 << RTAX_MTU);
1680 dst_metric_set(&rth->dst, RTAX_LOCK,
1681 lock);
1682 }
1683 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1684 dst_set_expires(&rth->dst,
1685 ip_rt_mtu_expires);
1686 }
1687 est_mtu = mtu;
1688 }
1689 }
1690 rcu_read_unlock();
1691 }
1533 if (mtu < ip_rt_min_pmtu)
1534 mtu = ip_rt_min_pmtu;
1535 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1536 unsigned long pmtu_expires;
1537
1538 pmtu_expires = jiffies + ip_rt_mtu_expires;
1539 if (!pmtu_expires)
1540 pmtu_expires = 1UL;
1541
1542 est_mtu = mtu;
1543 peer->pmtu_learned = mtu;
1544 peer->pmtu_expires = pmtu_expires;
1545 }
1546
1547 inet_putpeer(peer);
1548
1549 atomic_inc(&__rt_peer_genid);
1692 } 1550 }
1693 return est_mtu ? : new_mtu; 1551 return est_mtu ? : new_mtu;
1694} 1552}
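
guess_mtu() (not shown in this hunk) falls back to the RFC 1191 "plateau" strategy when the router reported a zero MTU: pick the largest well-known MTU strictly below the one that failed. A hedged sketch of that strategy; the table is the RFC's list and may not match the kernel's exact values:

#include <stdio.h>

static unsigned short guess_mtu_sketch(unsigned short old_mtu)
{
	/* RFC 1191 plateau table, largest first */
	static const unsigned short plateau[] =
		{ 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, 68 };

	for (unsigned int i = 0; i < sizeof(plateau) / sizeof(plateau[0]); i++)
		if (old_mtu > plateau[i])
			return plateau[i];	/* next step down */
	return 68;				/* IPv4 minimum */
}

int main(void)
{
	printf("%u\n", guess_mtu_sketch(1500));	/* prints 1492 */
	return 0;
}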
1695 1553
1554static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1555{
1556 unsigned long expires = peer->pmtu_expires;
1557
1558 if (time_before(jiffies, expires)) {
1559 u32 orig_dst_mtu = dst_mtu(dst);
1560 if (peer->pmtu_learned < orig_dst_mtu) {
1561 if (!peer->pmtu_orig)
1562 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1563 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1564 }
1565 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1566 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1567}
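
Both PMTU paths nudge pmtu_expires away from 0, because 0 is reserved to mean "no pending expiry"; check_peer_pmtu() and the cmpxchg rollback both depend on that. A tiny model of the sentinel convention, assuming jiffies-style wrapping arithmetic:

#include <stdio.h>

static unsigned long make_expiry(unsigned long now, unsigned long timeout)
{
	unsigned long expires = now + timeout;	/* may wrap around to 0 */

	if (!expires)
		expires = 1UL;			/* keep 0 reserved for "unset" */
	return expires;
}

int main(void)
{
	printf("%lu\n", make_expiry(~0UL, 1));	/* wraps: returns 1, not 0 */
	return 0;
}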
1568
1696static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1569static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1697{ 1570{
1698 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1699 !(dst_metric_locked(dst, RTAX_MTU))) {
1700 if (mtu < ip_rt_min_pmtu) {
1701 u32 lock = dst_metric(dst, RTAX_LOCK);
1571 struct rtable *rt = (struct rtable *) dst;
1572 struct inet_peer *peer;
1573
1574 dst_confirm(dst);
1575
1576 if (!rt->peer)
1577 rt_bind_peer(rt, 1);
1578 peer = rt->peer;
1579 if (peer) {
1580 if (mtu < ip_rt_min_pmtu)
1702 mtu = ip_rt_min_pmtu; 1581 mtu = ip_rt_min_pmtu;
1703 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1582 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1583 unsigned long pmtu_expires;
1584
1585 pmtu_expires = jiffies + ip_rt_mtu_expires;
1586 if (!pmtu_expires)
1587 pmtu_expires = 1UL;
1588
1589 peer->pmtu_learned = mtu;
1590 peer->pmtu_expires = pmtu_expires;
1591
1592 atomic_inc(&__rt_peer_genid);
1593 rt->rt_peer_genid = rt_peer_genid();
1704 } 1594 }
1705 dst_metric_set(dst, RTAX_MTU, mtu);
1706 dst_set_expires(dst, ip_rt_mtu_expires);
1707 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1595 check_peer_pmtu(dst, peer);
1596
1597 inet_putpeer(peer);
1598 }
1599}
1600
1601static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1602{
1603 struct rtable *rt = (struct rtable *) dst;
1604 __be32 orig_gw = rt->rt_gateway;
1605
1606 dst_confirm(&rt->dst);
1607
1608 neigh_release(rt->dst.neighbour);
1609 rt->dst.neighbour = NULL;
1610
1611 rt->rt_gateway = peer->redirect_learned.a4;
1612 if (arp_bind_neighbour(&rt->dst) ||
1613 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1614 if (rt->dst.neighbour)
1615 neigh_event_send(rt->dst.neighbour, NULL);
1616 rt->rt_gateway = orig_gw;
1617 return -EAGAIN;
1618 } else {
1619 rt->rt_flags |= RTCF_REDIRECTED;
1620 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1621 rt->dst.neighbour);
1708 } 1622 }
1623 return 0;
1709} 1624}
1710 1625
1711static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1626static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1712{ 1627{
1713 if (rt_is_expired((struct rtable *)dst))
1628 struct rtable *rt = (struct rtable *) dst;
1629
1630 if (rt_is_expired(rt))
1714 return NULL; 1631 return NULL;
1632 if (rt->rt_peer_genid != rt_peer_genid()) {
1633 struct inet_peer *peer;
1634
1635 if (!rt->peer)
1636 rt_bind_peer(rt, 0);
1637
1638 peer = rt->peer;
1639 if (peer && peer->pmtu_expires)
1640 check_peer_pmtu(dst, peer);
1641
1642 if (peer && peer->redirect_learned.a4 &&
1643 peer->redirect_learned.a4 != rt->rt_gateway) {
1644 if (check_peer_redir(dst, peer))
1645 return NULL;
1646 }
1647
1648 rt->rt_peer_genid = rt_peer_genid();
1649 }
1715 return dst; 1650 return dst;
1716} 1651}
1717 1652
@@ -1720,6 +1655,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1720 struct rtable *rt = (struct rtable *) dst; 1655 struct rtable *rt = (struct rtable *) dst;
1721 struct inet_peer *peer = rt->peer; 1656 struct inet_peer *peer = rt->peer;
1722 1657
1658 if (rt->fi) {
1659 fib_info_put(rt->fi);
1660 rt->fi = NULL;
1661 }
1723 if (peer) { 1662 if (peer) {
1724 rt->peer = NULL; 1663 rt->peer = NULL;
1725 inet_putpeer(peer); 1664 inet_putpeer(peer);
@@ -1734,8 +1673,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
1734 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1673 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1735 1674
1736 rt = skb_rtable(skb); 1675 rt = skb_rtable(skb);
1737 if (rt)
1738 dst_set_expires(&rt->dst, 0);
1676 if (rt &&
1677 rt->peer &&
1678 rt->peer->pmtu_expires) {
1679 unsigned long orig = rt->peer->pmtu_expires;
1680
1681 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1682 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1683 }
1739} 1684}
1740 1685
1741static int ip_rt_bug(struct sk_buff *skb) 1686static int ip_rt_bug(struct sk_buff *skb)
@@ -1764,8 +1709,17 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1764 if (rt_is_output_route(rt)) 1709 if (rt_is_output_route(rt))
1765 src = rt->rt_src; 1710 src = rt->rt_src;
1766 else { 1711 else {
1712 struct flowi4 fl4 = {
1713 .daddr = rt->rt_key_dst,
1714 .saddr = rt->rt_key_src,
1715 .flowi4_tos = rt->rt_tos,
1716 .flowi4_oif = rt->rt_oif,
1717 .flowi4_iif = rt->rt_iif,
1718 .flowi4_mark = rt->rt_mark,
1719 };
1720
1767 rcu_read_lock(); 1721 rcu_read_lock();
1768 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1722 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1769 src = FIB_RES_PREFSRC(res); 1723 src = FIB_RES_PREFSRC(res);
1770 else 1724 else
1771 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1725 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
@@ -1775,7 +1729,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1775 memcpy(addr, &src, 4); 1729 memcpy(addr, &src, 4);
1776} 1730}
1777 1731
1778#ifdef CONFIG_NET_CLS_ROUTE
1732#ifdef CONFIG_IP_ROUTE_CLASSID
1779static void set_class_tag(struct rtable *rt, u32 tag) 1733static void set_class_tag(struct rtable *rt, u32 tag)
1780{ 1734{
1781 if (!(rt->dst.tclassid & 0xFFFF)) 1735 if (!(rt->dst.tclassid & 0xFFFF))
@@ -1815,17 +1769,54 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1815 return mtu; 1769 return mtu;
1816} 1770}
1817 1771
1818static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1772static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4,
1773 struct fib_info *fi)
1774{
1775 struct inet_peer *peer;
1776 int create = 0;
1777
1778 /* If a peer entry exists for this destination, we must hook
1779 * it up in order to get at cached metrics.
1780 */
1781 if (oldflp4 && (oldflp4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1782 create = 1;
1783
1784 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1785 if (peer) {
1786 rt->rt_peer_genid = rt_peer_genid();
1787 if (inet_metrics_new(peer))
1788 memcpy(peer->metrics, fi->fib_metrics,
1789 sizeof(u32) * RTAX_MAX);
1790 dst_init_metrics(&rt->dst, peer->metrics, false);
1791
1792 if (peer->pmtu_expires)
1793 check_peer_pmtu(&rt->dst, peer);
1794 if (peer->redirect_learned.a4 &&
1795 peer->redirect_learned.a4 != rt->rt_gateway) {
1796 rt->rt_gateway = peer->redirect_learned.a4;
1797 rt->rt_flags |= RTCF_REDIRECTED;
1798 }
1799 } else {
1800 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1801 rt->fi = fi;
1802 atomic_inc(&fi->fib_clntref);
1803 }
1804 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1805 }
1806}
1807
1808static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4,
1809 const struct fib_result *res,
1810 struct fib_info *fi, u16 type, u32 itag)
1819{ 1811{
1820 struct dst_entry *dst = &rt->dst; 1812 struct dst_entry *dst = &rt->dst;
1821 struct fib_info *fi = res->fi;
1822 1813
1823 if (fi) { 1814 if (fi) {
1824 if (FIB_RES_GW(*res) && 1815 if (FIB_RES_GW(*res) &&
1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1816 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826 rt->rt_gateway = FIB_RES_GW(*res); 1817 rt->rt_gateway = FIB_RES_GW(*res);
1827 dst_import_metrics(dst, fi->fib_metrics);
1828#ifdef CONFIG_NET_CLS_ROUTE
1818 rt_init_metrics(rt, oldflp4, fi);
1819#ifdef CONFIG_IP_ROUTE_CLASSID
1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1820 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1830#endif 1821#endif
1831 } 1822 }
@@ -1835,13 +1826,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) 1826 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); 1827 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1837 1828
1838#ifdef CONFIG_NET_CLS_ROUTE
1829#ifdef CONFIG_IP_ROUTE_CLASSID
1839#ifdef CONFIG_IP_MULTIPLE_TABLES 1830#ifdef CONFIG_IP_MULTIPLE_TABLES
1840 set_class_tag(rt, fib_rules_tclass(res)); 1831 set_class_tag(rt, fib_rules_tclass(res));
1841#endif 1832#endif
1842 set_class_tag(rt, itag); 1833 set_class_tag(rt, itag);
1843#endif 1834#endif
1844 rt->rt_type = res->type;
1835 rt->rt_type = type;
1836}
1837
1838static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
1839{
1840 struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1);
1841 if (rt) {
1842 rt->dst.obsolete = -1;
1843
1844 rt->dst.flags = DST_HOST |
1845 (nopolicy ? DST_NOPOLICY : 0) |
1846 (noxfrm ? DST_NOXFRM : 0);
1847 }
1848 return rt;
1845} 1849}
1846 1850
1847/* called in rcu_read_lock() section */ 1851/* called in rcu_read_lock() section */
@@ -1874,31 +1878,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1874 if (err < 0) 1878 if (err < 0)
1875 goto e_err; 1879 goto e_err;
1876 } 1880 }
1877 rth = dst_alloc(&ipv4_dst_ops); 1881 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1878 if (!rth) 1882 if (!rth)
1879 goto e_nobufs; 1883 goto e_nobufs;
1880 1884
1881 rth->dst.output = ip_rt_bug; 1885 rth->dst.output = ip_rt_bug;
1882 rth->dst.obsolete = -1;
1883 1886
1884 atomic_set(&rth->dst.__refcnt, 1); 1887 rth->rt_key_dst = daddr;
1885 rth->dst.flags= DST_HOST;
1886 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1887 rth->dst.flags |= DST_NOPOLICY;
1888 rth->fl.fl4_dst = daddr;
1889 rth->rt_dst = daddr; 1888 rth->rt_dst = daddr;
1890 rth->fl.fl4_tos = tos; 1889 rth->rt_tos = tos;
1891 rth->fl.mark = skb->mark; 1890 rth->rt_mark = skb->mark;
1892 rth->fl.fl4_src = saddr; 1891 rth->rt_key_src = saddr;
1893 rth->rt_src = saddr; 1892 rth->rt_src = saddr;
1894#ifdef CONFIG_NET_CLS_ROUTE 1893#ifdef CONFIG_IP_ROUTE_CLASSID
1895 rth->dst.tclassid = itag; 1894 rth->dst.tclassid = itag;
1896#endif 1895#endif
1897 rth->rt_iif = 1896 rth->rt_iif = dev->ifindex;
1898 rth->fl.iif = dev->ifindex;
1899 rth->dst.dev = init_net.loopback_dev; 1897 rth->dst.dev = init_net.loopback_dev;
1900 dev_hold(rth->dst.dev); 1898 dev_hold(rth->dst.dev);
1901 rth->fl.oif = 0; 1899 rth->rt_oif = 0;
1902 rth->rt_gateway = daddr; 1900 rth->rt_gateway = daddr;
1903 rth->rt_spec_dst= spec_dst; 1901 rth->rt_spec_dst= spec_dst;
1904 rth->rt_genid = rt_genid(dev_net(dev)); 1902 rth->rt_genid = rt_genid(dev_net(dev));
@@ -1916,7 +1914,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1916 RT_CACHE_STAT_INC(in_slow_mc); 1914 RT_CACHE_STAT_INC(in_slow_mc);
1917 1915
1918 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1916 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1919 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1917 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1918 err = 0;
1919 if (IS_ERR(rth))
1920 err = PTR_ERR(rth);
1920 1921
1921e_nobufs: 1922e_nobufs:
1922 return -ENOBUFS; 1923 return -ENOBUFS;
@@ -1959,7 +1960,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1959 1960
1960/* called in rcu_read_lock() section */ 1961/* called in rcu_read_lock() section */
1961static int __mkroute_input(struct sk_buff *skb, 1962static int __mkroute_input(struct sk_buff *skb,
1962 struct fib_result *res, 1963 const struct fib_result *res,
1963 struct in_device *in_dev, 1964 struct in_device *in_dev,
1964 __be32 daddr, __be32 saddr, u32 tos, 1965 __be32 daddr, __be32 saddr, u32 tos,
1965 struct rtable **result) 1966 struct rtable **result)
@@ -2013,39 +2014,31 @@ static int __mkroute_input(struct sk_buff *skb,
2013 } 2014 }
2014 } 2015 }
2015 2016
2016 2017 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2017 rth = dst_alloc(&ipv4_dst_ops); 2018 IN_DEV_CONF_GET(out_dev, NOXFRM));
2018 if (!rth) { 2019 if (!rth) {
2019 err = -ENOBUFS; 2020 err = -ENOBUFS;
2020 goto cleanup; 2021 goto cleanup;
2021 } 2022 }
2022 2023
2023 atomic_set(&rth->dst.__refcnt, 1); 2024 rth->rt_key_dst = daddr;
2024 rth->dst.flags= DST_HOST;
2025 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026 rth->dst.flags |= DST_NOPOLICY;
2027 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028 rth->dst.flags |= DST_NOXFRM;
2029 rth->fl.fl4_dst = daddr;
2030 rth->rt_dst = daddr; 2025 rth->rt_dst = daddr;
2031 rth->fl.fl4_tos = tos; 2026 rth->rt_tos = tos;
2032 rth->fl.mark = skb->mark; 2027 rth->rt_mark = skb->mark;
2033 rth->fl.fl4_src = saddr; 2028 rth->rt_key_src = saddr;
2034 rth->rt_src = saddr; 2029 rth->rt_src = saddr;
2035 rth->rt_gateway = daddr; 2030 rth->rt_gateway = daddr;
2036 rth->rt_iif = 2031 rth->rt_iif = in_dev->dev->ifindex;
2037 rth->fl.iif = in_dev->dev->ifindex;
2038 rth->dst.dev = (out_dev)->dev; 2032 rth->dst.dev = (out_dev)->dev;
2039 dev_hold(rth->dst.dev); 2033 dev_hold(rth->dst.dev);
2040 rth->fl.oif = 0; 2034 rth->rt_oif = 0;
2041 rth->rt_spec_dst= spec_dst; 2035 rth->rt_spec_dst= spec_dst;
2042 2036
2043 rth->dst.obsolete = -1;
2044 rth->dst.input = ip_forward; 2037 rth->dst.input = ip_forward;
2045 rth->dst.output = ip_output; 2038 rth->dst.output = ip_output;
2046 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2039 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2047 2040
2048 rt_set_nexthop(rth, res, itag); 2041 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2049 2042
2050 rth->rt_flags = flags; 2043 rth->rt_flags = flags;
2051 2044
@@ -2057,7 +2050,7 @@ static int __mkroute_input(struct sk_buff *skb,
2057 2050
2058static int ip_mkroute_input(struct sk_buff *skb, 2051static int ip_mkroute_input(struct sk_buff *skb,
2059 struct fib_result *res, 2052 struct fib_result *res,
2060 const struct flowi *fl, 2053 const struct flowi4 *fl4,
2061 struct in_device *in_dev, 2054 struct in_device *in_dev,
2062 __be32 daddr, __be32 saddr, u32 tos) 2055 __be32 daddr, __be32 saddr, u32 tos)
2063{ 2056{
@@ -2066,8 +2059,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
2066 unsigned hash; 2059 unsigned hash;
2067 2060
2068#ifdef CONFIG_IP_ROUTE_MULTIPATH 2061#ifdef CONFIG_IP_ROUTE_MULTIPATH
2069 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) 2062 if (res->fi && res->fi->fib_nhs > 1)
2070 fib_select_multipath(fl, res); 2063 fib_select_multipath(res);
2071#endif 2064#endif
2072 2065
2073 /* create a routing cache entry */ 2066 /* create a routing cache entry */
@@ -2076,9 +2069,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
2076 return err; 2069 return err;
2077 2070
2078 /* put it into the cache */ 2071 /* put it into the cache */
2079 hash = rt_hash(daddr, saddr, fl->iif, 2072 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2080 rt_genid(dev_net(rth->dst.dev))); 2073 rt_genid(dev_net(rth->dst.dev)));
2081 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2074 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2075 if (IS_ERR(rth))
2076 return PTR_ERR(rth);
2077 return 0;
2082} 2078}
2083 2079
2084/* 2080/*
@@ -2097,12 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2097{ 2093{
2098 struct fib_result res; 2094 struct fib_result res;
2099 struct in_device *in_dev = __in_dev_get_rcu(dev); 2095 struct in_device *in_dev = __in_dev_get_rcu(dev);
2100 struct flowi fl = { .fl4_dst = daddr, 2096 struct flowi4 fl4;
2101 .fl4_src = saddr,
2102 .fl4_tos = tos,
2103 .fl4_scope = RT_SCOPE_UNIVERSE,
2104 .mark = skb->mark,
2105 .iif = dev->ifindex };
2106 unsigned flags = 0; 2097 unsigned flags = 0;
2107 u32 itag = 0; 2098 u32 itag = 0;
2108 struct rtable * rth; 2099 struct rtable * rth;
@@ -2139,7 +2130,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2139 /* 2130 /*
2140 * Now we are ready to route packet. 2131 * Now we are ready to route packet.
2141 */ 2132 */
2142 err = fib_lookup(net, &fl, &res); 2133 fl4.flowi4_oif = 0;
2134 fl4.flowi4_iif = dev->ifindex;
2135 fl4.flowi4_mark = skb->mark;
2136 fl4.flowi4_tos = tos;
2137 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2138 fl4.daddr = daddr;
2139 fl4.saddr = saddr;
2140 err = fib_lookup(net, &fl4, &res);
2143 if (err != 0) { 2141 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2142 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2143 goto e_hostunreach;
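
The on-stack struct flowi4 replaces the old struct flowi as the lookup key throughout this change. A hedged sketch of filling one for fib_lookup(), using only the field names the hunk above introduces; lookup_example() itself is hypothetical:

#include <net/route.h>
#include <net/ip_fib.h>

static int lookup_example(struct net *net, struct net_device *dev,
			  __be32 daddr, __be32 saddr, u8 tos,
			  struct fib_result *res)
{
	struct flowi4 fl4 = {
		.flowi4_iif = dev->ifindex,	/* input route: oif stays 0 */
		.flowi4_tos = tos,
		.flowi4_scope = RT_SCOPE_UNIVERSE,
		.daddr = daddr,
		.saddr = saddr,
	};

	return fib_lookup(net, &fl4, res);	/* 0 on success */
}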
@@ -2168,7 +2166,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2168 if (res.type != RTN_UNICAST) 2166 if (res.type != RTN_UNICAST)
2169 goto martian_destination; 2167 goto martian_destination;
2170 2168
2171 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2169 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2172out: return err; 2170out: return err;
2173 2171
2174brd_input: 2172brd_input:
@@ -2190,29 +2188,23 @@ brd_input:
2190 RT_CACHE_STAT_INC(in_brd); 2188 RT_CACHE_STAT_INC(in_brd);
2191 2189
2192local_input: 2190local_input:
2193 rth = dst_alloc(&ipv4_dst_ops); 2191 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2194 if (!rth) 2192 if (!rth)
2195 goto e_nobufs; 2193 goto e_nobufs;
2196 2194
2197 rth->dst.output= ip_rt_bug; 2195 rth->dst.output= ip_rt_bug;
2198 rth->dst.obsolete = -1;
2199 rth->rt_genid = rt_genid(net); 2196 rth->rt_genid = rt_genid(net);
2200 2197
2201 atomic_set(&rth->dst.__refcnt, 1); 2198 rth->rt_key_dst = daddr;
2202 rth->dst.flags= DST_HOST;
2203 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2204 rth->dst.flags |= DST_NOPOLICY;
2205 rth->fl.fl4_dst = daddr;
2206 rth->rt_dst = daddr; 2199 rth->rt_dst = daddr;
2207 rth->fl.fl4_tos = tos; 2200 rth->rt_tos = tos;
2208 rth->fl.mark = skb->mark; 2201 rth->rt_mark = skb->mark;
2209 rth->fl.fl4_src = saddr; 2202 rth->rt_key_src = saddr;
2210 rth->rt_src = saddr; 2203 rth->rt_src = saddr;
2211#ifdef CONFIG_NET_CLS_ROUTE 2204#ifdef CONFIG_IP_ROUTE_CLASSID
2212 rth->dst.tclassid = itag; 2205 rth->dst.tclassid = itag;
2213#endif 2206#endif
2214 rth->rt_iif = 2207 rth->rt_iif = dev->ifindex;
2215 rth->fl.iif = dev->ifindex;
2216 rth->dst.dev = net->loopback_dev; 2208 rth->dst.dev = net->loopback_dev;
2217 dev_hold(rth->dst.dev); 2209 dev_hold(rth->dst.dev);
2218 rth->rt_gateway = daddr; 2210 rth->rt_gateway = daddr;
@@ -2225,8 +2217,11 @@ local_input:
2225 rth->rt_flags &= ~RTCF_LOCAL; 2217 rth->rt_flags &= ~RTCF_LOCAL;
2226 } 2218 }
2227 rth->rt_type = res.type; 2219 rth->rt_type = res.type;
2228 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2229 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2220 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2221 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2222 err = 0;
2223 if (IS_ERR(rth))
2224 err = PTR_ERR(rth);
2230 goto out; 2225 goto out;
2231 2226
2232no_route: 2227no_route:
@@ -2288,12 +2283,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2288 2283
2289 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2284 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2290 rth = rcu_dereference(rth->dst.rt_next)) { 2285 rth = rcu_dereference(rth->dst.rt_next)) {
2291 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2292 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2293 (rth->fl.iif ^ iif) |
2294 rth->fl.oif |
2295 (rth->fl.fl4_tos ^ tos)) == 0 &&
2296 rth->fl.mark == skb->mark &&
2286 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2287 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2288 (rth->rt_iif ^ iif) |
2289 rth->rt_oif |
2290 (rth->rt_tos ^ tos)) == 0 &&
2291 rth->rt_mark == skb->mark &&
2297 net_eq(dev_net(rth->dst.dev), net) && 2292 net_eq(dev_net(rth->dst.dev), net) &&
2298 !rt_is_expired(rth)) { 2293 !rt_is_expired(rth)) {
2299 if (noref) { 2294 if (noref) {
@@ -2326,8 +2321,8 @@ skip_cache:
2326 struct in_device *in_dev = __in_dev_get_rcu(dev); 2321 struct in_device *in_dev = __in_dev_get_rcu(dev);
2327 2322
2328 if (in_dev) { 2323 if (in_dev) {
2329 int our = ip_check_mc(in_dev, daddr, saddr,
2324 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2330 ip_hdr(skb)->protocol); 2325 ip_hdr(skb)->protocol);
2331 if (our 2326 if (our
2332#ifdef CONFIG_IP_MROUTE 2327#ifdef CONFIG_IP_MROUTE
2333 || 2328 ||
@@ -2351,98 +2346,91 @@ skip_cache:
2351EXPORT_SYMBOL(ip_route_input_common); 2346EXPORT_SYMBOL(ip_route_input_common);
2352 2347
2353/* called with rcu_read_lock() */ 2348/* called with rcu_read_lock() */
2354static int __mkroute_output(struct rtable **result, 2349static struct rtable *__mkroute_output(const struct fib_result *res,
2355 struct fib_result *res, 2350 const struct flowi4 *fl4,
2356 const struct flowi *fl, 2351 const struct flowi4 *oldflp4,
2357 const struct flowi *oldflp, 2352 struct net_device *dev_out,
2358 struct net_device *dev_out, 2353 unsigned int flags)
2359 unsigned flags)
2360{ 2354{
2361 struct rtable *rth; 2355 struct fib_info *fi = res->fi;
2356 u32 tos = RT_FL_TOS(oldflp4);
2362 struct in_device *in_dev; 2357 struct in_device *in_dev;
2363 u32 tos = RT_FL_TOS(oldflp); 2358 u16 type = res->type;
2359 struct rtable *rth;
2364 2360
2365 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) 2361 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2366 return -EINVAL; 2362 return ERR_PTR(-EINVAL);
2367 2363
2368 if (ipv4_is_lbcast(fl->fl4_dst)) 2364 if (ipv4_is_lbcast(fl4->daddr))
2369 res->type = RTN_BROADCAST; 2365 type = RTN_BROADCAST;
2370 else if (ipv4_is_multicast(fl->fl4_dst)) 2366 else if (ipv4_is_multicast(fl4->daddr))
2371 res->type = RTN_MULTICAST; 2367 type = RTN_MULTICAST;
2372 else if (ipv4_is_zeronet(fl->fl4_dst)) 2368 else if (ipv4_is_zeronet(fl4->daddr))
2373 return -EINVAL; 2369 return ERR_PTR(-EINVAL);
2374 2370
2375 if (dev_out->flags & IFF_LOOPBACK) 2371 if (dev_out->flags & IFF_LOOPBACK)
2376 flags |= RTCF_LOCAL; 2372 flags |= RTCF_LOCAL;
2377 2373
2378 in_dev = __in_dev_get_rcu(dev_out); 2374 in_dev = __in_dev_get_rcu(dev_out);
2379 if (!in_dev) 2375 if (!in_dev)
2380 return -EINVAL; 2376 return ERR_PTR(-EINVAL);
2381 2377
2382 if (res->type == RTN_BROADCAST) { 2378 if (type == RTN_BROADCAST) {
2383 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2379 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384 res->fi = NULL; 2380 fi = NULL;
2385 } else if (res->type == RTN_MULTICAST) { 2381 } else if (type == RTN_MULTICAST) {
2386 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2382 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2387 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2383 if (!ip_check_mc_rcu(in_dev, oldflp4->daddr, oldflp4->saddr,
2388 oldflp->proto)) 2384 oldflp4->flowi4_proto))
2389 flags &= ~RTCF_LOCAL; 2385 flags &= ~RTCF_LOCAL;
2390 /* If a multicast route does not exist, use 2386 /* If a multicast route does not exist, use
2391 * the default one, but do not gateway in this case. 2387 * the default one, but do not gateway in this case.
2392 * Yes, it is a hack. 2388 * Yes, it is a hack.
2393 */ 2389 */
2394 if (res->fi && res->prefixlen < 4) 2390 if (fi && res->prefixlen < 4)
2395 res->fi = NULL; 2391 fi = NULL;
2396 } 2392 }
2397 2393
2398 2394 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2399 rth = dst_alloc(&ipv4_dst_ops); 2395 IN_DEV_CONF_GET(in_dev, NOXFRM));
2400 if (!rth) 2396 if (!rth)
2401 return -ENOBUFS; 2397 return ERR_PTR(-ENOBUFS);
2402 2398
2403 atomic_set(&rth->dst.__refcnt, 1); 2399 rth->rt_key_dst = oldflp4->daddr;
2404 rth->dst.flags= DST_HOST; 2400 rth->rt_tos = tos;
2405 if (IN_DEV_CONF_GET(in_dev, NOXFRM)) 2401 rth->rt_key_src = oldflp4->saddr;
2406 rth->dst.flags |= DST_NOXFRM; 2402 rth->rt_oif = oldflp4->flowi4_oif;
2407 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2403 rth->rt_mark = oldflp4->flowi4_mark;
2408 rth->dst.flags |= DST_NOPOLICY; 2404 rth->rt_dst = fl4->daddr;
2409 2405 rth->rt_src = fl4->saddr;
2410 rth->fl.fl4_dst = oldflp->fl4_dst; 2406 rth->rt_iif = 0;
2411 rth->fl.fl4_tos = tos;
2412 rth->fl.fl4_src = oldflp->fl4_src;
2413 rth->fl.oif = oldflp->oif;
2414 rth->fl.mark = oldflp->mark;
2415 rth->rt_dst = fl->fl4_dst;
2416 rth->rt_src = fl->fl4_src;
2417 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2418 /* get references to the devices that are to be held by the routing 2407
2419 cache entry */ 2408 cache entry */
2420 rth->dst.dev = dev_out; 2409 rth->dst.dev = dev_out;
2421 dev_hold(dev_out); 2410 dev_hold(dev_out);
2422 rth->rt_gateway = fl->fl4_dst; 2411 rth->rt_gateway = fl4->daddr;
2423 rth->rt_spec_dst= fl->fl4_src; 2412 rth->rt_spec_dst= fl4->saddr;
2424 2413
2425 rth->dst.output=ip_output; 2414 rth->dst.output=ip_output;
2426 rth->dst.obsolete = -1;
2427 rth->rt_genid = rt_genid(dev_net(dev_out)); 2415 rth->rt_genid = rt_genid(dev_net(dev_out));
2428 2416
2429 RT_CACHE_STAT_INC(out_slow_tot); 2417 RT_CACHE_STAT_INC(out_slow_tot);
2430 2418
2431 if (flags & RTCF_LOCAL) { 2419 if (flags & RTCF_LOCAL) {
2432 rth->dst.input = ip_local_deliver; 2420 rth->dst.input = ip_local_deliver;
2433 rth->rt_spec_dst = fl->fl4_dst; 2421 rth->rt_spec_dst = fl4->daddr;
2434 } 2422 }
2435 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2423 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2436 rth->rt_spec_dst = fl->fl4_src; 2424 rth->rt_spec_dst = fl4->saddr;
2437 if (flags & RTCF_LOCAL && 2425 if (flags & RTCF_LOCAL &&
2438 !(dev_out->flags & IFF_LOOPBACK)) { 2426 !(dev_out->flags & IFF_LOOPBACK)) {
2439 rth->dst.output = ip_mc_output; 2427 rth->dst.output = ip_mc_output;
2440 RT_CACHE_STAT_INC(out_slow_mc); 2428 RT_CACHE_STAT_INC(out_slow_mc);
2441 } 2429 }
2442#ifdef CONFIG_IP_MROUTE 2430#ifdef CONFIG_IP_MROUTE
2443 if (res->type == RTN_MULTICAST) { 2431 if (type == RTN_MULTICAST) {
2444 if (IN_DEV_MFORWARD(in_dev) && 2432 if (IN_DEV_MFORWARD(in_dev) &&
2445 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2433 !ipv4_is_local_multicast(oldflp4->daddr)) {
2446 rth->dst.input = ip_mr_input; 2434 rth->dst.input = ip_mr_input;
2447 rth->dst.output = ip_mc_output; 2435 rth->dst.output = ip_mc_output;
2448 } 2436 }
@@ -2450,31 +2438,10 @@ static int __mkroute_output(struct rtable **result,
2450#endif 2438#endif
2451 } 2439 }
2452 2440
2453 rt_set_nexthop(rth, res, 0); 2441 rt_set_nexthop(rth, oldflp4, res, fi, type, 0);
2454 2442
2455 rth->rt_flags = flags; 2443 rth->rt_flags = flags;
2456 *result = rth; 2444 return rth;
2457 return 0;
2458}
2459
2460/* called with rcu_read_lock() */
2461static int ip_mkroute_output(struct rtable **rp,
2462 struct fib_result *res,
2463 const struct flowi *fl,
2464 const struct flowi *oldflp,
2465 struct net_device *dev_out,
2466 unsigned flags)
2467{
2468 struct rtable *rth = NULL;
2469 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2470 unsigned hash;
2471 if (err == 0) {
2472 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2473 rt_genid(dev_net(dev_out)));
2474 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2475 }
2476
2477 return err;
2478} 2445}
2479 2446
2480/* 2447/*
@@ -2482,34 +2449,36 @@ static int ip_mkroute_output(struct rtable **rp,
2482 * called with rcu_read_lock(); 2449 * called with rcu_read_lock();
2483 */ 2450 */
2484 2451
2485static int ip_route_output_slow(struct net *net, struct rtable **rp, 2452static struct rtable *ip_route_output_slow(struct net *net,
2486 const struct flowi *oldflp) 2453 const struct flowi4 *oldflp4)
2487{ 2454{
2488 u32 tos = RT_FL_TOS(oldflp); 2455 u32 tos = RT_FL_TOS(oldflp4);
2489 struct flowi fl = { .fl4_dst = oldflp->fl4_dst, 2456 struct flowi4 fl4;
2490 .fl4_src = oldflp->fl4_src,
2491 .fl4_tos = tos & IPTOS_RT_MASK,
2492 .fl4_scope = ((tos & RTO_ONLINK) ?
2493 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2494 .mark = oldflp->mark,
2495 .iif = net->loopback_dev->ifindex,
2496 .oif = oldflp->oif };
2497 struct fib_result res; 2457 struct fib_result res;
2498 unsigned int flags = 0; 2458 unsigned int flags = 0;
2499 struct net_device *dev_out = NULL; 2459 struct net_device *dev_out = NULL;
2500 int err; 2460 struct rtable *rth;
2501
2502 2461
2503 res.fi = NULL; 2462 res.fi = NULL;
2504#ifdef CONFIG_IP_MULTIPLE_TABLES 2463#ifdef CONFIG_IP_MULTIPLE_TABLES
2505 res.r = NULL; 2464 res.r = NULL;
2506#endif 2465#endif
2507 2466
2508 if (oldflp->fl4_src) { 2467 fl4.flowi4_oif = oldflp4->flowi4_oif;
2509 err = -EINVAL; 2468 fl4.flowi4_iif = net->loopback_dev->ifindex;
2510 if (ipv4_is_multicast(oldflp->fl4_src) || 2469 fl4.flowi4_mark = oldflp4->flowi4_mark;
2511 ipv4_is_lbcast(oldflp->fl4_src) || 2470 fl4.daddr = oldflp4->daddr;
2512 ipv4_is_zeronet(oldflp->fl4_src)) 2471 fl4.saddr = oldflp4->saddr;
2472 fl4.flowi4_tos = tos & IPTOS_RT_MASK;
2473 fl4.flowi4_scope = ((tos & RTO_ONLINK) ?
2474 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2475
2476 rcu_read_lock();
2477 if (oldflp4->saddr) {
2478 rth = ERR_PTR(-EINVAL);
2479 if (ipv4_is_multicast(oldflp4->saddr) ||
2480 ipv4_is_lbcast(oldflp4->saddr) ||
2481 ipv4_is_zeronet(oldflp4->saddr))
2513 goto out; 2482 goto out;
2514 2483
2515 /* I removed check for oif == dev_out->oif here. 2484 /* I removed check for oif == dev_out->oif here.
@@ -2520,11 +2489,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2520 of another iface. --ANK 2489 of another iface. --ANK
2521 */ 2490 */
2522 2491
2523 if (oldflp->oif == 0 && 2492 if (oldflp4->flowi4_oif == 0 &&
2524 (ipv4_is_multicast(oldflp->fl4_dst) || 2493 (ipv4_is_multicast(oldflp4->daddr) ||
2525 ipv4_is_lbcast(oldflp->fl4_dst))) { 2494 ipv4_is_lbcast(oldflp4->daddr))) {
2526 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2495 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2527 dev_out = __ip_dev_find(net, oldflp->fl4_src, false); 2496 dev_out = __ip_dev_find(net, oldflp4->saddr, false);
2528 if (dev_out == NULL) 2497 if (dev_out == NULL)
2529 goto out; 2498 goto out;
2530 2499
@@ -2543,60 +2512,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2543 Luckily, this hack is a good workaround. 2512
2544 */ 2513 */
2545 2514
2546 fl.oif = dev_out->ifindex; 2515 fl4.flowi4_oif = dev_out->ifindex;
2547 goto make_route; 2516 goto make_route;
2548 } 2517 }
2549 2518
2550 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2519 if (!(oldflp4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2551 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2520 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2552 if (!__ip_dev_find(net, oldflp->fl4_src, false)) 2521 if (!__ip_dev_find(net, oldflp4->saddr, false))
2553 goto out; 2522 goto out;
2554 } 2523 }
2555 } 2524 }
2556 2525
2557 2526
2558 if (oldflp->oif) { 2527 if (oldflp4->flowi4_oif) {
2559 dev_out = dev_get_by_index_rcu(net, oldflp->oif); 2528 dev_out = dev_get_by_index_rcu(net, oldflp4->flowi4_oif);
2560 err = -ENODEV; 2529 rth = ERR_PTR(-ENODEV);
2561 if (dev_out == NULL) 2530 if (dev_out == NULL)
2562 goto out; 2531 goto out;
2563 2532
2564 /* RACE: Check return value of inet_select_addr instead. */ 2533 /* RACE: Check return value of inet_select_addr instead. */
2565 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2534 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2566 err = -ENETUNREACH; 2535 rth = ERR_PTR(-ENETUNREACH);
2567 goto out; 2536 goto out;
2568 } 2537 }
2569 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2538 if (ipv4_is_local_multicast(oldflp4->daddr) ||
2570 ipv4_is_lbcast(oldflp->fl4_dst)) { 2539 ipv4_is_lbcast(oldflp4->daddr)) {
2571 if (!fl.fl4_src) 2540 if (!fl4.saddr)
2572 fl.fl4_src = inet_select_addr(dev_out, 0, 2541 fl4.saddr = inet_select_addr(dev_out, 0,
2573 RT_SCOPE_LINK); 2542 RT_SCOPE_LINK);
2574 goto make_route; 2543 goto make_route;
2575 } 2544 }
2576 if (!fl.fl4_src) { 2545 if (!fl4.saddr) {
2577 if (ipv4_is_multicast(oldflp->fl4_dst)) 2546 if (ipv4_is_multicast(oldflp4->daddr))
2578 fl.fl4_src = inet_select_addr(dev_out, 0, 2547 fl4.saddr = inet_select_addr(dev_out, 0,
2579 fl.fl4_scope); 2548 fl4.flowi4_scope);
2580 else if (!oldflp->fl4_dst) 2549 else if (!oldflp4->daddr)
2581 fl.fl4_src = inet_select_addr(dev_out, 0, 2550 fl4.saddr = inet_select_addr(dev_out, 0,
2582 RT_SCOPE_HOST); 2551 RT_SCOPE_HOST);
2583 } 2552 }
2584 } 2553 }
2585 2554
2586 if (!fl.fl4_dst) { 2555 if (!fl4.daddr) {
2587 fl.fl4_dst = fl.fl4_src; 2556 fl4.daddr = fl4.saddr;
2588 if (!fl.fl4_dst) 2557 if (!fl4.daddr)
2589 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2558 fl4.daddr = fl4.saddr = htonl(INADDR_LOOPBACK);
2590 dev_out = net->loopback_dev; 2559 dev_out = net->loopback_dev;
2591 fl.oif = net->loopback_dev->ifindex; 2560 fl4.flowi4_oif = net->loopback_dev->ifindex;
2592 res.type = RTN_LOCAL; 2561 res.type = RTN_LOCAL;
2593 flags |= RTCF_LOCAL; 2562 flags |= RTCF_LOCAL;
2594 goto make_route; 2563 goto make_route;
2595 } 2564 }
2596 2565
2597 if (fib_lookup(net, &fl, &res)) { 2566 if (fib_lookup(net, &fl4, &res)) {
2598 res.fi = NULL; 2567 res.fi = NULL;
2599 if (oldflp->oif) { 2568 if (oldflp4->flowi4_oif) {
2600 /* Apparently, routing tables are wrong. Assume, 2569 /* Apparently, routing tables are wrong. Assume,
2601 that the destination is on link. 2570 that the destination is on link.
2602 2571
@@ -2615,90 +2584,93 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2615 likely IPv6, but we do not. 2584 likely IPv6, but we do not.
2616 */ 2585 */
2617 2586
2618 if (fl.fl4_src == 0) 2587 if (fl4.saddr == 0)
2619 fl.fl4_src = inet_select_addr(dev_out, 0, 2588 fl4.saddr = inet_select_addr(dev_out, 0,
2620 RT_SCOPE_LINK); 2589 RT_SCOPE_LINK);
2621 res.type = RTN_UNICAST; 2590 res.type = RTN_UNICAST;
2622 goto make_route; 2591 goto make_route;
2623 } 2592 }
2624 err = -ENETUNREACH; 2593 rth = ERR_PTR(-ENETUNREACH);
2625 goto out; 2594 goto out;
2626 } 2595 }
2627 2596
2628 if (res.type == RTN_LOCAL) { 2597 if (res.type == RTN_LOCAL) {
2629 if (!fl.fl4_src) { 2598 if (!fl4.saddr) {
2630 if (res.fi->fib_prefsrc) 2599 if (res.fi->fib_prefsrc)
2631 fl.fl4_src = res.fi->fib_prefsrc; 2600 fl4.saddr = res.fi->fib_prefsrc;
2632 else 2601 else
2633 fl.fl4_src = fl.fl4_dst; 2602 fl4.saddr = fl4.daddr;
2634 } 2603 }
2635 dev_out = net->loopback_dev; 2604 dev_out = net->loopback_dev;
2636 fl.oif = dev_out->ifindex; 2605 fl4.flowi4_oif = dev_out->ifindex;
2637 res.fi = NULL; 2606 res.fi = NULL;
2638 flags |= RTCF_LOCAL; 2607 flags |= RTCF_LOCAL;
2639 goto make_route; 2608 goto make_route;
2640 } 2609 }
2641 2610
2642#ifdef CONFIG_IP_ROUTE_MULTIPATH 2611#ifdef CONFIG_IP_ROUTE_MULTIPATH
2643 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2612 if (res.fi->fib_nhs > 1 && fl4.flowi4_oif == 0)
2644 fib_select_multipath(&fl, &res); 2613 fib_select_multipath(&res);
2645 else 2614 else
2646#endif 2615#endif
2647 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2616 if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif)
2648 fib_select_default(net, &fl, &res); 2617 fib_select_default(&res);
2649 2618
2650 if (!fl.fl4_src) 2619 if (!fl4.saddr)
2651 fl.fl4_src = FIB_RES_PREFSRC(res); 2620 fl4.saddr = FIB_RES_PREFSRC(res);
2652 2621
2653 dev_out = FIB_RES_DEV(res); 2622 dev_out = FIB_RES_DEV(res);
2654 fl.oif = dev_out->ifindex; 2623 fl4.flowi4_oif = dev_out->ifindex;
2655 2624
2656 2625
2657make_route: 2626make_route:
2658 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2627 rth = __mkroute_output(&res, &fl4, oldflp4, dev_out, flags);
2628 if (!IS_ERR(rth)) {
2629 unsigned int hash;
2659 2630
2660out: return err; 2631 hash = rt_hash(oldflp4->daddr, oldflp4->saddr, oldflp4->flowi4_oif,
2632 rt_genid(dev_net(dev_out)));
2633 rth = rt_intern_hash(hash, rth, NULL, oldflp4->flowi4_oif);
2634 }
2635
2636out:
2637 rcu_read_unlock();
2638 return rth;
2661} 2639}
2662 2640
2663int __ip_route_output_key(struct net *net, struct rtable **rp, 2641struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4)
2664 const struct flowi *flp)
2665{ 2642{
2666 unsigned int hash;
2667 int res;
2668 struct rtable *rth; 2643 struct rtable *rth;
2644 unsigned int hash;
2669 2645
2670 if (!rt_caching(net)) 2646 if (!rt_caching(net))
2671 goto slow_output; 2647 goto slow_output;
2672 2648
2673 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2649 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2674 2650
2675 rcu_read_lock_bh(); 2651 rcu_read_lock_bh();
2676 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2652 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2677 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2653 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2678 if (rth->fl.fl4_dst == flp->fl4_dst &&
2679 rth->fl.fl4_src == flp->fl4_src &&
2654 if (rth->rt_key_dst == flp4->daddr &&
2655 rth->rt_key_src == flp4->saddr &&
2680 rt_is_output_route(rth) && 2656 rt_is_output_route(rth) &&
2681 rth->fl.oif == flp->oif &&
2682 rth->fl.mark == flp->mark &&
2683 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2657 rth->rt_oif == flp4->flowi4_oif &&
2658 rth->rt_mark == flp4->flowi4_mark &&
2659 !((rth->rt_tos ^ flp4->flowi4_tos) &
2684 (IPTOS_RT_MASK | RTO_ONLINK)) && 2660 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2685 net_eq(dev_net(rth->dst.dev), net) && 2661 net_eq(dev_net(rth->dst.dev), net) &&
2686 !rt_is_expired(rth)) { 2662 !rt_is_expired(rth)) {
2687 dst_use(&rth->dst, jiffies); 2663 dst_use(&rth->dst, jiffies);
2688 RT_CACHE_STAT_INC(out_hit); 2664 RT_CACHE_STAT_INC(out_hit);
2689 rcu_read_unlock_bh(); 2665 rcu_read_unlock_bh();
2690 *rp = rth; 2666 return rth;
2691 return 0;
2692 } 2667 }
2693 RT_CACHE_STAT_INC(out_hlist_search); 2668 RT_CACHE_STAT_INC(out_hlist_search);
2694 } 2669 }
2695 rcu_read_unlock_bh(); 2670 rcu_read_unlock_bh();
2696 2671
2697slow_output: 2672slow_output:
2698 rcu_read_lock(); 2673 return ip_route_output_slow(net, flp4);
2699 res = ip_route_output_slow(net, rp, flp);
2700 rcu_read_unlock();
2701 return res;
2702} 2674}
2703EXPORT_SYMBOL_GPL(__ip_route_output_key); 2675EXPORT_SYMBOL_GPL(__ip_route_output_key);
2704 2676
@@ -2726,17 +2698,14 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2726 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2698 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2727}; 2699};
2728 2700
2729 2701struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2730static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2731{ 2702{
2732 struct rtable *ort = *rp; 2703 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, 1);
2733 struct rtable *rt = (struct rtable *) 2704 struct rtable *ort = (struct rtable *) dst_orig;
2734 dst_alloc(&ipv4_dst_blackhole_ops);
2735 2705
2736 if (rt) { 2706 if (rt) {
2737 struct dst_entry *new = &rt->dst; 2707 struct dst_entry *new = &rt->dst;
2738 2708
2739 atomic_set(&new->__refcnt, 1);
2740 new->__use = 1; 2709 new->__use = 1;
2741 new->input = dst_discard; 2710 new->input = dst_discard;
2742 new->output = dst_discard; 2711 new->output = dst_discard;
@@ -2746,7 +2715,12 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2746 if (new->dev) 2715 if (new->dev)
2747 dev_hold(new->dev); 2716 dev_hold(new->dev);
2748 2717
2749 rt->fl = ort->fl; 2718 rt->rt_key_dst = ort->rt_key_dst;
2719 rt->rt_key_src = ort->rt_key_src;
2720 rt->rt_tos = ort->rt_tos;
2721 rt->rt_iif = ort->rt_iif;
2722 rt->rt_oif = ort->rt_oif;
2723 rt->rt_mark = ort->rt_mark;
2750 2724
2751 rt->rt_genid = rt_genid(net); 2725 rt->rt_genid = rt_genid(net);
2752 rt->rt_flags = ort->rt_flags; 2726 rt->rt_flags = ort->rt_flags;
@@ -2759,46 +2733,40 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2759 rt->peer = ort->peer; 2733 rt->peer = ort->peer;
2760 if (rt->peer) 2734 if (rt->peer)
2761 atomic_inc(&rt->peer->refcnt); 2735 atomic_inc(&rt->peer->refcnt);
2736 rt->fi = ort->fi;
2737 if (rt->fi)
2738 atomic_inc(&rt->fi->fib_clntref);
2762 2739
2763 dst_free(new); 2740 dst_free(new);
2764 } 2741 }
2765 2742
2766 dst_release(&(*rp)->dst); 2743 dst_release(dst_orig);
2767 *rp = rt; 2744
2768 return rt ? 0 : -ENOMEM; 2745 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2769} 2746}
2770 2747
2771int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2748struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2772 struct sock *sk, int flags) 2749 struct sock *sk)
2773{ 2750{
2774 int err; 2751 struct rtable *rt = __ip_route_output_key(net, flp4);
2775 2752
2776 if ((err = __ip_route_output_key(net, rp, flp)) != 0) 2753 if (IS_ERR(rt))
2777 return err; 2754 return rt;
2778 2755
2779 if (flp->proto) { 2756 if (flp4->flowi4_proto) {
2780 if (!flp->fl4_src) 2757 if (!flp4->saddr)
2781 flp->fl4_src = (*rp)->rt_src; 2758 flp4->saddr = rt->rt_src;
2782 if (!flp->fl4_dst) 2759 if (!flp4->daddr)
2783 flp->fl4_dst = (*rp)->rt_dst; 2760 flp4->daddr = rt->rt_dst;
2784 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, 2761 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2785 flags ? XFRM_LOOKUP_WAIT : 0); 2762 flowi4_to_flowi(flp4),
2786 if (err == -EREMOTE) 2763 sk, 0);
2787 err = ipv4_dst_blackhole(net, rp, flp);
2788
2789 return err;
2790 } 2764 }
2791 2765
2792 return 0; 2766 return rt;
2793} 2767}
2794EXPORT_SYMBOL_GPL(ip_route_output_flow); 2768EXPORT_SYMBOL_GPL(ip_route_output_flow);
2795 2769
2796int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2797{
2798 return ip_route_output_flow(net, rp, flp, NULL, 0);
2799}
2800EXPORT_SYMBOL(ip_route_output_key);
2801
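
With the int-returning wrapper gone, ip_route_output_key() hands back the rtable itself and callers check IS_ERR(), as the inet_rtm_getroute() hunk below does. A hedged sketch of a caller under that assumption; output_route_example() is hypothetical:

#include <linux/err.h>
#include <net/route.h>

static int output_route_example(struct net *net, __be32 dst, __be32 src)
{
	struct flowi4 fl4 = {
		.daddr = dst,
		.saddr = src,
	};
	struct rtable *rt = ip_route_output_key(net, &fl4);

	if (IS_ERR(rt))
		return PTR_ERR(rt);	/* the errno now rides in the pointer */
	ip_rt_put(rt);			/* drop the reference when done */
	return 0;
}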
2802static int rt_fill_info(struct net *net, 2770static int rt_fill_info(struct net *net,
2803 struct sk_buff *skb, u32 pid, u32 seq, int event, 2771 struct sk_buff *skb, u32 pid, u32 seq, int event,
2804 int nowait, unsigned int flags) 2772 int nowait, unsigned int flags)
@@ -2817,7 +2785,7 @@ static int rt_fill_info(struct net *net,
2817 r->rtm_family = AF_INET; 2785 r->rtm_family = AF_INET;
2818 r->rtm_dst_len = 32; 2786 r->rtm_dst_len = 32;
2819 r->rtm_src_len = 0; 2787 r->rtm_src_len = 0;
2820 r->rtm_tos = rt->fl.fl4_tos; 2788 r->rtm_tos = rt->rt_tos;
2821 r->rtm_table = RT_TABLE_MAIN; 2789 r->rtm_table = RT_TABLE_MAIN;
2822 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2790 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2823 r->rtm_type = rt->rt_type; 2791 r->rtm_type = rt->rt_type;
@@ -2829,19 +2797,19 @@ static int rt_fill_info(struct net *net,
2829 2797
2830 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2798 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2831 2799
2832 if (rt->fl.fl4_src) { 2800 if (rt->rt_key_src) {
2833 r->rtm_src_len = 32; 2801 r->rtm_src_len = 32;
2834 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2802 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2835 } 2803 }
2836 if (rt->dst.dev) 2804 if (rt->dst.dev)
2837 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2805 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2838#ifdef CONFIG_NET_CLS_ROUTE 2806#ifdef CONFIG_IP_ROUTE_CLASSID
2839 if (rt->dst.tclassid) 2807 if (rt->dst.tclassid)
2840 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2808 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2841#endif 2809#endif
2842 if (rt_is_input_route(rt)) 2810 if (rt_is_input_route(rt))
2843 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2811 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2844 else if (rt->rt_src != rt->fl.fl4_src) 2812 else if (rt->rt_src != rt->rt_key_src)
2845 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2813 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2846 2814
2847 if (rt->rt_dst != rt->rt_gateway) 2815 if (rt->rt_dst != rt->rt_gateway)
@@ -2850,11 +2818,12 @@ static int rt_fill_info(struct net *net,
2850 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2818 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2851 goto nla_put_failure; 2819 goto nla_put_failure;
2852 2820
2853 if (rt->fl.mark) 2821 if (rt->rt_mark)
2854 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); 2822 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2855 2823
2856 error = rt->dst.error; 2824 error = rt->dst.error;
2857 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 2825 expires = (rt->peer && rt->peer->pmtu_expires) ?
2826 rt->peer->pmtu_expires - jiffies : 0;
2858 if (rt->peer) { 2827 if (rt->peer) {
2859 inet_peer_refcheck(rt->peer); 2828 inet_peer_refcheck(rt->peer);
2860 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2829 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -2884,7 +2853,7 @@ static int rt_fill_info(struct net *net,
2884 } 2853 }
2885 } else 2854 } else
2886#endif 2855#endif
2887 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2856 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2888 } 2857 }
2889 2858
2890 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2859 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -2958,14 +2927,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2958 if (err == 0 && rt->dst.error) 2927 if (err == 0 && rt->dst.error)
2959 err = -rt->dst.error; 2928 err = -rt->dst.error;
2960 } else { 2929 } else {
2961 struct flowi fl = { 2930 struct flowi4 fl4 = {
2962 .fl4_dst = dst, 2931 .daddr = dst,
2963 .fl4_src = src, 2932 .saddr = src,
2964 .fl4_tos = rtm->rtm_tos, 2933 .flowi4_tos = rtm->rtm_tos,
2965 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, 2934 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2966 .mark = mark, 2935 .flowi4_mark = mark,
2967 }; 2936 };
2968 err = ip_route_output_key(net, &rt, &fl); 2937 rt = ip_route_output_key(net, &fl4);
2938
2939 err = 0;
2940 if (IS_ERR(rt))
2941 err = PTR_ERR(rt);
2969 } 2942 }
2970 2943
2971 if (err) 2944 if (err)
@@ -3256,9 +3229,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3256}; 3229};
3257 3230
3258 3231
3259#ifdef CONFIG_NET_CLS_ROUTE 3232#ifdef CONFIG_IP_ROUTE_CLASSID
3260struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3233struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3261#endif /* CONFIG_NET_CLS_ROUTE */ 3234#endif /* CONFIG_IP_ROUTE_CLASSID */
3262 3235
3263static __initdata unsigned long rhash_entries; 3236static __initdata unsigned long rhash_entries;
3264static int __init set_rhash_entries(char *str) 3237static int __init set_rhash_entries(char *str)
@@ -3274,7 +3247,7 @@ int __init ip_rt_init(void)
3274{ 3247{
3275 int rc = 0; 3248 int rc = 0;
3276 3249
3277#ifdef CONFIG_NET_CLS_ROUTE 3250#ifdef CONFIG_IP_ROUTE_CLASSID
3278 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3251 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3279 if (!ip_rt_acct) 3252 if (!ip_rt_acct)
3280 panic("IP: failed to allocate ip_rt_acct\n"); 3253 panic("IP: failed to allocate ip_rt_acct\n");
@@ -3311,14 +3284,6 @@ int __init ip_rt_init(void)
3311 devinet_init(); 3284 devinet_init();
3312 ip_fib_init(); 3285 ip_fib_init();
3313 3286
3314 /* All the timers, started at system startup tend
3315 to synchronize. Perturb it a bit.
3316 */
3317 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3318 expires_ljiffies = jiffies;
3319 schedule_delayed_work(&expires_work,
3320 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3321
3322 if (ip_rt_proc_init()) 3287 if (ip_rt_proc_init())
3323 printk(KERN_ERR "Unable to create route proc files\n"); 3288 printk(KERN_ERR "Unable to create route proc files\n");
3324#ifdef CONFIG_XFRM 3289#ifdef CONFIG_XFRM
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 47519205a014..8b44c6d2a79b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -345,17 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
345 * no easy way to do this. 345 * no easy way to do this.
346 */ 346 */
347 { 347 {
348 struct flowi fl = { .mark = sk->sk_mark, 348 struct flowi4 fl4 = {
349 .fl4_dst = ((opt && opt->srr) ? 349 .flowi4_mark = sk->sk_mark,
350 opt->faddr : ireq->rmt_addr), 350 .daddr = ((opt && opt->srr) ?
351 .fl4_src = ireq->loc_addr, 351 opt->faddr : ireq->rmt_addr),
352 .fl4_tos = RT_CONN_FLAGS(sk), 352 .saddr = ireq->loc_addr,
353 .proto = IPPROTO_TCP, 353 .flowi4_tos = RT_CONN_FLAGS(sk),
354 .flags = inet_sk_flowi_flags(sk), 354 .flowi4_proto = IPPROTO_TCP,
355 .fl_ip_sport = th->dest, 355 .flowi4_flags = inet_sk_flowi_flags(sk),
356 .fl_ip_dport = th->source }; 356 .fl4_sport = th->dest,
357 security_req_classify_flow(req, &fl); 357 .fl4_dport = th->source,
358 if (ip_route_output_key(sock_net(sk), &rt, &fl)) { 358 };
359 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
360 rt = ip_route_output_key(sock_net(sk), &fl4);
361 if (IS_ERR(rt)) {
359 reqsk_free(req); 362 reqsk_free(req);
360 goto out; 363 goto out;
361 } 364 }
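security_req_classify_flow() still takes the legacy struct flowi, so the new flowi4 is mapped back to its container rather than copied. A sketch of how flowi4_to_flowi() can work, assuming the IPv4 keys are embedded in a union inside struct flowi as this series suggests:

    static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
    {
        /* no copy: the IPv4 keys live at u.ip4 inside the union */
        return container_of(fl4, struct flowi, u.ip4);
    }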
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262c..b22d45010545 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
505 else 505 else
506 answ = tp->write_seq - tp->snd_una; 506 answ = tp->write_seq - tp->snd_una;
507 break; 507 break;
508 case SIOCOUTQNSD:
509 if (sk->sk_state == TCP_LISTEN)
510 return -EINVAL;
511
512 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
513 answ = 0;
514 else
515 answ = tp->write_seq - tp->snd_nxt;
516 break;
508 default: 517 default:
509 return -ENOIOCTLCMD; 518 return -ENOIOCTLCMD;
510 } 519 }
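SIOCOUTQ already reports everything still unacknowledged (write_seq - snd_una); the new SIOCOUTQNSD narrows that to bytes queued but not yet sent (write_seq - snd_nxt). A small userspace probe, with tcp_fd standing in for any connected TCP socket:

    #include <sys/ioctl.h>
    #include <linux/sockios.h>

    int queued, unsent;

    ioctl(tcp_fd, SIOCOUTQ, &queued);     /* unacked: write_seq - snd_una */
    ioctl(tcp_fd, SIOCOUTQNSD, &unsent);  /* unsent:  write_seq - snd_nxt */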
@@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
873 flags); 882 flags);
874 883
875 lock_sock(sk); 884 lock_sock(sk);
876 TCP_CHECK_TIMER(sk);
877 res = do_tcp_sendpages(sk, &page, offset, size, flags); 885 res = do_tcp_sendpages(sk, &page, offset, size, flags);
878 TCP_CHECK_TIMER(sk);
879 release_sock(sk); 886 release_sock(sk);
880 return res; 887 return res;
881} 888}
@@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
916 long timeo; 923 long timeo;
917 924
918 lock_sock(sk); 925 lock_sock(sk);
919 TCP_CHECK_TIMER(sk);
920 926
921 flags = msg->msg_flags; 927 flags = msg->msg_flags;
922 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 928 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -1104,7 +1110,6 @@ wait_for_memory:
1104out: 1110out:
1105 if (copied) 1111 if (copied)
1106 tcp_push(sk, flags, mss_now, tp->nonagle); 1112 tcp_push(sk, flags, mss_now, tp->nonagle);
1107 TCP_CHECK_TIMER(sk);
1108 release_sock(sk); 1113 release_sock(sk);
1109 return copied; 1114 return copied;
1110 1115
@@ -1123,7 +1128,6 @@ do_error:
1123 goto out; 1128 goto out;
1124out_err: 1129out_err:
1125 err = sk_stream_error(sk, flags, err); 1130 err = sk_stream_error(sk, flags, err);
1126 TCP_CHECK_TIMER(sk);
1127 release_sock(sk); 1131 release_sock(sk);
1128 return err; 1132 return err;
1129} 1133}
@@ -1415,8 +1419,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1415 1419
1416 lock_sock(sk); 1420 lock_sock(sk);
1417 1421
1418 TCP_CHECK_TIMER(sk);
1419
1420 err = -ENOTCONN; 1422 err = -ENOTCONN;
1421 if (sk->sk_state == TCP_LISTEN) 1423 if (sk->sk_state == TCP_LISTEN)
1422 goto out; 1424 goto out;
@@ -1767,12 +1769,10 @@ skip_copy:
1767 /* Clean up data we have read: This will do ACK frames. */ 1769 /* Clean up data we have read: This will do ACK frames. */
1768 tcp_cleanup_rbuf(sk, copied); 1770 tcp_cleanup_rbuf(sk, copied);
1769 1771
1770 TCP_CHECK_TIMER(sk);
1771 release_sock(sk); 1772 release_sock(sk);
1772 return copied; 1773 return copied;
1773 1774
1774out: 1775out:
1775 TCP_CHECK_TIMER(sk);
1776 release_sock(sk); 1776 release_sock(sk);
1777 return err; 1777 return err;
1778 1778
@@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2653EXPORT_SYMBOL(compat_tcp_getsockopt); 2653EXPORT_SYMBOL(compat_tcp_getsockopt);
2654#endif 2654#endif
2655 2655
2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2657{ 2657{
2658 struct sk_buff *segs = ERR_PTR(-EINVAL); 2658 struct sk_buff *segs = ERR_PTR(-EINVAL);
2659 struct tcphdr *th; 2659 struct tcphdr *th;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 3b53fd1af23f..6187eb4d1dcf 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
209} 209}
210 210
211 211
212static struct tcp_congestion_ops bictcp = { 212static struct tcp_congestion_ops bictcp __read_mostly = {
213 .init = bictcp_init, 213 .init = bictcp_init,
214 .ssthresh = bictcp_recalc_ssthresh, 214 .ssthresh = bictcp_recalc_ssthresh,
215 .cong_avoid = bictcp_cong_avoid, 215 .cong_avoid = bictcp_cong_avoid,
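The same one-line annotation is applied to every congestion-control ops table below. __read_mostly groups rarely-written variables into their own data section so they do not share cache lines with hot, frequently-written data; a natural fit for ops tables that are filled in once at registration and only read on the fast path afterwards. An illustrative, hypothetical module-local table:

    static struct tcp_congestion_ops example_cc __read_mostly = {
        .ssthresh   = tcp_reno_ssthresh,
        .cong_avoid = tcp_reno_cong_avoid,
        .min_cwnd   = tcp_reno_min_cwnd,
        .owner      = THIS_MODULE,
        .name       = "example",
    };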
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 90d92dd4cf13..34340c9c95fa 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -424,7 +424,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
424 hystart_update(sk, delay); 424 hystart_update(sk, delay);
425} 425}
426 426
427static struct tcp_congestion_ops cubictcp = { 427static struct tcp_congestion_ops cubictcp __read_mostly = {
428 .init = bictcp_init, 428 .init = bictcp_init,
429 .ssthresh = bictcp_recalc_ssthresh, 429 .ssthresh = bictcp_recalc_ssthresh,
430 .cong_avoid = bictcp_cong_avoid, 430 .cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b6caaf75bb9..30f27f6b3655 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
158} 158}
159 159
160 160
161static struct tcp_congestion_ops tcp_highspeed = { 161static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
162 .init = hstcp_init, 162 .init = hstcp_init,
163 .ssthresh = hstcp_ssthresh, 163 .ssthresh = hstcp_ssthresh,
164 .cong_avoid = hstcp_cong_avoid, 164 .cong_avoid = hstcp_cong_avoid,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 7c94a4955416..c1a8175361e8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
284 } 284 }
285} 285}
286 286
287static struct tcp_congestion_ops htcp = { 287static struct tcp_congestion_ops htcp __read_mostly = {
288 .init = htcp_init, 288 .init = htcp_init,
289 .ssthresh = htcp_recalc_ssthresh, 289 .ssthresh = htcp_recalc_ssthresh,
290 .cong_avoid = htcp_cong_avoid, 290 .cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 377bc9349371..fe3ecf484b44 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); 162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
163} 163}
164 164
165static struct tcp_congestion_ops tcp_hybla = { 165static struct tcp_congestion_ops tcp_hybla __read_mostly = {
166 .init = hybla_init, 166 .init = hybla_init,
167 .ssthresh = tcp_reno_ssthresh, 167 .ssthresh = tcp_reno_ssthresh,
168 .min_cwnd = tcp_reno_min_cwnd, 168 .min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 00ca688d8964..813b43a76fec 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
322 } 322 }
323} 323}
324 324
325static struct tcp_congestion_ops tcp_illinois = { 325static struct tcp_congestion_ops tcp_illinois __read_mostly = {
326 .flags = TCP_CONG_RTT_STAMP, 326 .flags = TCP_CONG_RTT_STAMP,
327 .init = tcp_illinois_init, 327 .init = tcp_illinois_init,
328 .ssthresh = tcp_illinois_ssthresh, 328 .ssthresh = tcp_illinois_ssthresh,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e16b17efcf57..da782e7ab16d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
818 818
819 if (!cwnd) 819 if (!cwnd)
820 cwnd = rfc3390_bytes_to_packets(tp->mss_cache); 820 cwnd = TCP_INIT_CWND;
821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
822} 822}
823 823
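For scale: the replaced helper derived the initial window from the MSS per RFC 3390, yielding 2 to 4 segments, while TCP_INIT_CWND is a flat 10 segments (IW10), used whenever the route carries no RTAX_INITCWND metric. The old rule, expressed in packets:

    /* RFC 3390: min(4*MSS, max(2*MSS, 4380 bytes)), as a segment count */
    static inline u32 rfc3390_bytes_to_packets(const u32 mss)
    {
        return mss <= 1095 ? 4 : (mss > 2190 ? 2 : 3);
    }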
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 02f583b3744a..f7e6c2c2d2bb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -149,9 +149,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 struct inet_sock *inet = inet_sk(sk); 149 struct inet_sock *inet = inet_sk(sk);
150 struct tcp_sock *tp = tcp_sk(sk); 150 struct tcp_sock *tp = tcp_sk(sk);
151 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 151 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
152 __be16 orig_sport, orig_dport;
152 struct rtable *rt; 153 struct rtable *rt;
153 __be32 daddr, nexthop; 154 __be32 daddr, nexthop;
154 int tmp;
155 int err; 155 int err;
156 156
157 if (addr_len < sizeof(struct sockaddr_in)) 157 if (addr_len < sizeof(struct sockaddr_in))
@@ -167,14 +167,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
167 nexthop = inet->opt->faddr; 167 nexthop = inet->opt->faddr;
168 } 168 }
169 169
170 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, 170 orig_sport = inet->inet_sport;
171 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 171 orig_dport = usin->sin_port;
172 IPPROTO_TCP, 172 rt = ip_route_connect(nexthop, inet->inet_saddr,
173 inet->inet_sport, usin->sin_port, sk, 1); 173 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 if (tmp < 0) { 174 IPPROTO_TCP,
175 if (tmp == -ENETUNREACH) 175 orig_sport, orig_dport, sk, true);
176 if (IS_ERR(rt)) {
177 err = PTR_ERR(rt);
178 if (err == -ENETUNREACH)
176 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 179 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 return tmp; 180 return err;
178 } 181 }
179 182
180 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 183 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
@@ -233,11 +236,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
233 if (err) 236 if (err)
234 goto failure; 237 goto failure;
235 238
236 err = ip_route_newports(&rt, IPPROTO_TCP, 239 rt = ip_route_newports(rt, IPPROTO_TCP,
237 inet->inet_sport, inet->inet_dport, sk); 240 orig_sport, orig_dport,
238 if (err) 241 inet->inet_sport, inet->inet_dport, sk);
242 if (IS_ERR(rt)) {
243 err = PTR_ERR(rt);
244 rt = NULL;
239 goto failure; 245 goto failure;
240 246 }
241 /* OK, now commit destination to socket. */ 247 /* OK, now commit destination to socket. */
242 sk->sk_gso_type = SKB_GSO_TCPV4; 248 sk->sk_gso_type = SKB_GSO_TCPV4;
243 sk_setup_caps(sk, &rt->dst); 249 sk_setup_caps(sk, &rt->dst);
@@ -1341,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1341 tcp_death_row.sysctl_tw_recycle && 1347 tcp_death_row.sysctl_tw_recycle &&
1342 (dst = inet_csk_route_req(sk, req)) != NULL && 1348 (dst = inet_csk_route_req(sk, req)) != NULL &&
1343 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1349 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1344 peer->daddr.a4 == saddr) { 1350 peer->daddr.addr.a4 == saddr) {
1345 inet_peer_refcheck(peer); 1351 inet_peer_refcheck(peer);
1346 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1352 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1347 (s32)(peer->tcp_ts - req->ts_recent) > 1353 (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1556,12 +1562,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1556 1562
1557 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1563 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1558 sock_rps_save_rxhash(sk, skb->rxhash); 1564 sock_rps_save_rxhash(sk, skb->rxhash);
1559 TCP_CHECK_TIMER(sk);
1560 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1565 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1561 rsk = sk; 1566 rsk = sk;
1562 goto reset; 1567 goto reset;
1563 } 1568 }
1564 TCP_CHECK_TIMER(sk);
1565 return 0; 1569 return 0;
1566 } 1570 }
1567 1571
@@ -1583,13 +1587,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1583 } else 1587 } else
1584 sock_rps_save_rxhash(sk, skb->rxhash); 1588 sock_rps_save_rxhash(sk, skb->rxhash);
1585 1589
1586
1587 TCP_CHECK_TIMER(sk);
1588 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1590 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1589 rsk = sk; 1591 rsk = sk;
1590 goto reset; 1592 goto reset;
1591 } 1593 }
1592 TCP_CHECK_TIMER(sk);
1593 return 0; 1594 return 0;
1594 1595
1595reset: 1596reset:
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index de870377fbba..656d431c99ad 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
313 lp->last_drop = tcp_time_stamp; 313 lp->last_drop = tcp_time_stamp;
314} 314}
315 315
316static struct tcp_congestion_ops tcp_lp = { 316static struct tcp_congestion_ops tcp_lp __read_mostly = {
317 .flags = TCP_CONG_RTT_STAMP, 317 .flags = TCP_CONG_RTT_STAMP,
318 .init = tcp_lp_init, 318 .init = tcp_lp_init,
319 .ssthresh = tcp_reno_ssthresh, 319 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index a76513779e2b..8ce55b8aaec8 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
35} 35}
36 36
37 37
38static struct tcp_congestion_ops tcp_scalable = { 38static struct tcp_congestion_ops tcp_scalable __read_mostly = {
39 .ssthresh = tcp_scalable_ssthresh, 39 .ssthresh = tcp_scalable_ssthresh,
40 .cong_avoid = tcp_scalable_cong_avoid, 40 .cong_avoid = tcp_scalable_cong_avoid,
41 .min_cwnd = tcp_reno_min_cwnd, 41 .min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74a6aa003657..ecd44b0c45f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data)
259 tcp_send_ack(sk); 259 tcp_send_ack(sk);
260 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); 260 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
261 } 261 }
262 TCP_CHECK_TIMER(sk);
263 262
264out: 263out:
265 if (tcp_memory_pressure) 264 if (tcp_memory_pressure)
@@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data)
481 tcp_probe_timer(sk); 480 tcp_probe_timer(sk);
482 break; 481 break;
483 } 482 }
484 TCP_CHECK_TIMER(sk);
485 483
486out: 484out:
487 sk_mem_reclaim(sk); 485 sk_mem_reclaim(sk);
@@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data)
589 elapsed = keepalive_time_when(tp) - elapsed; 587 elapsed = keepalive_time_when(tp) - elapsed;
590 } 588 }
591 589
592 TCP_CHECK_TIMER(sk);
593 sk_mem_reclaim(sk); 590 sk_mem_reclaim(sk);
594 591
595resched: 592resched:
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c6743eec9b7d..80fa2bfd7ede 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
304} 304}
305EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 305EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
306 306
307static struct tcp_congestion_ops tcp_vegas = { 307static struct tcp_congestion_ops tcp_vegas __read_mostly = {
308 .flags = TCP_CONG_RTT_STAMP, 308 .flags = TCP_CONG_RTT_STAMP,
309 .init = tcp_vegas_init, 309 .init = tcp_vegas_init,
310 .ssthresh = tcp_reno_ssthresh, 310 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 38bc0b52d745..ac43cd747bce 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
201 return max(tp->snd_cwnd >> 1U, 2U); 201 return max(tp->snd_cwnd >> 1U, 2U);
202} 202}
203 203
204static struct tcp_congestion_ops tcp_veno = { 204static struct tcp_congestion_ops tcp_veno __read_mostly = {
205 .flags = TCP_CONG_RTT_STAMP, 205 .flags = TCP_CONG_RTT_STAMP,
206 .init = tcp_veno_init, 206 .init = tcp_veno_init,
207 .ssthresh = tcp_veno_ssthresh, 207 .ssthresh = tcp_veno_ssthresh,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index a534dda5456e..1b91bf48e277 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
272} 272}
273 273
274 274
275static struct tcp_congestion_ops tcp_westwood = { 275static struct tcp_congestion_ops tcp_westwood __read_mostly = {
276 .init = tcp_westwood_init, 276 .init = tcp_westwood_init,
277 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
278 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a0f240358892..dc7f43179c9a 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
225 return tp->snd_cwnd - reduction; 225 return tp->snd_cwnd - reduction;
226} 226}
227 227
228static struct tcp_congestion_ops tcp_yeah = { 228static struct tcp_congestion_ops tcp_yeah __read_mostly = {
229 .flags = TCP_CONG_RTT_STAMP, 229 .flags = TCP_CONG_RTT_STAMP,
230 .init = tcp_yeah_init, 230 .init = tcp_yeah_init,
231 .ssthresh = tcp_yeah_ssthresh, 231 .ssthresh = tcp_yeah_ssthresh,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959ee..588f47af5faf 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -663,75 +663,72 @@ void udp_flush_pending_frames(struct sock *sk)
663EXPORT_SYMBOL(udp_flush_pending_frames); 663EXPORT_SYMBOL(udp_flush_pending_frames);
664 664
665/** 665/**
666 * udp4_hwcsum_outgoing - handle outgoing HW checksumming 666 * udp4_hwcsum - handle outgoing HW checksumming
667 * @sk: socket we are sending on
668 * @skb: sk_buff containing the filled-in UDP header 667 * @skb: sk_buff containing the filled-in UDP header
669 * (checksum field must be zeroed out) 668 * (checksum field must be zeroed out)
669 * @src: source IP address
670 * @dst: destination IP address
670 */ 671 */
671static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, 672static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
672 __be32 src, __be32 dst, int len)
673{ 673{
674 unsigned int offset;
675 struct udphdr *uh = udp_hdr(skb); 674 struct udphdr *uh = udp_hdr(skb);
675 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
676 int offset = skb_transport_offset(skb);
677 int len = skb->len - offset;
678 int hlen = len;
676 __wsum csum = 0; 679 __wsum csum = 0;
677 680
678 if (skb_queue_len(&sk->sk_write_queue) == 1) { 681 if (!frags) {
679 /* 682 /*
680 * Only one fragment on the socket. 683 * Only one fragment on the socket.
681 */ 684 */
682 skb->csum_start = skb_transport_header(skb) - skb->head; 685 skb->csum_start = skb_transport_header(skb) - skb->head;
683 skb->csum_offset = offsetof(struct udphdr, check); 686 skb->csum_offset = offsetof(struct udphdr, check);
684 uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); 687 uh->check = ~csum_tcpudp_magic(src, dst, len,
688 IPPROTO_UDP, 0);
685 } else { 689 } else {
686 /* 690 /*
687 * HW-checksum won't work as there are two or more 691 * HW-checksum won't work as there are two or more
688 * fragments on the socket so that all csums of sk_buffs 692 * fragments on the socket so that all csums of sk_buffs
689 * should be together 693 * should be together
690 */ 694 */
691 offset = skb_transport_offset(skb); 695 do {
692 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); 696 csum = csum_add(csum, frags->csum);
697 hlen -= frags->len;
698 } while ((frags = frags->next));
693 699
700 csum = skb_checksum(skb, offset, hlen, csum);
694 skb->ip_summed = CHECKSUM_NONE; 701 skb->ip_summed = CHECKSUM_NONE;
695 702
696 skb_queue_walk(&sk->sk_write_queue, skb) {
697 csum = csum_add(csum, skb->csum);
698 }
699
700 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); 703 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
701 if (uh->check == 0) 704 if (uh->check == 0)
702 uh->check = CSUM_MANGLED_0; 705 uh->check = CSUM_MANGLED_0;
703 } 706 }
704} 707}
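Folding the per-fragment csum values into one total works because the Internet checksum is a one's-complement sum and therefore associative: partial sums over disjoint byte ranges can be combined in any order. The generic csum_add() (from include/net/checksum.h, modulo __force annotations) does exactly that with an end-around carry:

    static inline __wsum csum_add(__wsum csum, __wsum addend)
    {
        u32 res = (u32)csum;

        res += (u32)addend;
        /* fold the carry-out back into bit 0 (end-around carry) */
        return (__wsum)(res + (res < (u32)addend));
    }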
705 708
706/* 709static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport)
707 * Push out all pending data as one UDP datagram. Socket is locked.
708 */
709static int udp_push_pending_frames(struct sock *sk)
710{ 710{
711 struct udp_sock *up = udp_sk(sk); 711 struct sock *sk = skb->sk;
712 struct inet_sock *inet = inet_sk(sk); 712 struct inet_sock *inet = inet_sk(sk);
713 struct flowi *fl = &inet->cork.fl;
714 struct sk_buff *skb;
715 struct udphdr *uh; 713 struct udphdr *uh;
714 struct rtable *rt = (struct rtable *)skb_dst(skb);
716 int err = 0; 715 int err = 0;
717 int is_udplite = IS_UDPLITE(sk); 716 int is_udplite = IS_UDPLITE(sk);
717 int offset = skb_transport_offset(skb);
718 int len = skb->len - offset;
718 __wsum csum = 0; 719 __wsum csum = 0;
719 720
720 /* Grab the skbuff where UDP header space exists. */
721 if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
722 goto out;
723
724 /* 721 /*
725 * Create a UDP header 722 * Create a UDP header
726 */ 723 */
727 uh = udp_hdr(skb); 724 uh = udp_hdr(skb);
728 uh->source = fl->fl_ip_sport; 725 uh->source = inet->inet_sport;
729 uh->dest = fl->fl_ip_dport; 726 uh->dest = dport;
730 uh->len = htons(up->len); 727 uh->len = htons(len);
731 uh->check = 0; 728 uh->check = 0;
732 729
733 if (is_udplite) /* UDP-Lite */ 730 if (is_udplite) /* UDP-Lite */
734 csum = udplite_csum_outgoing(sk, skb); 731 csum = udplite_csum(skb);
735 732
736 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ 733 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
737 734
@@ -740,20 +737,20 @@ static int udp_push_pending_frames(struct sock *sk)
740 737
741 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ 738 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
742 739
743 udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); 740 udp4_hwcsum(skb, rt->rt_src, daddr);
744 goto send; 741 goto send;
745 742
746 } else /* `normal' UDP */ 743 } else
747 csum = udp_csum_outgoing(sk, skb); 744 csum = udp_csum(skb);
748 745
749 /* add protocol-dependent pseudo-header */ 746 /* add protocol-dependent pseudo-header */
750 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, 747 uh->check = csum_tcpudp_magic(rt->rt_src, daddr, len,
751 sk->sk_protocol, csum); 748 sk->sk_protocol, csum);
752 if (uh->check == 0) 749 if (uh->check == 0)
753 uh->check = CSUM_MANGLED_0; 750 uh->check = CSUM_MANGLED_0;
754 751
755send: 752send:
756 err = ip_push_pending_frames(sk); 753 err = ip_send_skb(skb);
757 if (err) { 754 if (err) {
758 if (err == -ENOBUFS && !inet->recverr) { 755 if (err == -ENOBUFS && !inet->recverr) {
759 UDP_INC_STATS_USER(sock_net(sk), 756 UDP_INC_STATS_USER(sock_net(sk),
@@ -763,6 +760,26 @@ send:
763 } else 760 } else
764 UDP_INC_STATS_USER(sock_net(sk), 761 UDP_INC_STATS_USER(sock_net(sk),
765 UDP_MIB_OUTDATAGRAMS, is_udplite); 762 UDP_MIB_OUTDATAGRAMS, is_udplite);
763 return err;
764}
765
766/*
767 * Push out all pending data as one UDP datagram. Socket is locked.
768 */
769static int udp_push_pending_frames(struct sock *sk)
770{
771 struct udp_sock *up = udp_sk(sk);
772 struct inet_sock *inet = inet_sk(sk);
773 struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
774 struct sk_buff *skb;
775 int err = 0;
776
777 skb = ip_finish_skb(sk);
778 if (!skb)
779 goto out;
780
781 err = udp_send_skb(skb, fl4->daddr, fl4->fl4_dport);
782
766out: 783out:
767 up->len = 0; 784 up->len = 0;
768 up->pending = 0; 785 up->pending = 0;
@@ -774,6 +791,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
774{ 791{
775 struct inet_sock *inet = inet_sk(sk); 792 struct inet_sock *inet = inet_sk(sk);
776 struct udp_sock *up = udp_sk(sk); 793 struct udp_sock *up = udp_sk(sk);
794 struct flowi4 *fl4;
777 int ulen = len; 795 int ulen = len;
778 struct ipcm_cookie ipc; 796 struct ipcm_cookie ipc;
779 struct rtable *rt = NULL; 797 struct rtable *rt = NULL;
@@ -785,6 +803,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
785 int err, is_udplite = IS_UDPLITE(sk); 803 int err, is_udplite = IS_UDPLITE(sk);
786 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; 804 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
787 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); 805 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
806 struct sk_buff *skb;
788 807
789 if (len > 0xFFFF) 808 if (len > 0xFFFF)
790 return -EMSGSIZE; 809 return -EMSGSIZE;
@@ -799,6 +818,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
799 ipc.opt = NULL; 818 ipc.opt = NULL;
800 ipc.tx_flags = 0; 819 ipc.tx_flags = 0;
801 820
821 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
822
802 if (up->pending) { 823 if (up->pending) {
803 /* 824 /*
804 * There are pending frames. 825 * There are pending frames.
@@ -888,20 +909,25 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
888 rt = (struct rtable *)sk_dst_check(sk, 0); 909 rt = (struct rtable *)sk_dst_check(sk, 0);
889 910
890 if (rt == NULL) { 911 if (rt == NULL) {
891 struct flowi fl = { .oif = ipc.oif, 912 struct flowi4 fl4 = {
892 .mark = sk->sk_mark, 913 .flowi4_oif = ipc.oif,
893 .fl4_dst = faddr, 914 .flowi4_mark = sk->sk_mark,
894 .fl4_src = saddr, 915 .daddr = faddr,
895 .fl4_tos = tos, 916 .saddr = saddr,
896 .proto = sk->sk_protocol, 917 .flowi4_tos = tos,
897 .flags = inet_sk_flowi_flags(sk), 918 .flowi4_proto = sk->sk_protocol,
898 .fl_ip_sport = inet->inet_sport, 919 .flowi4_flags = (inet_sk_flowi_flags(sk) |
899 .fl_ip_dport = dport }; 920 FLOWI_FLAG_CAN_SLEEP),
921 .fl4_sport = inet->inet_sport,
922 .fl4_dport = dport,
923 };
900 struct net *net = sock_net(sk); 924 struct net *net = sock_net(sk);
901 925
902 security_sk_classify_flow(sk, &fl); 926 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
903 err = ip_route_output_flow(net, &rt, &fl, sk, 1); 927 rt = ip_route_output_flow(net, &fl4, sk);
904 if (err) { 928 if (IS_ERR(rt)) {
929 err = PTR_ERR(rt);
930 rt = NULL;
905 if (err == -ENETUNREACH) 931 if (err == -ENETUNREACH)
906 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 932 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
907 goto out; 933 goto out;
@@ -923,6 +949,17 @@ back_from_confirm:
923 if (!ipc.addr) 949 if (!ipc.addr)
924 daddr = ipc.addr = rt->rt_dst; 950 daddr = ipc.addr = rt->rt_dst;
925 951
952 /* Lockless fast path for the non-corking case. */
953 if (!corkreq) {
954 skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen,
955 sizeof(struct udphdr), &ipc, &rt,
956 msg->msg_flags);
957 err = PTR_ERR(skb);
958 if (skb && !IS_ERR(skb))
959 err = udp_send_skb(skb, daddr, dport);
960 goto out;
961 }
962
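From userspace the split is invisible but measurable: an ordinary single sendmsg() now builds its skb locklessly via ip_make_skb() and goes straight to udp_send_skb(), while corked sockets keep the locked append-and-push machinery. The slow path is reached, for example, via UDP_CORK:

    int on = 1, off = 0;

    setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
    send(fd, hdr, hdr_len, 0);          /* appended, not yet transmitted */
    send(fd, payload, payload_len, 0);
    setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off)); /* flush */

A plain send() without MSG_MORE on an uncorked socket takes the new fast path.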
926 lock_sock(sk); 963 lock_sock(sk);
927 if (unlikely(up->pending)) { 964 if (unlikely(up->pending)) {
928 /* The socket is already corked while preparing it. */ 965 /* The socket is already corked while preparing it. */
@@ -936,15 +973,15 @@ back_from_confirm:
936 /* 973 /*
937 * Now cork the socket to pend data. 974 * Now cork the socket to pend data.
938 */ 975 */
939 inet->cork.fl.fl4_dst = daddr; 976 fl4 = &inet->cork.fl.u.ip4;
940 inet->cork.fl.fl_ip_dport = dport; 977 fl4->daddr = daddr;
941 inet->cork.fl.fl4_src = saddr; 978 fl4->saddr = saddr;
942 inet->cork.fl.fl_ip_sport = inet->inet_sport; 979 fl4->fl4_dport = dport;
980 fl4->fl4_sport = inet->inet_sport;
943 up->pending = AF_INET; 981 up->pending = AF_INET;
944 982
945do_append_data: 983do_append_data:
946 up->len += ulen; 984 up->len += ulen;
947 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
948 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, 985 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
949 sizeof(struct udphdr), &ipc, &rt, 986 sizeof(struct udphdr), &ipc, &rt,
950 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); 987 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
@@ -2199,7 +2236,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2199 return 0; 2236 return 0;
2200} 2237}
2201 2238
2202struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) 2239struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2203{ 2240{
2204 struct sk_buff *segs = ERR_PTR(-EINVAL); 2241 struct sk_buff *segs = ERR_PTR(-EINVAL);
2205 unsigned int mss; 2242 unsigned int mss;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40addec..13e0e7f659ff 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -19,25 +19,23 @@
19static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
20 20
21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, 21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
22 xfrm_address_t *saddr, 22 const xfrm_address_t *saddr,
23 xfrm_address_t *daddr) 23 const xfrm_address_t *daddr)
24{ 24{
25 struct flowi fl = { 25 struct flowi4 fl4 = {
26 .fl4_dst = daddr->a4, 26 .daddr = daddr->a4,
27 .fl4_tos = tos, 27 .flowi4_tos = tos,
28 }; 28 };
29 struct dst_entry *dst;
30 struct rtable *rt; 29 struct rtable *rt;
31 int err;
32 30
33 if (saddr) 31 if (saddr)
34 fl.fl4_src = saddr->a4; 32 fl4.saddr = saddr->a4;
33
34 rt = __ip_route_output_key(net, &fl4);
35 if (!IS_ERR(rt))
36 return &rt->dst;
35 37
36 err = __ip_route_output_key(net, &rt, &fl); 38 return ERR_CAST(rt);
37 dst = &rt->dst;
38 if (err)
39 dst = ERR_PTR(err);
40 return dst;
41} 39}
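ERR_CAST() lets the error pointer cross the rtable*/dst_entry* type boundary without decoding and re-encoding the errno; it is effectively a type-laundering cast (simplified from include/linux/err.h):

    static inline void *ERR_CAST(const void *ptr)
    {
        /* the value is a -errno, not a real pointer; just re-type it */
        return (void *) ptr;
    }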
42 40
43static int xfrm4_get_saddr(struct net *net, 41static int xfrm4_get_saddr(struct net *net,
@@ -56,9 +54,9 @@ static int xfrm4_get_saddr(struct net *net,
56 return 0; 54 return 0;
57} 55}
58 56
59static int xfrm4_get_tos(struct flowi *fl) 57static int xfrm4_get_tos(const struct flowi *fl)
60{ 58{
61 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ 59 return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
62} 60}
63 61
64static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 62static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -68,11 +66,17 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
68} 66}
69 67
70static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, 68static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
71 struct flowi *fl) 69 const struct flowi *fl)
72{ 70{
73 struct rtable *rt = (struct rtable *)xdst->route; 71 struct rtable *rt = (struct rtable *)xdst->route;
72 const struct flowi4 *fl4 = &fl->u.ip4;
74 73
75 xdst->u.rt.fl = *fl; 74 rt->rt_key_dst = fl4->daddr;
75 rt->rt_key_src = fl4->saddr;
76 rt->rt_tos = fl4->flowi4_tos;
77 rt->rt_iif = fl4->flowi4_iif;
78 rt->rt_oif = fl4->flowi4_oif;
79 rt->rt_mark = fl4->flowi4_mark;
76 80
77 xdst->u.dst.dev = dev; 81 xdst->u.dst.dev = dev;
78 dev_hold(dev); 82 dev_hold(dev);
@@ -99,9 +103,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
99{ 103{
100 struct iphdr *iph = ip_hdr(skb); 104 struct iphdr *iph = ip_hdr(skb);
101 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 105 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
106 struct flowi4 *fl4 = &fl->u.ip4;
102 107
103 memset(fl, 0, sizeof(struct flowi)); 108 memset(fl4, 0, sizeof(struct flowi4));
104 fl->mark = skb->mark; 109 fl4->flowi4_mark = skb->mark;
105 110
106 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 111 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
107 switch (iph->protocol) { 112 switch (iph->protocol) {
@@ -114,8 +119,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
114 pskb_may_pull(skb, xprth + 4 - skb->data)) { 119 pskb_may_pull(skb, xprth + 4 - skb->data)) {
115 __be16 *ports = (__be16 *)xprth; 120 __be16 *ports = (__be16 *)xprth;
116 121
117 fl->fl_ip_sport = ports[!!reverse]; 122 fl4->fl4_sport = ports[!!reverse];
118 fl->fl_ip_dport = ports[!reverse]; 123 fl4->fl4_dport = ports[!reverse];
119 } 124 }
120 break; 125 break;
121 126
@@ -123,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
123 if (pskb_may_pull(skb, xprth + 2 - skb->data)) { 128 if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
124 u8 *icmp = xprth; 129 u8 *icmp = xprth;
125 130
126 fl->fl_icmp_type = icmp[0]; 131 fl4->fl4_icmp_type = icmp[0];
127 fl->fl_icmp_code = icmp[1]; 132 fl4->fl4_icmp_code = icmp[1];
128 } 133 }
129 break; 134 break;
130 135
@@ -132,7 +137,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
132 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 137 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
133 __be32 *ehdr = (__be32 *)xprth; 138 __be32 *ehdr = (__be32 *)xprth;
134 139
135 fl->fl_ipsec_spi = ehdr[0]; 140 fl4->fl4_ipsec_spi = ehdr[0];
136 } 141 }
137 break; 142 break;
138 143
@@ -140,7 +145,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
140 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 145 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
141 __be32 *ah_hdr = (__be32*)xprth; 146 __be32 *ah_hdr = (__be32*)xprth;
142 147
143 fl->fl_ipsec_spi = ah_hdr[1]; 148 fl4->fl4_ipsec_spi = ah_hdr[1];
144 } 149 }
145 break; 150 break;
146 151
@@ -148,7 +153,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
148 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 153 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
149 __be16 *ipcomp_hdr = (__be16 *)xprth; 154 __be16 *ipcomp_hdr = (__be16 *)xprth;
150 155
151 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 156 fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
152 } 157 }
153 break; 158 break;
154 159
@@ -160,20 +165,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
160 if (greflags[0] & GRE_KEY) { 165 if (greflags[0] & GRE_KEY) {
161 if (greflags[0] & GRE_CSUM) 166 if (greflags[0] & GRE_CSUM)
162 gre_hdr++; 167 gre_hdr++;
163 fl->fl_gre_key = gre_hdr[1]; 168 fl4->fl4_gre_key = gre_hdr[1];
164 } 169 }
165 } 170 }
166 break; 171 break;
167 172
168 default: 173 default:
169 fl->fl_ipsec_spi = 0; 174 fl4->fl4_ipsec_spi = 0;
170 break; 175 break;
171 } 176 }
172 } 177 }
173 fl->proto = iph->protocol; 178 fl4->flowi4_proto = iph->protocol;
174 fl->fl4_dst = reverse ? iph->saddr : iph->daddr; 179 fl4->daddr = reverse ? iph->saddr : iph->daddr;
175 fl->fl4_src = reverse ? iph->daddr : iph->saddr; 180 fl4->saddr = reverse ? iph->daddr : iph->saddr;
176 fl->fl4_tos = iph->tos; 181 fl4->flowi4_tos = iph->tos;
177} 182}
178 183
179static inline int xfrm4_garbage_collect(struct dst_ops *ops) 184static inline int xfrm4_garbage_collect(struct dst_ops *ops)
@@ -196,8 +201,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
196{ 201{
197 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 202 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
198 203
204 dst_destroy_metrics_generic(dst);
205
199 if (likely(xdst->u.rt.peer)) 206 if (likely(xdst->u.rt.peer))
200 inet_putpeer(xdst->u.rt.peer); 207 inet_putpeer(xdst->u.rt.peer);
208
201 xfrm_dst_destroy(xdst); 209 xfrm_dst_destroy(xdst);
202} 210}
203 211
@@ -215,6 +223,7 @@ static struct dst_ops xfrm4_dst_ops = {
215 .protocol = cpu_to_be16(ETH_P_IP), 223 .protocol = cpu_to_be16(ETH_P_IP),
216 .gc = xfrm4_garbage_collect, 224 .gc = xfrm4_garbage_collect,
217 .update_pmtu = xfrm4_update_pmtu, 225 .update_pmtu = xfrm4_update_pmtu,
226 .cow_metrics = dst_cow_metrics_generic,
218 .destroy = xfrm4_dst_destroy, 227 .destroy = xfrm4_dst_destroy,
219 .ifdown = xfrm4_dst_ifdown, 228 .ifdown = xfrm4_dst_ifdown,
220 .local_out = __ip_local_out, 229 .local_out = __ip_local_out,
@@ -230,6 +239,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
230 .get_tos = xfrm4_get_tos, 239 .get_tos = xfrm4_get_tos,
231 .init_path = xfrm4_init_path, 240 .init_path = xfrm4_init_path,
232 .fill_dst = xfrm4_fill_dst, 241 .fill_dst = xfrm4_fill_dst,
242 .blackhole_route = ipv4_blackhole_route,
233}; 243};
234 244
235#ifdef CONFIG_SYSCTL 245#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 47947624eccc..1717c64628d1 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) 24__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
25{ 25{
26 sel->daddr.a4 = fl->fl4_dst; 26 const struct flowi4 *fl4 = &fl->u.ip4;
27 sel->saddr.a4 = fl->fl4_src; 27
28 sel->dport = xfrm_flowi_dport(fl); 28 sel->daddr.a4 = fl4->daddr;
29 sel->saddr.a4 = fl4->saddr;
30 sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
29 sel->dport_mask = htons(0xffff); 31 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl); 32 sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
31 sel->sport_mask = htons(0xffff); 33 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET; 34 sel->family = AF_INET;
33 sel->prefixlen_d = 32; 35 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32; 36 sel->prefixlen_s = 32;
35 sel->proto = fl->proto; 37 sel->proto = fl4->flowi4_proto;
36 sel->ifindex = fl->oif; 38 sel->ifindex = fl4->flowi4_oif;
37} 39}
38 40
39static void 41static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, 42xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr) 43 const xfrm_address_t *daddr, const xfrm_address_t *saddr)
42{ 44{
43 x->id = tmpl->id; 45 x->id = tmpl->id;
44 if (x->id.daddr.a4 == 0) 46 if (x->id.daddr.a4 == 0)