author		Eric Paris <eparis@redhat.com>	2011-05-26 17:20:14 -0400
committer	Eric Paris <eparis@redhat.com>	2011-05-26 17:20:14 -0400
commit		ea77f7a2e8561012cf100c530170f12351c3b53e (patch)
tree		7302ac1064f4e364aadda84020a176804fb86e22 /net/ipv4
parent		7a627e3b9a2bd0f06945bbe64bcf403e788ecf6e (diff)
parent		61c4f2c81c61f73549928dfd9f3e8f26aa36a8cf (diff)

Merge commit 'v2.6.39' into 20110526

Conflicts:
	lib/flex_array.c
	security/selinux/avc.c
	security/selinux/hooks.c
	security/selinux/ss/policydb.c
	security/smack/smack_lsm.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 42
-rw-r--r--  net/ipv4/Makefile | 4
-rw-r--r--  net/ipv4/af_inet.c | 46
-rw-r--r--  net/ipv4/ah4.c | 27
-rw-r--r--  net/ipv4/arp.c | 28
-rw-r--r--  net/ipv4/cipso_ipv4.c | 8
-rw-r--r--  net/ipv4/datagram.c | 11
-rw-r--r--  net/ipv4/devinet.c | 116
-rw-r--r--  net/ipv4/esp4.c | 104
-rw-r--r--  net/ipv4/fib_frontend.c | 215
-rw-r--r--  net/ipv4/fib_hash.c | 1133
-rw-r--r--  net/ipv4/fib_lookup.h | 13
-rw-r--r--  net/ipv4/fib_rules.c | 25
-rw-r--r--  net/ipv4/fib_semantics.c | 258
-rw-r--r--  net/ipv4/fib_trie.c | 287
-rw-r--r--  net/ipv4/icmp.c | 242
-rw-r--r--  net/ipv4/igmp.c | 45
-rw-r--r--  net/ipv4/inet_connection_sock.c | 32
-rw-r--r--  net/ipv4/inetpeer.c | 161
-rw-r--r--  net/ipv4/ip_fragment.c | 31
-rw-r--r--  net/ipv4/ip_gre.c | 58
-rw-r--r--  net/ipv4/ip_input.c | 2
-rw-r--r--  net/ipv4/ip_options.c | 12
-rw-r--r--  net/ipv4/ip_output.c | 347
-rw-r--r--  net/ipv4/ipconfig.c | 2
-rw-r--r--  net/ipv4/ipip.c | 41
-rw-r--r--  net/ipv4/ipmr.c | 79
-rw-r--r--  net/ipv4/netfilter.c | 39
-rw-r--r--  net/ipv4/netfilter/Kconfig | 13
-rw-r--r--  net/ipv4/netfilter/Makefile | 1
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 9
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 11
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 12
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 3
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 134
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 37
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 9
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 9
-rw-r--r--  net/ipv4/raw.c | 42
-rw-r--r--  net/ipv4/route.c | 1200
-rw-r--r--  net/ipv4/syncookies.c | 25
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 3
-rw-r--r--  net/ipv4/tcp.c | 20
-rw-r--r--  net/ipv4/tcp_bic.c | 2
-rw-r--r--  net/ipv4/tcp_cubic.c | 56
-rw-r--r--  net/ipv4/tcp_highspeed.c | 2
-rw-r--r--  net/ipv4/tcp_htcp.c | 2
-rw-r--r--  net/ipv4/tcp_hybla.c | 2
-rw-r--r--  net/ipv4/tcp_illinois.c | 2
-rw-r--r--  net/ipv4/tcp_input.c | 26
-rw-r--r--  net/ipv4/tcp_ipv4.c | 37
-rw-r--r--  net/ipv4/tcp_lp.c | 4
-rw-r--r--  net/ipv4/tcp_output.c | 5
-rw-r--r--  net/ipv4/tcp_scalable.c | 2
-rw-r--r--  net/ipv4/tcp_timer.c | 3
-rw-r--r--  net/ipv4/tcp_vegas.c | 2
-rw-r--r--  net/ipv4/tcp_veno.c | 2
-rw-r--r--  net/ipv4/tcp_westwood.c | 2
-rw-r--r--  net/ipv4/tcp_yeah.c | 4
-rw-r--r--  net/ipv4/udp.c | 141
-rw-r--r--  net/ipv4/xfrm4_output.c | 8
-rw-r--r--  net/ipv4/xfrm4_policy.c | 75
-rw-r--r--  net/ipv4/xfrm4_state.c | 21
66 files changed, 2202 insertions(+), 3159 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d1..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
-choice
-	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
-	depends on IP_ADVANCED_ROUTER
-	default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algorithm.
-	  This improves lookup performance if you have a large
-	  number of routes.
-
-	  LC-trie is a longest matching prefix lookup algorithm which
-	  performs better than FIB_HASH for large routing tables.
-	  But, it consumes more memory and is more complex.
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
-	  June 1999
-
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
-	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
 config IP_FIB_TRIE_STATS
 	bool "FIB TRIE statistics"
-	depends on IP_FIB_TRIE
+	depends on IP_ADVANCED_ROUTER
 	---help---
 	  Keep track of statistics on structure of FIB TRIE table.
 	  Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
 	  handled by the klogd daemon which is responsible for kernel messages
 	  ("man klogd").
 
+config IP_ROUTE_CLASSID
+	bool
+
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
 	help
@@ -657,4 +624,3 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
-
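The removed help text above captures the trade-off this removal resolves: fib_hash kept one hash table per prefix length ("zone"), so a longest-prefix lookup had to probe zones from /32 downward, while the LC-trie does a single walk. A minimal userspace model of the per-zone probing that fib_hash performed — table_has() is a hypothetical stand-in for a hash probe, not kernel code:

	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical stand-in for probing the fn_zone hash of one prefix
	 * length; pretend only 192.168.1.0/24 is installed. */
	static int table_has(int plen, uint32_t key)
	{
		return plen == 24 && key == 0xc0a80100;
	}

	static int lpm_by_zones(uint32_t dst)
	{
		for (int plen = 32; plen >= 0; plen--) {
			uint32_t mask = plen ? 0xffffffffu << (32 - plen) : 0;
			if (table_has(plen, dst & mask))
				return plen;	/* first hit is the longest match */
		}
		return -1;
	}

	int main(void)
	{
		printf("matched /%d\n", lpm_by_zones(0xc0a80105));	/* 192.168.1.5 */
		return 0;
	}

Up to 33 probes per lookup in the worst case is what motivated making fib_trie the only implementation.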
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a75..0dc772d0d125 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     fib_frontend.o fib_semantics.o \
+	     fib_frontend.o fib_semantics.o fib_trie.o \
 	     inet_fragment.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 45b89d7bda5a..807d83c02ef6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1101,23 +1101,20 @@ int sysctl_ip_dynaddr __read_mostly;
 static int inet_sk_reselect_saddr(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int err;
-	struct rtable *rt;
 	__be32 old_saddr = inet->inet_saddr;
-	__be32 new_saddr;
 	__be32 daddr = inet->inet_daddr;
+	struct rtable *rt;
+	__be32 new_saddr;
 
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
 
 	/* Query new route. */
-	err = ip_route_connect(&rt, daddr, 0,
-			       RT_CONN_FLAGS(sk),
-			       sk->sk_bound_dev_if,
-			       sk->sk_protocol,
-			       inet->inet_sport, inet->inet_dport, sk, 0);
-	if (err)
-		return err;
+	rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk),
+			      sk->sk_bound_dev_if, sk->sk_protocol,
+			      inet->inet_sport, inet->inet_dport, sk, false);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
 
 	sk_setup_caps(sk, &rt->dst);
 
@@ -1160,25 +1157,16 @@ int inet_sk_rebuild_header(struct sock *sk)
 	daddr = inet->inet_daddr;
 	if (inet->opt && inet->opt->srr)
 		daddr = inet->opt->faddr;
-{
-	struct flowi fl = {
-		.oif = sk->sk_bound_dev_if,
-		.mark = sk->sk_mark,
-		.fl4_dst = daddr,
-		.fl4_src = inet->inet_saddr,
-		.fl4_tos = RT_CONN_FLAGS(sk),
-		.proto = sk->sk_protocol,
-		.flags = inet_sk_flowi_flags(sk),
-		.fl_ip_sport = inet->inet_sport,
-		.fl_ip_dport = inet->inet_dport,
-	};
-
-	security_sk_classify_flow(sk, &fl);
-	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
-}
-	if (!err)
+	rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr,
+				   inet->inet_dport, inet->inet_sport,
+				   sk->sk_protocol, RT_CONN_FLAGS(sk),
+				   sk->sk_bound_dev_if);
+	if (!IS_ERR(rt)) {
+		err = 0;
 		sk_setup_caps(sk, &rt->dst);
-	else {
+	} else {
+		err = PTR_ERR(rt);
+
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
 		/*
@@ -1231,7 +1219,7 @@ out:
 	return err;
 }
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct iphdr *iph;
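The conversions above follow the kernel's ERR_PTR convention, which this series applies across the routing API: the errno is encoded into the returned pointer instead of being passed back through an out-parameter. A self-contained sketch of the idiom, with the helpers re-created for userspace (the real definitions live in linux/err.h; struct rtable here is a toy stand-in):

	#include <errno.h>
	#include <stdio.h>

	#define MAX_ERRNO	4095

	/* Userspace re-creations of the kernel helpers from linux/err.h. */
	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	struct rtable { int ifindex; };	/* toy stand-in for the real struct */

	static struct rtable *route_lookup(int fail)
	{
		static struct rtable rt = { .ifindex = 2 };

		if (fail)
			return ERR_PTR(-ENETUNREACH);	/* errno hidden in the pointer */
		return &rt;
	}

	int main(void)
	{
		struct rtable *rt = route_lookup(1);

		if (IS_ERR(rt))
			printf("lookup failed: %ld\n", PTR_ERR(rt));
		return 0;
	}

The trick works because the top 4095 values of the address space are never valid kernel pointers, so one return value can carry both a pointer and a negative errno.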
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 86961bec70ab..4286fd3cc0e2 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
-	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, 0, skb->len);
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
-	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
-	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
-		goto out;
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
 
 	if (!pskb_may_pull(skb, ah_hlen))
 		goto out;
@@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x)
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
-					  ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	x->data = ahp;
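The XFRM_STATE_ALIGN4 branches added above reflect RFC 4302's differing alignment rules: IPv4 only requires the AH payload to end on a 32-bit boundary, whereas the old code always padded to 64 bits as IPv6 demands. A worked sketch of the hdrlen arithmetic for a 16-byte truncated ICV, assuming sizeof(struct ip_auth_hdr) == 12 for the fixed part of the header:

	#include <stdio.h>

	#define ALIGN_TO(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		int base = 12 + 16;		/* fixed AH header + icv_trunc_len */
		int len8 = ALIGN_TO(base, 8);	/* old behaviour: pad to 64 bits */
		int len4 = ALIGN_TO(base, 4);	/* ALIGN4: pad to 32 bits */

		/* ah->hdrlen is expressed in 32-bit words, minus 2 (RFC 4302) */
		printf("ALIGN8: %d bytes on the wire, hdrlen=%d\n", len8, (len8 >> 2) - 2);
		printf("ALIGN4: %d bytes on the wire, hdrlen=%d\n", len4, (len4 >> 2) - 2);
		return 0;
	}

For this ICV size the 4-byte alignment saves four bytes per packet (28 versus 32).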
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7927589813b5..1b74d3b64371 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -215,6 +215,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 	case ARPHRD_INFINIBAND:
 		ip_ib_mc_map(addr, dev->broadcast, haddr);
 		return 0;
+	case ARPHRD_IPGRE:
+		ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+		return 0;
 	default:
 		if (dir) {
 			memcpy(haddr, dev->broadcast, dev->addr_len);
@@ -433,14 +436,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
 
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
-	struct flowi fl = { .fl4_dst = sip,
-			    .fl4_src = tip };
 	struct rtable *rt;
 	int flag = 0;
 	/*unsigned long now; */
 	struct net *net = dev_net(dev);
 
-	if (ip_route_output_key(net, &rt, &fl) < 0)
+	rt = ip_route_output(net, sip, tip, 0, 0);
+	if (IS_ERR(rt))
 		return 1;
 	if (rt->dst.dev != dev) {
 		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
@@ -1061,12 +1063,10 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .fl4_dst = ip,
-				    .fl4_tos = RTO_ONLINK };
-		struct rtable *rt;
-		err = ip_route_output_key(net, &rt, &fl);
-		if (err != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
@@ -1177,7 +1177,6 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
 static int arp_req_delete(struct net *net, struct arpreq *r,
 			  struct net_device *dev)
 {
-	int err;
 	__be32 ip;
 
 	if (r->arp_flags & ATF_PUBL)
@@ -1185,12 +1184,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .fl4_dst = ip,
-				    .fl4_tos = RTO_ONLINK };
-		struct rtable *rt;
-		err = ip_route_output_key(net, &rt, &fl);
-		if (err != 0)
-			return err;
+		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(rt))
+			return PTR_ERR(rt);
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 094e150c6260..a0af7ea87870 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -112,7 +112,7 @@ int cipso_v4_rbm_strictvalid = 1;
 /* The maximum number of category ranges permitted in the ranged category tag
  * (tag #5).  You may note that the IETF draft states that the maximum number
  * of category ranges is 7, but if the low end of the last category range is
- * zero then it is possibile to fit 8 category ranges because the zero should
+ * zero then it is possible to fit 8 category ranges because the zero should
  * be omitted. */
 #define CIPSO_V4_TAG_RNG_CAT_MAX	8
 
@@ -438,7 +438,7 @@ cache_add_failure:
  *
  * Description:
  * Search the DOI definition list for a DOI definition with a DOI value that
- * matches @doi.  The caller is responsibile for calling rcu_read_[un]lock().
+ * matches @doi.  The caller is responsible for calling rcu_read_[un]lock().
  * Returns a pointer to the DOI definition on success and NULL on failure.
  */
 static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
@@ -1293,7 +1293,7 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
 		return ret_val;
 
 	/* This will send packets using the "optimized" format when
-	 * possibile as specified in section 3.4.2.6 of the
+	 * possible as specified in section 3.4.2.6 of the
 	 * CIPSO draft. */
 	if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
 		tag_len = 14;
@@ -1752,7 +1752,7 @@ validate_return:
 }
 
 /**
- * cipso_v4_error - Send the correct reponse for a bad packet
+ * cipso_v4_error - Send the correct response for a bad packet
  * @skb: the packet
  * @error: the error code
  * @gateway: CIPSO gateway flag
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 174be6caa5c8..85bd24ca4f6d 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -46,11 +46,12 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		if (!saddr)
 			saddr = inet->mc_addr;
 	}
-	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
+	rt = ip_route_connect(usin->sin_addr.s_addr, saddr,
 			      RT_CONN_FLAGS(sk), oif,
 			      sk->sk_protocol,
-			      inet->inet_sport, usin->sin_port, sk, 1);
-	if (err) {
+			      inet->inet_sport, usin->sin_port, sk, true);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
 		if (err == -ENETUNREACH)
 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 		return err;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index df4616fce929..cd9ca0811cfa 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -63,6 +64,8 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 
+#include "fib_lookup.h"
+
 static struct ipv4_devconf ipv4_devconf = {
 	.data = {
 		[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
@@ -92,6 +95,85 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]		= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value.  So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE	256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+
+	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
+		(IN4_ADDR_HSIZE - 1));
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
+
+	spin_lock(&inet_addr_hash_lock);
+	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+	spin_lock(&inet_addr_hash_lock);
+	hlist_del_init_rcu(&ifa->hash);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	unsigned int hash = inet_addr_hash(net, addr);
+	struct net_device *result = NULL;
+	struct in_ifaddr *ifa;
+	struct hlist_node *node;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+		struct net_device *dev = ifa->ifa_dev->dev;
+
+		if (!net_eq(dev_net(dev), net))
+			continue;
+		if (ifa->ifa_local == addr) {
+			result = dev;
+			break;
+		}
+	}
+	if (!result) {
+		struct flowi4 fl4 = { .daddr = addr };
+		struct fib_result res = { 0 };
+		struct fib_table *local;
+
+		/* Fallback to FIB local table so that communication
+		 * over loopback subnets work.
+		 */
+		local = fib_get_table(net, RT_TABLE_LOCAL);
+		if (local &&
+		    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+		    res.type == RTN_LOCAL)
+			result = FIB_RES_DEV(res);
+	}
+	if (result && devref)
+		dev_hold(result);
+	rcu_read_unlock();
+	return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
 static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +347,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			}
 
 			if (!do_promote) {
+				inet_hash_remove(ifa);
 				*ifap1 = ifa->ifa_next;
 
 				rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -278,9 +361,21 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 	}
 
+	/* On promotion all secondaries from subnet are changing
+	 * the primary IP, we must remove all their routes silently
+	 * and later to add them back with new prefsrc. Do this
+	 * while all addresses are on the device list.
+	 */
+	for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa))
+			fib_del_ifaddr(ifa, ifa1);
+	}
+
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
+	inet_hash_remove(ifa1);
 
 	/* 3. Announce address deletion */
 
@@ -296,6 +391,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
+		struct in_ifaddr *next_sec = promote->ifa_next;
 
 		if (prev_prom) {
 			prev_prom->ifa_next = promote->ifa_next;
@@ -307,7 +403,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
 		blocking_notifier_call_chain(&inetaddr_chain,
 				NETDEV_UP, promote);
-		for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
+		for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
 			if (ifa1->ifa_mask != ifa->ifa_mask ||
 			    !inet_ifa_match(ifa1->ifa_address, ifa))
 				continue;
@@ -368,6 +464,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	ifa->ifa_next = *ifap;
 	*ifap = ifa;
 
+	inet_hash_insert(dev_net(in_dev->dev), ifa);
+
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
@@ -521,6 +619,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
 	if (tb[IFA_ADDRESS] == NULL)
 		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
 
+	INIT_HLIST_NODE(&ifa->hash);
 	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
 	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
 	ifa->ifa_flags = ifm->ifa_flags;
@@ -670,7 +769,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		     ifap = &ifa->ifa_next) {
 			if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
 			    sin_orig.sin_addr.s_addr ==
-						ifa->ifa_address) {
+						ifa->ifa_local) {
 				break; /* found */
 			}
 		}
@@ -728,6 +827,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (!ifa) {
 			ret = -ENOBUFS;
 			ifa = inet_alloc_ifa();
+			INIT_HLIST_NODE(&ifa->hash);
 			if (!ifa)
 				break;
 			if (colon)
@@ -1040,8 +1140,8 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev,
 		return;
 
 	arp_send(ARPOP_REQUEST, ETH_P_ARP,
-		 ifa->ifa_address, dev,
-		 ifa->ifa_address, NULL,
+		 ifa->ifa_local, dev,
+		 ifa->ifa_local, NULL,
 		 dev->dev_addr, NULL);
 }
 
@@ -1084,6 +1184,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		struct in_ifaddr *ifa = inet_alloc_ifa();
 
 		if (ifa) {
+			INIT_HLIST_NODE(&ifa->hash);
 			ifa->ifa_local =
 				ifa->ifa_address = htonl(INADDR_LOOPBACK);
 			ifa->ifa_prefixlen = 8;
@@ -1579,7 +1680,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
 		return;
 
 	cnf->sysctl = NULL;
-	unregister_sysctl_table(t->sysctl_header);
+	unregister_net_sysctl_table(t->sysctl_header);
 	kfree(t->dev_name);
 	kfree(t);
 }
@@ -1720,6 +1821,11 @@ static struct rtnl_af_ops inet_af_ops = {
 
 void __init devinet_init(void)
 {
+	int i;
+
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
 	register_pernet_subsys(&devinet_ops);
 
 	register_gifconf(PF_INET, inet_gifconf);
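The inet_addr_hash() introduced above lets __ip_dev_find() answer "which device owns this source address?" from a 256-bucket hash instead of the FIB lookup it replaces (the old version is deleted from fib_frontend.c below). A standalone model of the folding step; the per-namespace salt, hash_ptr(net, 8) in the kernel, is stubbed here as an assumption:

	#include <stdint.h>
	#include <stdio.h>

	#define IN4_ADDR_HSIZE	256

	/* Stub for the kernel's hash_ptr(net, 8) namespace salt. */
	static uint32_t netns_salt(void) { return 0x5a; }

	static unsigned int inet_addr_hash_model(uint32_t addr)
	{
		uint32_t val = addr ^ netns_salt();

		/* XOR-fold all four address bytes into one bucket index, so
		 * both the subnet and host parts influence the bucket. */
		return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
		       (IN4_ADDR_HSIZE - 1);
	}

	int main(void)
	{
		printf("%u %u\n",
		       inet_addr_hash_model(0xc0000201),	/* 192.0.2.1 */
		       inet_addr_hash_model(0xc0000202));	/* 192.0.2.2 */
		return 0;
	}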
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index e42a905180f0..03f994bcf7de 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -33,11 +33,14 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
 *
 * TODO: Use spare space in skb for this where possible.
 */
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
 {
 	unsigned int len;
 
-	len = crypto_aead_ivsize(aead);
+	len = seqhilen;
+
+	len += crypto_aead_ivsize(aead);
+
 	if (len) {
 		len += crypto_aead_alignmask(aead) &
 		       ~(crypto_tfm_ctx_alignment() - 1);
@@ -52,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags)
 	return kmalloc(len, GFP_ATOMIC);
 }
 
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp)
+static inline __be32 *esp_tmp_seqhi(void *tmp)
+{
+	return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
 {
 	return crypto_aead_ivsize(aead) ?
-	       PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp;
+	       PTR_ALIGN((u8 *)tmp + seqhilen,
+			 crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
 }
 
 static inline struct aead_givcrypt_request *esp_tmp_givreq(
@@ -122,6 +130,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int plen;
 	int tfclen;
 	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
 
 	/* skb is pure payload to encrypt */
 
@@ -151,14 +163,25 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto error;
 	nfrags = err;
 
-	tmp = esp_alloc_tmp(aead, nfrags + 1);
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
 	if (!tmp)
 		goto error;
 
-	iv = esp_tmp_iv(aead, tmp);
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
 	req = esp_tmp_givreq(aead, iv);
 	asg = esp_givreq_sg(aead, req);
-	sg = asg + 1;
+	sg = asg + sglists;
 
 	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
@@ -215,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	}
 
 	esph->spi = x->id.spi;
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
 		     clen + alen);
-	sg_init_one(asg, esph, sizeof(*esph));
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
 
 	aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
 	aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
-	aead_givcrypt_set_assoc(req, asg, sizeof(*esph));
+	aead_givcrypt_set_assoc(req, asg, assoclen);
 	aead_givcrypt_set_giv(req, esph->enc_data,
-			      XFRM_SKB_CB(skb)->seq.output);
+			      XFRM_SKB_CB(skb)->seq.output.low);
 
 	ESP_SKB_CB(skb)->tmp = tmp;
 	err = crypto_aead_givencrypt(req);
@@ -346,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	struct sk_buff *trailer;
 	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
 	int nfrags;
+	int assoclen;
+	int sglists;
+	int seqhilen;
+	__be32 *seqhi;
 	void *tmp;
 	u8 *iv;
 	struct scatterlist *sg;
@@ -362,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out;
 	nfrags = err;
 
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
 	err = -ENOMEM;
-	tmp = esp_alloc_tmp(aead, nfrags + 1);
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
 	if (!tmp)
 		goto out;
 
 	ESP_SKB_CB(skb)->tmp = tmp;
-	iv = esp_tmp_iv(aead, tmp);
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
 	req = esp_tmp_req(aead, iv);
 	asg = esp_req_sg(aead, req);
-	sg = asg + 1;
+	sg = asg + sglists;
 
 	skb->ip_summed = CHECKSUM_NONE;
 
@@ -382,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
-	sg_init_one(asg, esph, sizeof(*esph));
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
 
 	aead_request_set_callback(req, 0, esp_input_done, skb);
 	aead_request_set_crypt(req, sg, sg, elen, iv);
-	aead_request_set_assoc(req, asg, sizeof(*esph));
+	aead_request_set_assoc(req, asg, assoclen);
 
 	err = crypto_aead_decrypt(req);
 	if (err == -EINPROGRESS)
@@ -500,10 +554,20 @@ static int esp_init_authenc(struct xfrm_state *x)
 		goto error;
 
 	err = -ENAMETOOLONG;
-	if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)",
-		     x->aalg ? x->aalg->alg_name : "digest_null",
-		     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
-		goto error;
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authencesn(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	} else {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "authenc(%s,%s)",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+			goto error;
+	}
 
 	aead = crypto_alloc_aead(authenc_name, 0, 0);
 	err = PTR_ERR(aead);
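The three-segment scatterlists built above exist because, under extended sequence numbers (RFC 4303 ESN), only the low 32 bits of the 64-bit counter travel in the ESP header while the high 32 bits must still be covered by the ICV; the "authencesn" template therefore authenticates {spi, seq_hi, seq_no} as associated data. A sketch of those twelve bytes, with illustrative values:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t esn = 0x0000000100000005ULL;	/* 64-bit ESN counter */

		uint32_t spi    = 0x1234;		 /* sent on the wire */
		uint32_t seq_hi = (uint32_t)(esn >> 32); /* authenticated only */
		uint32_t seq_no = (uint32_t)esn;	 /* sent on the wire */

		/* assoclen = sizeof(*esph) + seqhilen = 8 + 4 = 12 bytes */
		printf("spi=%#x seq_hi=%u seq_no=%u assoclen=%zu\n",
		       spi, seq_hi, seq_no,
		       sizeof(spi) + sizeof(seq_hi) + sizeof(seq_no));
		return 0;
	}

Keeping seq_hi out of the packet but inside the ICV is what lets ESN extend the anti-replay window without changing the wire format.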
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd43a878..451088330bbb 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
 
-	local_table = fib_hash_table(RT_TABLE_LOCAL);
+	local_table = fib_trie_table(RT_TABLE_LOCAL);
 	if (local_table == NULL)
 		return -ENOMEM;
 
-	main_table = fib_hash_table(RT_TABLE_MAIN);
+	main_table = fib_trie_table(RT_TABLE_MAIN);
 	if (main_table == NULL)
 		goto fail;
 
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	if (tb)
 		return tb;
 
-	tb = fib_hash_table(id);
+	tb = fib_trie_table(id);
 	if (!tb)
 		return NULL;
 	h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
-void fib_select_default(struct net *net,
-			const struct flowi *flp, struct fib_result *res)
-{
-	struct fib_table *tb;
-	int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
-		return;
-	table = res->r->table;
-#endif
-	tb = fib_get_table(net, table);
-	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-		fib_table_select_default(tb, flp, res);
-}
-
 static void fib_flush(struct net *net)
 {
 	int flushed = 0;
@@ -147,46 +132,6 @@ static void fib_flush(struct net *net)
 	rt_cache_flush(net, -1);
 }
 
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
-	struct flowi fl = {
-		.fl4_dst = addr,
-	};
-	struct fib_result res = { 0 };
-	struct net_device *dev = NULL;
-	struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
-	rcu_read_lock();
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!local_table ||
-	    fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	if (res.type != RTN_LOCAL)
-		goto out;
-	dev = FIB_RES_DEV(res);
-
-	if (dev && devref)
-		dev_hold(dev);
-out:
-	rcu_read_unlock();
-	return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
 /*
  * Find address type as if only "dev" was present in the system. If
  * on_dev is NULL then all interfaces are taken into consideration.
@@ -195,7 +140,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 					    const struct net_device *dev,
 					    __be32 addr)
 {
-	struct flowi		fl = { .fl4_dst = addr };
+	struct flowi4		fl4 = { .daddr = addr };
 	struct fib_result	res;
 	unsigned ret = RTN_BROADCAST;
 	struct fib_table *local_table;
@@ -213,7 +158,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 	if (local_table) {
 		ret = RTN_UNICAST;
 		rcu_read_lock();
-		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
+		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
 		}
@@ -248,19 +193,21 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			u32 *itag, u32 mark)
 {
 	struct in_device *in_dev;
-	struct flowi fl = {
-		.fl4_dst = src,
-		.fl4_src = dst,
-		.fl4_tos = tos,
-		.mark = mark,
-		.iif = oif
-	};
+	struct flowi4 fl4;
 	struct fib_result res;
 	int no_addr, rpf, accept_local;
 	bool dev_match;
 	int ret;
 	struct net *net;
 
+	fl4.flowi4_oif = 0;
+	fl4.flowi4_iif = oif;
+	fl4.flowi4_mark = mark;
+	fl4.daddr = src;
+	fl4.saddr = dst;
+	fl4.flowi4_tos = tos;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
 	no_addr = rpf = accept_local = 0;
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
@@ -268,20 +215,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		rpf = IN_DEV_RPFILTER(in_dev);
 		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
 		if (mark && !IN_DEV_SRC_VMARK(in_dev))
-			fl.mark = 0;
+			fl4.flowi4_mark = 0;
 	}
 
 	if (in_dev == NULL)
 		goto e_inval;
 
 	net = dev_net(dev);
-	if (fib_lookup(net, &fl, &res))
+	if (fib_lookup(net, &fl4, &res))
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
 		if (res.type != RTN_LOCAL || !accept_local)
 			goto e_inval;
 	}
-	*spec_dst = FIB_RES_PREFSRC(res);
+	*spec_dst = FIB_RES_PREFSRC(net, res);
 	fib_combine_itag(itag, &res);
 	dev_match = false;
 
@@ -306,12 +253,12 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		goto last_resort;
 	if (rpf == 1)
 		goto e_rpf;
-	fl.oif = dev->ifindex;
+	fl4.flowi4_oif = dev->ifindex;
 
 	ret = 0;
-	if (fib_lookup(net, &fl, &res) == 0) {
+	if (fib_lookup(net, &fl4, &res) == 0) {
 		if (res.type == RTN_UNICAST) {
-			*spec_dst = FIB_RES_PREFSRC(res);
+			*spec_dst = FIB_RES_PREFSRC(net, res);
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 		}
 	}
@@ -775,12 +722,17 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 	}
 }
 
-static void fib_del_ifaddr(struct in_ifaddr *ifa)
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 {
 	struct in_device *in_dev = ifa->ifa_dev;
 	struct net_device *dev = in_dev->dev;
 	struct in_ifaddr *ifa1;
-	struct in_ifaddr *prim = ifa;
+	struct in_ifaddr *prim = ifa, *prim1 = NULL;
 	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
 	__be32 any = ifa->ifa_address & ifa->ifa_mask;
 #define LOCAL_OK	1
@@ -788,17 +740,26 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 #define BRD0_OK		4
 #define BRD1_OK		8
 	unsigned ok = 0;
+	int subnet = 0;		/* Primary network */
+	int gone = 1;		/* Address is missing */
+	int same_prefsrc = 0;	/* Another primary with same IP */
 
-	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
-		fib_magic(RTM_DELROUTE,
-			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
-			  any, ifa->ifa_prefixlen, prim);
-	else {
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
 		if (prim == NULL) {
 			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
 			return;
 		}
+		if (iprim && iprim != prim) {
+			printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
+			return;
+		}
+	} else if (!ipv4_is_zeronet(any) &&
+		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+		fib_magic(RTM_DELROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  any, ifa->ifa_prefixlen, prim);
+		subnet = 1;
 	}
 
 	/* Deletion is more complicated than add.
@@ -808,6 +769,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	 */
 
 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+		if (ifa1 == ifa) {
+			/* promotion, keep the IP */
+			gone = 0;
+			continue;
+		}
+		/* Ignore IFAs from our subnet */
+		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, iprim))
+			continue;
+
+		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
+		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+			/* Another address from our subnet? */
+			if (ifa1->ifa_mask == prim->ifa_mask &&
+			    inet_ifa_match(ifa1->ifa_address, prim))
+				prim1 = prim;
+			else {
+				/* We reached the secondaries, so
+				 * same_prefsrc should be determined.
+				 */
+				if (!same_prefsrc)
+					continue;
+				/* Search new prim1 if ifa1 is not
+				 * using the current prim1
+				 */
+				if (!prim1 ||
+				    ifa1->ifa_mask != prim1->ifa_mask ||
+				    !inet_ifa_match(ifa1->ifa_address, prim1))
+					prim1 = inet_ifa_byprefix(in_dev,
+							ifa1->ifa_address,
+							ifa1->ifa_mask);
+				if (!prim1)
+					continue;
+				if (prim1->ifa_local != prim->ifa_local)
+					continue;
+			}
+		} else {
+			if (prim->ifa_local != ifa1->ifa_local)
+				continue;
+			prim1 = ifa1;
+			if (prim != prim1)
+				same_prefsrc = 1;
+		}
 		if (ifa->ifa_local == ifa1->ifa_local)
 			ok |= LOCAL_OK;
 		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
@@ -816,19 +820,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 			ok |= BRD1_OK;
 		if (any == ifa1->ifa_broadcast)
 			ok |= BRD0_OK;
+		/* primary has network specific broadcasts */
+		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+			if (!ipv4_is_zeronet(any1)) {
+				if (ifa->ifa_broadcast == brd1 ||
+				    ifa->ifa_broadcast == any1)
+					ok |= BRD_OK;
+				if (brd == brd1 || brd == any1)
+					ok |= BRD1_OK;
+				if (any == brd1 || any == any1)
+					ok |= BRD0_OK;
+			}
+		}
 	}
 
 	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
-	if (!(ok & BRD1_OK))
-		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
-	if (!(ok & BRD0_OK))
-		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+	if (subnet && ifa->ifa_prefixlen < 31) {
+		if (!(ok & BRD1_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+		if (!(ok & BRD0_OK))
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+	}
 	if (!(ok & LOCAL_OK)) {
 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
 
 		/* Check, that this local address finally disappeared. */
-		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+		if (gone &&
+		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
 			/* And the last, but not the least thing.
 			 * We must flush stray FIB entries.
 			 *
@@ -849,11 +871,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 {
 
 	struct fib_result	res;
-	struct flowi		fl = {
-		.mark = frn->fl_mark,
-		.fl4_dst = frn->fl_addr,
-		.fl4_tos = frn->fl_tos,
-		.fl4_scope = frn->fl_scope,
+	struct flowi4		fl4 = {
+		.flowi4_mark = frn->fl_mark,
+		.daddr = frn->fl_addr,
+		.flowi4_tos = frn->fl_tos,
+		.flowi4_scope = frn->fl_scope,
 	};
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -866,7 +888,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 
 	frn->tb_id = tb->tb_id;
 	rcu_read_lock();
-	frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
+	frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 
 	if (!frn->err) {
 		frn->prefixlen = res.prefixlen;
@@ -938,6 +960,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 {
 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
 	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
 
 	switch (event) {
 	case NETDEV_UP:
@@ -945,10 +968,12 @@
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		fib_sync_up(dev);
 #endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
 		rt_cache_flush(dev_net(dev), -1);
 		break;
 	case NETDEV_DOWN:
-		fib_del_ifaddr(ifa);
+		fib_del_ifaddr(ifa, NULL);
+		atomic_inc(&net->ipv4.dev_addr_genid);
 		if (ifa->ifa_dev->ifa_list == NULL) {
 			/* Last address was deleted from this interface.
 			 * Disable IP.
@@ -966,6 +991,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 {
 	struct net_device *dev = ptr;
 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_UNREGISTER) {
 		fib_disable_ip(dev, 2, -1);
@@ -983,6 +1009,7 @@
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		fib_sync_up(dev);
 #endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
 		rt_cache_flush(dev_net(dev), -1);
 		break;
 	case NETDEV_DOWN:
@@ -1041,6 +1068,7 @@ static void ip_fib_net_exit(struct net *net)
 	fib4_rules_exit(net);
 #endif
 
+	rtnl_lock();
 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
 		struct fib_table *tb;
 		struct hlist_head *head;
@@ -1053,6 +1081,7 @@
 			fib_free_table(tb);
 		}
 	}
+	rtnl_unlock();
 	kfree(net->ipv4.fib_table_hash);
 }
 
@@ -1101,5 +1130,5 @@ void __init ip_fib_init(void)
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-	fib_hash_init();
+	fib_trie_init();
 }
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b21..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 FIB: lookup engine and maintenance routines.
7 *
8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 */
15
16#include <asm/uaccess.h>
17#include <asm/system.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/mm.h>
22#include <linux/string.h>
23#include <linux/socket.h>
24#include <linux/sockios.h>
25#include <linux/errno.h>
26#include <linux/in.h>
27#include <linux/inet.h>
28#include <linux/inetdevice.h>
29#include <linux/netdevice.h>
30#include <linux/if_arp.h>
31#include <linux/proc_fs.h>
32#include <linux/skbuff.h>
33#include <linux/netlink.h>
34#include <linux/init.h>
35#include <linux/slab.h>
36
37#include <net/net_namespace.h>
38#include <net/ip.h>
39#include <net/protocol.h>
40#include <net/route.h>
41#include <net/tcp.h>
42#include <net/sock.h>
43#include <net/ip_fib.h>
44
45#include "fib_lookup.h"
46
47static struct kmem_cache *fn_hash_kmem __read_mostly;
48static struct kmem_cache *fn_alias_kmem __read_mostly;
49
50struct fib_node {
51 struct hlist_node fn_hash;
52 struct list_head fn_alias;
53 __be32 fn_key;
54 struct fib_alias fn_embedded_alias;
55};
56
57#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
58
59struct fn_zone {
60 struct fn_zone __rcu *fz_next; /* Next not empty zone */
61 struct hlist_head __rcu *fz_hash; /* Hash table pointer */
62 seqlock_t fz_lock;
63 u32 fz_hashmask; /* (fz_divisor - 1) */
64
65 u8 fz_order; /* Zone order (0..32) */
66 u8 fz_revorder; /* 32 - fz_order */
67 __be32 fz_mask; /* inet_make_mask(order) */
68#define FZ_MASK(fz) ((fz)->fz_mask)
69
70 struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
71
72 int fz_nent; /* Number of entries */
73 int fz_divisor; /* Hash size (mask+1) */
74};
75
76struct fn_hash {
77 struct fn_zone *fn_zones[33];
78 struct fn_zone __rcu *fn_zone_list;
79};
80
81static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
82{
83 u32 h = ntohl(key) >> fz->fz_revorder;
84 h ^= (h>>20);
85 h ^= (h>>10);
86 h ^= (h>>5);
87 h &= fz->fz_hashmask;
88 return h;
89}
90
91static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
92{
93 return dst & FZ_MASK(fz);
94}
95
96static unsigned int fib_hash_genid;
97
98#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
99
100static struct hlist_head *fz_hash_alloc(int divisor)
101{
102 unsigned long size = divisor * sizeof(struct hlist_head);
103
104 if (size <= PAGE_SIZE)
105 return kzalloc(size, GFP_KERNEL);
106
107 return (struct hlist_head *)
108 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
109}
110
111/* The fib hash lock must be held when this is called. */
112static inline void fn_rebuild_zone(struct fn_zone *fz,
113 struct hlist_head *old_ht,
114 int old_divisor)
115{
116 int i;
117
118 for (i = 0; i < old_divisor; i++) {
119 struct hlist_node *node, *n;
120 struct fib_node *f;
121
122 hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
123 struct hlist_head *new_head;
124
125 hlist_del_rcu(&f->fn_hash);
126
127 new_head = rcu_dereference_protected(fz->fz_hash, 1) +
128 fn_hash(f->fn_key, fz);
129 hlist_add_head_rcu(&f->fn_hash, new_head);
130 }
131 }
132}
133
134static void fz_hash_free(struct hlist_head *hash, int divisor)
135{
136 unsigned long size = divisor * sizeof(struct hlist_head);
137
138 if (size <= PAGE_SIZE)
139 kfree(hash);
140 else
141 free_pages((unsigned long)hash, get_order(size));
142}
143
144static void fn_rehash_zone(struct fn_zone *fz)
145{
146 struct hlist_head *ht, *old_ht;
147 int old_divisor, new_divisor;
148 u32 new_hashmask;
149
150 new_divisor = old_divisor = fz->fz_divisor;
151
152 switch (old_divisor) {
153 case EMBEDDED_HASH_SIZE:
154 new_divisor *= EMBEDDED_HASH_SIZE;
155 break;
156 case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
157 new_divisor *= (EMBEDDED_HASH_SIZE/2);
158 break;
159 default:
160 if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
 161			printk(KERN_CRIT "fib_hash.c: bad divisor %d!\n", old_divisor);
162 return;
163 }
164 new_divisor = (old_divisor << 1);
165 break;
166 }
167
168 new_hashmask = (new_divisor - 1);
169
170#if RT_CACHE_DEBUG >= 2
171 printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
172 fz->fz_order, old_divisor);
173#endif
174
175 ht = fz_hash_alloc(new_divisor);
176
177 if (ht) {
178 struct fn_zone nfz;
179
180 memcpy(&nfz, fz, sizeof(nfz));
181
182 write_seqlock_bh(&fz->fz_lock);
183 old_ht = rcu_dereference_protected(fz->fz_hash, 1);
184 RCU_INIT_POINTER(nfz.fz_hash, ht);
185 nfz.fz_hashmask = new_hashmask;
186 nfz.fz_divisor = new_divisor;
187 fn_rebuild_zone(&nfz, old_ht, old_divisor);
188 fib_hash_genid++;
189 rcu_assign_pointer(fz->fz_hash, ht);
190 fz->fz_hashmask = new_hashmask;
191 fz->fz_divisor = new_divisor;
192 write_sequnlock_bh(&fz->fz_lock);
193
194 if (old_ht != fz->fz_embedded_hash) {
195 synchronize_rcu();
196 fz_hash_free(old_ht, old_divisor);
197 }
198 }
199}
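	/* Editorial note on the pattern above: the zone is first copied
	 * into the local nfz so fn_rebuild_zone() can rehash entries
	 * into the new table with the new divisor while readers still
	 * see the old fz_hash.  The write_seqlock/rcu_assign_pointer
	 * pair then publishes the new table atomically with respect to
	 * the retry loop in fib_table_lookup(), and the old table is
	 * freed only after an RCU grace period (unless it is the
	 * embedded hash, which lives inside the zone itself).
	 */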
200
201static void fn_free_node_rcu(struct rcu_head *head)
202{
203 struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
204
205 kmem_cache_free(fn_hash_kmem, f);
206}
207
208static inline void fn_free_node(struct fib_node *f)
209{
210 call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
211}
212
213static void fn_free_alias_rcu(struct rcu_head *head)
214{
215 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
216
217 kmem_cache_free(fn_alias_kmem, fa);
218}
219
220static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
221{
222 fib_release_info(fa->fa_info);
223 if (fa == &f->fn_embedded_alias)
224 fa->fa_info = NULL;
225 else
226 call_rcu(&fa->rcu, fn_free_alias_rcu);
227}
228
229static struct fn_zone *
230fn_new_zone(struct fn_hash *table, int z)
231{
232 int i;
233 struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
234 if (!fz)
235 return NULL;
236
237 seqlock_init(&fz->fz_lock);
238 fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
239 fz->fz_hashmask = fz->fz_divisor - 1;
240 RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
241 fz->fz_order = z;
242 fz->fz_revorder = 32 - z;
243 fz->fz_mask = inet_make_mask(z);
244
 245	/* Find the first non-empty zone with a more specific mask */
246 for (i = z + 1; i <= 32; i++)
247 if (table->fn_zones[i])
248 break;
249 if (i > 32) {
250 /* No more specific masks, we are the first. */
251 rcu_assign_pointer(fz->fz_next,
252 rtnl_dereference(table->fn_zone_list));
253 rcu_assign_pointer(table->fn_zone_list, fz);
254 } else {
255 rcu_assign_pointer(fz->fz_next,
256 rtnl_dereference(table->fn_zones[i]->fz_next));
257 rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
258 }
259 table->fn_zones[z] = fz;
260 fib_hash_genid++;
261 return fz;
262}
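	/* Editorial note: fn_new_zone() keeps fn_zone_list ordered from
	 * the most specific mask to the least specific one, so the walk
	 * in fib_table_lookup() below returns the longest-prefix match
	 * first.
	 */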
263
264int fib_table_lookup(struct fib_table *tb,
265 const struct flowi *flp, struct fib_result *res,
266 int fib_flags)
267{
268 int err;
269 struct fn_zone *fz;
270 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
271
272 rcu_read_lock();
273 for (fz = rcu_dereference(t->fn_zone_list);
274 fz != NULL;
275 fz = rcu_dereference(fz->fz_next)) {
276 struct hlist_head *head;
277 struct hlist_node *node;
278 struct fib_node *f;
279 __be32 k;
280 unsigned int seq;
281
282 do {
283 seq = read_seqbegin(&fz->fz_lock);
284 k = fz_key(flp->fl4_dst, fz);
285
286 head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
287 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
288 if (f->fn_key != k)
289 continue;
290
291 err = fib_semantic_match(&f->fn_alias,
292 flp, res,
293 fz->fz_order, fib_flags);
294 if (err <= 0)
295 goto out;
296 }
297 } while (read_seqretry(&fz->fz_lock, seq));
298 }
299 err = 1;
300out:
301 rcu_read_unlock();
302 return err;
303}
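	/* The read side above combines RCU (safe traversal while
	 * fn_rehash_zone() swaps tables) with a seqlock retry that gives
	 * a consistent view of fz_hash/fz_hashmask as a unit.  A minimal
	 * sketch of the same idiom, with scan_bucket() invented purely
	 * for illustration:
	 *
	 *	do {
	 *		seq = read_seqbegin(&fz->fz_lock);
	 *		err = scan_bucket(fz, key);
	 *	} while (read_seqretry(&fz->fz_lock, seq));
	 */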
304
305void fib_table_select_default(struct fib_table *tb,
306 const struct flowi *flp, struct fib_result *res)
307{
308 int order, last_idx;
309 struct hlist_node *node;
310 struct fib_node *f;
311 struct fib_info *fi = NULL;
312 struct fib_info *last_resort;
313 struct fn_hash *t = (struct fn_hash *)tb->tb_data;
314 struct fn_zone *fz = t->fn_zones[0];
315 struct hlist_head *head;
316
317 if (fz == NULL)
318 return;
319
320 last_idx = -1;
321 last_resort = NULL;
322 order = -1;
323
324 rcu_read_lock();
325 head = rcu_dereference(fz->fz_hash);
326 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
327 struct fib_alias *fa;
328
329 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
330 struct fib_info *next_fi = fa->fa_info;
331
332 if (fa->fa_scope != res->scope ||
333 fa->fa_type != RTN_UNICAST)
334 continue;
335
336 if (next_fi->fib_priority > res->fi->fib_priority)
337 break;
338 if (!next_fi->fib_nh[0].nh_gw ||
339 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
340 continue;
341
342 fib_alias_accessed(fa);
343
344 if (fi == NULL) {
345 if (next_fi != res->fi)
346 break;
347 } else if (!fib_detect_death(fi, order, &last_resort,
348 &last_idx, tb->tb_default)) {
349 fib_result_assign(res, fi);
350 tb->tb_default = order;
351 goto out;
352 }
353 fi = next_fi;
354 order++;
355 }
356 }
357
358 if (order <= 0 || fi == NULL) {
359 tb->tb_default = -1;
360 goto out;
361 }
362
363 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
364 tb->tb_default)) {
365 fib_result_assign(res, fi);
366 tb->tb_default = order;
367 goto out;
368 }
369
370 if (last_idx >= 0)
371 fib_result_assign(res, last_resort);
372 tb->tb_default = last_idx;
373out:
374 rcu_read_unlock();
375}
376
377/* Insert node F into FZ. */
378static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
379{
380 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
381
382 hlist_add_head_rcu(&f->fn_hash, head);
383}
384
385/* Return the node in FZ matching KEY. */
386static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
387{
388 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
389 struct hlist_node *node;
390 struct fib_node *f;
391
392 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
393 if (f->fn_key == key)
394 return f;
395 }
396
397 return NULL;
398}
399
400
401static struct fib_alias *fib_fast_alloc(struct fib_node *f)
402{
403 struct fib_alias *fa = &f->fn_embedded_alias;
404
405 if (fa->fa_info != NULL)
406 fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
407 return fa;
408}
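	/* Editorial note: fib_fast_alloc() exploits the fib_alias
	 * embedded in every fib_node, so the common case of one route
	 * per prefix needs no extra allocation; fa_info == NULL marks
	 * the embedded slot as free (see fn_free_alias(), which clears
	 * it rather than call_rcu-freeing it).
	 */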
409
410/* Caller must hold RTNL. */
411int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
412{
413 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
414 struct fib_node *new_f = NULL;
415 struct fib_node *f;
416 struct fib_alias *fa, *new_fa;
417 struct fn_zone *fz;
418 struct fib_info *fi;
419 u8 tos = cfg->fc_tos;
420 __be32 key;
421 int err;
422
423 if (cfg->fc_dst_len > 32)
424 return -EINVAL;
425
426 fz = table->fn_zones[cfg->fc_dst_len];
427 if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
428 return -ENOBUFS;
429
430 key = 0;
431 if (cfg->fc_dst) {
432 if (cfg->fc_dst & ~FZ_MASK(fz))
433 return -EINVAL;
434 key = fz_key(cfg->fc_dst, fz);
435 }
436
437 fi = fib_create_info(cfg);
438 if (IS_ERR(fi))
439 return PTR_ERR(fi);
440
441 if (fz->fz_nent > (fz->fz_divisor<<1) &&
442 fz->fz_divisor < FZ_MAX_DIVISOR &&
443 (cfg->fc_dst_len == 32 ||
444 (1 << cfg->fc_dst_len) > fz->fz_divisor))
445 fn_rehash_zone(fz);
446
447 f = fib_find_node(fz, key);
448
449 if (!f)
450 fa = NULL;
451 else
452 fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
453
 454	/* Now fa, if non-NULL, points to the first fib alias with
 455	 * the same keys [prefix,tos,priority] if such a key already
 456	 * exists, or to the node before which we will insert the new one.
 457	 *
 458	 * If fa is NULL, we will need to allocate a new alias and
 459	 * insert it at the head of f.
 460	 *
 461	 * If f is NULL, no fib node matched the destination key
 462	 * and we need to allocate a new one of those as well.
 463	 */
464
465 if (fa && fa->fa_tos == tos &&
466 fa->fa_info->fib_priority == fi->fib_priority) {
467 struct fib_alias *fa_first, *fa_match;
468
469 err = -EEXIST;
470 if (cfg->fc_nlflags & NLM_F_EXCL)
471 goto out;
472
 473		/* We have two goals:
 474		 * 1. Find an exact match for type, scope and fib_info,
 475		 *    to avoid duplicate routes.
 476		 * 2. Find the next 'fa' (or the head); NLM_F_APPEND inserts before it.
 477		 */
478 fa_match = NULL;
479 fa_first = fa;
480 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
481 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
482 if (fa->fa_tos != tos)
483 break;
484 if (fa->fa_info->fib_priority != fi->fib_priority)
485 break;
486 if (fa->fa_type == cfg->fc_type &&
487 fa->fa_scope == cfg->fc_scope &&
488 fa->fa_info == fi) {
489 fa_match = fa;
490 break;
491 }
492 }
493
494 if (cfg->fc_nlflags & NLM_F_REPLACE) {
495 u8 state;
496
497 fa = fa_first;
498 if (fa_match) {
499 if (fa == fa_match)
500 err = 0;
501 goto out;
502 }
503 err = -ENOBUFS;
504 new_fa = fib_fast_alloc(f);
505 if (new_fa == NULL)
506 goto out;
507
508 new_fa->fa_tos = fa->fa_tos;
509 new_fa->fa_info = fi;
510 new_fa->fa_type = cfg->fc_type;
511 new_fa->fa_scope = cfg->fc_scope;
512 state = fa->fa_state;
513 new_fa->fa_state = state & ~FA_S_ACCESSED;
514 fib_hash_genid++;
515 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
516
517 fn_free_alias(fa, f);
518 if (state & FA_S_ACCESSED)
519 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
520 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
521 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
522 return 0;
523 }
524
525 /* Error if we find a perfect match which
526 * uses the same scope, type, and nexthop
527 * information.
528 */
529 if (fa_match)
530 goto out;
531
532 if (!(cfg->fc_nlflags & NLM_F_APPEND))
533 fa = fa_first;
534 }
535
536 err = -ENOENT;
537 if (!(cfg->fc_nlflags & NLM_F_CREATE))
538 goto out;
539
540 err = -ENOBUFS;
541
542 if (!f) {
543 new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
544 if (new_f == NULL)
545 goto out;
546
547 INIT_HLIST_NODE(&new_f->fn_hash);
548 INIT_LIST_HEAD(&new_f->fn_alias);
549 new_f->fn_key = key;
550 f = new_f;
551 }
552
553 new_fa = fib_fast_alloc(f);
554 if (new_fa == NULL)
555 goto out;
556
557 new_fa->fa_info = fi;
558 new_fa->fa_tos = tos;
559 new_fa->fa_type = cfg->fc_type;
560 new_fa->fa_scope = cfg->fc_scope;
561 new_fa->fa_state = 0;
562
563 /*
564 * Insert new entry to the list.
565 */
566
567 if (new_f)
568 fib_insert_node(fz, new_f);
569 list_add_tail_rcu(&new_fa->fa_list,
570 (fa ? &fa->fa_list : &f->fn_alias));
571 fib_hash_genid++;
572
573 if (new_f)
574 fz->fz_nent++;
575 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
576
577 rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
578 &cfg->fc_nlinfo, 0);
579 return 0;
580
581out:
582 if (new_f)
583 kmem_cache_free(fn_hash_kmem, new_f);
584 fib_release_info(fi);
585 return err;
586}
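	/* Editorial summary of the NLM_F_* handling above: NLM_F_EXCL
	 * turns an existing [prefix,tos,priority] match into -EEXIST;
	 * NLM_F_REPLACE swaps the first matching alias in place via
	 * list_replace_rcu(); NLM_F_APPEND places the new alias after
	 * the run of matching entries instead of before it; and
	 * NLM_F_CREATE is required before any new alias or node is
	 * allocated.
	 */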
587
588int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
589{
590 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
591 struct fib_node *f;
592 struct fib_alias *fa, *fa_to_delete;
593 struct fn_zone *fz;
594 __be32 key;
595
596 if (cfg->fc_dst_len > 32)
597 return -EINVAL;
598
599 if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
600 return -ESRCH;
601
602 key = 0;
603 if (cfg->fc_dst) {
604 if (cfg->fc_dst & ~FZ_MASK(fz))
605 return -EINVAL;
606 key = fz_key(cfg->fc_dst, fz);
607 }
608
609 f = fib_find_node(fz, key);
610
611 if (!f)
612 fa = NULL;
613 else
614 fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
615 if (!fa)
616 return -ESRCH;
617
618 fa_to_delete = NULL;
619 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
620 list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
621 struct fib_info *fi = fa->fa_info;
622
623 if (fa->fa_tos != cfg->fc_tos)
624 break;
625
626 if ((!cfg->fc_type ||
627 fa->fa_type == cfg->fc_type) &&
628 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
629 fa->fa_scope == cfg->fc_scope) &&
630 (!cfg->fc_protocol ||
631 fi->fib_protocol == cfg->fc_protocol) &&
632 fib_nh_match(cfg, fi) == 0) {
633 fa_to_delete = fa;
634 break;
635 }
636 }
637
638 if (fa_to_delete) {
639 int kill_fn;
640
641 fa = fa_to_delete;
642 rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
643 tb->tb_id, &cfg->fc_nlinfo, 0);
644
645 kill_fn = 0;
646 list_del_rcu(&fa->fa_list);
647 if (list_empty(&f->fn_alias)) {
648 hlist_del_rcu(&f->fn_hash);
649 kill_fn = 1;
650 }
651 fib_hash_genid++;
652
653 if (fa->fa_state & FA_S_ACCESSED)
654 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
655 fn_free_alias(fa, f);
656 if (kill_fn) {
657 fn_free_node(f);
658 fz->fz_nent--;
659 }
660
661 return 0;
662 }
663 return -ESRCH;
664}
665
666static int fn_flush_list(struct fn_zone *fz, int idx)
667{
668 struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
669 struct hlist_node *node, *n;
670 struct fib_node *f;
671 int found = 0;
672
673 hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
674 struct fib_alias *fa, *fa_node;
675 int kill_f;
676
677 kill_f = 0;
678 list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
679 struct fib_info *fi = fa->fa_info;
680
681 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
682 list_del_rcu(&fa->fa_list);
683 if (list_empty(&f->fn_alias)) {
684 hlist_del_rcu(&f->fn_hash);
685 kill_f = 1;
686 }
687 fib_hash_genid++;
688
689 fn_free_alias(fa, f);
690 found++;
691 }
692 }
693 if (kill_f) {
694 fn_free_node(f);
695 fz->fz_nent--;
696 }
697 }
698 return found;
699}
700
701/* Caller must hold RTNL. */
702int fib_table_flush(struct fib_table *tb)
703{
704 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
705 struct fn_zone *fz;
706 int found = 0;
707
708 for (fz = rtnl_dereference(table->fn_zone_list);
709 fz != NULL;
710 fz = rtnl_dereference(fz->fz_next)) {
711 int i;
712
713 for (i = fz->fz_divisor - 1; i >= 0; i--)
714 found += fn_flush_list(fz, i);
715 }
716 return found;
717}
718
719void fib_free_table(struct fib_table *tb)
720{
721 struct fn_hash *table = (struct fn_hash *) tb->tb_data;
722 struct fn_zone *fz, *next;
723
724 next = table->fn_zone_list;
725 while (next != NULL) {
726 fz = next;
727 next = fz->fz_next;
728
729 if (fz->fz_hash != fz->fz_embedded_hash)
730 fz_hash_free(fz->fz_hash, fz->fz_divisor);
731
732 kfree(fz);
733 }
734
735 kfree(tb);
736}
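	/* Editorial note: fib_free_table() reads fn_zone_list and
	 * fz_hash without RCU accessors, presumably because it only
	 * runs once the table has been unlinked and no readers can
	 * remain.
	 */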
737
738static inline int
739fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
740 struct fib_table *tb,
741 struct fn_zone *fz,
742 struct hlist_head *head)
743{
744 struct hlist_node *node;
745 struct fib_node *f;
746 int i, s_i;
747
748 s_i = cb->args[4];
749 i = 0;
750 hlist_for_each_entry_rcu(f, node, head, fn_hash) {
751 struct fib_alias *fa;
752
753 list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
754 if (i < s_i)
755 goto next;
756
757 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
758 cb->nlh->nlmsg_seq,
759 RTM_NEWROUTE,
760 tb->tb_id,
761 fa->fa_type,
762 fa->fa_scope,
763 f->fn_key,
764 fz->fz_order,
765 fa->fa_tos,
766 fa->fa_info,
767 NLM_F_MULTI) < 0) {
768 cb->args[4] = i;
769 return -1;
770 }
771next:
772 i++;
773 }
774 }
775 cb->args[4] = i;
776 return skb->len;
777}
778
779static inline int
780fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
781 struct fib_table *tb,
782 struct fn_zone *fz)
783{
784 int h, s_h;
785 struct hlist_head *head = rcu_dereference(fz->fz_hash);
786
787 if (head == NULL)
788 return skb->len;
789 s_h = cb->args[3];
790 for (h = s_h; h < fz->fz_divisor; h++) {
791 if (hlist_empty(head + h))
792 continue;
793 if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
794 cb->args[3] = h;
795 return -1;
796 }
797 memset(&cb->args[4], 0,
798 sizeof(cb->args) - 4*sizeof(cb->args[0]));
799 }
800 cb->args[3] = h;
801 return skb->len;
802}
803
804int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
805 struct netlink_callback *cb)
806{
807 int m = 0, s_m;
808 struct fn_zone *fz;
809 struct fn_hash *table = (struct fn_hash *)tb->tb_data;
810
811 s_m = cb->args[2];
812 rcu_read_lock();
813 for (fz = rcu_dereference(table->fn_zone_list);
814 fz != NULL;
815 fz = rcu_dereference(fz->fz_next), m++) {
816 if (m < s_m)
817 continue;
818 if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
819 cb->args[2] = m;
820 rcu_read_unlock();
821 return -1;
822 }
823 memset(&cb->args[3], 0,
824 sizeof(cb->args) - 3*sizeof(cb->args[0]));
825 }
826 rcu_read_unlock();
827 cb->args[2] = m;
828 return skb->len;
829}
830
831void __init fib_hash_init(void)
832{
833 fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
834 0, SLAB_PANIC, NULL);
835
836 fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
837 0, SLAB_PANIC, NULL);
838
839}
840
841struct fib_table *fib_hash_table(u32 id)
842{
843 struct fib_table *tb;
844
845 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
846 GFP_KERNEL);
847 if (tb == NULL)
848 return NULL;
849
850 tb->tb_id = id;
851 tb->tb_default = -1;
852
853 memset(tb->tb_data, 0, sizeof(struct fn_hash));
854 return tb;
855}
856
857/* ------------------------------------------------------------------------ */
858#ifdef CONFIG_PROC_FS
859
860struct fib_iter_state {
861 struct seq_net_private p;
862 struct fn_zone *zone;
863 int bucket;
864 struct hlist_head *hash_head;
865 struct fib_node *fn;
866 struct fib_alias *fa;
867 loff_t pos;
868 unsigned int genid;
869 int valid;
870};
871
872static struct fib_alias *fib_get_first(struct seq_file *seq)
873{
874 struct fib_iter_state *iter = seq->private;
875 struct fib_table *main_table;
876 struct fn_hash *table;
877
878 main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
879 table = (struct fn_hash *)main_table->tb_data;
880
881 iter->bucket = 0;
882 iter->hash_head = NULL;
883 iter->fn = NULL;
884 iter->fa = NULL;
885 iter->pos = 0;
886 iter->genid = fib_hash_genid;
887 iter->valid = 1;
888
889 for (iter->zone = rcu_dereference(table->fn_zone_list);
890 iter->zone != NULL;
891 iter->zone = rcu_dereference(iter->zone->fz_next)) {
892 int maxslot;
893
894 if (!iter->zone->fz_nent)
895 continue;
896
897 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
898 maxslot = iter->zone->fz_divisor;
899
900 for (iter->bucket = 0; iter->bucket < maxslot;
901 ++iter->bucket, ++iter->hash_head) {
902 struct hlist_node *node;
903 struct fib_node *fn;
904
905 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
906 struct fib_alias *fa;
907
908 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
909 iter->fn = fn;
910 iter->fa = fa;
911 goto out;
912 }
913 }
914 }
915 }
916out:
917 return iter->fa;
918}
919
920static struct fib_alias *fib_get_next(struct seq_file *seq)
921{
922 struct fib_iter_state *iter = seq->private;
923 struct fib_node *fn;
924 struct fib_alias *fa;
925
926 /* Advance FA, if any. */
927 fn = iter->fn;
928 fa = iter->fa;
929 if (fa) {
930 BUG_ON(!fn);
931 list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
932 iter->fa = fa;
933 goto out;
934 }
935 }
936
937 fa = iter->fa = NULL;
938
939 /* Advance FN. */
940 if (fn) {
941 struct hlist_node *node = &fn->fn_hash;
942 hlist_for_each_entry_continue(fn, node, fn_hash) {
943 iter->fn = fn;
944
945 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
946 iter->fa = fa;
947 goto out;
948 }
949 }
950 }
951
952 fn = iter->fn = NULL;
953
954 /* Advance hash chain. */
955 if (!iter->zone)
956 goto out;
957
958 for (;;) {
959 struct hlist_node *node;
960 int maxslot;
961
962 maxslot = iter->zone->fz_divisor;
963
964 while (++iter->bucket < maxslot) {
965 iter->hash_head++;
966
967 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
968 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
969 iter->fn = fn;
970 iter->fa = fa;
971 goto out;
972 }
973 }
974 }
975
976 iter->zone = rcu_dereference(iter->zone->fz_next);
977
978 if (!iter->zone)
979 goto out;
980
981 iter->bucket = 0;
982 iter->hash_head = rcu_dereference(iter->zone->fz_hash);
983
984 hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
985 list_for_each_entry(fa, &fn->fn_alias, fa_list) {
986 iter->fn = fn;
987 iter->fa = fa;
988 goto out;
989 }
990 }
991 }
992out:
993 iter->pos++;
994 return fa;
995}
996
997static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
998{
999 struct fib_iter_state *iter = seq->private;
1000 struct fib_alias *fa;
1001
1002 if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
1003 fa = iter->fa;
1004 pos -= iter->pos;
1005 } else
1006 fa = fib_get_first(seq);
1007
1008 if (fa)
1009 while (pos && (fa = fib_get_next(seq)))
1010 --pos;
1011 return pos ? NULL : fa;
1012}
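	/* Editorial note: fib_get_idx() caches the last (pos, fa) pair;
	 * as long as fib_hash_genid is unchanged, a sequential read of
	 * /proc/net/route resumes where it left off instead of
	 * rescanning from the first alias.
	 */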
1013
1014static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
1015 __acquires(RCU)
1016{
1017 void *v = NULL;
1018
1019 rcu_read_lock();
1020 if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
1021 v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1022 return v;
1023}
1024
1025static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1026{
1027 ++*pos;
1028 return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
1029}
1030
1031static void fib_seq_stop(struct seq_file *seq, void *v)
1032 __releases(RCU)
1033{
1034 rcu_read_unlock();
1035}
1036
1037static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
1038{
1039 static const unsigned type2flags[RTN_MAX + 1] = {
1040 [7] = RTF_REJECT,
1041 [8] = RTF_REJECT,
1042 };
1043 unsigned flags = type2flags[type];
1044
1045 if (fi && fi->fib_nh->nh_gw)
1046 flags |= RTF_GATEWAY;
1047 if (mask == htonl(0xFFFFFFFF))
1048 flags |= RTF_HOST;
1049 flags |= RTF_UP;
1050 return flags;
1051}
1052
1053/*
1054 * This outputs /proc/net/route.
1055 *
1056 *	It always operates in backward-compatibility mode:
1057 *	the format of the file must not be changed.
1058 */
1059static int fib_seq_show(struct seq_file *seq, void *v)
1060{
1061 struct fib_iter_state *iter;
1062 int len;
1063 __be32 prefix, mask;
1064 unsigned flags;
1065 struct fib_node *f;
1066 struct fib_alias *fa;
1067 struct fib_info *fi;
1068
1069 if (v == SEQ_START_TOKEN) {
1070 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
1071 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
1072 "\tWindow\tIRTT");
1073 goto out;
1074 }
1075
1076 iter = seq->private;
1077 f = iter->fn;
1078 fa = iter->fa;
1079 fi = fa->fa_info;
1080 prefix = f->fn_key;
1081 mask = FZ_MASK(iter->zone);
1082 flags = fib_flag_trans(fa->fa_type, mask, fi);
1083 if (fi)
1084 seq_printf(seq,
1085 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1086 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
1087 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
1088 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
1089 fi->fib_window,
1090 fi->fib_rtt >> 3, &len);
1091 else
1092 seq_printf(seq,
1093 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
1094 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
1095
1096 seq_printf(seq, "%*s\n", 127 - len, "");
1097out:
1098 return 0;
1099}
1100
1101static const struct seq_operations fib_seq_ops = {
1102 .start = fib_seq_start,
1103 .next = fib_seq_next,
1104 .stop = fib_seq_stop,
1105 .show = fib_seq_show,
1106};
1107
1108static int fib_seq_open(struct inode *inode, struct file *file)
1109{
1110 return seq_open_net(inode, file, &fib_seq_ops,
1111 sizeof(struct fib_iter_state));
1112}
1113
1114static const struct file_operations fib_seq_fops = {
1115 .owner = THIS_MODULE,
1116 .open = fib_seq_open,
1117 .read = seq_read,
1118 .llseek = seq_lseek,
1119 .release = seq_release_net,
1120};
1121
1122int __net_init fib_proc_init(struct net *net)
1123{
1124 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
1125 return -ENOMEM;
1126 return 0;
1127}
1128
1129void __net_exit fib_proc_exit(struct net *net)
1130{
1131 proc_net_remove(net, "route");
1132}
1133#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec651..af0f14aba169 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -10,7 +10,6 @@ struct fib_alias {
10 struct fib_info *fa_info; 10 struct fib_info *fa_info;
11 u8 fa_tos; 11 u8 fa_tos;
12 u8 fa_type; 12 u8 fa_type;
13 u8 fa_scope;
14 u8 fa_state; 13 u8 fa_state;
15 struct rcu_head rcu; 14 struct rcu_head rcu;
16}; 15};
@@ -25,14 +24,11 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
25} 24}
26 25
27/* Exported by fib_semantics.c */ 26/* Exported by fib_semantics.c */
28extern int fib_semantic_match(struct list_head *head,
29 const struct flowi *flp,
30 struct fib_result *res, int prefixlen, int fib_flags);
31extern void fib_release_info(struct fib_info *); 27extern void fib_release_info(struct fib_info *);
32extern struct fib_info *fib_create_info(struct fib_config *cfg); 28extern struct fib_info *fib_create_info(struct fib_config *cfg);
33extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); 29extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
34extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 30extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
35 u32 tb_id, u8 type, u8 scope, __be32 dst, 31 u32 tb_id, u8 type, __be32 dst,
36 int dst_len, u8 tos, struct fib_info *fi, 32 int dst_len, u8 tos, struct fib_info *fi,
37 unsigned int); 33 unsigned int);
38extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, 34extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
@@ -51,4 +47,11 @@ static inline void fib_result_assign(struct fib_result *res,
51 res->fi = fi; 47 res->fi = fi;
52} 48}
53 49
50struct fib_prop {
51 int error;
52 u8 scope;
53};
54
55extern const struct fib_prop fib_props[RTN_MAX + 1];
56
54#endif /* _FIB_LOOKUP_H */ 57#endif /* _FIB_LOOKUP_H */
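Editorial note: with fa_scope removed here, per-route scope moves into
struct fib_info as fib_scope (see the fib_semantics.c hunks below), and
the fib_semantic_match() interface is dropped in favour of the exported
fib_props[] table.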
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7b..a53bb1b5b118 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,19 +41,19 @@ struct fib4_rule {
41 __be32 srcmask; 41 __be32 srcmask;
42 __be32 dst; 42 __be32 dst;
43 __be32 dstmask; 43 __be32 dstmask;
44#ifdef CONFIG_NET_CLS_ROUTE 44#ifdef CONFIG_IP_ROUTE_CLASSID
45 u32 tclassid; 45 u32 tclassid;
46#endif 46#endif
47}; 47};
48 48
49#ifdef CONFIG_NET_CLS_ROUTE 49#ifdef CONFIG_IP_ROUTE_CLASSID
50u32 fib_rules_tclass(struct fib_result *res) 50u32 fib_rules_tclass(const struct fib_result *res)
51{ 51{
52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; 52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
53} 53}
54#endif 54#endif
55 55
56int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) 56int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
57{ 57{
58 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
59 .result = res, 59 .result = res,
@@ -61,7 +61,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
61 }; 61 };
62 int err; 62 int err;
63 63
64 err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); 64 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
65 res->r = arg.rule; 65 res->r = arg.rule;
66 66
67 return err; 67 return err;
@@ -95,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
95 if (!tbl) 95 if (!tbl)
96 goto errout; 96 goto errout;
97 97
98 err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); 98 err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
99 if (err > 0) 99 if (err > 0)
100 err = -EAGAIN; 100 err = -EAGAIN;
101errout: 101errout:
@@ -106,14 +106,15 @@ errout:
106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) 106static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
107{ 107{
108 struct fib4_rule *r = (struct fib4_rule *) rule; 108 struct fib4_rule *r = (struct fib4_rule *) rule;
109 __be32 daddr = fl->fl4_dst; 109 struct flowi4 *fl4 = &fl->u.ip4;
110 __be32 saddr = fl->fl4_src; 110 __be32 daddr = fl4->daddr;
111 __be32 saddr = fl4->saddr;
111 112
112 if (((saddr ^ r->src) & r->srcmask) || 113 if (((saddr ^ r->src) & r->srcmask) ||
113 ((daddr ^ r->dst) & r->dstmask)) 114 ((daddr ^ r->dst) & r->dstmask))
114 return 0; 115 return 0;
115 116
116 if (r->tos && (r->tos != fl->fl4_tos)) 117 if (r->tos && (r->tos != fl4->flowi4_tos))
117 return 0; 118 return 0;
118 119
119 return 1; 120 return 1;
@@ -165,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
165 if (frh->dst_len) 166 if (frh->dst_len)
166 rule4->dst = nla_get_be32(tb[FRA_DST]); 167 rule4->dst = nla_get_be32(tb[FRA_DST]);
167 168
168#ifdef CONFIG_NET_CLS_ROUTE 169#ifdef CONFIG_IP_ROUTE_CLASSID
169 if (tb[FRA_FLOW]) 170 if (tb[FRA_FLOW])
170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 171 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171#endif 172#endif
@@ -195,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
195 if (frh->tos && (rule4->tos != frh->tos)) 196 if (frh->tos && (rule4->tos != frh->tos))
196 return 0; 197 return 0;
197 198
198#ifdef CONFIG_NET_CLS_ROUTE 199#ifdef CONFIG_IP_ROUTE_CLASSID
199 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) 200 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
200 return 0; 201 return 0;
201#endif 202#endif
@@ -224,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
224 if (rule4->src_len) 225 if (rule4->src_len)
225 NLA_PUT_BE32(skb, FRA_SRC, rule4->src); 226 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
226 227
227#ifdef CONFIG_NET_CLS_ROUTE 228#ifdef CONFIG_IP_ROUTE_CLASSID
228 if (rule4->tclassid) 229 if (rule4->tclassid)
229 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); 230 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
230#endif 231#endif
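Editorial note: the CONFIG_NET_CLS_ROUTE -> CONFIG_IP_ROUTE_CLASSID
rename above accompanies the flowi -> flowi4 conversion in this series.
A minimal before/after sketch of a lookup, with the surrounding
variables invented for illustration (field names as in the hunks):

	/* old: generic struct flowi with fl4_* aliases */
	struct flowi fl = { .fl4_dst = daddr, .fl4_tos = tos };
	err = fib_lookup(net, &fl, &res);

	/* new: IPv4-specific struct flowi4 */
	struct flowi4 fl4 = { .daddr = daddr, .flowi4_tos = tos };
	err = fib_lookup(net, &fl4, &res);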
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b7..641a5a2a9f9c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
49static DEFINE_SPINLOCK(fib_info_lock); 49static DEFINE_SPINLOCK(fib_info_lock);
50static struct hlist_head *fib_info_hash; 50static struct hlist_head *fib_info_hash;
51static struct hlist_head *fib_info_laddrhash; 51static struct hlist_head *fib_info_laddrhash;
52static unsigned int fib_hash_size; 52static unsigned int fib_info_hash_size;
53static unsigned int fib_info_cnt; 53static unsigned int fib_info_cnt;
54 54
55#define DEVINDEX_HASHBITS 8 55#define DEVINDEX_HASHBITS 8
@@ -90,11 +90,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
90#define endfor_nexthops(fi) } 90#define endfor_nexthops(fi) }
91 91
92 92
93static const struct 93const struct fib_prop fib_props[RTN_MAX + 1] = {
94{
95 int error;
96 u8 scope;
97} fib_props[RTN_MAX + 1] = {
98 [RTN_UNSPEC] = { 94 [RTN_UNSPEC] = {
99 .error = 0, 95 .error = 0,
100 .scope = RT_SCOPE_NOWHERE, 96 .scope = RT_SCOPE_NOWHERE,
@@ -152,6 +148,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
152{ 148{
153 struct fib_info *fi = container_of(head, struct fib_info, rcu); 149 struct fib_info *fi = container_of(head, struct fib_info, rcu);
154 150
151 if (fi->fib_metrics != (u32 *) dst_default_metrics)
152 kfree(fi->fib_metrics);
155 kfree(fi); 153 kfree(fi);
156} 154}
157 155
@@ -200,7 +198,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
200#ifdef CONFIG_IP_ROUTE_MULTIPATH 198#ifdef CONFIG_IP_ROUTE_MULTIPATH
201 nh->nh_weight != onh->nh_weight || 199 nh->nh_weight != onh->nh_weight ||
202#endif 200#endif
203#ifdef CONFIG_NET_CLS_ROUTE 201#ifdef CONFIG_IP_ROUTE_CLASSID
204 nh->nh_tclassid != onh->nh_tclassid || 202 nh->nh_tclassid != onh->nh_tclassid ||
205#endif 203#endif
206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) 204 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,10 +219,10 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
221 219
222static inline unsigned int fib_info_hashfn(const struct fib_info *fi) 220static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
223{ 221{
224 unsigned int mask = (fib_hash_size - 1); 222 unsigned int mask = (fib_info_hash_size - 1);
225 unsigned int val = fi->fib_nhs; 223 unsigned int val = fi->fib_nhs;
226 224
227 val ^= fi->fib_protocol; 225 val ^= (fi->fib_protocol << 8) | fi->fib_scope;
228 val ^= (__force u32)fi->fib_prefsrc; 226 val ^= (__force u32)fi->fib_prefsrc;
229 val ^= fi->fib_priority; 227 val ^= fi->fib_priority;
230 for_nexthops(fi) { 228 for_nexthops(fi) {
@@ -250,10 +248,11 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
250 if (fi->fib_nhs != nfi->fib_nhs) 248 if (fi->fib_nhs != nfi->fib_nhs)
251 continue; 249 continue;
252 if (nfi->fib_protocol == fi->fib_protocol && 250 if (nfi->fib_protocol == fi->fib_protocol &&
251 nfi->fib_scope == fi->fib_scope &&
253 nfi->fib_prefsrc == fi->fib_prefsrc && 252 nfi->fib_prefsrc == fi->fib_prefsrc &&
254 nfi->fib_priority == fi->fib_priority && 253 nfi->fib_priority == fi->fib_priority &&
255 memcmp(nfi->fib_metrics, fi->fib_metrics, 254 memcmp(nfi->fib_metrics, fi->fib_metrics,
256 sizeof(fi->fib_metrics)) == 0 && 255 sizeof(u32) * RTAX_MAX) == 0 &&
257 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && 256 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
258 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 257 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
259 return fi; 258 return fi;
@@ -330,7 +329,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
330 goto errout; 329 goto errout;
331 330
332 err = fib_dump_info(skb, info->pid, seq, event, tb_id, 331 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
333 fa->fa_type, fa->fa_scope, key, dst_len, 332 fa->fa_type, key, dst_len,
334 fa->fa_tos, fa->fa_info, nlm_flags); 333 fa->fa_tos, fa->fa_info, nlm_flags);
335 if (err < 0) { 334 if (err < 0) {
336 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ 335 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
@@ -422,7 +421,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
422 421
423 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 422 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
424 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; 423 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
425#ifdef CONFIG_NET_CLS_ROUTE 424#ifdef CONFIG_IP_ROUTE_CLASSID
426 nla = nla_find(attrs, attrlen, RTA_FLOW); 425 nla = nla_find(attrs, attrlen, RTA_FLOW);
427 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 426 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
428#endif 427#endif
@@ -476,7 +475,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 475 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
477 if (nla && nla_get_be32(nla) != nh->nh_gw) 476 if (nla && nla_get_be32(nla) != nh->nh_gw)
478 return 1; 477 return 1;
479#ifdef CONFIG_NET_CLS_ROUTE 478#ifdef CONFIG_IP_ROUTE_CLASSID
480 nla = nla_find(attrs, attrlen, RTA_FLOW); 479 nla = nla_find(attrs, attrlen, RTA_FLOW);
481 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 480 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
482 return 1; 481 return 1;
@@ -562,16 +561,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
562 } 561 }
563 rcu_read_lock(); 562 rcu_read_lock();
564 { 563 {
565 struct flowi fl = { 564 struct flowi4 fl4 = {
566 .fl4_dst = nh->nh_gw, 565 .daddr = nh->nh_gw,
567 .fl4_scope = cfg->fc_scope + 1, 566 .flowi4_scope = cfg->fc_scope + 1,
568 .oif = nh->nh_oif, 567 .flowi4_oif = nh->nh_oif,
569 }; 568 };
570 569
571 /* It is not necessary, but requires a bit of thinking */ 570 /* It is not necessary, but requires a bit of thinking */
572 if (fl.fl4_scope < RT_SCOPE_LINK) 571 if (fl4.flowi4_scope < RT_SCOPE_LINK)
573 fl.fl4_scope = RT_SCOPE_LINK; 572 fl4.flowi4_scope = RT_SCOPE_LINK;
574 err = fib_lookup(net, &fl, &res); 573 err = fib_lookup(net, &fl4, &res);
575 if (err) { 574 if (err) {
576 rcu_read_unlock(); 575 rcu_read_unlock();
577 return err; 576 return err;
@@ -613,14 +612,14 @@ out:
613 612
614static inline unsigned int fib_laddr_hashfn(__be32 val) 613static inline unsigned int fib_laddr_hashfn(__be32 val)
615{ 614{
616 unsigned int mask = (fib_hash_size - 1); 615 unsigned int mask = (fib_info_hash_size - 1);
617 616
618 return ((__force u32)val ^ 617 return ((__force u32)val ^
619 ((__force u32)val >> 7) ^ 618 ((__force u32)val >> 7) ^
620 ((__force u32)val >> 14)) & mask; 619 ((__force u32)val >> 14)) & mask;
621} 620}
622 621
623static struct hlist_head *fib_hash_alloc(int bytes) 622static struct hlist_head *fib_info_hash_alloc(int bytes)
624{ 623{
625 if (bytes <= PAGE_SIZE) 624 if (bytes <= PAGE_SIZE)
626 return kzalloc(bytes, GFP_KERNEL); 625 return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +629,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
630 get_order(bytes)); 629 get_order(bytes));
631} 630}
632 631
633static void fib_hash_free(struct hlist_head *hash, int bytes) 632static void fib_info_hash_free(struct hlist_head *hash, int bytes)
634{ 633{
635 if (!hash) 634 if (!hash)
636 return; 635 return;
@@ -641,18 +640,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
641 free_pages((unsigned long) hash, get_order(bytes)); 640 free_pages((unsigned long) hash, get_order(bytes));
642} 641}
643 642
644static void fib_hash_move(struct hlist_head *new_info_hash, 643static void fib_info_hash_move(struct hlist_head *new_info_hash,
645 struct hlist_head *new_laddrhash, 644 struct hlist_head *new_laddrhash,
646 unsigned int new_size) 645 unsigned int new_size)
647{ 646{
648 struct hlist_head *old_info_hash, *old_laddrhash; 647 struct hlist_head *old_info_hash, *old_laddrhash;
649 unsigned int old_size = fib_hash_size; 648 unsigned int old_size = fib_info_hash_size;
650 unsigned int i, bytes; 649 unsigned int i, bytes;
651 650
652 spin_lock_bh(&fib_info_lock); 651 spin_lock_bh(&fib_info_lock);
653 old_info_hash = fib_info_hash; 652 old_info_hash = fib_info_hash;
654 old_laddrhash = fib_info_laddrhash; 653 old_laddrhash = fib_info_laddrhash;
655 fib_hash_size = new_size; 654 fib_info_hash_size = new_size;
656 655
657 for (i = 0; i < old_size; i++) { 656 for (i = 0; i < old_size; i++) {
658 struct hlist_head *head = &fib_info_hash[i]; 657 struct hlist_head *head = &fib_info_hash[i];
@@ -693,8 +692,18 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
693 spin_unlock_bh(&fib_info_lock); 692 spin_unlock_bh(&fib_info_lock);
694 693
695 bytes = old_size * sizeof(struct hlist_head *); 694 bytes = old_size * sizeof(struct hlist_head *);
696 fib_hash_free(old_info_hash, bytes); 695 fib_info_hash_free(old_info_hash, bytes);
697 fib_hash_free(old_laddrhash, bytes); 696 fib_info_hash_free(old_laddrhash, bytes);
697}
698
699__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
700{
701 nh->nh_saddr = inet_select_addr(nh->nh_dev,
702 nh->nh_gw,
703 nh->nh_parent->fib_scope);
704 nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
705
706 return nh->nh_saddr;
698} 707}
699 708
700struct fib_info *fib_create_info(struct fib_config *cfg) 709struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -705,6 +714,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
705 int nhs = 1; 714 int nhs = 1;
706 struct net *net = cfg->fc_nlinfo.nl_net; 715 struct net *net = cfg->fc_nlinfo.nl_net;
707 716
717 if (cfg->fc_type > RTN_MAX)
718 goto err_inval;
719
708 /* Fast check to catch the most weird cases */ 720 /* Fast check to catch the most weird cases */
709 if (fib_props[cfg->fc_type].scope > cfg->fc_scope) 721 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
710 goto err_inval; 722 goto err_inval;
@@ -718,8 +730,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
718#endif 730#endif
719 731
720 err = -ENOBUFS; 732 err = -ENOBUFS;
721 if (fib_info_cnt >= fib_hash_size) { 733 if (fib_info_cnt >= fib_info_hash_size) {
722 unsigned int new_size = fib_hash_size << 1; 734 unsigned int new_size = fib_info_hash_size << 1;
723 struct hlist_head *new_info_hash; 735 struct hlist_head *new_info_hash;
724 struct hlist_head *new_laddrhash; 736 struct hlist_head *new_laddrhash;
725 unsigned int bytes; 737 unsigned int bytes;
@@ -727,25 +739,32 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
727 if (!new_size) 739 if (!new_size)
728 new_size = 1; 740 new_size = 1;
729 bytes = new_size * sizeof(struct hlist_head *); 741 bytes = new_size * sizeof(struct hlist_head *);
730 new_info_hash = fib_hash_alloc(bytes); 742 new_info_hash = fib_info_hash_alloc(bytes);
731 new_laddrhash = fib_hash_alloc(bytes); 743 new_laddrhash = fib_info_hash_alloc(bytes);
732 if (!new_info_hash || !new_laddrhash) { 744 if (!new_info_hash || !new_laddrhash) {
733 fib_hash_free(new_info_hash, bytes); 745 fib_info_hash_free(new_info_hash, bytes);
734 fib_hash_free(new_laddrhash, bytes); 746 fib_info_hash_free(new_laddrhash, bytes);
735 } else 747 } else
736 fib_hash_move(new_info_hash, new_laddrhash, new_size); 748 fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
737 749
738 if (!fib_hash_size) 750 if (!fib_info_hash_size)
739 goto failure; 751 goto failure;
740 } 752 }
741 753
742 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); 754 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
743 if (fi == NULL) 755 if (fi == NULL)
744 goto failure; 756 goto failure;
757 if (cfg->fc_mx) {
758 fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
759 if (!fi->fib_metrics)
760 goto failure;
761 } else
762 fi->fib_metrics = (u32 *) dst_default_metrics;
745 fib_info_cnt++; 763 fib_info_cnt++;
746 764
747 fi->fib_net = hold_net(net); 765 fi->fib_net = hold_net(net);
748 fi->fib_protocol = cfg->fc_protocol; 766 fi->fib_protocol = cfg->fc_protocol;
767 fi->fib_scope = cfg->fc_scope;
749 fi->fib_flags = cfg->fc_flags; 768 fi->fib_flags = cfg->fc_flags;
750 fi->fib_priority = cfg->fc_priority; 769 fi->fib_priority = cfg->fc_priority;
751 fi->fib_prefsrc = cfg->fc_prefsrc; 770 fi->fib_prefsrc = cfg->fc_prefsrc;
@@ -779,7 +798,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
779 goto err_inval; 798 goto err_inval;
780 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) 799 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
781 goto err_inval; 800 goto err_inval;
782#ifdef CONFIG_NET_CLS_ROUTE 801#ifdef CONFIG_IP_ROUTE_CLASSID
783 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) 802 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
784 goto err_inval; 803 goto err_inval;
785#endif 804#endif
@@ -792,7 +811,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
792 nh->nh_oif = cfg->fc_oif; 811 nh->nh_oif = cfg->fc_oif;
793 nh->nh_gw = cfg->fc_gw; 812 nh->nh_gw = cfg->fc_gw;
794 nh->nh_flags = cfg->fc_flags; 813 nh->nh_flags = cfg->fc_flags;
795#ifdef CONFIG_NET_CLS_ROUTE 814#ifdef CONFIG_IP_ROUTE_CLASSID
796 nh->nh_tclassid = cfg->fc_flow; 815 nh->nh_tclassid = cfg->fc_flow;
797#endif 816#endif
798#ifdef CONFIG_IP_ROUTE_MULTIPATH 817#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -804,6 +823,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
804 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) 823 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
805 goto err_inval; 824 goto err_inval;
806 goto link_it; 825 goto link_it;
826 } else {
827 switch (cfg->fc_type) {
828 case RTN_UNICAST:
829 case RTN_LOCAL:
830 case RTN_BROADCAST:
831 case RTN_ANYCAST:
832 case RTN_MULTICAST:
833 break;
834 default:
835 goto err_inval;
836 }
807 } 837 }
808 838
809 if (cfg->fc_scope > RT_SCOPE_HOST) 839 if (cfg->fc_scope > RT_SCOPE_HOST)
@@ -835,6 +865,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
835 goto err_inval; 865 goto err_inval;
836 } 866 }
837 867
868 change_nexthops(fi) {
869 fib_info_update_nh_saddr(net, nexthop_nh);
870 } endfor_nexthops(fi)
871
838link_it: 872link_it:
839 ofi = fib_find_info(fi); 873 ofi = fib_find_info(fi);
840 if (ofi) { 874 if (ofi) {
@@ -880,86 +914,8 @@ failure:
880 return ERR_PTR(err); 914 return ERR_PTR(err);
881} 915}
882 916
883/* Note! fib_semantic_match intentionally uses RCU list functions. */
884int fib_semantic_match(struct list_head *head, const struct flowi *flp,
885 struct fib_result *res, int prefixlen, int fib_flags)
886{
887 struct fib_alias *fa;
888 int nh_sel = 0;
889
890 list_for_each_entry_rcu(fa, head, fa_list) {
891 int err;
892
893 if (fa->fa_tos &&
894 fa->fa_tos != flp->fl4_tos)
895 continue;
896
897 if (fa->fa_scope < flp->fl4_scope)
898 continue;
899
900 fib_alias_accessed(fa);
901
902 err = fib_props[fa->fa_type].error;
903 if (err == 0) {
904 struct fib_info *fi = fa->fa_info;
905
906 if (fi->fib_flags & RTNH_F_DEAD)
907 continue;
908
909 switch (fa->fa_type) {
910 case RTN_UNICAST:
911 case RTN_LOCAL:
912 case RTN_BROADCAST:
913 case RTN_ANYCAST:
914 case RTN_MULTICAST:
915 for_nexthops(fi) {
916 if (nh->nh_flags & RTNH_F_DEAD)
917 continue;
918 if (!flp->oif || flp->oif == nh->nh_oif)
919 break;
920 }
921#ifdef CONFIG_IP_ROUTE_MULTIPATH
922 if (nhsel < fi->fib_nhs) {
923 nh_sel = nhsel;
924 goto out_fill_res;
925 }
926#else
927 if (nhsel < 1)
928 goto out_fill_res;
929#endif
930 endfor_nexthops(fi);
931 continue;
932
933 default:
934 pr_warning("fib_semantic_match bad type %#x\n",
935 fa->fa_type);
936 return -EINVAL;
937 }
938 }
939 return err;
940 }
941 return 1;
942
943out_fill_res:
944 res->prefixlen = prefixlen;
945 res->nh_sel = nh_sel;
946 res->type = fa->fa_type;
947 res->scope = fa->fa_scope;
948 res->fi = fa->fa_info;
949 if (!(fib_flags & FIB_LOOKUP_NOREF))
950 atomic_inc(&res->fi->fib_clntref);
951 return 0;
952}
953
954/* Find appropriate source address to this destination */
955
956__be32 __fib_res_prefsrc(struct fib_result *res)
957{
958 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
959}
960
961int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 917int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
962 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, 918 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
963 struct fib_info *fi, unsigned int flags) 919 struct fib_info *fi, unsigned int flags)
964{ 920{
965 struct nlmsghdr *nlh; 921 struct nlmsghdr *nlh;
@@ -981,7 +937,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
981 NLA_PUT_U32(skb, RTA_TABLE, tb_id); 937 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
982 rtm->rtm_type = type; 938 rtm->rtm_type = type;
983 rtm->rtm_flags = fi->fib_flags; 939 rtm->rtm_flags = fi->fib_flags;
984 rtm->rtm_scope = scope; 940 rtm->rtm_scope = fi->fib_scope;
985 rtm->rtm_protocol = fi->fib_protocol; 941 rtm->rtm_protocol = fi->fib_protocol;
986 942
987 if (rtm->rtm_dst_len) 943 if (rtm->rtm_dst_len)
@@ -1002,7 +958,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1002 958
1003 if (fi->fib_nh->nh_oif) 959 if (fi->fib_nh->nh_oif)
1004 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 960 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
1005#ifdef CONFIG_NET_CLS_ROUTE 961#ifdef CONFIG_IP_ROUTE_CLASSID
1006 if (fi->fib_nh[0].nh_tclassid) 962 if (fi->fib_nh[0].nh_tclassid)
1007 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 963 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1008#endif 964#endif
@@ -1027,7 +983,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1027 983
1028 if (nh->nh_gw) 984 if (nh->nh_gw)
1029 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 985 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1030#ifdef CONFIG_NET_CLS_ROUTE 986#ifdef CONFIG_IP_ROUTE_CLASSID
1031 if (nh->nh_tclassid) 987 if (nh->nh_tclassid)
1032 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 988 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1033#endif 989#endif
@@ -1125,6 +1081,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)
1125 return ret; 1081 return ret;
1126} 1082}
1127 1083
1084/* Must be invoked inside of an RCU protected region. */
1085void fib_select_default(struct fib_result *res)
1086{
1087 struct fib_info *fi = NULL, *last_resort = NULL;
1088 struct list_head *fa_head = res->fa_head;
1089 struct fib_table *tb = res->table;
1090 int order = -1, last_idx = -1;
1091 struct fib_alias *fa;
1092
1093 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1094 struct fib_info *next_fi = fa->fa_info;
1095
1096 if (next_fi->fib_scope != res->scope ||
1097 fa->fa_type != RTN_UNICAST)
1098 continue;
1099
1100 if (next_fi->fib_priority > res->fi->fib_priority)
1101 break;
1102 if (!next_fi->fib_nh[0].nh_gw ||
1103 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1104 continue;
1105
1106 fib_alias_accessed(fa);
1107
1108 if (fi == NULL) {
1109 if (next_fi != res->fi)
1110 break;
1111 } else if (!fib_detect_death(fi, order, &last_resort,
1112 &last_idx, tb->tb_default)) {
1113 fib_result_assign(res, fi);
1114 tb->tb_default = order;
1115 goto out;
1116 }
1117 fi = next_fi;
1118 order++;
1119 }
1120
1121 if (order <= 0 || fi == NULL) {
1122 tb->tb_default = -1;
1123 goto out;
1124 }
1125
1126 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1127 tb->tb_default)) {
1128 fib_result_assign(res, fi);
1129 tb->tb_default = order;
1130 goto out;
1131 }
1132
1133 if (last_idx >= 0)
1134 fib_result_assign(res, last_resort);
1135 tb->tb_default = last_idx;
1136out:
1137 return;
1138}
1139
1128#ifdef CONFIG_IP_ROUTE_MULTIPATH 1140#ifdef CONFIG_IP_ROUTE_MULTIPATH
1129 1141
1130/* 1142/*
@@ -1189,7 +1201,7 @@ int fib_sync_up(struct net_device *dev)
1189 * The algorithm is suboptimal, but it provides really 1201 * The algorithm is suboptimal, but it provides really
1190 * fair weighted route distribution. 1202 * fair weighted route distribution.
1191 */ 1203 */
1192void fib_select_multipath(const struct flowi *flp, struct fib_result *res) 1204void fib_select_multipath(struct fib_result *res)
1193{ 1205{
1194 struct fib_info *fi = res->fi; 1206 struct fib_info *fi = res->fi;
1195 int w; 1207 int w;
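Editorial note: fib_select_default() above is the table-independent
replacement for fib_table_select_default() from the fib_hash.c listing
earlier in this diff; it walks res->fa_head under RCU rather than
hashing into zone 0, with the ordering and fib_detect_death() logic
carried over largely unchanged.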
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f280348e0fd..5fe9b8b41df3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -12,7 +12,7 @@
12 * 12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet 13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 * 14 *
15 * This work is based on the LPC-trie which is originally descibed in: 15 * This work is based on the LPC-trie which is originally described in:
16 * 16 *
17 * An experimental study of compression methods for dynamic tries 17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF) 96#define IS_LEAF(n) (n->parent & T_LEAF)
97 97
98struct node { 98struct rt_trie_node {
99 unsigned long parent; 99 unsigned long parent;
100 t_key key; 100 t_key key;
101}; 101};
@@ -126,7 +126,7 @@ struct tnode {
126 struct work_struct work; 126 struct work_struct work;
127 struct tnode *tnode_free; 127 struct tnode *tnode_free;
128 }; 128 };
129 struct node *child[0]; 129 struct rt_trie_node *child[0];
130}; 130};
131 131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 132#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,16 @@ struct trie_stat {
151}; 151};
152 152
153struct trie { 153struct trie {
154 struct node *trie; 154 struct rt_trie_node *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 155#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 156 struct trie_use_stats stats;
157#endif 157#endif
158}; 158};
159 159
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 160static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 161static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 int wasfull); 162 int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn); 163static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn); 164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn); 165static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */ 166/* tnodes to free after resize(); protected by RTNL */
@@ -177,12 +177,12 @@ static const int sync_pages = 128;
177static struct kmem_cache *fn_alias_kmem __read_mostly; 177static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly; 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 179
180static inline struct tnode *node_parent(struct node *node) 180static inline struct tnode *node_parent(struct rt_trie_node *node)
181{ 181{
182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); 182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183} 183}
184 184
185static inline struct tnode *node_parent_rcu(struct node *node) 185static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
@@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node)
192/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
193 * but that macro() assumes that value is a pointer. 193 * but that macro() assumes that value is a pointer.
194 */ 194 */
195static inline void node_set_parent(struct node *node, struct tnode *ptr) 195static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
196{ 196{
197 smp_wmb(); 197 smp_wmb();
198 node->parent = (unsigned long)ptr | NODE_TYPE(node); 198 node->parent = (unsigned long)ptr | NODE_TYPE(node);
199} 199}
200 200
201static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) 201static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
202{ 202{
203 BUG_ON(i >= 1U << tn->bits); 203 BUG_ON(i >= 1U << tn->bits);
204 204
205 return tn->child[i]; 205 return tn->child[i];
206} 206}
207 207
208static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) 208static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209{ 209{
210 struct node *ret = tnode_get_child(tn, i); 210 struct rt_trie_node *ret = tnode_get_child(tn, i);
211 211
212 return rcu_dereference_rtnl(ret); 212 return rcu_dereference_rtnl(ret);
213} 213}
@@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn)
217 return 1 << tn->bits; 217 return 1 << tn->bits;
218} 218}
219 219
220static inline t_key mask_pfx(t_key k, unsigned short l) 220static inline t_key mask_pfx(t_key k, unsigned int l)
221{ 221{
222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
223} 223}
224 224
225static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 225static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
226{ 226{
227 if (offset < KEYLENGTH) 227 if (offset < KEYLENGTH)
228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
378{ 378{
379 struct tnode *tn = container_of(head, struct tnode, rcu); 379 struct tnode *tn = container_of(head, struct tnode, rcu);
380 size_t size = sizeof(struct tnode) + 380 size_t size = sizeof(struct tnode) +
381 (sizeof(struct node *) << tn->bits); 381 (sizeof(struct rt_trie_node *) << tn->bits);
382 382
383 if (size <= PAGE_SIZE) 383 if (size <= PAGE_SIZE)
384 kfree(tn); 384 kfree(tn);
@@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn)
402 tn->tnode_free = tnode_free_head; 402 tn->tnode_free = tnode_free_head;
403 tnode_free_head = tn; 403 tnode_free_head = tn;
404 tnode_free_size += sizeof(struct tnode) + 404 tnode_free_size += sizeof(struct tnode) +
405 (sizeof(struct node *) << tn->bits); 405 (sizeof(struct rt_trie_node *) << tn->bits);
406} 406}
407 407
408static void tnode_free_flush(void) 408static void tnode_free_flush(void)
@@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen)
443 443
444static struct tnode *tnode_new(t_key key, int pos, int bits) 444static struct tnode *tnode_new(t_key key, int pos, int bits)
445{ 445{
446 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); 446 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
447 struct tnode *tn = tnode_alloc(sz); 447 struct tnode *tn = tnode_alloc(sz);
448 448
449 if (tn) { 449 if (tn) {
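The size expression that recurs in __tnode_free_rcu(), tnode_free_safe() and tnode_new() is the tnode header plus a 2^bits array of child pointers; when the total fits in a page the memory comes from (and returns to) the slab, otherwise the vmalloc path is used, as the kfree() branch in __tnode_free_rcu() above shows. A user-space approximation of the sizing rule; PAGE_SZ and the stub structs are assumptions standing in for the kernel types:

#include <stdio.h>

#define PAGE_SZ 4096UL		/* stand-in for PAGE_SIZE */

struct node_stub  { unsigned long parent; unsigned int key; };
struct tnode_stub {
	unsigned int key, pos, bits;
	struct node_stub *child[];	/* 2^bits entries follow */
};

static unsigned long tnode_size(unsigned int bits)
{
	return sizeof(struct tnode_stub) +
	       (sizeof(struct node_stub *) << bits);
}

int main(void)
{
	unsigned int bits;

	for (bits = 1; bits <= 10; bits += 3) {
		unsigned long sz = tnode_size(bits);

		printf("bits=%2u size=%6lu -> %s\n", bits, sz,
		       sz <= PAGE_SZ ? "slab (kfree)" : "vmalloc (vfree)");
	}
	return 0;
}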
@@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
456 } 456 }
457 457
458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
459 sizeof(struct node) << bits); 459 sizeof(struct rt_trie_node) << bits);
460 return tn; 460 return tn;
461} 461}
462 462
@@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
465 * and no bits are skipped. See discussion in dyntree paper p. 6 465 * and no bits are skipped. See discussion in dyntree paper p. 6
466 */ 466 */
467 467
468static inline int tnode_full(const struct tnode *tn, const struct node *n) 468static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
469{ 469{
470 if (n == NULL || IS_LEAF(n)) 470 if (n == NULL || IS_LEAF(n))
471 return 0; 471 return 0;
@@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
474} 474}
475 475
476static inline void put_child(struct trie *t, struct tnode *tn, int i, 476static inline void put_child(struct trie *t, struct tnode *tn, int i,
477 struct node *n) 477 struct rt_trie_node *n)
478{ 478{
479 tnode_put_child_reorg(tn, i, n, -1); 479 tnode_put_child_reorg(tn, i, n, -1);
480} 480}
@@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
484 * Update the value of full_children and empty_children. 484 * Update the value of full_children and empty_children.
485 */ 485 */
486 486
487static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 487static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
488 int wasfull) 488 int wasfull)
489{ 489{
490 struct node *chi = tn->child[i]; 490 struct rt_trie_node *chi = tn->child[i];
491 int isfull; 491 int isfull;
492 492
493 BUG_ON(i >= 1<<tn->bits); 493 BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
515} 515}
516 516
517#define MAX_WORK 10 517#define MAX_WORK 10
518static struct node *resize(struct trie *t, struct tnode *tn) 518static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
519{ 519{
520 int i; 520 int i;
521 struct tnode *old_tn; 521 struct tnode *old_tn;
@@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
605 605
606 /* Keep root node larger */ 606 /* Keep root node larger */
607 607
608 if (!node_parent((struct node *)tn)) { 608 if (!node_parent((struct rt_trie_node *)tn)) {
609 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
610 halve_threshold_use = halve_threshold_root; 610 halve_threshold_use = halve_threshold_root;
611 } else { 611 } else {
@@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
635 635
636 /* Return if at least one inflate was run */ 636 /* Return if at least one inflate was run */
637 if (max_work != MAX_WORK) 637 if (max_work != MAX_WORK)
638 return (struct node *) tn; 638 return (struct rt_trie_node *) tn;
639 639
640 /* 640 /*
641 * Halve as long as the number of empty children in this 641 * Halve as long as the number of empty children in this
@@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
663 if (tn->empty_children == tnode_child_length(tn) - 1) { 663 if (tn->empty_children == tnode_child_length(tn) - 1) {
664one_child: 664one_child:
665 for (i = 0; i < tnode_child_length(tn); i++) { 665 for (i = 0; i < tnode_child_length(tn); i++) {
666 struct node *n; 666 struct rt_trie_node *n;
667 667
668 n = tn->child[i]; 668 n = tn->child[i];
669 if (!n) 669 if (!n)
@@ -676,7 +676,7 @@ one_child:
676 return n; 676 return n;
677 } 677 }
678 } 678 }
679 return (struct node *) tn; 679 return (struct rt_trie_node *) tn;
680} 680}
681 681
682static struct tnode *inflate(struct trie *t, struct tnode *tn) 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
723 goto nomem; 723 goto nomem;
724 } 724 }
725 725
726 put_child(t, tn, 2*i, (struct node *) left); 726 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
727 put_child(t, tn, 2*i+1, (struct node *) right); 727 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
728 } 728 }
729 } 729 }
730 730
731 for (i = 0; i < olen; i++) { 731 for (i = 0; i < olen; i++) {
732 struct tnode *inode; 732 struct tnode *inode;
733 struct node *node = tnode_get_child(oldtnode, i); 733 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
734 struct tnode *left, *right; 734 struct tnode *left, *right;
735 int size, j; 735 int size, j;
736 736
@@ -825,7 +825,7 @@ nomem:
825static struct tnode *halve(struct trie *t, struct tnode *tn) 825static struct tnode *halve(struct trie *t, struct tnode *tn)
826{ 826{
827 struct tnode *oldtnode = tn; 827 struct tnode *oldtnode = tn;
828 struct node *left, *right; 828 struct rt_trie_node *left, *right;
829 int i; 829 int i;
830 int olen = tnode_child_length(tn); 830 int olen = tnode_child_length(tn);
831 831
@@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
856 if (!newn) 856 if (!newn)
857 goto nomem; 857 goto nomem;
858 858
859 put_child(t, tn, i/2, (struct node *)newn); 859 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
860 } 860 }
861 861
862 } 862 }
@@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key)
958{ 958{
959 int pos; 959 int pos;
960 struct tnode *tn; 960 struct tnode *tn;
961 struct node *n; 961 struct rt_trie_node *n;
962 962
963 pos = 0; 963 pos = 0;
964 n = rcu_dereference_rtnl(t->trie); 964 n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
993 993
994 key = tn->key; 994 key = tn->key;
995 995
996 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 996 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
997 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 997 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
999 tn = (struct tnode *) resize(t, (struct tnode *)tn); 999 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000 1000
1001 tnode_put_child_reorg((struct tnode *)tp, cindex, 1001 tnode_put_child_reorg((struct tnode *)tp, cindex,
1002 (struct node *)tn, wasfull); 1002 (struct rt_trie_node *)tn, wasfull);
1003 1003
1004 tp = node_parent((struct node *) tn); 1004 tp = node_parent((struct rt_trie_node *) tn);
1005 if (!tp) 1005 if (!tp)
1006 rcu_assign_pointer(t->trie, (struct node *)tn); 1006 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1007 1007
1008 tnode_free_flush(); 1008 tnode_free_flush();
1009 if (!tp) 1009 if (!tp)
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1015 if (IS_TNODE(tn)) 1015 if (IS_TNODE(tn))
1016 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1016 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017 1017
1018 rcu_assign_pointer(t->trie, (struct node *)tn); 1018 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1019 tnode_free_flush(); 1019 tnode_free_flush();
1020} 1020}
1021 1021
@@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1025{ 1025{
1026 int pos, newpos; 1026 int pos, newpos;
1027 struct tnode *tp = NULL, *tn = NULL; 1027 struct tnode *tp = NULL, *tn = NULL;
1028 struct node *n; 1028 struct rt_trie_node *n;
1029 struct leaf *l; 1029 struct leaf *l;
1030 int missbit; 1030 int missbit;
1031 struct list_head *fa_head = NULL; 1031 struct list_head *fa_head = NULL;
@@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1111 if (t->trie && n == NULL) { 1111 if (t->trie && n == NULL) {
1112 /* Case 2: n is NULL, and will just insert a new leaf */ 1112 /* Case 2: n is NULL, and will just insert a new leaf */
1113 1113
1114 node_set_parent((struct node *)l, tp); 1114 node_set_parent((struct rt_trie_node *)l, tp);
1115 1115
1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1117 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1117 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1118 } else { 1118 } else {
1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1120 /* 1120 /*
@@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1141 return NULL; 1141 return NULL;
1142 } 1142 }
1143 1143
1144 node_set_parent((struct node *)tn, tp); 1144 node_set_parent((struct rt_trie_node *)tn, tp);
1145 1145
1146 missbit = tkey_extract_bits(key, newpos, 1); 1146 missbit = tkey_extract_bits(key, newpos, 1);
1147 put_child(t, tn, missbit, (struct node *)l); 1147 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1148 put_child(t, tn, 1-missbit, n); 1148 put_child(t, tn, 1-missbit, n);
1149 1149
1150 if (tp) { 1150 if (tp) {
1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1152 put_child(t, (struct tnode *)tp, cindex, 1152 put_child(t, (struct tnode *)tp, cindex,
1153 (struct node *)tn); 1153 (struct rt_trie_node *)tn);
1154 } else { 1154 } else {
1155 rcu_assign_pointer(t->trie, (struct node *)tn); 1155 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1156 tp = tn; 1156 tp = tn;
1157 } 1157 }
1158 } 1158 }
@@ -1245,7 +1245,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1245 if (fa->fa_info->fib_priority != fi->fib_priority) 1245 if (fa->fa_info->fib_priority != fi->fib_priority)
1246 break; 1246 break;
1247 if (fa->fa_type == cfg->fc_type && 1247 if (fa->fa_type == cfg->fc_type &&
1248 fa->fa_scope == cfg->fc_scope &&
1249 fa->fa_info == fi) { 1248 fa->fa_info == fi) {
1250 fa_match = fa; 1249 fa_match = fa;
1251 break; 1250 break;
@@ -1271,7 +1270,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1271 new_fa->fa_tos = fa->fa_tos; 1270 new_fa->fa_tos = fa->fa_tos;
1272 new_fa->fa_info = fi; 1271 new_fa->fa_info = fi;
1273 new_fa->fa_type = cfg->fc_type; 1272 new_fa->fa_type = cfg->fc_type;
1274 new_fa->fa_scope = cfg->fc_scope;
1275 state = fa->fa_state; 1273 state = fa->fa_state;
1276 new_fa->fa_state = state & ~FA_S_ACCESSED; 1274 new_fa->fa_state = state & ~FA_S_ACCESSED;
1277 1275
@@ -1308,7 +1306,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1308 new_fa->fa_info = fi; 1306 new_fa->fa_info = fi;
1309 new_fa->fa_tos = tos; 1307 new_fa->fa_tos = tos;
1310 new_fa->fa_type = cfg->fc_type; 1308 new_fa->fa_type = cfg->fc_type;
1311 new_fa->fa_scope = cfg->fc_scope;
1312 new_fa->fa_state = 0; 1309 new_fa->fa_state = 0;
1313 /* 1310 /*
1314 * Insert new entry to the list. 1311 * Insert new entry to the list.
@@ -1340,8 +1337,8 @@ err:
1340} 1337}
1341 1338
1342/* should be called with rcu_read_lock */ 1339/* should be called with rcu_read_lock */
1343static int check_leaf(struct trie *t, struct leaf *l, 1340static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1344 t_key key, const struct flowi *flp, 1341 t_key key, const struct flowi4 *flp,
1345 struct fib_result *res, int fib_flags) 1342 struct fib_result *res, int fib_flags)
1346{ 1343{
1347 struct leaf_info *li; 1344 struct leaf_info *li;
@@ -1349,40 +1346,75 @@ static int check_leaf(struct trie *t, struct leaf *l,
1349 struct hlist_node *node; 1346 struct hlist_node *node;
1350 1347
1351 hlist_for_each_entry_rcu(li, node, hhead, hlist) { 1348 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1352 int err; 1349 struct fib_alias *fa;
1353 int plen = li->plen; 1350 int plen = li->plen;
1354 __be32 mask = inet_make_mask(plen); 1351 __be32 mask = inet_make_mask(plen);
1355 1352
1356 if (l->key != (key & ntohl(mask))) 1353 if (l->key != (key & ntohl(mask)))
1357 continue; 1354 continue;
1358 1355
1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); 1356 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1357 struct fib_info *fi = fa->fa_info;
1358 int nhsel, err;
1360 1359
1360 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1361 continue;
1362 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1363 continue;
1364 fib_alias_accessed(fa);
1365 err = fib_props[fa->fa_type].error;
1366 if (err) {
1361#ifdef CONFIG_IP_FIB_TRIE_STATS 1367#ifdef CONFIG_IP_FIB_TRIE_STATS
1362 if (err <= 0) 1368 t->stats.semantic_match_passed++;
1363 t->stats.semantic_match_passed++; 1369#endif
1364 else 1370 return err;
1365 t->stats.semantic_match_miss++; 1371 }
1372 if (fi->fib_flags & RTNH_F_DEAD)
1373 continue;
1374 for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1375 const struct fib_nh *nh = &fi->fib_nh[nhsel];
1376
1377 if (nh->nh_flags & RTNH_F_DEAD)
1378 continue;
1379 if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1380 continue;
1381
1382#ifdef CONFIG_IP_FIB_TRIE_STATS
1383 t->stats.semantic_match_passed++;
1384#endif
1385 res->prefixlen = plen;
1386 res->nh_sel = nhsel;
1387 res->type = fa->fa_type;
1388 res->scope = fa->fa_info->fib_scope;
1389 res->fi = fi;
1390 res->table = tb;
1391 res->fa_head = &li->falh;
1392 if (!(fib_flags & FIB_LOOKUP_NOREF))
1393 atomic_inc(&res->fi->fib_clntref);
1394 return 0;
1395 }
1396 }
1397
1398#ifdef CONFIG_IP_FIB_TRIE_STATS
1399 t->stats.semantic_match_miss++;
1366#endif 1400#endif
1367 if (err <= 0)
1368 return err;
1369 } 1401 }
1370 1402
1371 return 1; 1403 return 1;
1372} 1404}
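With fib_semantic_match() gone, check_leaf() itself now encodes the alias-selection order: an alias is skipped unless its TOS (when set) matches exactly and its scope is at least as wide as requested; special alias types short-circuit with their fib_props[] error; and the first live nexthop on an acceptable interface fills the result. A toy model of that order, using simplified stand-in types rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct alias {
	unsigned char tos;	/* 0 acts as a wildcard */
	int scope;		/* RT_SCOPE_*-style: larger = narrower */
	int type_err;		/* fib_props[fa_type].error, 0 = routable */
	bool fi_dead;		/* RTNH_F_DEAD on the fib_info */
	int nh_oif;		/* single nexthop in this model */
	bool nh_dead;
};

/* < 0: type error, 0: match (res would be filled), 1: keep searching */
static int match_alias(const struct alias *fa, unsigned char want_tos,
		       int want_scope, int want_oif)
{
	if (fa->tos && fa->tos != want_tos)
		return 1;
	if (fa->scope < want_scope)
		return 1;
	if (fa->type_err)
		return fa->type_err;	/* e.g. blackhole/prohibit */
	if (fa->fi_dead || fa->nh_dead)
		return 1;
	if (want_oif && want_oif != fa->nh_oif)
		return 1;
	return 0;
}

int main(void)
{
	struct alias a = { .tos = 0, .scope = 253, .nh_oif = 2 };

	printf("%d\n", match_alias(&a, 0, 0, 2));	/* 0: match */
	printf("%d\n", match_alias(&a, 0, 0, 3));	/* 1: wrong oif */
	printf("%d\n", match_alias(&a, 0, 254, 0));	/* 1: scope too narrow */
	return 0;
}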
1373 1405
1374int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, 1406int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1375 struct fib_result *res, int fib_flags) 1407 struct fib_result *res, int fib_flags)
1376{ 1408{
1377 struct trie *t = (struct trie *) tb->tb_data; 1409 struct trie *t = (struct trie *) tb->tb_data;
1378 int ret; 1410 int ret;
1379 struct node *n; 1411 struct rt_trie_node *n;
1380 struct tnode *pn; 1412 struct tnode *pn;
1381 int pos, bits; 1413 unsigned int pos, bits;
1382 t_key key = ntohl(flp->fl4_dst); 1414 t_key key = ntohl(flp->daddr);
1383 int chopped_off; 1415 unsigned int chopped_off;
1384 t_key cindex = 0; 1416 t_key cindex = 0;
1385 int current_prefix_length = KEYLENGTH; 1417 unsigned int current_prefix_length = KEYLENGTH;
1386 struct tnode *cn; 1418 struct tnode *cn;
1387 t_key pref_mismatch; 1419 t_key pref_mismatch;
1388 1420
@@ -1398,7 +1430,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1398 1430
1399 /* Just a leaf? */ 1431 /* Just a leaf? */
1400 if (IS_LEAF(n)) { 1432 if (IS_LEAF(n)) {
1401 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1433 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1402 goto found; 1434 goto found;
1403 } 1435 }
1404 1436
@@ -1423,7 +1455,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1423 } 1455 }
1424 1456
1425 if (IS_LEAF(n)) { 1457 if (IS_LEAF(n)) {
1426 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1458 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1427 if (ret > 0) 1459 if (ret > 0)
1428 goto backtrace; 1460 goto backtrace;
1429 goto found; 1461 goto found;
@@ -1541,7 +1573,7 @@ backtrace:
1541 if (chopped_off <= pn->bits) { 1573 if (chopped_off <= pn->bits) {
1542 cindex &= ~(1 << (chopped_off-1)); 1574 cindex &= ~(1 << (chopped_off-1));
1543 } else { 1575 } else {
1544 struct tnode *parent = node_parent_rcu((struct node *) pn); 1576 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1545 if (!parent) 1577 if (!parent)
1546 goto failed; 1578 goto failed;
1547 1579
@@ -1568,7 +1600,7 @@ found:
1568 */ 1600 */
1569static void trie_leaf_remove(struct trie *t, struct leaf *l) 1601static void trie_leaf_remove(struct trie *t, struct leaf *l)
1570{ 1602{
1571 struct tnode *tp = node_parent((struct node *) l); 1603 struct tnode *tp = node_parent((struct rt_trie_node *) l);
1572 1604
1573 pr_debug("entering trie_leaf_remove(%p)\n", l); 1605 pr_debug("entering trie_leaf_remove(%p)\n", l);
1574 1606
@@ -1629,7 +1661,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1629 1661
1630 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && 1662 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1631 (cfg->fc_scope == RT_SCOPE_NOWHERE || 1663 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1632 fa->fa_scope == cfg->fc_scope) && 1664 fa->fa_info->fib_scope == cfg->fc_scope) &&
1665 (!cfg->fc_prefsrc ||
1666 fi->fib_prefsrc == cfg->fc_prefsrc) &&
1633 (!cfg->fc_protocol || 1667 (!cfg->fc_protocol ||
1634 fi->fib_protocol == cfg->fc_protocol) && 1668 fi->fib_protocol == cfg->fc_protocol) &&
1635 fib_nh_match(cfg, fi) == 0) { 1669 fib_nh_match(cfg, fi) == 0) {
@@ -1706,7 +1740,7 @@ static int trie_flush_leaf(struct leaf *l)
1706 * Scan for the next right leaf starting at node p->child[idx] 1740 * Scan for the next right leaf starting at node p->child[idx]
1707 * Since we have a back pointer, no recursion is necessary. 1741 * Since we have a back pointer, no recursion is necessary.
1708 */ 1742 */
1709static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) 1743static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1710{ 1744{
1711 do { 1745 do {
1712 t_key idx; 1746 t_key idx;
@@ -1732,7 +1766,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1732 } 1766 }
1733 1767
1734 /* Node empty, walk back up to parent */ 1768 /* Node empty, walk back up to parent */
1735 c = (struct node *) p; 1769 c = (struct rt_trie_node *) p;
1736 } while ((p = node_parent_rcu(c)) != NULL); 1770 } while ((p = node_parent_rcu(c)) != NULL);
1737 1771
1738 return NULL; /* Root of trie */ 1772 return NULL; /* Root of trie */
@@ -1753,7 +1787,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
1753 1787
1754static struct leaf *trie_nextleaf(struct leaf *l) 1788static struct leaf *trie_nextleaf(struct leaf *l)
1755{ 1789{
1756 struct node *c = (struct node *) l; 1790 struct rt_trie_node *c = (struct rt_trie_node *) l;
1757 struct tnode *p = node_parent_rcu(c); 1791 struct tnode *p = node_parent_rcu(c);
1758 1792
1759 if (!p) 1793 if (!p)
@@ -1802,80 +1836,6 @@ void fib_free_table(struct fib_table *tb)
1802 kfree(tb); 1836 kfree(tb);
1803} 1837}
1804 1838
1805void fib_table_select_default(struct fib_table *tb,
1806 const struct flowi *flp,
1807 struct fib_result *res)
1808{
1809 struct trie *t = (struct trie *) tb->tb_data;
1810 int order, last_idx;
1811 struct fib_info *fi = NULL;
1812 struct fib_info *last_resort;
1813 struct fib_alias *fa = NULL;
1814 struct list_head *fa_head;
1815 struct leaf *l;
1816
1817 last_idx = -1;
1818 last_resort = NULL;
1819 order = -1;
1820
1821 rcu_read_lock();
1822
1823 l = fib_find_node(t, 0);
1824 if (!l)
1825 goto out;
1826
1827 fa_head = get_fa_head(l, 0);
1828 if (!fa_head)
1829 goto out;
1830
1831 if (list_empty(fa_head))
1832 goto out;
1833
1834 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1835 struct fib_info *next_fi = fa->fa_info;
1836
1837 if (fa->fa_scope != res->scope ||
1838 fa->fa_type != RTN_UNICAST)
1839 continue;
1840
1841 if (next_fi->fib_priority > res->fi->fib_priority)
1842 break;
1843 if (!next_fi->fib_nh[0].nh_gw ||
1844 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1845 continue;
1846
1847 fib_alias_accessed(fa);
1848
1849 if (fi == NULL) {
1850 if (next_fi != res->fi)
1851 break;
1852 } else if (!fib_detect_death(fi, order, &last_resort,
1853 &last_idx, tb->tb_default)) {
1854 fib_result_assign(res, fi);
1855 tb->tb_default = order;
1856 goto out;
1857 }
1858 fi = next_fi;
1859 order++;
1860 }
1861 if (order <= 0 || fi == NULL) {
1862 tb->tb_default = -1;
1863 goto out;
1864 }
1865
1866 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1867 tb->tb_default)) {
1868 fib_result_assign(res, fi);
1869 tb->tb_default = order;
1870 goto out;
1871 }
1872 if (last_idx >= 0)
1873 fib_result_assign(res, last_resort);
1874 tb->tb_default = last_idx;
1875out:
1876 rcu_read_unlock();
1877}
1878
1879static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, 1839static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1880 struct fib_table *tb, 1840 struct fib_table *tb,
1881 struct sk_buff *skb, struct netlink_callback *cb) 1841 struct sk_buff *skb, struct netlink_callback *cb)
@@ -1900,7 +1860,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1900 RTM_NEWROUTE, 1860 RTM_NEWROUTE,
1901 tb->tb_id, 1861 tb->tb_id,
1902 fa->fa_type, 1862 fa->fa_type,
1903 fa->fa_scope,
1904 xkey, 1863 xkey,
1905 plen, 1864 plen,
1906 fa->fa_tos, 1865 fa->fa_tos,
@@ -1990,7 +1949,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1990 return skb->len; 1949 return skb->len;
1991} 1950}
1992 1951
1993void __init fib_hash_init(void) 1952void __init fib_trie_init(void)
1994{ 1953{
1995 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1954 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1996 sizeof(struct fib_alias), 1955 sizeof(struct fib_alias),
@@ -2003,8 +1962,7 @@ void __init fib_hash_init(void)
2003} 1962}
2004 1963
2005 1964
2006/* Fix more generic FIB names for init later */ 1965struct fib_table *fib_trie_table(u32 id)
2007struct fib_table *fib_hash_table(u32 id)
2008{ 1966{
2009 struct fib_table *tb; 1967 struct fib_table *tb;
2010 struct trie *t; 1968 struct trie *t;
@@ -2020,9 +1978,6 @@ struct fib_table *fib_hash_table(u32 id)
2020 t = (struct trie *) tb->tb_data; 1978 t = (struct trie *) tb->tb_data;
2021 memset(t, 0, sizeof(*t)); 1979 memset(t, 0, sizeof(*t));
2022 1980
2023 if (id == RT_TABLE_LOCAL)
2024 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2025
2026 return tb; 1981 return tb;
2027} 1982}
2028 1983
@@ -2036,7 +1991,7 @@ struct fib_trie_iter {
2036 unsigned int depth; 1991 unsigned int depth;
2037}; 1992};
2038 1993
2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 1994static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2040{ 1995{
2041 struct tnode *tn = iter->tnode; 1996 struct tnode *tn = iter->tnode;
2042 unsigned int cindex = iter->index; 1997 unsigned int cindex = iter->index;
@@ -2050,7 +2005,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2050 iter->tnode, iter->index, iter->depth); 2005 iter->tnode, iter->index, iter->depth);
2051rescan: 2006rescan:
2052 while (cindex < (1<<tn->bits)) { 2007 while (cindex < (1<<tn->bits)) {
2053 struct node *n = tnode_get_child_rcu(tn, cindex); 2008 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2054 2009
2055 if (n) { 2010 if (n) {
2056 if (IS_LEAF(n)) { 2011 if (IS_LEAF(n)) {
@@ -2069,7 +2024,7 @@ rescan:
2069 } 2024 }
2070 2025
2071 /* Current node exhausted, pop back up */ 2026 /* Current node exhausted, pop back up */
2072 p = node_parent_rcu((struct node *)tn); 2027 p = node_parent_rcu((struct rt_trie_node *)tn);
2073 if (p) { 2028 if (p) {
2074 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 2029 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2075 tn = p; 2030 tn = p;
@@ -2081,10 +2036,10 @@ rescan:
2081 return NULL; 2036 return NULL;
2082} 2037}
2083 2038
2084static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2039static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2085 struct trie *t) 2040 struct trie *t)
2086{ 2041{
2087 struct node *n; 2042 struct rt_trie_node *n;
2088 2043
2089 if (!t) 2044 if (!t)
2090 return NULL; 2045 return NULL;
@@ -2108,7 +2063,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2108 2063
2109static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2064static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2110{ 2065{
2111 struct node *n; 2066 struct rt_trie_node *n;
2112 struct fib_trie_iter iter; 2067 struct fib_trie_iter iter;
2113 2068
2114 memset(s, 0, sizeof(*s)); 2069 memset(s, 0, sizeof(*s));
@@ -2181,7 +2136,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2181 seq_putc(seq, '\n'); 2136 seq_putc(seq, '\n');
2182 seq_printf(seq, "\tPointers: %u\n", pointers); 2137 seq_printf(seq, "\tPointers: %u\n", pointers);
2183 2138
2184 bytes += sizeof(struct node *) * pointers; 2139 bytes += sizeof(struct rt_trie_node *) * pointers;
2185 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2140 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2186 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2141 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2187} 2142}
@@ -2262,7 +2217,7 @@ static const struct file_operations fib_triestat_fops = {
2262 .release = single_release_net, 2217 .release = single_release_net,
2263}; 2218};
2264 2219
2265static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2220static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2266{ 2221{
2267 struct fib_trie_iter *iter = seq->private; 2222 struct fib_trie_iter *iter = seq->private;
2268 struct net *net = seq_file_net(seq); 2223 struct net *net = seq_file_net(seq);
@@ -2275,7 +2230,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2275 struct fib_table *tb; 2230 struct fib_table *tb;
2276 2231
2277 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { 2232 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2278 struct node *n; 2233 struct rt_trie_node *n;
2279 2234
2280 for (n = fib_trie_get_first(iter, 2235 for (n = fib_trie_get_first(iter,
2281 (struct trie *) tb->tb_data); 2236 (struct trie *) tb->tb_data);
@@ -2304,7 +2259,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2304 struct fib_table *tb = iter->tb; 2259 struct fib_table *tb = iter->tb;
2305 struct hlist_node *tb_node; 2260 struct hlist_node *tb_node;
2306 unsigned int h; 2261 unsigned int h;
2307 struct node *n; 2262 struct rt_trie_node *n;
2308 2263
2309 ++*pos; 2264 ++*pos;
2310 /* next node in same table */ 2265 /* next node in same table */
@@ -2390,7 +2345,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2390static int fib_trie_seq_show(struct seq_file *seq, void *v) 2345static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{ 2346{
2392 const struct fib_trie_iter *iter = seq->private; 2347 const struct fib_trie_iter *iter = seq->private;
2393 struct node *n = v; 2348 struct rt_trie_node *n = v;
2394 2349
2395 if (!node_parent_rcu(n)) 2350 if (!node_parent_rcu(n))
2396 fib_table_print(seq, iter->tb); 2351 fib_table_print(seq, iter->tb);
@@ -2422,7 +2377,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
2422 seq_indent(seq, iter->depth+1); 2377 seq_indent(seq, iter->depth+1);
2423 seq_printf(seq, " /%d %s %s", li->plen, 2378 seq_printf(seq, " /%d %s %s", li->plen,
2424 rtn_scope(buf1, sizeof(buf1), 2379 rtn_scope(buf1, sizeof(buf1),
2425 fa->fa_scope), 2380 fa->fa_info->fib_scope),
2426 rtn_type(buf2, sizeof(buf2), 2381 rtn_type(buf2, sizeof(buf2),
2427 fa->fa_type)); 2382 fa->fa_type));
2428 if (fa->fa_tos) 2383 if (fa->fa_tos)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea0..e5f8a71d3a2a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
233 * Send an ICMP frame. 233 * Send an ICMP frame.
234 */ 234 */
235 235
236/* 236static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
237 * Check transmit rate limitation for given message.
238 * The rate information is held in the destination cache now.
239 * This function is generic and could be used for other purposes
240 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
241 *
242 * Note that the same dst_entry fields are modified by functions in
243 * route.c too, but these work for packet destinations while xrlim_allow
244 * works for icmp destinations. This means the rate limiting information
245 * for one "ip object" is shared - and these ICMPs are twice limited:
246 * by source and by destination.
247 *
248 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
249 * SHOULD allow setting of rate limits
250 *
251 * Shared between ICMPv4 and ICMPv6.
252 */
253#define XRLIM_BURST_FACTOR 6
254int xrlim_allow(struct dst_entry *dst, int timeout)
255{
256 unsigned long now, token = dst->rate_tokens;
257 int rc = 0;
258
259 now = jiffies;
260 token += now - dst->rate_last;
261 dst->rate_last = now;
262 if (token > XRLIM_BURST_FACTOR * timeout)
263 token = XRLIM_BURST_FACTOR * timeout;
264 if (token >= timeout) {
265 token -= timeout;
266 rc = 1;
267 }
268 dst->rate_tokens = token;
269 return rc;
270}
271EXPORT_SYMBOL(xrlim_allow);
272
273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
274 int type, int code) 237 int type, int code)
275{ 238{
276 struct dst_entry *dst = &rt->dst; 239 struct dst_entry *dst = &rt->dst;
277 int rc = 1; 240 bool rc = true;
278 241
279 if (type > NR_ICMP_TYPES) 242 if (type > NR_ICMP_TYPES)
280 goto out; 243 goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
288 goto out; 251 goto out;
289 252
290 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
291 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
292 rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); 255 if (!rt->peer)
256 rt_bind_peer(rt, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
258 net->ipv4.sysctl_icmp_ratelimit);
259 }
293out: 260out:
294 return rc; 261 return rc;
295} 262}
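The deleted xrlim_allow() is a textbook token-bucket filter: idle time accrues as credit, the credit is capped at a burst of XRLIM_BURST_FACTOR timeouts, and each transmitted message spends one timeout's worth. The same logic now lives in inet_peer_xrlim_allow(), keyed per inet_peer instead of per dst_entry. A standalone user-space sketch of the algorithm the removed comment describes:

#include <stdbool.h>
#include <stdio.h>

#define BURST_FACTOR 6	/* mirrors XRLIM_BURST_FACTOR */

struct bucket {
	unsigned long tokens;	/* accrued idle-time credit */
	unsigned long last;	/* time of the previous check */
};

static bool rate_allow(struct bucket *b, unsigned long now,
		       unsigned long timeout)
{
	bool ok = false;

	b->tokens += now - b->last;	/* credit the elapsed time */
	b->last = now;
	if (b->tokens > BURST_FACTOR * timeout)
		b->tokens = BURST_FACTOR * timeout;	/* cap the burst */
	if (b->tokens >= timeout) {
		b->tokens -= timeout;	/* spend one message's worth */
		ok = true;
	}
	return ok;
}

int main(void)
{
	struct bucket b = { 0, 0 };
	unsigned long t;

	/* rapid-fire sends are refused, an idle sender is allowed again */
	for (t = 0; t < 3; t++)
		printf("t=%lu allow=%d\n", t, rate_allow(&b, t, 100));
	printf("t=700 allow=%d\n", rate_allow(&b, 700, 100));
	return 0;
}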
@@ -386,12 +353,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
386 daddr = icmp_param->replyopts.faddr; 353 daddr = icmp_param->replyopts.faddr;
387 } 354 }
388 { 355 {
389 struct flowi fl = { .fl4_dst= daddr, 356 struct flowi4 fl4 = {
390 .fl4_src = rt->rt_spec_dst, 357 .daddr = daddr,
391 .fl4_tos = RT_TOS(ip_hdr(skb)->tos), 358 .saddr = rt->rt_spec_dst,
392 .proto = IPPROTO_ICMP }; 359 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
393 security_skb_classify_flow(skb, &fl); 360 .flowi4_proto = IPPROTO_ICMP,
394 if (ip_route_output_key(net, &rt, &fl)) 361 };
362 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
363 rt = ip_route_output_key(net, &fl4);
364 if (IS_ERR(rt))
395 goto out_unlock; 365 goto out_unlock;
396 } 366 }
397 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, 367 if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
@@ -402,6 +372,97 @@ out_unlock:
402 icmp_xmit_unlock(sk); 372 icmp_xmit_unlock(sk);
403} 373}
404 374
375static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in,
376 struct iphdr *iph,
377 __be32 saddr, u8 tos,
378 int type, int code,
379 struct icmp_bxm *param)
380{
381 struct flowi4 fl4 = {
382 .daddr = (param->replyopts.srr ?
383 param->replyopts.faddr : iph->saddr),
384 .saddr = saddr,
385 .flowi4_tos = RT_TOS(tos),
386 .flowi4_proto = IPPROTO_ICMP,
387 .fl4_icmp_type = type,
388 .fl4_icmp_code = code,
389 };
390 struct rtable *rt, *rt2;
391 int err;
392
393 security_skb_classify_flow(skb_in, flowi4_to_flowi(&fl4));
394 rt = __ip_route_output_key(net, &fl4);
395 if (IS_ERR(rt))
396 return rt;
397
398 /* No need to clone since we're just using its address. */
399 rt2 = rt;
400
401 if (!fl4.saddr)
402 fl4.saddr = rt->rt_src;
403
404 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
405 flowi4_to_flowi(&fl4), NULL, 0);
406 if (!IS_ERR(rt)) {
407 if (rt != rt2)
408 return rt;
409 } else if (PTR_ERR(rt) == -EPERM) {
410 rt = NULL;
411 } else
412 return rt;
413
414 err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4), AF_INET);
415 if (err)
416 goto relookup_failed;
417
418 if (inet_addr_type(net, fl4.saddr) == RTN_LOCAL) {
419 rt2 = __ip_route_output_key(net, &fl4);
420 if (IS_ERR(rt2))
421 err = PTR_ERR(rt2);
422 } else {
423 struct flowi4 fl4_2 = {};
424 unsigned long orefdst;
425
426 fl4_2.daddr = fl4.saddr;
427 rt2 = ip_route_output_key(net, &fl4_2);
428 if (IS_ERR(rt2)) {
429 err = PTR_ERR(rt2);
430 goto relookup_failed;
431 }
432 /* Ugh! */
433 orefdst = skb_in->_skb_refdst; /* save old refdst */
434 err = ip_route_input(skb_in, fl4.daddr, fl4.saddr,
435 RT_TOS(tos), rt2->dst.dev);
436
437 dst_release(&rt2->dst);
438 rt2 = skb_rtable(skb_in);
439 skb_in->_skb_refdst = orefdst; /* restore old refdst */
440 }
441
442 if (err)
443 goto relookup_failed;
444
445 rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
446 flowi4_to_flowi(&fl4), NULL,
447 XFRM_LOOKUP_ICMP);
448 if (!IS_ERR(rt2)) {
449 dst_release(&rt->dst);
450 rt = rt2;
451 } else if (PTR_ERR(rt2) == -EPERM) {
452 if (rt)
453 dst_release(&rt->dst);
454 return rt2;
455 } else {
456 err = PTR_ERR(rt2);
457 goto relookup_failed;
458 }
459 return rt;
460
461relookup_failed:
462 if (rt)
463 return rt;
464 return ERR_PTR(err);
465}
405 466
406/* 467/*
407 * Send an ICMP message in response to a situation 468 * Send an ICMP message in response to a situation
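icmp_route_lookup() also illustrates the error convention this series converts the routing calls to: a single return value carries either the route or a negative errno encoded with ERR_PTR(), so there is no out-parameter left half-initialized, and callers test IS_ERR()/PTR_ERR() instead of an int result. A user-space sketch of the convention; the helpers mimic the kernel's, and route_lookup() is a hypothetical stand-in:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095	/* errnos live in the top page of the address space */

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct rtable_stub { int id; };

static struct rtable_stub *route_lookup(int reachable)
{
	static struct rtable_stub rt = { .id = 1 };

	return reachable ? &rt : ERR_PTR(-ENETUNREACH);
}

int main(void)
{
	struct rtable_stub *rt = route_lookup(0);

	if (IS_ERR(rt))		/* was: if (ip_route_output_key(net, &rt, &fl)) */
		printf("lookup failed: %ld\n", PTR_ERR(rt));
	else
		printf("route id %d\n", rt->id);
	return 0;
}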
@@ -507,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
507 rcu_read_lock(); 568 rcu_read_lock();
508 if (rt_is_input_route(rt) && 569 if (rt_is_input_route(rt) &&
509 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 570 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
510 dev = dev_get_by_index_rcu(net, rt->fl.iif); 571 dev = dev_get_by_index_rcu(net, rt->rt_iif);
511 572
512 if (dev) 573 if (dev)
513 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 574 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -539,86 +600,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
539 ipc.opt = &icmp_param.replyopts; 600 ipc.opt = &icmp_param.replyopts;
540 ipc.tx_flags = 0; 601 ipc.tx_flags = 0;
541 602
542 { 603 rt = icmp_route_lookup(net, skb_in, iph, saddr, tos,
543 struct flowi fl = { 604 type, code, &icmp_param);
544 .fl4_dst = icmp_param.replyopts.srr ? 605 if (IS_ERR(rt))
545 icmp_param.replyopts.faddr : iph->saddr, 606 goto out_unlock;
546 .fl4_src = saddr,
547 .fl4_tos = RT_TOS(tos),
548 .proto = IPPROTO_ICMP,
549 .fl_icmp_type = type,
550 .fl_icmp_code = code,
551 };
552 int err;
553 struct rtable *rt2;
554
555 security_skb_classify_flow(skb_in, &fl);
556 if (__ip_route_output_key(net, &rt, &fl))
557 goto out_unlock;
558
559 /* No need to clone since we're just using its address. */
560 rt2 = rt;
561
562 if (!fl.nl_u.ip4_u.saddr)
563 fl.nl_u.ip4_u.saddr = rt->rt_src;
564
565 err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
566 switch (err) {
567 case 0:
568 if (rt != rt2)
569 goto route_done;
570 break;
571 case -EPERM:
572 rt = NULL;
573 break;
574 default:
575 goto out_unlock;
576 }
577
578 if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
579 goto relookup_failed;
580
581 if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
582 err = __ip_route_output_key(net, &rt2, &fl);
583 else {
584 struct flowi fl2 = {};
585 unsigned long orefdst;
586
587 fl2.fl4_dst = fl.fl4_src;
588 if (ip_route_output_key(net, &rt2, &fl2))
589 goto relookup_failed;
590
591 /* Ugh! */
592 orefdst = skb_in->_skb_refdst; /* save old refdst */
593 err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
594 RT_TOS(tos), rt2->dst.dev);
595
596 dst_release(&rt2->dst);
597 rt2 = skb_rtable(skb_in);
598 skb_in->_skb_refdst = orefdst; /* restore old refdst */
599 }
600
601 if (err)
602 goto relookup_failed;
603
604 err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
605 XFRM_LOOKUP_ICMP);
606 switch (err) {
607 case 0:
608 dst_release(&rt->dst);
609 rt = rt2;
610 break;
611 case -EPERM:
612 goto ende;
613 default:
614relookup_failed:
615 if (!rt)
616 goto out_unlock;
617 break;
618 }
619 }
620 607
621route_done:
622 if (!icmpv4_xrlim_allow(net, rt, type, code)) 608 if (!icmpv4_xrlim_allow(net, rt, type, code))
623 goto ende; 609 goto ende;
624 610
@@ -718,7 +704,7 @@ static void icmp_unreach(struct sk_buff *skb)
718 */ 704 */
719 705
720 /* 706 /*
721 * Check the other end isnt violating RFC 1122. Some routers send 707 * Check the other end isn't violating RFC 1122. Some routers send
722 * bogus responses to broadcast frames. If you see this message 708 * bogus responses to broadcast frames. If you see this message
723 * first check your netmask matches at both ends, if it does then 709 * first check your netmask matches at both ends, if it does then
724 * get the other vendor to fix their kit. 710 * get the other vendor to fix their kit.
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index e0e77e297de3..1fd3d9ce8398 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -321,14 +321,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
321 } 321 }
322 igmp_skb_size(skb) = size; 322 igmp_skb_size(skb) = size;
323 323
324 { 324 rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0,
325 struct flowi fl = { .oif = dev->ifindex, 325 0, 0,
326 .fl4_dst = IGMPV3_ALL_MCR, 326 IPPROTO_IGMP, 0, dev->ifindex);
327 .proto = IPPROTO_IGMP }; 327 if (IS_ERR(rt)) {
328 if (ip_route_output_key(net, &rt, &fl)) { 328 kfree_skb(skb);
329 kfree_skb(skb); 329 return NULL;
330 return NULL;
331 }
332 } 330 }
333 if (rt->rt_src == 0) { 331 if (rt->rt_src == 0) {
334 kfree_skb(skb); 332 kfree_skb(skb);
@@ -666,13 +664,12 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
666 else 664 else
667 dst = group; 665 dst = group;
668 666
669 { 667 rt = ip_route_output_ports(net, NULL, dst, 0,
670 struct flowi fl = { .oif = dev->ifindex, 668 0, 0,
671 .fl4_dst = dst, 669 IPPROTO_IGMP, 0, dev->ifindex);
672 .proto = IPPROTO_IGMP }; 670 if (IS_ERR(rt))
673 if (ip_route_output_key(net, &rt, &fl)) 671 return -1;
674 return -1; 672
675 }
676 if (rt->rt_src == 0) { 673 if (rt->rt_src == 0) {
677 ip_rt_put(rt); 674 ip_rt_put(rt);
678 return -1; 675 return -1;
@@ -1439,8 +1436,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
1439/* RTNL is locked */ 1436/* RTNL is locked */
1440static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) 1437static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1441{ 1438{
1442 struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr };
1443 struct rtable *rt;
1444 struct net_device *dev = NULL; 1439 struct net_device *dev = NULL;
1445 struct in_device *idev = NULL; 1440 struct in_device *idev = NULL;
1446 1441
@@ -1454,9 +1449,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
1454 return NULL; 1449 return NULL;
1455 } 1450 }
1456 1451
1457 if (!dev && !ip_route_output_key(net, &rt, &fl)) { 1452 if (!dev) {
1458 dev = rt->dst.dev; 1453 struct rtable *rt = ip_route_output(net,
1459 ip_rt_put(rt); 1454 imr->imr_multiaddr.s_addr,
1455 0, 0, 0);
1456 if (!IS_ERR(rt)) {
1457 dev = rt->dst.dev;
1458 ip_rt_put(rt);
1459 }
1460 } 1460 }
1461 if (dev) { 1461 if (dev) {
1462 imr->imr_ifindex = dev->ifindex; 1462 imr->imr_ifindex = dev->ifindex;
@@ -2329,13 +2329,13 @@ void ip_mc_drop_socket(struct sock *sk)
2329 rtnl_unlock(); 2329 rtnl_unlock();
2330} 2330}
2331 2331
2332int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) 2332/* called with rcu_read_lock() */
2333int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto)
2333{ 2334{
2334 struct ip_mc_list *im; 2335 struct ip_mc_list *im;
2335 struct ip_sf_list *psf; 2336 struct ip_sf_list *psf;
2336 int rv = 0; 2337 int rv = 0;
2337 2338
2338 rcu_read_lock();
2339 for_each_pmc_rcu(in_dev, im) { 2339 for_each_pmc_rcu(in_dev, im) {
2340 if (im->multiaddr == mc_addr) 2340 if (im->multiaddr == mc_addr)
2341 break; 2341 break;
@@ -2357,7 +2357,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p
2357 } else 2357 } else
2358 rv = 1; /* unspecified source; tentatively allow */ 2358 rv = 1; /* unspecified source; tentatively allow */
2359 } 2359 }
2360 rcu_read_unlock();
2361 return rv; 2360 return rv;
2362} 2361}
2363 2362
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 97e5fb765265..38f23e721b80 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -73,7 +73,7 @@ int inet_csk_bind_conflict(const struct sock *sk,
73 !sk2->sk_bound_dev_if || 73 !sk2->sk_bound_dev_if ||
74 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 74 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
75 if (!reuse || !sk2->sk_reuse || 75 if (!reuse || !sk2->sk_reuse ||
76 ((1 << sk2->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) { 76 sk2->sk_state == TCP_LISTEN) {
77 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); 77 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
78 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || 78 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
79 sk2_rcv_saddr == sk_rcv_saddr(sk)) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -122,8 +122,7 @@ again:
122 (tb->num_owners < smallest_size || smallest_size == -1)) { 122 (tb->num_owners < smallest_size || smallest_size == -1)) {
123 smallest_size = tb->num_owners; 123 smallest_size = tb->num_owners;
124 smallest_rover = rover; 124 smallest_rover = rover;
125 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && 125 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
126 !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
127 spin_unlock(&head->lock); 126 spin_unlock(&head->lock);
128 snum = smallest_rover; 127 snum = smallest_rover;
129 goto have_snum; 128 goto have_snum;
@@ -356,20 +355,23 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
356 struct rtable *rt; 355 struct rtable *rt;
357 const struct inet_request_sock *ireq = inet_rsk(req); 356 const struct inet_request_sock *ireq = inet_rsk(req);
358 struct ip_options *opt = inet_rsk(req)->opt; 357 struct ip_options *opt = inet_rsk(req)->opt;
359 struct flowi fl = { .oif = sk->sk_bound_dev_if, 358 struct flowi4 fl4 = {
360 .mark = sk->sk_mark, 359 .flowi4_oif = sk->sk_bound_dev_if,
361 .fl4_dst = ((opt && opt->srr) ? 360 .flowi4_mark = sk->sk_mark,
362 opt->faddr : ireq->rmt_addr), 361 .daddr = ((opt && opt->srr) ?
363 .fl4_src = ireq->loc_addr, 362 opt->faddr : ireq->rmt_addr),
364 .fl4_tos = RT_CONN_FLAGS(sk), 363 .saddr = ireq->loc_addr,
365 .proto = sk->sk_protocol, 364 .flowi4_tos = RT_CONN_FLAGS(sk),
366 .flags = inet_sk_flowi_flags(sk), 365 .flowi4_proto = sk->sk_protocol,
367 .fl_ip_sport = inet_sk(sk)->inet_sport, 366 .flowi4_flags = inet_sk_flowi_flags(sk),
368 .fl_ip_dport = ireq->rmt_port }; 367 .fl4_sport = inet_sk(sk)->inet_sport,
368 .fl4_dport = ireq->rmt_port,
369 };
369 struct net *net = sock_net(sk); 370 struct net *net = sock_net(sk);
370 371
371 security_req_classify_flow(req, &fl); 372 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
372 if (ip_route_output_flow(net, &rt, &fl, sk, 0)) 373 rt = ip_route_output_flow(net, &fl4, sk);
374 if (IS_ERR(rt))
373 goto no_route; 375 goto no_route;
374 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) 376 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
375 goto route_err; 377 goto route_err;
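The inet_csk_route_req() hunk above is representative of the struct flowi to struct flowi4 rename applied across net/ipv4 in this merge. The field mapping, as far as these hunks show it (the tree's include/net/flow.h is authoritative):

  old struct flowi              new struct flowi4
  .oif                          .flowi4_oif
  .mark                         .flowi4_mark
  .fl4_dst / .fl4_src           .daddr / .saddr
  .fl4_tos                      .flowi4_tos
  .proto                        .flowi4_proto
  .flags                        .flowi4_flags
  .fl_ip_sport / .fl_ip_dport   .fl4_sport / .fl4_dport

Alongside the rename, lookups that used to fill a caller-provided struct rtable ** and return an int now return the rtable (or an ERR_PTR) directly, as the ip_route_output_flow() call above shows.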
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index a96e65674ac3..9df4e635fb5f 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = {
81 81
82struct inet_peer_base { 82struct inet_peer_base {
83 struct inet_peer __rcu *root; 83 struct inet_peer __rcu *root;
84 spinlock_t lock; 84 seqlock_t lock;
85 int total; 85 int total;
86}; 86};
87 87
88static struct inet_peer_base v4_peers = { 88static struct inet_peer_base v4_peers = {
89 .root = peer_avl_empty_rcu, 89 .root = peer_avl_empty_rcu,
90 .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), 90 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
91 .total = 0, 91 .total = 0,
92}; 92};
93 93
94static struct inet_peer_base v6_peers = { 94static struct inet_peer_base v6_peers = {
95 .root = peer_avl_empty_rcu, 95 .root = peer_avl_empty_rcu,
96 .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), 96 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
97 .total = 0, 97 .total = 0,
98}; 98};
99 99
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
167 int i, n = (a->family == AF_INET ? 1 : 4); 167 int i, n = (a->family == AF_INET ? 1 : 4);
168 168
169 for (i = 0; i < n; i++) { 169 for (i = 0; i < n; i++) {
170 if (a->a6[i] == b->a6[i]) 170 if (a->addr.a6[i] == b->addr.a6[i])
171 continue; 171 continue;
172 if (a->a6[i] < b->a6[i]) 172 if (a->addr.a6[i] < b->addr.a6[i])
173 return -1; 173 return -1;
174 return 1; 174 return 1;
175 } 175 }
@@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a,
177 return 0; 177 return 0;
178} 178}
179 179
180#define rcu_deref_locked(X, BASE) \
181 rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
182
180/* 183/*
181 * Called with local BH disabled and the pool lock held. 184 * Called with local BH disabled and the pool lock held.
182 */ 185 */
@@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a,
187 \ 190 \
188 stackptr = _stack; \ 191 stackptr = _stack; \
189 *stackptr++ = &_base->root; \ 192 *stackptr++ = &_base->root; \
190 for (u = rcu_dereference_protected(_base->root, \ 193 for (u = rcu_deref_locked(_base->root, _base); \
191 lockdep_is_held(&_base->lock)); \
192 u != peer_avl_empty; ) { \ 194 u != peer_avl_empty; ) { \
193 int cmp = addr_compare(_daddr, &u->daddr); \ 195 int cmp = addr_compare(_daddr, &u->daddr); \
194 if (cmp == 0) \ 196 if (cmp == 0) \
@@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a,
198 else \ 200 else \
199 v = &u->avl_right; \ 201 v = &u->avl_right; \
200 *stackptr++ = v; \ 202 *stackptr++ = v; \
201 u = rcu_dereference_protected(*v, \ 203 u = rcu_deref_locked(*v, _base); \
202 lockdep_is_held(&_base->lock)); \
203 } \ 204 } \
204 u; \ 205 u; \
205}) 206})
206 207
207/* 208/*
208 * Called with rcu_read_lock_bh() 209 * Called with rcu_read_lock()
209 * Because we hold no lock against a writer, it's quite possible we fall 210 * in an endless loop.
210 * in an endless loop. 211 * in an endless loop.
211 * But every pointer we follow is guaranteed to be valid thanks to RCU. 212 * But every pointer we follow is guaranteed to be valid thanks to RCU.
212 * We exit from this function if number of links exceeds PEER_MAXDEPTH 213 * We exit from this function if number of links exceeds PEER_MAXDEPTH
213 */ 214 */
214static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, 215static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
215 struct inet_peer_base *base) 216 struct inet_peer_base *base)
216{ 217{
217 struct inet_peer *u = rcu_dereference_bh(base->root); 218 struct inet_peer *u = rcu_dereference(base->root);
218 int count = 0; 219 int count = 0;
219 220
220 while (u != peer_avl_empty) { 221 while (u != peer_avl_empty) {
@@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
230 return u; 231 return u;
231 } 232 }
232 if (cmp == -1) 233 if (cmp == -1)
233 u = rcu_dereference_bh(u->avl_left); 234 u = rcu_dereference(u->avl_left);
234 else 235 else
235 u = rcu_dereference_bh(u->avl_right); 236 u = rcu_dereference(u->avl_right);
236 if (unlikely(++count == PEER_MAXDEPTH)) 237 if (unlikely(++count == PEER_MAXDEPTH))
237 break; 238 break;
238 } 239 }
@@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
246 struct inet_peer __rcu **v; \ 247 struct inet_peer __rcu **v; \
247 *stackptr++ = &start->avl_left; \ 248 *stackptr++ = &start->avl_left; \
248 v = &start->avl_left; \ 249 v = &start->avl_left; \
249 for (u = rcu_dereference_protected(*v, \ 250 for (u = rcu_deref_locked(*v, base); \
250 lockdep_is_held(&base->lock)); \
251 u->avl_right != peer_avl_empty_rcu; ) { \ 251 u->avl_right != peer_avl_empty_rcu; ) { \
252 v = &u->avl_right; \ 252 v = &u->avl_right; \
253 *stackptr++ = v; \ 253 *stackptr++ = v; \
254 u = rcu_dereference_protected(*v, \ 254 u = rcu_deref_locked(*v, base); \
255 lockdep_is_held(&base->lock)); \
256 } \ 255 } \
257 u; \ 256 u; \
258}) 257})
@@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
271 270
272 while (stackend > stack) { 271 while (stackend > stack) {
273 nodep = *--stackend; 272 nodep = *--stackend;
274 node = rcu_dereference_protected(*nodep, 273 node = rcu_deref_locked(*nodep, base);
275 lockdep_is_held(&base->lock)); 274 l = rcu_deref_locked(node->avl_left, base);
276 l = rcu_dereference_protected(node->avl_left, 275 r = rcu_deref_locked(node->avl_right, base);
277 lockdep_is_held(&base->lock));
278 r = rcu_dereference_protected(node->avl_right,
279 lockdep_is_held(&base->lock));
280 lh = node_height(l); 276 lh = node_height(l);
281 rh = node_height(r); 277 rh = node_height(r);
282 if (lh > rh + 1) { /* l: RH+2 */ 278 if (lh > rh + 1) { /* l: RH+2 */
283 struct inet_peer *ll, *lr, *lrl, *lrr; 279 struct inet_peer *ll, *lr, *lrl, *lrr;
284 int lrh; 280 int lrh;
285 ll = rcu_dereference_protected(l->avl_left, 281 ll = rcu_deref_locked(l->avl_left, base);
286 lockdep_is_held(&base->lock)); 282 lr = rcu_deref_locked(l->avl_right, base);
287 lr = rcu_dereference_protected(l->avl_right,
288 lockdep_is_held(&base->lock));
289 lrh = node_height(lr); 283 lrh = node_height(lr);
290 if (lrh <= node_height(ll)) { /* ll: RH+1 */ 284 if (lrh <= node_height(ll)) { /* ll: RH+1 */
291 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ 285 RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */
@@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
296 l->avl_height = node->avl_height + 1; 290 l->avl_height = node->avl_height + 1;
297 RCU_INIT_POINTER(*nodep, l); 291 RCU_INIT_POINTER(*nodep, l);
298 } else { /* ll: RH, lr: RH+1 */ 292 } else { /* ll: RH, lr: RH+1 */
299 lrl = rcu_dereference_protected(lr->avl_left, 293 lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
300 lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ 294 lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
301 lrr = rcu_dereference_protected(lr->avl_right,
302 lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */
303 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ 295 RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */
304 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ 296 RCU_INIT_POINTER(node->avl_right, r); /* r: RH */
305 node->avl_height = rh + 1; /* node: RH+1 */ 297 node->avl_height = rh + 1; /* node: RH+1 */
@@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
314 } else if (rh > lh + 1) { /* r: LH+2 */ 306 } else if (rh > lh + 1) { /* r: LH+2 */
315 struct inet_peer *rr, *rl, *rlr, *rll; 307 struct inet_peer *rr, *rl, *rlr, *rll;
316 int rlh; 308 int rlh;
317 rr = rcu_dereference_protected(r->avl_right, 309 rr = rcu_deref_locked(r->avl_right, base);
318 lockdep_is_held(&base->lock)); 310 rl = rcu_deref_locked(r->avl_left, base);
319 rl = rcu_dereference_protected(r->avl_left,
320 lockdep_is_held(&base->lock));
321 rlh = node_height(rl); 311 rlh = node_height(rl);
322 if (rlh <= node_height(rr)) { /* rr: LH+1 */ 312 if (rlh <= node_height(rr)) { /* rr: LH+1 */
323 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ 313 RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */
@@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
328 r->avl_height = node->avl_height + 1; 318 r->avl_height = node->avl_height + 1;
329 RCU_INIT_POINTER(*nodep, r); 319 RCU_INIT_POINTER(*nodep, r);
330 } else { /* rr: RH, rl: RH+1 */ 320 } else { /* rr: RH, rl: RH+1 */
331 rlr = rcu_dereference_protected(rl->avl_right, 321 rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
332 lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ 322 rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
333 rll = rcu_dereference_protected(rl->avl_left,
334 lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */
335 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ 323 RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */
336 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ 324 RCU_INIT_POINTER(node->avl_left, l); /* l: LH */
337 node->avl_height = lh + 1; /* node: LH+1 */ 325 node->avl_height = lh + 1; /* node: LH+1 */
@@ -366,13 +354,14 @@ static void inetpeer_free_rcu(struct rcu_head *head)
366} 354}
367 355
368/* May be called with local BH enabled. */ 356/* May be called with local BH enabled. */
369static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) 357static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
358 struct inet_peer __rcu **stack[PEER_MAXDEPTH])
370{ 359{
371 int do_free; 360 int do_free;
372 361
373 do_free = 0; 362 do_free = 0;
374 363
375 spin_lock_bh(&base->lock); 364 write_seqlock_bh(&base->lock);
376 /* Check the reference counter. It was artificially incremented by 1 365 /* Check the reference counter. It was artificially incremented by 1
377 * in the cleanup() function to prevent it from suddenly disappearing. If we can 366 * in the cleanup() function to prevent it from suddenly disappearing. If we can
378 * atomically (because of lockless readers) take this last reference, 367 * atomically (because of lockless readers) take this last reference,
@@ -380,7 +369,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
380 * We use refcnt=-1 to alert lockless readers this entry is deleted. 369 * We use refcnt=-1 to alert lockless readers this entry is deleted.
381 */ 370 */
382 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { 371 if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
383 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
384 struct inet_peer __rcu ***stackptr, ***delp; 372 struct inet_peer __rcu ***stackptr, ***delp;
385 if (lookup(&p->daddr, stack, base) != p) 373 if (lookup(&p->daddr, stack, base) != p)
386 BUG(); 374 BUG();
@@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
392 /* look for a node to insert instead of p */ 380 /* look for a node to insert instead of p */
393 struct inet_peer *t; 381 struct inet_peer *t;
394 t = lookup_rightempty(p, base); 382 t = lookup_rightempty(p, base);
395 BUG_ON(rcu_dereference_protected(*stackptr[-1], 383 BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
396 lockdep_is_held(&base->lock)) != t);
397 **--stackptr = t->avl_left; 384 **--stackptr = t->avl_left;
398 /* t is removed, t->daddr > x->daddr for any 385 /* t is removed, t->daddr > x->daddr for any
399 * x in p->avl_left subtree. 386 * x in p->avl_left subtree.
@@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
409 base->total--; 396 base->total--;
410 do_free = 1; 397 do_free = 1;
411 } 398 }
412 spin_unlock_bh(&base->lock); 399 write_sequnlock_bh(&base->lock);
413 400
414 if (do_free) 401 if (do_free)
415 call_rcu_bh(&p->rcu, inetpeer_free_rcu); 402 call_rcu(&p->rcu, inetpeer_free_rcu);
416 else 403 else
417 /* The node is used again. Decrease the reference counter 404 /* The node is used again. Decrease the reference counter
418 * back. The loop "cleanup -> unlink_from_unused 405 * back. The loop "cleanup -> unlink_from_unused
@@ -435,7 +422,7 @@ static struct inet_peer_base *peer_to_base(struct inet_peer *p)
435} 422}
436 423
437/* May be called with local BH enabled. */ 424/* May be called with local BH enabled. */
438static int cleanup_once(unsigned long ttl) 425static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
439{ 426{
440 struct inet_peer *p = NULL; 427 struct inet_peer *p = NULL;
441 428
@@ -467,7 +454,7 @@ static int cleanup_once(unsigned long ttl)
467 * happen because of entry limits in route cache. */ 454 * happen because of entry limits in route cache. */
468 return -1; 455 return -1;
469 456
470 unlink_from_pool(p, peer_to_base(p)); 457 unlink_from_pool(p, peer_to_base(p), stack);
471 return 0; 458 return 0;
472} 459}
473 460
@@ -477,13 +464,17 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
477 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 464 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
478 struct inet_peer_base *base = family_to_base(daddr->family); 465 struct inet_peer_base *base = family_to_base(daddr->family);
479 struct inet_peer *p; 466 struct inet_peer *p;
467 unsigned int sequence;
468 int invalidated;
480 469
481 /* Look up for the address quickly, lockless. 470 /* Look up for the address quickly, lockless.
482 * Because of a concurrent writer, we might not find an existing entry. 471 * Because of a concurrent writer, we might not find an existing entry.
483 */ 472 */
484 rcu_read_lock_bh(); 473 rcu_read_lock();
485 p = lookup_rcu_bh(daddr, base); 474 sequence = read_seqbegin(&base->lock);
486 rcu_read_unlock_bh(); 475 p = lookup_rcu(daddr, base);
476 invalidated = read_seqretry(&base->lock, sequence);
477 rcu_read_unlock();
487 478
488 if (p) { 479 if (p) {
489 /* The existing node has been found. 480 /* The existing node has been found.
@@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
493 return p; 484 return p;
494 } 485 }
495 486
487 /* If no writer did a change during our lookup, we can return early. */
488 if (!create && !invalidated)
489 return NULL;
490
496 /* retry an exact lookup, taking the lock before. 491 /* retry an exact lookup, taking the lock before.
497 * At least, nodes should be hot in our cache. 492 * At least, nodes should be hot in our cache.
498 */ 493 */
499 spin_lock_bh(&base->lock); 494 write_seqlock_bh(&base->lock);
500 p = lookup(daddr, stack, base); 495 p = lookup(daddr, stack, base);
501 if (p != peer_avl_empty) { 496 if (p != peer_avl_empty) {
502 atomic_inc(&p->refcnt); 497 atomic_inc(&p->refcnt);
503 spin_unlock_bh(&base->lock); 498 write_sequnlock_bh(&base->lock);
504 /* Remove the entry from unused list if it was there. */ 499 /* Remove the entry from unused list if it was there. */
505 unlink_from_unused(p); 500 unlink_from_unused(p);
506 return p; 501 return p;
@@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
510 p->daddr = *daddr; 505 p->daddr = *daddr;
511 atomic_set(&p->refcnt, 1); 506 atomic_set(&p->refcnt, 1);
512 atomic_set(&p->rid, 0); 507 atomic_set(&p->rid, 0);
513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); 508 atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
514 p->tcp_ts_stamp = 0; 509 p->tcp_ts_stamp = 0;
510 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
511 p->rate_tokens = 0;
512 p->rate_last = 0;
513 p->pmtu_expires = 0;
514 p->pmtu_orig = 0;
515 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
515 INIT_LIST_HEAD(&p->unused); 516 INIT_LIST_HEAD(&p->unused);
516 517
517 518
@@ -519,11 +520,11 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
519 link_to_pool(p, base); 520 link_to_pool(p, base);
520 base->total++; 521 base->total++;
521 } 522 }
522 spin_unlock_bh(&base->lock); 523 write_sequnlock_bh(&base->lock);
523 524
524 if (base->total >= inet_peer_threshold) 525 if (base->total >= inet_peer_threshold)
525 /* Remove one less-recently-used entry. */ 526 /* Remove one less-recently-used entry. */
526 cleanup_once(0); 527 cleanup_once(0, stack);
527 528
528 return p; 529 return p;
529} 530}
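
The hunk above is the heart of the inetpeer seqlock conversion: the AVL lookup runs locklessly under RCU, and the sequence counter tells the caller whether a concurrent writer could explain a miss. If the lookup failed and the counter did not move (!create && !invalidated), the entry is genuinely absent and the write-locked retry is skipped. A rough user-space analogue of that counter protocol, using C11 atomics in place of the kernel's seqlock_t (single writer assumed; the trailing-underscore names are illustrative, not kernel API):

#include <stdatomic.h>

/* even = quiescent, odd = write in progress; the kernel's seqlock_t
 * also embeds a spinlock to serialize writers, omitted here */
static _Atomic unsigned int seqcount_;

static unsigned int read_seqbegin_(void)
{
	unsigned int s;

	do {
		s = atomic_load_explicit(&seqcount_, memory_order_acquire);
	} while (s & 1);			/* writer active: wait */
	return s;
}

static int read_seqretry_(unsigned int start)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&seqcount_, memory_order_relaxed) != start;
}

static void write_seqlock_(void)   { atomic_fetch_add(&seqcount_, 1); }
static void write_sequnlock_(void) { atomic_fetch_add(&seqcount_, 1); }

/* reader, shaped like inet_getpeer() above:
 *
 *	seq = read_seqbegin_();
 *	p = lookup_lockless(key);
 *	invalidated = read_seqretry_(seq);
 *	if (p)
 *		return p;		// hit: safe to use
 *	if (!create && !invalidated)
 *		return NULL;		// miss with no writer: truly absent
 *	// else fall back to write_seqlock_() and the locked lookup
 */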
@@ -539,6 +540,7 @@ static void peer_check_expire(unsigned long dummy)
539{ 540{
540 unsigned long now = jiffies; 541 unsigned long now = jiffies;
541 int ttl, total; 542 int ttl, total;
543 struct inet_peer __rcu **stack[PEER_MAXDEPTH];
542 544
543 total = compute_total(); 545 total = compute_total();
544 if (total >= inet_peer_threshold) 546 if (total >= inet_peer_threshold)
@@ -547,7 +549,7 @@ static void peer_check_expire(unsigned long dummy)
547 ttl = inet_peer_maxttl 549 ttl = inet_peer_maxttl
548 - (inet_peer_maxttl - inet_peer_minttl) / HZ * 550 - (inet_peer_maxttl - inet_peer_minttl) / HZ *
549 total / inet_peer_threshold * HZ; 551 total / inet_peer_threshold * HZ;
550 while (!cleanup_once(ttl)) { 552 while (!cleanup_once(ttl, stack)) {
551 if (jiffies != now) 553 if (jiffies != now)
552 break; 554 break;
553 } 555 }
@@ -579,3 +581,44 @@ void inet_putpeer(struct inet_peer *p)
579 local_bh_enable(); 581 local_bh_enable();
580} 582}
581EXPORT_SYMBOL_GPL(inet_putpeer); 583EXPORT_SYMBOL_GPL(inet_putpeer);
584
585/*
586 * Check transmit rate limitation for given message.
587 * The rate information is held in the inet_peer entries now.
588 * This function is generic and could be used for other purposes
589 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
590 *
591 * Note that the same inet_peer fields are modified by functions in
592 * route.c too, but these work for packet destinations while xrlim_allow
593 * works for icmp destinations. This means the rate limiting information
594 * for one "ip object" is shared - and these ICMPs are twice limited:
595 * by source and by destination.
596 *
597 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
598 * SHOULD allow setting of rate limits
599 *
600 * Shared between ICMPv4 and ICMPv6.
601 */
602#define XRLIM_BURST_FACTOR 6
603bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
604{
605 unsigned long now, token;
606 bool rc = false;
607
608 if (!peer)
609 return true;
610
611 token = peer->rate_tokens;
612 now = jiffies;
613 token += now - peer->rate_last;
614 peer->rate_last = now;
615 if (token > XRLIM_BURST_FACTOR * timeout)
616 token = XRLIM_BURST_FACTOR * timeout;
617 if (token >= timeout) {
618 token -= timeout;
619 rc = true;
620 }
621 peer->rate_tokens = token;
622 return rc;
623}
624EXPORT_SYMBOL(inet_peer_xrlim_allow);
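
inet_peer_xrlim_allow() above is a textbook token bucket: one token accrues per jiffy since rate_last, the bucket is capped at XRLIM_BURST_FACTOR * timeout, and each transmitted ICMP spends timeout tokens. The same arithmetic as a self-contained user-space program, with a small driver to make the allow/drop pattern visible (plain C; `now` stands in for jiffies):

#include <stdbool.h>
#include <stdio.h>

#define XRLIM_BURST_FACTOR 6

struct peer { unsigned long rate_tokens, rate_last; };

static bool xrlim_allow(struct peer *p, unsigned long now, unsigned long timeout)
{
	unsigned long token = p->rate_tokens + (now - p->rate_last);

	p->rate_last = now;
	if (token > XRLIM_BURST_FACTOR * timeout)
		token = XRLIM_BURST_FACTOR * timeout;	/* cap the burst */
	if (token >= timeout) {
		p->rate_tokens = token - timeout;	/* spend one message */
		return true;
	}
	p->rate_tokens = token;
	return false;
}

int main(void)
{
	struct peer p = { 0, 0 };
	unsigned long now;

	/* timeout = 100 "jiffies": a long-idle peer holds a full bucket of
	 * 600 tokens, so the first probes are allowed back to back; once
	 * drained, roughly one message per 100 jiffies gets through */
	for (now = 600; now <= 1000; now += 50)
		printf("t=%4lu  %s\n", now,
		       xrlim_allow(&p, now, 100) ? "send" : "drop");
	return 0;
}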
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index a1151b8adf3c..b1d282f11be7 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -223,31 +223,30 @@ static void ip_expire(unsigned long arg)
223 223
224 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { 224 if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
225 struct sk_buff *head = qp->q.fragments; 225 struct sk_buff *head = qp->q.fragments;
226 const struct iphdr *iph;
227 int err;
226 228
227 rcu_read_lock(); 229 rcu_read_lock();
228 head->dev = dev_get_by_index_rcu(net, qp->iif); 230 head->dev = dev_get_by_index_rcu(net, qp->iif);
229 if (!head->dev) 231 if (!head->dev)
230 goto out_rcu_unlock; 232 goto out_rcu_unlock;
231 233
234 /* skb dst is stale, drop it, and perform route lookup again */
235 skb_dst_drop(head);
236 iph = ip_hdr(head);
237 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
238 iph->tos, head->dev);
239 if (err)
240 goto out_rcu_unlock;
241
232 /* 242 /*
233 * Only search router table for the head fragment, 243 * Only an end host needs to send an ICMP
234 * when defraging timeout at PRE_ROUTING HOOK. 244 * "Fragment Reassembly Timeout" message, per RFC792.
235 */ 245 */
236 if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { 246 if (qp->user == IP_DEFRAG_CONNTRACK_IN &&
237 const struct iphdr *iph = ip_hdr(head); 247 skb_rtable(head)->rt_type != RTN_LOCAL)
238 int err = ip_route_input(head, iph->daddr, iph->saddr, 248 goto out_rcu_unlock;
239 iph->tos, head->dev);
240 if (unlikely(err))
241 goto out_rcu_unlock;
242
243 /*
244 * Only an end host needs to send an ICMP
245 * "Fragment Reassembly Timeout" message, per RFC792.
246 */
247 if (skb_rtable(head)->rt_type != RTN_LOCAL)
248 goto out_rcu_unlock;
249 249
250 }
251 250
252 /* Send an ICMP "Fragment Reassembly Timeout" message. */ 251 /* Send an ICMP "Fragment Reassembly Timeout" message. */
253 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); 252 icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
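
Condensing the rewritten ip_expire() path above: the cached dst on the head fragment may be stale by the time the reassembly timer fires, so the route is always re-resolved before deciding whether to speak. This is a paraphrase of the hunk, not additional kernel code:

	skb_dst_drop(head);			/* cached route may be stale */
	iph = ip_hdr(head);
	if (ip_route_input_noref(head, iph->daddr, iph->saddr,
				 iph->tos, head->dev))
		goto out_rcu_unlock;		/* unroutable: stay silent */

	/* RFC 792: only the end host reports "Fragment Reassembly
	 * Timeout"; a conntrack defrag point on the forwarding path
	 * (non-RTN_LOCAL route) must not */
	if (qp->user == IP_DEFRAG_CONNTRACK_IN &&
	    skb_rtable(head)->rt_type != RTN_LOCAL)
		goto out_rcu_unlock;

	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);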
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 6613edfac28c..da5941f18c3c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -769,19 +769,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
769 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); 769 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
770 } 770 }
771 771
772 { 772 rt = ip_route_output_gre(dev_net(dev), dst, tiph->saddr,
773 struct flowi fl = { 773 tunnel->parms.o_key, RT_TOS(tos),
774 .oif = tunnel->parms.link, 774 tunnel->parms.link);
775 .fl4_dst = dst, 775 if (IS_ERR(rt)) {
776 .fl4_src = tiph->saddr, 776 dev->stats.tx_carrier_errors++;
777 .fl4_tos = RT_TOS(tos), 777 goto tx_error;
778 .proto = IPPROTO_GRE,
779 .fl_gre_key = tunnel->parms.o_key
780 };
781 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
782 dev->stats.tx_carrier_errors++;
783 goto tx_error;
784 }
785 } 778 }
786 tdev = rt->dst.dev; 779 tdev = rt->dst.dev;
787 780
@@ -945,17 +938,13 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
945 /* Guess output device to choose reasonable mtu and needed_headroom */ 938 /* Guess output device to choose reasonable mtu and needed_headroom */
946 939
947 if (iph->daddr) { 940 if (iph->daddr) {
948 struct flowi fl = { 941 struct rtable *rt = ip_route_output_gre(dev_net(dev),
949 .oif = tunnel->parms.link, 942 iph->daddr, iph->saddr,
950 .fl4_dst = iph->daddr, 943 tunnel->parms.o_key,
951 .fl4_src = iph->saddr, 944 RT_TOS(iph->tos),
952 .fl4_tos = RT_TOS(iph->tos), 945 tunnel->parms.link);
953 .proto = IPPROTO_GRE, 946
954 .fl_gre_key = tunnel->parms.o_key 947 if (!IS_ERR(rt)) {
955 };
956 struct rtable *rt;
957
958 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
959 tdev = rt->dst.dev; 948 tdev = rt->dst.dev;
960 ip_rt_put(rt); 949 ip_rt_put(rt);
961 } 950 }
@@ -1207,17 +1196,14 @@ static int ipgre_open(struct net_device *dev)
1207 struct ip_tunnel *t = netdev_priv(dev); 1196 struct ip_tunnel *t = netdev_priv(dev);
1208 1197
1209 if (ipv4_is_multicast(t->parms.iph.daddr)) { 1198 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1210 struct flowi fl = { 1199 struct rtable *rt = ip_route_output_gre(dev_net(dev),
1211 .oif = t->parms.link, 1200 t->parms.iph.daddr,
1212 .fl4_dst = t->parms.iph.daddr, 1201 t->parms.iph.saddr,
1213 .fl4_src = t->parms.iph.saddr, 1202 t->parms.o_key,
1214 .fl4_tos = RT_TOS(t->parms.iph.tos), 1203 RT_TOS(t->parms.iph.tos),
1215 .proto = IPPROTO_GRE, 1204 t->parms.link);
1216 .fl_gre_key = t->parms.o_key 1205
1217 }; 1206 if (IS_ERR(rt))
1218 struct rtable *rt;
1219
1220 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1221 return -EADDRNOTAVAIL; 1207 return -EADDRNOTAVAIL;
1222 dev = rt->dst.dev; 1208 dev = rt->dst.dev;
1223 ip_rt_put(rt); 1209 ip_rt_put(rt);
@@ -1765,4 +1751,4 @@ module_exit(ipgre_fini);
1765MODULE_LICENSE("GPL"); 1751MODULE_LICENSE("GPL");
1766MODULE_ALIAS_RTNL_LINK("gre"); 1752MODULE_ALIAS_RTNL_LINK("gre");
1767MODULE_ALIAS_RTNL_LINK("gretap"); 1753MODULE_ALIAS_RTNL_LINK("gretap");
1768MODULE_ALIAS("gre0"); 1754MODULE_ALIAS_NETDEV("gre0");
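
All three ip_gre.c call sites above collapse their open-coded struct flowi initializers into one route helper. Judging purely from those call sites, ip_route_output_gre() plausibly reduces to the following thin wrapper over the new flowi4 key (a sketch inferred from this diff, not a verified copy of include/net/route.h):

static inline struct rtable *ip_route_output_gre(struct net *net,
						 __be32 daddr, __be32 saddr,
						 __be32 gre_key, __u8 tos,
						 int oif)
{
	struct flowi4 fl4 = {
		.flowi4_oif   = oif,
		.daddr        = daddr,
		.saddr        = saddr,
		.flowi4_tos   = tos,
		.flowi4_proto = IPPROTO_GRE,
		.fl4_gre_key  = gre_key,
	};
	return ip_route_output_key(net, &fl4);
}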
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..d7b2b0987a3b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 } 340 }
341 } 341 }
342 342
343#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_IP_ROUTE_CLASSID
344 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
346 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 1906fa35860c..2391b24e8251 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -140,11 +140,11 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
140 } else { 140 } else {
141 dopt->ts_needtime = 0; 141 dopt->ts_needtime = 0;
142 142
143 if (soffset + 8 <= optlen) { 143 if (soffset + 7 <= optlen) {
144 __be32 addr; 144 __be32 addr;
145 145
146 memcpy(&addr, sptr+soffset-1, 4); 146 memcpy(&addr, dptr+soffset-1, 4);
147 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_LOCAL) { 147 if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
148 dopt->ts_needtime = 1; 148 dopt->ts_needtime = 1;
149 soffset += 8; 149 soffset += 8;
150 } 150 }
@@ -329,7 +329,7 @@ int ip_options_compile(struct net *net,
329 pp_ptr = optptr + 2; 329 pp_ptr = optptr + 2;
330 goto error; 330 goto error;
331 } 331 }
332 if (skb) { 332 if (rt) {
333 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 333 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
334 opt->is_changed = 1; 334 opt->is_changed = 1;
335 } 335 }
@@ -371,7 +371,7 @@ int ip_options_compile(struct net *net,
371 goto error; 371 goto error;
372 } 372 }
373 opt->ts = optptr - iph; 373 opt->ts = optptr - iph;
374 if (skb) { 374 if (rt) {
375 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); 375 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
376 timeptr = (__be32*)&optptr[optptr[2]+3]; 376 timeptr = (__be32*)&optptr[optptr[2]+3];
377 } 377 }
@@ -603,7 +603,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
603 unsigned long orefdst; 603 unsigned long orefdst;
604 int err; 604 int err;
605 605
606 if (!opt->srr) 606 if (!opt->srr || !rt)
607 return 0; 607 return 0;
608 608
609 if (skb->pkt_type != PACKET_HOST) 609 if (skb->pkt_type != PACKET_HOST)
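
For reference when reading the ip_options_echo() hunk above: with prespecified addresses (flag 3), every slot in a timestamp option is an 8-byte (address, timestamp) pair, and soffset is the option's 1-based pointer field, so a complete slot occupies option bytes soffset-1 through soffset+6 — exactly the new `soffset + 7 <= optlen` bound (the old `+ 8` was one byte too strict). The memcpy now also reads the address from dptr, the reply option being assembled, rather than from the received copy. Option layout per RFC 791:

/*
 *	+--------+--------+--------+---------+
 *	| type=68| length | pointer|oflw|flg |
 *	+--------+--------+--------+---------+
 *	|        internet address (4)        |  <- slot begins at pointer-1
 *	+--------+--------+--------+---------+
 *	|           timestamp (4)            |
 *	+--------+--------+--------+---------+
 *	|                ...                 |
 */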
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04c7b3ba6b39..459c011b1d4a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -339,25 +339,19 @@ int ip_queue_xmit(struct sk_buff *skb)
339 if(opt && opt->srr) 339 if(opt && opt->srr)
340 daddr = opt->faddr; 340 daddr = opt->faddr;
341 341
342 { 342 /* If this fails, retransmit mechanism of transport layer will
343 struct flowi fl = { .oif = sk->sk_bound_dev_if, 343 * keep trying until route appears or the connection times
344 .mark = sk->sk_mark, 344 * itself out.
345 .fl4_dst = daddr, 345 */
346 .fl4_src = inet->inet_saddr, 346 rt = ip_route_output_ports(sock_net(sk), sk,
347 .fl4_tos = RT_CONN_FLAGS(sk), 347 daddr, inet->inet_saddr,
348 .proto = sk->sk_protocol, 348 inet->inet_dport,
349 .flags = inet_sk_flowi_flags(sk), 349 inet->inet_sport,
350 .fl_ip_sport = inet->inet_sport, 350 sk->sk_protocol,
351 .fl_ip_dport = inet->inet_dport }; 351 RT_CONN_FLAGS(sk),
352 352 sk->sk_bound_dev_if);
353 /* If this fails, retransmit mechanism of transport layer will 353 if (IS_ERR(rt))
354 * keep trying until route appears or the connection times 354 goto no_route;
355 * itself out.
356 */
357 security_sk_classify_flow(sk, &fl);
358 if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
359 goto no_route;
360 }
361 sk_setup_caps(sk, &rt->dst); 355 sk_setup_caps(sk, &rt->dst);
362 } 356 }
363 skb_dst_set_noref(skb, &rt->dst); 357 skb_dst_set_noref(skb, &rt->dst);
@@ -609,7 +603,7 @@ slow_path:
609 /* IF: it doesn't fit, use 'mtu' - the data space left */ 603 /* IF: it doesn't fit, use 'mtu' - the data space left */
610 if (len > mtu) 604 if (len > mtu)
611 len = mtu; 605 len = mtu;
612 /* IF: we are not sending upto and including the packet end 606 /* IF: we are not sending up to and including the packet end
613 then align the next start on an eight byte boundary */ 607 then align the next start on an eight byte boundary */
614 if (len < left) { 608 if (len < left) {
615 len &= ~7; 609 len &= ~7;
@@ -733,6 +727,7 @@ csum_page(struct page *page, int offset, int copy)
733} 727}
734 728
735static inline int ip_ufo_append_data(struct sock *sk, 729static inline int ip_ufo_append_data(struct sock *sk,
730 struct sk_buff_head *queue,
736 int getfrag(void *from, char *to, int offset, int len, 731 int getfrag(void *from, char *to, int offset, int len,
737 int odd, struct sk_buff *skb), 732 int odd, struct sk_buff *skb),
738 void *from, int length, int hh_len, int fragheaderlen, 733 void *from, int length, int hh_len, int fragheaderlen,
@@ -745,7 +740,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
745 * device, so create one single skb packet containing complete 740 * device, so create one single skb packet containing complete
746 * udp datagram 741 * udp datagram
747 */ 742 */
748 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { 743 if ((skb = skb_peek_tail(queue)) == NULL) {
749 skb = sock_alloc_send_skb(sk, 744 skb = sock_alloc_send_skb(sk,
750 hh_len + fragheaderlen + transhdrlen + 20, 745 hh_len + fragheaderlen + transhdrlen + 20,
751 (flags & MSG_DONTWAIT), &err); 746 (flags & MSG_DONTWAIT), &err);
@@ -767,40 +762,28 @@ static inline int ip_ufo_append_data(struct sock *sk,
767 762
768 skb->ip_summed = CHECKSUM_PARTIAL; 763 skb->ip_summed = CHECKSUM_PARTIAL;
769 skb->csum = 0; 764 skb->csum = 0;
770 sk->sk_sndmsg_off = 0;
771 765
772 /* specify the length of each IP datagram fragment */ 766 /* specify the length of each IP datagram fragment */
773 skb_shinfo(skb)->gso_size = mtu - fragheaderlen; 767 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
774 skb_shinfo(skb)->gso_type = SKB_GSO_UDP; 768 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
775 __skb_queue_tail(&sk->sk_write_queue, skb); 769 __skb_queue_tail(queue, skb);
776 } 770 }
777 771
778 return skb_append_datato_frags(sk, skb, getfrag, from, 772 return skb_append_datato_frags(sk, skb, getfrag, from,
779 (length - transhdrlen)); 773 (length - transhdrlen));
780} 774}
781 775
782/* 776static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
783 * ip_append_data() and ip_append_page() can make one large IP datagram 777 struct inet_cork *cork,
784 * from many pieces of data. Each pieces will be holded on the socket 778 int getfrag(void *from, char *to, int offset,
785 * until ip_push_pending_frames() is called. Each piece can be a page 779 int len, int odd, struct sk_buff *skb),
786 * or non-page data. 780 void *from, int length, int transhdrlen,
787 * 781 unsigned int flags)
788 * Not only UDP, other transport protocols - e.g. raw sockets - can use
789 * this interface potentially.
790 *
791 * LATER: length must be adjusted by pad at tail, when it is required.
792 */
793int ip_append_data(struct sock *sk,
794 int getfrag(void *from, char *to, int offset, int len,
795 int odd, struct sk_buff *skb),
796 void *from, int length, int transhdrlen,
797 struct ipcm_cookie *ipc, struct rtable **rtp,
798 unsigned int flags)
799{ 782{
800 struct inet_sock *inet = inet_sk(sk); 783 struct inet_sock *inet = inet_sk(sk);
801 struct sk_buff *skb; 784 struct sk_buff *skb;
802 785
803 struct ip_options *opt = NULL; 786 struct ip_options *opt = cork->opt;
804 int hh_len; 787 int hh_len;
805 int exthdrlen; 788 int exthdrlen;
806 int mtu; 789 int mtu;
@@ -809,58 +792,19 @@ int ip_append_data(struct sock *sk,
809 int offset = 0; 792 int offset = 0;
810 unsigned int maxfraglen, fragheaderlen; 793 unsigned int maxfraglen, fragheaderlen;
811 int csummode = CHECKSUM_NONE; 794 int csummode = CHECKSUM_NONE;
812 struct rtable *rt; 795 struct rtable *rt = (struct rtable *)cork->dst;
813 796
814 if (flags&MSG_PROBE) 797 exthdrlen = transhdrlen ? rt->dst.header_len : 0;
815 return 0; 798 length += exthdrlen;
816 799 transhdrlen += exthdrlen;
817 if (skb_queue_empty(&sk->sk_write_queue)) { 800 mtu = cork->fragsize;
818 /*
819 * setup for corking.
820 */
821 opt = ipc->opt;
822 if (opt) {
823 if (inet->cork.opt == NULL) {
824 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
825 if (unlikely(inet->cork.opt == NULL))
826 return -ENOBUFS;
827 }
828 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
829 inet->cork.flags |= IPCORK_OPT;
830 inet->cork.addr = ipc->addr;
831 }
832 rt = *rtp;
833 if (unlikely(!rt))
834 return -EFAULT;
835 /*
836 * We steal reference to this route, caller should not release it
837 */
838 *rtp = NULL;
839 inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
840 rt->dst.dev->mtu :
841 dst_mtu(rt->dst.path);
842 inet->cork.dst = &rt->dst;
843 inet->cork.length = 0;
844 sk->sk_sndmsg_page = NULL;
845 sk->sk_sndmsg_off = 0;
846 exthdrlen = rt->dst.header_len;
847 length += exthdrlen;
848 transhdrlen += exthdrlen;
849 } else {
850 rt = (struct rtable *)inet->cork.dst;
851 if (inet->cork.flags & IPCORK_OPT)
852 opt = inet->cork.opt;
853 801
854 transhdrlen = 0;
855 exthdrlen = 0;
856 mtu = inet->cork.fragsize;
857 }
858 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 802 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
859 803
860 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); 804 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
861 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; 805 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
862 806
863 if (inet->cork.length + length > 0xFFFF - fragheaderlen) { 807 if (cork->length + length > 0xFFFF - fragheaderlen) {
864 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, 808 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
865 mtu-exthdrlen); 809 mtu-exthdrlen);
866 return -EMSGSIZE; 810 return -EMSGSIZE;
@@ -876,15 +820,15 @@ int ip_append_data(struct sock *sk,
876 !exthdrlen) 820 !exthdrlen)
877 csummode = CHECKSUM_PARTIAL; 821 csummode = CHECKSUM_PARTIAL;
878 822
879 skb = skb_peek_tail(&sk->sk_write_queue); 823 skb = skb_peek_tail(queue);
880 824
881 inet->cork.length += length; 825 cork->length += length;
882 if (((length > mtu) || (skb && skb_is_gso(skb))) && 826 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
883 (sk->sk_protocol == IPPROTO_UDP) && 827 (sk->sk_protocol == IPPROTO_UDP) &&
884 (rt->dst.dev->features & NETIF_F_UFO)) { 828 (rt->dst.dev->features & NETIF_F_UFO)) {
885 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, 829 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
886 fragheaderlen, transhdrlen, mtu, 830 hh_len, fragheaderlen, transhdrlen,
887 flags); 831 mtu, flags);
888 if (err) 832 if (err)
889 goto error; 833 goto error;
890 return 0; 834 return 0;
@@ -961,7 +905,7 @@ alloc_new_skb:
961 else 905 else
962 /* only the initial fragment is 906 /* only the initial fragment is
963 time stamped */ 907 time stamped */
964 ipc->tx_flags = 0; 908 cork->tx_flags = 0;
965 } 909 }
966 if (skb == NULL) 910 if (skb == NULL)
967 goto error; 911 goto error;
@@ -972,7 +916,7 @@ alloc_new_skb:
972 skb->ip_summed = csummode; 916 skb->ip_summed = csummode;
973 skb->csum = 0; 917 skb->csum = 0;
974 skb_reserve(skb, hh_len); 918 skb_reserve(skb, hh_len);
975 skb_shinfo(skb)->tx_flags = ipc->tx_flags; 919 skb_shinfo(skb)->tx_flags = cork->tx_flags;
976 920
977 /* 921 /*
978 * Find where to start putting bytes. 922 * Find where to start putting bytes.
@@ -1009,7 +953,7 @@ alloc_new_skb:
1009 /* 953 /*
1010 * Put the packet on the pending queue. 954 * Put the packet on the pending queue.
1011 */ 955 */
1012 __skb_queue_tail(&sk->sk_write_queue, skb); 956 __skb_queue_tail(queue, skb);
1013 continue; 957 continue;
1014 } 958 }
1015 959
@@ -1029,8 +973,8 @@ alloc_new_skb:
1029 } else { 973 } else {
1030 int i = skb_shinfo(skb)->nr_frags; 974 int i = skb_shinfo(skb)->nr_frags;
1031 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 975 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1032 struct page *page = sk->sk_sndmsg_page; 976 struct page *page = cork->page;
1033 int off = sk->sk_sndmsg_off; 977 int off = cork->off;
1034 unsigned int left; 978 unsigned int left;
1035 979
1036 if (page && (left = PAGE_SIZE - off) > 0) { 980 if (page && (left = PAGE_SIZE - off) > 0) {
@@ -1042,7 +986,7 @@ alloc_new_skb:
1042 goto error; 986 goto error;
1043 } 987 }
1044 get_page(page); 988 get_page(page);
1045 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); 989 skb_fill_page_desc(skb, i, page, off, 0);
1046 frag = &skb_shinfo(skb)->frags[i]; 990 frag = &skb_shinfo(skb)->frags[i];
1047 } 991 }
1048 } else if (i < MAX_SKB_FRAGS) { 992 } else if (i < MAX_SKB_FRAGS) {
@@ -1053,8 +997,8 @@ alloc_new_skb:
1053 err = -ENOMEM; 997 err = -ENOMEM;
1054 goto error; 998 goto error;
1055 } 999 }
1056 sk->sk_sndmsg_page = page; 1000 cork->page = page;
1057 sk->sk_sndmsg_off = 0; 1001 cork->off = 0;
1058 1002
1059 skb_fill_page_desc(skb, i, page, 0, 0); 1003 skb_fill_page_desc(skb, i, page, 0, 0);
1060 frag = &skb_shinfo(skb)->frags[i]; 1004 frag = &skb_shinfo(skb)->frags[i];
@@ -1066,7 +1010,7 @@ alloc_new_skb:
1066 err = -EFAULT; 1010 err = -EFAULT;
1067 goto error; 1011 goto error;
1068 } 1012 }
1069 sk->sk_sndmsg_off += copy; 1013 cork->off += copy;
1070 frag->size += copy; 1014 frag->size += copy;
1071 skb->len += copy; 1015 skb->len += copy;
1072 skb->data_len += copy; 1016 skb->data_len += copy;
@@ -1080,11 +1024,87 @@ alloc_new_skb:
1080 return 0; 1024 return 0;
1081 1025
1082error: 1026error:
1083 inet->cork.length -= length; 1027 cork->length -= length;
1084 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1028 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1085 return err; 1029 return err;
1086} 1030}
1087 1031
1032static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1033 struct ipcm_cookie *ipc, struct rtable **rtp)
1034{
1035 struct inet_sock *inet = inet_sk(sk);
1036 struct ip_options *opt;
1037 struct rtable *rt;
1038
1039 /*
1040 * setup for corking.
1041 */
1042 opt = ipc->opt;
1043 if (opt) {
1044 if (cork->opt == NULL) {
1045 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1046 sk->sk_allocation);
1047 if (unlikely(cork->opt == NULL))
1048 return -ENOBUFS;
1049 }
1050 memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1051 cork->flags |= IPCORK_OPT;
1052 cork->addr = ipc->addr;
1053 }
1054 rt = *rtp;
1055 if (unlikely(!rt))
1056 return -EFAULT;
1057 /*
1058 * We steal reference to this route, caller should not release it
1059 */
1060 *rtp = NULL;
1061 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1062 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1063 cork->dst = &rt->dst;
1064 cork->length = 0;
1065 cork->tx_flags = ipc->tx_flags;
1066 cork->page = NULL;
1067 cork->off = 0;
1068
1069 return 0;
1070}
1071
1072/*
1073 * ip_append_data() and ip_append_page() can make one large IP datagram
 1074 * from many pieces of data. Each piece will be held on the socket
1075 * until ip_push_pending_frames() is called. Each piece can be a page
1076 * or non-page data.
1077 *
1078 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1079 * this interface potentially.
1080 *
1081 * LATER: length must be adjusted by pad at tail, when it is required.
1082 */
1083int ip_append_data(struct sock *sk,
1084 int getfrag(void *from, char *to, int offset, int len,
1085 int odd, struct sk_buff *skb),
1086 void *from, int length, int transhdrlen,
1087 struct ipcm_cookie *ipc, struct rtable **rtp,
1088 unsigned int flags)
1089{
1090 struct inet_sock *inet = inet_sk(sk);
1091 int err;
1092
1093 if (flags&MSG_PROBE)
1094 return 0;
1095
1096 if (skb_queue_empty(&sk->sk_write_queue)) {
1097 err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1098 if (err)
1099 return err;
1100 } else {
1101 transhdrlen = 0;
1102 }
1103
1104 return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1105 from, length, transhdrlen, flags);
1106}
1107
1088ssize_t ip_append_page(struct sock *sk, struct page *page, 1108ssize_t ip_append_page(struct sock *sk, struct page *page,
1089 int offset, size_t size, int flags) 1109 int offset, size_t size, int flags)
1090{ 1110{
@@ -1228,40 +1248,41 @@ error:
1228 return err; 1248 return err;
1229} 1249}
1230 1250
1231static void ip_cork_release(struct inet_sock *inet) 1251static void ip_cork_release(struct inet_cork *cork)
1232{ 1252{
1233 inet->cork.flags &= ~IPCORK_OPT; 1253 cork->flags &= ~IPCORK_OPT;
1234 kfree(inet->cork.opt); 1254 kfree(cork->opt);
1235 inet->cork.opt = NULL; 1255 cork->opt = NULL;
1236 dst_release(inet->cork.dst); 1256 dst_release(cork->dst);
1237 inet->cork.dst = NULL; 1257 cork->dst = NULL;
1238} 1258}
1239 1259
1240/* 1260/*
 1241 * Combine all pending IP fragments on the socket as one IP datagram 1261 * Combine all pending IP fragments on the socket as one IP datagram
1242 * and push them out. 1262 * and push them out.
1243 */ 1263 */
1244int ip_push_pending_frames(struct sock *sk) 1264struct sk_buff *__ip_make_skb(struct sock *sk,
1265 struct sk_buff_head *queue,
1266 struct inet_cork *cork)
1245{ 1267{
1246 struct sk_buff *skb, *tmp_skb; 1268 struct sk_buff *skb, *tmp_skb;
1247 struct sk_buff **tail_skb; 1269 struct sk_buff **tail_skb;
1248 struct inet_sock *inet = inet_sk(sk); 1270 struct inet_sock *inet = inet_sk(sk);
1249 struct net *net = sock_net(sk); 1271 struct net *net = sock_net(sk);
1250 struct ip_options *opt = NULL; 1272 struct ip_options *opt = NULL;
1251 struct rtable *rt = (struct rtable *)inet->cork.dst; 1273 struct rtable *rt = (struct rtable *)cork->dst;
1252 struct iphdr *iph; 1274 struct iphdr *iph;
1253 __be16 df = 0; 1275 __be16 df = 0;
1254 __u8 ttl; 1276 __u8 ttl;
1255 int err = 0;
1256 1277
1257 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) 1278 if ((skb = __skb_dequeue(queue)) == NULL)
1258 goto out; 1279 goto out;
1259 tail_skb = &(skb_shinfo(skb)->frag_list); 1280 tail_skb = &(skb_shinfo(skb)->frag_list);
1260 1281
1261 /* move skb->data to ip header from ext header */ 1282 /* move skb->data to ip header from ext header */
1262 if (skb->data < skb_network_header(skb)) 1283 if (skb->data < skb_network_header(skb))
1263 __skb_pull(skb, skb_network_offset(skb)); 1284 __skb_pull(skb, skb_network_offset(skb));
1264 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { 1285 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1265 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1286 __skb_pull(tmp_skb, skb_network_header_len(skb));
1266 *tail_skb = tmp_skb; 1287 *tail_skb = tmp_skb;
1267 tail_skb = &(tmp_skb->next); 1288 tail_skb = &(tmp_skb->next);
@@ -1287,8 +1308,8 @@ int ip_push_pending_frames(struct sock *sk)
1287 ip_dont_fragment(sk, &rt->dst))) 1308 ip_dont_fragment(sk, &rt->dst)))
1288 df = htons(IP_DF); 1309 df = htons(IP_DF);
1289 1310
1290 if (inet->cork.flags & IPCORK_OPT) 1311 if (cork->flags & IPCORK_OPT)
1291 opt = inet->cork.opt; 1312 opt = cork->opt;
1292 1313
1293 if (rt->rt_type == RTN_MULTICAST) 1314 if (rt->rt_type == RTN_MULTICAST)
1294 ttl = inet->mc_ttl; 1315 ttl = inet->mc_ttl;
@@ -1300,7 +1321,7 @@ int ip_push_pending_frames(struct sock *sk)
1300 iph->ihl = 5; 1321 iph->ihl = 5;
1301 if (opt) { 1322 if (opt) {
1302 iph->ihl += opt->optlen>>2; 1323 iph->ihl += opt->optlen>>2;
1303 ip_options_build(skb, opt, inet->cork.addr, rt, 0); 1324 ip_options_build(skb, opt, cork->addr, rt, 0);
1304 } 1325 }
1305 iph->tos = inet->tos; 1326 iph->tos = inet->tos;
1306 iph->frag_off = df; 1327 iph->frag_off = df;
@@ -1316,44 +1337,95 @@ int ip_push_pending_frames(struct sock *sk)
1316 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec 1337 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1317 * on dst refcount 1338 * on dst refcount
1318 */ 1339 */
1319 inet->cork.dst = NULL; 1340 cork->dst = NULL;
1320 skb_dst_set(skb, &rt->dst); 1341 skb_dst_set(skb, &rt->dst);
1321 1342
1322 if (iph->protocol == IPPROTO_ICMP) 1343 if (iph->protocol == IPPROTO_ICMP)
1323 icmp_out_count(net, ((struct icmphdr *) 1344 icmp_out_count(net, ((struct icmphdr *)
1324 skb_transport_header(skb))->type); 1345 skb_transport_header(skb))->type);
1325 1346
1326 /* Netfilter gets whole the not fragmented skb. */ 1347 ip_cork_release(cork);
1348out:
1349 return skb;
1350}
1351
1352int ip_send_skb(struct sk_buff *skb)
1353{
1354 struct net *net = sock_net(skb->sk);
1355 int err;
1356
1327 err = ip_local_out(skb); 1357 err = ip_local_out(skb);
1328 if (err) { 1358 if (err) {
1329 if (err > 0) 1359 if (err > 0)
1330 err = net_xmit_errno(err); 1360 err = net_xmit_errno(err);
1331 if (err) 1361 if (err)
1332 goto error; 1362 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1333 } 1363 }
1334 1364
1335out:
1336 ip_cork_release(inet);
1337 return err; 1365 return err;
1366}
1338 1367
1339error: 1368int ip_push_pending_frames(struct sock *sk)
1340 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); 1369{
1341 goto out; 1370 struct sk_buff *skb;
1371
1372 skb = ip_finish_skb(sk);
1373 if (!skb)
1374 return 0;
1375
 1376 /* Netfilter gets the whole, unfragmented skb. */
1377 return ip_send_skb(skb);
1342} 1378}
1343 1379
1344/* 1380/*
1345 * Throw away all pending data on the socket. 1381 * Throw away all pending data on the socket.
1346 */ 1382 */
1347void ip_flush_pending_frames(struct sock *sk) 1383static void __ip_flush_pending_frames(struct sock *sk,
1384 struct sk_buff_head *queue,
1385 struct inet_cork *cork)
1348{ 1386{
1349 struct sk_buff *skb; 1387 struct sk_buff *skb;
1350 1388
1351 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) 1389 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1352 kfree_skb(skb); 1390 kfree_skb(skb);
1353 1391
1354 ip_cork_release(inet_sk(sk)); 1392 ip_cork_release(cork);
1393}
1394
1395void ip_flush_pending_frames(struct sock *sk)
1396{
1397 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1355} 1398}
1356 1399
1400struct sk_buff *ip_make_skb(struct sock *sk,
1401 int getfrag(void *from, char *to, int offset,
1402 int len, int odd, struct sk_buff *skb),
1403 void *from, int length, int transhdrlen,
1404 struct ipcm_cookie *ipc, struct rtable **rtp,
1405 unsigned int flags)
1406{
1407 struct inet_cork cork = {};
1408 struct sk_buff_head queue;
1409 int err;
1410
1411 if (flags & MSG_PROBE)
1412 return NULL;
1413
1414 __skb_queue_head_init(&queue);
1415
1416 err = ip_setup_cork(sk, &cork, ipc, rtp);
1417 if (err)
1418 return ERR_PTR(err);
1419
1420 err = __ip_append_data(sk, &queue, &cork, getfrag,
1421 from, length, transhdrlen, flags);
1422 if (err) {
1423 __ip_flush_pending_frames(sk, &queue, &cork);
1424 return ERR_PTR(err);
1425 }
1426
1427 return __ip_make_skb(sk, &queue, &cork);
1428}
1357 1429
1358/* 1430/*
1359 * Fetch data from kernel space and fill in checksum if needed. 1431 * Fetch data from kernel space and fill in checksum if needed.
@@ -1402,16 +1474,19 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1402 } 1474 }
1403 1475
1404 { 1476 {
1405 struct flowi fl = { .oif = arg->bound_dev_if, 1477 struct flowi4 fl4 = {
1406 .fl4_dst = daddr, 1478 .flowi4_oif = arg->bound_dev_if,
1407 .fl4_src = rt->rt_spec_dst, 1479 .daddr = daddr,
1408 .fl4_tos = RT_TOS(ip_hdr(skb)->tos), 1480 .saddr = rt->rt_spec_dst,
1409 .fl_ip_sport = tcp_hdr(skb)->dest, 1481 .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
1410 .fl_ip_dport = tcp_hdr(skb)->source, 1482 .fl4_sport = tcp_hdr(skb)->dest,
1411 .proto = sk->sk_protocol, 1483 .fl4_dport = tcp_hdr(skb)->source,
1412 .flags = ip_reply_arg_flowi_flags(arg) }; 1484 .flowi4_proto = sk->sk_protocol,
1413 security_skb_classify_flow(skb, &fl); 1485 .flowi4_flags = ip_reply_arg_flowi_flags(arg),
1414 if (ip_route_output_key(sock_net(sk), &rt, &fl)) 1486 };
1487 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1488 rt = ip_route_output_key(sock_net(sk), &fl4);
1489 if (IS_ERR(rt))
1415 return; 1490 return;
1416 } 1491 }
1417 1492
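
The net effect of the ip_output.c refactor above: ip_append_data() is now a thin wrapper around ip_setup_cork() plus __ip_append_data(), and ip_push_pending_frames() around __ip_make_skb() plus ip_send_skb() — every helper takes an explicit queue and cork instead of reaching into sk->sk_write_queue. That is what makes the new ip_make_skb() possible: it assembles a complete datagram on a stack-local queue and cork, with no socket-wide pending state. A caller's-eye sketch of the two resulting paths (pseudo-C, error handling elided; not the exact UDP code):

	/* classic corked path: pending state lives on the socket */
	err = ip_append_data(sk, getfrag, data, len, transhdrlen,
			     &ipc, &rt, flags);
	if (!err)
		err = ip_push_pending_frames(sk);	/* make skb + send */

	/* lockless path enabled by the refactor: state lives on the stack */
	skb = ip_make_skb(sk, getfrag, data, len, transhdrlen,
			  &ipc, &rt, flags);
	if (!IS_ERR_OR_NULL(skb))			/* NULL on MSG_PROBE */
		err = ip_send_skb(skb);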
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 2b097752426b..cbff2ecccf3d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1444,7 +1444,7 @@ static int __init ip_auto_config(void)
1444 root_server_addr = addr; 1444 root_server_addr = addr;
1445 1445
1446 /* 1446 /*
1447 * Use defaults whereever applicable. 1447 * Use defaults wherever applicable.
1448 */ 1448 */
1449 if (ic_defaults() < 0) 1449 if (ic_defaults() < 0)
1450 return -1; 1450 return -1;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 988f52fba54a..bfc17c5914e7 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -460,19 +460,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
460 goto tx_error_icmp; 460 goto tx_error_icmp;
461 } 461 }
462 462
463 { 463 rt = ip_route_output_ports(dev_net(dev), NULL,
464 struct flowi fl = { 464 dst, tiph->saddr,
465 .oif = tunnel->parms.link, 465 0, 0,
466 .fl4_dst = dst, 466 IPPROTO_IPIP, RT_TOS(tos),
467 .fl4_src= tiph->saddr, 467 tunnel->parms.link);
468 .fl4_tos = RT_TOS(tos), 468 if (IS_ERR(rt)) {
469 .proto = IPPROTO_IPIP 469 dev->stats.tx_carrier_errors++;
470 }; 470 goto tx_error_icmp;
471
472 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
473 dev->stats.tx_carrier_errors++;
474 goto tx_error_icmp;
475 }
476 } 471 }
477 tdev = rt->dst.dev; 472 tdev = rt->dst.dev;
478 473
@@ -583,16 +578,14 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
583 iph = &tunnel->parms.iph; 578 iph = &tunnel->parms.iph;
584 579
585 if (iph->daddr) { 580 if (iph->daddr) {
586 struct flowi fl = { 581 struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL,
587 .oif = tunnel->parms.link, 582 iph->daddr, iph->saddr,
588 .fl4_dst = iph->daddr, 583 0, 0,
589 .fl4_src = iph->saddr, 584 IPPROTO_IPIP,
590 .fl4_tos = RT_TOS(iph->tos), 585 RT_TOS(iph->tos),
591 .proto = IPPROTO_IPIP 586 tunnel->parms.link);
592 }; 587
593 struct rtable *rt; 588 if (!IS_ERR(rt)) {
594
595 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
596 tdev = rt->dst.dev; 589 tdev = rt->dst.dev;
597 ip_rt_put(rt); 590 ip_rt_put(rt);
598 } 591 }
@@ -913,4 +906,4 @@ static void __exit ipip_fini(void)
913module_init(ipip_init); 906module_init(ipip_init);
914module_exit(ipip_fini); 907module_exit(ipip_fini);
915MODULE_LICENSE("GPL"); 908MODULE_LICENSE("GPL");
916MODULE_ALIAS("tunl0"); 909MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8b65a12654e7..1f62eaeb6de4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -148,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
148 return NULL; 148 return NULL;
149} 149}
150 150
151static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 151static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
152 struct mr_table **mrt) 152 struct mr_table **mrt)
153{ 153{
154 struct ipmr_result res; 154 struct ipmr_result res;
155 struct fib_lookup_arg arg = { .result = &res, }; 155 struct fib_lookup_arg arg = { .result = &res, };
156 int err; 156 int err;
157 157
158 err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); 158 err = fib_rules_lookup(net->ipv4.mr_rules_ops,
159 flowi4_to_flowi(flp4), 0, &arg);
159 if (err < 0) 160 if (err < 0)
160 return err; 161 return err;
161 *mrt = res.mrt; 162 *mrt = res.mrt;
@@ -283,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
283 return net->ipv4.mrt; 284 return net->ipv4.mrt;
284} 285}
285 286
286static int ipmr_fib_lookup(struct net *net, struct flowi *flp, 287static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
287 struct mr_table **mrt) 288 struct mr_table **mrt)
288{ 289{
289 *mrt = net->ipv4.mrt; 290 *mrt = net->ipv4.mrt;
@@ -435,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
435{ 436{
436 struct net *net = dev_net(dev); 437 struct net *net = dev_net(dev);
437 struct mr_table *mrt; 438 struct mr_table *mrt;
438 struct flowi fl = { 439 struct flowi4 fl4 = {
439 .oif = dev->ifindex, 440 .flowi4_oif = dev->ifindex,
440 .iif = skb->skb_iif, 441 .flowi4_iif = skb->skb_iif,
441 .mark = skb->mark, 442 .flowi4_mark = skb->mark,
442 }; 443 };
443 int err; 444 int err;
444 445
445 err = ipmr_fib_lookup(net, &fl, &mrt); 446 err = ipmr_fib_lookup(net, &fl4, &mrt);
446 if (err < 0) { 447 if (err < 0) {
447 kfree_skb(skb); 448 kfree_skb(skb);
448 return err; 449 return err;
@@ -1611,26 +1612,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1611#endif 1612#endif
1612 1613
1613 if (vif->flags & VIFF_TUNNEL) { 1614 if (vif->flags & VIFF_TUNNEL) {
1614 struct flowi fl = { 1615 rt = ip_route_output_ports(net, NULL,
1615 .oif = vif->link, 1616 vif->remote, vif->local,
1616 .fl4_dst = vif->remote, 1617 0, 0,
1617 .fl4_src = vif->local, 1618 IPPROTO_IPIP,
1618 .fl4_tos = RT_TOS(iph->tos), 1619 RT_TOS(iph->tos), vif->link);
1619 .proto = IPPROTO_IPIP 1620 if (IS_ERR(rt))
1620 };
1621
1622 if (ip_route_output_key(net, &rt, &fl))
1623 goto out_free; 1621 goto out_free;
1624 encap = sizeof(struct iphdr); 1622 encap = sizeof(struct iphdr);
1625 } else { 1623 } else {
1626 struct flowi fl = { 1624 rt = ip_route_output_ports(net, NULL, iph->daddr, 0,
1627 .oif = vif->link, 1625 0, 0,
1628 .fl4_dst = iph->daddr, 1626 IPPROTO_IPIP,
1629 .fl4_tos = RT_TOS(iph->tos), 1627 RT_TOS(iph->tos), vif->link);
1630 .proto = IPPROTO_IPIP 1628 if (IS_ERR(rt))
1631 };
1632
1633 if (ip_route_output_key(net, &rt, &fl))
1634 goto out_free; 1629 goto out_free;
1635 } 1630 }
1636 1631
@@ -1793,6 +1788,24 @@ dont_forward:
1793 return 0; 1788 return 0;
1794} 1789}
1795 1790
1791static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct rtable *rt)
1792{
1793 struct flowi4 fl4 = {
1794 .daddr = rt->rt_key_dst,
1795 .saddr = rt->rt_key_src,
1796 .flowi4_tos = rt->rt_tos,
1797 .flowi4_oif = rt->rt_oif,
1798 .flowi4_iif = rt->rt_iif,
1799 .flowi4_mark = rt->rt_mark,
1800 };
1801 struct mr_table *mrt;
1802 int err;
1803
1804 err = ipmr_fib_lookup(net, &fl4, &mrt);
1805 if (err)
1806 return ERR_PTR(err);
1807 return mrt;
1808}
1796 1809
1797/* 1810/*
1798 * Multicast packets for forwarding arrive here 1811 * Multicast packets for forwarding arrive here
@@ -1805,7 +1818,6 @@ int ip_mr_input(struct sk_buff *skb)
1805 struct net *net = dev_net(skb->dev); 1818 struct net *net = dev_net(skb->dev);
1806 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; 1819 int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1807 struct mr_table *mrt; 1820 struct mr_table *mrt;
1808 int err;
1809 1821
1810 /* Packet is looped back after forward, it should not be 1822 /* Packet is looped back after forward, it should not be
1811 * forwarded second time, but still can be delivered locally. 1823 * forwarded second time, but still can be delivered locally.
@@ -1813,12 +1825,11 @@ int ip_mr_input(struct sk_buff *skb)
1813 if (IPCB(skb)->flags & IPSKB_FORWARDED) 1825 if (IPCB(skb)->flags & IPSKB_FORWARDED)
1814 goto dont_forward; 1826 goto dont_forward;
1815 1827
1816 err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); 1828 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1817 if (err < 0) { 1829 if (IS_ERR(mrt)) {
1818 kfree_skb(skb); 1830 kfree_skb(skb);
1819 return err; 1831 return PTR_ERR(mrt);
1820 } 1832 }
1821
1822 if (!local) { 1833 if (!local) {
1823 if (IPCB(skb)->opt.router_alert) { 1834 if (IPCB(skb)->opt.router_alert) {
1824 if (ip_call_ra_chain(skb)) 1835 if (ip_call_ra_chain(skb))
@@ -1946,9 +1957,9 @@ int pim_rcv_v1(struct sk_buff *skb)
1946 1957
1947 pim = igmp_hdr(skb); 1958 pim = igmp_hdr(skb);
1948 1959
1949 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1960 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1961 if (IS_ERR(mrt))
1950 goto drop; 1962 goto drop;
1951
1952 if (!mrt->mroute_do_pim || 1963 if (!mrt->mroute_do_pim ||
1953 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 1964 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1954 goto drop; 1965 goto drop;
@@ -1978,9 +1989,9 @@ static int pim_rcv(struct sk_buff *skb)
1978 csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 1989 csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1979 goto drop; 1990 goto drop;
1980 1991
1981 if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) 1992 mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb));
1993 if (IS_ERR(mrt))
1982 goto drop; 1994 goto drop;
1983
1984 if (__pim_rcv(mrt, skb, sizeof(*pim))) { 1995 if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1985drop: 1996drop:
1986 kfree_skb(skb); 1997 kfree_skb(skb);
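
The conversion running through ipmr.c above and netfilter.c below — and through most of this series — replaces the protocol-agnostic struct flowi (with its fl4_* aliases) by a dedicated IPv4 flow key, struct flowi4. Collecting just the members these hunks touch gives roughly this shape (assembled from usage in this diff; the real definition in include/net/flow.h has more fields, and the port and GRE-key members share a union):

struct flowi4 {
	int	flowi4_oif;	/* bound output device, 0 = any */
	int	flowi4_iif;	/* input device */
	__u32	flowi4_mark;	/* skb/socket mark */
	__u8	flowi4_tos;
	__u8	flowi4_proto;	/* IPPROTO_* */
	__u8	flowi4_flags;
	__be32	daddr;
	__be32	saddr;
	__be16	fl4_sport;	/* transport ports ... */
	__be16	fl4_dport;
	__be32	fl4_gre_key;	/* ... unioned with the GRE key */
};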
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 994a1f29ebbc..4614babdc45f 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16 struct net *net = dev_net(skb_dst(skb)->dev); 16 struct net *net = dev_net(skb_dst(skb)->dev);
17 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
18 struct rtable *rt; 18 struct rtable *rt;
19 struct flowi fl = {}; 19 struct flowi4 fl4 = {};
20 unsigned long orefdst; 20 unsigned long orefdst;
21 unsigned int hh_len; 21 unsigned int hh_len;
22 unsigned int type; 22 unsigned int type;
@@ -31,14 +31,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. 31 * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
32 */ 32 */
33 if (addr_type == RTN_LOCAL) { 33 if (addr_type == RTN_LOCAL) {
34 fl.fl4_dst = iph->daddr; 34 fl4.daddr = iph->daddr;
35 if (type == RTN_LOCAL) 35 if (type == RTN_LOCAL)
36 fl.fl4_src = iph->saddr; 36 fl4.saddr = iph->saddr;
37 fl.fl4_tos = RT_TOS(iph->tos); 37 fl4.flowi4_tos = RT_TOS(iph->tos);
38 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 38 fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
39 fl.mark = skb->mark; 39 fl4.flowi4_mark = skb->mark;
40 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; 40 fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
41 if (ip_route_output_key(net, &rt, &fl) != 0) 41 rt = ip_route_output_key(net, &fl4);
42 if (IS_ERR(rt))
42 return -1; 43 return -1;
43 44
44 /* Drop old route. */ 45 /* Drop old route. */
@@ -47,8 +48,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
47 } else { 48 } else {
48 /* non-local src, find valid iif to satisfy 49 /* non-local src, find valid iif to satisfy
49 * rp-filter when calling ip_route_input. */ 50 * rp-filter when calling ip_route_input. */
50 fl.fl4_dst = iph->saddr; 51 fl4.daddr = iph->saddr;
51 if (ip_route_output_key(net, &rt, &fl) != 0) 52 rt = ip_route_output_key(net, &fl4);
53 if (IS_ERR(rt))
52 return -1; 54 return -1;
53 55
54 orefdst = skb->_skb_refdst; 56 orefdst = skb->_skb_refdst;
@@ -66,10 +68,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
66 68
67#ifdef CONFIG_XFRM 69#ifdef CONFIG_XFRM
68 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 70 if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
69 xfrm_decode_session(skb, &fl, AF_INET) == 0) { 71 xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
70 struct dst_entry *dst = skb_dst(skb); 72 struct dst_entry *dst = skb_dst(skb);
71 skb_dst_set(skb, NULL); 73 skb_dst_set(skb, NULL);
72 if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) 74 dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
75 if (IS_ERR(dst))
73 return -1; 76 return -1;
74 skb_dst_set(skb, dst); 77 skb_dst_set(skb, dst);
75 } 78 }
@@ -102,7 +105,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb)
102 dst = ((struct xfrm_dst *)dst)->route; 105 dst = ((struct xfrm_dst *)dst)->route;
103 dst_hold(dst); 106 dst_hold(dst);
104 107
105 if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) 108 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
109 if (IS_ERR(dst))
106 return -1; 110 return -1;
107 111
108 skb_dst_drop(skb); 112 skb_dst_drop(skb);
@@ -217,9 +221,14 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
217 return csum; 221 return csum;
218} 222}
219 223
220static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) 224static int nf_ip_route(struct net *net, struct dst_entry **dst,
225 struct flowi *fl, bool strict __always_unused)
221{ 226{
222 return ip_route_output_key(&init_net, (struct rtable **)dst, fl); 227 struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
228 if (IS_ERR(rt))
229 return PTR_ERR(rt);
230 *dst = &rt->dst;
231 return 0;
223} 232}
224 233
225static const struct nf_afinfo nf_ip_afinfo = { 234static const struct nf_afinfo nf_ip_afinfo = {
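
A second idiom recurs in these hunks: routines such as ip_route_output_key() and xfrm_lookup() now return the result pointer directly and encode failure in the pointer itself, so callers trade "int err plus output parameter" for IS_ERR()/PTR_ERR(). The kernel reserves the top 4095 values of the address space for this; a self-contained sketch of the mechanism:

#include <stdio.h>

#define MAX_ERRNO	4095
#define EHOSTUNREACH	113

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
	/* errors live in the last MAX_ERRNO addresses */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *route_lookup(int fail)
{
	static int table;
	return fail ? ERR_PTR(-EHOSTUNREACH) : &table;
}

int main(void)
{
	void *rt = route_lookup(1);

	if (IS_ERR(rt))
		printf("lookup failed: err=%ld\n", PTR_ERR(rt));  /* -113 */
	return 0;
}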
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5f..1dfc18a03fd4 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -64,16 +64,6 @@ config IP_NF_IPTABLES
64if IP_NF_IPTABLES 64if IP_NF_IPTABLES
65 65
66# The matches. 66# The matches.
67config IP_NF_MATCH_ADDRTYPE
68 tristate '"addrtype" address type match support'
69 depends on NETFILTER_ADVANCED
70 help
71 This option allows you to match what routing thinks of an address,
72 eg. UNICAST, LOCAL, BROADCAST, ...
73
74 If you want to compile it as a module, say M here and read
75 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
76
77config IP_NF_MATCH_AH 67config IP_NF_MATCH_AH
78 tristate '"ah" match support' 68 tristate '"ah" match support'
79 depends on NETFILTER_ADVANCED 69 depends on NETFILTER_ADVANCED
@@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT
206 196
207config NF_NAT_SNMP_BASIC 197config NF_NAT_SNMP_BASIC
208 tristate "Basic SNMP-ALG support" 198 tristate "Basic SNMP-ALG support"
209 depends on NF_NAT 199 depends on NF_CONNTRACK_SNMP && NF_NAT
210 depends on NETFILTER_ADVANCED 200 depends on NETFILTER_ADVANCED
201 default NF_NAT && NF_CONNTRACK_SNMP
211 ---help--- 202 ---help---
212 203
213 This module implements an Application Layer Gateway (ALG) for 204 This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 19eb59d01037..dca2082ec683 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
49 49
50# matches 50# matches
51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
54 53
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed95..89bc7e66d598 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -76,7 +76,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
76} 76}
77 77
78/* 78/*
79 * Unfortunatly, _b and _mask are not aligned to an int (or long int) 79 * Unfortunately, _b and _mask are not aligned to an int (or long int)
80 * Some arches dont care, unrolling the loop is a win on them. 80 * Some arches dont care, unrolling the loop is a win on them.
81 * For other arches, we only have a 16bit alignement. 81 * For other arches, we only have a 16bit alignement.
82 */ 82 */
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
867 newinfo->initial_entries = 0; 867 newinfo->initial_entries = 0;
868 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 868 loc_cpu_entry = info->entries[raw_smp_processor_id()];
869 xt_compat_init_offsets(NFPROTO_ARP, info->number);
869 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 870 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
870 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 871 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
871 if (ret != 0) 872 if (ret != 0)
@@ -1065,6 +1066,7 @@ static int do_replace(struct net *net, const void __user *user,
1065 /* overflow check */ 1066 /* overflow check */
1066 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1067 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1067 return -ENOMEM; 1068 return -ENOMEM;
1069 tmp.name[sizeof(tmp.name)-1] = 0;
1068 1070
1069 newinfo = xt_alloc_table_info(tmp.size); 1071 newinfo = xt_alloc_table_info(tmp.size);
1070 if (!newinfo) 1072 if (!newinfo)
@@ -1333,6 +1335,7 @@ static int translate_compat_table(const char *name,
1333 duprintf("translate_compat_table: size %u\n", info->size); 1335 duprintf("translate_compat_table: size %u\n", info->size);
1334 j = 0; 1336 j = 0;
1335 xt_compat_lock(NFPROTO_ARP); 1337 xt_compat_lock(NFPROTO_ARP);
1338 xt_compat_init_offsets(NFPROTO_ARP, number);
1336 /* Walk through entries, checking offsets. */ 1339 /* Walk through entries, checking offsets. */
1337 xt_entry_foreach(iter0, entry0, total_size) { 1340 xt_entry_foreach(iter0, entry0, total_size) {
1338 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1341 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1486,6 +1489,7 @@ static int compat_do_replace(struct net *net, void __user *user,
1486 return -ENOMEM; 1489 return -ENOMEM;
1487 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1490 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1488 return -ENOMEM; 1491 return -ENOMEM;
1492 tmp.name[sizeof(tmp.name)-1] = 0;
1489 1493
1490 newinfo = xt_alloc_table_info(tmp.size); 1494 newinfo = xt_alloc_table_info(tmp.size);
1491 if (!newinfo) 1495 if (!newinfo)
@@ -1738,6 +1742,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1738 ret = -EFAULT; 1742 ret = -EFAULT;
1739 break; 1743 break;
1740 } 1744 }
1745 rev.name[sizeof(rev.name)-1] = 0;
1741 1746
1742 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, 1747 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
1743 rev.revision, 1, &ret), 1748 rev.revision, 1, &ret),
@@ -1869,7 +1874,7 @@ static int __init arp_tables_init(void)
1869 if (ret < 0) 1874 if (ret < 0)
1870 goto err1; 1875 goto err1;
1871 1876
1872 /* Noone else will be downing sem now, so we won't sleep */ 1877 /* No one else will be downing sem now, so we won't sleep */
1873 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); 1878 ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
1874 if (ret < 0) 1879 if (ret < 0)
1875 goto err2; 1880 goto err2;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013dc..704915028009 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -387,7 +387,7 @@ ipt_do_table(struct sk_buff *skb,
387 verdict = (unsigned)(-v) - 1; 387 verdict = (unsigned)(-v) - 1;
388 break; 388 break;
389 } 389 }
390 if (*stackptr == 0) { 390 if (*stackptr <= origptr) {
391 e = get_entry(table_base, 391 e = get_entry(table_base,
392 private->underflow[hook]); 392 private->underflow[hook]);
393 pr_debug("Underflow (this is normal) " 393 pr_debug("Underflow (this is normal) "
@@ -427,10 +427,10 @@ ipt_do_table(struct sk_buff *skb,
427 /* Verdict */ 427 /* Verdict */
428 break; 428 break;
429 } while (!acpar.hotdrop); 429 } while (!acpar.hotdrop);
430 xt_info_rdunlock_bh();
431 pr_debug("Exiting %s; resetting sp from %u to %u\n", 430 pr_debug("Exiting %s; resetting sp from %u to %u\n",
432 __func__, *stackptr, origptr); 431 __func__, *stackptr, origptr);
433 *stackptr = origptr; 432 *stackptr = origptr;
433 xt_info_rdunlock_bh();
434#ifdef DEBUG_ALLOW_ALL 434#ifdef DEBUG_ALLOW_ALL
435 return NF_ACCEPT; 435 return NF_ACCEPT;
436#else 436#else
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1064 newinfo->initial_entries = 0; 1064 newinfo->initial_entries = 0;
1065 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1065 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1066 xt_compat_init_offsets(AF_INET, info->number);
1066 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1067 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1067 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1068 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1068 if (ret != 0) 1069 if (ret != 0)
@@ -1261,6 +1262,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
1261 /* overflow check */ 1262 /* overflow check */
1262 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1263 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1263 return -ENOMEM; 1264 return -ENOMEM;
1265 tmp.name[sizeof(tmp.name)-1] = 0;
1264 1266
1265 newinfo = xt_alloc_table_info(tmp.size); 1267 newinfo = xt_alloc_table_info(tmp.size);
1266 if (!newinfo) 1268 if (!newinfo)
@@ -1664,6 +1666,7 @@ translate_compat_table(struct net *net,
1664 duprintf("translate_compat_table: size %u\n", info->size); 1666 duprintf("translate_compat_table: size %u\n", info->size);
1665 j = 0; 1667 j = 0;
1666 xt_compat_lock(AF_INET); 1668 xt_compat_lock(AF_INET);
1669 xt_compat_init_offsets(AF_INET, number);
1667 /* Walk through entries, checking offsets. */ 1670 /* Walk through entries, checking offsets. */
1668 xt_entry_foreach(iter0, entry0, total_size) { 1671 xt_entry_foreach(iter0, entry0, total_size) {
1669 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1672 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
@@ -1805,6 +1808,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
1805 return -ENOMEM; 1808 return -ENOMEM;
1806 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) 1809 if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
1807 return -ENOMEM; 1810 return -ENOMEM;
1811 tmp.name[sizeof(tmp.name)-1] = 0;
1808 1812
1809 newinfo = xt_alloc_table_info(tmp.size); 1813 newinfo = xt_alloc_table_info(tmp.size);
1810 if (!newinfo) 1814 if (!newinfo)
@@ -2034,6 +2038,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2034 ret = -EFAULT; 2038 ret = -EFAULT;
2035 break; 2039 break;
2036 } 2040 }
2041 rev.name[sizeof(rev.name)-1] = 0;
2037 2042
2038 if (cmd == IPT_SO_GET_REVISION_TARGET) 2043 if (cmd == IPT_SO_GET_REVISION_TARGET)
2039 target = 1; 2044 target = 1;
@@ -2228,7 +2233,7 @@ static int __init ip_tables_init(void)
2228 if (ret < 0) 2233 if (ret < 0)
2229 goto err1; 2234 goto err1;
2230 2235
2231 /* Noone else will be downing sem now, so we won't sleep */ 2236 /* No one else will be downing sem now, so we won't sleep */
2232 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); 2237 ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
2233 if (ret < 0) 2238 if (ret < 0)
2234 goto err2; 2239 goto err2;
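The recurring one-liner in this file -- tmp.name[sizeof(tmp.name)-1] = 0 -- closes a classic hole: a fixed-size name copied from userspace may arrive without a trailing NUL, so it must be terminated before anything treats it as a C string. Below is a minimal userspace sketch of the pattern; the struct and field names are hypothetical stand-ins, not kernel types.

/* Userspace model of the name-termination fix in do_replace() and
 * compat_do_replace() above: force a NUL into the last byte of a
 * fixed-size buffer that came from an untrusted source. */
#include <stdio.h>
#include <string.h>

struct table_request {
	char name[32];          /* may arrive completely full, no NUL */
	unsigned int size;
};

static void sanitize_request(struct table_request *req)
{
	/* mirrors: tmp.name[sizeof(tmp.name)-1] = 0; */
	req->name[sizeof(req->name) - 1] = '\0';
}

int main(void)
{
	struct table_request req;

	/* Simulate a request whose name fills the buffer exactly. */
	memset(req.name, 'A', sizeof(req.name));
	req.size = 0;

	sanitize_request(&req);
	printf("name is now safely bounded: %zu chars\n", strlen(req.name));
	return 0;
}

Without the fix, a later strcmp() or printk("%s") on the name could read past the buffer.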
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a4897655..d609ac3cb9a4 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
300 * that the ->target() function isn't called after ->destroy() */ 300 * that the ->target() function isn't called after ->destroy() */
301 301
302 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
303 if (ct == NULL) { 303 if (ct == NULL)
304 pr_info("no conntrack!\n");
305 /* FIXME: need to drop invalid ones, since replies
306 * to outgoing connections of other nodes will be
307 * marked as INVALID */
308 return NF_DROP; 304 return NF_DROP;
309 }
310 305
311 /* special case: ICMP error handling. conntrack distinguishes between 306 /* special case: ICMP error handling. conntrack distinguishes between
312 * error messages (RELATED) and information requests (see below) */ 307 * error messages (RELATED) and information requests (see below) */
@@ -669,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
669 char buffer[PROC_WRITELEN+1]; 664 char buffer[PROC_WRITELEN+1];
670 unsigned long nodenum; 665 unsigned long nodenum;
671 666
672 if (copy_from_user(buffer, input, PROC_WRITELEN)) 667 if (size > PROC_WRITELEN)
668 return -EIO;
669 if (copy_from_user(buffer, input, size))
673 return -EFAULT; 670 return -EFAULT;
671 buffer[size] = 0;
674 672
675 if (*buffer == '+') { 673 if (*buffer == '+') {
676 nodenum = simple_strtoul(buffer+1, NULL, 10); 674 nodenum = simple_strtoul(buffer+1, NULL, 10);
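The clusterip_proc_write() change above replaces an unconditional fixed-length copy with three steps: reject writes larger than the buffer, copy only the caller-supplied size, and NUL-terminate at exactly that size. A minimal userspace sketch follows, assuming the same PROC_WRITELEN limit and "+N" syntax; memcpy stands in for copy_from_user().

/* Sketch of the bounded proc write above; the I/O is simulated. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PROC_WRITELEN 10

static long proc_write(const char *input, size_t size)
{
	char buffer[PROC_WRITELEN + 1];
	unsigned long nodenum;

	if (size > PROC_WRITELEN)
		return -EIO;              /* was: fixed-size copy, no check */
	memcpy(buffer, input, size);      /* stands in for copy_from_user() */
	buffer[size] = '\0';              /* terminate exactly at 'size' */

	if (buffer[0] == '+') {
		nodenum = strtoul(buffer + 1, NULL, 10);
		printf("add node %lu\n", nodenum);
		return (long)size;
	}
	return -EINVAL;
}

int main(void)
{
	proc_write("+3", 2);                                  /* accepted */
	printf("oversized: %ld\n", proc_write("+123456789012", 13));
	return 0;
}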
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e9..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
442 } 442 }
443#endif 443#endif
444 444
445 /* MAC logging for input path only. */ 445 if (in != NULL)
446 if (in && !out)
447 dump_mac_header(m, loginfo, skb); 446 dump_mac_header(m, loginfo, skb);
448 447
449 dump_packet(m, loginfo, skb, 0); 448 dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
deleted file mode 100644
index db8bff0fb86d..000000000000
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * iptables module to match inet_addr_type() of an ip.
3 *
4 * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
5 * (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/skbuff.h>
15#include <linux/netdevice.h>
16#include <linux/ip.h>
17#include <net/route.h>
18
19#include <linux/netfilter_ipv4/ipt_addrtype.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24MODULE_DESCRIPTION("Xtables: address type match for IPv4");
25
26static inline bool match_type(struct net *net, const struct net_device *dev,
27 __be32 addr, u_int16_t mask)
28{
29 return !!(mask & (1 << inet_dev_addr_type(net, dev, addr)));
30}
31
32static bool
33addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
34{
35 struct net *net = dev_net(par->in ? par->in : par->out);
36 const struct ipt_addrtype_info *info = par->matchinfo;
37 const struct iphdr *iph = ip_hdr(skb);
38 bool ret = true;
39
40 if (info->source)
41 ret &= match_type(net, NULL, iph->saddr, info->source) ^
42 info->invert_source;
43 if (info->dest)
44 ret &= match_type(net, NULL, iph->daddr, info->dest) ^
45 info->invert_dest;
46
47 return ret;
48}
49
50static bool
51addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
52{
53 struct net *net = dev_net(par->in ? par->in : par->out);
54 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
55 const struct iphdr *iph = ip_hdr(skb);
56 const struct net_device *dev = NULL;
57 bool ret = true;
58
59 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
60 dev = par->in;
61 else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
62 dev = par->out;
63
64 if (info->source)
65 ret &= match_type(net, dev, iph->saddr, info->source) ^
66 (info->flags & IPT_ADDRTYPE_INVERT_SOURCE);
67 if (ret && info->dest)
68 ret &= match_type(net, dev, iph->daddr, info->dest) ^
69 !!(info->flags & IPT_ADDRTYPE_INVERT_DEST);
70 return ret;
71}
72
73static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
74{
75 struct ipt_addrtype_info_v1 *info = par->matchinfo;
76
77 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
78 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
79 pr_info("both incoming and outgoing "
80 "interface limitation cannot be selected\n");
81 return -EINVAL;
82 }
83
84 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
85 (1 << NF_INET_LOCAL_IN)) &&
86 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
87 pr_info("output interface limitation "
88 "not valid in PREROUTING and INPUT\n");
89 return -EINVAL;
90 }
91
92 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
93 (1 << NF_INET_LOCAL_OUT)) &&
94 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
95 pr_info("input interface limitation "
96 "not valid in POSTROUTING and OUTPUT\n");
97 return -EINVAL;
98 }
99
100 return 0;
101}
102
103static struct xt_match addrtype_mt_reg[] __read_mostly = {
104 {
105 .name = "addrtype",
106 .family = NFPROTO_IPV4,
107 .match = addrtype_mt_v0,
108 .matchsize = sizeof(struct ipt_addrtype_info),
109 .me = THIS_MODULE
110 },
111 {
112 .name = "addrtype",
113 .family = NFPROTO_IPV4,
114 .revision = 1,
115 .match = addrtype_mt_v1,
116 .checkentry = addrtype_mt_checkentry_v1,
117 .matchsize = sizeof(struct ipt_addrtype_info_v1),
118 .me = THIS_MODULE
119 }
120};
121
122static int __init addrtype_mt_init(void)
123{
124 return xt_register_matches(addrtype_mt_reg,
125 ARRAY_SIZE(addrtype_mt_reg));
126}
127
128static void __exit addrtype_mt_exit(void)
129{
130 xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg));
131}
132
133module_init(addrtype_mt_init);
134module_exit(addrtype_mt_exit);
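The match_type() helper in the file removed above encodes the rule as a bitmask with one bit per address type, so the whole match is a single shift-and-AND. A standalone model, with illustrative enum values rather than the kernel's RTN_* constants:

/* Model of match_type(): mask & (1 << type). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum addr_type { ADDR_UNICAST = 1, ADDR_LOCAL = 2, ADDR_BROADCAST = 3 };

static bool match_type(enum addr_type type, uint16_t mask)
{
	return !!(mask & (1u << type));   /* mirrors the kernel helper */
}

int main(void)
{
	uint16_t mask = (1u << ADDR_LOCAL) | (1u << ADDR_BROADCAST);

	printf("local matches:   %d\n", match_type(ADDR_LOCAL, mask));
	printf("unicast matches: %d\n", match_type(ADDR_UNICAST, mask));
	return 0;
}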
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
61 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
62 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN) {
64 iph = ip_hdr(skb); 64 iph = ip_hdr(skb);
65 65
66 if (iph->saddr != saddr || 66 if (iph->saddr != saddr ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26a..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
20#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
21#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
22#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h>
23 24
24struct ct_iter_state { 25struct ct_iter_state {
25 struct seq_net_private p; 26 struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
35 for (st->bucket = 0; 36 for (st->bucket = 0;
36 st->bucket < net->ct.htable_size; 37 st->bucket < net->ct.htable_size;
37 st->bucket++) { 38 st->bucket++) {
38 n = rcu_dereference(net->ct.hash[st->bucket].first); 39 n = rcu_dereference(
40 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
39 if (!is_a_nulls(n)) 41 if (!is_a_nulls(n))
40 return n; 42 return n;
41 } 43 }
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
48 struct net *net = seq_file_net(seq); 50 struct net *net = seq_file_net(seq);
49 struct ct_iter_state *st = seq->private; 51 struct ct_iter_state *st = seq->private;
50 52
51 head = rcu_dereference(head->next); 53 head = rcu_dereference(hlist_nulls_next_rcu(head));
52 while (is_a_nulls(head)) { 54 while (is_a_nulls(head)) {
53 if (likely(get_nulls_value(head) == st->bucket)) { 55 if (likely(get_nulls_value(head) == st->bucket)) {
54 if (++st->bucket >= net->ct.htable_size) 56 if (++st->bucket >= net->ct.htable_size)
55 return NULL; 57 return NULL;
56 } 58 }
57 head = rcu_dereference(net->ct.hash[st->bucket].first); 59 head = rcu_dereference(
60 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
58 } 61 }
59 return head; 62 return head;
60} 63}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
217 struct hlist_node *n; 220 struct hlist_node *n;
218 221
219 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 222 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
220 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 223 n = rcu_dereference(
224 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
221 if (n) 225 if (n)
222 return n; 226 return n;
223 } 227 }
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
230 struct net *net = seq_file_net(seq); 234 struct net *net = seq_file_net(seq);
231 struct ct_expect_iter_state *st = seq->private; 235 struct ct_expect_iter_state *st = seq->private;
232 236
233 head = rcu_dereference(head->next); 237 head = rcu_dereference(hlist_next_rcu(head));
234 while (head == NULL) { 238 while (head == NULL) {
235 if (++st->bucket >= nf_ct_expect_hsize) 239 if (++st->bucket >= nf_ct_expect_hsize)
236 return NULL; 240 return NULL;
237 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 241 head = rcu_dereference(
242 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
238 } 243 }
239 return head; 244 return head;
240} 245}
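The hunks above convert raw ->first/->next dereferences to the hlist_nulls accessors. The point of a nulls list is that the terminator is not NULL but a tagged value encoding the bucket index, so a lockless reader can tell whether it ended in the bucket it started in. A simplified single-threaded userspace sketch of the walk (no RCU):

/* Model of an hlist_nulls walk: the low pointer bit tags a terminator,
 * and the bits above it carry the bucket index. */
#include <stdint.h>
#include <stdio.h>

struct nulls_node { struct nulls_node *next; int value; };

static int is_a_nulls(const struct nulls_node *p)
{
	return (uintptr_t)p & 1;          /* low bit tags a terminator */
}

static unsigned long get_nulls_value(const struct nulls_node *p)
{
	return (uintptr_t)p >> 1;         /* bucket index above the tag */
}

static struct nulls_node *make_nulls(unsigned long bucket)
{
	return (struct nulls_node *)((bucket << 1) | 1);
}

int main(void)
{
	struct nulls_node a = { make_nulls(7), 42 };   /* bucket 7 */

	for (struct nulls_node *n = &a; ; n = n->next) {
		if (is_a_nulls(n)) {
			printf("end of bucket %lu\n", get_nulls_value(n));
			break;
		}
		printf("entry %d\n", n->value);
	}
	return 0;
}

If get_nulls_value() disagrees with the starting bucket, the reader knows the entry moved under it and restarts the lookup.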
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df0..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret; 47 int res;
48 48
49 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
50 ret = nf_ct_expect_related(exp); 50 res = nf_ct_expect_related(exp);
51 if (ret == 0) 51 if (res == 0)
52 break; 52 break;
53 else if (ret != -EBUSY) { 53 else if (res != -EBUSY) {
54 port = 0; 54 port = 0;
55 break; 55 break;
56 } 56 }
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a71..9c71b2755ce3 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
221 manips not an issue. */ 221 manips not an issue. */
222 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
224 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { 224 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) {
227 *tuple = *orig_tuple;
228 return;
229 }
230 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
231 range)) {
225 pr_debug("get_unique_tuple: Found current src map\n"); 232 pr_debug("get_unique_tuple: Found current src map\n");
226 if (!nf_nat_used_tuple(tuple, ct)) 233 if (!nf_nat_used_tuple(tuple, ct))
227 return; 234 return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
266 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
267 struct nf_conntrack_tuple curr_tuple, new_tuple; 274 struct nf_conntrack_tuple curr_tuple, new_tuple;
268 struct nf_conn_nat *nat; 275 struct nf_conn_nat *nat;
269 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
270 276
271 /* nat helper or nfctnetlink also setup binding */ 277 /* nat helper or nfctnetlink also setup binding */
272 nat = nfct_nat(ct); 278 nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
306 ct->status |= IPS_DST_NAT; 312 ct->status |= IPS_DST_NAT;
307 } 313 }
308 314
309 /* Place in source hash if this is the first time. */ 315 if (maniptype == IP_NAT_MANIP_SRC) {
310 if (have_to_hash) {
311 unsigned int srchash; 316 unsigned int srchash;
312 317
313 srchash = hash_by_src(net, nf_ct_zone(ct), 318 srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
323 328
324 /* It's done. */ 329 /* It's done. */
325 if (maniptype == IP_NAT_MANIP_DST) 330 if (maniptype == IP_NAT_MANIP_DST)
326 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); 331 ct->status |= IPS_DST_NAT_DONE;
327 else 332 else
328 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 333 ct->status |= IPS_SRC_NAT_DONE;
329 334
330 return NF_ACCEPT; 335 return NF_ACCEPT;
331} 336}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
502 int ret = 0; 507 int ret = 0;
503 508
504 spin_lock_bh(&nf_nat_lock); 509 spin_lock_bh(&nf_nat_lock);
505 if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { 510 if (rcu_dereference_protected(
511 nf_nat_protos[proto->protonum],
512 lockdep_is_held(&nf_nat_lock)
513 ) != &nf_nat_unknown_protocol) {
506 ret = -EBUSY; 514 ret = -EBUSY;
507 goto out; 515 goto out;
508 } 516 }
@@ -513,7 +521,7 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
513} 521}
514EXPORT_SYMBOL(nf_nat_protocol_register); 522EXPORT_SYMBOL(nf_nat_protocol_register);
515 523
516/* Noone stores the protocol anywhere; simply delete it. */ 524/* No one stores the protocol anywhere; simply delete it. */
517void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) 525void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
518{ 526{
519 spin_lock_bh(&nf_nat_lock); 527 spin_lock_bh(&nf_nat_lock);
@@ -524,7 +532,7 @@ void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto)
524} 532}
525EXPORT_SYMBOL(nf_nat_protocol_unregister); 533EXPORT_SYMBOL(nf_nat_protocol_unregister);
526 534
527/* Noone using conntrack by the time this called. */ 535/* No one using conntrack by the time this called. */
528static void nf_nat_cleanup_conntrack(struct nf_conn *ct) 536static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
529{ 537{
530 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); 538 struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
532 if (nat == NULL || nat->ct == NULL) 540 if (nat == NULL || nat->ct == NULL)
533 return; 541 return;
534 542
535 NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); 543 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
536 544
537 spin_lock_bh(&nf_nat_lock); 545 spin_lock_bh(&nf_nat_lock);
538 hlist_del_rcu(&nat->bysource); 546 hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
545 struct nf_conn_nat *old_nat = old; 553 struct nf_conn_nat *old_nat = old;
546 struct nf_conn *ct = old_nat->ct; 554 struct nf_conn *ct = old_nat->ct;
547 555
548 if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) 556 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
549 return; 557 return;
550 558
551 spin_lock_bh(&nf_nat_lock); 559 spin_lock_bh(&nf_nat_lock);
552 new_nat->ct = ct;
553 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); 560 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
554 spin_unlock_bh(&nf_nat_lock); 561 spin_unlock_bh(&nf_nat_lock);
555} 562}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
679{ 686{
680 /* Leave them the same for the moment. */ 687 /* Leave them the same for the moment. */
681 net->ipv4.nat_htable_size = net->ct.htable_size; 688 net->ipv4.nat_htable_size = net->ct.htable_size;
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 689 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
683 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 690 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 691 return -ENOMEM;
686 return 0; 692 return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
702{ 708{
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 709 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 710 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 711 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
706 net->ipv4.nat_htable_size);
707} 712}
708 713
709static struct pernet_operations nf_nat_net_ops = { 714static struct pernet_operations nf_nat_net_ops = {
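The get_unique_tuple() hunk above adds a fast path: before searching for an existing source mapping, keep the original tuple verbatim if it already lies inside the NAT range and is unused. A minimal sketch under simplified types; in_range() and tuple_used() are stand-ins for the kernel's in_range() and nf_nat_used_tuple():

/* Sketch of the "try the original tuple first" fast path. */
#include <stdbool.h>
#include <stdio.h>

struct tuple { unsigned int addr, port; };
struct range { unsigned int min_addr, max_addr; };

static bool in_range(const struct tuple *t, const struct range *r)
{
	return t->addr >= r->min_addr && t->addr <= r->max_addr;
}

/* Stand-in for nf_nat_used_tuple(): pretend even ports are taken. */
static bool tuple_used(const struct tuple *t)
{
	return (t->port & 1) == 0;
}

static void get_unique_tuple(struct tuple *out, const struct tuple *orig,
			     const struct range *r)
{
	/* Fast path from the patch: keep the original mapping when legal. */
	if (in_range(orig, r) && !tuple_used(orig)) {
		*out = *orig;
		return;
	}
	/* Otherwise fall back to searching; here, just a port bump. */
	*out = *orig;
	out->port |= 1;
}

int main(void)
{
	struct range r = { 10, 20 };
	struct tuple orig = { 12, 4040 }, out;

	get_unique_tuple(&out, &orig, &r);
	printf("mapped port %u -> %u\n", orig.port, out.port);
	return 0;
}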
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
54#include <net/netfilter/nf_conntrack_expect.h> 54#include <net/netfilter/nf_conntrack_expect.h>
55#include <net/netfilter/nf_conntrack_helper.h> 55#include <net/netfilter/nf_conntrack_helper.h>
56#include <net/netfilter/nf_nat_helper.h> 56#include <net/netfilter/nf_nat_helper.h>
57#include <linux/netfilter/nf_conntrack_snmp.h>
57 58
58MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
59MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
1310{ 1311{
1311 int ret = 0; 1312 int ret = 0;
1312 1313
1313 ret = nf_conntrack_helper_register(&snmp_helper); 1314 BUG_ON(nf_nat_snmp_hook != NULL);
1314 if (ret < 0) 1315 rcu_assign_pointer(nf_nat_snmp_hook, help);
1315 return ret; 1316
1316 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1317 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) { 1318 if (ret < 0) {
1318 nf_conntrack_helper_unregister(&snmp_helper); 1319 nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
1323 1324
1324static void __exit nf_nat_snmp_basic_fini(void) 1325static void __exit nf_nat_snmp_basic_fini(void)
1325{ 1326{
1326 nf_conntrack_helper_unregister(&snmp_helper); 1327 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1327 nf_conntrack_helper_unregister(&snmp_trap_helper); 1328 nf_conntrack_helper_unregister(&snmp_trap_helper);
1328} 1329}
1329 1330
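Above, registering a full conntrack helper is replaced by publishing a single function pointer with rcu_assign_pointer(), and unloading clears it. A userspace model of the same publish/consume pattern, with C11 release/acquire atomics standing in for rcu_assign_pointer()/rcu_dereference():

/* Model of the hook-pointer registration above. */
#include <stdatomic.h>
#include <stdio.h>

typedef int (*snmp_hook_t)(int pdu);

static _Atomic(snmp_hook_t) nf_nat_snmp_hook;

static int help(int pdu) { return pdu + 1; }

static void module_init_hook(void)
{
	/* mirrors: rcu_assign_pointer(nf_nat_snmp_hook, help); */
	atomic_store_explicit(&nf_nat_snmp_hook, help, memory_order_release);
}

static void module_exit_hook(void)
{
	atomic_store_explicit(&nf_nat_snmp_hook, NULL, memory_order_release);
}

int main(void)
{
	module_init_hook();
	snmp_hook_t fn = atomic_load_explicit(&nf_nat_snmp_hook,
					      memory_order_acquire);
	if (fn)
		printf("hook result: %d\n", fn(41));
	module_exit_hook();
	return 0;
}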
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 95481fee8bdb..7317bdf1d457 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -31,6 +31,7 @@
31#ifdef CONFIG_XFRM 31#ifdef CONFIG_XFRM
32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) 32static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
33{ 33{
34 struct flowi4 *fl4 = &fl->u.ip4;
34 const struct nf_conn *ct; 35 const struct nf_conn *ct;
35 const struct nf_conntrack_tuple *t; 36 const struct nf_conntrack_tuple *t;
36 enum ip_conntrack_info ctinfo; 37 enum ip_conntrack_info ctinfo;
@@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
49 statusbit = IPS_SRC_NAT; 50 statusbit = IPS_SRC_NAT;
50 51
51 if (ct->status & statusbit) { 52 if (ct->status & statusbit) {
52 fl->fl4_dst = t->dst.u3.ip; 53 fl4->daddr = t->dst.u3.ip;
53 if (t->dst.protonum == IPPROTO_TCP || 54 if (t->dst.protonum == IPPROTO_TCP ||
54 t->dst.protonum == IPPROTO_UDP || 55 t->dst.protonum == IPPROTO_UDP ||
55 t->dst.protonum == IPPROTO_UDPLITE || 56 t->dst.protonum == IPPROTO_UDPLITE ||
56 t->dst.protonum == IPPROTO_DCCP || 57 t->dst.protonum == IPPROTO_DCCP ||
57 t->dst.protonum == IPPROTO_SCTP) 58 t->dst.protonum == IPPROTO_SCTP)
58 fl->fl_ip_dport = t->dst.u.tcp.port; 59 fl4->fl4_dport = t->dst.u.tcp.port;
59 } 60 }
60 61
61 statusbit ^= IPS_NAT_MASK; 62 statusbit ^= IPS_NAT_MASK;
62 63
63 if (ct->status & statusbit) { 64 if (ct->status & statusbit) {
64 fl->fl4_src = t->src.u3.ip; 65 fl4->saddr = t->src.u3.ip;
65 if (t->dst.protonum == IPPROTO_TCP || 66 if (t->dst.protonum == IPPROTO_TCP ||
66 t->dst.protonum == IPPROTO_UDP || 67 t->dst.protonum == IPPROTO_UDP ||
67 t->dst.protonum == IPPROTO_UDPLITE || 68 t->dst.protonum == IPPROTO_UDPLITE ||
68 t->dst.protonum == IPPROTO_DCCP || 69 t->dst.protonum == IPPROTO_DCCP ||
69 t->dst.protonum == IPPROTO_SCTP) 70 t->dst.protonum == IPPROTO_SCTP)
70 fl->fl_ip_sport = t->src.u.tcp.port; 71 fl4->fl4_sport = t->src.u.tcp.port;
71 } 72 }
72} 73}
73#endif 74#endif
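The nat_decode_session() conversion above reflects the flowi restructuring: the generic struct flowi became a union of per-family keys, and IPv4 code now writes the ip4 member's daddr/saddr/fl4_dport fields instead of fl4_* fields on the outer struct. A simplified model (the real structs carry many more fields):

/* Sketch of the flowi -> flowi4 union layout used above. */
#include <stdint.h>
#include <stdio.h>

struct flowi4 {
	uint32_t daddr, saddr;
	uint16_t fl4_dport, fl4_sport;
	uint8_t  flowi4_proto;
};

struct flowi {
	union {
		struct flowi4 ip4;
		/* ip6, dn, ... in the real union */
	} u;
};

static void decode_session(struct flowi *fl)
{
	struct flowi4 *fl4 = &fl->u.ip4;   /* as in nat_decode_session() */

	fl4->daddr = 0x0a000001;           /* 10.0.0.1, illustrative */
	fl4->fl4_dport = 80;
}

int main(void)
{
	struct flowi fl = { 0 };

	decode_session(&fl);
	printf("daddr=%#x dport=%u\n", (unsigned)fl.u.ip4.daddr,
	       fl.u.ip4.fl4_dport);
	return 0;
}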
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6390ba299b3d..bceaec42c37d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -402,7 +402,7 @@ error:
402 return err; 402 return err;
403} 403}
404 404
405static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) 405static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
406{ 406{
407 struct iovec *iov; 407 struct iovec *iov;
408 u8 __user *type = NULL; 408 u8 __user *type = NULL;
@@ -418,7 +418,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
418 if (!iov) 418 if (!iov)
419 continue; 419 continue;
420 420
421 switch (fl->proto) { 421 switch (fl4->flowi4_proto) {
422 case IPPROTO_ICMP: 422 case IPPROTO_ICMP:
423 /* check if one-byte field is readable or not. */ 423 /* check if one-byte field is readable or not. */
424 if (iov->iov_base && iov->iov_len < 1) 424 if (iov->iov_base && iov->iov_len < 1)
@@ -433,8 +433,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
433 code = iov->iov_base; 433 code = iov->iov_base;
434 434
435 if (type && code) { 435 if (type && code) {
436 if (get_user(fl->fl_icmp_type, type) || 436 if (get_user(fl4->fl4_icmp_type, type) ||
437 get_user(fl->fl_icmp_code, code)) 437 get_user(fl4->fl4_icmp_code, code))
438 return -EFAULT; 438 return -EFAULT;
439 probed = 1; 439 probed = 1;
440 } 440 }
@@ -548,25 +548,31 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
548 } 548 }
549 549
550 { 550 {
551 struct flowi fl = { .oif = ipc.oif, 551 struct flowi4 fl4 = {
552 .mark = sk->sk_mark, 552 .flowi4_oif = ipc.oif,
553 .fl4_dst = daddr, 553 .flowi4_mark = sk->sk_mark,
554 .fl4_src = saddr, 554 .daddr = daddr,
555 .fl4_tos = tos, 555 .saddr = saddr,
556 .proto = inet->hdrincl ? IPPROTO_RAW : 556 .flowi4_tos = tos,
557 sk->sk_protocol, 557 .flowi4_proto = (inet->hdrincl ?
558 }; 558 IPPROTO_RAW :
559 sk->sk_protocol),
560 .flowi4_flags = FLOWI_FLAG_CAN_SLEEP,
561 };
559 if (!inet->hdrincl) { 562 if (!inet->hdrincl) {
560 err = raw_probe_proto_opt(&fl, msg); 563 err = raw_probe_proto_opt(&fl4, msg);
561 if (err) 564 if (err)
562 goto done; 565 goto done;
563 } 566 }
564 567
565 security_sk_classify_flow(sk, &fl); 568 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
566 err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); 569 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
570 if (IS_ERR(rt)) {
571 err = PTR_ERR(rt);
572 rt = NULL;
573 goto done;
574 }
567 } 575 }
568 if (err)
569 goto done;
570 576
571 err = -EACCES; 577 err = -EACCES;
572 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) 578 if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
@@ -616,7 +622,7 @@ do_confirm:
616static void raw_close(struct sock *sk, long timeout) 622static void raw_close(struct sock *sk, long timeout)
617{ 623{
618 /* 624 /*
619 * Raw sockets may have direct kernel refereneces. Kill them. 625 * Raw sockets may have direct kernel references. Kill them.
620 */ 626 */
621 ip_ra_control(sk, 0, NULL); 627 ip_ra_control(sk, 0, NULL);
622 628
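The raw_sendmsg() hunk above also shows the route-lookup API change: ip_route_output_flow() now returns the route directly, encoding failures in the pointer itself, so callers test IS_ERR()/PTR_ERR() instead of checking an int and a separate output parameter. A minimal userspace model of the ERR_PTR convention:

/* Model of the ERR_PTR calling convention adopted above. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

struct rtable { int id; };

static struct rtable *route_output(int want_error)
{
	static struct rtable rt = { 1 };

	if (want_error)
		return ERR_PTR(-ENETUNREACH);
	return &rt;
}

int main(void)
{
	struct rtable *rt = route_output(1);

	if (IS_ERR(rt))
		printf("lookup failed: %ld\n", PTR_ERR(rt));
	rt = route_output(0);
	if (!IS_ERR(rt))
		printf("got route %d\n", rt->id);
	return 0;
}

The trick relies on the top page of the address space never holding a valid object, so error codes and pointers share one return value.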
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ed6603c2f6d..99e6e4bb1c72 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,8 +109,8 @@
109#include <linux/sysctl.h> 109#include <linux/sysctl.h>
110#endif 110#endif
111 111
112#define RT_FL_TOS(oldflp) \ 112#define RT_FL_TOS(oldflp4) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) 113 ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114 114
115#define IP_MAX_MTU 0xFFF0 115#define IP_MAX_MTU 0xFFF0
116 116
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/* 134/*
138 * Interface to generic destination cache. 135 * Interface to generic destination cache.
139 */ 136 */
@@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152{ 149{
153} 150}
154 151
152static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153{
154 struct rtable *rt = (struct rtable *) dst;
155 struct inet_peer *peer;
156 u32 *p = NULL;
157
158 if (!rt->peer)
159 rt_bind_peer(rt, 1);
160
161 peer = rt->peer;
162 if (peer) {
163 u32 *old_p = __DST_METRICS_PTR(old);
164 unsigned long prev, new;
165
166 p = peer->metrics;
167 if (inet_metrics_new(peer))
168 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170 new = (unsigned long) p;
171 prev = cmpxchg(&dst->_metrics, old, new);
172
173 if (prev != old) {
174 p = __DST_METRICS_PTR(prev);
175 if (prev & DST_METRICS_READ_ONLY)
176 p = NULL;
177 } else {
178 if (rt->fi) {
179 fib_info_put(rt->fi);
180 rt->fi = NULL;
181 }
182 }
183 }
184 return p;
185}
186
155static struct dst_ops ipv4_dst_ops = { 187static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET, 188 .family = AF_INET,
157 .protocol = cpu_to_be16(ETH_P_IP), 189 .protocol = cpu_to_be16(ETH_P_IP),
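The new ipv4_cow_metrics() above copies read-only metrics into the peer's writable storage and installs the new pointer with a single cmpxchg(); if it loses the race, it adopts the winner's pointer. A userspace sketch of that copy-on-write swap, with C11 atomics modelling cmpxchg() and a second static array standing in for the peer's metric block:

/* Sketch of cmpxchg-based copy-on-write metrics. */
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

#define RTAX_MAX 4

static unsigned int default_metrics[RTAX_MAX] = { 1500, 0, 0, 0 };
static unsigned int peer_metrics[RTAX_MAX];
static _Atomic(unsigned int *) dst_metrics = default_metrics;

static unsigned int *cow_metrics(void)
{
	unsigned int *old = atomic_load(&dst_metrics);
	unsigned int *new = peer_metrics;

	memcpy(new, old, sizeof(peer_metrics));   /* copy-on-write */
	if (atomic_compare_exchange_strong(&dst_metrics, &old, new))
		return new;                       /* we installed our copy */
	return old;                               /* raced: use the winner's */
}

int main(void)
{
	unsigned int *m = cow_metrics();

	m[0] = 1400;                              /* now safely writable */
	printf("mtu metric: %u\n", atomic_load(&dst_metrics)[0]);
	return 0;
}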
@@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = {
159 .check = ipv4_dst_check, 191 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss, 192 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu, 193 .default_mtu = ipv4_default_mtu,
194 .cow_metrics = ipv4_cow_metrics,
162 .destroy = ipv4_dst_destroy, 195 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown, 196 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice, 197 .negative_advice = ipv4_negative_advice,
@@ -171,7 +204,7 @@ static struct dst_ops ipv4_dst_ops = {
171 204
172const __u8 ip_tos2prio[16] = { 205const __u8 ip_tos2prio[16] = {
173 TC_PRIO_BESTEFFORT, 206 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(FILLER), 207 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BESTEFFORT, 208 TC_PRIO_BESTEFFORT,
176 ECN_OR_COST(BESTEFFORT), 209 ECN_OR_COST(BESTEFFORT),
177 TC_PRIO_BULK, 210 TC_PRIO_BULK,
@@ -391,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
391 dst_metric(&r->dst, RTAX_WINDOW), 424 dst_metric(&r->dst, RTAX_WINDOW),
392 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 425 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
393 dst_metric(&r->dst, RTAX_RTTVAR)), 426 dst_metric(&r->dst, RTAX_RTTVAR)),
394 r->fl.fl4_tos, 427 r->rt_tos,
395 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, 428 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
396 r->dst.hh ? (r->dst.hh->hh_output == 429 r->dst.hh ? (r->dst.hh->hh_output ==
397 dev_queue_xmit) : 0, 430 dev_queue_xmit) : 0,
@@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
514 .release = seq_release, 547 .release = seq_release,
515}; 548};
516 549
517#ifdef CONFIG_NET_CLS_ROUTE 550#ifdef CONFIG_IP_ROUTE_CLASSID
518static int rt_acct_proc_show(struct seq_file *m, void *v) 551static int rt_acct_proc_show(struct seq_file *m, void *v)
519{ 552{
520 struct ip_rt_acct *dst, *src; 553 struct ip_rt_acct *dst, *src;
@@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
567 if (!pde) 600 if (!pde)
568 goto err2; 601 goto err2;
569 602
570#ifdef CONFIG_NET_CLS_ROUTE 603#ifdef CONFIG_IP_ROUTE_CLASSID
571 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 604 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
572 if (!pde) 605 if (!pde)
573 goto err3; 606 goto err3;
574#endif 607#endif
575 return 0; 608 return 0;
576 609
577#ifdef CONFIG_NET_CLS_ROUTE 610#ifdef CONFIG_IP_ROUTE_CLASSID
578err3: 611err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat); 612 remove_proc_entry("rt_cache", net->proc_net_stat);
580#endif 613#endif
@@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{ 621{
589 remove_proc_entry("rt_cache", net->proc_net_stat); 622 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net); 623 remove_proc_entry("rt_cache", net->proc_net);
591#ifdef CONFIG_NET_CLS_ROUTE 624#ifdef CONFIG_IP_ROUTE_CLASSID
592 remove_proc_entry("rt_acct", net->proc_net); 625 remove_proc_entry("rt_acct", net->proc_net);
593#endif 626#endif
594} 627}
@@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
632static inline int rt_valuable(struct rtable *rth) 665static inline int rt_valuable(struct rtable *rth)
633{ 666{
634 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 667 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
635 rth->dst.expires; 668 (rth->peer && rth->peer->pmtu_expires);
636} 669}
637 670
638static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 671static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
643 if (atomic_read(&rth->dst.__refcnt)) 676 if (atomic_read(&rth->dst.__refcnt))
644 goto out; 677 goto out;
645 678
646 ret = 1;
647 if (rth->dst.expires &&
648 time_after_eq(jiffies, rth->dst.expires))
649 goto out;
650
651 age = jiffies - rth->dst.lastuse; 679 age = jiffies - rth->dst.lastuse;
652 ret = 0;
653 if ((age <= tmo1 && !rt_fast_clean(rth)) || 680 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
654 (age <= tmo2 && rt_valuable(rth))) 681 (age <= tmo2 && rt_valuable(rth)))
655 goto out; 682 goto out;
@@ -684,22 +711,22 @@ static inline bool rt_caching(const struct net *net)
684 net->ipv4.sysctl_rt_cache_rebuild_count; 711 net->ipv4.sysctl_rt_cache_rebuild_count;
685} 712}
686 713
687static inline bool compare_hash_inputs(const struct flowi *fl1, 714static inline bool compare_hash_inputs(const struct rtable *rt1,
688 const struct flowi *fl2) 715 const struct rtable *rt2)
689{ 716{
690 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | 717 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
691 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | 718 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
692 (fl1->iif ^ fl2->iif)) == 0); 719 (rt1->rt_iif ^ rt2->rt_iif)) == 0);
693} 720}
694 721
695static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 722static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
696{ 723{
697 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | 724 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
698 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | 725 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
699 (fl1->mark ^ fl2->mark) | 726 (rt1->rt_mark ^ rt2->rt_mark) |
700 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) | 727 (rt1->rt_tos ^ rt2->rt_tos) |
701 (fl1->oif ^ fl2->oif) | 728 (rt1->rt_oif ^ rt2->rt_oif) |
702 (fl1->iif ^ fl2->iif)) == 0; 729 (rt1->rt_iif ^ rt2->rt_iif)) == 0;
703} 730}
704 731
705static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) 732static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
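compare_keys() and compare_hash_inputs() above use the same idiom before and after the conversion: XOR each pair of fields and OR the results, so the keys are equal exactly when the accumulated value is zero, giving one branch for the whole comparison. A standalone illustration with a simplified key:

/* Branchless multi-field equality via XOR/OR, as in compare_keys(). */
#include <stdint.h>
#include <stdio.h>

struct rt_key { uint32_t dst, src, mark; int oif, iif; };

static int keys_equal(const struct rt_key *a, const struct rt_key *b)
{
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(a->mark ^ b->mark) |
		(uint32_t)(a->oif ^ b->oif) |
		(uint32_t)(a->iif ^ b->iif)) == 0;
}

int main(void)
{
	struct rt_key k1 = { 1, 2, 0, 3, 0 };
	struct rt_key k2 = k1;

	printf("equal: %d\n", keys_equal(&k1, &k2));
	k2.mark = 7;
	printf("after mark change: %d\n", keys_equal(&k1, &k2));
	return 0;
}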
@@ -786,106 +813,15 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
786 const struct rtable *aux = head; 813 const struct rtable *aux = head;
787 814
788 while (aux != rth) { 815 while (aux != rth) {
789 if (compare_hash_inputs(&aux->fl, &rth->fl)) 816 if (compare_hash_inputs(aux, rth))
790 return 0; 817 return 0;
791 aux = rcu_dereference_protected(aux->dst.rt_next, 1); 818 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
792 } 819 }
793 return ONE; 820 return ONE;
794} 821}
795 822
796static void rt_check_expire(void)
797{
798 static unsigned int rover;
799 unsigned int i = rover, goal;
800 struct rtable *rth;
801 struct rtable __rcu **rthp;
802 unsigned long samples = 0;
803 unsigned long sum = 0, sum2 = 0;
804 unsigned long delta;
805 u64 mult;
806
807 delta = jiffies - expires_ljiffies;
808 expires_ljiffies = jiffies;
809 mult = ((u64)delta) << rt_hash_log;
810 if (ip_rt_gc_timeout > 1)
811 do_div(mult, ip_rt_gc_timeout);
812 goal = (unsigned int)mult;
813 if (goal > rt_hash_mask)
814 goal = rt_hash_mask + 1;
815 for (; goal > 0; goal--) {
816 unsigned long tmo = ip_rt_gc_timeout;
817 unsigned long length;
818
819 i = (i + 1) & rt_hash_mask;
820 rthp = &rt_hash_table[i].chain;
821
822 if (need_resched())
823 cond_resched();
824
825 samples++;
826
827 if (rcu_dereference_raw(*rthp) == NULL)
828 continue;
829 length = 0;
830 spin_lock_bh(rt_hash_lock_addr(i));
831 while ((rth = rcu_dereference_protected(*rthp,
832 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
833 prefetch(rth->dst.rt_next);
834 if (rt_is_expired(rth)) {
835 *rthp = rth->dst.rt_next;
836 rt_free(rth);
837 continue;
838 }
839 if (rth->dst.expires) {
840 /* Entry is expired even if it is in use */
841 if (time_before_eq(jiffies, rth->dst.expires)) {
842nofree:
843 tmo >>= 1;
844 rthp = &rth->dst.rt_next;
845 /*
846 * We only count entries on
847 * a chain with equal hash inputs once
848 * so that entries for different QOS
849 * levels, and other non-hash input
850 * attributes don't unfairly skew
851 * the length computation
852 */
853 length += has_noalias(rt_hash_table[i].chain, rth);
854 continue;
855 }
856 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
857 goto nofree;
858
859 /* Cleanup aged off entries. */
860 *rthp = rth->dst.rt_next;
861 rt_free(rth);
862 }
863 spin_unlock_bh(rt_hash_lock_addr(i));
864 sum += length;
865 sum2 += length*length;
866 }
867 if (samples) {
868 unsigned long avg = sum / samples;
869 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
870 rt_chain_length_max = max_t(unsigned long,
871 ip_rt_gc_elasticity,
872 (avg + 4*sd) >> FRACT_BITS);
873 }
874 rover = i;
875}
876
877/*
878 * rt_worker_func() is run in process context.
879 * we call rt_check_expire() to scan part of the hash table
880 */
881static void rt_worker_func(struct work_struct *work)
882{
883 rt_check_expire();
884 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885}
886
887/* 823/*
888 * Pertubation of rt_genid by a small quantity [1..256] 824 * Perturbation of rt_genid by a small quantity [1..256]
889 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() 825 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
890 * many times (2^24) without giving recent rt_genid. 826 * many times (2^24) without giving recent rt_genid.
891 * Jenkins hash is strong enough that litle changes of rt_genid are OK. 827 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
@@ -1078,8 +1014,8 @@ static int slow_chain_length(const struct rtable *head)
1078 return length >> FRACT_BITS; 1014 return length >> FRACT_BITS;
1079} 1015}
1080 1016
1081static int rt_intern_hash(unsigned hash, struct rtable *rt, 1017static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1082 struct rtable **rp, struct sk_buff *skb, int ifindex) 1018 struct sk_buff *skb, int ifindex)
1083{ 1019{
1084 struct rtable *rth, *cand; 1020 struct rtable *rth, *cand;
1085 struct rtable __rcu **rthp, **candp; 1021 struct rtable __rcu **rthp, **candp;
@@ -1120,7 +1056,7 @@ restart:
1120 printk(KERN_WARNING 1056 printk(KERN_WARNING
1121 "Neighbour table failure & not caching routes.\n"); 1057 "Neighbour table failure & not caching routes.\n");
1122 ip_rt_put(rt); 1058 ip_rt_put(rt);
1123 return err; 1059 return ERR_PTR(err);
1124 } 1060 }
1125 } 1061 }
1126 1062
@@ -1137,7 +1073,7 @@ restart:
1137 rt_free(rth); 1073 rt_free(rth);
1138 continue; 1074 continue;
1139 } 1075 }
1140 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { 1076 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1141 /* Put it first */ 1077 /* Put it first */
1142 *rthp = rth->dst.rt_next; 1078 *rthp = rth->dst.rt_next;
1143 /* 1079 /*
@@ -1157,11 +1093,9 @@ restart:
1157 spin_unlock_bh(rt_hash_lock_addr(hash)); 1093 spin_unlock_bh(rt_hash_lock_addr(hash));
1158 1094
1159 rt_drop(rt); 1095 rt_drop(rt);
1160 if (rp) 1096 if (skb)
1161 *rp = rth;
1162 else
1163 skb_dst_set(skb, &rth->dst); 1097 skb_dst_set(skb, &rth->dst);
1164 return 0; 1098 return rth;
1165 } 1099 }
1166 1100
1167 if (!atomic_read(&rth->dst.__refcnt)) { 1101 if (!atomic_read(&rth->dst.__refcnt)) {
@@ -1202,7 +1136,7 @@ restart:
1202 rt_emergency_hash_rebuild(net); 1136 rt_emergency_hash_rebuild(net);
1203 spin_unlock_bh(rt_hash_lock_addr(hash)); 1137 spin_unlock_bh(rt_hash_lock_addr(hash));
1204 1138
1205 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1139 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1206 ifindex, rt_genid(net)); 1140 ifindex, rt_genid(net));
1207 goto restart; 1141 goto restart;
1208 } 1142 }
@@ -1218,7 +1152,7 @@ restart:
1218 1152
1219 if (err != -ENOBUFS) { 1153 if (err != -ENOBUFS) {
1220 rt_drop(rt); 1154 rt_drop(rt);
1221 return err; 1155 return ERR_PTR(err);
1222 } 1156 }
1223 1157
1224 /* Neighbour tables are full and nothing 1158 /* Neighbour tables are full and nothing
@@ -1239,7 +1173,7 @@ restart:
1239 if (net_ratelimit()) 1173 if (net_ratelimit())
1240 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); 1174 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1241 rt_drop(rt); 1175 rt_drop(rt);
1242 return -ENOBUFS; 1176 return ERR_PTR(-ENOBUFS);
1243 } 1177 }
1244 } 1178 }
1245 1179
@@ -1257,7 +1191,7 @@ restart:
1257#endif 1191#endif
1258 /* 1192 /*
1259 * Since lookup is lockfree, we must make sure 1193 * Since lookup is lockfree, we must make sure
1260 * previous writes to rt are comitted to memory 1194 * previous writes to rt are committed to memory
1261 * before making rt visible to other CPUS. 1195 * before making rt visible to other CPUS.
1262 */ 1196 */
1263 rcu_assign_pointer(rt_hash_table[hash].chain, rt); 1197 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
@@ -1265,11 +1199,16 @@ restart:
1265 spin_unlock_bh(rt_hash_lock_addr(hash)); 1199 spin_unlock_bh(rt_hash_lock_addr(hash));
1266 1200
1267skip_hashing: 1201skip_hashing:
1268 if (rp) 1202 if (skb)
1269 *rp = rt;
1270 else
1271 skb_dst_set(skb, &rt->dst); 1203 skb_dst_set(skb, &rt->dst);
1272 return 0; 1204 return rt;
1205}
1206
1207static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1208
1209static u32 rt_peer_genid(void)
1210{
1211 return atomic_read(&__rt_peer_genid);
1273} 1212}
1274 1213
1275void rt_bind_peer(struct rtable *rt, int create) 1214void rt_bind_peer(struct rtable *rt, int create)
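The __rt_peer_genid counter introduced above implements cheap cache invalidation: the global counter bumps whenever any peer learns new PMTU or redirect data, and each cached route remembers the generation it last validated against, so the common-case check is one integer compare. A userspace model:

/* Model of the rt_peer_genid generation-counter scheme. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint rt_peer_genid;

struct rtable { unsigned int peer_genid; };

static bool route_is_current(const struct rtable *rt)
{
	return rt->peer_genid == atomic_load(&rt_peer_genid);
}

static void revalidate(struct rtable *rt)
{
	/* ...re-check peer PMTU/redirect state here... */
	rt->peer_genid = atomic_load(&rt_peer_genid);
}

int main(void)
{
	struct rtable rt = { atomic_load(&rt_peer_genid) };

	printf("fresh: %d\n", route_is_current(&rt));
	atomic_fetch_add(&rt_peer_genid, 1);   /* a peer learned new data */
	printf("after bump: %d\n", route_is_current(&rt));
	revalidate(&rt);
	printf("revalidated: %d\n", route_is_current(&rt));
	return 0;
}

The cost is coarse granularity: one peer update forces every cached route through the slow revalidation path once.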
@@ -1280,6 +1219,8 @@ void rt_bind_peer(struct rtable *rt, int create)
1280 1219
1281 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1220 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1282 inet_putpeer(peer); 1221 inet_putpeer(peer);
1222 else
1223 rt->rt_peer_genid = rt_peer_genid();
1283} 1224}
1284 1225
1285/* 1226/*
@@ -1349,13 +1290,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
1349void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1290void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1350 __be32 saddr, struct net_device *dev) 1291 __be32 saddr, struct net_device *dev)
1351{ 1292{
1352 int i, k;
1353 struct in_device *in_dev = __in_dev_get_rcu(dev); 1293 struct in_device *in_dev = __in_dev_get_rcu(dev);
1354 struct rtable *rth; 1294 struct inet_peer *peer;
1355 struct rtable __rcu **rthp;
1356 __be32 skeys[2] = { saddr, 0 };
1357 int ikeys[2] = { dev->ifindex, 0 };
1358 struct netevent_redirect netevent;
1359 struct net *net; 1295 struct net *net;
1360 1296
1361 if (!in_dev) 1297 if (!in_dev)
@@ -1367,9 +1303,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1367 ipv4_is_zeronet(new_gw)) 1303 ipv4_is_zeronet(new_gw))
1368 goto reject_redirect; 1304 goto reject_redirect;
1369 1305
1370 if (!rt_caching(net))
1371 goto reject_redirect;
1372
1373 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1306 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1374 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1307 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1375 goto reject_redirect; 1308 goto reject_redirect;
@@ -1380,91 +1313,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1380 goto reject_redirect; 1313 goto reject_redirect;
1381 } 1314 }
1382 1315
1383 for (i = 0; i < 2; i++) { 1316 peer = inet_getpeer_v4(daddr, 1);
1384 for (k = 0; k < 2; k++) { 1317 if (peer) {
1385 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1318 peer->redirect_learned.a4 = new_gw;
1386 rt_genid(net));
1387
1388 rthp = &rt_hash_table[hash].chain;
1389
1390 while ((rth = rcu_dereference(*rthp)) != NULL) {
1391 struct rtable *rt;
1392
1393 if (rth->fl.fl4_dst != daddr ||
1394 rth->fl.fl4_src != skeys[i] ||
1395 rth->fl.oif != ikeys[k] ||
1396 rt_is_input_route(rth) ||
1397 rt_is_expired(rth) ||
1398 !net_eq(dev_net(rth->dst.dev), net)) {
1399 rthp = &rth->dst.rt_next;
1400 continue;
1401 }
1402
1403 if (rth->rt_dst != daddr ||
1404 rth->rt_src != saddr ||
1405 rth->dst.error ||
1406 rth->rt_gateway != old_gw ||
1407 rth->dst.dev != dev)
1408 break;
1409
1410 dst_hold(&rth->dst);
1411
1412 rt = dst_alloc(&ipv4_dst_ops);
1413 if (rt == NULL) {
1414 ip_rt_put(rth);
1415 return;
1416 }
1417
1418 /* Copy all the information. */
1419 *rt = *rth;
1420 rt->dst.__use = 1;
1421 atomic_set(&rt->dst.__refcnt, 1);
1422 rt->dst.child = NULL;
1423 if (rt->dst.dev)
1424 dev_hold(rt->dst.dev);
1425 rt->dst.obsolete = -1;
1426 rt->dst.lastuse = jiffies;
1427 rt->dst.path = &rt->dst;
1428 rt->dst.neighbour = NULL;
1429 rt->dst.hh = NULL;
1430#ifdef CONFIG_XFRM
1431 rt->dst.xfrm = NULL;
1432#endif
1433 rt->rt_genid = rt_genid(net);
1434 rt->rt_flags |= RTCF_REDIRECTED;
1435
1436 /* Gateway is different ... */
1437 rt->rt_gateway = new_gw;
1438
1439 /* Redirect received -> path was valid */
1440 dst_confirm(&rth->dst);
1441
1442 if (rt->peer)
1443 atomic_inc(&rt->peer->refcnt);
1444
1445 if (arp_bind_neighbour(&rt->dst) ||
1446 !(rt->dst.neighbour->nud_state &
1447 NUD_VALID)) {
1448 if (rt->dst.neighbour)
1449 neigh_event_send(rt->dst.neighbour, NULL);
1450 ip_rt_put(rth);
1451 rt_drop(rt);
1452 goto do_next;
1453 }
1454 1319
1455 netevent.old = &rth->dst; 1320 inet_putpeer(peer);
1456 netevent.new = &rt->dst;
1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1458 &netevent);
1459 1321
1460 rt_del(hash, rth); 1322 atomic_inc(&__rt_peer_genid);
1461 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462 ip_rt_put(rt);
1463 goto do_next;
1464 }
1465 do_next:
1466 ;
1467 }
1468 } 1323 }
1469 return; 1324 return;
1470 1325
@@ -1488,18 +1343,24 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1488 if (dst->obsolete > 0) { 1343 if (dst->obsolete > 0) {
1489 ip_rt_put(rt); 1344 ip_rt_put(rt);
1490 ret = NULL; 1345 ret = NULL;
1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1346 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1492 (rt->dst.expires && 1347 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1493 time_after_eq(jiffies, rt->dst.expires))) { 1348 rt->rt_oif,
1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495 rt->fl.oif,
1496 rt_genid(dev_net(dst->dev))); 1349 rt_genid(dev_net(dst->dev)));
1497#if RT_CACHE_DEBUG >= 1 1350#if RT_CACHE_DEBUG >= 1
1498 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", 1351 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1499 &rt->rt_dst, rt->fl.fl4_tos); 1352 &rt->rt_dst, rt->rt_tos);
1500#endif 1353#endif
1501 rt_del(hash, rt); 1354 rt_del(hash, rt);
1502 ret = NULL; 1355 ret = NULL;
1356 } else if (rt->peer &&
1357 rt->peer->pmtu_expires &&
1358 time_after_eq(jiffies, rt->peer->pmtu_expires)) {
1359 unsigned long orig = rt->peer->pmtu_expires;
1360
1361 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1362 dst_metric_set(dst, RTAX_MTU,
1363 rt->peer->pmtu_orig);
1503 } 1364 }
1504 } 1365 }
1505 return ret; 1366 return ret;
@@ -1525,6 +1386,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1525{ 1386{
1526 struct rtable *rt = skb_rtable(skb); 1387 struct rtable *rt = skb_rtable(skb);
1527 struct in_device *in_dev; 1388 struct in_device *in_dev;
1389 struct inet_peer *peer;
1528 int log_martians; 1390 int log_martians;
1529 1391
1530 rcu_read_lock(); 1392 rcu_read_lock();
@@ -1536,33 +1398,41 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1536 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1398 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537 rcu_read_unlock(); 1399 rcu_read_unlock();
1538 1400
1401 if (!rt->peer)
1402 rt_bind_peer(rt, 1);
1403 peer = rt->peer;
1404 if (!peer) {
1405 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1406 return;
1407 }
1408
1539 /* No redirected packets during ip_rt_redirect_silence; 1409 /* No redirected packets during ip_rt_redirect_silence;
1540 * reset the algorithm. 1410 * reset the algorithm.
1541 */ 1411 */
1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) 1412 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1543 rt->dst.rate_tokens = 0; 1413 peer->rate_tokens = 0;
1544 1414
1545 /* Too many ignored redirects; do not send anything 1415 /* Too many ignored redirects; do not send anything
1546 * set dst.rate_last to the last seen redirected packet. 1416 * set dst.rate_last to the last seen redirected packet.
1547 */ 1417 */
1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) { 1418 if (peer->rate_tokens >= ip_rt_redirect_number) {
1549 rt->dst.rate_last = jiffies; 1419 peer->rate_last = jiffies;
1550 return; 1420 return;
1551 } 1421 }
1552 1422
1553 /* Check for load limit; set rate_last to the latest sent 1423 /* Check for load limit; set rate_last to the latest sent
1554 * redirect. 1424 * redirect.
1555 */ 1425 */
1556 if (rt->dst.rate_tokens == 0 || 1426 if (peer->rate_tokens == 0 ||
1557 time_after(jiffies, 1427 time_after(jiffies,
1558 (rt->dst.rate_last + 1428 (peer->rate_last +
1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) { 1429 (ip_rt_redirect_load << peer->rate_tokens)))) {
1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1430 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 rt->dst.rate_last = jiffies; 1431 peer->rate_last = jiffies;
1562 ++rt->dst.rate_tokens; 1432 ++peer->rate_tokens;
1563#ifdef CONFIG_IP_ROUTE_VERBOSE 1433#ifdef CONFIG_IP_ROUTE_VERBOSE
1564 if (log_martians && 1434 if (log_martians &&
1565 rt->dst.rate_tokens == ip_rt_redirect_number && 1435 peer->rate_tokens == ip_rt_redirect_number &&
1566 net_ratelimit()) 1436 net_ratelimit())
1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1437 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568 &rt->rt_src, rt->rt_iif, 1438 &rt->rt_src, rt->rt_iif,
@@ -1574,7 +1444,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1574static int ip_error(struct sk_buff *skb) 1444static int ip_error(struct sk_buff *skb)
1575{ 1445{
1576 struct rtable *rt = skb_rtable(skb); 1446 struct rtable *rt = skb_rtable(skb);
1447 struct inet_peer *peer;
1577 unsigned long now; 1448 unsigned long now;
1449 bool send;
1578 int code; 1450 int code;
1579 1451
1580 switch (rt->dst.error) { 1452 switch (rt->dst.error) {
@@ -1594,15 +1466,24 @@ static int ip_error(struct sk_buff *skb)
1594 break; 1466 break;
1595 } 1467 }
1596 1468
1597 now = jiffies; 1469 if (!rt->peer)
1598 rt->dst.rate_tokens += now - rt->dst.rate_last; 1470 rt_bind_peer(rt, 1);
1599 if (rt->dst.rate_tokens > ip_rt_error_burst) 1471 peer = rt->peer;
1600 rt->dst.rate_tokens = ip_rt_error_burst; 1472
1601 rt->dst.rate_last = now; 1473 send = true;
1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) { 1474 if (peer) {
1603 rt->dst.rate_tokens -= ip_rt_error_cost; 1475 now = jiffies;
1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1476 peer->rate_tokens += now - peer->rate_last;
1477 if (peer->rate_tokens > ip_rt_error_burst)
1478 peer->rate_tokens = ip_rt_error_burst;
1479 peer->rate_last = now;
1480 if (peer->rate_tokens >= ip_rt_error_cost)
1481 peer->rate_tokens -= ip_rt_error_cost;
1482 else
1483 send = false;
1605 } 1484 }
1485 if (send)
1486 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1606 1487
1607out: kfree_skb(skb); 1488out: kfree_skb(skb);
1608 return 0; 1489 return 0;
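ip_error() above moves a token bucket from the dst entry into the inet_peer: tokens accrue with elapsed time, are capped at a burst, and each ICMP error costs a fixed amount, so errors are sent only while the bucket can pay. A userspace model with illustrative constants (the kernel values come from the ip_rt_error_burst/ip_rt_error_cost sysctls):

/* Model of the per-peer ICMP error token bucket. */
#include <stdbool.h>
#include <stdio.h>

#define ERROR_BURST 50
#define ERROR_COST  10

struct peer { unsigned long rate_tokens, rate_last; };

static bool may_send_error(struct peer *p, unsigned long now)
{
	p->rate_tokens += now - p->rate_last;   /* accrue with time */
	if (p->rate_tokens > ERROR_BURST)
		p->rate_tokens = ERROR_BURST;   /* cap the burst */
	p->rate_last = now;

	if (p->rate_tokens >= ERROR_COST) {
		p->rate_tokens -= ERROR_COST;
		return true;                    /* send icmp_send(...) */
	}
	return false;                           /* suppressed */
}

int main(void)
{
	struct peer p = { 0, 0 };
	unsigned long now = 0;

	for (int i = 0; i < 8; i++) {
		printf("t=%lu send=%d\n", now, may_send_error(&p, now));
		now += 5;
	}
	return 0;
}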
@@ -1630,88 +1511,140 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1630 unsigned short new_mtu, 1511 unsigned short new_mtu,
1631 struct net_device *dev) 1512 struct net_device *dev)
1632{ 1513{
1633 int i, k;
1634 unsigned short old_mtu = ntohs(iph->tot_len); 1514 unsigned short old_mtu = ntohs(iph->tot_len);
1635 struct rtable *rth;
1636 int ikeys[2] = { dev->ifindex, 0 };
1637 __be32 skeys[2] = { iph->saddr, 0, };
1638 __be32 daddr = iph->daddr;
1639 unsigned short est_mtu = 0; 1515 unsigned short est_mtu = 0;
1516 struct inet_peer *peer;
1640 1517
1641 for (k = 0; k < 2; k++) { 1518 peer = inet_getpeer_v4(iph->daddr, 1);
1642 for (i = 0; i < 2; i++) { 1519 if (peer) {
1643 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1520 unsigned short mtu = new_mtu;
1644 rt_genid(net));
1645
1646 rcu_read_lock();
1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1648 rth = rcu_dereference(rth->dst.rt_next)) {
1649 unsigned short mtu = new_mtu;
1650
1651 if (rth->fl.fl4_dst != daddr ||
1652 rth->fl.fl4_src != skeys[i] ||
1653 rth->rt_dst != daddr ||
1654 rth->rt_src != iph->saddr ||
1655 rth->fl.oif != ikeys[k] ||
1656 rt_is_input_route(rth) ||
1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658 !net_eq(dev_net(rth->dst.dev), net) ||
1659 rt_is_expired(rth))
1660 continue;
1661 1521
1662 if (new_mtu < 68 || new_mtu >= old_mtu) { 1522 if (new_mtu < 68 || new_mtu >= old_mtu) {
1523 /* BSD 4.2 derived systems incorrectly adjust
1524 * tot_len by the IP header length, and report
1525 * a zero MTU in the ICMP message.
1526 */
1527 if (mtu == 0 &&
1528 old_mtu >= 68 + (iph->ihl << 2))
1529 old_mtu -= iph->ihl << 2;
1530 mtu = guess_mtu(old_mtu);
1531 }
1663 1532
1664 /* BSD 4.2 compatibility hack :-( */ 1533 if (mtu < ip_rt_min_pmtu)
1665 if (mtu == 0 && 1534 mtu = ip_rt_min_pmtu;
1666 old_mtu >= dst_mtu(&rth->dst) && 1535 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1667 old_mtu >= 68 + (iph->ihl << 2)) 1536 unsigned long pmtu_expires;
1668 old_mtu -= iph->ihl << 2;
1669 1537
1670 mtu = guess_mtu(old_mtu); 1538 pmtu_expires = jiffies + ip_rt_mtu_expires;
1671 } 1539 if (!pmtu_expires)
1672 if (mtu <= dst_mtu(&rth->dst)) { 1540 pmtu_expires = 1UL;
1673 if (mtu < dst_mtu(&rth->dst)) { 1541
1674 dst_confirm(&rth->dst); 1542 est_mtu = mtu;
1675 if (mtu < ip_rt_min_pmtu) { 1543 peer->pmtu_learned = mtu;
1676 u32 lock = dst_metric(&rth->dst, 1544 peer->pmtu_expires = pmtu_expires;
1677 RTAX_LOCK);
1678 mtu = ip_rt_min_pmtu;
1679 lock |= (1 << RTAX_MTU);
1680 dst_metric_set(&rth->dst, RTAX_LOCK,
1681 lock);
1682 }
1683 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1684 dst_set_expires(&rth->dst,
1685 ip_rt_mtu_expires);
1686 }
1687 est_mtu = mtu;
1688 }
1689 }
1690 rcu_read_unlock();
1691 } 1545 }
1546
1547 inet_putpeer(peer);
1548
1549 atomic_inc(&__rt_peer_genid);
1692 } 1550 }
1693 return est_mtu ? : new_mtu; 1551 return est_mtu ? : new_mtu;
1694} 1552}
1695 1553
1554static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1555{
1556 unsigned long expires = peer->pmtu_expires;
1557
1558 if (time_before(jiffies, expires)) {
1559 u32 orig_dst_mtu = dst_mtu(dst);
1560 if (peer->pmtu_learned < orig_dst_mtu) {
1561 if (!peer->pmtu_orig)
1562 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1563 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1564 }
1565 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1566 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1567}
1568
1696static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1569static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1697{ 1570{
1698 if (dst_mtu(dst) > mtu && mtu >= 68 && 1571 struct rtable *rt = (struct rtable *) dst;
1699 !(dst_metric_locked(dst, RTAX_MTU))) { 1572 struct inet_peer *peer;
1700 if (mtu < ip_rt_min_pmtu) { 1573
1701 u32 lock = dst_metric(dst, RTAX_LOCK); 1574 dst_confirm(dst);
1575
1576 if (!rt->peer)
1577 rt_bind_peer(rt, 1);
1578 peer = rt->peer;
1579 if (peer) {
1580 if (mtu < ip_rt_min_pmtu)
1702 mtu = ip_rt_min_pmtu; 1581 mtu = ip_rt_min_pmtu;
1703 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU)); 1582 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1583 unsigned long pmtu_expires;
1584
1585 pmtu_expires = jiffies + ip_rt_mtu_expires;
1586 if (!pmtu_expires)
1587 pmtu_expires = 1UL;
1588
1589 peer->pmtu_learned = mtu;
1590 peer->pmtu_expires = pmtu_expires;
1591
1592 atomic_inc(&__rt_peer_genid);
1593 rt->rt_peer_genid = rt_peer_genid();
1704 } 1594 }
1705 dst_metric_set(dst, RTAX_MTU, mtu); 1595 check_peer_pmtu(dst, peer);
1706 dst_set_expires(dst, ip_rt_mtu_expires); 1596 }
1707 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1597}
1598
1599static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1600{
1601 struct rtable *rt = (struct rtable *) dst;
1602 __be32 orig_gw = rt->rt_gateway;
1603
1604 dst_confirm(&rt->dst);
1605
1606 neigh_release(rt->dst.neighbour);
1607 rt->dst.neighbour = NULL;
1608
1609 rt->rt_gateway = peer->redirect_learned.a4;
1610 if (arp_bind_neighbour(&rt->dst) ||
1611 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1612 if (rt->dst.neighbour)
1613 neigh_event_send(rt->dst.neighbour, NULL);
1614 rt->rt_gateway = orig_gw;
1615 return -EAGAIN;
1616 } else {
1617 rt->rt_flags |= RTCF_REDIRECTED;
1618 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1619 rt->dst.neighbour);
1708 } 1620 }
1621 return 0;
1709} 1622}
1710 1623
1711static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1624static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1712{ 1625{
1713 if (rt_is_expired((struct rtable *)dst)) 1626 struct rtable *rt = (struct rtable *) dst;
1627
1628 if (rt_is_expired(rt))
1714 return NULL; 1629 return NULL;
1630 if (rt->rt_peer_genid != rt_peer_genid()) {
1631 struct inet_peer *peer;
1632
1633 if (!rt->peer)
1634 rt_bind_peer(rt, 0);
1635
1636 peer = rt->peer;
1637 if (peer && peer->pmtu_expires)
1638 check_peer_pmtu(dst, peer);
1639
1640 if (peer && peer->redirect_learned.a4 &&
1641 peer->redirect_learned.a4 != rt->rt_gateway) {
1642 if (check_peer_redir(dst, peer))
1643 return NULL;
1644 }
1645
1646 rt->rt_peer_genid = rt_peer_genid();
1647 }
1715 return dst; 1648 return dst;
1716} 1649}
1717 1650
@@ -1720,6 +1653,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1720 struct rtable *rt = (struct rtable *) dst; 1653 struct rtable *rt = (struct rtable *) dst;
1721 struct inet_peer *peer = rt->peer; 1654 struct inet_peer *peer = rt->peer;
1722 1655
1656 if (rt->fi) {
1657 fib_info_put(rt->fi);
1658 rt->fi = NULL;
1659 }
1723 if (peer) { 1660 if (peer) {
1724 rt->peer = NULL; 1661 rt->peer = NULL;
1725 inet_putpeer(peer); 1662 inet_putpeer(peer);
@@ -1734,8 +1671,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
1734 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1671 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1735 1672
1736 rt = skb_rtable(skb); 1673 rt = skb_rtable(skb);
1737 if (rt) 1674 if (rt &&
1738 dst_set_expires(&rt->dst, 0); 1675 rt->peer &&
1676 rt->peer->pmtu_expires) {
1677 unsigned long orig = rt->peer->pmtu_expires;
1678
1679 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1680 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1681 }
1739} 1682}
1740 1683
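
ipv4_link_failure() and check_peer_pmtu() both clear a pending expiry with cmpxchg(&peer->pmtu_expires, orig, 0), so when several paths race to restore the metric, exactly one of them wins and performs the write. The same one-shot-cancel idea expressed with standard C11 atomics (an approximation: the kernel uses its own cmpxchg(), not <stdatomic.h>):

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long pmtu_expires = 100;

    /* Returns 1 if this caller won the right to restore the metric. */
    static int cancel_pmtu(void)
    {
        unsigned long orig = atomic_load(&pmtu_expires);

        return orig != 0 &&
               atomic_compare_exchange_strong(&pmtu_expires, &orig, 0);
    }

    int main(void)
    {
        printf("%d\n", cancel_pmtu());    /* 1: we swapped 100 -> 0 */
        printf("%d\n", cancel_pmtu());    /* 0: already cancelled */
        return 0;
    }
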
1741static int ip_rt_bug(struct sk_buff *skb) 1684static int ip_rt_bug(struct sk_buff *skb)
@@ -1764,9 +1707,18 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1764 if (rt_is_output_route(rt)) 1707 if (rt_is_output_route(rt))
1765 src = rt->rt_src; 1708 src = rt->rt_src;
1766 else { 1709 else {
1710 struct flowi4 fl4 = {
1711 .daddr = rt->rt_key_dst,
1712 .saddr = rt->rt_key_src,
1713 .flowi4_tos = rt->rt_tos,
1714 .flowi4_oif = rt->rt_oif,
1715 .flowi4_iif = rt->rt_iif,
1716 .flowi4_mark = rt->rt_mark,
1717 };
1718
1767 rcu_read_lock(); 1719 rcu_read_lock();
1768 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) 1720 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1769 src = FIB_RES_PREFSRC(res); 1721 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1770 else 1722 else
1771 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1723 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1772 RT_SCOPE_UNIVERSE); 1724 RT_SCOPE_UNIVERSE);
@@ -1775,7 +1727,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1775 memcpy(addr, &src, 4); 1727 memcpy(addr, &src, 4);
1776} 1728}
1777 1729
1778#ifdef CONFIG_NET_CLS_ROUTE 1730#ifdef CONFIG_IP_ROUTE_CLASSID
1779static void set_class_tag(struct rtable *rt, u32 tag) 1731static void set_class_tag(struct rtable *rt, u32 tag)
1780{ 1732{
1781 if (!(rt->dst.tclassid & 0xFFFF)) 1733 if (!(rt->dst.tclassid & 0xFFFF))
@@ -1815,17 +1767,54 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1815 return mtu; 1767 return mtu;
1816} 1768}
1817 1769
1818static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1770static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4,
1771 struct fib_info *fi)
1772{
1773 struct inet_peer *peer;
1774 int create = 0;
1775
1776 /* If a peer entry exists for this destination, we must hook
1777 * it up in order to get at cached metrics.
1778 */
1779 if (oldflp4 && (oldflp4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1780 create = 1;
1781
1782 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1783 if (peer) {
1784 rt->rt_peer_genid = rt_peer_genid();
1785 if (inet_metrics_new(peer))
1786 memcpy(peer->metrics, fi->fib_metrics,
1787 sizeof(u32) * RTAX_MAX);
1788 dst_init_metrics(&rt->dst, peer->metrics, false);
1789
1790 if (peer->pmtu_expires)
1791 check_peer_pmtu(&rt->dst, peer);
1792 if (peer->redirect_learned.a4 &&
1793 peer->redirect_learned.a4 != rt->rt_gateway) {
1794 rt->rt_gateway = peer->redirect_learned.a4;
1795 rt->rt_flags |= RTCF_REDIRECTED;
1796 }
1797 } else {
1798 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1799 rt->fi = fi;
1800 atomic_inc(&fi->fib_clntref);
1801 }
1802 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1803 }
1804}
1805
1806static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4,
1807 const struct fib_result *res,
1808 struct fib_info *fi, u16 type, u32 itag)
1819{ 1809{
1820 struct dst_entry *dst = &rt->dst; 1810 struct dst_entry *dst = &rt->dst;
1821 struct fib_info *fi = res->fi;
1822 1811
1823 if (fi) { 1812 if (fi) {
1824 if (FIB_RES_GW(*res) && 1813 if (FIB_RES_GW(*res) &&
1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1814 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826 rt->rt_gateway = FIB_RES_GW(*res); 1815 rt->rt_gateway = FIB_RES_GW(*res);
1827 dst_import_metrics(dst, fi->fib_metrics); 1816 rt_init_metrics(rt, oldflp4, fi);
1828#ifdef CONFIG_NET_CLS_ROUTE 1817#ifdef CONFIG_IP_ROUTE_CLASSID
1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1818 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1830#endif 1819#endif
1831 } 1820 }
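
rt_init_metrics() above decides where a route's metrics array lives: with an inet_peer available, the metrics become a writable per-peer copy seeded from fi->fib_metrics; otherwise the route shares the fib_info array read-only (the final argument to dst_init_metrics() records which). A compressed userspace model of that split; the struct and function names here are illustrative, not the kernel API:

    #include <stdio.h>
    #include <string.h>

    #define RTAX_MAX 4

    struct route {
        unsigned int *metrics;
        int metrics_read_only;
    };

    /* Sketch of the rt_init_metrics() split. */
    static void init_metrics(struct route *rt, unsigned int *peer_metrics,
                             const unsigned int *fib_metrics)
    {
        if (peer_metrics) {
            /* writable per-peer copy, seeded from the fib defaults */
            memcpy(peer_metrics, fib_metrics,
                   sizeof(unsigned int) * RTAX_MAX);
            rt->metrics = peer_metrics;
            rt->metrics_read_only = 0;
        } else {
            /* share the fib array and refuse writes through it */
            rt->metrics = (unsigned int *)fib_metrics;
            rt->metrics_read_only = 1;
        }
    }

    int main(void)
    {
        unsigned int fib[RTAX_MAX] = { 1500, 0, 0, 0 };
        unsigned int peer[RTAX_MAX];
        struct route rt;

        init_metrics(&rt, peer, fib);
        rt.metrics[0] = 1400;                     /* fine: private copy */
        printf("%u %u\n", rt.metrics[0], fib[0]); /* 1400 1500 */
        return 0;
    }
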
@@ -1835,13 +1824,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) 1824 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); 1825 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1837 1826
1838#ifdef CONFIG_NET_CLS_ROUTE 1827#ifdef CONFIG_IP_ROUTE_CLASSID
1839#ifdef CONFIG_IP_MULTIPLE_TABLES 1828#ifdef CONFIG_IP_MULTIPLE_TABLES
1840 set_class_tag(rt, fib_rules_tclass(res)); 1829 set_class_tag(rt, fib_rules_tclass(res));
1841#endif 1830#endif
1842 set_class_tag(rt, itag); 1831 set_class_tag(rt, itag);
1843#endif 1832#endif
1844 rt->rt_type = res->type; 1833 rt->rt_type = type;
1834}
1835
1836static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
1837{
1838 struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1);
1839 if (rt) {
1840 rt->dst.obsolete = -1;
1841
1842 rt->dst.flags = DST_HOST |
1843 (nopolicy ? DST_NOPOLICY : 0) |
1844 (noxfrm ? DST_NOXFRM : 0);
1845 }
1846 return rt;
1845} 1847}
1846 1848
1847/* called in rcu_read_lock() section */ 1849/* called in rcu_read_lock() section */
@@ -1874,31 +1876,26 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1874 if (err < 0) 1876 if (err < 0)
1875 goto e_err; 1877 goto e_err;
1876 } 1878 }
1877 rth = dst_alloc(&ipv4_dst_ops); 1879 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1878 if (!rth) 1880 if (!rth)
1879 goto e_nobufs; 1881 goto e_nobufs;
1880 1882
1881 rth->dst.output = ip_rt_bug; 1883 rth->dst.output = ip_rt_bug;
1882 rth->dst.obsolete = -1;
1883 1884
1884 atomic_set(&rth->dst.__refcnt, 1); 1885 rth->rt_key_dst = daddr;
1885 rth->dst.flags= DST_HOST;
1886 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1887 rth->dst.flags |= DST_NOPOLICY;
1888 rth->fl.fl4_dst = daddr;
1889 rth->rt_dst = daddr; 1886 rth->rt_dst = daddr;
1890 rth->fl.fl4_tos = tos; 1887 rth->rt_tos = tos;
1891 rth->fl.mark = skb->mark; 1888 rth->rt_mark = skb->mark;
1892 rth->fl.fl4_src = saddr; 1889 rth->rt_key_src = saddr;
1893 rth->rt_src = saddr; 1890 rth->rt_src = saddr;
1894#ifdef CONFIG_NET_CLS_ROUTE 1891#ifdef CONFIG_IP_ROUTE_CLASSID
1895 rth->dst.tclassid = itag; 1892 rth->dst.tclassid = itag;
1896#endif 1893#endif
1897 rth->rt_iif = 1894 rth->rt_route_iif = dev->ifindex;
1898 rth->fl.iif = dev->ifindex; 1895 rth->rt_iif = dev->ifindex;
1899 rth->dst.dev = init_net.loopback_dev; 1896 rth->dst.dev = init_net.loopback_dev;
1900 dev_hold(rth->dst.dev); 1897 dev_hold(rth->dst.dev);
1901 rth->fl.oif = 0; 1898 rth->rt_oif = 0;
1902 rth->rt_gateway = daddr; 1899 rth->rt_gateway = daddr;
1903 rth->rt_spec_dst= spec_dst; 1900 rth->rt_spec_dst= spec_dst;
1904 rth->rt_genid = rt_genid(dev_net(dev)); 1901 rth->rt_genid = rt_genid(dev_net(dev));
@@ -1916,7 +1913,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1916 RT_CACHE_STAT_INC(in_slow_mc); 1913 RT_CACHE_STAT_INC(in_slow_mc);
1917 1914
1918 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1915 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1919 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); 1916 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1917 err = 0;
1918 if (IS_ERR(rth))
1919 err = PTR_ERR(rth);
1920 1920
1921e_nobufs: 1921e_nobufs:
1922 return -ENOBUFS; 1922 return -ENOBUFS;
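
A recurring change in this patch is that rt_intern_hash(), __mkroute_output() and the output-route entry points now return the rtable pointer itself and encode failures with ERR_PTR() instead of filling an out-parameter and returning an int. The trick is that errno values all fit in the last page of the address space, so one word can carry either a valid pointer or a negative error. A freestanding re-implementation of the three helpers (constants as on Linux; alloc_route is a made-up stand-in):

    #include <stdio.h>

    #define MAX_ERRNO  4095
    #define ENOBUFS    105          /* Linux value */

    static inline void *ERR_PTR(long error)
    {
        return (void *)error;       /* -4095..-1 cast to a pointer */
    }

    static inline long PTR_ERR(const void *ptr)
    {
        return (long)ptr;
    }

    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *alloc_route(int fail)
    {
        static int route = 42;

        return fail ? ERR_PTR(-ENOBUFS) : &route;
    }

    int main(void)
    {
        void *rt = alloc_route(1);

        if (IS_ERR(rt))
            printf("error %ld\n", PTR_ERR(rt));   /* error -105 */
        return 0;
    }
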
@@ -1959,7 +1959,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1959 1959
1960/* called in rcu_read_lock() section */ 1960/* called in rcu_read_lock() section */
1961static int __mkroute_input(struct sk_buff *skb, 1961static int __mkroute_input(struct sk_buff *skb,
1962 struct fib_result *res, 1962 const struct fib_result *res,
1963 struct in_device *in_dev, 1963 struct in_device *in_dev,
1964 __be32 daddr, __be32 saddr, u32 tos, 1964 __be32 daddr, __be32 saddr, u32 tos,
1965 struct rtable **result) 1965 struct rtable **result)
@@ -2013,39 +2013,32 @@ static int __mkroute_input(struct sk_buff *skb,
2013 } 2013 }
2014 } 2014 }
2015 2015
2016 2016 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2017 rth = dst_alloc(&ipv4_dst_ops); 2017 IN_DEV_CONF_GET(out_dev, NOXFRM));
2018 if (!rth) { 2018 if (!rth) {
2019 err = -ENOBUFS; 2019 err = -ENOBUFS;
2020 goto cleanup; 2020 goto cleanup;
2021 } 2021 }
2022 2022
2023 atomic_set(&rth->dst.__refcnt, 1); 2023 rth->rt_key_dst = daddr;
2024 rth->dst.flags= DST_HOST;
2025 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026 rth->dst.flags |= DST_NOPOLICY;
2027 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028 rth->dst.flags |= DST_NOXFRM;
2029 rth->fl.fl4_dst = daddr;
2030 rth->rt_dst = daddr; 2024 rth->rt_dst = daddr;
2031 rth->fl.fl4_tos = tos; 2025 rth->rt_tos = tos;
2032 rth->fl.mark = skb->mark; 2026 rth->rt_mark = skb->mark;
2033 rth->fl.fl4_src = saddr; 2027 rth->rt_key_src = saddr;
2034 rth->rt_src = saddr; 2028 rth->rt_src = saddr;
2035 rth->rt_gateway = daddr; 2029 rth->rt_gateway = daddr;
2036 rth->rt_iif = 2030 rth->rt_route_iif = in_dev->dev->ifindex;
2037 rth->fl.iif = in_dev->dev->ifindex; 2031 rth->rt_iif = in_dev->dev->ifindex;
2038 rth->dst.dev = (out_dev)->dev; 2032 rth->dst.dev = (out_dev)->dev;
2039 dev_hold(rth->dst.dev); 2033 dev_hold(rth->dst.dev);
2040 rth->fl.oif = 0; 2034 rth->rt_oif = 0;
2041 rth->rt_spec_dst= spec_dst; 2035 rth->rt_spec_dst= spec_dst;
2042 2036
2043 rth->dst.obsolete = -1;
2044 rth->dst.input = ip_forward; 2037 rth->dst.input = ip_forward;
2045 rth->dst.output = ip_output; 2038 rth->dst.output = ip_output;
2046 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2039 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2047 2040
2048 rt_set_nexthop(rth, res, itag); 2041 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2049 2042
2050 rth->rt_flags = flags; 2043 rth->rt_flags = flags;
2051 2044
@@ -2057,7 +2050,7 @@ static int __mkroute_input(struct sk_buff *skb,
2057 2050
2058static int ip_mkroute_input(struct sk_buff *skb, 2051static int ip_mkroute_input(struct sk_buff *skb,
2059 struct fib_result *res, 2052 struct fib_result *res,
2060 const struct flowi *fl, 2053 const struct flowi4 *fl4,
2061 struct in_device *in_dev, 2054 struct in_device *in_dev,
2062 __be32 daddr, __be32 saddr, u32 tos) 2055 __be32 daddr, __be32 saddr, u32 tos)
2063{ 2056{
@@ -2066,8 +2059,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
2066 unsigned hash; 2059 unsigned hash;
2067 2060
2068#ifdef CONFIG_IP_ROUTE_MULTIPATH 2061#ifdef CONFIG_IP_ROUTE_MULTIPATH
2069 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) 2062 if (res->fi && res->fi->fib_nhs > 1)
2070 fib_select_multipath(fl, res); 2063 fib_select_multipath(res);
2071#endif 2064#endif
2072 2065
2073 /* create a routing cache entry */ 2066 /* create a routing cache entry */
@@ -2076,9 +2069,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
2076 return err; 2069 return err;
2077 2070
2078 /* put it into the cache */ 2071 /* put it into the cache */
2079 hash = rt_hash(daddr, saddr, fl->iif, 2072 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2080 rt_genid(dev_net(rth->dst.dev))); 2073 rt_genid(dev_net(rth->dst.dev)));
2081 return rt_intern_hash(hash, rth, NULL, skb, fl->iif); 2074 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2075 if (IS_ERR(rth))
2076 return PTR_ERR(rth);
2077 return 0;
2082} 2078}
2083 2079
2084/* 2080/*
@@ -2097,12 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2097{ 2093{
2098 struct fib_result res; 2094 struct fib_result res;
2099 struct in_device *in_dev = __in_dev_get_rcu(dev); 2095 struct in_device *in_dev = __in_dev_get_rcu(dev);
2100 struct flowi fl = { .fl4_dst = daddr, 2096 struct flowi4 fl4;
2101 .fl4_src = saddr,
2102 .fl4_tos = tos,
2103 .fl4_scope = RT_SCOPE_UNIVERSE,
2104 .mark = skb->mark,
2105 .iif = dev->ifindex };
2106 unsigned flags = 0; 2097 unsigned flags = 0;
2107 u32 itag = 0; 2098 u32 itag = 0;
2108 struct rtable * rth; 2099 struct rtable * rth;
@@ -2139,7 +2130,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2139 /* 2130 /*
2140 * Now we are ready to route packet. 2131 * Now we are ready to route packet.
2141 */ 2132 */
2142 err = fib_lookup(net, &fl, &res); 2133 fl4.flowi4_oif = 0;
2134 fl4.flowi4_iif = dev->ifindex;
2135 fl4.flowi4_mark = skb->mark;
2136 fl4.flowi4_tos = tos;
2137 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2138 fl4.daddr = daddr;
2139 fl4.saddr = saddr;
2140 err = fib_lookup(net, &fl4, &res);
2143 if (err != 0) { 2141 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2142 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2143 goto e_hostunreach;
@@ -2168,7 +2166,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2168 if (res.type != RTN_UNICAST) 2166 if (res.type != RTN_UNICAST)
2169 goto martian_destination; 2167 goto martian_destination;
2170 2168
2171 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2169 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2172out: return err; 2170out: return err;
2173 2171
2174brd_input: 2172brd_input:
@@ -2190,29 +2188,24 @@ brd_input:
2190 RT_CACHE_STAT_INC(in_brd); 2188 RT_CACHE_STAT_INC(in_brd);
2191 2189
2192local_input: 2190local_input:
2193 rth = dst_alloc(&ipv4_dst_ops); 2191 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2194 if (!rth) 2192 if (!rth)
2195 goto e_nobufs; 2193 goto e_nobufs;
2196 2194
2197 rth->dst.output= ip_rt_bug; 2195 rth->dst.output= ip_rt_bug;
2198 rth->dst.obsolete = -1;
2199 rth->rt_genid = rt_genid(net); 2196 rth->rt_genid = rt_genid(net);
2200 2197
2201 atomic_set(&rth->dst.__refcnt, 1); 2198 rth->rt_key_dst = daddr;
2202 rth->dst.flags= DST_HOST;
2203 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2204 rth->dst.flags |= DST_NOPOLICY;
2205 rth->fl.fl4_dst = daddr;
2206 rth->rt_dst = daddr; 2199 rth->rt_dst = daddr;
2207 rth->fl.fl4_tos = tos; 2200 rth->rt_tos = tos;
2208 rth->fl.mark = skb->mark; 2201 rth->rt_mark = skb->mark;
2209 rth->fl.fl4_src = saddr; 2202 rth->rt_key_src = saddr;
2210 rth->rt_src = saddr; 2203 rth->rt_src = saddr;
2211#ifdef CONFIG_NET_CLS_ROUTE 2204#ifdef CONFIG_IP_ROUTE_CLASSID
2212 rth->dst.tclassid = itag; 2205 rth->dst.tclassid = itag;
2213#endif 2206#endif
2214 rth->rt_iif = 2207 rth->rt_route_iif = dev->ifindex;
2215 rth->fl.iif = dev->ifindex; 2208 rth->rt_iif = dev->ifindex;
2216 rth->dst.dev = net->loopback_dev; 2209 rth->dst.dev = net->loopback_dev;
2217 dev_hold(rth->dst.dev); 2210 dev_hold(rth->dst.dev);
2218 rth->rt_gateway = daddr; 2211 rth->rt_gateway = daddr;
@@ -2225,8 +2218,11 @@ local_input:
2225 rth->rt_flags &= ~RTCF_LOCAL; 2218 rth->rt_flags &= ~RTCF_LOCAL;
2226 } 2219 }
2227 rth->rt_type = res.type; 2220 rth->rt_type = res.type;
2228 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2221 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2229 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2222 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2223 err = 0;
2224 if (IS_ERR(rth))
2225 err = PTR_ERR(rth);
2230 goto out; 2226 goto out;
2231 2227
2232no_route: 2228no_route:
@@ -2288,12 +2284,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2288 2284
2289 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2285 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2290 rth = rcu_dereference(rth->dst.rt_next)) { 2286 rth = rcu_dereference(rth->dst.rt_next)) {
2291 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | 2287 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2292 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | 2288 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2293 (rth->fl.iif ^ iif) | 2289 (rth->rt_iif ^ iif) |
2294 rth->fl.oif | 2290 rth->rt_oif |
2295 (rth->fl.fl4_tos ^ tos)) == 0 && 2291 (rth->rt_tos ^ tos)) == 0 &&
2296 rth->fl.mark == skb->mark && 2292 rth->rt_mark == skb->mark &&
2297 net_eq(dev_net(rth->dst.dev), net) && 2293 net_eq(dev_net(rth->dst.dev), net) &&
2298 !rt_is_expired(rth)) { 2294 !rt_is_expired(rth)) {
2299 if (noref) { 2295 if (noref) {
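
The input-cache walk above tests five key fields with a single branch by OR-ing together their XORs; the fold is zero only when every field matches (rt_oif enters raw because an input route must have oif == 0). A standalone demonstration, including the masked TOS comparison used later on the output path (the mask value is illustrative of IPTOS_RT_MASK | RTO_ONLINK):

    #include <stdio.h>
    #include <stdint.h>

    struct key { uint32_t dst, src, iif, oif, tos; };

    /* Zero iff every field of a matches b and a->oif is zero. */
    static uint32_t key_mismatch(const struct key *a, const struct key *b)
    {
        return (a->dst ^ b->dst) |
               (a->src ^ b->src) |
               (a->iif ^ b->iif) |
               a->oif |                 /* input routes carry oif == 0 */
               (a->tos ^ b->tos);
    }

    int main(void)
    {
        struct key cached = { 1, 2, 3, 0, 0x10 };
        struct key wanted = { 1, 2, 3, 0, 0x10 };
        uint32_t a_tos = 0x12, b_tos = 0x32, mask = 0x1d;

        printf("%s\n", key_mismatch(&cached, &wanted) ? "miss" : "hit");
        /* masked compare: bits outside the mask are ignored */
        printf("%s\n", ((a_tos ^ b_tos) & mask) ? "miss" : "hit");
        return 0;
    }
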
@@ -2326,8 +2322,8 @@ skip_cache:
2326 struct in_device *in_dev = __in_dev_get_rcu(dev); 2322 struct in_device *in_dev = __in_dev_get_rcu(dev);
2327 2323
2328 if (in_dev) { 2324 if (in_dev) {
2329 int our = ip_check_mc(in_dev, daddr, saddr, 2325 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2330 ip_hdr(skb)->protocol); 2326 ip_hdr(skb)->protocol);
2331 if (our 2327 if (our
2332#ifdef CONFIG_IP_MROUTE 2328#ifdef CONFIG_IP_MROUTE
2333 || 2329 ||
@@ -2351,98 +2347,92 @@ skip_cache:
2351EXPORT_SYMBOL(ip_route_input_common); 2347EXPORT_SYMBOL(ip_route_input_common);
2352 2348
2353/* called with rcu_read_lock() */ 2349/* called with rcu_read_lock() */
2354static int __mkroute_output(struct rtable **result, 2350static struct rtable *__mkroute_output(const struct fib_result *res,
2355 struct fib_result *res, 2351 const struct flowi4 *fl4,
2356 const struct flowi *fl, 2352 const struct flowi4 *oldflp4,
2357 const struct flowi *oldflp, 2353 struct net_device *dev_out,
2358 struct net_device *dev_out, 2354 unsigned int flags)
2359 unsigned flags)
2360{ 2355{
2361 struct rtable *rth; 2356 struct fib_info *fi = res->fi;
2357 u32 tos = RT_FL_TOS(oldflp4);
2362 struct in_device *in_dev; 2358 struct in_device *in_dev;
2363 u32 tos = RT_FL_TOS(oldflp); 2359 u16 type = res->type;
2360 struct rtable *rth;
2364 2361
2365 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) 2362 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2366 return -EINVAL; 2363 return ERR_PTR(-EINVAL);
2367 2364
2368 if (ipv4_is_lbcast(fl->fl4_dst)) 2365 if (ipv4_is_lbcast(fl4->daddr))
2369 res->type = RTN_BROADCAST; 2366 type = RTN_BROADCAST;
2370 else if (ipv4_is_multicast(fl->fl4_dst)) 2367 else if (ipv4_is_multicast(fl4->daddr))
2371 res->type = RTN_MULTICAST; 2368 type = RTN_MULTICAST;
2372 else if (ipv4_is_zeronet(fl->fl4_dst)) 2369 else if (ipv4_is_zeronet(fl4->daddr))
2373 return -EINVAL; 2370 return ERR_PTR(-EINVAL);
2374 2371
2375 if (dev_out->flags & IFF_LOOPBACK) 2372 if (dev_out->flags & IFF_LOOPBACK)
2376 flags |= RTCF_LOCAL; 2373 flags |= RTCF_LOCAL;
2377 2374
2378 in_dev = __in_dev_get_rcu(dev_out); 2375 in_dev = __in_dev_get_rcu(dev_out);
2379 if (!in_dev) 2376 if (!in_dev)
2380 return -EINVAL; 2377 return ERR_PTR(-EINVAL);
2381 2378
2382 if (res->type == RTN_BROADCAST) { 2379 if (type == RTN_BROADCAST) {
2383 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2380 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384 res->fi = NULL; 2381 fi = NULL;
2385 } else if (res->type == RTN_MULTICAST) { 2382 } else if (type == RTN_MULTICAST) {
2386 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2383 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2387 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2384 if (!ip_check_mc_rcu(in_dev, oldflp4->daddr, oldflp4->saddr,
2388 oldflp->proto)) 2385 oldflp4->flowi4_proto))
2389 flags &= ~RTCF_LOCAL; 2386 flags &= ~RTCF_LOCAL;
2390 /* If multicast route do not exist use 2387 /* If multicast route do not exist use
2391 * default one, but do not gateway in this case. 2388 * default one, but do not gateway in this case.
2392 * Yes, it is hack. 2389 * Yes, it is hack.
2393 */ 2390 */
2394 if (res->fi && res->prefixlen < 4) 2391 if (fi && res->prefixlen < 4)
2395 res->fi = NULL; 2392 fi = NULL;
2396 } 2393 }
2397 2394
2398 2395 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2399 rth = dst_alloc(&ipv4_dst_ops); 2396 IN_DEV_CONF_GET(in_dev, NOXFRM));
2400 if (!rth) 2397 if (!rth)
2401 return -ENOBUFS; 2398 return ERR_PTR(-ENOBUFS);
2402 2399
2403 atomic_set(&rth->dst.__refcnt, 1); 2400 rth->rt_key_dst = oldflp4->daddr;
2404 rth->dst.flags= DST_HOST; 2401 rth->rt_tos = tos;
2405 if (IN_DEV_CONF_GET(in_dev, NOXFRM)) 2402 rth->rt_key_src = oldflp4->saddr;
2406 rth->dst.flags |= DST_NOXFRM; 2403 rth->rt_oif = oldflp4->flowi4_oif;
2407 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2404 rth->rt_mark = oldflp4->flowi4_mark;
2408 rth->dst.flags |= DST_NOPOLICY; 2405 rth->rt_dst = fl4->daddr;
2409 2406 rth->rt_src = fl4->saddr;
2410 rth->fl.fl4_dst = oldflp->fl4_dst; 2407 rth->rt_route_iif = 0;
2411 rth->fl.fl4_tos = tos; 2408 rth->rt_iif = oldflp4->flowi4_oif ? : dev_out->ifindex;
2412 rth->fl.fl4_src = oldflp->fl4_src;
2413 rth->fl.oif = oldflp->oif;
2414 rth->fl.mark = oldflp->mark;
2415 rth->rt_dst = fl->fl4_dst;
2416 rth->rt_src = fl->fl4_src;
2417 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2418 /* get references to the devices that are to be hold by the routing 2409 /* get references to the devices that are to be hold by the routing
2419 cache entry */ 2410 cache entry */
2420 rth->dst.dev = dev_out; 2411 rth->dst.dev = dev_out;
2421 dev_hold(dev_out); 2412 dev_hold(dev_out);
2422 rth->rt_gateway = fl->fl4_dst; 2413 rth->rt_gateway = fl4->daddr;
2423 rth->rt_spec_dst= fl->fl4_src; 2414 rth->rt_spec_dst= fl4->saddr;
2424 2415
2425 rth->dst.output=ip_output; 2416 rth->dst.output=ip_output;
2426 rth->dst.obsolete = -1;
2427 rth->rt_genid = rt_genid(dev_net(dev_out)); 2417 rth->rt_genid = rt_genid(dev_net(dev_out));
2428 2418
2429 RT_CACHE_STAT_INC(out_slow_tot); 2419 RT_CACHE_STAT_INC(out_slow_tot);
2430 2420
2431 if (flags & RTCF_LOCAL) { 2421 if (flags & RTCF_LOCAL) {
2432 rth->dst.input = ip_local_deliver; 2422 rth->dst.input = ip_local_deliver;
2433 rth->rt_spec_dst = fl->fl4_dst; 2423 rth->rt_spec_dst = fl4->daddr;
2434 } 2424 }
2435 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2425 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2436 rth->rt_spec_dst = fl->fl4_src; 2426 rth->rt_spec_dst = fl4->saddr;
2437 if (flags & RTCF_LOCAL && 2427 if (flags & RTCF_LOCAL &&
2438 !(dev_out->flags & IFF_LOOPBACK)) { 2428 !(dev_out->flags & IFF_LOOPBACK)) {
2439 rth->dst.output = ip_mc_output; 2429 rth->dst.output = ip_mc_output;
2440 RT_CACHE_STAT_INC(out_slow_mc); 2430 RT_CACHE_STAT_INC(out_slow_mc);
2441 } 2431 }
2442#ifdef CONFIG_IP_MROUTE 2432#ifdef CONFIG_IP_MROUTE
2443 if (res->type == RTN_MULTICAST) { 2433 if (type == RTN_MULTICAST) {
2444 if (IN_DEV_MFORWARD(in_dev) && 2434 if (IN_DEV_MFORWARD(in_dev) &&
2445 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2435 !ipv4_is_local_multicast(oldflp4->daddr)) {
2446 rth->dst.input = ip_mr_input; 2436 rth->dst.input = ip_mr_input;
2447 rth->dst.output = ip_mc_output; 2437 rth->dst.output = ip_mc_output;
2448 } 2438 }
@@ -2450,31 +2440,10 @@ static int __mkroute_output(struct rtable **result,
2450#endif 2440#endif
2451 } 2441 }
2452 2442
2453 rt_set_nexthop(rth, res, 0); 2443 rt_set_nexthop(rth, oldflp4, res, fi, type, 0);
2454 2444
2455 rth->rt_flags = flags; 2445 rth->rt_flags = flags;
2456 *result = rth; 2446 return rth;
2457 return 0;
2458}
2459
2460/* called with rcu_read_lock() */
2461static int ip_mkroute_output(struct rtable **rp,
2462 struct fib_result *res,
2463 const struct flowi *fl,
2464 const struct flowi *oldflp,
2465 struct net_device *dev_out,
2466 unsigned flags)
2467{
2468 struct rtable *rth = NULL;
2469 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2470 unsigned hash;
2471 if (err == 0) {
2472 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2473 rt_genid(dev_net(dev_out)));
2474 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2475 }
2476
2477 return err;
2478} 2447}
2479 2448
2480/* 2449/*
@@ -2482,34 +2451,36 @@ static int ip_mkroute_output(struct rtable **rp,
2482 * called with rcu_read_lock(); 2451 * called with rcu_read_lock();
2483 */ 2452 */
2484 2453
2485static int ip_route_output_slow(struct net *net, struct rtable **rp, 2454static struct rtable *ip_route_output_slow(struct net *net,
2486 const struct flowi *oldflp) 2455 const struct flowi4 *oldflp4)
2487{ 2456{
2488 u32 tos = RT_FL_TOS(oldflp); 2457 u32 tos = RT_FL_TOS(oldflp4);
2489 struct flowi fl = { .fl4_dst = oldflp->fl4_dst, 2458 struct flowi4 fl4;
2490 .fl4_src = oldflp->fl4_src,
2491 .fl4_tos = tos & IPTOS_RT_MASK,
2492 .fl4_scope = ((tos & RTO_ONLINK) ?
2493 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2494 .mark = oldflp->mark,
2495 .iif = net->loopback_dev->ifindex,
2496 .oif = oldflp->oif };
2497 struct fib_result res; 2459 struct fib_result res;
2498 unsigned int flags = 0; 2460 unsigned int flags = 0;
2499 struct net_device *dev_out = NULL; 2461 struct net_device *dev_out = NULL;
2500 int err; 2462 struct rtable *rth;
2501
2502 2463
2503 res.fi = NULL; 2464 res.fi = NULL;
2504#ifdef CONFIG_IP_MULTIPLE_TABLES 2465#ifdef CONFIG_IP_MULTIPLE_TABLES
2505 res.r = NULL; 2466 res.r = NULL;
2506#endif 2467#endif
2507 2468
2508 if (oldflp->fl4_src) { 2469 fl4.flowi4_oif = oldflp4->flowi4_oif;
2509 err = -EINVAL; 2470 fl4.flowi4_iif = net->loopback_dev->ifindex;
2510 if (ipv4_is_multicast(oldflp->fl4_src) || 2471 fl4.flowi4_mark = oldflp4->flowi4_mark;
2511 ipv4_is_lbcast(oldflp->fl4_src) || 2472 fl4.daddr = oldflp4->daddr;
2512 ipv4_is_zeronet(oldflp->fl4_src)) 2473 fl4.saddr = oldflp4->saddr;
2474 fl4.flowi4_tos = tos & IPTOS_RT_MASK;
2475 fl4.flowi4_scope = ((tos & RTO_ONLINK) ?
2476 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2477
2478 rcu_read_lock();
2479 if (oldflp4->saddr) {
2480 rth = ERR_PTR(-EINVAL);
2481 if (ipv4_is_multicast(oldflp4->saddr) ||
2482 ipv4_is_lbcast(oldflp4->saddr) ||
2483 ipv4_is_zeronet(oldflp4->saddr))
2513 goto out; 2484 goto out;
2514 2485
2515 /* I removed check for oif == dev_out->oif here. 2486 /* I removed check for oif == dev_out->oif here.
@@ -2520,11 +2491,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2520 of another iface. --ANK 2491 of another iface. --ANK
2521 */ 2492 */
2522 2493
2523 if (oldflp->oif == 0 && 2494 if (oldflp4->flowi4_oif == 0 &&
2524 (ipv4_is_multicast(oldflp->fl4_dst) || 2495 (ipv4_is_multicast(oldflp4->daddr) ||
2525 ipv4_is_lbcast(oldflp->fl4_dst))) { 2496 ipv4_is_lbcast(oldflp4->daddr))) {
2526 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2497 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2527 dev_out = __ip_dev_find(net, oldflp->fl4_src, false); 2498 dev_out = __ip_dev_find(net, oldflp4->saddr, false);
2528 if (dev_out == NULL) 2499 if (dev_out == NULL)
2529 goto out; 2500 goto out;
2530 2501
@@ -2543,60 +2514,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2543 Luckily, this hack is good workaround. 2514 Luckily, this hack is good workaround.
2544 */ 2515 */
2545 2516
2546 fl.oif = dev_out->ifindex; 2517 fl4.flowi4_oif = dev_out->ifindex;
2547 goto make_route; 2518 goto make_route;
2548 } 2519 }
2549 2520
2550 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2521 if (!(oldflp4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2551 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2522 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2552 if (!__ip_dev_find(net, oldflp->fl4_src, false)) 2523 if (!__ip_dev_find(net, oldflp4->saddr, false))
2553 goto out; 2524 goto out;
2554 } 2525 }
2555 } 2526 }
2556 2527
2557 2528
2558 if (oldflp->oif) { 2529 if (oldflp4->flowi4_oif) {
2559 dev_out = dev_get_by_index_rcu(net, oldflp->oif); 2530 dev_out = dev_get_by_index_rcu(net, oldflp4->flowi4_oif);
2560 err = -ENODEV; 2531 rth = ERR_PTR(-ENODEV);
2561 if (dev_out == NULL) 2532 if (dev_out == NULL)
2562 goto out; 2533 goto out;
2563 2534
2564 /* RACE: Check return value of inet_select_addr instead. */ 2535 /* RACE: Check return value of inet_select_addr instead. */
2565 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2536 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2566 err = -ENETUNREACH; 2537 rth = ERR_PTR(-ENETUNREACH);
2567 goto out; 2538 goto out;
2568 } 2539 }
2569 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2540 if (ipv4_is_local_multicast(oldflp4->daddr) ||
2570 ipv4_is_lbcast(oldflp->fl4_dst)) { 2541 ipv4_is_lbcast(oldflp4->daddr)) {
2571 if (!fl.fl4_src) 2542 if (!fl4.saddr)
2572 fl.fl4_src = inet_select_addr(dev_out, 0, 2543 fl4.saddr = inet_select_addr(dev_out, 0,
2573 RT_SCOPE_LINK); 2544 RT_SCOPE_LINK);
2574 goto make_route; 2545 goto make_route;
2575 } 2546 }
2576 if (!fl.fl4_src) { 2547 if (!fl4.saddr) {
2577 if (ipv4_is_multicast(oldflp->fl4_dst)) 2548 if (ipv4_is_multicast(oldflp4->daddr))
2578 fl.fl4_src = inet_select_addr(dev_out, 0, 2549 fl4.saddr = inet_select_addr(dev_out, 0,
2579 fl.fl4_scope); 2550 fl4.flowi4_scope);
2580 else if (!oldflp->fl4_dst) 2551 else if (!oldflp4->daddr)
2581 fl.fl4_src = inet_select_addr(dev_out, 0, 2552 fl4.saddr = inet_select_addr(dev_out, 0,
2582 RT_SCOPE_HOST); 2553 RT_SCOPE_HOST);
2583 } 2554 }
2584 } 2555 }
2585 2556
2586 if (!fl.fl4_dst) { 2557 if (!fl4.daddr) {
2587 fl.fl4_dst = fl.fl4_src; 2558 fl4.daddr = fl4.saddr;
2588 if (!fl.fl4_dst) 2559 if (!fl4.daddr)
2589 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2560 fl4.daddr = fl4.saddr = htonl(INADDR_LOOPBACK);
2590 dev_out = net->loopback_dev; 2561 dev_out = net->loopback_dev;
2591 fl.oif = net->loopback_dev->ifindex; 2562 fl4.flowi4_oif = net->loopback_dev->ifindex;
2592 res.type = RTN_LOCAL; 2563 res.type = RTN_LOCAL;
2593 flags |= RTCF_LOCAL; 2564 flags |= RTCF_LOCAL;
2594 goto make_route; 2565 goto make_route;
2595 } 2566 }
2596 2567
2597 if (fib_lookup(net, &fl, &res)) { 2568 if (fib_lookup(net, &fl4, &res)) {
2598 res.fi = NULL; 2569 res.fi = NULL;
2599 if (oldflp->oif) { 2570 if (oldflp4->flowi4_oif) {
2600 /* Apparently, routing tables are wrong. Assume, 2571 /* Apparently, routing tables are wrong. Assume,
2601 that the destination is on link. 2572 that the destination is on link.
2602 2573
@@ -2615,90 +2586,93 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2615 likely IPv6, but we do not. 2586 likely IPv6, but we do not.
2616 */ 2587 */
2617 2588
2618 if (fl.fl4_src == 0) 2589 if (fl4.saddr == 0)
2619 fl.fl4_src = inet_select_addr(dev_out, 0, 2590 fl4.saddr = inet_select_addr(dev_out, 0,
2620 RT_SCOPE_LINK); 2591 RT_SCOPE_LINK);
2621 res.type = RTN_UNICAST; 2592 res.type = RTN_UNICAST;
2622 goto make_route; 2593 goto make_route;
2623 } 2594 }
2624 err = -ENETUNREACH; 2595 rth = ERR_PTR(-ENETUNREACH);
2625 goto out; 2596 goto out;
2626 } 2597 }
2627 2598
2628 if (res.type == RTN_LOCAL) { 2599 if (res.type == RTN_LOCAL) {
2629 if (!fl.fl4_src) { 2600 if (!fl4.saddr) {
2630 if (res.fi->fib_prefsrc) 2601 if (res.fi->fib_prefsrc)
2631 fl.fl4_src = res.fi->fib_prefsrc; 2602 fl4.saddr = res.fi->fib_prefsrc;
2632 else 2603 else
2633 fl.fl4_src = fl.fl4_dst; 2604 fl4.saddr = fl4.daddr;
2634 } 2605 }
2635 dev_out = net->loopback_dev; 2606 dev_out = net->loopback_dev;
2636 fl.oif = dev_out->ifindex; 2607 fl4.flowi4_oif = dev_out->ifindex;
2637 res.fi = NULL; 2608 res.fi = NULL;
2638 flags |= RTCF_LOCAL; 2609 flags |= RTCF_LOCAL;
2639 goto make_route; 2610 goto make_route;
2640 } 2611 }
2641 2612
2642#ifdef CONFIG_IP_ROUTE_MULTIPATH 2613#ifdef CONFIG_IP_ROUTE_MULTIPATH
2643 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2614 if (res.fi->fib_nhs > 1 && fl4.flowi4_oif == 0)
2644 fib_select_multipath(&fl, &res); 2615 fib_select_multipath(&res);
2645 else 2616 else
2646#endif 2617#endif
2647 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2618 if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif)
2648 fib_select_default(net, &fl, &res); 2619 fib_select_default(&res);
2649 2620
2650 if (!fl.fl4_src) 2621 if (!fl4.saddr)
2651 fl.fl4_src = FIB_RES_PREFSRC(res); 2622 fl4.saddr = FIB_RES_PREFSRC(net, res);
2652 2623
2653 dev_out = FIB_RES_DEV(res); 2624 dev_out = FIB_RES_DEV(res);
2654 fl.oif = dev_out->ifindex; 2625 fl4.flowi4_oif = dev_out->ifindex;
2655 2626
2656 2627
2657make_route: 2628make_route:
2658 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2629 rth = __mkroute_output(&res, &fl4, oldflp4, dev_out, flags);
2630 if (!IS_ERR(rth)) {
2631 unsigned int hash;
2659 2632
2660out: return err; 2633 hash = rt_hash(oldflp4->daddr, oldflp4->saddr, oldflp4->flowi4_oif,
2634 rt_genid(dev_net(dev_out)));
2635 rth = rt_intern_hash(hash, rth, NULL, oldflp4->flowi4_oif);
2636 }
2637
2638out:
2639 rcu_read_unlock();
2640 return rth;
2661} 2641}
2662 2642
2663int __ip_route_output_key(struct net *net, struct rtable **rp, 2643struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4)
2664 const struct flowi *flp)
2665{ 2644{
2666 unsigned int hash;
2667 int res;
2668 struct rtable *rth; 2645 struct rtable *rth;
2646 unsigned int hash;
2669 2647
2670 if (!rt_caching(net)) 2648 if (!rt_caching(net))
2671 goto slow_output; 2649 goto slow_output;
2672 2650
2673 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2651 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2674 2652
2675 rcu_read_lock_bh(); 2653 rcu_read_lock_bh();
2676 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2654 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2677 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2655 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2678 if (rth->fl.fl4_dst == flp->fl4_dst && 2656 if (rth->rt_key_dst == flp4->daddr &&
2679 rth->fl.fl4_src == flp->fl4_src && 2657 rth->rt_key_src == flp4->saddr &&
2680 rt_is_output_route(rth) && 2658 rt_is_output_route(rth) &&
2681 rth->fl.oif == flp->oif && 2659 rth->rt_oif == flp4->flowi4_oif &&
2682 rth->fl.mark == flp->mark && 2660 rth->rt_mark == flp4->flowi4_mark &&
2683 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2661 !((rth->rt_tos ^ flp4->flowi4_tos) &
2684 (IPTOS_RT_MASK | RTO_ONLINK)) && 2662 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2685 net_eq(dev_net(rth->dst.dev), net) && 2663 net_eq(dev_net(rth->dst.dev), net) &&
2686 !rt_is_expired(rth)) { 2664 !rt_is_expired(rth)) {
2687 dst_use(&rth->dst, jiffies); 2665 dst_use(&rth->dst, jiffies);
2688 RT_CACHE_STAT_INC(out_hit); 2666 RT_CACHE_STAT_INC(out_hit);
2689 rcu_read_unlock_bh(); 2667 rcu_read_unlock_bh();
2690 *rp = rth; 2668 return rth;
2691 return 0;
2692 } 2669 }
2693 RT_CACHE_STAT_INC(out_hlist_search); 2670 RT_CACHE_STAT_INC(out_hlist_search);
2694 } 2671 }
2695 rcu_read_unlock_bh(); 2672 rcu_read_unlock_bh();
2696 2673
2697slow_output: 2674slow_output:
2698 rcu_read_lock(); 2675 return ip_route_output_slow(net, flp4);
2699 res = ip_route_output_slow(net, rp, flp);
2700 rcu_read_unlock();
2701 return res;
2702} 2676}
2703EXPORT_SYMBOL_GPL(__ip_route_output_key); 2677EXPORT_SYMBOL_GPL(__ip_route_output_key);
2704 2678
@@ -2716,6 +2690,12 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2716{ 2690{
2717} 2691}
2718 2692
2693static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2694 unsigned long old)
2695{
2696 return NULL;
2697}
2698
2719static struct dst_ops ipv4_dst_blackhole_ops = { 2699static struct dst_ops ipv4_dst_blackhole_ops = {
2720 .family = AF_INET, 2700 .family = AF_INET,
2721 .protocol = cpu_to_be16(ETH_P_IP), 2701 .protocol = cpu_to_be16(ETH_P_IP),
@@ -2724,19 +2704,17 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2724 .default_mtu = ipv4_blackhole_default_mtu, 2704 .default_mtu = ipv4_blackhole_default_mtu,
2725 .default_advmss = ipv4_default_advmss, 2705 .default_advmss = ipv4_default_advmss,
2726 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2706 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2707 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2727}; 2708};
2728 2709
2729 2710struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2730static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2731{ 2711{
2732 struct rtable *ort = *rp; 2712 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, 1);
2733 struct rtable *rt = (struct rtable *) 2713 struct rtable *ort = (struct rtable *) dst_orig;
2734 dst_alloc(&ipv4_dst_blackhole_ops);
2735 2714
2736 if (rt) { 2715 if (rt) {
2737 struct dst_entry *new = &rt->dst; 2716 struct dst_entry *new = &rt->dst;
2738 2717
2739 atomic_set(&new->__refcnt, 1);
2740 new->__use = 1; 2718 new->__use = 1;
2741 new->input = dst_discard; 2719 new->input = dst_discard;
2742 new->output = dst_discard; 2720 new->output = dst_discard;
@@ -2746,59 +2724,58 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2746 if (new->dev) 2724 if (new->dev)
2747 dev_hold(new->dev); 2725 dev_hold(new->dev);
2748 2726
2749 rt->fl = ort->fl; 2727 rt->rt_key_dst = ort->rt_key_dst;
2728 rt->rt_key_src = ort->rt_key_src;
2729 rt->rt_tos = ort->rt_tos;
2730 rt->rt_route_iif = ort->rt_route_iif;
2731 rt->rt_iif = ort->rt_iif;
2732 rt->rt_oif = ort->rt_oif;
2733 rt->rt_mark = ort->rt_mark;
2750 2734
2751 rt->rt_genid = rt_genid(net); 2735 rt->rt_genid = rt_genid(net);
2752 rt->rt_flags = ort->rt_flags; 2736 rt->rt_flags = ort->rt_flags;
2753 rt->rt_type = ort->rt_type; 2737 rt->rt_type = ort->rt_type;
2754 rt->rt_dst = ort->rt_dst; 2738 rt->rt_dst = ort->rt_dst;
2755 rt->rt_src = ort->rt_src; 2739 rt->rt_src = ort->rt_src;
2756 rt->rt_iif = ort->rt_iif;
2757 rt->rt_gateway = ort->rt_gateway; 2740 rt->rt_gateway = ort->rt_gateway;
2758 rt->rt_spec_dst = ort->rt_spec_dst; 2741 rt->rt_spec_dst = ort->rt_spec_dst;
2759 rt->peer = ort->peer; 2742 rt->peer = ort->peer;
2760 if (rt->peer) 2743 if (rt->peer)
2761 atomic_inc(&rt->peer->refcnt); 2744 atomic_inc(&rt->peer->refcnt);
2745 rt->fi = ort->fi;
2746 if (rt->fi)
2747 atomic_inc(&rt->fi->fib_clntref);
2762 2748
2763 dst_free(new); 2749 dst_free(new);
2764 } 2750 }
2765 2751
2766 dst_release(&(*rp)->dst); 2752 dst_release(dst_orig);
2767 *rp = rt; 2753
2768 return rt ? 0 : -ENOMEM; 2754 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2769} 2755}
2770 2756
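
ipv4_blackhole_route() clones a route whose input and output handlers both discard, and whose new cow_metrics operation returns NULL so nothing can ever allocate writable metrics through a blackhole entry. The ops-table shape of that arrangement, reduced to a runnable userspace sketch (types and names are stand-ins for the kernel's struct dst_ops):

    #include <stdio.h>
    #include <stddef.h>

    struct dst;
    struct dst_ops {
        int (*output)(struct dst *);
        unsigned int *(*cow_metrics)(struct dst *);
    };

    struct dst {
        const struct dst_ops *ops;
        unsigned int metrics[4];
    };

    static int dst_discard(struct dst *d)
    {
        (void)d;
        return 0;                       /* silently drop the packet */
    }

    static unsigned int *blackhole_cow_metrics(struct dst *d)
    {
        (void)d;
        return NULL;                    /* refuse writable metrics */
    }

    static const struct dst_ops blackhole_ops = {
        .output      = dst_discard,
        .cow_metrics = blackhole_cow_metrics,
    };

    int main(void)
    {
        struct dst bh = { .ops = &blackhole_ops };
        unsigned int *m = bh.ops->cow_metrics(&bh);

        printf("sent=%d writable_metrics=%s\n",
               bh.ops->output(&bh), m ? "yes" : "no");
        return 0;
    }
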
2771int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2757struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2772 struct sock *sk, int flags) 2758 struct sock *sk)
2773{ 2759{
2774 int err; 2760 struct rtable *rt = __ip_route_output_key(net, flp4);
2775 2761
2776 if ((err = __ip_route_output_key(net, rp, flp)) != 0) 2762 if (IS_ERR(rt))
2777 return err; 2763 return rt;
2778 2764
2779 if (flp->proto) { 2765 if (flp4->flowi4_proto) {
2780 if (!flp->fl4_src) 2766 if (!flp4->saddr)
2781 flp->fl4_src = (*rp)->rt_src; 2767 flp4->saddr = rt->rt_src;
2782 if (!flp->fl4_dst) 2768 if (!flp4->daddr)
2783 flp->fl4_dst = (*rp)->rt_dst; 2769 flp4->daddr = rt->rt_dst;
2784 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, 2770 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2785 flags ? XFRM_LOOKUP_WAIT : 0); 2771 flowi4_to_flowi(flp4),
2786 if (err == -EREMOTE) 2772 sk, 0);
2787 err = ipv4_dst_blackhole(net, rp, flp);
2788
2789 return err;
2790 } 2773 }
2791 2774
2792 return 0; 2775 return rt;
2793} 2776}
2794EXPORT_SYMBOL_GPL(ip_route_output_flow); 2777EXPORT_SYMBOL_GPL(ip_route_output_flow);
2795 2778
2796int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2797{
2798 return ip_route_output_flow(net, rp, flp, NULL, 0);
2799}
2800EXPORT_SYMBOL(ip_route_output_key);
2801
2802static int rt_fill_info(struct net *net, 2779static int rt_fill_info(struct net *net,
2803 struct sk_buff *skb, u32 pid, u32 seq, int event, 2780 struct sk_buff *skb, u32 pid, u32 seq, int event,
2804 int nowait, unsigned int flags) 2781 int nowait, unsigned int flags)
@@ -2817,7 +2794,7 @@ static int rt_fill_info(struct net *net,
2817 r->rtm_family = AF_INET; 2794 r->rtm_family = AF_INET;
2818 r->rtm_dst_len = 32; 2795 r->rtm_dst_len = 32;
2819 r->rtm_src_len = 0; 2796 r->rtm_src_len = 0;
2820 r->rtm_tos = rt->fl.fl4_tos; 2797 r->rtm_tos = rt->rt_tos;
2821 r->rtm_table = RT_TABLE_MAIN; 2798 r->rtm_table = RT_TABLE_MAIN;
2822 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2799 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2823 r->rtm_type = rt->rt_type; 2800 r->rtm_type = rt->rt_type;
@@ -2829,19 +2806,19 @@ static int rt_fill_info(struct net *net,
2829 2806
2830 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2807 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2831 2808
2832 if (rt->fl.fl4_src) { 2809 if (rt->rt_key_src) {
2833 r->rtm_src_len = 32; 2810 r->rtm_src_len = 32;
2834 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2811 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2835 } 2812 }
2836 if (rt->dst.dev) 2813 if (rt->dst.dev)
2837 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2814 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2838#ifdef CONFIG_NET_CLS_ROUTE 2815#ifdef CONFIG_IP_ROUTE_CLASSID
2839 if (rt->dst.tclassid) 2816 if (rt->dst.tclassid)
2840 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2817 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2841#endif 2818#endif
2842 if (rt_is_input_route(rt)) 2819 if (rt_is_input_route(rt))
2843 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2820 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2844 else if (rt->rt_src != rt->fl.fl4_src) 2821 else if (rt->rt_src != rt->rt_key_src)
2845 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2822 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2846 2823
2847 if (rt->rt_dst != rt->rt_gateway) 2824 if (rt->rt_dst != rt->rt_gateway)
@@ -2850,11 +2827,12 @@ static int rt_fill_info(struct net *net,
2850 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2827 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2851 goto nla_put_failure; 2828 goto nla_put_failure;
2852 2829
2853 if (rt->fl.mark) 2830 if (rt->rt_mark)
2854 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); 2831 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2855 2832
2856 error = rt->dst.error; 2833 error = rt->dst.error;
2857 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 2834 expires = (rt->peer && rt->peer->pmtu_expires) ?
2835 rt->peer->pmtu_expires - jiffies : 0;
2858 if (rt->peer) { 2836 if (rt->peer) {
2859 inet_peer_refcheck(rt->peer); 2837 inet_peer_refcheck(rt->peer);
2860 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2838 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -2884,7 +2862,7 @@ static int rt_fill_info(struct net *net,
2884 } 2862 }
2885 } else 2863 } else
2886#endif 2864#endif
2887 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2865 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2888 } 2866 }
2889 2867
2890 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2868 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
@@ -2958,14 +2936,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2958 if (err == 0 && rt->dst.error) 2936 if (err == 0 && rt->dst.error)
2959 err = -rt->dst.error; 2937 err = -rt->dst.error;
2960 } else { 2938 } else {
2961 struct flowi fl = { 2939 struct flowi4 fl4 = {
2962 .fl4_dst = dst, 2940 .daddr = dst,
2963 .fl4_src = src, 2941 .saddr = src,
2964 .fl4_tos = rtm->rtm_tos, 2942 .flowi4_tos = rtm->rtm_tos,
2965 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, 2943 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2966 .mark = mark, 2944 .flowi4_mark = mark,
2967 }; 2945 };
2968 err = ip_route_output_key(net, &rt, &fl); 2946 rt = ip_route_output_key(net, &fl4);
2947
2948 err = 0;
2949 if (IS_ERR(rt))
2950 err = PTR_ERR(rt);
2969 } 2951 }
2970 2952
2971 if (err) 2953 if (err)
@@ -3248,6 +3230,8 @@ static __net_init int rt_genid_init(struct net *net)
3248{ 3230{
3249 get_random_bytes(&net->ipv4.rt_genid, 3231 get_random_bytes(&net->ipv4.rt_genid,
3250 sizeof(net->ipv4.rt_genid)); 3232 sizeof(net->ipv4.rt_genid));
3233 get_random_bytes(&net->ipv4.dev_addr_genid,
3234 sizeof(net->ipv4.dev_addr_genid));
3251 return 0; 3235 return 0;
3252} 3236}
3253 3237
@@ -3256,9 +3240,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3256}; 3240};
3257 3241
3258 3242
3259#ifdef CONFIG_NET_CLS_ROUTE 3243#ifdef CONFIG_IP_ROUTE_CLASSID
3260struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3244struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3261#endif /* CONFIG_NET_CLS_ROUTE */ 3245#endif /* CONFIG_IP_ROUTE_CLASSID */
3262 3246
3263static __initdata unsigned long rhash_entries; 3247static __initdata unsigned long rhash_entries;
3264static int __init set_rhash_entries(char *str) 3248static int __init set_rhash_entries(char *str)
@@ -3274,7 +3258,7 @@ int __init ip_rt_init(void)
3274{ 3258{
3275 int rc = 0; 3259 int rc = 0;
3276 3260
3277#ifdef CONFIG_NET_CLS_ROUTE 3261#ifdef CONFIG_IP_ROUTE_CLASSID
3278 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3262 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3279 if (!ip_rt_acct) 3263 if (!ip_rt_acct)
3280 panic("IP: failed to allocate ip_rt_acct\n"); 3264 panic("IP: failed to allocate ip_rt_acct\n");
@@ -3311,14 +3295,6 @@ int __init ip_rt_init(void)
3311 devinet_init(); 3295 devinet_init();
3312 ip_fib_init(); 3296 ip_fib_init();
3313 3297
3314 /* All the timers, started at system startup tend
3315 to synchronize. Perturb it a bit.
3316 */
3317 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3318 expires_ljiffies = jiffies;
3319 schedule_delayed_work(&expires_work,
3320 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3321
3322 if (ip_rt_proc_init()) 3298 if (ip_rt_proc_init())
3323 printk(KERN_ERR "Unable to create route proc files\n"); 3299 printk(KERN_ERR "Unable to create route proc files\n");
3324#ifdef CONFIG_XFRM 3300#ifdef CONFIG_XFRM
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 47519205a014..8b44c6d2a79b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -345,17 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
345 * no easy way to do this. 345 * no easy way to do this.
346 */ 346 */
347 { 347 {
348 struct flowi fl = { .mark = sk->sk_mark, 348 struct flowi4 fl4 = {
349 .fl4_dst = ((opt && opt->srr) ? 349 .flowi4_mark = sk->sk_mark,
350 opt->faddr : ireq->rmt_addr), 350 .daddr = ((opt && opt->srr) ?
351 .fl4_src = ireq->loc_addr, 351 opt->faddr : ireq->rmt_addr),
352 .fl4_tos = RT_CONN_FLAGS(sk), 352 .saddr = ireq->loc_addr,
353 .proto = IPPROTO_TCP, 353 .flowi4_tos = RT_CONN_FLAGS(sk),
354 .flags = inet_sk_flowi_flags(sk), 354 .flowi4_proto = IPPROTO_TCP,
355 .fl_ip_sport = th->dest, 355 .flowi4_flags = inet_sk_flowi_flags(sk),
356 .fl_ip_dport = th->source }; 356 .fl4_sport = th->dest,
357 security_req_classify_flow(req, &fl); 357 .fl4_dport = th->source,
358 if (ip_route_output_key(sock_net(sk), &rt, &fl)) { 358 };
359 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
360 rt = ip_route_output_key(sock_net(sk), &fl4);
361 if (IS_ERR(rt)) {
359 reqsk_free(req); 362 reqsk_free(req);
360 goto out; 363 goto out;
361 } 364 }
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1a456652086b..321e6e84dbcc 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -311,7 +311,6 @@ static struct ctl_table ipv4_table[] = {
311 .mode = 0644, 311 .mode = 0644,
312 .proc_handler = proc_do_large_bitmap, 312 .proc_handler = proc_do_large_bitmap,
313 }, 313 },
314#ifdef CONFIG_IP_MULTICAST
315 { 314 {
316 .procname = "igmp_max_memberships", 315 .procname = "igmp_max_memberships",
317 .data = &sysctl_igmp_max_memberships, 316 .data = &sysctl_igmp_max_memberships,
@@ -319,8 +318,6 @@ static struct ctl_table ipv4_table[] = {
319 .mode = 0644, 318 .mode = 0644,
320 .proc_handler = proc_dointvec 319 .proc_handler = proc_dointvec
321 }, 320 },
322
323#endif
324 { 321 {
325 .procname = "igmp_max_msf", 322 .procname = "igmp_max_msf",
326 .data = &sysctl_igmp_max_msf, 323 .data = &sysctl_igmp_max_msf,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262c..b22d45010545 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
505 else 505 else
506 answ = tp->write_seq - tp->snd_una; 506 answ = tp->write_seq - tp->snd_una;
507 break; 507 break;
508 case SIOCOUTQNSD:
509 if (sk->sk_state == TCP_LISTEN)
510 return -EINVAL;
511
512 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
513 answ = 0;
514 else
515 answ = tp->write_seq - tp->snd_nxt;
516 break;
508 default: 517 default:
509 return -ENOIOCTLCMD; 518 return -ENOIOCTLCMD;
510 } 519 }
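
The new SIOCOUTQNSD ioctl reports bytes queued but not yet sent (write_seq - snd_nxt), complementing SIOCOUTQ, which counts everything not yet acknowledged (write_seq - snd_una). Possible userspace usage, assuming a kernel carrying this patch and the SIOCOUTQNSD definition from <linux/sockios.h>:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <linux/sockios.h>          /* SIOCOUTQ, SIOCOUTQNSD */

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int unacked = 0, unsent = 0;

        if (fd < 0)
            return 1;
        /* On an established connection pushing data, the two diverge:
         * unacked includes bytes in flight, unsent only the backlog. */
        if (ioctl(fd, SIOCOUTQ, &unacked) == 0 &&
            ioctl(fd, SIOCOUTQNSD, &unsent) == 0)
            printf("unacked=%d unsent=%d\n", unacked, unsent);
        return 0;
    }
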
@@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
873 flags); 882 flags);
874 883
875 lock_sock(sk); 884 lock_sock(sk);
876 TCP_CHECK_TIMER(sk);
877 res = do_tcp_sendpages(sk, &page, offset, size, flags); 885 res = do_tcp_sendpages(sk, &page, offset, size, flags);
878 TCP_CHECK_TIMER(sk);
879 release_sock(sk); 886 release_sock(sk);
880 return res; 887 return res;
881} 888}
@@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
916 long timeo; 923 long timeo;
917 924
918 lock_sock(sk); 925 lock_sock(sk);
919 TCP_CHECK_TIMER(sk);
920 926
921 flags = msg->msg_flags; 927 flags = msg->msg_flags;
922 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 928 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -1104,7 +1110,6 @@ wait_for_memory:
1104out: 1110out:
1105 if (copied) 1111 if (copied)
1106 tcp_push(sk, flags, mss_now, tp->nonagle); 1112 tcp_push(sk, flags, mss_now, tp->nonagle);
1107 TCP_CHECK_TIMER(sk);
1108 release_sock(sk); 1113 release_sock(sk);
1109 return copied; 1114 return copied;
1110 1115
@@ -1123,7 +1128,6 @@ do_error:
1123 goto out; 1128 goto out;
1124out_err: 1129out_err:
1125 err = sk_stream_error(sk, flags, err); 1130 err = sk_stream_error(sk, flags, err);
1126 TCP_CHECK_TIMER(sk);
1127 release_sock(sk); 1131 release_sock(sk);
1128 return err; 1132 return err;
1129} 1133}
@@ -1415,8 +1419,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1415 1419
1416 lock_sock(sk); 1420 lock_sock(sk);
1417 1421
1418 TCP_CHECK_TIMER(sk);
1419
1420 err = -ENOTCONN; 1422 err = -ENOTCONN;
1421 if (sk->sk_state == TCP_LISTEN) 1423 if (sk->sk_state == TCP_LISTEN)
1422 goto out; 1424 goto out;
@@ -1767,12 +1769,10 @@ skip_copy:
1767 /* Clean up data we have read: This will do ACK frames. */ 1769 /* Clean up data we have read: This will do ACK frames. */
1768 tcp_cleanup_rbuf(sk, copied); 1770 tcp_cleanup_rbuf(sk, copied);
1769 1771
1770 TCP_CHECK_TIMER(sk);
1771 release_sock(sk); 1772 release_sock(sk);
1772 return copied; 1773 return copied;
1773 1774
1774out: 1775out:
1775 TCP_CHECK_TIMER(sk);
1776 release_sock(sk); 1776 release_sock(sk);
1777 return err; 1777 return err;
1778 1778
@@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2653EXPORT_SYMBOL(compat_tcp_getsockopt); 2653EXPORT_SYMBOL(compat_tcp_getsockopt);
2654#endif 2654#endif
2655 2655
2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2657{ 2657{
2658 struct sk_buff *segs = ERR_PTR(-EINVAL); 2658 struct sk_buff *segs = ERR_PTR(-EINVAL);
2659 struct tcphdr *th; 2659 struct tcphdr *th;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 3b53fd1af23f..6187eb4d1dcf 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
209} 209}
210 210
211 211
212static struct tcp_congestion_ops bictcp = { 212static struct tcp_congestion_ops bictcp __read_mostly = {
213 .init = bictcp_init, 213 .init = bictcp_init,
214 .ssthresh = bictcp_recalc_ssthresh, 214 .ssthresh = bictcp_recalc_ssthresh,
215 .cong_avoid = bictcp_cong_avoid, 215 .cong_avoid = bictcp_cong_avoid,
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 71d5f2f29fa6..f376b05cca81 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -39,7 +39,7 @@
39 39
40/* Number of delay samples for detecting the increase of delay */ 40/* Number of delay samples for detecting the increase of delay */
41#define HYSTART_MIN_SAMPLES 8 41#define HYSTART_MIN_SAMPLES 8
42#define HYSTART_DELAY_MIN (2U<<3) 42#define HYSTART_DELAY_MIN (4U<<3)
43#define HYSTART_DELAY_MAX (16U<<3) 43#define HYSTART_DELAY_MAX (16U<<3)
44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) 44#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
45 45
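
CUBIC keeps its delay values in fixed point with three fractional bits, so x << 3 stores x milliseconds in 1/8 ms units; the HYSTART_DELAY_MIN change above therefore raises the floor from 2 ms to 4 ms. A two-line illustration of the encoding:

    #include <stdio.h>

    int main(void)
    {
        unsigned int delay_ms = 4;
        unsigned int fp = delay_ms << 3;          /* 1/8 ms units, as in 4U<<3 */

        printf("fp=%u back=%u ms\n", fp, fp >> 3); /* fp=32 back=4 ms */
        return 0;
    }
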
@@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1;
52static int hystart __read_mostly = 1; 52static int hystart __read_mostly = 1;
53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; 53static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
54static int hystart_low_window __read_mostly = 16; 54static int hystart_low_window __read_mostly = 16;
55static int hystart_ack_delta __read_mostly = 2;
55 56
56static u32 cube_rtt_scale __read_mostly; 57static u32 cube_rtt_scale __read_mostly;
57static u32 beta_scale __read_mostly; 58static u32 beta_scale __read_mostly;
@@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
75 " 1: packet-train 2: delay 3: both packet-train and delay"); 76 " 1: packet-train 2: delay 3: both packet-train and delay");
76module_param(hystart_low_window, int, 0644); 77module_param(hystart_low_window, int, 0644);
77MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); 78MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
79module_param(hystart_ack_delta, int, 0644);
80MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
78 81
79/* BIC TCP Parameters */ 82/* BIC TCP Parameters */
80struct bictcp { 83struct bictcp {
@@ -85,17 +88,18 @@ struct bictcp {
85 u32 last_time; /* time when updated last_cwnd */ 88 u32 last_time; /* time when updated last_cwnd */
86 u32 bic_origin_point;/* origin point of bic function */ 89 u32 bic_origin_point;/* origin point of bic function */
87 u32 bic_K; /* time to origin point from the beginning of the current epoch */ 90 u32 bic_K; /* time to origin point from the beginning of the current epoch */
88 u32 delay_min; /* min delay */ 91 u32 delay_min; /* min delay (msec << 3) */
89 u32 epoch_start; /* beginning of an epoch */ 92 u32 epoch_start; /* beginning of an epoch */
90 u32 ack_cnt; /* number of acks */ 93 u32 ack_cnt; /* number of acks */
91 u32 tcp_cwnd; /* estimated tcp cwnd */ 94 u32 tcp_cwnd; /* estimated tcp cwnd */
92#define ACK_RATIO_SHIFT 4 95#define ACK_RATIO_SHIFT 4
96#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
93 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ 97 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
94 u8 sample_cnt; /* number of samples to decide curr_rtt */ 98 u8 sample_cnt; /* number of samples to decide curr_rtt */
95 u8 found; /* the exit point is found? */ 99 u8 found; /* the exit point is found? */
96 u32 round_start; /* beginning of each round */ 100 u32 round_start; /* beginning of each round */
97 u32 end_seq; /* end_seq of the round */ 101 u32 end_seq; /* end_seq of the round */
98 u32 last_jiffies; /* last time when the ACK spacing is close */ 102 u32 last_ack; /* last time when the ACK spacing is close */
99 u32 curr_rtt; /* the minimum rtt of current round */ 103 u32 curr_rtt; /* the minimum rtt of current round */
100}; 104};
101 105
@@ -116,12 +120,21 @@ static inline void bictcp_reset(struct bictcp *ca)
116 ca->found = 0; 120 ca->found = 0;
117} 121}
118 122
123static inline u32 bictcp_clock(void)
124{
125#if HZ < 1000
126 return ktime_to_ms(ktime_get_real());
127#else
128 return jiffies_to_msecs(jiffies);
129#endif
130}
131
119static inline void bictcp_hystart_reset(struct sock *sk) 132static inline void bictcp_hystart_reset(struct sock *sk)
120{ 133{
121 struct tcp_sock *tp = tcp_sk(sk); 134 struct tcp_sock *tp = tcp_sk(sk);
122 struct bictcp *ca = inet_csk_ca(sk); 135 struct bictcp *ca = inet_csk_ca(sk);
123 136
124 ca->round_start = ca->last_jiffies = jiffies; 137 ca->round_start = ca->last_ack = bictcp_clock();
125 ca->end_seq = tp->snd_nxt; 138 ca->end_seq = tp->snd_nxt;
126 ca->curr_rtt = 0; 139 ca->curr_rtt = 0;
127 ca->sample_cnt = 0; 140 ca->sample_cnt = 0;
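
bictcp_clock() gives hystart millisecond resolution even on HZ < 1000 kernels by switching to ktime at compile time. A userspace approximation with clock_gettime() (the truncation to a u32 mirrors the kernel's wrapping millisecond counter; this is a sketch, not the kernel path):

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    /* Userspace stand-in for bictcp_clock(): a wrapping ms clock. */
    static uint32_t msclock(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint32_t)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
    }

    int main(void)
    {
        printf("now = %u ms\n", msclock());
        return 0;
    }
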
@@ -236,8 +249,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
236 */ 249 */
237 250
238 /* change the unit from HZ to bictcp_HZ */ 251 /* change the unit from HZ to bictcp_HZ */
239 t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) 252 t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3)
240 << BICTCP_HZ) / HZ; 253 - ca->epoch_start) << BICTCP_HZ) / HZ;
241 254
242 if (t < ca->bic_K) /* t - K */ 255 if (t < ca->bic_K) /* t - K */
243 offs = ca->bic_K - t; 256 offs = ca->bic_K - t;
@@ -258,6 +271,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
258 ca->cnt = 100 * cwnd; /* very small increment*/ 271 ca->cnt = 100 * cwnd; /* very small increment*/
259 } 272 }
260 273
274 /*
275 * The initial growth of cubic function may be too conservative
276 * when the available bandwidth is still unknown.
277 */
278 if (ca->loss_cwnd == 0 && ca->cnt > 20)
279 ca->cnt = 20; /* increase cwnd 5% per RTT */
280
261 /* TCP Friendly */ 281 /* TCP Friendly */
262 if (tcp_friendliness) { 282 if (tcp_friendliness) {
263 u32 scale = beta_scale; 283 u32 scale = beta_scale;
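
The new clamp guarantees some minimum growth before the first loss event, when the available bandwidth is still unknown. A tiny standalone sketch of that rule: cnt is "ACKs per cwnd increment", so capping it at 20 yields at least cwnd/20 (about 5%) growth per RTT.

static unsigned int clamp_initial_growth(unsigned int loss_cwnd,
                                         unsigned int cnt)
{
    /* before the first loss, never grow slower than cwnd/20 per RTT */
    if (loss_cwnd == 0 && cnt > 20)
        cnt = 20;   /* one segment per 20 ACKs ~= 5% per RTT */
    return cnt;
}
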
@@ -339,12 +359,12 @@ static void hystart_update(struct sock *sk, u32 delay)
339 struct bictcp *ca = inet_csk_ca(sk); 359 struct bictcp *ca = inet_csk_ca(sk);
340 360
341 if (!(ca->found & hystart_detect)) { 361 if (!(ca->found & hystart_detect)) {
342 u32 curr_jiffies = jiffies; 362 u32 now = bictcp_clock();
343 363
344 /* first detection parameter - ack-train detection */ 364 /* first detection parameter - ack-train detection */
345 if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { 365 if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
346 ca->last_jiffies = curr_jiffies; 366 ca->last_ack = now;
347 if (curr_jiffies - ca->round_start >= ca->delay_min>>4) 367 if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
348 ca->found |= HYSTART_ACK_TRAIN; 368 ca->found |= HYSTART_ACK_TRAIN;
349 } 369 }
350 370
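
Replacing the jiffies test with a signed delta over u32 millisecond stamps keeps the ack-train check correct across counter wraparound. A minimal sketch of that comparison idiom (hypothetical helper, not a kernel API):

#include <stdint.h>

/* the delta is computed in unsigned arithmetic, then reinterpreted as
 * signed, so the test stays correct even when the counter wraps */
static int within_budget(uint32_t now, uint32_t then, int32_t budget_ms)
{
    return (int32_t)(now - then) <= budget_ms;
}
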
@@ -379,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
379 u32 delay; 399 u32 delay;
380 400
381 if (icsk->icsk_ca_state == TCP_CA_Open) { 401 if (icsk->icsk_ca_state == TCP_CA_Open) {
382 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; 402 u32 ratio = ca->delayed_ack;
383 ca->delayed_ack += cnt; 403
404 ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
405 ratio += cnt;
406
407 ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT);
384 } 408 }
385 409
386 /* Some calls are for duplicates without timestamps */ 410 /* Some calls are for duplicates without timestamps */
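
The rewritten delayed-ACK update computes the ratio in a 32-bit temporary and clamps it to ACK_RATIO_LIMIT before storing, closing the u16 overflow the old in-place arithmetic allowed. A standalone sketch of the same update:

#include <stdint.h>

#define ACK_RATIO_SHIFT 4
#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)

static uint16_t update_delayed_ack(uint16_t delayed_ack, uint32_t acked)
{
    uint32_t ratio = delayed_ack;

    ratio -= ratio >> ACK_RATIO_SHIFT;  /* decay old estimate by 1/16 */
    ratio += acked;                     /* fold in this ACK's packets */
    if (ratio > ACK_RATIO_LIMIT)        /* cap before the u16 store */
        ratio = ACK_RATIO_LIMIT;
    return (uint16_t)ratio;
}
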
@@ -391,7 +415,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
391 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) 415 if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ)
392 return; 416 return;
393 417
394 delay = usecs_to_jiffies(rtt_us) << 3; 418 delay = (rtt_us << 3) / USEC_PER_MSEC;
395 if (delay == 0) 419 if (delay == 0)
396 delay = 1; 420 delay = 1;
397 421
@@ -405,7 +429,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
405 hystart_update(sk, delay); 429 hystart_update(sk, delay);
406} 430}
407 431
408static struct tcp_congestion_ops cubictcp = { 432static struct tcp_congestion_ops cubictcp __read_mostly = {
409 .init = bictcp_init, 433 .init = bictcp_init,
410 .ssthresh = bictcp_recalc_ssthresh, 434 .ssthresh = bictcp_recalc_ssthresh,
411 .cong_avoid = bictcp_cong_avoid, 435 .cong_avoid = bictcp_cong_avoid,
@@ -447,6 +471,10 @@ static int __init cubictcp_register(void)
447 /* divide by bic_scale and by constant Srtt (100ms) */ 471 /* divide by bic_scale and by constant Srtt (100ms) */
448 do_div(cube_factor, bic_scale * 10); 472 do_div(cube_factor, bic_scale * 10);
449 473
474 /* hystart needs ms clock resolution */
475 if (hystart && HZ < 1000)
476 cubictcp.flags |= TCP_CONG_RTT_STAMP;
477
450 return tcp_register_congestion_control(&cubictcp); 478 return tcp_register_congestion_control(&cubictcp);
451} 479}
452 480
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 8b6caaf75bb9..30f27f6b3655 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
158} 158}
159 159
160 160
161static struct tcp_congestion_ops tcp_highspeed = { 161static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
162 .init = hstcp_init, 162 .init = hstcp_init,
163 .ssthresh = hstcp_ssthresh, 163 .ssthresh = hstcp_ssthresh,
164 .cong_avoid = hstcp_cong_avoid, 164 .cong_avoid = hstcp_cong_avoid,
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 7c94a4955416..c1a8175361e8 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state)
284 } 284 }
285} 285}
286 286
287static struct tcp_congestion_ops htcp = { 287static struct tcp_congestion_ops htcp __read_mostly = {
288 .init = htcp_init, 288 .init = htcp_init,
289 .ssthresh = htcp_recalc_ssthresh, 289 .ssthresh = htcp_recalc_ssthresh,
290 .cong_avoid = htcp_cong_avoid, 290 .cong_avoid = htcp_cong_avoid,
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 377bc9349371..fe3ecf484b44 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); 162 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
163} 163}
164 164
165static struct tcp_congestion_ops tcp_hybla = { 165static struct tcp_congestion_ops tcp_hybla __read_mostly = {
166 .init = hybla_init, 166 .init = hybla_init,
167 .ssthresh = tcp_reno_ssthresh, 167 .ssthresh = tcp_reno_ssthresh,
168 .min_cwnd = tcp_reno_min_cwnd, 168 .min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 00ca688d8964..813b43a76fec 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
322 } 322 }
323} 323}
324 324
325static struct tcp_congestion_ops tcp_illinois = { 325static struct tcp_congestion_ops tcp_illinois __read_mostly = {
326 .flags = TCP_CONG_RTT_STAMP, 326 .flags = TCP_CONG_RTT_STAMP,
327 .init = tcp_illinois_init, 327 .init = tcp_illinois_init,
328 .ssthresh = tcp_illinois_ssthresh, 328 .ssthresh = tcp_illinois_ssthresh,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 65f6c0406245..bef9f04c22ba 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
818 818
819 if (!cwnd) 819 if (!cwnd)
820 cwnd = rfc3390_bytes_to_packets(tp->mss_cache); 820 cwnd = TCP_INIT_CWND;
821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
822} 822}
823 823
@@ -2659,7 +2659,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2659#define DBGUNDO(x...) do { } while (0) 2659#define DBGUNDO(x...) do { } while (0)
2660#endif 2660#endif
2661 2661
2662static void tcp_undo_cwr(struct sock *sk, const int undo) 2662static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2663{ 2663{
2664 struct tcp_sock *tp = tcp_sk(sk); 2664 struct tcp_sock *tp = tcp_sk(sk);
2665 2665
@@ -2671,14 +2671,13 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
2671 else 2671 else
2672 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1); 2672 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
2673 2673
2674 if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { 2674 if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
2675 tp->snd_ssthresh = tp->prior_ssthresh; 2675 tp->snd_ssthresh = tp->prior_ssthresh;
2676 TCP_ECN_withdraw_cwr(tp); 2676 TCP_ECN_withdraw_cwr(tp);
2677 } 2677 }
2678 } else { 2678 } else {
2679 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); 2679 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
2680 } 2680 }
2681 tcp_moderate_cwnd(tp);
2682 tp->snd_cwnd_stamp = tcp_time_stamp; 2681 tp->snd_cwnd_stamp = tcp_time_stamp;
2683} 2682}
2684 2683
@@ -2699,7 +2698,7 @@ static int tcp_try_undo_recovery(struct sock *sk)
2699 * or our original transmission succeeded. 2698 * or our original transmission succeeded.
2700 */ 2699 */
2701 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); 2700 DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2702 tcp_undo_cwr(sk, 1); 2701 tcp_undo_cwr(sk, true);
2703 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) 2702 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2704 mib_idx = LINUX_MIB_TCPLOSSUNDO; 2703 mib_idx = LINUX_MIB_TCPLOSSUNDO;
2705 else 2704 else
@@ -2726,7 +2725,7 @@ static void tcp_try_undo_dsack(struct sock *sk)
2726 2725
2727 if (tp->undo_marker && !tp->undo_retrans) { 2726 if (tp->undo_marker && !tp->undo_retrans) {
2728 DBGUNDO(sk, "D-SACK"); 2727 DBGUNDO(sk, "D-SACK");
2729 tcp_undo_cwr(sk, 1); 2728 tcp_undo_cwr(sk, true);
2730 tp->undo_marker = 0; 2729 tp->undo_marker = 0;
2731 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); 2730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2732 } 2731 }
@@ -2779,7 +2778,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2779 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); 2778 tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
2780 2779
2781 DBGUNDO(sk, "Hoe"); 2780 DBGUNDO(sk, "Hoe");
2782 tcp_undo_cwr(sk, 0); 2781 tcp_undo_cwr(sk, false);
2783 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); 2782 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2784 2783
2785 /* So... Do not make Hoe's retransmit yet. 2784 /* So... Do not make Hoe's retransmit yet.
@@ -2808,7 +2807,7 @@ static int tcp_try_undo_loss(struct sock *sk)
2808 2807
2809 DBGUNDO(sk, "partial loss"); 2808 DBGUNDO(sk, "partial loss");
2810 tp->lost_out = 0; 2809 tp->lost_out = 0;
2811 tcp_undo_cwr(sk, 1); 2810 tcp_undo_cwr(sk, true);
2812 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); 2811 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2813 inet_csk(sk)->icsk_retransmits = 0; 2812 inet_csk(sk)->icsk_retransmits = 0;
2814 tp->undo_marker = 0; 2813 tp->undo_marker = 0;
@@ -2822,8 +2821,11 @@ static int tcp_try_undo_loss(struct sock *sk)
2822static inline void tcp_complete_cwr(struct sock *sk) 2821static inline void tcp_complete_cwr(struct sock *sk)
2823{ 2822{
2824 struct tcp_sock *tp = tcp_sk(sk); 2823 struct tcp_sock *tp = tcp_sk(sk);
2825 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); 2824 /* Do not moderate cwnd if it's already undone in cwr or recovery */
2826 tp->snd_cwnd_stamp = tcp_time_stamp; 2825 if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
2826 tp->snd_cwnd = tp->snd_ssthresh;
2827 tp->snd_cwnd_stamp = tcp_time_stamp;
2828 }
2827 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2829 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2828} 2830}
2829 2831
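
tcp_complete_cwr() now moderates cwnd only while an undo is still pending and cwnd actually sits above ssthresh, so a window already restored by undo is not pulled back down. A minimal sketch of that guard:

static void complete_cwr(unsigned int *cwnd, unsigned int ssthresh,
                         int undo_marker)
{
    /* leave cwnd alone when an undo already restored it, or when it
     * is not above ssthresh in the first place */
    if (undo_marker && *cwnd > ssthresh)
        *cwnd = ssthresh;
}
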
@@ -3350,7 +3352,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3350 net_invalid_timestamp())) 3352 net_invalid_timestamp()))
3351 rtt_us = ktime_us_delta(ktime_get_real(), 3353 rtt_us = ktime_us_delta(ktime_get_real(),
3352 last_ackt); 3354 last_ackt);
3353 else if (ca_seq_rtt > 0) 3355 else if (ca_seq_rtt >= 0)
3354 rtt_us = jiffies_to_usecs(ca_seq_rtt); 3356 rtt_us = jiffies_to_usecs(ca_seq_rtt);
3355 } 3357 }
3356 3358
@@ -3494,7 +3496,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3494 if (flag & FLAG_ECE) 3496 if (flag & FLAG_ECE)
3495 tcp_ratehalving_spur_to_response(sk); 3497 tcp_ratehalving_spur_to_response(sk);
3496 else 3498 else
3497 tcp_undo_cwr(sk, 1); 3499 tcp_undo_cwr(sk, true);
3498} 3500}
3499 3501
3500/* F-RTO spurious RTO detection algorithm (RFC4138) 3502/* F-RTO spurious RTO detection algorithm (RFC4138)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 02f583b3744a..f7e6c2c2d2bb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -149,9 +149,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 struct inet_sock *inet = inet_sk(sk); 149 struct inet_sock *inet = inet_sk(sk);
150 struct tcp_sock *tp = tcp_sk(sk); 150 struct tcp_sock *tp = tcp_sk(sk);
151 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 151 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
152 __be16 orig_sport, orig_dport;
152 struct rtable *rt; 153 struct rtable *rt;
153 __be32 daddr, nexthop; 154 __be32 daddr, nexthop;
154 int tmp;
155 int err; 155 int err;
156 156
157 if (addr_len < sizeof(struct sockaddr_in)) 157 if (addr_len < sizeof(struct sockaddr_in))
@@ -167,14 +167,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
167 nexthop = inet->opt->faddr; 167 nexthop = inet->opt->faddr;
168 } 168 }
169 169
170 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, 170 orig_sport = inet->inet_sport;
171 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 171 orig_dport = usin->sin_port;
172 IPPROTO_TCP, 172 rt = ip_route_connect(nexthop, inet->inet_saddr,
173 inet->inet_sport, usin->sin_port, sk, 1); 173 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 if (tmp < 0) { 174 IPPROTO_TCP,
175 if (tmp == -ENETUNREACH) 175 orig_sport, orig_dport, sk, true);
176 if (IS_ERR(rt)) {
177 err = PTR_ERR(rt);
178 if (err == -ENETUNREACH)
176 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 179 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
177 return tmp; 180 return err;
178 } 181 }
179 182
180 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 183 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
@@ -233,11 +236,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
233 if (err) 236 if (err)
234 goto failure; 237 goto failure;
235 238
236 err = ip_route_newports(&rt, IPPROTO_TCP, 239 rt = ip_route_newports(rt, IPPROTO_TCP,
237 inet->inet_sport, inet->inet_dport, sk); 240 orig_sport, orig_dport,
238 if (err) 241 inet->inet_sport, inet->inet_dport, sk);
242 if (IS_ERR(rt)) {
243 err = PTR_ERR(rt);
244 rt = NULL;
239 goto failure; 245 goto failure;
240 246 }
241 /* OK, now commit destination to socket. */ 247 /* OK, now commit destination to socket. */
242 sk->sk_gso_type = SKB_GSO_TCPV4; 248 sk->sk_gso_type = SKB_GSO_TCPV4;
243 sk_setup_caps(sk, &rt->dst); 249 sk_setup_caps(sk, &rt->dst);
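
Both hunks above move the connect path to the ERR_PTR convention: ip_route_connect() and ip_route_newports() return the rtable itself, with failures encoded in the pointer value. A self-contained userspace sketch of that convention, with a hypothetical route_connect() standing in for the routing call; the kernel's helpers live in <linux/err.h>:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct rtable { int dummy; };

static struct rtable *route_connect(int reachable)
{
    static struct rtable cached;
    return reachable ? &cached : ERR_PTR(-ENETUNREACH);
}

int main(void)
{
    struct rtable *rt = route_connect(0);

    if (IS_ERR(rt)) {   /* same shape as the tcp_v4_connect() hunk */
        fprintf(stderr, "route lookup failed: %ld\n", PTR_ERR(rt));
        return 1;
    }
    return 0;
}
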
@@ -1341,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1341 tcp_death_row.sysctl_tw_recycle && 1347 tcp_death_row.sysctl_tw_recycle &&
1342 (dst = inet_csk_route_req(sk, req)) != NULL && 1348 (dst = inet_csk_route_req(sk, req)) != NULL &&
1343 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1349 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1344 peer->daddr.a4 == saddr) { 1350 peer->daddr.addr.a4 == saddr) {
1345 inet_peer_refcheck(peer); 1351 inet_peer_refcheck(peer);
1346 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1352 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1347 (s32)(peer->tcp_ts - req->ts_recent) > 1353 (s32)(peer->tcp_ts - req->ts_recent) >
@@ -1556,12 +1562,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1556 1562
1557 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1563 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1558 sock_rps_save_rxhash(sk, skb->rxhash); 1564 sock_rps_save_rxhash(sk, skb->rxhash);
1559 TCP_CHECK_TIMER(sk);
1560 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1565 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1561 rsk = sk; 1566 rsk = sk;
1562 goto reset; 1567 goto reset;
1563 } 1568 }
1564 TCP_CHECK_TIMER(sk);
1565 return 0; 1569 return 0;
1566 } 1570 }
1567 1571
@@ -1583,13 +1587,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1583 } else 1587 } else
1584 sock_rps_save_rxhash(sk, skb->rxhash); 1588 sock_rps_save_rxhash(sk, skb->rxhash);
1585 1589
1586
1587 TCP_CHECK_TIMER(sk);
1588 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1590 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1589 rsk = sk; 1591 rsk = sk;
1590 goto reset; 1592 goto reset;
1591 } 1593 }
1592 TCP_CHECK_TIMER(sk);
1593 return 0; 1594 return 0;
1594 1595
1595reset: 1596reset:
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index de870377fbba..72f7218b03f5 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -12,7 +12,7 @@
12 * within cong_avoid. 12 * within cong_avoid.
13 * o Error correcting in remote HZ, therefore remote HZ will be keeped 13 * o Error correcting in remote HZ, therefore remote HZ will be keeped
14 * on checking and updating. 14 * on checking and updating.
15 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne 15 * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since
16 * OWD have a similar meaning as RTT. Also correct the buggy formular. 16 * OWD have a similar meaning as RTT. Also correct the buggy formular.
17 * o Handle reaction for Early Congestion Indication (ECI) within 17 * o Handle reaction for Early Congestion Indication (ECI) within
18 * pkts_acked, as mentioned within pseudo code. 18 * pkts_acked, as mentioned within pseudo code.
@@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
313 lp->last_drop = tcp_time_stamp; 313 lp->last_drop = tcp_time_stamp;
314} 314}
315 315
316static struct tcp_congestion_ops tcp_lp = { 316static struct tcp_congestion_ops tcp_lp __read_mostly = {
317 .flags = TCP_CONG_RTT_STAMP, 317 .flags = TCP_CONG_RTT_STAMP,
318 .init = tcp_lp_init, 318 .init = tcp_lp_init,
319 .ssthresh = tcp_reno_ssthresh, 319 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dfa5beb0c1c8..17388c7f49c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -73,7 +73,7 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
73 tcp_advance_send_head(sk, skb); 73 tcp_advance_send_head(sk, skb);
74 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; 74 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
75 75
76 /* Don't override Nagle indefinately with F-RTO */ 76 /* Don't override Nagle indefinitely with F-RTO */
77 if (tp->frto_counter == 2) 77 if (tp->frto_counter == 2)
78 tp->frto_counter = 3; 78 tp->frto_counter = 3;
79 79
@@ -1003,7 +1003,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1003 int nlen; 1003 int nlen;
1004 u8 flags; 1004 u8 flags;
1005 1005
1006 BUG_ON(len > skb->len); 1006 if (WARN_ON(len > skb->len))
1007 return -EINVAL;
1007 1008
1008 nsize = skb_headlen(skb) - len; 1009 nsize = skb_headlen(skb) - len;
1009 if (nsize < 0) 1010 if (nsize < 0)
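
The BUG_ON() above becomes a recoverable failure: warn once and return -EINVAL rather than halting the machine on a bad length. A minimal sketch of the pattern, with warn_on() as a hypothetical stand-in for the kernel macro:

#include <errno.h>
#include <stdio.h>

/* hypothetical stand-in for WARN_ON(): log the condition, report it */
#define warn_on(cond)                                           \
    ((cond) ? (fprintf(stderr, "WARN %s:%d: %s\n",              \
                       __FILE__, __LINE__, #cond), 1) : 0)

static int fragment(unsigned int len, unsigned int buf_len)
{
    if (warn_on(len > buf_len))
        return -EINVAL; /* recoverable error instead of a panic */
    /* ... split the buffer ... */
    return 0;
}
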
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index a76513779e2b..8ce55b8aaec8 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk)
35} 35}
36 36
37 37
38static struct tcp_congestion_ops tcp_scalable = { 38static struct tcp_congestion_ops tcp_scalable __read_mostly = {
39 .ssthresh = tcp_scalable_ssthresh, 39 .ssthresh = tcp_scalable_ssthresh,
40 .cong_avoid = tcp_scalable_cong_avoid, 40 .cong_avoid = tcp_scalable_cong_avoid,
41 .min_cwnd = tcp_reno_min_cwnd, 41 .min_cwnd = tcp_reno_min_cwnd,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 74a6aa003657..ecd44b0c45f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data)
259 tcp_send_ack(sk); 259 tcp_send_ack(sk);
260 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); 260 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
261 } 261 }
262 TCP_CHECK_TIMER(sk);
263 262
264out: 263out:
265 if (tcp_memory_pressure) 264 if (tcp_memory_pressure)
@@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data)
481 tcp_probe_timer(sk); 480 tcp_probe_timer(sk);
482 break; 481 break;
483 } 482 }
484 TCP_CHECK_TIMER(sk);
485 483
486out: 484out:
487 sk_mem_reclaim(sk); 485 sk_mem_reclaim(sk);
@@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data)
589 elapsed = keepalive_time_when(tp) - elapsed; 587 elapsed = keepalive_time_when(tp) - elapsed;
590 } 588 }
591 589
592 TCP_CHECK_TIMER(sk);
593 sk_mem_reclaim(sk); 590 sk_mem_reclaim(sk);
594 591
595resched: 592resched:
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index c6743eec9b7d..80fa2bfd7ede 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
304} 304}
305EXPORT_SYMBOL_GPL(tcp_vegas_get_info); 305EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
306 306
307static struct tcp_congestion_ops tcp_vegas = { 307static struct tcp_congestion_ops tcp_vegas __read_mostly = {
308 .flags = TCP_CONG_RTT_STAMP, 308 .flags = TCP_CONG_RTT_STAMP,
309 .init = tcp_vegas_init, 309 .init = tcp_vegas_init,
310 .ssthresh = tcp_reno_ssthresh, 310 .ssthresh = tcp_reno_ssthresh,
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 38bc0b52d745..ac43cd747bce 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
201 return max(tp->snd_cwnd >> 1U, 2U); 201 return max(tp->snd_cwnd >> 1U, 2U);
202} 202}
203 203
204static struct tcp_congestion_ops tcp_veno = { 204static struct tcp_congestion_ops tcp_veno __read_mostly = {
205 .flags = TCP_CONG_RTT_STAMP, 205 .flags = TCP_CONG_RTT_STAMP,
206 .init = tcp_veno_init, 206 .init = tcp_veno_init,
207 .ssthresh = tcp_veno_ssthresh, 207 .ssthresh = tcp_veno_ssthresh,
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index a534dda5456e..1b91bf48e277 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
272} 272}
273 273
274 274
275static struct tcp_congestion_ops tcp_westwood = { 275static struct tcp_congestion_ops tcp_westwood __read_mostly = {
276 .init = tcp_westwood_init, 276 .init = tcp_westwood_init,
277 .ssthresh = tcp_reno_ssthresh, 277 .ssthresh = tcp_reno_ssthresh,
278 .cong_avoid = tcp_reno_cong_avoid, 278 .cong_avoid = tcp_reno_cong_avoid,
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index a0f240358892..05c3b6f0e8e1 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -20,7 +20,7 @@
20#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss 20#define TCP_YEAH_DELTA 3 //log minimum fraction of cwnd to be removed on loss
21#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion 21#define TCP_YEAH_EPSILON 1 //log maximum fraction to be removed on early decongestion
22#define TCP_YEAH_PHY 8 //lin maximum delta from base 22#define TCP_YEAH_PHY 8 //lin maximum delta from base
23#define TCP_YEAH_RHO 16 //lin minumum number of consecutive rtt to consider competition on loss 23#define TCP_YEAH_RHO 16 //lin minimum number of consecutive rtt to consider competition on loss
24#define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count 24#define TCP_YEAH_ZETA 50 //lin minimum number of state switches to reset reno_count
25 25
26#define TCP_SCALABLE_AI_CNT 100U 26#define TCP_SCALABLE_AI_CNT 100U
@@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
225 return tp->snd_cwnd - reduction; 225 return tp->snd_cwnd - reduction;
226} 226}
227 227
228static struct tcp_congestion_ops tcp_yeah = { 228static struct tcp_congestion_ops tcp_yeah __read_mostly = {
229 .flags = TCP_CONG_RTT_STAMP, 229 .flags = TCP_CONG_RTT_STAMP,
230 .init = tcp_yeah_init, 230 .init = tcp_yeah_init,
231 .ssthresh = tcp_yeah_ssthresh, 231 .ssthresh = tcp_yeah_ssthresh,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959ee..f87a8eb76f3b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -189,7 +189,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
189 * @sk: socket struct in question 189 * @sk: socket struct in question
190 * @snum: port number to look up 190 * @snum: port number to look up
191 * @saddr_comp: AF-dependent comparison of bound local IP addresses 191 * @saddr_comp: AF-dependent comparison of bound local IP addresses
192 * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, 192 * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
193 * with NULL address 193 * with NULL address
194 */ 194 */
195int udp_lib_get_port(struct sock *sk, unsigned short snum, 195int udp_lib_get_port(struct sock *sk, unsigned short snum,
@@ -663,75 +663,72 @@ void udp_flush_pending_frames(struct sock *sk)
663EXPORT_SYMBOL(udp_flush_pending_frames); 663EXPORT_SYMBOL(udp_flush_pending_frames);
664 664
665/** 665/**
666 * udp4_hwcsum_outgoing - handle outgoing HW checksumming 666 * udp4_hwcsum - handle outgoing HW checksumming
667 * @sk: socket we are sending on
668 * @skb: sk_buff containing the filled-in UDP header 667 * @skb: sk_buff containing the filled-in UDP header
669 * (checksum field must be zeroed out) 668 * (checksum field must be zeroed out)
669 * @src: source IP address
670 * @dst: destination IP address
670 */ 671 */
671static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, 672static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
672 __be32 src, __be32 dst, int len)
673{ 673{
674 unsigned int offset;
675 struct udphdr *uh = udp_hdr(skb); 674 struct udphdr *uh = udp_hdr(skb);
675 struct sk_buff *frags = skb_shinfo(skb)->frag_list;
676 int offset = skb_transport_offset(skb);
677 int len = skb->len - offset;
678 int hlen = len;
676 __wsum csum = 0; 679 __wsum csum = 0;
677 680
678 if (skb_queue_len(&sk->sk_write_queue) == 1) { 681 if (!frags) {
679 /* 682 /*
680 * Only one fragment on the socket. 683 * Only one fragment on the socket.
681 */ 684 */
682 skb->csum_start = skb_transport_header(skb) - skb->head; 685 skb->csum_start = skb_transport_header(skb) - skb->head;
683 skb->csum_offset = offsetof(struct udphdr, check); 686 skb->csum_offset = offsetof(struct udphdr, check);
684 uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); 687 uh->check = ~csum_tcpudp_magic(src, dst, len,
688 IPPROTO_UDP, 0);
685 } else { 689 } else {
686 /* 690 /*
687 * HW-checksum won't work as there are two or more 691 * HW-checksum won't work as there are two or more
688 * fragments on the socket so that all csums of sk_buffs 692 * fragments on the socket so that all csums of sk_buffs
689 * should be together 693 * should be together
690 */ 694 */
691 offset = skb_transport_offset(skb); 695 do {
692 skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); 696 csum = csum_add(csum, frags->csum);
697 hlen -= frags->len;
698 } while ((frags = frags->next));
693 699
700 csum = skb_checksum(skb, offset, hlen, csum);
694 skb->ip_summed = CHECKSUM_NONE; 701 skb->ip_summed = CHECKSUM_NONE;
695 702
696 skb_queue_walk(&sk->sk_write_queue, skb) {
697 csum = csum_add(csum, skb->csum);
698 }
699
700 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); 703 uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
701 if (uh->check == 0) 704 if (uh->check == 0)
702 uh->check = CSUM_MANGLED_0; 705 uh->check = CSUM_MANGLED_0;
703 } 706 }
704} 707}
705 708
706/* 709static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport)
707 * Push out all pending data as one UDP datagram. Socket is locked.
708 */
709static int udp_push_pending_frames(struct sock *sk)
710{ 710{
711 struct udp_sock *up = udp_sk(sk); 711 struct sock *sk = skb->sk;
712 struct inet_sock *inet = inet_sk(sk); 712 struct inet_sock *inet = inet_sk(sk);
713 struct flowi *fl = &inet->cork.fl;
714 struct sk_buff *skb;
715 struct udphdr *uh; 713 struct udphdr *uh;
714 struct rtable *rt = (struct rtable *)skb_dst(skb);
716 int err = 0; 715 int err = 0;
717 int is_udplite = IS_UDPLITE(sk); 716 int is_udplite = IS_UDPLITE(sk);
717 int offset = skb_transport_offset(skb);
718 int len = skb->len - offset;
718 __wsum csum = 0; 719 __wsum csum = 0;
719 720
720 /* Grab the skbuff where UDP header space exists. */
721 if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
722 goto out;
723
724 /* 721 /*
725 * Create a UDP header 722 * Create a UDP header
726 */ 723 */
727 uh = udp_hdr(skb); 724 uh = udp_hdr(skb);
728 uh->source = fl->fl_ip_sport; 725 uh->source = inet->inet_sport;
729 uh->dest = fl->fl_ip_dport; 726 uh->dest = dport;
730 uh->len = htons(up->len); 727 uh->len = htons(len);
731 uh->check = 0; 728 uh->check = 0;
732 729
733 if (is_udplite) /* UDP-Lite */ 730 if (is_udplite) /* UDP-Lite */
734 csum = udplite_csum_outgoing(sk, skb); 731 csum = udplite_csum(skb);
735 732
736 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ 733 else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
737 734
@@ -740,20 +737,20 @@ static int udp_push_pending_frames(struct sock *sk)
740 737
741 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ 738 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
742 739
743 udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); 740 udp4_hwcsum(skb, rt->rt_src, daddr);
744 goto send; 741 goto send;
745 742
746 } else /* `normal' UDP */ 743 } else
747 csum = udp_csum_outgoing(sk, skb); 744 csum = udp_csum(skb);
748 745
749 /* add protocol-dependent pseudo-header */ 746 /* add protocol-dependent pseudo-header */
750 uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, 747 uh->check = csum_tcpudp_magic(rt->rt_src, daddr, len,
751 sk->sk_protocol, csum); 748 sk->sk_protocol, csum);
752 if (uh->check == 0) 749 if (uh->check == 0)
753 uh->check = CSUM_MANGLED_0; 750 uh->check = CSUM_MANGLED_0;
754 751
755send: 752send:
756 err = ip_push_pending_frames(sk); 753 err = ip_send_skb(skb);
757 if (err) { 754 if (err) {
758 if (err == -ENOBUFS && !inet->recverr) { 755 if (err == -ENOBUFS && !inet->recverr) {
759 UDP_INC_STATS_USER(sock_net(sk), 756 UDP_INC_STATS_USER(sock_net(sk),
@@ -763,6 +760,26 @@ send:
763 } else 760 } else
764 UDP_INC_STATS_USER(sock_net(sk), 761 UDP_INC_STATS_USER(sock_net(sk),
765 UDP_MIB_OUTDATAGRAMS, is_udplite); 762 UDP_MIB_OUTDATAGRAMS, is_udplite);
763 return err;
764}
765
766/*
767 * Push out all pending data as one UDP datagram. Socket is locked.
768 */
769static int udp_push_pending_frames(struct sock *sk)
770{
771 struct udp_sock *up = udp_sk(sk);
772 struct inet_sock *inet = inet_sk(sk);
773 struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
774 struct sk_buff *skb;
775 int err = 0;
776
777 skb = ip_finish_skb(sk);
778 if (!skb)
779 goto out;
780
781 err = udp_send_skb(skb, fl4->daddr, fl4->fl4_dport);
782
766out: 783out:
767 up->len = 0; 784 up->len = 0;
768 up->pending = 0; 785 up->pending = 0;
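
udp4_hwcsum() now walks the skb's frag_list, folding each fragment's partial checksum into a running one's-complement sum before checksumming the head. A standalone sketch of that accumulation, with a plain linked list standing in for the frag_list:

#include <stdint.h>
#include <stddef.h>

struct frag {
    uint32_t csum;      /* partial checksum of this fragment */
    struct frag *next;
};

/* one's-complement add with end-around carry, as csum_add() does */
static uint32_t csum_add(uint32_t csum, uint32_t addend)
{
    csum += addend;
    return csum + (csum < addend);
}

static uint32_t csum_over_frags(const struct frag *frags, uint32_t head)
{
    uint32_t csum = head;

    for (; frags; frags = frags->next)
        csum = csum_add(csum, frags->csum);
    return csum;
}
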
@@ -774,6 +791,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
774{ 791{
775 struct inet_sock *inet = inet_sk(sk); 792 struct inet_sock *inet = inet_sk(sk);
776 struct udp_sock *up = udp_sk(sk); 793 struct udp_sock *up = udp_sk(sk);
794 struct flowi4 *fl4;
777 int ulen = len; 795 int ulen = len;
778 struct ipcm_cookie ipc; 796 struct ipcm_cookie ipc;
779 struct rtable *rt = NULL; 797 struct rtable *rt = NULL;
@@ -785,6 +803,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
785 int err, is_udplite = IS_UDPLITE(sk); 803 int err, is_udplite = IS_UDPLITE(sk);
786 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; 804 int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
787 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); 805 int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
806 struct sk_buff *skb;
788 807
789 if (len > 0xFFFF) 808 if (len > 0xFFFF)
790 return -EMSGSIZE; 809 return -EMSGSIZE;
@@ -799,6 +818,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
799 ipc.opt = NULL; 818 ipc.opt = NULL;
800 ipc.tx_flags = 0; 819 ipc.tx_flags = 0;
801 820
821 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
822
802 if (up->pending) { 823 if (up->pending) {
803 /* 824 /*
804 * There are pending frames. 825 * There are pending frames.
@@ -888,20 +909,25 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
888 rt = (struct rtable *)sk_dst_check(sk, 0); 909 rt = (struct rtable *)sk_dst_check(sk, 0);
889 910
890 if (rt == NULL) { 911 if (rt == NULL) {
891 struct flowi fl = { .oif = ipc.oif, 912 struct flowi4 fl4 = {
892 .mark = sk->sk_mark, 913 .flowi4_oif = ipc.oif,
893 .fl4_dst = faddr, 914 .flowi4_mark = sk->sk_mark,
894 .fl4_src = saddr, 915 .daddr = faddr,
895 .fl4_tos = tos, 916 .saddr = saddr,
896 .proto = sk->sk_protocol, 917 .flowi4_tos = tos,
897 .flags = inet_sk_flowi_flags(sk), 918 .flowi4_proto = sk->sk_protocol,
898 .fl_ip_sport = inet->inet_sport, 919 .flowi4_flags = (inet_sk_flowi_flags(sk) |
899 .fl_ip_dport = dport }; 920 FLOWI_FLAG_CAN_SLEEP),
921 .fl4_sport = inet->inet_sport,
922 .fl4_dport = dport,
923 };
900 struct net *net = sock_net(sk); 924 struct net *net = sock_net(sk);
901 925
902 security_sk_classify_flow(sk, &fl); 926 security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
903 err = ip_route_output_flow(net, &rt, &fl, sk, 1); 927 rt = ip_route_output_flow(net, &fl4, sk);
904 if (err) { 928 if (IS_ERR(rt)) {
929 err = PTR_ERR(rt);
930 rt = NULL;
905 if (err == -ENETUNREACH) 931 if (err == -ENETUNREACH)
906 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 932 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
907 goto out; 933 goto out;
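
The on-stack flow key becomes a typed struct flowi4 filled with designated initializers, replacing the old generic flowi field names. A simplified sketch of that style of lookup key (hypothetical, reduced field set):

#include <stdint.h>

struct flow_key4 {
    int      oif;           /* output interface, 0 = any */
    uint32_t mark;          /* skb mark copied into the key */
    uint32_t daddr, saddr;
    uint8_t  tos, proto;
    uint16_t sport, dport;
};

static struct flow_key4 make_udp_key(uint32_t dst, uint32_t src,
                                     uint16_t sport, uint16_t dport)
{
    /* unnamed fields default to zero, as with the kernel initializer */
    return (struct flow_key4){
        .daddr = dst, .saddr = src,
        .proto = 17,    /* IPPROTO_UDP */
        .sport = sport, .dport = dport,
    };
}
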
@@ -923,6 +949,17 @@ back_from_confirm:
923 if (!ipc.addr) 949 if (!ipc.addr)
924 daddr = ipc.addr = rt->rt_dst; 950 daddr = ipc.addr = rt->rt_dst;
925 951
952 /* Lockless fast path for the non-corking case. */
953 if (!corkreq) {
954 skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen,
955 sizeof(struct udphdr), &ipc, &rt,
956 msg->msg_flags);
957 err = PTR_ERR(skb);
958 if (skb && !IS_ERR(skb))
959 err = udp_send_skb(skb, daddr, dport);
960 goto out;
961 }
962
926 lock_sock(sk); 963 lock_sock(sk);
927 if (unlikely(up->pending)) { 964 if (unlikely(up->pending)) {
928 /* The socket is already corked while preparing it. */ 965 /* The socket is already corked while preparing it. */
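
ip_make_skb() in the new lockless fast path can return a valid buffer, NULL (nothing to send), or an ERR_PTR-encoded errno; the caller decodes pessimistically and only transmits a real pointer. A minimal self-contained sketch of that tri-state handling:

#include <errno.h>

#define MAX_ERRNO 4095
#define IS_ERR(p)  ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
#define PTR_ERR(p) ((long)(p))

struct buf;
static int transmit(struct buf *b) { (void)b; return 0; } /* stand-in */

static int fast_send(struct buf *b)
{
    int err = (int)PTR_ERR(b);  /* 0 for NULL, -errno for ERR_PTR */

    if (b && !IS_ERR(b))        /* a real buffer: hand it off */
        err = transmit(b);
    return err;                 /* NULL/ERR_PTR fall through as err */
}
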
@@ -936,15 +973,15 @@ back_from_confirm:
936 /* 973 /*
937 * Now cork the socket to pend data. 974 * Now cork the socket to pend data.
938 */ 975 */
939 inet->cork.fl.fl4_dst = daddr; 976 fl4 = &inet->cork.fl.u.ip4;
940 inet->cork.fl.fl_ip_dport = dport; 977 fl4->daddr = daddr;
941 inet->cork.fl.fl4_src = saddr; 978 fl4->saddr = saddr;
942 inet->cork.fl.fl_ip_sport = inet->inet_sport; 979 fl4->fl4_dport = dport;
980 fl4->fl4_sport = inet->inet_sport;
943 up->pending = AF_INET; 981 up->pending = AF_INET;
944 982
945do_append_data: 983do_append_data:
946 up->len += ulen; 984 up->len += ulen;
947 getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
948 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, 985 err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
949 sizeof(struct udphdr), &ipc, &rt, 986 sizeof(struct udphdr), &ipc, &rt,
950 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); 987 corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
@@ -2199,7 +2236,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2199 return 0; 2236 return 0;
2200} 2237}
2201 2238
2202struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) 2239struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2203{ 2240{
2204 struct sk_buff *segs = ERR_PTR(-EINVAL); 2241 struct sk_buff *segs = ERR_PTR(-EINVAL);
2205 unsigned int mss; 2242 unsigned int mss;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 571aa96a175c..2d51840e53a1 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
69} 69}
70EXPORT_SYMBOL(xfrm4_prepare_output); 70EXPORT_SYMBOL(xfrm4_prepare_output);
71 71
72static int xfrm4_output_finish(struct sk_buff *skb) 72int xfrm4_output_finish(struct sk_buff *skb)
73{ 73{
74#ifdef CONFIG_NETFILTER 74#ifdef CONFIG_NETFILTER
75 if (!skb_dst(skb)->xfrm) { 75 if (!skb_dst(skb)->xfrm) {
@@ -86,7 +86,11 @@ static int xfrm4_output_finish(struct sk_buff *skb)
86 86
87int xfrm4_output(struct sk_buff *skb) 87int xfrm4_output(struct sk_buff *skb)
88{ 88{
89 struct dst_entry *dst = skb_dst(skb);
90 struct xfrm_state *x = dst->xfrm;
91
89 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, 92 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
90 NULL, skb_dst(skb)->dev, xfrm4_output_finish, 93 NULL, dst->dev,
94 x->outer_mode->afinfo->output_finish,
91 !(IPCB(skb)->flags & IPSKB_REROUTED)); 95 !(IPCB(skb)->flags & IPSKB_REROUTED));
92} 96}
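
xfrm4_output() now dispatches the finish step through the per-family afinfo ops table instead of a hard-coded static function, letting other modes supply their own finisher. A miniature sketch of the indirection (hypothetical types, not the kernel's):

struct afinfo {
    int (*output_finish)(void *skb);
};

static int v4_finish(void *skb) { (void)skb; return 0; }

static const struct afinfo v4_afinfo = { .output_finish = v4_finish };

static int do_output(const struct afinfo *afinfo, void *skb)
{
    /* was a direct call to one static function before the change */
    return afinfo->output_finish(skb);
}
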
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40addec..d20a05e970d8 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -19,25 +19,23 @@
19static struct xfrm_policy_afinfo xfrm4_policy_afinfo; 19static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
20 20
21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, 21static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
22 xfrm_address_t *saddr, 22 const xfrm_address_t *saddr,
23 xfrm_address_t *daddr) 23 const xfrm_address_t *daddr)
24{ 24{
25 struct flowi fl = { 25 struct flowi4 fl4 = {
26 .fl4_dst = daddr->a4, 26 .daddr = daddr->a4,
27 .fl4_tos = tos, 27 .flowi4_tos = tos,
28 }; 28 };
29 struct dst_entry *dst;
30 struct rtable *rt; 29 struct rtable *rt;
31 int err;
32 30
33 if (saddr) 31 if (saddr)
34 fl.fl4_src = saddr->a4; 32 fl4.saddr = saddr->a4;
33
34 rt = __ip_route_output_key(net, &fl4);
35 if (!IS_ERR(rt))
36 return &rt->dst;
35 37
36 err = __ip_route_output_key(net, &rt, &fl); 38 return ERR_CAST(rt);
37 dst = &rt->dst;
38 if (err)
39 dst = ERR_PTR(err);
40 return dst;
41} 39}
42 40
43static int xfrm4_get_saddr(struct net *net, 41static int xfrm4_get_saddr(struct net *net,
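
The lookup above drops the out-parameter-plus-int style: it returns the dst on success and propagates a routing failure unchanged via ERR_CAST(). A self-contained sketch of that propagation:

#define MAX_ERRNO 4095
#define IS_ERR(p) ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)

/* re-type an ERR_PTR-encoded pointer without decoding it */
static inline void *ERR_CAST(const void *ptr)
{
    return (void *)ptr; /* the encoded errno is preserved bit-for-bit */
}

struct rtable;
struct dst_entry;

static struct dst_entry *dst_lookup(struct rtable *rt)
{
    if (!IS_ERR(rt))
        return (struct dst_entry *)rt;  /* &rt->dst in the real code */
    return ERR_CAST(rt);    /* pass the routing error through as-is */
}
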
@@ -56,9 +54,9 @@ static int xfrm4_get_saddr(struct net *net,
56 return 0; 54 return 0;
57} 55}
58 56
59static int xfrm4_get_tos(struct flowi *fl) 57static int xfrm4_get_tos(const struct flowi *fl)
60{ 58{
61 return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ 59 return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
62} 60}
63 61
64static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, 62static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -68,11 +66,18 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
68} 66}
69 67
70static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, 68static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
71 struct flowi *fl) 69 const struct flowi *fl)
72{ 70{
73 struct rtable *rt = (struct rtable *)xdst->route; 71 struct rtable *rt = (struct rtable *)xdst->route;
72 const struct flowi4 *fl4 = &fl->u.ip4;
74 73
75 xdst->u.rt.fl = *fl; 74 rt->rt_key_dst = fl4->daddr;
75 rt->rt_key_src = fl4->saddr;
76 rt->rt_tos = fl4->flowi4_tos;
77 rt->rt_route_iif = fl4->flowi4_iif;
78 rt->rt_iif = fl4->flowi4_iif;
79 rt->rt_oif = fl4->flowi4_oif;
80 rt->rt_mark = fl4->flowi4_mark;
76 81
77 xdst->u.dst.dev = dev; 82 xdst->u.dst.dev = dev;
78 dev_hold(dev); 83 dev_hold(dev);
@@ -99,9 +104,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
99{ 104{
100 struct iphdr *iph = ip_hdr(skb); 105 struct iphdr *iph = ip_hdr(skb);
101 u8 *xprth = skb_network_header(skb) + iph->ihl * 4; 106 u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
107 struct flowi4 *fl4 = &fl->u.ip4;
102 108
103 memset(fl, 0, sizeof(struct flowi)); 109 memset(fl4, 0, sizeof(struct flowi4));
104 fl->mark = skb->mark; 110 fl4->flowi4_mark = skb->mark;
105 111
106 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { 112 if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
107 switch (iph->protocol) { 113 switch (iph->protocol) {
@@ -114,8 +120,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
114 pskb_may_pull(skb, xprth + 4 - skb->data)) { 120 pskb_may_pull(skb, xprth + 4 - skb->data)) {
115 __be16 *ports = (__be16 *)xprth; 121 __be16 *ports = (__be16 *)xprth;
116 122
117 fl->fl_ip_sport = ports[!!reverse]; 123 fl4->fl4_sport = ports[!!reverse];
118 fl->fl_ip_dport = ports[!reverse]; 124 fl4->fl4_dport = ports[!reverse];
119 } 125 }
120 break; 126 break;
121 127
@@ -123,8 +129,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
123 if (pskb_may_pull(skb, xprth + 2 - skb->data)) { 129 if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
124 u8 *icmp = xprth; 130 u8 *icmp = xprth;
125 131
126 fl->fl_icmp_type = icmp[0]; 132 fl4->fl4_icmp_type = icmp[0];
127 fl->fl_icmp_code = icmp[1]; 133 fl4->fl4_icmp_code = icmp[1];
128 } 134 }
129 break; 135 break;
130 136
@@ -132,7 +138,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
132 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 138 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
133 __be32 *ehdr = (__be32 *)xprth; 139 __be32 *ehdr = (__be32 *)xprth;
134 140
135 fl->fl_ipsec_spi = ehdr[0]; 141 fl4->fl4_ipsec_spi = ehdr[0];
136 } 142 }
137 break; 143 break;
138 144
@@ -140,7 +146,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
140 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 146 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
141 __be32 *ah_hdr = (__be32*)xprth; 147 __be32 *ah_hdr = (__be32*)xprth;
142 148
143 fl->fl_ipsec_spi = ah_hdr[1]; 149 fl4->fl4_ipsec_spi = ah_hdr[1];
144 } 150 }
145 break; 151 break;
146 152
@@ -148,7 +154,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
148 if (pskb_may_pull(skb, xprth + 4 - skb->data)) { 154 if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
149 __be16 *ipcomp_hdr = (__be16 *)xprth; 155 __be16 *ipcomp_hdr = (__be16 *)xprth;
150 156
151 fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); 157 fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
152 } 158 }
153 break; 159 break;
154 160
@@ -160,20 +166,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
160 if (greflags[0] & GRE_KEY) { 166 if (greflags[0] & GRE_KEY) {
161 if (greflags[0] & GRE_CSUM) 167 if (greflags[0] & GRE_CSUM)
162 gre_hdr++; 168 gre_hdr++;
163 fl->fl_gre_key = gre_hdr[1]; 169 fl4->fl4_gre_key = gre_hdr[1];
164 } 170 }
165 } 171 }
166 break; 172 break;
167 173
168 default: 174 default:
169 fl->fl_ipsec_spi = 0; 175 fl4->fl4_ipsec_spi = 0;
170 break; 176 break;
171 } 177 }
172 } 178 }
173 fl->proto = iph->protocol; 179 fl4->flowi4_proto = iph->protocol;
174 fl->fl4_dst = reverse ? iph->saddr : iph->daddr; 180 fl4->daddr = reverse ? iph->saddr : iph->daddr;
175 fl->fl4_src = reverse ? iph->daddr : iph->saddr; 181 fl4->saddr = reverse ? iph->daddr : iph->saddr;
176 fl->fl4_tos = iph->tos; 182 fl4->flowi4_tos = iph->tos;
177} 183}
178 184
179static inline int xfrm4_garbage_collect(struct dst_ops *ops) 185static inline int xfrm4_garbage_collect(struct dst_ops *ops)
@@ -196,8 +202,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
196{ 202{
197 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 203 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
198 204
205 dst_destroy_metrics_generic(dst);
206
199 if (likely(xdst->u.rt.peer)) 207 if (likely(xdst->u.rt.peer))
200 inet_putpeer(xdst->u.rt.peer); 208 inet_putpeer(xdst->u.rt.peer);
209
201 xfrm_dst_destroy(xdst); 210 xfrm_dst_destroy(xdst);
202} 211}
203 212
@@ -215,6 +224,7 @@ static struct dst_ops xfrm4_dst_ops = {
215 .protocol = cpu_to_be16(ETH_P_IP), 224 .protocol = cpu_to_be16(ETH_P_IP),
216 .gc = xfrm4_garbage_collect, 225 .gc = xfrm4_garbage_collect,
217 .update_pmtu = xfrm4_update_pmtu, 226 .update_pmtu = xfrm4_update_pmtu,
227 .cow_metrics = dst_cow_metrics_generic,
218 .destroy = xfrm4_dst_destroy, 228 .destroy = xfrm4_dst_destroy,
219 .ifdown = xfrm4_dst_ifdown, 229 .ifdown = xfrm4_dst_ifdown,
220 .local_out = __ip_local_out, 230 .local_out = __ip_local_out,
@@ -230,6 +240,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
230 .get_tos = xfrm4_get_tos, 240 .get_tos = xfrm4_get_tos,
231 .init_path = xfrm4_init_path, 241 .init_path = xfrm4_init_path,
232 .fill_dst = xfrm4_fill_dst, 242 .fill_dst = xfrm4_fill_dst,
243 .blackhole_route = ipv4_blackhole_route,
233}; 244};
234 245
235#ifdef CONFIG_SYSCTL 246#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 47947624eccc..805d63ef4340 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x)
21} 21}
22 22
23static void 23static void
24__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) 24__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
25{ 25{
26 sel->daddr.a4 = fl->fl4_dst; 26 const struct flowi4 *fl4 = &fl->u.ip4;
27 sel->saddr.a4 = fl->fl4_src; 27
28 sel->dport = xfrm_flowi_dport(fl); 28 sel->daddr.a4 = fl4->daddr;
29 sel->saddr.a4 = fl4->saddr;
30 sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
29 sel->dport_mask = htons(0xffff); 31 sel->dport_mask = htons(0xffff);
30 sel->sport = xfrm_flowi_sport(fl); 32 sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
31 sel->sport_mask = htons(0xffff); 33 sel->sport_mask = htons(0xffff);
32 sel->family = AF_INET; 34 sel->family = AF_INET;
33 sel->prefixlen_d = 32; 35 sel->prefixlen_d = 32;
34 sel->prefixlen_s = 32; 36 sel->prefixlen_s = 32;
35 sel->proto = fl->proto; 37 sel->proto = fl4->flowi4_proto;
36 sel->ifindex = fl->oif; 38 sel->ifindex = fl4->flowi4_oif;
37} 39}
38 40
39static void 41static void
40xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, 42xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
41 xfrm_address_t *daddr, xfrm_address_t *saddr) 43 const xfrm_address_t *daddr, const xfrm_address_t *saddr)
42{ 44{
43 x->id = tmpl->id; 45 x->id = tmpl->id;
44 if (x->id.daddr.a4 == 0) 46 if (x->id.daddr.a4 == 0)
@@ -76,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
76 .init_tempsel = __xfrm4_init_tempsel, 78 .init_tempsel = __xfrm4_init_tempsel,
77 .init_temprop = xfrm4_init_temprop, 79 .init_temprop = xfrm4_init_temprop,
78 .output = xfrm4_output, 80 .output = xfrm4_output,
81 .output_finish = xfrm4_output_finish,
79 .extract_input = xfrm4_extract_input, 82 .extract_input = xfrm4_extract_input,
80 .extract_output = xfrm4_extract_output, 83 .extract_output = xfrm4_extract_output,
81 .transport_finish = xfrm4_transport_finish, 84 .transport_finish = xfrm4_transport_finish,