path: root/net/ipv4
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                       |   42
-rw-r--r--  net/ipv4/Makefile                                      |    4
-rw-r--r--  net/ipv4/af_inet.c                                     |    2
-rw-r--r--  net/ipv4/ah4.c                                         |   25
-rw-r--r--  net/ipv4/devinet.c                                     |   78
-rw-r--r--  net/ipv4/fib_frontend.c                                |   63
-rw-r--r--  net/ipv4/fib_hash.c                                    | 1133
-rw-r--r--  net/ipv4/fib_lookup.h                                  |    2
-rw-r--r--  net/ipv4/fib_rules.c                                   |   12
-rw-r--r--  net/ipv4/fib_semantics.c                               |  125
-rw-r--r--  net/ipv4/fib_trie.c                                    |  217
-rw-r--r--  net/ipv4/icmp.c                                        |   49
-rw-r--r--  net/ipv4/inetpeer.c                                    |   52
-rw-r--r--  net/ipv4/ip_input.c                                    |    2
-rw-r--r--  net/ipv4/netfilter/Kconfig                             |    3
-rw-r--r--  net/ipv4/netfilter/arp_tables.c                        |    2
-rw-r--r--  net/ipv4/netfilter/ip_tables.c                         |    2
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c                     |    7
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c                           |    3
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c                    |    2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c |   17
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c                     |    8
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c                       |   33
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c                 |    9
-rw-r--r--  net/ipv4/route.c                                       |  726
-rw-r--r--  net/ipv4/tcp.c                                         |    2
-rw-r--r--  net/ipv4/tcp_input.c                                   |    2
-rw-r--r--  net/ipv4/tcp_ipv4.c                                    |    2
-rw-r--r--  net/ipv4/udp.c                                         |    2
-rw-r--r--  net/ipv4/xfrm4_policy.c                                |    4
30 files changed, 730 insertions(+), 1900 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d1..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
-choice
-	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
-	depends on IP_ADVANCED_ROUTER
-	default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algorithm.
-	  This improves lookup performance if you have a large
-	  number of routes.
-
-	  LC-trie is a longest matching prefix lookup algorithm which
-	  performs better than FIB_HASH for large routing tables.
-	  But, it consumes more memory and is more complex.
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
-	  June 1999
-
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
-	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
 config IP_FIB_TRIE_STATS
 	bool "FIB TRIE statistics"
-	depends on IP_FIB_TRIE
+	depends on IP_ADVANCED_ROUTER
 	---help---
 	  Keep track of statistics on structure of FIB TRIE table.
 	  Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
 	  handled by the klogd daemon which is responsible for kernel messages
 	  ("man klogd").
 
+config IP_ROUTE_CLASSID
+	bool
+
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
 	help
@@ -657,4 +624,3 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
-
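
The removed help text above describes LC-trie as a longest-matching-prefix algorithm. As a rough illustration of that semantic only (a toy userspace sketch, not kernel code; the table, addresses, and names below are made up), the most specific prefix covering the destination wins, regardless of lookup data structure:

#include <stdint.h>
#include <stdio.h>

struct toy_route {
	uint32_t prefix;	/* host byte order for simplicity */
	int	 plen;		/* prefix length, 0..32 */
	const char *via;
};

static const char *toy_lookup(const struct toy_route *tbl, int n, uint32_t dst)
{
	const char *best = NULL;
	int best_len = -1, i;

	for (i = 0; i < n; i++) {
		uint32_t mask = tbl[i].plen ? ~0u << (32 - tbl[i].plen) : 0;

		if ((dst & mask) == tbl[i].prefix && tbl[i].plen > best_len) {
			best = tbl[i].via;
			best_len = tbl[i].plen;
		}
	}
	return best;	/* NULL: no route */
}

int main(void)
{
	const struct toy_route tbl[] = {
		{ 0x00000000,  0, "default" },
		{ 0x0a000000,  8, "eth0"    },
		{ 0x0a010000, 16, "eth1"    },
	};

	/* 10.1.2.3 matches both 10/8 and 10.1/16; the /16 is more specific */
	printf("%s\n", toy_lookup(tbl, 3, 0x0a010203));	/* prints eth1 */
	return 0;
}

Both fib_hash and fib_trie implement exactly this lookup contract; the commit removes the slower-scaling of the two implementations rather than changing the semantics.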
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a75..0dc772d0d125 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
 	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o udplite.o \
 	     arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     fib_frontend.o fib_semantics.o \
+	     fib_frontend.o fib_semantics.o fib_trie.o \
 	     inet_fragment.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 45b89d7bda5a..7ceb80447631 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1231,7 +1231,7 @@ out:
 	return err;
 }
 
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	struct iphdr *iph;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 86961bec70ab..325053df6e70 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,7 +201,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	top_iph->ttl = 0;
 	top_iph->check = 0;
 
-	ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
 
 	ah->reserved = 0;
 	ah->spi = x->id.spi;
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	nexthdr = ah->nexthdr;
 	ah_hlen = (ah->hdrlen + 2) << 2;
 
-	if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
-	    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
-		goto out;
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
 
 	if (!pskb_may_pull(skb, ah_hlen))
 		goto out;
@@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x)
 
 	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
 
-	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
-					  ahp->icv_trunc_len);
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	x->data = ahp;
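
The three hunks above switch the AH header-length math between 8-byte and 4-byte alignment when XFRM_STATE_ALIGN4 is set. A standalone sketch of that arithmetic (the 16-byte ICV is an assumed example, e.g. HMAC-SHA-256-128; the fixed AH header is 12 bytes, and hdrlen is expressed in 32-bit words minus 2 per RFC 4302):

#include <stdio.h>

#define ALIGN_TO(x, a)	(((x) + (a) - 1) & ~((a) - 1))	/* a: power of two */

int main(void)
{
	unsigned int fixed = 12;	/* sizeof(struct ip_auth_hdr), fixed part */
	unsigned int icv_trunc = 16;	/* truncated ICV width, assumed */
	unsigned int a8 = ALIGN_TO(fixed + icv_trunc, 8);
	unsigned int a4 = ALIGN_TO(fixed + icv_trunc, 4);

	/* 28 bytes pads up to 32 with 8-byte alignment, stays 28 with 4-byte */
	printf("hdrlen(align8)=%u hdrlen(align4)=%u\n",
	       (a8 >> 2) - 2, (a4 >> 2) - 2);	/* prints 6 and 5 */
	return 0;
}

The observable difference on the wire is the four bytes of padding that the 8-byte-aligned form inserts, which is why ah_input() must accept either length depending on the state flag.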
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index df4616fce929..90389281d97a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -92,6 +93,71 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]		= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value.  So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE	256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+	u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+
+	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
+		(IN4_ADDR_HSIZE - 1));
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	unsigned int hash = inet_addr_hash(net, ifa->ifa_address);
+
+	spin_lock(&inet_addr_hash_lock);
+	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+	spin_lock(&inet_addr_hash_lock);
+	hlist_del_init_rcu(&ifa->hash);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	unsigned int hash = inet_addr_hash(net, addr);
+	struct net_device *result = NULL;
+	struct in_ifaddr *ifa;
+	struct hlist_node *node;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+		struct net_device *dev = ifa->ifa_dev->dev;
+
+		if (!net_eq(dev_net(dev), net))
+			continue;
+		if (ifa->ifa_address == addr) {
+			result = dev;
+			break;
+		}
+	}
+	if (result && devref)
+		dev_hold(result);
+	rcu_read_unlock();
+	return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
 static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +331,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		}
 
 		if (!do_promote) {
+			inet_hash_remove(ifa);
 			*ifap1 = ifa->ifa_next;
 
 			rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -281,6 +348,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
+	inet_hash_remove(ifa1);
 
 	/* 3. Announce address deletion */
 
@@ -368,6 +436,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	ifa->ifa_next = *ifap;
 	*ifap = ifa;
 
+	inet_hash_insert(dev_net(in_dev->dev), ifa);
+
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
@@ -521,6 +591,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
 	if (tb[IFA_ADDRESS] == NULL)
 		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
 
+	INIT_HLIST_NODE(&ifa->hash);
 	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
 	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
 	ifa->ifa_flags = ifm->ifa_flags;
@@ -728,6 +799,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (!ifa) {
 			ret = -ENOBUFS;
 			ifa = inet_alloc_ifa();
+			INIT_HLIST_NODE(&ifa->hash);
 			if (!ifa)
 				break;
 			if (colon)
@@ -1084,6 +1156,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			struct in_ifaddr *ifa = inet_alloc_ifa();
 
 			if (ifa) {
+				INIT_HLIST_NODE(&ifa->hash);
 				ifa->ifa_local =
 				ifa->ifa_address = htonl(INADDR_LOOPBACK);
 				ifa->ifa_prefixlen = 8;
@@ -1720,6 +1793,11 @@ static struct rtnl_af_ops inet_af_ops = {
 
 void __init devinet_init(void)
 {
+	int i;
+
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
 	register_pernet_subsys(&devinet_ops);
 
 	register_gifconf(PF_INET, inet_gifconf);
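
The new inet_addr_lst table above folds all four octets of the address, salted per namespace via hash_ptr(), into one of 256 buckets. A minimal userspace rendering of just the fold (the salt constant below is made up; the kernel derives it from the netns pointer):

#include <stdint.h>
#include <stdio.h>

#define IN4_ADDR_HSIZE 256

static unsigned int toy_addr_hash(uint32_t addr, uint32_t netns_salt)
{
	uint32_t val = addr ^ netns_salt;

	/* XOR the four bytes together so every octet influences the bucket */
	return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
	       (IN4_ADDR_HSIZE - 1);
}

int main(void)
{
	/* 192.0.2.1 as a host-order example value, arbitrary salt */
	printf("bucket=%u\n", toy_addr_hash(0xc0000201u, 0x5au));
	return 0;
}

Because every octet is folded into the low byte before masking, addresses that differ only in their high octets still spread across buckets, which matters for a table indexed by full host addresses.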
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd43a878..ad0778a3fa53 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
 {
 	struct fib_table *local_table, *main_table;
 
-	local_table = fib_hash_table(RT_TABLE_LOCAL);
+	local_table = fib_trie_table(RT_TABLE_LOCAL);
 	if (local_table == NULL)
 		return -ENOMEM;
 
-	main_table = fib_hash_table(RT_TABLE_MAIN);
+	main_table = fib_trie_table(RT_TABLE_MAIN);
 	if (main_table == NULL)
 		goto fail;
 
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 	if (tb)
 		return tb;
 
-	tb = fib_hash_table(id);
+	tb = fib_trie_table(id);
 	if (!tb)
 		return NULL;
 	h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 }
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
-void fib_select_default(struct net *net,
-			const struct flowi *flp, struct fib_result *res)
-{
-	struct fib_table *tb;
-	int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
-		return;
-	table = res->r->table;
-#endif
-	tb = fib_get_table(net, table);
-	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-		fib_table_select_default(tb, flp, res);
-}
-
 static void fib_flush(struct net *net)
 {
 	int flushed = 0;
@@ -147,46 +132,6 @@ static void fib_flush(struct net *net)
 	rt_cache_flush(net, -1);
 }
 
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
-	struct flowi fl = {
-		.fl4_dst = addr,
-	};
-	struct fib_result res = { 0 };
-	struct net_device *dev = NULL;
-	struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
-	rcu_read_lock();
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!local_table ||
-	    fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	if (res.type != RTN_LOCAL)
-		goto out;
-	dev = FIB_RES_DEV(res);
-
-	if (dev && devref)
-		dev_hold(dev);
-out:
-	rcu_read_unlock();
-	return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
 /*
  * Find address type as if only "dev" was present in the system. If
  * on_dev is NULL then all interfaces are taken into consideration.
@@ -1101,5 +1046,5 @@ void __init ip_fib_init(void)
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-	fib_hash_init();
+	fib_trie_init();
 }
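
With __ip_dev_find() moved out of this file, fib_new_table() above is the remaining hashing here, slotting table ids with h = id & (FIB_TABLE_HASHSZ - 1). That mask is equivalent to a modulo only because the table size is a power of two; a quick standalone check (the size value below is an assumed placeholder, not necessarily the kernel's FIB_TABLE_HASHSZ):

#include <assert.h>
#include <stdio.h>

#define TOY_TABLE_HASHSZ 256	/* must be a power of two for the mask trick */

int main(void)
{
	unsigned int id;

	/* id & (size - 1) == id % size holds for every id when size = 2^n */
	for (id = 0; id < 4096; id++)
		assert((id & (TOY_TABLE_HASHSZ - 1)) == id % TOY_TABLE_HASHSZ);
	printf("mask and modulo agree for power-of-two sizes\n");
	return 0;
}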
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b21..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
-/*
- * INET		An implementation of the TCP/IP protocol suite for the LINUX
- *		operating system.  INET is implemented using the BSD Socket
- *		interface as the means of communication with the user level.
- *
- *		IPv4 FIB: lookup engine and maintenance routines.
- *
- * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static struct kmem_cache *fn_hash_kmem __read_mostly;
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-
-struct fib_node {
-	struct hlist_node	fn_hash;
-	struct list_head	fn_alias;
-	__be32			fn_key;
-	struct fib_alias	fn_embedded_alias;
-};
-
-#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
-
-struct fn_zone {
-	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
-	struct hlist_head __rcu	*fz_hash;	/* Hash table pointer	*/
-	seqlock_t		fz_lock;
-	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
-
-	u8			fz_order;	/* Zone order (0..32)	*/
-	u8			fz_revorder;	/* 32 - fz_order	*/
-	__be32			fz_mask;	/* inet_make_mask(order) */
-#define FZ_MASK(fz)		((fz)->fz_mask)
-
-	struct hlist_head	fz_embedded_hash[EMBEDDED_HASH_SIZE];
-
-	int			fz_nent;	/* Number of entries	*/
-	int			fz_divisor;	/* Hash size (mask+1)	*/
-};
-
-struct fn_hash {
-	struct fn_zone		*fn_zones[33];
-	struct fn_zone __rcu	*fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
-	u32 h = ntohl(key) >> fz->fz_revorder;
-	h ^= (h>>20);
-	h ^= (h>>10);
-	h ^= (h>>5);
-	h &= fz->fz_hashmask;
-	return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
-	return dst & FZ_MASK(fz);
-}
-
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
-	unsigned long size = divisor * sizeof(struct hlist_head);
-
-	if (size <= PAGE_SIZE)
-		return kzalloc(size, GFP_KERNEL);
-
-	return (struct hlist_head *)
-		__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
-				   struct hlist_head *old_ht,
-				   int old_divisor)
-{
-	int i;
-
-	for (i = 0; i < old_divisor; i++) {
-		struct hlist_node *node, *n;
-		struct fib_node *f;
-
-		hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
-			struct hlist_head *new_head;
-
-			hlist_del_rcu(&f->fn_hash);
-
-			new_head = rcu_dereference_protected(fz->fz_hash, 1) +
-				   fn_hash(f->fn_key, fz);
-			hlist_add_head_rcu(&f->fn_hash, new_head);
-		}
-	}
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
-	unsigned long size = divisor * sizeof(struct hlist_head);
-
-	if (size <= PAGE_SIZE)
-		kfree(hash);
-	else
-		free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
-	struct hlist_head *ht, *old_ht;
-	int old_divisor, new_divisor;
-	u32 new_hashmask;
-
-	new_divisor = old_divisor = fz->fz_divisor;
-
-	switch (old_divisor) {
-	case EMBEDDED_HASH_SIZE:
-		new_divisor *= EMBEDDED_HASH_SIZE;
-		break;
-	case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
-		new_divisor *= (EMBEDDED_HASH_SIZE/2);
-		break;
-	default:
-		if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
-			printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
-			return;
-		}
-		new_divisor = (old_divisor << 1);
-		break;
-	}
-
-	new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
-	printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
-	       fz->fz_order, old_divisor);
-#endif
-
-	ht = fz_hash_alloc(new_divisor);
-
-	if (ht) {
-		struct fn_zone nfz;
-
-		memcpy(&nfz, fz, sizeof(nfz));
-
-		write_seqlock_bh(&fz->fz_lock);
-		old_ht = rcu_dereference_protected(fz->fz_hash, 1);
-		RCU_INIT_POINTER(nfz.fz_hash, ht);
-		nfz.fz_hashmask = new_hashmask;
-		nfz.fz_divisor = new_divisor;
-		fn_rebuild_zone(&nfz, old_ht, old_divisor);
-		fib_hash_genid++;
-		rcu_assign_pointer(fz->fz_hash, ht);
-		fz->fz_hashmask = new_hashmask;
-		fz->fz_divisor = new_divisor;
-		write_sequnlock_bh(&fz->fz_lock);
-
-		if (old_ht != fz->fz_embedded_hash) {
-			synchronize_rcu();
-			fz_hash_free(old_ht, old_divisor);
-		}
-	}
-}
-
-static void fn_free_node_rcu(struct rcu_head *head)
-{
-	struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
-
-	kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_node(struct fib_node *f)
-{
-	call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
-}
-
-static void fn_free_alias_rcu(struct rcu_head *head)
-{
-	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
-
-	kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
-{
-	fib_release_info(fa->fa_info);
-	if (fa == &f->fn_embedded_alias)
-		fa->fa_info = NULL;
-	else
-		call_rcu(&fa->rcu, fn_free_alias_rcu);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
-	int i;
-	struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
-	if (!fz)
-		return NULL;
-
-	seqlock_init(&fz->fz_lock);
-	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
-	fz->fz_hashmask = fz->fz_divisor - 1;
-	RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
-	fz->fz_order = z;
-	fz->fz_revorder = 32 - z;
-	fz->fz_mask = inet_make_mask(z);
-
-	/* Find the first not empty zone with more specific mask */
-	for (i = z + 1; i <= 32; i++)
-		if (table->fn_zones[i])
-			break;
-	if (i > 32) {
-		/* No more specific masks, we are the first. */
-		rcu_assign_pointer(fz->fz_next,
-				   rtnl_dereference(table->fn_zone_list));
-		rcu_assign_pointer(table->fn_zone_list, fz);
-	} else {
-		rcu_assign_pointer(fz->fz_next,
-				   rtnl_dereference(table->fn_zones[i]->fz_next));
-		rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
-	}
-	table->fn_zones[z] = fz;
-	fib_hash_genid++;
-	return fz;
-}
-
-int fib_table_lookup(struct fib_table *tb,
-		     const struct flowi *flp, struct fib_result *res,
-		     int fib_flags)
-{
-	int err;
-	struct fn_zone *fz;
-	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-
-	rcu_read_lock();
-	for (fz = rcu_dereference(t->fn_zone_list);
-	     fz != NULL;
-	     fz = rcu_dereference(fz->fz_next)) {
-		struct hlist_head *head;
-		struct hlist_node *node;
-		struct fib_node *f;
-		__be32 k;
-		unsigned int seq;
-
-		do {
-			seq = read_seqbegin(&fz->fz_lock);
-			k = fz_key(flp->fl4_dst, fz);
-
-			head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
-			hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-				if (f->fn_key != k)
-					continue;
-
-				err = fib_semantic_match(&f->fn_alias,
-							 flp, res,
-							 fz->fz_order, fib_flags);
-				if (err <= 0)
-					goto out;
-			}
-		} while (read_seqretry(&fz->fz_lock, seq));
-	}
-	err = 1;
-out:
-	rcu_read_unlock();
-	return err;
-}
-
-void fib_table_select_default(struct fib_table *tb,
-			      const struct flowi *flp, struct fib_result *res)
-{
-	int order, last_idx;
-	struct hlist_node *node;
-	struct fib_node *f;
-	struct fib_info *fi = NULL;
-	struct fib_info *last_resort;
-	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-	struct fn_zone *fz = t->fn_zones[0];
-	struct hlist_head *head;
-
-	if (fz == NULL)
-		return;
-
-	last_idx = -1;
-	last_resort = NULL;
-	order = -1;
-
-	rcu_read_lock();
-	head = rcu_dereference(fz->fz_hash);
-	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-		struct fib_alias *fa;
-
-		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
-			struct fib_info *next_fi = fa->fa_info;
-
-			if (fa->fa_scope != res->scope ||
-			    fa->fa_type != RTN_UNICAST)
-				continue;
-
-			if (next_fi->fib_priority > res->fi->fib_priority)
-				break;
-			if (!next_fi->fib_nh[0].nh_gw ||
-			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
-				continue;
-
-			fib_alias_accessed(fa);
-
-			if (fi == NULL) {
-				if (next_fi != res->fi)
-					break;
-			} else if (!fib_detect_death(fi, order, &last_resort,
-						     &last_idx, tb->tb_default)) {
-				fib_result_assign(res, fi);
-				tb->tb_default = order;
-				goto out;
-			}
-			fi = next_fi;
-			order++;
-		}
-	}
-
-	if (order <= 0 || fi == NULL) {
-		tb->tb_default = -1;
-		goto out;
-	}
-
-	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-			      tb->tb_default)) {
-		fib_result_assign(res, fi);
-		tb->tb_default = order;
-		goto out;
-	}
-
-	if (last_idx >= 0)
-		fib_result_assign(res, last_resort);
-	tb->tb_default = last_idx;
-out:
-	rcu_read_unlock();
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
-	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
-
-	hlist_add_head_rcu(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
-	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
-	struct hlist_node *node;
-	struct fib_node *f;
-
-	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-		if (f->fn_key == key)
-			return f;
-	}
-
-	return NULL;
-}
-
-
-static struct fib_alias *fib_fast_alloc(struct fib_node *f)
-{
-	struct fib_alias *fa = &f->fn_embedded_alias;
-
-	if (fa->fa_info != NULL)
-		fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
-	return fa;
-}
-
-/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
-{
-	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
-	struct fib_node *new_f = NULL;
-	struct fib_node *f;
-	struct fib_alias *fa, *new_fa;
-	struct fn_zone *fz;
-	struct fib_info *fi;
-	u8 tos = cfg->fc_tos;
-	__be32 key;
-	int err;
-
-	if (cfg->fc_dst_len > 32)
-		return -EINVAL;
-
-	fz = table->fn_zones[cfg->fc_dst_len];
-	if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
-		return -ENOBUFS;
-
-	key = 0;
-	if (cfg->fc_dst) {
-		if (cfg->fc_dst & ~FZ_MASK(fz))
-			return -EINVAL;
-		key = fz_key(cfg->fc_dst, fz);
-	}
-
-	fi = fib_create_info(cfg);
-	if (IS_ERR(fi))
-		return PTR_ERR(fi);
-
-	if (fz->fz_nent > (fz->fz_divisor<<1) &&
-	    fz->fz_divisor < FZ_MAX_DIVISOR &&
-	    (cfg->fc_dst_len == 32 ||
-	     (1 << cfg->fc_dst_len) > fz->fz_divisor))
-		fn_rehash_zone(fz);
-
-	f = fib_find_node(fz, key);
-
-	if (!f)
-		fa = NULL;
-	else
-		fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
-	/* Now fa, if non-NULL, points to the first fib alias
-	 * with the same keys [prefix,tos,priority], if such key already
-	 * exists or to the node before which we will insert new one.
-	 *
-	 * If fa is NULL, we will need to allocate a new one and
-	 * insert to the head of f.
-	 *
-	 * If f is NULL, no fib node matched the destination key
-	 * and we need to allocate a new one of those as well.
-	 */
-
-	if (fa && fa->fa_tos == tos &&
-	    fa->fa_info->fib_priority == fi->fib_priority) {
-		struct fib_alias *fa_first, *fa_match;
-
-		err = -EEXIST;
-		if (cfg->fc_nlflags & NLM_F_EXCL)
-			goto out;
-
-		/* We have 2 goals:
-		 * 1. Find exact match for type, scope, fib_info to avoid
-		 * duplicate routes
-		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
-		 */
-		fa_match = NULL;
-		fa_first = fa;
-		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
-		list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
-			if (fa->fa_tos != tos)
-				break;
-			if (fa->fa_info->fib_priority != fi->fib_priority)
-				break;
-			if (fa->fa_type == cfg->fc_type &&
-			    fa->fa_scope == cfg->fc_scope &&
-			    fa->fa_info == fi) {
-				fa_match = fa;
-				break;
-			}
-		}
-
-		if (cfg->fc_nlflags & NLM_F_REPLACE) {
-			u8 state;
-
-			fa = fa_first;
-			if (fa_match) {
-				if (fa == fa_match)
-					err = 0;
-				goto out;
-			}
-			err = -ENOBUFS;
-			new_fa = fib_fast_alloc(f);
-			if (new_fa == NULL)
-				goto out;
-
-			new_fa->fa_tos = fa->fa_tos;
-			new_fa->fa_info = fi;
-			new_fa->fa_type = cfg->fc_type;
-			new_fa->fa_scope = cfg->fc_scope;
-			state = fa->fa_state;
-			new_fa->fa_state = state & ~FA_S_ACCESSED;
-			fib_hash_genid++;
-			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
-			fn_free_alias(fa, f);
-			if (state & FA_S_ACCESSED)
-				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-			rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
-				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
-			return 0;
-		}
-
-		/* Error if we find a perfect match which
-		 * uses the same scope, type, and nexthop
-		 * information.
-		 */
-		if (fa_match)
-			goto out;
-
-		if (!(cfg->fc_nlflags & NLM_F_APPEND))
-			fa = fa_first;
-	}
-
-	err = -ENOENT;
-	if (!(cfg->fc_nlflags & NLM_F_CREATE))
-		goto out;
-
-	err = -ENOBUFS;
-
-	if (!f) {
-		new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
-		if (new_f == NULL)
-			goto out;
-
-		INIT_HLIST_NODE(&new_f->fn_hash);
-		INIT_LIST_HEAD(&new_f->fn_alias);
-		new_f->fn_key = key;
-		f = new_f;
-	}
-
-	new_fa = fib_fast_alloc(f);
-	if (new_fa == NULL)
-		goto out;
-
-	new_fa->fa_info = fi;
-	new_fa->fa_tos = tos;
-	new_fa->fa_type = cfg->fc_type;
-	new_fa->fa_scope = cfg->fc_scope;
-	new_fa->fa_state = 0;
-
-	/*
-	 * Insert new entry to the list.
-	 */
-
-	if (new_f)
-		fib_insert_node(fz, new_f);
-	list_add_tail_rcu(&new_fa->fa_list,
-			  (fa ? &fa->fa_list : &f->fn_alias));
-	fib_hash_genid++;
-
-	if (new_f)
-		fz->fz_nent++;
-	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-
-	rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
-		  &cfg->fc_nlinfo, 0);
-	return 0;
-
-out:
-	if (new_f)
-		kmem_cache_free(fn_hash_kmem, new_f);
-	fib_release_info(fi);
-	return err;
-}
-
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
-{
-	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-	struct fib_node *f;
-	struct fib_alias *fa, *fa_to_delete;
-	struct fn_zone *fz;
-	__be32 key;
-
-	if (cfg->fc_dst_len > 32)
-		return -EINVAL;
-
-	if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
-		return -ESRCH;
-
-	key = 0;
-	if (cfg->fc_dst) {
-		if (cfg->fc_dst & ~FZ_MASK(fz))
-			return -EINVAL;
-		key = fz_key(cfg->fc_dst, fz);
-	}
-
-	f = fib_find_node(fz, key);
-
-	if (!f)
-		fa = NULL;
-	else
-		fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
-	if (!fa)
-		return -ESRCH;
-
-	fa_to_delete = NULL;
-	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
-	list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
-		struct fib_info *fi = fa->fa_info;
-
-		if (fa->fa_tos != cfg->fc_tos)
-			break;
-
-		if ((!cfg->fc_type ||
-		     fa->fa_type == cfg->fc_type) &&
-		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
-		     fa->fa_scope == cfg->fc_scope) &&
-		    (!cfg->fc_protocol ||
-		     fi->fib_protocol == cfg->fc_protocol) &&
-		    fib_nh_match(cfg, fi) == 0) {
-			fa_to_delete = fa;
-			break;
-		}
-	}
-
-	if (fa_to_delete) {
-		int kill_fn;
-
-		fa = fa_to_delete;
-		rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
-			  tb->tb_id, &cfg->fc_nlinfo, 0);
-
-		kill_fn = 0;
-		list_del_rcu(&fa->fa_list);
-		if (list_empty(&f->fn_alias)) {
-			hlist_del_rcu(&f->fn_hash);
-			kill_fn = 1;
-		}
-		fib_hash_genid++;
-
-		if (fa->fa_state & FA_S_ACCESSED)
-			rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-		fn_free_alias(fa, f);
-		if (kill_fn) {
-			fn_free_node(f);
-			fz->fz_nent--;
-		}
-
-		return 0;
-	}
-	return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
-	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
-	struct hlist_node *node, *n;
-	struct fib_node *f;
-	int found = 0;
-
-	hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
-		struct fib_alias *fa, *fa_node;
-		int kill_f;
-
-		kill_f = 0;
-		list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
-			struct fib_info *fi = fa->fa_info;
-
-			if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-				list_del_rcu(&fa->fa_list);
-				if (list_empty(&f->fn_alias)) {
-					hlist_del_rcu(&f->fn_hash);
-					kill_f = 1;
-				}
-				fib_hash_genid++;
-
-				fn_free_alias(fa, f);
-				found++;
-			}
-		}
-		if (kill_f) {
-			fn_free_node(f);
-			fz->fz_nent--;
-		}
-	}
-	return found;
-}
-
-/* caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
-{
-	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
-	struct fn_zone *fz;
-	int found = 0;
-
-	for (fz = rtnl_dereference(table->fn_zone_list);
-	     fz != NULL;
-	     fz = rtnl_dereference(fz->fz_next)) {
-		int i;
-
-		for (i = fz->fz_divisor - 1; i >= 0; i--)
-			found += fn_flush_list(fz, i);
-	}
-	return found;
-}
-
-void fib_free_table(struct fib_table *tb)
-{
-	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
-	struct fn_zone *fz, *next;
-
-	next = table->fn_zone_list;
-	while (next != NULL) {
-		fz = next;
-		next = fz->fz_next;
-
-		if (fz->fz_hash != fz->fz_embedded_hash)
-			fz_hash_free(fz->fz_hash, fz->fz_divisor);
-
-		kfree(fz);
-	}
-
-	kfree(tb);
-}
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
-		    struct fib_table *tb,
-		    struct fn_zone *fz,
-		    struct hlist_head *head)
-{
-	struct hlist_node *node;
-	struct fib_node *f;
-	int i, s_i;
-
-	s_i = cb->args[4];
-	i = 0;
-	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
-		struct fib_alias *fa;
-
-		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
-			if (i < s_i)
-				goto next;
-
-			if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
-					  cb->nlh->nlmsg_seq,
-					  RTM_NEWROUTE,
-					  tb->tb_id,
-					  fa->fa_type,
-					  fa->fa_scope,
-					  f->fn_key,
-					  fz->fz_order,
-					  fa->fa_tos,
-					  fa->fa_info,
-					  NLM_F_MULTI) < 0) {
-				cb->args[4] = i;
-				return -1;
-			}
-next:
-			i++;
-		}
-	}
-	cb->args[4] = i;
-	return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
-		  struct fib_table *tb,
-		  struct fn_zone *fz)
-{
-	int h, s_h;
-	struct hlist_head *head = rcu_dereference(fz->fz_hash);
-
-	if (head == NULL)
-		return skb->len;
-	s_h = cb->args[3];
-	for (h = s_h; h < fz->fz_divisor; h++) {
-		if (hlist_empty(head + h))
-			continue;
-		if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
-			cb->args[3] = h;
-			return -1;
-		}
-		memset(&cb->args[4], 0,
-		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
-	}
-	cb->args[3] = h;
-	return skb->len;
-}
-
-int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
-		   struct netlink_callback *cb)
-{
-	int m = 0, s_m;
-	struct fn_zone *fz;
-	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-
-	s_m = cb->args[2];
-	rcu_read_lock();
-	for (fz = rcu_dereference(table->fn_zone_list);
-	     fz != NULL;
-	     fz = rcu_dereference(fz->fz_next), m++) {
-		if (m < s_m)
-			continue;
-		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
-			cb->args[2] = m;
-			rcu_read_unlock();
-			return -1;
-		}
-		memset(&cb->args[3], 0,
-		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
-	}
-	rcu_read_unlock();
-	cb->args[2] = m;
-	return skb->len;
-}
-
-void __init fib_hash_init(void)
-{
-	fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
-					 0, SLAB_PANIC, NULL);
-
-	fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
-					  0, SLAB_PANIC, NULL);
-
-}
-
-struct fib_table *fib_hash_table(u32 id)
-{
-	struct fib_table *tb;
-
-	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
-		     GFP_KERNEL);
-	if (tb == NULL)
-		return NULL;
-
-	tb->tb_id = id;
-	tb->tb_default = -1;
-
-	memset(tb->tb_data, 0, sizeof(struct fn_hash));
-	return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
-	struct seq_net_private p;
-	struct fn_zone	*zone;
-	int		bucket;
-	struct hlist_head *hash_head;
-	struct fib_node *fn;
-	struct fib_alias *fa;
-	loff_t	pos;
-	unsigned int genid;
-	int	valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
-	struct fib_iter_state *iter = seq->private;
-	struct fib_table *main_table;
-	struct fn_hash *table;
-
-	main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
-	table = (struct fn_hash *)main_table->tb_data;
-
-	iter->bucket    = 0;
-	iter->hash_head = NULL;
-	iter->fn        = NULL;
-	iter->fa        = NULL;
-	iter->pos	= 0;
-	iter->genid	= fib_hash_genid;
-	iter->valid	= 1;
-
-	for (iter->zone = rcu_dereference(table->fn_zone_list);
-	     iter->zone != NULL;
-	     iter->zone = rcu_dereference(iter->zone->fz_next)) {
-		int maxslot;
-
-		if (!iter->zone->fz_nent)
-			continue;
-
-		iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-		maxslot = iter->zone->fz_divisor;
-
-		for (iter->bucket = 0; iter->bucket < maxslot;
-		     ++iter->bucket, ++iter->hash_head) {
-			struct hlist_node *node;
-			struct fib_node *fn;
-
-			hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
-				struct fib_alias *fa;
-
-				list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-					iter->fn = fn;
-					iter->fa = fa;
-					goto out;
-				}
-			}
-		}
-	}
-out:
-	return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
-	struct fib_iter_state *iter = seq->private;
-	struct fib_node *fn;
-	struct fib_alias *fa;
-
-	/* Advance FA, if any. */
-	fn = iter->fn;
-	fa = iter->fa;
-	if (fa) {
-		BUG_ON(!fn);
-		list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
-			iter->fa = fa;
-			goto out;
-		}
-	}
-
-	fa = iter->fa = NULL;
-
-	/* Advance FN. */
-	if (fn) {
-		struct hlist_node *node = &fn->fn_hash;
-		hlist_for_each_entry_continue(fn, node, fn_hash) {
-			iter->fn = fn;
-
-			list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-				iter->fa = fa;
-				goto out;
-			}
-		}
-	}
-
-	fn = iter->fn = NULL;
-
-	/* Advance hash chain. */
-	if (!iter->zone)
-		goto out;
-
-	for (;;) {
-		struct hlist_node *node;
-		int maxslot;
-
-		maxslot = iter->zone->fz_divisor;
-
-		while (++iter->bucket < maxslot) {
-			iter->hash_head++;
-
-			hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
-				list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-					iter->fn = fn;
-					iter->fa = fa;
-					goto out;
-				}
-			}
-		}
-
-		iter->zone = rcu_dereference(iter->zone->fz_next);
-
-		if (!iter->zone)
-			goto out;
-
-		iter->bucket = 0;
-		iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-
-		hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
-			list_for_each_entry(fa, &fn->fn_alias, fa_list) {
-				iter->fn = fn;
-				iter->fa = fa;
-				goto out;
-			}
-		}
-	}
-out:
-	iter->pos++;
-	return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct fib_iter_state *iter = seq->private;
-	struct fib_alias *fa;
-
-	if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
-		fa   = iter->fa;
-		pos -= iter->pos;
-	} else
-		fa = fib_get_first(seq);
-
-	if (fa)
-		while (pos && (fa = fib_get_next(seq)))
-			--pos;
-	return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
-{
-	void *v = NULL;
-
-	rcu_read_lock();
-	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
-		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
-	return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	++*pos;
-	return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
-	__releases(RCU)
-{
-	rcu_read_unlock();
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
-	static const unsigned type2flags[RTN_MAX + 1] = {
-		[7] = RTF_REJECT,
-		[8] = RTF_REJECT,
-	};
-	unsigned flags = type2flags[type];
-
-	if (fi && fi->fib_nh->nh_gw)
-		flags |= RTF_GATEWAY;
-	if (mask == htonl(0xFFFFFFFF))
-		flags |= RTF_HOST;
-	flags |= RTF_UP;
-	return flags;
-}
-
-/*
- * This outputs /proc/net/route.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
-	struct fib_iter_state *iter;
-	int len;
-	__be32 prefix, mask;
-	unsigned flags;
-	struct fib_node *f;
-	struct fib_alias *fa;
-	struct fib_info *fi;
-
-	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
-			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
-			   "\tWindow\tIRTT");
-		goto out;
-	}
-
-	iter	= seq->private;
-	f	= iter->fn;
-	fa	= iter->fa;
-	fi	= fa->fa_info;
-	prefix	= f->fn_key;
-	mask	= FZ_MASK(iter->zone);
-	flags	= fib_flag_trans(fa->fa_type, mask, fi);
-	if (fi)
-		seq_printf(seq,
-			   "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
-			   fi->fib_dev ? fi->fib_dev->name : "*", prefix,
-			   fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
-			   mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
-			   fi->fib_window,
-			   fi->fib_rtt >> 3, &len);
-	else
-		seq_printf(seq,
-			   "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
-			   prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
-
-	seq_printf(seq, "%*s\n", 127 - len, "");
-out:
-	return 0;
-}
-
-static const struct seq_operations fib_seq_ops = {
-	.start  = fib_seq_start,
-	.next   = fib_seq_next,
-	.stop   = fib_seq_stop,
-	.show   = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
-	return seq_open_net(inode, file, &fib_seq_ops,
-			    sizeof(struct fib_iter_state));
-}
-
-static const struct file_operations fib_seq_fops = {
-	.owner		= THIS_MODULE,
-	.open		= fib_seq_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_net,
-};
-
-int __net_init fib_proc_init(struct net *net)
-{
-	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
-		return -ENOMEM;
-	return 0;
-}
-
-void __net_exit fib_proc_exit(struct net *net)
-{
-	proc_net_remove(net, "route");
-}
-#endif /* CONFIG_PROC_FS */
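
The deleted lookup above keyed each zone by the destination masked to the zone's prefix length, shifted the significant bits down, and folded them (fn_hash()). A userspace sketch of that bucket computation (the 8-bucket divisor mirrors the initial embedded table on a 64-byte-cacheline, 8-byte-pointer build; that concrete value is an assumption):

#include <stdint.h>
#include <stdio.h>

static uint32_t toy_fn_hash(uint32_t key_host, int order, uint32_t hashmask)
{
	/* fz_revorder == 32 - fz_order; order 0 would shift by 32 (UB in C),
	 * so the zero-prefix zone is special-cased here */
	uint32_t h = order ? key_host >> (32 - order) : 0;

	h ^= h >> 20;
	h ^= h >> 10;
	h ^= h >> 5;
	return h & hashmask;
}

int main(void)
{
	/* 198.51.100.0/24 against an 8-bucket zone table (hashmask 7) */
	printf("bucket=%u\n", toy_fn_hash(0xc6336400u, 24, 7));
	return 0;
}

One zone per prefix length is precisely why fib_hash scales poorly: a lookup walks the zone list from most to least specific, hashing once per zone, whereas the LC-trie resolves the longest match in a single traversal.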
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec651..d5c40d8f6632 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -25,7 +25,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
 }
 
 /* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
+extern int fib_semantic_match(struct fib_table *tb, struct list_head *head,
 			      const struct flowi *flp,
 			      struct fib_result *res, int prefixlen, int fib_flags);
 extern void fib_release_info(struct fib_info *);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7b..3018efbaea77 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,13 +41,13 @@ struct fib4_rule {
 	__be32			srcmask;
 	__be32			dst;
 	__be32			dstmask;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 	u32			tclassid;
 #endif
 };
 
-#ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
+#ifdef CONFIG_IP_ROUTE_CLASSID
+u32 fib_rules_tclass(const struct fib_result *res)
 {
 	return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
 }
@@ -165,7 +165,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	if (frh->dst_len)
 		rule4->dst = nla_get_be32(tb[FRA_DST]);
 
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 	if (tb[FRA_FLOW])
 		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
 #endif
@@ -195,7 +195,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 	if (frh->tos && (rule4->tos != frh->tos))
 		return 0;
 
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 	if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
 		return 0;
 #endif
@@ -224,7 +224,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 	if (rule4->src_len)
 		NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
 
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 	if (rule4->tclassid)
 		NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
 #endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b7..562f34cd9303 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
 static DEFINE_SPINLOCK(fib_info_lock);
 static struct hlist_head *fib_info_hash;
 static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
+static unsigned int fib_info_hash_size;
 static unsigned int fib_info_cnt;
 
 #define DEVINDEX_HASHBITS 8
@@ -152,6 +152,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 {
 	struct fib_info *fi = container_of(head, struct fib_info, rcu);
 
+	if (fi->fib_metrics != (u32 *) dst_default_metrics)
+		kfree(fi->fib_metrics);
 	kfree(fi);
 }
 
@@ -200,7 +202,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		    nh->nh_weight != onh->nh_weight ||
 #endif
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
 		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,7 +223,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
 
 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
 {
-	unsigned int mask = (fib_hash_size - 1);
+	unsigned int mask = (fib_info_hash_size - 1);
 	unsigned int val = fi->fib_nhs;
 
 	val ^= fi->fib_protocol;
@@ -422,7 +424,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
 			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
 #endif
@@ -476,7 +478,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
 			if (nla && nla_get_be32(nla) != nh->nh_gw)
 				return 1;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 			nla = nla_find(attrs, attrlen, RTA_FLOW);
 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
 				return 1;
@@ -613,14 +615,14 @@ out:
 
 static inline unsigned int fib_laddr_hashfn(__be32 val)
 {
-	unsigned int mask = (fib_hash_size - 1);
+	unsigned int mask = (fib_info_hash_size - 1);
 
 	return ((__force u32)val ^
 		((__force u32)val >> 7) ^
 		((__force u32)val >> 14)) & mask;
 }
 
-static struct hlist_head *fib_hash_alloc(int bytes)
+static struct hlist_head *fib_info_hash_alloc(int bytes)
 {
 	if (bytes <= PAGE_SIZE)
 		return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +632,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
 			 get_order(bytes));
 }
 
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
 {
 	if (!hash)
 		return;
@@ -641,18 +643,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
 		free_pages((unsigned long) hash, get_order(bytes));
 }
 
-static void fib_hash_move(struct hlist_head *new_info_hash,
-			  struct hlist_head *new_laddrhash,
-			  unsigned int new_size)
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+			       struct hlist_head *new_laddrhash,
+			       unsigned int new_size)
 {
 	struct hlist_head *old_info_hash, *old_laddrhash;
-	unsigned int old_size = fib_hash_size;
+	unsigned int old_size = fib_info_hash_size;
 	unsigned int i, bytes;
 
 	spin_lock_bh(&fib_info_lock);
 	old_info_hash = fib_info_hash;
 	old_laddrhash = fib_info_laddrhash;
-	fib_hash_size = new_size;
+	fib_info_hash_size = new_size;
 
 	for (i = 0; i < old_size; i++) {
 		struct hlist_head *head = &fib_info_hash[i];
@@ -693,8 +695,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
 	spin_unlock_bh(&fib_info_lock);
 
 	bytes = old_size * sizeof(struct hlist_head *);
-	fib_hash_free(old_info_hash, bytes);
-	fib_hash_free(old_laddrhash, bytes);
+	fib_info_hash_free(old_info_hash, bytes);
+	fib_info_hash_free(old_laddrhash, bytes);
 }
 
 struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -718,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 #endif
 
 	err = -ENOBUFS;
-	if (fib_info_cnt >= fib_hash_size) {
-		unsigned int new_size = fib_hash_size << 1;
+	if (fib_info_cnt >= fib_info_hash_size) {
+		unsigned int new_size = fib_info_hash_size << 1;
 		struct hlist_head *new_info_hash;
 		struct hlist_head *new_laddrhash;
 		unsigned int bytes;
@@ -727,21 +729,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 		if (!new_size)
 			new_size = 1;
 		bytes = new_size * sizeof(struct hlist_head *);
-		new_info_hash = fib_hash_alloc(bytes);
-		new_laddrhash = fib_hash_alloc(bytes);
+		new_info_hash = fib_info_hash_alloc(bytes);
+		new_laddrhash = fib_info_hash_alloc(bytes);
 		if (!new_info_hash || !new_laddrhash) {
-			fib_hash_free(new_info_hash, bytes);
-			fib_hash_free(new_laddrhash, bytes);
+			fib_info_hash_free(new_info_hash, bytes);
+			fib_info_hash_free(new_laddrhash, bytes);
 		} else
-			fib_hash_move(new_info_hash, new_laddrhash, new_size);
+			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
 
-		if (!fib_hash_size)
+		if (!fib_info_hash_size)
 			goto failure;
 	}
 
 	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
 	if (fi == NULL)
 		goto failure;
+	if (cfg->fc_mx) {
+		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+		if (!fi->fib_metrics)
+			goto failure;
+	} else
+		fi->fib_metrics = (u32 *) dst_default_metrics;
 	fib_info_cnt++;
 
 	fi->fib_net = hold_net(net);
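
The hunk above points fib_metrics at the shared read-only dst_default_metrics array unless the route carries explicit metrics (cfg->fc_mx), and the free_fib_info_rcu() hunk earlier only kfrees when the pointer differs from that default. A self-contained sketch of the share-by-default pattern (all names and the array length below are illustrative, not kernel API):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define TOY_RTAX_MAX 16	/* stands in for RTAX_MAX */

static const uint32_t default_metrics[TOY_RTAX_MAX];	/* all zero, shared */

struct toy_fib_info {
	uint32_t *fib_metrics;
};

static int toy_init(struct toy_fib_info *fi, const uint32_t *cfg_mx)
{
	if (cfg_mx) {
		/* explicit metrics: private, writable copy */
		fi->fib_metrics = calloc(TOY_RTAX_MAX, sizeof(uint32_t));
		if (!fi->fib_metrics)
			return -1;
		memcpy(fi->fib_metrics, cfg_mx,
		       TOY_RTAX_MAX * sizeof(uint32_t));
	} else {
		/* common case: alias the shared read-only default */
		fi->fib_metrics = (uint32_t *)default_metrics;
	}
	return 0;
}

static void toy_free(struct toy_fib_info *fi)
{
	/* mirror of free_fib_info_rcu(): never free the shared default */
	if (fi->fib_metrics != (uint32_t *)default_metrics)
		free(fi->fib_metrics);
}

int main(void)
{
	struct toy_fib_info a, b;
	uint32_t mx[TOY_RTAX_MAX] = { 1500 };

	toy_init(&a, NULL);	/* shares default_metrics, no allocation */
	toy_init(&b, mx);	/* gets a private copy */
	toy_free(&a);
	toy_free(&b);
	return 0;
}

Since most routes specify no metrics, this avoids a per-route RTAX_MAX-sized allocation in the common case.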
@@ -779,7 +787,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			goto err_inval;
 		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
 			goto err_inval;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
 			goto err_inval;
 #endif
@@ -792,7 +800,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 		nh->nh_oif = cfg->fc_oif;
 		nh->nh_gw = cfg->fc_gw;
 		nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 		nh->nh_tclassid = cfg->fc_flow;
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -881,8 +889,9 @@ failure:
 }
 
 /* Note! fib_semantic_match intentionally uses RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
-		       struct fib_result *res, int prefixlen, int fib_flags)
+int fib_semantic_match(struct fib_table *tb, struct list_head *head,
+		       const struct flowi *flp, struct fib_result *res,
+		       int prefixlen, int fib_flags)
 {
 	struct fib_alias *fa;
 	int nh_sel = 0;
@@ -946,6 +955,8 @@ out_fill_res:
 	res->type = fa->fa_type;
 	res->scope = fa->fa_scope;
 	res->fi = fa->fa_info;
+	res->table = tb;
+	res->fa_head = head;
 	if (!(fib_flags & FIB_LOOKUP_NOREF))
 		atomic_inc(&res->fi->fib_clntref);
 	return 0;
@@ -1002,7 +1013,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 
 	if (fi->fib_nh->nh_oif)
 		NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 	if (fi->fib_nh[0].nh_tclassid)
 		NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
 #endif
@@ -1027,7 +1038,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 
 			if (nh->nh_gw)
 				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 			if (nh->nh_tclassid)
 				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
 #endif
@@ -1125,6 +1136,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 	return ret;
 }
 
+/* Must be invoked inside of an RCU protected region.  */
+void fib_select_default(struct fib_result *res)
+{
+	struct fib_info *fi = NULL, *last_resort = NULL;
+	struct list_head *fa_head = res->fa_head;
+	struct fib_table *tb = res->table;
+	int order = -1, last_idx = -1;
+	struct fib_alias *fa;
+
+	list_for_each_entry_rcu(fa, fa_head, fa_list) {
+		struct fib_info *next_fi = fa->fa_info;
+
+		if (fa->fa_scope != res->scope ||
+		    fa->fa_type != RTN_UNICAST)
+			continue;
+
+		if (next_fi->fib_priority > res->fi->fib_priority)
+			break;
+		if (!next_fi->fib_nh[0].nh_gw ||
+		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+			continue;
+
+		fib_alias_accessed(fa);
+
+		if (fi == NULL) {
+			if (next_fi != res->fi)
+				break;
+		} else if (!fib_detect_death(fi, order, &last_resort,
+					     &last_idx, tb->tb_default)) {
+			fib_result_assign(res, fi);
+			tb->tb_default = order;
+			goto out;
+		}
+		fi = next_fi;
+		order++;
+	}
+
+	if (order <= 0 || fi == NULL) {
+		tb->tb_default = -1;
+		goto out;
+	}
+
+	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+			      tb->tb_default)) {
+		fib_result_assign(res, fi);
+		tb->tb_default = order;
+		goto out;
+	}
1187
1188 if (last_idx >= 0)
1189 fib_result_assign(res, last_resort);
1190 tb->tb_default = last_idx;
1191out:
1192 return;
1193}
1194
1128#ifdef CONFIG_IP_ROUTE_MULTIPATH 1195#ifdef CONFIG_IP_ROUTE_MULTIPATH
1129 1196
1130/* 1197/*
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f280348e0fd..edf3b0997e01 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
95#define IS_TNODE(n) (!(n->parent & T_LEAF)) 95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF) 96#define IS_LEAF(n) (n->parent & T_LEAF)
97 97
98struct node { 98struct rt_trie_node {
99 unsigned long parent; 99 unsigned long parent;
100 t_key key; 100 t_key key;
101}; 101};
@@ -126,7 +126,7 @@ struct tnode {
126 struct work_struct work; 126 struct work_struct work;
127 struct tnode *tnode_free; 127 struct tnode *tnode_free;
128 }; 128 };
129 struct node *child[0]; 129 struct rt_trie_node *child[0];
130}; 130};
131 131
132#ifdef CONFIG_IP_FIB_TRIE_STATS 132#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,16 @@ struct trie_stat {
151}; 151};
152 152
153struct trie { 153struct trie {
154 struct node *trie; 154 struct rt_trie_node *trie;
155#ifdef CONFIG_IP_FIB_TRIE_STATS 155#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats; 156 struct trie_use_stats stats;
157#endif 157#endif
158}; 158};
159 159
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); 160static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 161static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 int wasfull); 162 int wasfull);
163static struct node *resize(struct trie *t, struct tnode *tn); 163static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164static struct tnode *inflate(struct trie *t, struct tnode *tn); 164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn); 165static struct tnode *halve(struct trie *t, struct tnode *tn);
166/* tnodes to free after resize(); protected by RTNL */ 166/* tnodes to free after resize(); protected by RTNL */
@@ -177,12 +177,12 @@ static const int sync_pages = 128;
177static struct kmem_cache *fn_alias_kmem __read_mostly; 177static struct kmem_cache *fn_alias_kmem __read_mostly;
178static struct kmem_cache *trie_leaf_kmem __read_mostly; 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 179
180static inline struct tnode *node_parent(struct node *node) 180static inline struct tnode *node_parent(struct rt_trie_node *node)
181{ 181{
182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); 182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183} 183}
184 184
185static inline struct tnode *node_parent_rcu(struct node *node) 185static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
186{ 186{
187 struct tnode *ret = node_parent(node); 187 struct tnode *ret = node_parent(node);
188 188
@@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node)
192/* Same as rcu_assign_pointer 192/* Same as rcu_assign_pointer
193 * but that macro() assumes that value is a pointer. 193 * but that macro() assumes that value is a pointer.
194 */ 194 */
195static inline void node_set_parent(struct node *node, struct tnode *ptr) 195static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
196{ 196{
197 smp_wmb(); 197 smp_wmb();
198 node->parent = (unsigned long)ptr | NODE_TYPE(node); 198 node->parent = (unsigned long)ptr | NODE_TYPE(node);
199} 199}
200 200
201static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) 201static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
202{ 202{
203 BUG_ON(i >= 1U << tn->bits); 203 BUG_ON(i >= 1U << tn->bits);
204 204
205 return tn->child[i]; 205 return tn->child[i];
206} 206}
207 207
208static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) 208static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
209{ 209{
210 struct node *ret = tnode_get_child(tn, i); 210 struct rt_trie_node *ret = tnode_get_child(tn, i);
211 211
212 return rcu_dereference_rtnl(ret); 212 return rcu_dereference_rtnl(ret);
213} 213}
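
[Annotation] node_set_parent() in the hunk above open-codes rcu_assign_pointer() because parent packs node-type bits into an unsigned long rather than holding a plain pointer: the smp_wmb() orders a node's initialization before the store that publishes it to RCU readers. A hedged C11 analogue of that publish-with-release pattern (field and mask names are illustrative):

#include <stdatomic.h>

#define NODE_TYPE_MASK 0x3UL

struct node_sketch {
	_Atomic unsigned long parent;	/* parent pointer bits | low type bits */
	unsigned long key;
};

static void publish_parent(struct node_sketch *n, struct node_sketch *parent)
{
	unsigned long type = atomic_load_explicit(&n->parent,
						  memory_order_relaxed)
			     & NODE_TYPE_MASK;

	/* release store: all prior writes to *n become visible before
	 * any reader can observe the new parent value */
	atomic_store_explicit(&n->parent, (unsigned long)parent | type,
			      memory_order_release);
}
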
@@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn)
217 return 1 << tn->bits; 217 return 1 << tn->bits;
218} 218}
219 219
220static inline t_key mask_pfx(t_key k, unsigned short l) 220static inline t_key mask_pfx(t_key k, unsigned int l)
221{ 221{
222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); 222 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
223} 223}
224 224
225static inline t_key tkey_extract_bits(t_key a, int offset, int bits) 225static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
226{ 226{
227 if (offset < KEYLENGTH) 227 if (offset < KEYLENGTH)
228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits); 228 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
378{ 378{
379 struct tnode *tn = container_of(head, struct tnode, rcu); 379 struct tnode *tn = container_of(head, struct tnode, rcu);
380 size_t size = sizeof(struct tnode) + 380 size_t size = sizeof(struct tnode) +
381 (sizeof(struct node *) << tn->bits); 381 (sizeof(struct rt_trie_node *) << tn->bits);
382 382
383 if (size <= PAGE_SIZE) 383 if (size <= PAGE_SIZE)
384 kfree(tn); 384 kfree(tn);
@@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn)
402 tn->tnode_free = tnode_free_head; 402 tn->tnode_free = tnode_free_head;
403 tnode_free_head = tn; 403 tnode_free_head = tn;
404 tnode_free_size += sizeof(struct tnode) + 404 tnode_free_size += sizeof(struct tnode) +
405 (sizeof(struct node *) << tn->bits); 405 (sizeof(struct rt_trie_node *) << tn->bits);
406} 406}
407 407
408static void tnode_free_flush(void) 408static void tnode_free_flush(void)
@@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen)
443 443
444static struct tnode *tnode_new(t_key key, int pos, int bits) 444static struct tnode *tnode_new(t_key key, int pos, int bits)
445{ 445{
446 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); 446 size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
447 struct tnode *tn = tnode_alloc(sz); 447 struct tnode *tn = tnode_alloc(sz);
448 448
449 if (tn) { 449 if (tn) {
@@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
456 } 456 }
457 457
458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 458 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
459 sizeof(struct node) << bits); 459 sizeof(struct rt_trie_node) << bits);
460 return tn; 460 return tn;
461} 461}
462 462
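
[Annotation] tnode_new() above sizes each node as the struct header plus (sizeof(struct rt_trie_node *) << bits) trailing child slots — the flexible-array allocation behind the child[0] member seen earlier in this file. A small standalone sketch of the pattern (struct name is illustrative):

#include <stdlib.h>

struct tnode_sketch {
	unsigned long key;
	unsigned char pos;
	unsigned char bits;
	struct tnode_sketch *child[];	/* 1 << bits slots follow the header */
};

static struct tnode_sketch *tnode_new_sketch(unsigned long key,
					     unsigned char pos,
					     unsigned char bits)
{
	size_t sz = sizeof(struct tnode_sketch) +
		    (sizeof(struct tnode_sketch *) << bits);
	struct tnode_sketch *tn = calloc(1, sz);	/* children start NULL */

	if (tn) {
		tn->key = key;
		tn->pos = pos;
		tn->bits = bits;
	}
	return tn;
}
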
@@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
465 * and no bits are skipped. See discussion in dyntree paper p. 6 465 * and no bits are skipped. See discussion in dyntree paper p. 6
466 */ 466 */
467 467
468static inline int tnode_full(const struct tnode *tn, const struct node *n) 468static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
469{ 469{
470 if (n == NULL || IS_LEAF(n)) 470 if (n == NULL || IS_LEAF(n))
471 return 0; 471 return 0;
@@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
474} 474}
475 475
476static inline void put_child(struct trie *t, struct tnode *tn, int i, 476static inline void put_child(struct trie *t, struct tnode *tn, int i,
477 struct node *n) 477 struct rt_trie_node *n)
478{ 478{
479 tnode_put_child_reorg(tn, i, n, -1); 479 tnode_put_child_reorg(tn, i, n, -1);
480} 480}
@@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
484 * Update the value of full_children and empty_children. 484 * Update the value of full_children and empty_children.
485 */ 485 */
486 486
487static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, 487static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
488 int wasfull) 488 int wasfull)
489{ 489{
490 struct node *chi = tn->child[i]; 490 struct rt_trie_node *chi = tn->child[i];
491 int isfull; 491 int isfull;
492 492
493 BUG_ON(i >= 1<<tn->bits); 493 BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
515} 515}
516 516
517#define MAX_WORK 10 517#define MAX_WORK 10
518static struct node *resize(struct trie *t, struct tnode *tn) 518static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
519{ 519{
520 int i; 520 int i;
521 struct tnode *old_tn; 521 struct tnode *old_tn;
@@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
605 605
606 /* Keep root node larger */ 606 /* Keep root node larger */
607 607
608 if (!node_parent((struct node *)tn)) { 608 if (!node_parent((struct rt_trie_node *)tn)) {
609 inflate_threshold_use = inflate_threshold_root; 609 inflate_threshold_use = inflate_threshold_root;
610 halve_threshold_use = halve_threshold_root; 610 halve_threshold_use = halve_threshold_root;
611 } else { 611 } else {
@@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
635 635
636 /* Return if at least one inflate is run */ 636 /* Return if at least one inflate is run */
637 if (max_work != MAX_WORK) 637 if (max_work != MAX_WORK)
638 return (struct node *) tn; 638 return (struct rt_trie_node *) tn;
639 639
640 /* 640 /*
641 * Halve as long as the number of empty children in this 641 * Halve as long as the number of empty children in this
@@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
663 if (tn->empty_children == tnode_child_length(tn) - 1) { 663 if (tn->empty_children == tnode_child_length(tn) - 1) {
664one_child: 664one_child:
665 for (i = 0; i < tnode_child_length(tn); i++) { 665 for (i = 0; i < tnode_child_length(tn); i++) {
666 struct node *n; 666 struct rt_trie_node *n;
667 667
668 n = tn->child[i]; 668 n = tn->child[i];
669 if (!n) 669 if (!n)
@@ -676,7 +676,7 @@ one_child:
676 return n; 676 return n;
677 } 677 }
678 } 678 }
679 return (struct node *) tn; 679 return (struct rt_trie_node *) tn;
680} 680}
681 681
682static struct tnode *inflate(struct trie *t, struct tnode *tn) 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
723 goto nomem; 723 goto nomem;
724 } 724 }
725 725
726 put_child(t, tn, 2*i, (struct node *) left); 726 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
727 put_child(t, tn, 2*i+1, (struct node *) right); 727 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
728 } 728 }
729 } 729 }
730 730
731 for (i = 0; i < olen; i++) { 731 for (i = 0; i < olen; i++) {
732 struct tnode *inode; 732 struct tnode *inode;
733 struct node *node = tnode_get_child(oldtnode, i); 733 struct rt_trie_node *node = tnode_get_child(oldtnode, i);
734 struct tnode *left, *right; 734 struct tnode *left, *right;
735 int size, j; 735 int size, j;
736 736
@@ -825,7 +825,7 @@ nomem:
825static struct tnode *halve(struct trie *t, struct tnode *tn) 825static struct tnode *halve(struct trie *t, struct tnode *tn)
826{ 826{
827 struct tnode *oldtnode = tn; 827 struct tnode *oldtnode = tn;
828 struct node *left, *right; 828 struct rt_trie_node *left, *right;
829 int i; 829 int i;
830 int olen = tnode_child_length(tn); 830 int olen = tnode_child_length(tn);
831 831
@@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
856 if (!newn) 856 if (!newn)
857 goto nomem; 857 goto nomem;
858 858
859 put_child(t, tn, i/2, (struct node *)newn); 859 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
860 } 860 }
861 861
862 } 862 }
@@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key)
958{ 958{
959 int pos; 959 int pos;
960 struct tnode *tn; 960 struct tnode *tn;
961 struct node *n; 961 struct rt_trie_node *n;
962 962
963 pos = 0; 963 pos = 0;
964 n = rcu_dereference_rtnl(t->trie); 964 n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
993 993
994 key = tn->key; 994 key = tn->key;
995 995
996 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { 996 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
997 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 997 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 998 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
999 tn = (struct tnode *) resize(t, (struct tnode *)tn); 999 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000 1000
1001 tnode_put_child_reorg((struct tnode *)tp, cindex, 1001 tnode_put_child_reorg((struct tnode *)tp, cindex,
1002 (struct node *)tn, wasfull); 1002 (struct rt_trie_node *)tn, wasfull);
1003 1003
1004 tp = node_parent((struct node *) tn); 1004 tp = node_parent((struct rt_trie_node *) tn);
1005 if (!tp) 1005 if (!tp)
1006 rcu_assign_pointer(t->trie, (struct node *)tn); 1006 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1007 1007
1008 tnode_free_flush(); 1008 tnode_free_flush();
1009 if (!tp) 1009 if (!tp)
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1015 if (IS_TNODE(tn)) 1015 if (IS_TNODE(tn))
1016 tn = (struct tnode *)resize(t, (struct tnode *)tn); 1016 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017 1017
1018 rcu_assign_pointer(t->trie, (struct node *)tn); 1018 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1019 tnode_free_flush(); 1019 tnode_free_flush();
1020} 1020}
1021 1021
@@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1025{ 1025{
1026 int pos, newpos; 1026 int pos, newpos;
1027 struct tnode *tp = NULL, *tn = NULL; 1027 struct tnode *tp = NULL, *tn = NULL;
1028 struct node *n; 1028 struct rt_trie_node *n;
1029 struct leaf *l; 1029 struct leaf *l;
1030 int missbit; 1030 int missbit;
1031 struct list_head *fa_head = NULL; 1031 struct list_head *fa_head = NULL;
@@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1111 if (t->trie && n == NULL) { 1111 if (t->trie && n == NULL) {
1112 /* Case 2: n is NULL, and will just insert a new leaf */ 1112 /* Case 2: n is NULL, and will just insert a new leaf */
1113 1113
1114 node_set_parent((struct node *)l, tp); 1114 node_set_parent((struct rt_trie_node *)l, tp);
1115 1115
1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1116 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1117 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1117 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1118 } else { 1118 } else {
1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1119 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1120 /* 1120 /*
@@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1141 return NULL; 1141 return NULL;
1142 } 1142 }
1143 1143
1144 node_set_parent((struct node *)tn, tp); 1144 node_set_parent((struct rt_trie_node *)tn, tp);
1145 1145
1146 missbit = tkey_extract_bits(key, newpos, 1); 1146 missbit = tkey_extract_bits(key, newpos, 1);
1147 put_child(t, tn, missbit, (struct node *)l); 1147 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1148 put_child(t, tn, 1-missbit, n); 1148 put_child(t, tn, 1-missbit, n);
1149 1149
1150 if (tp) { 1150 if (tp) {
1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1151 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1152 put_child(t, (struct tnode *)tp, cindex, 1152 put_child(t, (struct tnode *)tp, cindex,
1153 (struct node *)tn); 1153 (struct rt_trie_node *)tn);
1154 } else { 1154 } else {
1155 rcu_assign_pointer(t->trie, (struct node *)tn); 1155 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1156 tp = tn; 1156 tp = tn;
1157 } 1157 }
1158 } 1158 }
@@ -1340,7 +1340,7 @@ err:
1340} 1340}
1341 1341
1342/* should be called with rcu_read_lock */ 1342/* should be called with rcu_read_lock */
1343static int check_leaf(struct trie *t, struct leaf *l, 1343static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1344 t_key key, const struct flowi *flp, 1344 t_key key, const struct flowi *flp,
1345 struct fib_result *res, int fib_flags) 1345 struct fib_result *res, int fib_flags)
1346{ 1346{
@@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
1356 if (l->key != (key & ntohl(mask))) 1356 if (l->key != (key & ntohl(mask)))
1357 continue; 1357 continue;
1358 1358
1359 err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); 1359 err = fib_semantic_match(tb, &li->falh, flp, res, plen, fib_flags);
1360 1360
1361#ifdef CONFIG_IP_FIB_TRIE_STATS 1361#ifdef CONFIG_IP_FIB_TRIE_STATS
1362 if (err <= 0) 1362 if (err <= 0)
@@ -1376,13 +1376,13 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1376{ 1376{
1377 struct trie *t = (struct trie *) tb->tb_data; 1377 struct trie *t = (struct trie *) tb->tb_data;
1378 int ret; 1378 int ret;
1379 struct node *n; 1379 struct rt_trie_node *n;
1380 struct tnode *pn; 1380 struct tnode *pn;
1381 int pos, bits; 1381 unsigned int pos, bits;
1382 t_key key = ntohl(flp->fl4_dst); 1382 t_key key = ntohl(flp->fl4_dst);
1383 int chopped_off; 1383 unsigned int chopped_off;
1384 t_key cindex = 0; 1384 t_key cindex = 0;
1385 int current_prefix_length = KEYLENGTH; 1385 unsigned int current_prefix_length = KEYLENGTH;
1386 struct tnode *cn; 1386 struct tnode *cn;
1387 t_key pref_mismatch; 1387 t_key pref_mismatch;
1388 1388
@@ -1398,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1398 1398
1399 /* Just a leaf? */ 1399 /* Just a leaf? */
1400 if (IS_LEAF(n)) { 1400 if (IS_LEAF(n)) {
1401 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1401 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1402 goto found; 1402 goto found;
1403 } 1403 }
1404 1404
@@ -1423,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1423 } 1423 }
1424 1424
1425 if (IS_LEAF(n)) { 1425 if (IS_LEAF(n)) {
1426 ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); 1426 ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1427 if (ret > 0) 1427 if (ret > 0)
1428 goto backtrace; 1428 goto backtrace;
1429 goto found; 1429 goto found;
@@ -1541,7 +1541,7 @@ backtrace:
1541 if (chopped_off <= pn->bits) { 1541 if (chopped_off <= pn->bits) {
1542 cindex &= ~(1 << (chopped_off-1)); 1542 cindex &= ~(1 << (chopped_off-1));
1543 } else { 1543 } else {
1544 struct tnode *parent = node_parent_rcu((struct node *) pn); 1544 struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1545 if (!parent) 1545 if (!parent)
1546 goto failed; 1546 goto failed;
1547 1547
@@ -1568,7 +1568,7 @@ found:
1568 */ 1568 */
1569static void trie_leaf_remove(struct trie *t, struct leaf *l) 1569static void trie_leaf_remove(struct trie *t, struct leaf *l)
1570{ 1570{
1571 struct tnode *tp = node_parent((struct node *) l); 1571 struct tnode *tp = node_parent((struct rt_trie_node *) l);
1572 1572
1573 pr_debug("entering trie_leaf_remove(%p)\n", l); 1573 pr_debug("entering trie_leaf_remove(%p)\n", l);
1574 1574
@@ -1706,7 +1706,7 @@ static int trie_flush_leaf(struct leaf *l)
1706 * Scan for the next right leaf starting at node p->child[idx] 1706 * Scan for the next right leaf starting at node p->child[idx]
1707 * Since we have back pointer, no recursion necessary. 1707 * Since we have back pointer, no recursion necessary.
1708 */ 1708 */
1709static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) 1709static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1710{ 1710{
1711 do { 1711 do {
1712 t_key idx; 1712 t_key idx;
@@ -1732,7 +1732,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1732 } 1732 }
1733 1733
1734 /* Node empty, walk back up to parent */ 1734 /* Node empty, walk back up to parent */
1735 c = (struct node *) p; 1735 c = (struct rt_trie_node *) p;
1736 } while ((p = node_parent_rcu(c)) != NULL); 1736 } while ((p = node_parent_rcu(c)) != NULL);
1737 1737
1738 return NULL; /* Root of trie */ 1738 return NULL; /* Root of trie */
@@ -1753,7 +1753,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
1753 1753
1754static struct leaf *trie_nextleaf(struct leaf *l) 1754static struct leaf *trie_nextleaf(struct leaf *l)
1755{ 1755{
1756 struct node *c = (struct node *) l; 1756 struct rt_trie_node *c = (struct rt_trie_node *) l;
1757 struct tnode *p = node_parent_rcu(c); 1757 struct tnode *p = node_parent_rcu(c);
1758 1758
1759 if (!p) 1759 if (!p)
@@ -1802,80 +1802,6 @@ void fib_free_table(struct fib_table *tb)
1802 kfree(tb); 1802 kfree(tb);
1803} 1803}
1804 1804
1805void fib_table_select_default(struct fib_table *tb,
1806 const struct flowi *flp,
1807 struct fib_result *res)
1808{
1809 struct trie *t = (struct trie *) tb->tb_data;
1810 int order, last_idx;
1811 struct fib_info *fi = NULL;
1812 struct fib_info *last_resort;
1813 struct fib_alias *fa = NULL;
1814 struct list_head *fa_head;
1815 struct leaf *l;
1816
1817 last_idx = -1;
1818 last_resort = NULL;
1819 order = -1;
1820
1821 rcu_read_lock();
1822
1823 l = fib_find_node(t, 0);
1824 if (!l)
1825 goto out;
1826
1827 fa_head = get_fa_head(l, 0);
1828 if (!fa_head)
1829 goto out;
1830
1831 if (list_empty(fa_head))
1832 goto out;
1833
1834 list_for_each_entry_rcu(fa, fa_head, fa_list) {
1835 struct fib_info *next_fi = fa->fa_info;
1836
1837 if (fa->fa_scope != res->scope ||
1838 fa->fa_type != RTN_UNICAST)
1839 continue;
1840
1841 if (next_fi->fib_priority > res->fi->fib_priority)
1842 break;
1843 if (!next_fi->fib_nh[0].nh_gw ||
1844 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1845 continue;
1846
1847 fib_alias_accessed(fa);
1848
1849 if (fi == NULL) {
1850 if (next_fi != res->fi)
1851 break;
1852 } else if (!fib_detect_death(fi, order, &last_resort,
1853 &last_idx, tb->tb_default)) {
1854 fib_result_assign(res, fi);
1855 tb->tb_default = order;
1856 goto out;
1857 }
1858 fi = next_fi;
1859 order++;
1860 }
1861 if (order <= 0 || fi == NULL) {
1862 tb->tb_default = -1;
1863 goto out;
1864 }
1865
1866 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1867 tb->tb_default)) {
1868 fib_result_assign(res, fi);
1869 tb->tb_default = order;
1870 goto out;
1871 }
1872 if (last_idx >= 0)
1873 fib_result_assign(res, last_resort);
1874 tb->tb_default = last_idx;
1875out:
1876 rcu_read_unlock();
1877}
1878
1879static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, 1805static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1880 struct fib_table *tb, 1806 struct fib_table *tb,
1881 struct sk_buff *skb, struct netlink_callback *cb) 1807 struct sk_buff *skb, struct netlink_callback *cb)
@@ -1990,7 +1916,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1990 return skb->len; 1916 return skb->len;
1991} 1917}
1992 1918
1993void __init fib_hash_init(void) 1919void __init fib_trie_init(void)
1994{ 1920{
1995 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1921 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1996 sizeof(struct fib_alias), 1922 sizeof(struct fib_alias),
@@ -2003,8 +1929,7 @@ void __init fib_hash_init(void)
2003} 1929}
2004 1930
2005 1931
2006/* Fix more generic FIB names for init later */ 1932struct fib_table *fib_trie_table(u32 id)
2007struct fib_table *fib_hash_table(u32 id)
2008{ 1933{
2009 struct fib_table *tb; 1934 struct fib_table *tb;
2010 struct trie *t; 1935 struct trie *t;
@@ -2036,7 +1961,7 @@ struct fib_trie_iter {
2036 unsigned int depth; 1961 unsigned int depth;
2037}; 1962};
2038 1963
2039static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 1964static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2040{ 1965{
2041 struct tnode *tn = iter->tnode; 1966 struct tnode *tn = iter->tnode;
2042 unsigned int cindex = iter->index; 1967 unsigned int cindex = iter->index;
@@ -2050,7 +1975,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2050 iter->tnode, iter->index, iter->depth); 1975 iter->tnode, iter->index, iter->depth);
2051rescan: 1976rescan:
2052 while (cindex < (1<<tn->bits)) { 1977 while (cindex < (1<<tn->bits)) {
2053 struct node *n = tnode_get_child_rcu(tn, cindex); 1978 struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2054 1979
2055 if (n) { 1980 if (n) {
2056 if (IS_LEAF(n)) { 1981 if (IS_LEAF(n)) {
@@ -2069,7 +1994,7 @@ rescan:
2069 } 1994 }
2070 1995
2071 /* Current node exhausted, pop back up */ 1996 /* Current node exhausted, pop back up */
2072 p = node_parent_rcu((struct node *)tn); 1997 p = node_parent_rcu((struct rt_trie_node *)tn);
2073 if (p) { 1998 if (p) {
2074 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 1999 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2075 tn = p; 2000 tn = p;
@@ -2081,10 +2006,10 @@ rescan:
2081 return NULL; 2006 return NULL;
2082} 2007}
2083 2008
2084static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2009static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2085 struct trie *t) 2010 struct trie *t)
2086{ 2011{
2087 struct node *n; 2012 struct rt_trie_node *n;
2088 2013
2089 if (!t) 2014 if (!t)
2090 return NULL; 2015 return NULL;
@@ -2108,7 +2033,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2108 2033
2109static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2034static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2110{ 2035{
2111 struct node *n; 2036 struct rt_trie_node *n;
2112 struct fib_trie_iter iter; 2037 struct fib_trie_iter iter;
2113 2038
2114 memset(s, 0, sizeof(*s)); 2039 memset(s, 0, sizeof(*s));
@@ -2181,7 +2106,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2181 seq_putc(seq, '\n'); 2106 seq_putc(seq, '\n');
2182 seq_printf(seq, "\tPointers: %u\n", pointers); 2107 seq_printf(seq, "\tPointers: %u\n", pointers);
2183 2108
2184 bytes += sizeof(struct node *) * pointers; 2109 bytes += sizeof(struct rt_trie_node *) * pointers;
2185 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); 2110 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2186 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); 2111 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
2187} 2112}
@@ -2262,7 +2187,7 @@ static const struct file_operations fib_triestat_fops = {
2262 .release = single_release_net, 2187 .release = single_release_net,
2263}; 2188};
2264 2189
2265static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) 2190static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2266{ 2191{
2267 struct fib_trie_iter *iter = seq->private; 2192 struct fib_trie_iter *iter = seq->private;
2268 struct net *net = seq_file_net(seq); 2193 struct net *net = seq_file_net(seq);
@@ -2275,7 +2200,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2275 struct fib_table *tb; 2200 struct fib_table *tb;
2276 2201
2277 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { 2202 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2278 struct node *n; 2203 struct rt_trie_node *n;
2279 2204
2280 for (n = fib_trie_get_first(iter, 2205 for (n = fib_trie_get_first(iter,
2281 (struct trie *) tb->tb_data); 2206 (struct trie *) tb->tb_data);
@@ -2304,7 +2229,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2304 struct fib_table *tb = iter->tb; 2229 struct fib_table *tb = iter->tb;
2305 struct hlist_node *tb_node; 2230 struct hlist_node *tb_node;
2306 unsigned int h; 2231 unsigned int h;
2307 struct node *n; 2232 struct rt_trie_node *n;
2308 2233
2309 ++*pos; 2234 ++*pos;
2310 /* next node in same table */ 2235 /* next node in same table */
@@ -2390,7 +2315,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2390static int fib_trie_seq_show(struct seq_file *seq, void *v) 2315static int fib_trie_seq_show(struct seq_file *seq, void *v)
2391{ 2316{
2392 const struct fib_trie_iter *iter = seq->private; 2317 const struct fib_trie_iter *iter = seq->private;
2393 struct node *n = v; 2318 struct rt_trie_node *n = v;
2394 2319
2395 if (!node_parent_rcu(n)) 2320 if (!node_parent_rcu(n))
2396 fib_table_print(seq, iter->tb); 2321 fib_table_print(seq, iter->tb);
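
[Annotation] Throughout this file the renamed rt_trie_node keys are 32-bit values, and tkey_extract_bits() — now taking unsigned offsets — returns the bits-wide slice starting at offset via (a << offset) >> (KEYLENGTH - bits); that slice is what indexes a tnode's child array. A standalone demo under those assumptions:

#include <stdint.h>
#include <stdio.h>

#define KEYLENGTH 32
typedef uint32_t t_key;

static t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
	/* callers pass bits >= 1; a 0-bit extract would shift by 32 */
	if (offset < KEYLENGTH)
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
	return 0;
}

int main(void)
{
	t_key key = 0xc0a80101;	/* 192.168.1.1, host byte order */

	printf("%u\n", (unsigned)tkey_extract_bits(key, 0, 8));	/* 192 */
	printf("%u\n", (unsigned)tkey_extract_bits(key, 8, 8));	/* 168 */
	printf("%u\n", (unsigned)tkey_extract_bits(key, 24, 8));	/* 1 */
	return 0;
}
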
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea0..ad2bcf1b69ae 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
233 * Send an ICMP frame. 233 * Send an ICMP frame.
234 */ 234 */
235 235
236/* 236static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
237 * Check transmit rate limitation for given message.
238 * The rate information is held in the destination cache now.
239 * This function is generic and could be used for other purposes
240 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
241 *
242 * Note that the same dst_entry fields are modified by functions in
243 * route.c too, but these work for packet destinations while xrlim_allow
244 * works for icmp destinations. This means the rate limiting information
245 * for one "ip object" is shared - and these ICMPs are twice limited:
246 * by source and by destination.
247 *
248 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
249 * SHOULD allow setting of rate limits
250 *
251 * Shared between ICMPv4 and ICMPv6.
252 */
253#define XRLIM_BURST_FACTOR 6
254int xrlim_allow(struct dst_entry *dst, int timeout)
255{
256 unsigned long now, token = dst->rate_tokens;
257 int rc = 0;
258
259 now = jiffies;
260 token += now - dst->rate_last;
261 dst->rate_last = now;
262 if (token > XRLIM_BURST_FACTOR * timeout)
263 token = XRLIM_BURST_FACTOR * timeout;
264 if (token >= timeout) {
265 token -= timeout;
266 rc = 1;
267 }
268 dst->rate_tokens = token;
269 return rc;
270}
271EXPORT_SYMBOL(xrlim_allow);
272
273static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
274 int type, int code) 237 int type, int code)
275{ 238{
276 struct dst_entry *dst = &rt->dst; 239 struct dst_entry *dst = &rt->dst;
277 int rc = 1; 240 bool rc = true;
278 241
279 if (type > NR_ICMP_TYPES) 242 if (type > NR_ICMP_TYPES)
280 goto out; 243 goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
288 goto out; 251 goto out;
289 252
290 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
291 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
292 rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); 255 if (!rt->peer)
256 rt_bind_peer(rt, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
258 net->ipv4.sysctl_icmp_ratelimit);
259 }
293out: 260out:
294 return rc; 261 return rc;
295} 262}
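
[Annotation] icmpv4_xrlim_allow() above only charges the token bucket when (1 << type) & net->ipv4.sysctl_icmp_ratemask is set, i.e. the sysctl is a per-ICMP-type bitmask. A quick illustration; the 0x1818 value below (dest-unreach, source-quench, time-exceeded, parameter-problem) is an assumed default for the demo, not taken from this diff:

#include <stdio.h>

#define ICMP_ECHOREPLY		0
#define ICMP_DEST_UNREACH	3
#define ICMP_TIME_EXCEEDED	11

int main(void)
{
	unsigned long ratemask = 0x1818;	/* assumed default, see above */
	int types[] = { ICMP_ECHOREPLY, ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED };
	int i;

	for (i = 0; i < 3; i++)
		printf("type %2d rate-limited: %s\n", types[i],
		       ((1UL << types[i]) & ratemask) ? "yes" : "no");
	return 0;
}
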
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index a96e65674ac3..48f8d4592ccd 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
167 int i, n = (a->family == AF_INET ? 1 : 4); 167 int i, n = (a->family == AF_INET ? 1 : 4);
168 168
169 for (i = 0; i < n; i++) { 169 for (i = 0; i < n; i++) {
170 if (a->a6[i] == b->a6[i]) 170 if (a->addr.a6[i] == b->addr.a6[i])
171 continue; 171 continue;
172 if (a->a6[i] < b->a6[i]) 172 if (a->addr.a6[i] < b->addr.a6[i])
173 return -1; 173 return -1;
174 return 1; 174 return 1;
175 } 175 }
@@ -510,8 +510,13 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
510 p->daddr = *daddr; 510 p->daddr = *daddr;
511 atomic_set(&p->refcnt, 1); 511 atomic_set(&p->refcnt, 1);
512 atomic_set(&p->rid, 0); 512 atomic_set(&p->rid, 0);
513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); 513 atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
514 p->tcp_ts_stamp = 0; 514 p->tcp_ts_stamp = 0;
515 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
516 p->rate_tokens = 0;
517 p->rate_last = 0;
518 p->pmtu_expires = 0;
519 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
515 INIT_LIST_HEAD(&p->unused); 520 INIT_LIST_HEAD(&p->unused);
516 521
517 522
@@ -579,3 +584,44 @@ void inet_putpeer(struct inet_peer *p)
579 local_bh_enable(); 584 local_bh_enable();
580} 585}
581EXPORT_SYMBOL_GPL(inet_putpeer); 586EXPORT_SYMBOL_GPL(inet_putpeer);
587
588/*
589 * Check transmit rate limitation for given message.
590 * The rate information is held in the inet_peer entries now.
591 * This function is generic and could be used for other purposes
592 * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
593 *
594 * Note that the same inet_peer fields are modified by functions in
595 * route.c too, but these work for packet destinations while xrlim_allow
596 * works for icmp destinations. This means the rate limiting information
597 * for one "ip object" is shared - and these ICMPs are twice limited:
598 * by source and by destination.
599 *
600 * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
601 * SHOULD allow setting of rate limits
602 *
603 * Shared between ICMPv4 and ICMPv6.
604 */
605#define XRLIM_BURST_FACTOR 6
606bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
607{
608 unsigned long now, token;
609 bool rc = false;
610
611 if (!peer)
612 return true;
613
614 token = peer->rate_tokens;
615 now = jiffies;
616 token += now - peer->rate_last;
617 peer->rate_last = now;
618 if (token > XRLIM_BURST_FACTOR * timeout)
619 token = XRLIM_BURST_FACTOR * timeout;
620 if (token >= timeout) {
621 token -= timeout;
622 rc = true;
623 }
624 peer->rate_tokens = token;
625 return rc;
626}
627EXPORT_SYMBOL(inet_peer_xrlim_allow);
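
[Annotation] The comment block above specifies the algorithm exactly: tokens accrue one per jiffy since rate_last, the balance is capped at XRLIM_BURST_FACTOR * timeout, and each permitted message spends timeout tokens. A compilable userspace sketch of that arithmetic, with the jiffies clock replaced by a caller-supplied now:

#include <stdbool.h>

#define XRLIM_BURST_FACTOR 6

struct peer_sketch {
	unsigned long rate_tokens;
	unsigned long rate_last;
};

static bool xrlim_allow_sketch(struct peer_sketch *p, unsigned long now,
			       unsigned long timeout)
{
	unsigned long token = p->rate_tokens + (now - p->rate_last);
	bool rc = false;

	p->rate_last = now;
	if (token > XRLIM_BURST_FACTOR * timeout)	/* cap the burst */
		token = XRLIM_BURST_FACTOR * timeout;
	if (token >= timeout) {		/* enough credit for one message */
		token -= timeout;
		rc = true;
	}
	p->rate_tokens = token;
	return rc;
}

One message therefore costs timeout time units of credit, and at most six messages can burst after a long idle period.
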
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..d7b2b0987a3b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 } 340 }
341 } 341 }
342 342
343#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_IP_ROUTE_CLASSID
344 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
346 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5f..f926a310075d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT
206 206
207config NF_NAT_SNMP_BASIC 207config NF_NAT_SNMP_BASIC
208 tristate "Basic SNMP-ALG support" 208 tristate "Basic SNMP-ALG support"
209 depends on NF_NAT 209 depends on NF_CONNTRACK_SNMP && NF_NAT
210 depends on NETFILTER_ADVANCED 210 depends on NETFILTER_ADVANCED
211 default NF_NAT && NF_CONNTRACK_SNMP
211 ---help--- 212 ---help---
212 213
213 This module implements an Application Layer Gateway (ALG) for 214 This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed95..e95054c690c6 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
867 newinfo->initial_entries = 0; 867 newinfo->initial_entries = 0;
868 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 868 loc_cpu_entry = info->entries[raw_smp_processor_id()];
869 xt_compat_init_offsets(NFPROTO_ARP, info->number);
869 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 870 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
870 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 871 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
871 if (ret != 0) 872 if (ret != 0)
@@ -1333,6 +1334,7 @@ static int translate_compat_table(const char *name,
1333 duprintf("translate_compat_table: size %u\n", info->size); 1334 duprintf("translate_compat_table: size %u\n", info->size);
1334 j = 0; 1335 j = 0;
1335 xt_compat_lock(NFPROTO_ARP); 1336 xt_compat_lock(NFPROTO_ARP);
1337 xt_compat_init_offsets(NFPROTO_ARP, number);
1336 /* Walk through entries, checking offsets. */ 1338 /* Walk through entries, checking offsets. */
1337 xt_entry_foreach(iter0, entry0, total_size) { 1339 xt_entry_foreach(iter0, entry0, total_size) {
1338 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1340 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013dc..ef7d7b9680ea 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1064 newinfo->initial_entries = 0; 1064 newinfo->initial_entries = 0;
1065 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1065 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1066 xt_compat_init_offsets(AF_INET, info->number);
1066 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1067 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1067 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1068 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1068 if (ret != 0) 1069 if (ret != 0)
@@ -1664,6 +1665,7 @@ translate_compat_table(struct net *net,
1664 duprintf("translate_compat_table: size %u\n", info->size); 1665 duprintf("translate_compat_table: size %u\n", info->size);
1665 j = 0; 1666 j = 0;
1666 xt_compat_lock(AF_INET); 1667 xt_compat_lock(AF_INET);
1668 xt_compat_init_offsets(AF_INET, number);
1667 /* Walk through entries, checking offsets. */ 1669 /* Walk through entries, checking offsets. */
1668 xt_entry_foreach(iter0, entry0, total_size) { 1670 xt_entry_foreach(iter0, entry0, total_size) {
1669 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1671 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a4897655..403ca57f6011 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
300 * that the ->target() function isn't called after ->destroy() */ 300 * that the ->target() function isn't called after ->destroy() */
301 301
302 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
303 if (ct == NULL) { 303 if (ct == NULL)
304 pr_info("no conntrack!\n");
305 /* FIXME: need to drop invalid ones, since replies
306 * to outgoing connections of other nodes will be
307 * marked as INVALID */
308 return NF_DROP; 304 return NF_DROP;
309 }
310 305
311 /* special case: ICMP error handling. conntrack distinguishes between 306 /* special case: ICMP error handling. conntrack distinguishes between
312 * error messages (RELATED) and information requests (see below) */ 307 * error messages (RELATED) and information requests (see below) */
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e9..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
442 } 442 }
443#endif 443#endif
444 444
445 /* MAC logging for input path only. */ 445 if (in != NULL)
446 if (in && !out)
447 dump_mac_header(m, loginfo, skb); 446 dump_mac_header(m, loginfo, skb);
448 447
449 dump_packet(m, loginfo, skb, 0); 448 dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
61 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
62 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN) {
64 iph = ip_hdr(skb); 64 iph = ip_hdr(skb);
65 65
66 if (iph->saddr != saddr || 66 if (iph->saddr != saddr ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26a..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
20#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
21#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
22#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h>
23 24
24struct ct_iter_state { 25struct ct_iter_state {
25 struct seq_net_private p; 26 struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
35 for (st->bucket = 0; 36 for (st->bucket = 0;
36 st->bucket < net->ct.htable_size; 37 st->bucket < net->ct.htable_size;
37 st->bucket++) { 38 st->bucket++) {
38 n = rcu_dereference(net->ct.hash[st->bucket].first); 39 n = rcu_dereference(
40 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
39 if (!is_a_nulls(n)) 41 if (!is_a_nulls(n))
40 return n; 42 return n;
41 } 43 }
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
48 struct net *net = seq_file_net(seq); 50 struct net *net = seq_file_net(seq);
49 struct ct_iter_state *st = seq->private; 51 struct ct_iter_state *st = seq->private;
50 52
51 head = rcu_dereference(head->next); 53 head = rcu_dereference(hlist_nulls_next_rcu(head));
52 while (is_a_nulls(head)) { 54 while (is_a_nulls(head)) {
53 if (likely(get_nulls_value(head) == st->bucket)) { 55 if (likely(get_nulls_value(head) == st->bucket)) {
54 if (++st->bucket >= net->ct.htable_size) 56 if (++st->bucket >= net->ct.htable_size)
55 return NULL; 57 return NULL;
56 } 58 }
57 head = rcu_dereference(net->ct.hash[st->bucket].first); 59 head = rcu_dereference(
60 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
58 } 61 }
59 return head; 62 return head;
60} 63}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
217 struct hlist_node *n; 220 struct hlist_node *n;
218 221
219 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 222 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
220 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 223 n = rcu_dereference(
224 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
221 if (n) 225 if (n)
222 return n; 226 return n;
223 } 227 }
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
230 struct net *net = seq_file_net(seq); 234 struct net *net = seq_file_net(seq);
231 struct ct_expect_iter_state *st = seq->private; 235 struct ct_expect_iter_state *st = seq->private;
232 236
233 head = rcu_dereference(head->next); 237 head = rcu_dereference(hlist_next_rcu(head));
234 while (head == NULL) { 238 while (head == NULL) {
235 if (++st->bucket >= nf_ct_expect_hsize) 239 if (++st->bucket >= nf_ct_expect_hsize)
236 return NULL; 240 return NULL;
237 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 241 head = rcu_dereference(
242 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
238 } 243 }
239 return head; 244 return head;
240} 245}
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df0..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret; 47 int res;
48 48
49 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
50 ret = nf_ct_expect_related(exp); 50 res = nf_ct_expect_related(exp);
51 if (ret == 0) 51 if (res == 0)
52 break; 52 break;
53 else if (ret != -EBUSY) { 53 else if (res != -EBUSY) {
54 port = 0; 54 port = 0;
55 break; 55 break;
56 } 56 }
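
[Annotation] The loop above scans ports upward from the expectation's original value, treating -EBUSY from nf_ct_expect_related() as "taken, try the next" and any other error as fatal; the unsigned wrap to port 0 signals exhaustion. The same retry shape as a sketch, with try_reserve() a toy stand-in for nf_ct_expect_related():

#include <errno.h>

/* toy stand-in: 0 = reserved, -EBUSY = taken, else hard failure */
static int try_reserve(unsigned short port)
{
	return (port & 1) ? 0 : -EBUSY;
}

static unsigned short pick_port(unsigned short wanted)
{
	unsigned short port;

	for (port = wanted; port != 0; port++) {
		int res = try_reserve(port);

		if (res == 0)
			break;			/* got one */
		if (res != -EBUSY) {
			port = 0;		/* hard error: give up */
			break;
		}
		/* -EBUSY: keep scanning; wrap to 0 means exhausted */
	}
	return port;		/* 0 on failure, as in the helper above */
}
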
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a71..21bcf471b25a 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
221 manips not an issue. */ 221 manips not an issue. */
222 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
224 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { 224 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) {
227 *tuple = *orig_tuple;
228 return;
229 }
230 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
231 range)) {
225 pr_debug("get_unique_tuple: Found current src map\n"); 232 pr_debug("get_unique_tuple: Found current src map\n");
226 if (!nf_nat_used_tuple(tuple, ct)) 233 if (!nf_nat_used_tuple(tuple, ct))
227 return; 234 return;
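
[Annotation] The new branch above gives get_unique_tuple() a fast path: if the original tuple already falls inside the requested range and is unused, NAT keeps it verbatim before consulting the by-source hash for an existing mapping. A sketch of that decision order; in_range/tuple_used/find_existing_mapping are toy stand-ins for the kernel predicates:

#include <stdbool.h>

struct tuple_sketch {
	unsigned int addr;
	unsigned short port;
};

/* toy stand-ins for in_range()/nf_nat_used_tuple()/find_appropriate_src() */
static bool in_range(const struct tuple_sketch *t)   { return t->port >= 1024; }
static bool tuple_used(const struct tuple_sketch *t) { return t->port == 8080; }
static bool find_existing_mapping(const struct tuple_sketch *orig,
				  struct tuple_sketch *out)
{
	(void)orig; (void)out;
	return false;
}

static bool pick_src_tuple(const struct tuple_sketch *orig,
			   struct tuple_sketch *out)
{
	/* 1. cheapest: keep the original tuple when legal and free */
	if (in_range(orig) && !tuple_used(orig)) {
		*out = *orig;
		return true;
	}
	/* 2. reuse an existing mapping for this source, if still free */
	if (find_existing_mapping(orig, out) && !tuple_used(out))
		return true;
	/* 3. otherwise fall through to the full search (elided) */
	return false;
}
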
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
266 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
267 struct nf_conntrack_tuple curr_tuple, new_tuple; 274 struct nf_conntrack_tuple curr_tuple, new_tuple;
268 struct nf_conn_nat *nat; 275 struct nf_conn_nat *nat;
269 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
270 276
271 /* nat helper or nfctnetlink also setup binding */ 277 /* nat helper or nfctnetlink also setup binding */
272 nat = nfct_nat(ct); 278 nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
306 ct->status |= IPS_DST_NAT; 312 ct->status |= IPS_DST_NAT;
307 } 313 }
308 314
309 /* Place in source hash if this is the first time. */ 315 if (maniptype == IP_NAT_MANIP_SRC) {
310 if (have_to_hash) {
311 unsigned int srchash; 316 unsigned int srchash;
312 317
313 srchash = hash_by_src(net, nf_ct_zone(ct), 318 srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
323 328
324 /* It's done. */ 329 /* It's done. */
325 if (maniptype == IP_NAT_MANIP_DST) 330 if (maniptype == IP_NAT_MANIP_DST)
326 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); 331 ct->status |= IPS_DST_NAT_DONE;
327 else 332 else
328 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 333 ct->status |= IPS_SRC_NAT_DONE;
329 334
330 return NF_ACCEPT; 335 return NF_ACCEPT;
331} 336}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
502 int ret = 0; 507 int ret = 0;
503 508
504 spin_lock_bh(&nf_nat_lock); 509 spin_lock_bh(&nf_nat_lock);
505 if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { 510 if (rcu_dereference_protected(
511 nf_nat_protos[proto->protonum],
512 lockdep_is_held(&nf_nat_lock)
513 ) != &nf_nat_unknown_protocol) {
506 ret = -EBUSY; 514 ret = -EBUSY;
507 goto out; 515 goto out;
508 } 516 }
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
532 if (nat == NULL || nat->ct == NULL) 540 if (nat == NULL || nat->ct == NULL)
533 return; 541 return;
534 542
535 NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); 543 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
536 544
537 spin_lock_bh(&nf_nat_lock); 545 spin_lock_bh(&nf_nat_lock);
538 hlist_del_rcu(&nat->bysource); 546 hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
545 struct nf_conn_nat *old_nat = old; 553 struct nf_conn_nat *old_nat = old;
546 struct nf_conn *ct = old_nat->ct; 554 struct nf_conn *ct = old_nat->ct;
547 555
548 if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) 556 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
549 return; 557 return;
550 558
551 spin_lock_bh(&nf_nat_lock); 559 spin_lock_bh(&nf_nat_lock);
552 new_nat->ct = ct;
553 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); 560 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
554 spin_unlock_bh(&nf_nat_lock); 561 spin_unlock_bh(&nf_nat_lock);
555} 562}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
679{ 686{
680 /* Leave them the same for the moment. */ 687 /* Leave them the same for the moment. */
681 net->ipv4.nat_htable_size = net->ct.htable_size; 688 net->ipv4.nat_htable_size = net->ct.htable_size;
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 689 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
683 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 690 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 691 return -ENOMEM;
686 return 0; 692 return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
702{ 708{
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 709 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 710 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 711 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
706 net->ipv4.nat_htable_size);
707} 712}
708 713
709static struct pernet_operations nf_nat_net_ops = { 714static struct pernet_operations nf_nat_net_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
54#include <net/netfilter/nf_conntrack_expect.h> 54#include <net/netfilter/nf_conntrack_expect.h>
55#include <net/netfilter/nf_conntrack_helper.h> 55#include <net/netfilter/nf_conntrack_helper.h>
56#include <net/netfilter/nf_nat_helper.h> 56#include <net/netfilter/nf_nat_helper.h>
57#include <linux/netfilter/nf_conntrack_snmp.h>
57 58
58MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
59MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
1310{ 1311{
1311 int ret = 0; 1312 int ret = 0;
1312 1313
1313 ret = nf_conntrack_helper_register(&snmp_helper); 1314 BUG_ON(nf_nat_snmp_hook != NULL);
1314 if (ret < 0) 1315 rcu_assign_pointer(nf_nat_snmp_hook, help);
1315 return ret; 1316
1316 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1317 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) { 1318 if (ret < 0) {
1318 nf_conntrack_helper_unregister(&snmp_helper); 1319 nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
1323 1324
1324static void __exit nf_nat_snmp_basic_fini(void) 1325static void __exit nf_nat_snmp_basic_fini(void)
1325{ 1326{
1326 nf_conntrack_helper_unregister(&snmp_helper); 1327 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1327 nf_conntrack_helper_unregister(&snmp_trap_helper); 1328 nf_conntrack_helper_unregister(&snmp_trap_helper);
1328} 1329}
1329 1330
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ed6603c2f6d..52b077d45208 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 131static int ip_rt_min_advmss __read_mostly = 256;
132static int rt_chain_length_max __read_mostly = 20; 132static int rt_chain_length_max __read_mostly = 20;
133 133
134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
136
137/* 134/*
138 * Interface to generic destination cache. 135 * Interface to generic destination cache.
139 */ 136 */
@@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152{ 149{
153} 150}
154 151
152static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153{
154 struct rtable *rt = (struct rtable *) dst;
155 struct inet_peer *peer;
156 u32 *p = NULL;
157
158 if (!rt->peer)
159 rt_bind_peer(rt, 1);
160
161 peer = rt->peer;
162 if (peer) {
163 u32 *old_p = __DST_METRICS_PTR(old);
164 unsigned long prev, new;
165
166 p = peer->metrics;
167 if (inet_metrics_new(peer))
168 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
169
170 new = (unsigned long) p;
171 prev = cmpxchg(&dst->_metrics, old, new);
172
173 if (prev != old) {
174 p = __DST_METRICS_PTR(prev);
175 if (prev & DST_METRICS_READ_ONLY)
176 p = NULL;
177 } else {
178 if (rt->fi) {
179 fib_info_put(rt->fi);
180 rt->fi = NULL;
181 }
182 }
183 }
184 return p;
185}
186
155static struct dst_ops ipv4_dst_ops = { 187static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET, 188 .family = AF_INET,
157 .protocol = cpu_to_be16(ETH_P_IP), 189 .protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = {
159 .check = ipv4_dst_check, 191 .check = ipv4_dst_check,
160 .default_advmss = ipv4_default_advmss, 192 .default_advmss = ipv4_default_advmss,
161 .default_mtu = ipv4_default_mtu, 193 .default_mtu = ipv4_default_mtu,
194 .cow_metrics = ipv4_cow_metrics,
162 .destroy = ipv4_dst_destroy, 195 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown, 196 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice, 197 .negative_advice = ipv4_negative_advice,
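
ipv4_cow_metrics() above settles racing writers with a single cmpxchg on dst->_metrics: the winner publishes the peer's writable array and may drop the rt->fi pin, while losers adopt whatever the winner installed. The resolution step in isolation, with simplified types; a sketch, not the actual helper (the real code masks the flag bits through __DST_METRICS_PTR()):

static u32 *cow_resolve(unsigned long *slot, unsigned long old, u32 *writable)
{
        unsigned long prev = cmpxchg(slot, old, (unsigned long) writable);

        if (prev == old)
                return writable;                /* this CPU won the race */
        if (prev & DST_METRICS_READ_ONLY)
                return NULL;                    /* nothing writable yet */
        return (u32 *) prev;                    /* adopt the winner's array */
}
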
@@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
514 .release = seq_release, 547 .release = seq_release,
515}; 548};
516 549
517#ifdef CONFIG_NET_CLS_ROUTE 550#ifdef CONFIG_IP_ROUTE_CLASSID
518static int rt_acct_proc_show(struct seq_file *m, void *v) 551static int rt_acct_proc_show(struct seq_file *m, void *v)
519{ 552{
520 struct ip_rt_acct *dst, *src; 553 struct ip_rt_acct *dst, *src;
@@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
567 if (!pde) 600 if (!pde)
568 goto err2; 601 goto err2;
569 602
570#ifdef CONFIG_NET_CLS_ROUTE 603#ifdef CONFIG_IP_ROUTE_CLASSID
571 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 604 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
572 if (!pde) 605 if (!pde)
573 goto err3; 606 goto err3;
574#endif 607#endif
575 return 0; 608 return 0;
576 609
577#ifdef CONFIG_NET_CLS_ROUTE 610#ifdef CONFIG_IP_ROUTE_CLASSID
578err3: 611err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat); 612 remove_proc_entry("rt_cache", net->proc_net_stat);
580#endif 613#endif
@@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{ 621{
589 remove_proc_entry("rt_cache", net->proc_net_stat); 622 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net); 623 remove_proc_entry("rt_cache", net->proc_net);
591#ifdef CONFIG_NET_CLS_ROUTE 624#ifdef CONFIG_IP_ROUTE_CLASSID
592 remove_proc_entry("rt_acct", net->proc_net); 625 remove_proc_entry("rt_acct", net->proc_net);
593#endif 626#endif
594} 627}
@@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
632static inline int rt_valuable(struct rtable *rth) 665static inline int rt_valuable(struct rtable *rth)
633{ 666{
634 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 667 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
635 rth->dst.expires; 668 (rth->peer && rth->peer->pmtu_expires);
636} 669}
637 670
638static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 671static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
643 if (atomic_read(&rth->dst.__refcnt)) 676 if (atomic_read(&rth->dst.__refcnt))
644 goto out; 677 goto out;
645 678
646 ret = 1;
647 if (rth->dst.expires &&
648 time_after_eq(jiffies, rth->dst.expires))
649 goto out;
650
651 age = jiffies - rth->dst.lastuse; 679 age = jiffies - rth->dst.lastuse;
652 ret = 0;
653 if ((age <= tmo1 && !rt_fast_clean(rth)) || 680 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
654 (age <= tmo2 && rt_valuable(rth))) 681 (age <= tmo2 && rt_valuable(rth)))
655 goto out; 682 goto out;
@@ -793,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
793 return ONE; 820 return ONE;
794} 821}
795 822
796static void rt_check_expire(void)
797{
798 static unsigned int rover;
799 unsigned int i = rover, goal;
800 struct rtable *rth;
801 struct rtable __rcu **rthp;
802 unsigned long samples = 0;
803 unsigned long sum = 0, sum2 = 0;
804 unsigned long delta;
805 u64 mult;
806
807 delta = jiffies - expires_ljiffies;
808 expires_ljiffies = jiffies;
809 mult = ((u64)delta) << rt_hash_log;
810 if (ip_rt_gc_timeout > 1)
811 do_div(mult, ip_rt_gc_timeout);
812 goal = (unsigned int)mult;
813 if (goal > rt_hash_mask)
814 goal = rt_hash_mask + 1;
815 for (; goal > 0; goal--) {
816 unsigned long tmo = ip_rt_gc_timeout;
817 unsigned long length;
818
819 i = (i + 1) & rt_hash_mask;
820 rthp = &rt_hash_table[i].chain;
821
822 if (need_resched())
823 cond_resched();
824
825 samples++;
826
827 if (rcu_dereference_raw(*rthp) == NULL)
828 continue;
829 length = 0;
830 spin_lock_bh(rt_hash_lock_addr(i));
831 while ((rth = rcu_dereference_protected(*rthp,
832 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
833 prefetch(rth->dst.rt_next);
834 if (rt_is_expired(rth)) {
835 *rthp = rth->dst.rt_next;
836 rt_free(rth);
837 continue;
838 }
839 if (rth->dst.expires) {
840 /* Entry is expired even if it is in use */
841 if (time_before_eq(jiffies, rth->dst.expires)) {
842nofree:
843 tmo >>= 1;
844 rthp = &rth->dst.rt_next;
845 /*
846 * We only count entries on
847 * a chain with equal hash inputs once
848 * so that entries for different QOS
849 * levels, and other non-hash input
850 * attributes don't unfairly skew
851 * the length computation
852 */
853 length += has_noalias(rt_hash_table[i].chain, rth);
854 continue;
855 }
856 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
857 goto nofree;
858
859 /* Cleanup aged off entries. */
860 *rthp = rth->dst.rt_next;
861 rt_free(rth);
862 }
863 spin_unlock_bh(rt_hash_lock_addr(i));
864 sum += length;
865 sum2 += length*length;
866 }
867 if (samples) {
868 unsigned long avg = sum / samples;
869 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
870 rt_chain_length_max = max_t(unsigned long,
871 ip_rt_gc_elasticity,
872 (avg + 4*sd) >> FRACT_BITS);
873 }
874 rover = i;
875}
876
877/*
878 * rt_worker_func() is run in process context.
879 * we call rt_check_expire() to scan part of the hash table
880 */
881static void rt_worker_func(struct work_struct *work)
882{
883 rt_check_expire();
884 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
885}
886
887/* 823/*
 888 * Perturbation of rt_genid by a small quantity [1..256] 824 * Perturbation of rt_genid by a small quantity [1..256]
 889 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate() 825 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
@@ -1272,6 +1208,13 @@ skip_hashing:
1272 return 0; 1208 return 0;
1273} 1209}
1274 1210
1211static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1212
1213static u32 rt_peer_genid(void)
1214{
1215 return atomic_read(&__rt_peer_genid);
1216}
1217
1275void rt_bind_peer(struct rtable *rt, int create) 1218void rt_bind_peer(struct rtable *rt, int create)
1276{ 1219{
1277 struct inet_peer *peer; 1220 struct inet_peer *peer;
@@ -1280,6 +1223,8 @@ void rt_bind_peer(struct rtable *rt, int create)
1280 1223
1281 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1224 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1282 inet_putpeer(peer); 1225 inet_putpeer(peer);
1226 else
1227 rt->rt_peer_genid = rt_peer_genid();
1283} 1228}
1284 1229
1285/* 1230/*
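
rt_bind_peer() now stamps the route with rt_peer_genid(). The invalidation scheme this enables is small enough to state on its own: every event that deposits new data on an inet_peer bumps the global counter, and each cached rtable lazily revalidates in ipv4_dst_check() when its stamp no longer matches. A self-contained model (names shortened, illustrative only):

static atomic_t genid = ATOMIC_INIT(0);

static void peer_learned_something(void)   /* redirect or PMTU update */
{
        atomic_inc(&genid);        /* every cached stamp is now stale */
}

static bool route_stale(u32 stamp)
{
        return stamp != (u32) atomic_read(&genid);
}
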
@@ -1349,13 +1294,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
1349void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1294void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1350 __be32 saddr, struct net_device *dev) 1295 __be32 saddr, struct net_device *dev)
1351{ 1296{
1352 int i, k;
1353 struct in_device *in_dev = __in_dev_get_rcu(dev); 1297 struct in_device *in_dev = __in_dev_get_rcu(dev);
1354 struct rtable *rth; 1298 struct inet_peer *peer;
1355 struct rtable __rcu **rthp;
1356 __be32 skeys[2] = { saddr, 0 };
1357 int ikeys[2] = { dev->ifindex, 0 };
1358 struct netevent_redirect netevent;
1359 struct net *net; 1299 struct net *net;
1360 1300
1361 if (!in_dev) 1301 if (!in_dev)
@@ -1367,9 +1307,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1367 ipv4_is_zeronet(new_gw)) 1307 ipv4_is_zeronet(new_gw))
1368 goto reject_redirect; 1308 goto reject_redirect;
1369 1309
1370 if (!rt_caching(net))
1371 goto reject_redirect;
1372
1373 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1310 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1374 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1311 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1375 goto reject_redirect; 1312 goto reject_redirect;
@@ -1380,91 +1317,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1380 goto reject_redirect; 1317 goto reject_redirect;
1381 } 1318 }
1382 1319
1383 for (i = 0; i < 2; i++) { 1320 peer = inet_getpeer_v4(daddr, 1);
1384 for (k = 0; k < 2; k++) { 1321 if (peer) {
1385 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1322 peer->redirect_learned.a4 = new_gw;
1386 rt_genid(net));
1387
1388 rthp = &rt_hash_table[hash].chain;
1389
1390 while ((rth = rcu_dereference(*rthp)) != NULL) {
1391 struct rtable *rt;
1392
1393 if (rth->fl.fl4_dst != daddr ||
1394 rth->fl.fl4_src != skeys[i] ||
1395 rth->fl.oif != ikeys[k] ||
1396 rt_is_input_route(rth) ||
1397 rt_is_expired(rth) ||
1398 !net_eq(dev_net(rth->dst.dev), net)) {
1399 rthp = &rth->dst.rt_next;
1400 continue;
1401 }
1402
1403 if (rth->rt_dst != daddr ||
1404 rth->rt_src != saddr ||
1405 rth->dst.error ||
1406 rth->rt_gateway != old_gw ||
1407 rth->dst.dev != dev)
1408 break;
1409
1410 dst_hold(&rth->dst);
1411
1412 rt = dst_alloc(&ipv4_dst_ops);
1413 if (rt == NULL) {
1414 ip_rt_put(rth);
1415 return;
1416 }
1417
1418 /* Copy all the information. */
1419 *rt = *rth;
1420 rt->dst.__use = 1;
1421 atomic_set(&rt->dst.__refcnt, 1);
1422 rt->dst.child = NULL;
1423 if (rt->dst.dev)
1424 dev_hold(rt->dst.dev);
1425 rt->dst.obsolete = -1;
1426 rt->dst.lastuse = jiffies;
1427 rt->dst.path = &rt->dst;
1428 rt->dst.neighbour = NULL;
1429 rt->dst.hh = NULL;
1430#ifdef CONFIG_XFRM
1431 rt->dst.xfrm = NULL;
1432#endif
1433 rt->rt_genid = rt_genid(net);
1434 rt->rt_flags |= RTCF_REDIRECTED;
1435
1436 /* Gateway is different ... */
1437 rt->rt_gateway = new_gw;
1438
1439 /* Redirect received -> path was valid */
1440 dst_confirm(&rth->dst);
1441
1442 if (rt->peer)
1443 atomic_inc(&rt->peer->refcnt);
1444
1445 if (arp_bind_neighbour(&rt->dst) ||
1446 !(rt->dst.neighbour->nud_state &
1447 NUD_VALID)) {
1448 if (rt->dst.neighbour)
1449 neigh_event_send(rt->dst.neighbour, NULL);
1450 ip_rt_put(rth);
1451 rt_drop(rt);
1452 goto do_next;
1453 }
1454 1323
1455 netevent.old = &rth->dst; 1324 inet_putpeer(peer);
1456 netevent.new = &rt->dst;
1457 call_netevent_notifiers(NETEVENT_REDIRECT,
1458 &netevent);
1459 1325
1460 rt_del(hash, rth); 1326 atomic_inc(&__rt_peer_genid);
1461 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1462 ip_rt_put(rt);
1463 goto do_next;
1464 }
1465 do_next:
1466 ;
1467 }
1468 } 1327 }
1469 return; 1328 return;
1470 1329
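
The deleted loop cloned and re-hashed a cache entry for each (source key, ifindex) combination; the replacement only records the learned gateway on the destination's peer and bumps the generation count, deferring both neighbour rebinding and the netevent notification to check_peer_redir() further down. The producer half, condensed from the hunk above into a standalone sketch (record_redirect is a hypothetical name):

static void record_redirect(__be32 daddr, __be32 new_gw)
{
        struct inet_peer *peer = inet_getpeer_v4(daddr, 1); /* create if absent */

        if (peer) {
                peer->redirect_learned.a4 = new_gw;  /* remember new gateway */
                inet_putpeer(peer);
                atomic_inc(&__rt_peer_genid);        /* routes notice lazily */
        }
}
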
@@ -1488,9 +1347,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1488 if (dst->obsolete > 0) { 1347 if (dst->obsolete > 0) {
1489 ip_rt_put(rt); 1348 ip_rt_put(rt);
1490 ret = NULL; 1349 ret = NULL;
1491 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1350 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1492 (rt->dst.expires &&
1493 time_after_eq(jiffies, rt->dst.expires))) {
1494 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, 1351 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1495 rt->fl.oif, 1352 rt->fl.oif,
1496 rt_genid(dev_net(dst->dev))); 1353 rt_genid(dev_net(dst->dev)));
@@ -1500,6 +1357,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1500#endif 1357#endif
1501 rt_del(hash, rt); 1358 rt_del(hash, rt);
1502 ret = NULL; 1359 ret = NULL;
1360 } else if (rt->peer &&
1361 rt->peer->pmtu_expires &&
1362 time_after_eq(jiffies, rt->peer->pmtu_expires)) {
1363 unsigned long orig = rt->peer->pmtu_expires;
1364
1365 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1366 dst_metric_set(dst, RTAX_MTU,
1367 rt->peer->pmtu_orig);
1503 } 1368 }
1504 } 1369 }
1505 return ret; 1370 return ret;
@@ -1525,6 +1390,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1525{ 1390{
1526 struct rtable *rt = skb_rtable(skb); 1391 struct rtable *rt = skb_rtable(skb);
1527 struct in_device *in_dev; 1392 struct in_device *in_dev;
1393 struct inet_peer *peer;
1528 int log_martians; 1394 int log_martians;
1529 1395
1530 rcu_read_lock(); 1396 rcu_read_lock();
@@ -1536,33 +1402,41 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1536 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1402 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1537 rcu_read_unlock(); 1403 rcu_read_unlock();
1538 1404
1405 if (!rt->peer)
1406 rt_bind_peer(rt, 1);
1407 peer = rt->peer;
1408 if (!peer) {
1409 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1410 return;
1411 }
1412
1539 /* No redirected packets during ip_rt_redirect_silence; 1413 /* No redirected packets during ip_rt_redirect_silence;
1540 * reset the algorithm. 1414 * reset the algorithm.
1541 */ 1415 */
1542 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) 1416 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1543 rt->dst.rate_tokens = 0; 1417 peer->rate_tokens = 0;
1544 1418
1545 /* Too many ignored redirects; do not send anything 1419 /* Too many ignored redirects; do not send anything
1546 * set dst.rate_last to the last seen redirected packet. 1420 * set dst.rate_last to the last seen redirected packet.
1547 */ 1421 */
1548 if (rt->dst.rate_tokens >= ip_rt_redirect_number) { 1422 if (peer->rate_tokens >= ip_rt_redirect_number) {
1549 rt->dst.rate_last = jiffies; 1423 peer->rate_last = jiffies;
1550 return; 1424 return;
1551 } 1425 }
1552 1426
1553 /* Check for load limit; set rate_last to the latest sent 1427 /* Check for load limit; set rate_last to the latest sent
1554 * redirect. 1428 * redirect.
1555 */ 1429 */
1556 if (rt->dst.rate_tokens == 0 || 1430 if (peer->rate_tokens == 0 ||
1557 time_after(jiffies, 1431 time_after(jiffies,
1558 (rt->dst.rate_last + 1432 (peer->rate_last +
1559 (ip_rt_redirect_load << rt->dst.rate_tokens)))) { 1433 (ip_rt_redirect_load << peer->rate_tokens)))) {
1560 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1434 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1561 rt->dst.rate_last = jiffies; 1435 peer->rate_last = jiffies;
1562 ++rt->dst.rate_tokens; 1436 ++peer->rate_tokens;
1563#ifdef CONFIG_IP_ROUTE_VERBOSE 1437#ifdef CONFIG_IP_ROUTE_VERBOSE
1564 if (log_martians && 1438 if (log_martians &&
1565 rt->dst.rate_tokens == ip_rt_redirect_number && 1439 peer->rate_tokens == ip_rt_redirect_number &&
1566 net_ratelimit()) 1440 net_ratelimit())
1567 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1441 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1568 &rt->rt_src, rt->rt_iif, 1442 &rt->rt_src, rt->rt_iif,
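
Moving rate_tokens/rate_last from the dst to the peer makes the redirect throttle per destination host rather than per cache entry. The policy itself is exponential backoff: each redirect sent raises rate_tokens, the required gap grows as ip_rt_redirect_load << rate_tokens, sending stops at ip_rt_redirect_number, and a quiet stretch of ip_rt_redirect_silence resets the counter. As a standalone predicate (illustrative; the real code also advances rate_last when it sends or gives up):

static bool redirect_allowed(struct inet_peer *peer, unsigned long now)
{
        if (time_after(now, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;          /* long quiet: start over */
        if (peer->rate_tokens >= ip_rt_redirect_number)
                return false;                   /* host ignores redirects */
        return peer->rate_tokens == 0 ||
               time_after(now, peer->rate_last +
                               (ip_rt_redirect_load << peer->rate_tokens));
}
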
@@ -1574,7 +1448,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1574static int ip_error(struct sk_buff *skb) 1448static int ip_error(struct sk_buff *skb)
1575{ 1449{
1576 struct rtable *rt = skb_rtable(skb); 1450 struct rtable *rt = skb_rtable(skb);
1451 struct inet_peer *peer;
1577 unsigned long now; 1452 unsigned long now;
1453 bool send;
1578 int code; 1454 int code;
1579 1455
1580 switch (rt->dst.error) { 1456 switch (rt->dst.error) {
@@ -1594,15 +1470,24 @@ static int ip_error(struct sk_buff *skb)
1594 break; 1470 break;
1595 } 1471 }
1596 1472
1597 now = jiffies; 1473 if (!rt->peer)
1598 rt->dst.rate_tokens += now - rt->dst.rate_last; 1474 rt_bind_peer(rt, 1);
1599 if (rt->dst.rate_tokens > ip_rt_error_burst) 1475 peer = rt->peer;
1600 rt->dst.rate_tokens = ip_rt_error_burst; 1476
1601 rt->dst.rate_last = now; 1477 send = true;
1602 if (rt->dst.rate_tokens >= ip_rt_error_cost) { 1478 if (peer) {
1603 rt->dst.rate_tokens -= ip_rt_error_cost; 1479 now = jiffies;
1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1480 peer->rate_tokens += now - peer->rate_last;
1481 if (peer->rate_tokens > ip_rt_error_burst)
1482 peer->rate_tokens = ip_rt_error_burst;
1483 peer->rate_last = now;
1484 if (peer->rate_tokens >= ip_rt_error_cost)
1485 peer->rate_tokens -= ip_rt_error_cost;
1486 else
1487 send = false;
1605 } 1488 }
1489 if (send)
1490 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1606 1491
1607out: kfree_skb(skb); 1492out: kfree_skb(skb);
1608 return 0; 1493 return 0;
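
The ICMP unreachable limiter keeps its shape but is likewise re-homed onto the peer: a token bucket where tokens accrue one per jiffy since rate_last, cap at ip_rt_error_burst, and each message costs ip_rt_error_cost. Note the new behaviour when no peer can be allocated: send stays true, so the message goes out unthrottled. The mechanism in isolation, as a generic helper (illustrative only):

static bool bucket_take(unsigned long *tokens, unsigned long *last,
                        unsigned long now, unsigned long burst,
                        unsigned long cost)
{
        *tokens = min(*tokens + (now - *last), burst);  /* refill, capped */
        *last = now;
        if (*tokens < cost)
                return false;           /* over budget: suppress message */
        *tokens -= cost;
        return true;
}
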
@@ -1630,88 +1515,130 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1630 unsigned short new_mtu, 1515 unsigned short new_mtu,
1631 struct net_device *dev) 1516 struct net_device *dev)
1632{ 1517{
1633 int i, k;
1634 unsigned short old_mtu = ntohs(iph->tot_len); 1518 unsigned short old_mtu = ntohs(iph->tot_len);
1635 struct rtable *rth;
1636 int ikeys[2] = { dev->ifindex, 0 };
1637 __be32 skeys[2] = { iph->saddr, 0, };
1638 __be32 daddr = iph->daddr;
1639 unsigned short est_mtu = 0; 1519 unsigned short est_mtu = 0;
1520 struct inet_peer *peer;
1640 1521
1641 for (k = 0; k < 2; k++) { 1522 peer = inet_getpeer_v4(iph->daddr, 1);
1642 for (i = 0; i < 2; i++) { 1523 if (peer) {
1643 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], 1524 unsigned short mtu = new_mtu;
1644 rt_genid(net));
1645
1646 rcu_read_lock();
1647 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1648 rth = rcu_dereference(rth->dst.rt_next)) {
1649 unsigned short mtu = new_mtu;
1650
1651 if (rth->fl.fl4_dst != daddr ||
1652 rth->fl.fl4_src != skeys[i] ||
1653 rth->rt_dst != daddr ||
1654 rth->rt_src != iph->saddr ||
1655 rth->fl.oif != ikeys[k] ||
1656 rt_is_input_route(rth) ||
1657 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1658 !net_eq(dev_net(rth->dst.dev), net) ||
1659 rt_is_expired(rth))
1660 continue;
1661
1662 if (new_mtu < 68 || new_mtu >= old_mtu) {
1663 1525
1664 /* BSD 4.2 compatibility hack :-( */ 1526 if (new_mtu < 68 || new_mtu >= old_mtu) {
1665 if (mtu == 0 && 1527 /* BSD 4.2 derived systems incorrectly adjust
1666 old_mtu >= dst_mtu(&rth->dst) && 1528 * tot_len by the IP header length, and report
1667 old_mtu >= 68 + (iph->ihl << 2)) 1529 * a zero MTU in the ICMP message.
1668 old_mtu -= iph->ihl << 2; 1530 */
1531 if (mtu == 0 &&
1532 old_mtu >= 68 + (iph->ihl << 2))
1533 old_mtu -= iph->ihl << 2;
1534 mtu = guess_mtu(old_mtu);
1535 }
1669 1536
1670 mtu = guess_mtu(old_mtu); 1537 if (mtu < ip_rt_min_pmtu)
1671 } 1538 mtu = ip_rt_min_pmtu;
1672 if (mtu <= dst_mtu(&rth->dst)) { 1539 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1673 if (mtu < dst_mtu(&rth->dst)) { 1540 est_mtu = mtu;
1674 dst_confirm(&rth->dst); 1541 peer->pmtu_learned = mtu;
1675 if (mtu < ip_rt_min_pmtu) { 1542 peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
1676 u32 lock = dst_metric(&rth->dst,
1677 RTAX_LOCK);
1678 mtu = ip_rt_min_pmtu;
1679 lock |= (1 << RTAX_MTU);
1680 dst_metric_set(&rth->dst, RTAX_LOCK,
1681 lock);
1682 }
1683 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1684 dst_set_expires(&rth->dst,
1685 ip_rt_mtu_expires);
1686 }
1687 est_mtu = mtu;
1688 }
1689 }
1690 rcu_read_unlock();
1691 } 1543 }
1544
1545 inet_putpeer(peer);
1546
1547 atomic_inc(&__rt_peer_genid);
1692 } 1548 }
1693 return est_mtu ? : new_mtu; 1549 return est_mtu ? : new_mtu;
1694} 1550}
1695 1551
1552static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1553{
1554 unsigned long expires = peer->pmtu_expires;
1555
1556 if (time_before(expires, jiffies)) {
1557 u32 orig_dst_mtu = dst_mtu(dst);
1558 if (peer->pmtu_learned < orig_dst_mtu) {
1559 if (!peer->pmtu_orig)
1560 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1561 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1562 }
1563 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1564 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1565}
1566
1696static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1567static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1697{ 1568{
1698 if (dst_mtu(dst) > mtu && mtu >= 68 && 1569 struct rtable *rt = (struct rtable *) dst;
1699 !(dst_metric_locked(dst, RTAX_MTU))) { 1570 struct inet_peer *peer;
1700 if (mtu < ip_rt_min_pmtu) { 1571
1701 u32 lock = dst_metric(dst, RTAX_LOCK); 1572 dst_confirm(dst);
1573
1574 if (!rt->peer)
1575 rt_bind_peer(rt, 1);
1576 peer = rt->peer;
1577 if (peer) {
1578 if (mtu < ip_rt_min_pmtu)
1702 mtu = ip_rt_min_pmtu; 1579 mtu = ip_rt_min_pmtu;
1703 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU)); 1580 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1581 peer->pmtu_learned = mtu;
1582 peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
1583
1584 atomic_inc(&__rt_peer_genid);
1585 rt->rt_peer_genid = rt_peer_genid();
1586
1587 check_peer_pmtu(dst, peer);
1704 } 1588 }
1705 dst_metric_set(dst, RTAX_MTU, mtu); 1589 inet_putpeer(peer);
1706 dst_set_expires(dst, ip_rt_mtu_expires); 1590 }
1707 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); 1591}
1592
1593static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1594{
1595 struct rtable *rt = (struct rtable *) dst;
1596 __be32 orig_gw = rt->rt_gateway;
1597
1598 dst_confirm(&rt->dst);
1599
1600 neigh_release(rt->dst.neighbour);
1601 rt->dst.neighbour = NULL;
1602
1603 rt->rt_gateway = peer->redirect_learned.a4;
1604 if (arp_bind_neighbour(&rt->dst) ||
1605 !(rt->dst.neighbour->nud_state & NUD_VALID)) {
1606 if (rt->dst.neighbour)
1607 neigh_event_send(rt->dst.neighbour, NULL);
1608 rt->rt_gateway = orig_gw;
1609 return -EAGAIN;
1610 } else {
1611 rt->rt_flags |= RTCF_REDIRECTED;
1612 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
1613 rt->dst.neighbour);
1708 } 1614 }
1615 return 0;
1709} 1616}
1710 1617
1711static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1618static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1712{ 1619{
1713 if (rt_is_expired((struct rtable *)dst)) 1620 struct rtable *rt = (struct rtable *) dst;
1621
1622 if (rt_is_expired(rt))
1714 return NULL; 1623 return NULL;
1624 if (rt->rt_peer_genid != rt_peer_genid()) {
1625 struct inet_peer *peer;
1626
1627 if (!rt->peer)
1628 rt_bind_peer(rt, 0);
1629
1630 peer = rt->peer;
1631 if (peer && peer->pmtu_expires)
1632 check_peer_pmtu(dst, peer);
1633
1634 if (peer && peer->redirect_learned.a4 &&
1635 peer->redirect_learned.a4 != rt->rt_gateway) {
1636 if (check_peer_redir(dst, peer))
1637 return NULL;
1638 }
1639
1640 rt->rt_peer_genid = rt_peer_genid();
1641 }
1715 return dst; 1642 return dst;
1716} 1643}
1717 1644
@@ -1720,6 +1647,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
1720 struct rtable *rt = (struct rtable *) dst; 1647 struct rtable *rt = (struct rtable *) dst;
1721 struct inet_peer *peer = rt->peer; 1648 struct inet_peer *peer = rt->peer;
1722 1649
1650 if (rt->fi) {
1651 fib_info_put(rt->fi);
1652 rt->fi = NULL;
1653 }
1723 if (peer) { 1654 if (peer) {
1724 rt->peer = NULL; 1655 rt->peer = NULL;
1725 inet_putpeer(peer); 1656 inet_putpeer(peer);
@@ -1734,8 +1665,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
1734 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1665 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1735 1666
1736 rt = skb_rtable(skb); 1667 rt = skb_rtable(skb);
1737 if (rt) 1668 if (rt &&
1738 dst_set_expires(&rt->dst, 0); 1669 rt->peer &&
1670 rt->peer->pmtu_expires) {
1671 unsigned long orig = rt->peer->pmtu_expires;
1672
1673 if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
1674 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1675 }
1739} 1676}
1740 1677
1741static int ip_rt_bug(struct sk_buff *skb) 1678static int ip_rt_bug(struct sk_buff *skb)
@@ -1775,7 +1712,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1775 memcpy(addr, &src, 4); 1712 memcpy(addr, &src, 4);
1776} 1713}
1777 1714
1778#ifdef CONFIG_NET_CLS_ROUTE 1715#ifdef CONFIG_IP_ROUTE_CLASSID
1779static void set_class_tag(struct rtable *rt, u32 tag) 1716static void set_class_tag(struct rtable *rt, u32 tag)
1780{ 1717{
1781 if (!(rt->dst.tclassid & 0xFFFF)) 1718 if (!(rt->dst.tclassid & 0xFFFF))
@@ -1815,17 +1752,52 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1815 return mtu; 1752 return mtu;
1816} 1753}
1817 1754
1818static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1755static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
1756{
1757 struct inet_peer *peer;
1758 int create = 0;
1759
1760 /* If a peer entry exists for this destination, we must hook
1761 * it up in order to get at cached metrics.
1762 */
1763 if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
1764 create = 1;
1765
1766 rt_bind_peer(rt, create);
1767 peer = rt->peer;
1768 if (peer) {
1769 if (inet_metrics_new(peer))
1770 memcpy(peer->metrics, fi->fib_metrics,
1771 sizeof(u32) * RTAX_MAX);
1772 dst_init_metrics(&rt->dst, peer->metrics, false);
1773
1774 if (peer->pmtu_expires)
1775 check_peer_pmtu(&rt->dst, peer);
1776 if (peer->redirect_learned.a4 &&
1777 peer->redirect_learned.a4 != rt->rt_gateway) {
1778 rt->rt_gateway = peer->redirect_learned.a4;
1779 rt->rt_flags |= RTCF_REDIRECTED;
1780 }
1781 } else {
1782 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1783 rt->fi = fi;
1784 atomic_inc(&fi->fib_clntref);
1785 }
1786 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1787 }
1788}
1789
1790static void rt_set_nexthop(struct rtable *rt, const struct fib_result *res,
1791 struct fib_info *fi, u16 type, u32 itag)
1819{ 1792{
1820 struct dst_entry *dst = &rt->dst; 1793 struct dst_entry *dst = &rt->dst;
1821 struct fib_info *fi = res->fi;
1822 1794
1823 if (fi) { 1795 if (fi) {
1824 if (FIB_RES_GW(*res) && 1796 if (FIB_RES_GW(*res) &&
1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1797 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826 rt->rt_gateway = FIB_RES_GW(*res); 1798 rt->rt_gateway = FIB_RES_GW(*res);
1827 dst_import_metrics(dst, fi->fib_metrics); 1799 rt_init_metrics(rt, fi);
1828#ifdef CONFIG_NET_CLS_ROUTE 1800#ifdef CONFIG_IP_ROUTE_CLASSID
1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1801 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1830#endif 1802#endif
1831 } 1803 }
@@ -1835,13 +1807,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) 1807 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); 1808 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1837 1809
1838#ifdef CONFIG_NET_CLS_ROUTE 1810#ifdef CONFIG_IP_ROUTE_CLASSID
1839#ifdef CONFIG_IP_MULTIPLE_TABLES 1811#ifdef CONFIG_IP_MULTIPLE_TABLES
1840 set_class_tag(rt, fib_rules_tclass(res)); 1812 set_class_tag(rt, fib_rules_tclass(res));
1841#endif 1813#endif
1842 set_class_tag(rt, itag); 1814 set_class_tag(rt, itag);
1843#endif 1815#endif
1844 rt->rt_type = res->type; 1816 rt->rt_type = type;
1817}
1818
1819static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
1820{
1821 struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1);
1822 if (rt) {
1823 rt->dst.obsolete = -1;
1824
1825 rt->dst.flags = DST_HOST |
1826 (nopolicy ? DST_NOPOLICY : 0) |
1827 (noxfrm ? DST_NOXFRM : 0);
1828 }
1829 return rt;
1845} 1830}
1846 1831
1847/* called in rcu_read_lock() section */ 1832/* called in rcu_read_lock() section */
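
Two things to take away from this hunk. rt_init_metrics() leaves a route in one of two ownership states that later code must respect: peer-backed metrics are shared and writable, while fib-backed metrics stay read-only with fib_clntref pinning rt->fi until ipv4_cow_metrics() copies them out. And rt_dst_alloc() folds the flag boilerplate repeated at three allocation sites into one helper, with dst_alloc() now taking the initial reference count. A hypothetical call site, to show the intended shape:

static struct rtable *example_alloc(struct in_device *in_dev,
                                    struct in_device *out_dev)
{
        struct rtable *rth;

        rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),  /* DST_NOPOLICY? */
                           IN_DEV_CONF_GET(out_dev, NOXFRM));  /* DST_NOXFRM? */
        if (!rth)
                return NULL;
        /* rth arrives with __refcnt = 1, obsolete = -1 and DST_HOST set */
        return rth;
}
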
@@ -1874,24 +1859,19 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1874 if (err < 0) 1859 if (err < 0)
1875 goto e_err; 1860 goto e_err;
1876 } 1861 }
1877 rth = dst_alloc(&ipv4_dst_ops); 1862 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1878 if (!rth) 1863 if (!rth)
1879 goto e_nobufs; 1864 goto e_nobufs;
1880 1865
1881 rth->dst.output = ip_rt_bug; 1866 rth->dst.output = ip_rt_bug;
1882 rth->dst.obsolete = -1;
1883 1867
1884 atomic_set(&rth->dst.__refcnt, 1);
1885 rth->dst.flags= DST_HOST;
1886 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1887 rth->dst.flags |= DST_NOPOLICY;
1888 rth->fl.fl4_dst = daddr; 1868 rth->fl.fl4_dst = daddr;
1889 rth->rt_dst = daddr; 1869 rth->rt_dst = daddr;
1890 rth->fl.fl4_tos = tos; 1870 rth->fl.fl4_tos = tos;
1891 rth->fl.mark = skb->mark; 1871 rth->fl.mark = skb->mark;
1892 rth->fl.fl4_src = saddr; 1872 rth->fl.fl4_src = saddr;
1893 rth->rt_src = saddr; 1873 rth->rt_src = saddr;
1894#ifdef CONFIG_NET_CLS_ROUTE 1874#ifdef CONFIG_IP_ROUTE_CLASSID
1895 rth->dst.tclassid = itag; 1875 rth->dst.tclassid = itag;
1896#endif 1876#endif
1897 rth->rt_iif = 1877 rth->rt_iif =
@@ -1959,7 +1939,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1959 1939
1960/* called in rcu_read_lock() section */ 1940/* called in rcu_read_lock() section */
1961static int __mkroute_input(struct sk_buff *skb, 1941static int __mkroute_input(struct sk_buff *skb,
1962 struct fib_result *res, 1942 const struct fib_result *res,
1963 struct in_device *in_dev, 1943 struct in_device *in_dev,
1964 __be32 daddr, __be32 saddr, u32 tos, 1944 __be32 daddr, __be32 saddr, u32 tos,
1965 struct rtable **result) 1945 struct rtable **result)
@@ -2013,19 +1993,13 @@ static int __mkroute_input(struct sk_buff *skb,
2013 } 1993 }
2014 } 1994 }
2015 1995
2016 1996 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2017 rth = dst_alloc(&ipv4_dst_ops); 1997 IN_DEV_CONF_GET(out_dev, NOXFRM));
2018 if (!rth) { 1998 if (!rth) {
2019 err = -ENOBUFS; 1999 err = -ENOBUFS;
2020 goto cleanup; 2000 goto cleanup;
2021 } 2001 }
2022 2002
2023 atomic_set(&rth->dst.__refcnt, 1);
2024 rth->dst.flags= DST_HOST;
2025 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2026 rth->dst.flags |= DST_NOPOLICY;
2027 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2028 rth->dst.flags |= DST_NOXFRM;
2029 rth->fl.fl4_dst = daddr; 2003 rth->fl.fl4_dst = daddr;
2030 rth->rt_dst = daddr; 2004 rth->rt_dst = daddr;
2031 rth->fl.fl4_tos = tos; 2005 rth->fl.fl4_tos = tos;
@@ -2040,12 +2014,11 @@ static int __mkroute_input(struct sk_buff *skb,
2040 rth->fl.oif = 0; 2014 rth->fl.oif = 0;
2041 rth->rt_spec_dst= spec_dst; 2015 rth->rt_spec_dst= spec_dst;
2042 2016
2043 rth->dst.obsolete = -1;
2044 rth->dst.input = ip_forward; 2017 rth->dst.input = ip_forward;
2045 rth->dst.output = ip_output; 2018 rth->dst.output = ip_output;
2046 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2019 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2047 2020
2048 rt_set_nexthop(rth, res, itag); 2021 rt_set_nexthop(rth, res, res->fi, res->type, itag);
2049 2022
2050 rth->rt_flags = flags; 2023 rth->rt_flags = flags;
2051 2024
@@ -2190,25 +2163,20 @@ brd_input:
2190 RT_CACHE_STAT_INC(in_brd); 2163 RT_CACHE_STAT_INC(in_brd);
2191 2164
2192local_input: 2165local_input:
2193 rth = dst_alloc(&ipv4_dst_ops); 2166 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2194 if (!rth) 2167 if (!rth)
2195 goto e_nobufs; 2168 goto e_nobufs;
2196 2169
2197 rth->dst.output= ip_rt_bug; 2170 rth->dst.output= ip_rt_bug;
2198 rth->dst.obsolete = -1;
2199 rth->rt_genid = rt_genid(net); 2171 rth->rt_genid = rt_genid(net);
2200 2172
2201 atomic_set(&rth->dst.__refcnt, 1);
2202 rth->dst.flags= DST_HOST;
2203 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2204 rth->dst.flags |= DST_NOPOLICY;
2205 rth->fl.fl4_dst = daddr; 2173 rth->fl.fl4_dst = daddr;
2206 rth->rt_dst = daddr; 2174 rth->rt_dst = daddr;
2207 rth->fl.fl4_tos = tos; 2175 rth->fl.fl4_tos = tos;
2208 rth->fl.mark = skb->mark; 2176 rth->fl.mark = skb->mark;
2209 rth->fl.fl4_src = saddr; 2177 rth->fl.fl4_src = saddr;
2210 rth->rt_src = saddr; 2178 rth->rt_src = saddr;
2211#ifdef CONFIG_NET_CLS_ROUTE 2179#ifdef CONFIG_IP_ROUTE_CLASSID
2212 rth->dst.tclassid = itag; 2180 rth->dst.tclassid = itag;
2213#endif 2181#endif
2214 rth->rt_iif = 2182 rth->rt_iif =
@@ -2351,38 +2319,39 @@ skip_cache:
2351EXPORT_SYMBOL(ip_route_input_common); 2319EXPORT_SYMBOL(ip_route_input_common);
2352 2320
2353/* called with rcu_read_lock() */ 2321/* called with rcu_read_lock() */
2354static int __mkroute_output(struct rtable **result, 2322static struct rtable *__mkroute_output(const struct fib_result *res,
2355 struct fib_result *res, 2323 const struct flowi *fl,
2356 const struct flowi *fl, 2324 const struct flowi *oldflp,
2357 const struct flowi *oldflp, 2325 struct net_device *dev_out,
2358 struct net_device *dev_out, 2326 unsigned int flags)
2359 unsigned flags)
2360{ 2327{
2361 struct rtable *rth; 2328 struct fib_info *fi = res->fi;
2362 struct in_device *in_dev;
2363 u32 tos = RT_FL_TOS(oldflp); 2329 u32 tos = RT_FL_TOS(oldflp);
2330 struct in_device *in_dev;
2331 u16 type = res->type;
2332 struct rtable *rth;
2364 2333
2365 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) 2334 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2366 return -EINVAL; 2335 return ERR_PTR(-EINVAL);
2367 2336
2368 if (ipv4_is_lbcast(fl->fl4_dst)) 2337 if (ipv4_is_lbcast(fl->fl4_dst))
2369 res->type = RTN_BROADCAST; 2338 type = RTN_BROADCAST;
2370 else if (ipv4_is_multicast(fl->fl4_dst)) 2339 else if (ipv4_is_multicast(fl->fl4_dst))
2371 res->type = RTN_MULTICAST; 2340 type = RTN_MULTICAST;
2372 else if (ipv4_is_zeronet(fl->fl4_dst)) 2341 else if (ipv4_is_zeronet(fl->fl4_dst))
2373 return -EINVAL; 2342 return ERR_PTR(-EINVAL);
2374 2343
2375 if (dev_out->flags & IFF_LOOPBACK) 2344 if (dev_out->flags & IFF_LOOPBACK)
2376 flags |= RTCF_LOCAL; 2345 flags |= RTCF_LOCAL;
2377 2346
2378 in_dev = __in_dev_get_rcu(dev_out); 2347 in_dev = __in_dev_get_rcu(dev_out);
2379 if (!in_dev) 2348 if (!in_dev)
2380 return -EINVAL; 2349 return ERR_PTR(-EINVAL);
2381 2350
2382 if (res->type == RTN_BROADCAST) { 2351 if (type == RTN_BROADCAST) {
2383 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2352 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2384 res->fi = NULL; 2353 fi = NULL;
2385 } else if (res->type == RTN_MULTICAST) { 2354 } else if (type == RTN_MULTICAST) {
2386 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2355 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2387 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2356 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2388 oldflp->proto)) 2357 oldflp->proto))
@@ -2391,21 +2360,14 @@ static int __mkroute_output(struct rtable **result,
2391 * default one, but do not gateway in this case. 2360 * default one, but do not gateway in this case.
 2392 * Yes, it is a hack. 2361 * Yes, it is a hack.
2393 */ 2362 */
2394 if (res->fi && res->prefixlen < 4) 2363 if (fi && res->prefixlen < 4)
2395 res->fi = NULL; 2364 fi = NULL;
2396 } 2365 }
2397 2366
2398 2367 rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
2399 rth = dst_alloc(&ipv4_dst_ops); 2368 IN_DEV_CONF_GET(in_dev, NOXFRM));
2400 if (!rth) 2369 if (!rth)
2401 return -ENOBUFS; 2370 return ERR_PTR(-ENOBUFS);
2402
2403 atomic_set(&rth->dst.__refcnt, 1);
2404 rth->dst.flags= DST_HOST;
2405 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2406 rth->dst.flags |= DST_NOXFRM;
2407 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2408 rth->dst.flags |= DST_NOPOLICY;
2409 2371
2410 rth->fl.fl4_dst = oldflp->fl4_dst; 2372 rth->fl.fl4_dst = oldflp->fl4_dst;
2411 rth->fl.fl4_tos = tos; 2373 rth->fl.fl4_tos = tos;
@@ -2423,7 +2385,6 @@ static int __mkroute_output(struct rtable **result,
2423 rth->rt_spec_dst= fl->fl4_src; 2385 rth->rt_spec_dst= fl->fl4_src;
2424 2386
2425 rth->dst.output=ip_output; 2387 rth->dst.output=ip_output;
2426 rth->dst.obsolete = -1;
2427 rth->rt_genid = rt_genid(dev_net(dev_out)); 2388 rth->rt_genid = rt_genid(dev_net(dev_out));
2428 2389
2429 RT_CACHE_STAT_INC(out_slow_tot); 2390 RT_CACHE_STAT_INC(out_slow_tot);
@@ -2440,7 +2401,7 @@ static int __mkroute_output(struct rtable **result,
2440 RT_CACHE_STAT_INC(out_slow_mc); 2401 RT_CACHE_STAT_INC(out_slow_mc);
2441 } 2402 }
2442#ifdef CONFIG_IP_MROUTE 2403#ifdef CONFIG_IP_MROUTE
2443 if (res->type == RTN_MULTICAST) { 2404 if (type == RTN_MULTICAST) {
2444 if (IN_DEV_MFORWARD(in_dev) && 2405 if (IN_DEV_MFORWARD(in_dev) &&
2445 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2406 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2446 rth->dst.input = ip_mr_input; 2407 rth->dst.input = ip_mr_input;
@@ -2450,31 +2411,10 @@ static int __mkroute_output(struct rtable **result,
2450#endif 2411#endif
2451 } 2412 }
2452 2413
2453 rt_set_nexthop(rth, res, 0); 2414 rt_set_nexthop(rth, res, fi, type, 0);
2454 2415
2455 rth->rt_flags = flags; 2416 rth->rt_flags = flags;
2456 *result = rth; 2417 return rth;
2457 return 0;
2458}
2459
2460/* called with rcu_read_lock() */
2461static int ip_mkroute_output(struct rtable **rp,
2462 struct fib_result *res,
2463 const struct flowi *fl,
2464 const struct flowi *oldflp,
2465 struct net_device *dev_out,
2466 unsigned flags)
2467{
2468 struct rtable *rth = NULL;
2469 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2470 unsigned hash;
2471 if (err == 0) {
2472 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2473 rt_genid(dev_net(dev_out)));
2474 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2475 }
2476
2477 return err;
2478} 2418}
2479 2419
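
__mkroute_output() now encodes failure in the returned pointer instead of an int plus output parameter. For reference, the ERR_PTR()/IS_ERR()/PTR_ERR() trio from include/linux/err.h works roughly as below (paraphrased, not part of this patch): errno values occupy the last page of the address space, so a returned pointer is either valid or a small negative number in disguise.

static inline void *ERR_PTR(long error)
{
        return (void *) error;          /* -errno smuggled as a pointer */
}

static inline long PTR_ERR(const void *ptr)
{
        return (long) ptr;              /* pointer back to -errno */
}

static inline long IS_ERR(const void *ptr)
{
        return (unsigned long) ptr >= (unsigned long) -4095;
}
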
2480/* 2420/*
@@ -2497,6 +2437,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2497 struct fib_result res; 2437 struct fib_result res;
2498 unsigned int flags = 0; 2438 unsigned int flags = 0;
2499 struct net_device *dev_out = NULL; 2439 struct net_device *dev_out = NULL;
2440 struct rtable *rth;
2500 int err; 2441 int err;
2501 2442
2502 2443
@@ -2505,6 +2446,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2505 res.r = NULL; 2446 res.r = NULL;
2506#endif 2447#endif
2507 2448
2449 rcu_read_lock();
2508 if (oldflp->fl4_src) { 2450 if (oldflp->fl4_src) {
2509 err = -EINVAL; 2451 err = -EINVAL;
2510 if (ipv4_is_multicast(oldflp->fl4_src) || 2452 if (ipv4_is_multicast(oldflp->fl4_src) ||
@@ -2645,7 +2587,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2645 else 2587 else
2646#endif 2588#endif
2647 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2589 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2648 fib_select_default(net, &fl, &res); 2590 fib_select_default(&res);
2649 2591
2650 if (!fl.fl4_src) 2592 if (!fl.fl4_src)
2651 fl.fl4_src = FIB_RES_PREFSRC(res); 2593 fl.fl4_src = FIB_RES_PREFSRC(res);
@@ -2655,17 +2597,27 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2655 2597
2656 2598
2657make_route: 2599make_route:
2658 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2600 rth = __mkroute_output(&res, &fl, oldflp, dev_out, flags);
2601 if (IS_ERR(rth))
2602 err = PTR_ERR(rth);
2603 else {
2604 unsigned int hash;
2659 2605
2660out: return err; 2606 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2607 rt_genid(dev_net(dev_out)));
2608 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2609 }
2610
2611out:
2612 rcu_read_unlock();
2613 return err;
2661} 2614}
2662 2615
2663int __ip_route_output_key(struct net *net, struct rtable **rp, 2616int __ip_route_output_key(struct net *net, struct rtable **rp,
2664 const struct flowi *flp) 2617 const struct flowi *flp)
2665{ 2618{
2666 unsigned int hash;
2667 int res;
2668 struct rtable *rth; 2619 struct rtable *rth;
2620 unsigned int hash;
2669 2621
2670 if (!rt_caching(net)) 2622 if (!rt_caching(net))
2671 goto slow_output; 2623 goto slow_output;
@@ -2695,10 +2647,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
2695 rcu_read_unlock_bh(); 2647 rcu_read_unlock_bh();
2696 2648
2697slow_output: 2649slow_output:
2698 rcu_read_lock(); 2650 return ip_route_output_slow(net, rp, flp);
2699 res = ip_route_output_slow(net, rp, flp);
2700 rcu_read_unlock();
2701 return res;
2702} 2651}
2703EXPORT_SYMBOL_GPL(__ip_route_output_key); 2652EXPORT_SYMBOL_GPL(__ip_route_output_key);
2704 2653
@@ -2731,12 +2680,11 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2731{ 2680{
2732 struct rtable *ort = *rp; 2681 struct rtable *ort = *rp;
2733 struct rtable *rt = (struct rtable *) 2682 struct rtable *rt = (struct rtable *)
2734 dst_alloc(&ipv4_dst_blackhole_ops); 2683 dst_alloc(&ipv4_dst_blackhole_ops, 1);
2735 2684
2736 if (rt) { 2685 if (rt) {
2737 struct dst_entry *new = &rt->dst; 2686 struct dst_entry *new = &rt->dst;
2738 2687
2739 atomic_set(&new->__refcnt, 1);
2740 new->__use = 1; 2688 new->__use = 1;
2741 new->input = dst_discard; 2689 new->input = dst_discard;
2742 new->output = dst_discard; 2690 new->output = dst_discard;
@@ -2759,6 +2707,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
2759 rt->peer = ort->peer; 2707 rt->peer = ort->peer;
2760 if (rt->peer) 2708 if (rt->peer)
2761 atomic_inc(&rt->peer->refcnt); 2709 atomic_inc(&rt->peer->refcnt);
2710 rt->fi = ort->fi;
2711 if (rt->fi)
2712 atomic_inc(&rt->fi->fib_clntref);
2762 2713
2763 dst_free(new); 2714 dst_free(new);
2764 } 2715 }
@@ -2835,7 +2786,7 @@ static int rt_fill_info(struct net *net,
2835 } 2786 }
2836 if (rt->dst.dev) 2787 if (rt->dst.dev)
2837 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2788 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2838#ifdef CONFIG_NET_CLS_ROUTE 2789#ifdef CONFIG_IP_ROUTE_CLASSID
2839 if (rt->dst.tclassid) 2790 if (rt->dst.tclassid)
2840 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2791 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2841#endif 2792#endif
@@ -2854,7 +2805,8 @@ static int rt_fill_info(struct net *net,
2854 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); 2805 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2855 2806
2856 error = rt->dst.error; 2807 error = rt->dst.error;
2857 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 2808 expires = (rt->peer && rt->peer->pmtu_expires) ?
2809 rt->peer->pmtu_expires - jiffies : 0;
2858 if (rt->peer) { 2810 if (rt->peer) {
2859 inet_peer_refcheck(rt->peer); 2811 inet_peer_refcheck(rt->peer);
2860 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2812 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -3256,9 +3208,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3256}; 3208};
3257 3209
3258 3210
3259#ifdef CONFIG_NET_CLS_ROUTE 3211#ifdef CONFIG_IP_ROUTE_CLASSID
3260struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3212struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3261#endif /* CONFIG_NET_CLS_ROUTE */ 3213#endif /* CONFIG_IP_ROUTE_CLASSID */
3262 3214
3263static __initdata unsigned long rhash_entries; 3215static __initdata unsigned long rhash_entries;
3264static int __init set_rhash_entries(char *str) 3216static int __init set_rhash_entries(char *str)
@@ -3274,7 +3226,7 @@ int __init ip_rt_init(void)
3274{ 3226{
3275 int rc = 0; 3227 int rc = 0;
3276 3228
3277#ifdef CONFIG_NET_CLS_ROUTE 3229#ifdef CONFIG_IP_ROUTE_CLASSID
3278 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3230 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3279 if (!ip_rt_acct) 3231 if (!ip_rt_acct)
3280 panic("IP: failed to allocate ip_rt_acct\n"); 3232 panic("IP: failed to allocate ip_rt_acct\n");
@@ -3311,14 +3263,6 @@ int __init ip_rt_init(void)
3311 devinet_init(); 3263 devinet_init();
3312 ip_fib_init(); 3264 ip_fib_init();
3313 3265
3314 /* All the timers, started at system startup tend
3315 to synchronize. Perturb it a bit.
3316 */
3317 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3318 expires_ljiffies = jiffies;
3319 schedule_delayed_work(&expires_work,
3320 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3321
3322 if (ip_rt_proc_init()) 3266 if (ip_rt_proc_init())
3323 printk(KERN_ERR "Unable to create route proc files\n"); 3267 printk(KERN_ERR "Unable to create route proc files\n");
3324#ifdef CONFIG_XFRM 3268#ifdef CONFIG_XFRM
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262c..f9867d2dbef4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2653EXPORT_SYMBOL(compat_tcp_getsockopt); 2653EXPORT_SYMBOL(compat_tcp_getsockopt);
2654#endif 2654#endif
2655 2655
2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2657{ 2657{
2658 struct sk_buff *segs = ERR_PTR(-EINVAL); 2658 struct sk_buff *segs = ERR_PTR(-EINVAL);
2659 struct tcphdr *th; 2659 struct tcphdr *th;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eb7f82ebf4a3..2f692cefd3b0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
818 818
819 if (!cwnd) 819 if (!cwnd)
820 cwnd = rfc3390_bytes_to_packets(tp->mss_cache); 820 cwnd = TCP_INIT_CWND;
821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
822} 822}
823 823
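
With no dst-supplied RTAX_INITCWND, the fallback initial congestion window becomes the fixed TCP_INIT_CWND (10 segments, following the IW10 work) instead of the RFC 3390 value derived from the MSS. For contrast, the replaced rfc3390_bytes_to_packets() sizing was roughly as follows; a sketch from memory of the era's include/net/tcp.h, not quoted from this patch:

/* RFC 3390: min(4*MSS, max(2*MSS, 4380 bytes)), expressed in packets. */
static inline __u32 rfc3390_packets(unsigned int mss)
{
        return mss <= 1095 ? 4 : (mss > 2190 ? 2 : 3);
}
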
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 02f583b3744a..e2b9be27f226 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1341,7 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1341 tcp_death_row.sysctl_tw_recycle && 1341 tcp_death_row.sysctl_tw_recycle &&
1342 (dst = inet_csk_route_req(sk, req)) != NULL && 1342 (dst = inet_csk_route_req(sk, req)) != NULL &&
1343 (peer = rt_get_peer((struct rtable *)dst)) != NULL && 1343 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1344 peer->daddr.a4 == saddr) { 1344 peer->daddr.addr.a4 == saddr) {
1345 inet_peer_refcheck(peer); 1345 inet_peer_refcheck(peer);
1346 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1346 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1347 (s32)(peer->tcp_ts - req->ts_recent) > 1347 (s32)(peer->tcp_ts - req->ts_recent) >
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959ee..d37baaa1dbe3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2199,7 +2199,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2199 return 0; 2199 return 0;
2200} 2200}
2201 2201
2202struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) 2202struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2203{ 2203{
2204 struct sk_buff *segs = ERR_PTR(-EINVAL); 2204 struct sk_buff *segs = ERR_PTR(-EINVAL);
2205 unsigned int mss; 2205 unsigned int mss;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40addec..19fbdec6baaa 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -196,8 +196,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
196{ 196{
197 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 197 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
198 198
199 dst_destroy_metrics_generic(dst);
200
199 if (likely(xdst->u.rt.peer)) 201 if (likely(xdst->u.rt.peer))
200 inet_putpeer(xdst->u.rt.peer); 202 inet_putpeer(xdst->u.rt.peer);
203
201 xfrm_dst_destroy(xdst); 204 xfrm_dst_destroy(xdst);
202} 205}
203 206
@@ -215,6 +218,7 @@ static struct dst_ops xfrm4_dst_ops = {
215 .protocol = cpu_to_be16(ETH_P_IP), 218 .protocol = cpu_to_be16(ETH_P_IP),
216 .gc = xfrm4_garbage_collect, 219 .gc = xfrm4_garbage_collect,
217 .update_pmtu = xfrm4_update_pmtu, 220 .update_pmtu = xfrm4_update_pmtu,
221 .cow_metrics = dst_cow_metrics_generic,
218 .destroy = xfrm4_dst_destroy, 222 .destroy = xfrm4_dst_destroy,
219 .ifdown = xfrm4_dst_ifdown, 223 .ifdown = xfrm4_dst_ifdown,
220 .local_out = __ip_local_out, 224 .local_out = __ip_local_out,
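
The two xfrm4 hunks above form a pair: once .cow_metrics is wired to dst_cow_metrics_generic, the destroy path must call dst_destroy_metrics_generic() so that a kmalloc'ed (COWed) metrics array is freed along with the dst. Any dst_ops opting into the generic helpers needs the same pairing, roughly (hypothetical example, not from the patch):

static void example_dst_destroy(struct dst_entry *dst)
{
        dst_destroy_metrics_generic(dst);  /* free metrics if COWed */
        /* ... protocol-specific teardown continues here ... */
}
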