path: root/net/ipv4
authorDavid S. Miller <davem@davemloft.net>2012-07-22 20:04:15 -0400
committerDavid S. Miller <davem@davemloft.net>2012-07-22 20:04:15 -0400
commit5e9965c15ba88319500284e590733f4a4629a288 (patch)
treeab76263b9f43fb75048a50141d199f445f5fdd2d /net/ipv4
parent3ba97381343b271296487bf073eb670d5465a8b8 (diff)
parent2860583fe840d972573363dfa190b2149a604534 (diff)
Merge branch 'kill_rtcache'
The ipv4 routing cache is non-deterministic, performance-wise, and is subject to reasonably easy-to-launch denial of service attacks.

The routing cache works great for well behaved traffic, and the world was a much friendlier place when the tradeoffs that led to the routing cache's design were considered.

What it boils down to is that the performance of the routing cache is a product of the traffic patterns seen by a system rather than being a product of the contents of the routing tables. The former is controllable by external entities.

Even for "well behaved" legitimate traffic, high volume sites can see hit rates in the routing cache of only ~10%.

The general flow of this patch series is that first the routing cache is removed. We build a completely new rtable entry every lookup request.

Next we make some simplifications due to the fact that removing the routing cache causes several members of struct rtable to become no longer necessary.

Then we need to make some amends such that we can legally cache pre-constructed routes in the FIB nexthops. Firstly, we need to invalidate routes which are hit with nexthop exceptions. Secondly, we have to change the semantics of rt->rt_gateway such that zero means the destination is on-link and non-zero otherwise.

Now that the preparations are ready, we start caching precomputed routes in the FIB nexthops. Output and input routes need different kinds of care when determining if we can legally do such caching or not. The details are in the commit log messages for those changes.

The patch series then winds down with some more struct rtable simplifications and other tidy ups that remove unnecessary overhead.

On a SPARC-T3, output route lookups are ~876 cycles. Input route lookups are ~1169 cycles with rpfilter disabled, and ~1468 cycles with rpfilter enabled.

These measurements were taken with the kbench_mod test module in the net_test_tools GIT tree:

    git://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git

That GIT tree also includes a udpflood tester tool that stresses route lookups on packet output. For example, on the same SPARC-T3 system we can run:

    time ./udpflood -l 10000000 10.2.2.11

with routing cache:

    real    1m21.955s
    user    0m6.530s
    sys     1m15.390s

without routing cache:

    real    1m31.678s
    user    0m6.520s
    sys     1m25.140s

Performance can undoubtedly be improved further. For example, fib_table_lookup() performs a lot of excessive computations with all the masking and shifting, some of it conditionalized to deal with edge cases. Also, Eric's no-ref optimization for input route lookups can be reinstated for the FIB nexthop caching code path. I would be really pleased if someone would work on that.

In fact, anyone suitably motivated can just fire up perf on the loading of the net_test_tools benchmark kernel module. I spend much of my time going:

    bash# perf record insmod ./kbench_mod.ko dst=172.30.42.22 src=74.128.0.1 iif=2
    bash# perf report

Thanks to helpful feedback from Joe Perches, Eric Dumazet, Ben Hutchings, and others.

Signed-off-by: David S. Miller <davem@davemloft.net>
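A note on the rt_nexthop() helper that replaces direct rt->rt_gateway reads throughout the diff below: under the new semantics it returns the gateway when one is set and otherwise falls back to the on-link destination address. The following is a minimal sketch of that behaviour, assuming only the semantics described above; the actual in-tree helper may differ in detail:

    /* Sketch: pick the next hop address under the new rt_gateway semantics.
     * A non-zero rt_gateway means packets go via a gateway; zero means the
     * destination (daddr) is directly reachable on-link.
     */
    static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
    {
    	if (rt->rt_gateway)
    		return rt->rt_gateway;
    	return daddr;
    }

These semantics are also why the strict source route checks in this series reduce from "fl4->daddr != rt->rt_gateway" to simply testing "rt->rt_gateway".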
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/arp.c                         5
-rw-r--r--  net/ipv4/fib_frontend.c                5
-rw-r--r--  net/ipv4/fib_semantics.c               4
-rw-r--r--  net/ipv4/inet_connection_sock.c        9
-rw-r--r--  net/ipv4/ip_fragment.c                 4
-rw-r--r--  net/ipv4/ip_gre.c                      2
-rw-r--r--  net/ipv4/ip_input.c                    4
-rw-r--r--  net/ipv4/ip_output.c                   2
-rw-r--r--  net/ipv4/ipip.c                        2
-rw-r--r--  net/ipv4/ipmr.c                        9
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c    5
-rw-r--r--  net/ipv4/route.c                    1329
-rw-r--r--  net/ipv4/tcp_ipv4.c                    4
-rw-r--r--  net/ipv4/xfrm4_input.c                 4
-rw-r--r--  net/ipv4/xfrm4_policy.c                9
15 files changed, 242 insertions, 1155 deletions
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2e560f0c757d..a0124eb7dbea 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -475,8 +475,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
475 return 1; 475 return 1;
476 } 476 }
477 477
478 paddr = skb_rtable(skb)->rt_gateway; 478 paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr);
479
480 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, 479 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
481 paddr, dev)) 480 paddr, dev))
482 return 0; 481 return 0;
@@ -828,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
828 } 827 }
829 828
830 if (arp->ar_op == htons(ARPOP_REQUEST) && 829 if (arp->ar_op == htons(ARPOP_REQUEST) &&
831 ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { 830 ip_route_input(skb, tip, sip, 0, dev) == 0) {
832 831
833 rt = skb_rtable(skb); 832 rt = skb_rtable(skb);
834 addr_type = rt->rt_type; 833 addr_type = rt->rt_type;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b83203658ee3..f277cf0e6321 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1072,11 +1072,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
1072 rt_cache_flush(dev_net(dev), 0); 1072 rt_cache_flush(dev_net(dev), 0);
1073 break; 1073 break;
1074 case NETDEV_UNREGISTER_BATCH: 1074 case NETDEV_UNREGISTER_BATCH:
1075 /* The batch unregister is only called on the first
1076 * device in the list of devices being unregistered.
1077 * Therefore we should not pass dev_net(dev) in here.
1078 */
1079 rt_cache_flush_batch(NULL);
1080 break; 1075 break;
1081 } 1076 }
1082 return NOTIFY_DONE; 1077 return NOTIFY_DONE;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2b57d768240d..e55171f184f9 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -171,6 +171,10 @@ static void free_fib_info_rcu(struct rcu_head *head)
171 dev_put(nexthop_nh->nh_dev); 171 dev_put(nexthop_nh->nh_dev);
172 if (nexthop_nh->nh_exceptions) 172 if (nexthop_nh->nh_exceptions)
173 free_nh_exceptions(nexthop_nh); 173 free_nh_exceptions(nexthop_nh);
174 if (nexthop_nh->nh_rth_output)
175 dst_release(&nexthop_nh->nh_rth_output->dst);
176 if (nexthop_nh->nh_rth_input)
177 dst_release(&nexthop_nh->nh_rth_input->dst);
174 } endfor_nexthops(fi); 178 } endfor_nexthops(fi);
175 179
176 release_net(fi->fib_net); 180 release_net(fi->fib_net);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c7a4de05ca04..db0cf17c00f7 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -368,8 +368,7 @@ EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
368 368
369struct dst_entry *inet_csk_route_req(struct sock *sk, 369struct dst_entry *inet_csk_route_req(struct sock *sk,
370 struct flowi4 *fl4, 370 struct flowi4 *fl4,
371 const struct request_sock *req, 371 const struct request_sock *req)
372 bool nocache)
373{ 372{
374 struct rtable *rt; 373 struct rtable *rt;
375 const struct inet_request_sock *ireq = inet_rsk(req); 374 const struct inet_request_sock *ireq = inet_rsk(req);
@@ -377,8 +376,6 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
377 struct net *net = sock_net(sk); 376 struct net *net = sock_net(sk);
378 int flags = inet_sk_flowi_flags(sk); 377 int flags = inet_sk_flowi_flags(sk);
379 378
380 if (nocache)
381 flags |= FLOWI_FLAG_RT_NOCACHE;
382 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 379 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
383 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 380 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
384 sk->sk_protocol, 381 sk->sk_protocol,
@@ -389,7 +386,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
389 rt = ip_route_output_flow(net, fl4, sk); 386 rt = ip_route_output_flow(net, fl4, sk);
390 if (IS_ERR(rt)) 387 if (IS_ERR(rt))
391 goto no_route; 388 goto no_route;
392 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 389 if (opt && opt->opt.is_strictroute && rt->rt_gateway)
393 goto route_err; 390 goto route_err;
394 return &rt->dst; 391 return &rt->dst;
395 392
@@ -422,7 +419,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
422 rt = ip_route_output_flow(net, fl4, sk); 419 rt = ip_route_output_flow(net, fl4, sk);
423 if (IS_ERR(rt)) 420 if (IS_ERR(rt))
424 goto no_route; 421 goto no_route;
425 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 422 if (opt && opt->opt.is_strictroute && rt->rt_gateway)
426 goto route_err; 423 goto route_err;
427 return &rt->dst; 424 return &rt->dst;
428 425
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8d07c973409c..7ad88e5e7110 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
258 /* skb dst is stale, drop it, and perform route lookup again */ 258 /* skb dst is stale, drop it, and perform route lookup again */
259 skb_dst_drop(head); 259 skb_dst_drop(head);
260 iph = ip_hdr(head); 260 iph = ip_hdr(head);
261 err = ip_route_input_noref(head, iph->daddr, iph->saddr, 261 err = ip_route_input(head, iph->daddr, iph->saddr,
262 iph->tos, head->dev); 262 iph->tos, head->dev);
263 if (err) 263 if (err)
264 goto out_rcu_unlock; 264 goto out_rcu_unlock;
265 265
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 42c44b1403c9..b062a98574f2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -766,7 +766,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
766 766
767 if (skb->protocol == htons(ETH_P_IP)) { 767 if (skb->protocol == htons(ETH_P_IP)) {
768 rt = skb_rtable(skb); 768 rt = skb_rtable(skb);
769 dst = rt->rt_gateway; 769 dst = rt_nexthop(rt, old_iph->daddr);
770 } 770 }
771#if IS_ENABLED(CONFIG_IPV6) 771#if IS_ENABLED(CONFIG_IPV6)
772 else if (skb->protocol == htons(ETH_P_IPV6)) { 772 else if (skb->protocol == htons(ETH_P_IPV6)) {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b27d4440f523..4ebc6feee250 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -336,8 +336,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
336 * how the packet travels inside Linux networking. 336 * how the packet travels inside Linux networking.
337 */ 337 */
338 if (!skb_dst(skb)) { 338 if (!skb_dst(skb)) {
339 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 339 int err = ip_route_input(skb, iph->daddr, iph->saddr,
340 iph->tos, skb->dev); 340 iph->tos, skb->dev);
341 if (unlikely(err)) { 341 if (unlikely(err)) {
342 if (err == -EXDEV) 342 if (err == -EXDEV)
343 NET_INC_STATS_BH(dev_net(skb->dev), 343 NET_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 665abbb7122a..ba39a52d18c1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
371 skb_dst_set_noref(skb, &rt->dst); 371 skb_dst_set_noref(skb, &rt->dst);
372 372
373packet_routed: 373packet_routed:
374 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway) 374 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
375 goto no_route; 375 goto no_route;
376 376
377 /* OK, we know where to send it, allocate and build IP header. */ 377 /* OK, we know where to send it, allocate and build IP header. */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2c2c35bace76..99af1f0cc658 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -487,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
487 dev->stats.tx_fifo_errors++; 487 dev->stats.tx_fifo_errors++;
488 goto tx_error; 488 goto tx_error;
489 } 489 }
490 dst = rt->rt_gateway; 490 dst = rt_nexthop(rt, old_iph->daddr);
491 } 491 }
492 492
493 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, 493 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5716c6b808d6..8eec8f4a0536 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1795,9 +1795,12 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1795 .daddr = iph->daddr, 1795 .daddr = iph->daddr,
1796 .saddr = iph->saddr, 1796 .saddr = iph->saddr,
1797 .flowi4_tos = RT_TOS(iph->tos), 1797 .flowi4_tos = RT_TOS(iph->tos),
1798 .flowi4_oif = rt->rt_oif, 1798 .flowi4_oif = (rt_is_output_route(rt) ?
1799 .flowi4_iif = rt->rt_iif, 1799 skb->dev->ifindex : 0),
1800 .flowi4_mark = rt->rt_mark, 1800 .flowi4_iif = (rt_is_output_route(rt) ?
1801 net->loopback_dev->ifindex :
1802 skb->dev->ifindex),
1803 .flowi4_mark = skb->mark,
1801 }; 1804 };
1802 struct mr_table *mrt; 1805 struct mr_table *mrt;
1803 int err; 1806 int err;
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 2f210c79dc87..cbb6a1a6f6f7 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -52,7 +52,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
52 struct nf_nat_ipv4_range newrange; 52 struct nf_nat_ipv4_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_ipv4_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc; 55 __be32 newsrc, nh;
56 56
57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
58 58
@@ -70,7 +70,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
70 70
71 mr = par->targinfo; 71 mr = par->targinfo;
72 rt = skb_rtable(skb); 72 rt = skb_rtable(skb);
73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); 73 nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
74 newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
74 if (!newsrc) { 75 if (!newsrc) {
75 pr_info("%s ate my IP address\n", par->out->name); 76 pr_info("%s ate my IP address\n", par->out->name);
76 return NF_DROP; 77 return NF_DROP;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d547f6fae20d..9add08869c75 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256; 135static int ip_rt_min_advmss __read_mostly = 256;
136static int rt_chain_length_max __read_mostly = 20;
137
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140 136
141/* 137/*
142 * Interface to generic destination cache. 138 * Interface to generic destination cache.
@@ -145,14 +141,12 @@ static unsigned long expires_ljiffies;
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146static unsigned int ipv4_default_advmss(const struct dst_entry *dst); 142static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
147static unsigned int ipv4_mtu(const struct dst_entry *dst); 143static unsigned int ipv4_mtu(const struct dst_entry *dst);
148static void ipv4_dst_destroy(struct dst_entry *dst);
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb); 145static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 146static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
152 struct sk_buff *skb, u32 mtu); 147 struct sk_buff *skb, u32 mtu);
153static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, 148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
154 struct sk_buff *skb); 149 struct sk_buff *skb);
155static int rt_garbage_collect(struct dst_ops *ops);
156 150
157static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 151static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
158 int how) 152 int how)
@@ -172,12 +166,10 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
172static struct dst_ops ipv4_dst_ops = { 166static struct dst_ops ipv4_dst_ops = {
173 .family = AF_INET, 167 .family = AF_INET,
174 .protocol = cpu_to_be16(ETH_P_IP), 168 .protocol = cpu_to_be16(ETH_P_IP),
175 .gc = rt_garbage_collect,
176 .check = ipv4_dst_check, 169 .check = ipv4_dst_check,
177 .default_advmss = ipv4_default_advmss, 170 .default_advmss = ipv4_default_advmss,
178 .mtu = ipv4_mtu, 171 .mtu = ipv4_mtu,
179 .cow_metrics = ipv4_cow_metrics, 172 .cow_metrics = ipv4_cow_metrics,
180 .destroy = ipv4_dst_destroy,
181 .ifdown = ipv4_dst_ifdown, 173 .ifdown = ipv4_dst_ifdown,
182 .negative_advice = ipv4_negative_advice, 174 .negative_advice = ipv4_negative_advice,
183 .link_failure = ipv4_link_failure, 175 .link_failure = ipv4_link_failure,
@@ -209,184 +201,30 @@ const __u8 ip_tos2prio[16] = {
209}; 201};
210EXPORT_SYMBOL(ip_tos2prio); 202EXPORT_SYMBOL(ip_tos2prio);
211 203
212/*
213 * Route cache.
214 */
215
216/* The locking scheme is rather straight forward:
217 *
218 * 1) Read-Copy Update protects the buckets of the central route hash.
219 * 2) Only writers remove entries, and they hold the lock
220 * as they look at rtable reference counts.
221 * 3) Only readers acquire references to rtable entries,
222 * they do so with atomic increments and with the
223 * lock held.
224 */
225
226struct rt_hash_bucket {
227 struct rtable __rcu *chain;
228};
229
230#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
231 defined(CONFIG_PROVE_LOCKING)
232/*
233 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
234 * The size of this table is a power of two and depends on the number of CPUS.
235 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
236 */
237#ifdef CONFIG_LOCKDEP
238# define RT_HASH_LOCK_SZ 256
239#else
240# if NR_CPUS >= 32
241# define RT_HASH_LOCK_SZ 4096
242# elif NR_CPUS >= 16
243# define RT_HASH_LOCK_SZ 2048
244# elif NR_CPUS >= 8
245# define RT_HASH_LOCK_SZ 1024
246# elif NR_CPUS >= 4
247# define RT_HASH_LOCK_SZ 512
248# else
249# define RT_HASH_LOCK_SZ 256
250# endif
251#endif
252
253static spinlock_t *rt_hash_locks;
254# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
255
256static __init void rt_hash_lock_init(void)
257{
258 int i;
259
260 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
261 GFP_KERNEL);
262 if (!rt_hash_locks)
263 panic("IP: failed to allocate rt_hash_locks\n");
264
265 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
266 spin_lock_init(&rt_hash_locks[i]);
267}
268#else
269# define rt_hash_lock_addr(slot) NULL
270
271static inline void rt_hash_lock_init(void)
272{
273}
274#endif
275
276static struct rt_hash_bucket *rt_hash_table __read_mostly;
277static unsigned int rt_hash_mask __read_mostly;
278static unsigned int rt_hash_log __read_mostly;
279
280static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 204static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
281#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 205#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
282 206
283static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
284 int genid)
285{
286 return jhash_3words((__force u32)daddr, (__force u32)saddr,
287 idx, genid)
288 & rt_hash_mask;
289}
290
291static inline int rt_genid(struct net *net) 207static inline int rt_genid(struct net *net)
292{ 208{
293 return atomic_read(&net->ipv4.rt_genid); 209 return atomic_read(&net->ipv4.rt_genid);
294} 210}
295 211
296#ifdef CONFIG_PROC_FS 212#ifdef CONFIG_PROC_FS
297struct rt_cache_iter_state {
298 struct seq_net_private p;
299 int bucket;
300 int genid;
301};
302
303static struct rtable *rt_cache_get_first(struct seq_file *seq)
304{
305 struct rt_cache_iter_state *st = seq->private;
306 struct rtable *r = NULL;
307
308 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
309 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
310 continue;
311 rcu_read_lock_bh();
312 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
313 while (r) {
314 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
315 r->rt_genid == st->genid)
316 return r;
317 r = rcu_dereference_bh(r->dst.rt_next);
318 }
319 rcu_read_unlock_bh();
320 }
321 return r;
322}
323
324static struct rtable *__rt_cache_get_next(struct seq_file *seq,
325 struct rtable *r)
326{
327 struct rt_cache_iter_state *st = seq->private;
328
329 r = rcu_dereference_bh(r->dst.rt_next);
330 while (!r) {
331 rcu_read_unlock_bh();
332 do {
333 if (--st->bucket < 0)
334 return NULL;
335 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
336 rcu_read_lock_bh();
337 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
338 }
339 return r;
340}
341
342static struct rtable *rt_cache_get_next(struct seq_file *seq,
343 struct rtable *r)
344{
345 struct rt_cache_iter_state *st = seq->private;
346 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
347 if (dev_net(r->dst.dev) != seq_file_net(seq))
348 continue;
349 if (r->rt_genid == st->genid)
350 break;
351 }
352 return r;
353}
354
355static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
356{
357 struct rtable *r = rt_cache_get_first(seq);
358
359 if (r)
360 while (pos && (r = rt_cache_get_next(seq, r)))
361 --pos;
362 return pos ? NULL : r;
363}
364
365static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 213static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
366{ 214{
367 struct rt_cache_iter_state *st = seq->private;
368 if (*pos) 215 if (*pos)
369 return rt_cache_get_idx(seq, *pos - 1); 216 return NULL;
370 st->genid = rt_genid(seq_file_net(seq));
371 return SEQ_START_TOKEN; 217 return SEQ_START_TOKEN;
372} 218}
373 219
374static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 220static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
375{ 221{
376 struct rtable *r;
377
378 if (v == SEQ_START_TOKEN)
379 r = rt_cache_get_first(seq);
380 else
381 r = rt_cache_get_next(seq, v);
382 ++*pos; 222 ++*pos;
383 return r; 223 return NULL;
384} 224}
385 225
386static void rt_cache_seq_stop(struct seq_file *seq, void *v) 226static void rt_cache_seq_stop(struct seq_file *seq, void *v)
387{ 227{
388 if (v && v != SEQ_START_TOKEN)
389 rcu_read_unlock_bh();
390} 228}
391 229
392static int rt_cache_seq_show(struct seq_file *seq, void *v) 230static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -396,24 +234,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
396 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 234 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
397 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 235 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
398 "HHUptod\tSpecDst"); 236 "HHUptod\tSpecDst");
399 else {
400 struct rtable *r = v;
401 int len;
402
403 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
404 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
405 r->dst.dev ? r->dst.dev->name : "*",
406 (__force u32)r->rt_dst,
407 (__force u32)r->rt_gateway,
408 r->rt_flags, atomic_read(&r->dst.__refcnt),
409 r->dst.__use, 0, (__force u32)r->rt_src,
410 dst_metric_advmss(&r->dst) + 40,
411 dst_metric(&r->dst, RTAX_WINDOW), 0,
412 r->rt_key_tos,
413 -1, 0, 0, &len);
414
415 seq_printf(seq, "%*s\n", 127 - len, "");
416 }
417 return 0; 237 return 0;
418} 238}
419 239
@@ -426,8 +246,7 @@ static const struct seq_operations rt_cache_seq_ops = {
426 246
427static int rt_cache_seq_open(struct inode *inode, struct file *file) 247static int rt_cache_seq_open(struct inode *inode, struct file *file)
428{ 248{
429 return seq_open_net(inode, file, &rt_cache_seq_ops, 249 return seq_open(file, &rt_cache_seq_ops);
430 sizeof(struct rt_cache_iter_state));
431} 250}
432 251
433static const struct file_operations rt_cache_seq_fops = { 252static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +254,7 @@ static const struct file_operations rt_cache_seq_fops = {
435 .open = rt_cache_seq_open, 254 .open = rt_cache_seq_open,
436 .read = seq_read, 255 .read = seq_read,
437 .llseek = seq_lseek, 256 .llseek = seq_lseek,
438 .release = seq_release_net, 257 .release = seq_release,
439}; 258};
440 259
441 260
@@ -625,263 +444,12 @@ static inline int ip_rt_proc_init(void)
625} 444}
626#endif /* CONFIG_PROC_FS */ 445#endif /* CONFIG_PROC_FS */
627 446
628static inline void rt_free(struct rtable *rt)
629{
630 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
631}
632
633static inline void rt_drop(struct rtable *rt)
634{
635 ip_rt_put(rt);
636 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
637}
638
639static inline int rt_fast_clean(struct rtable *rth)
640{
641 /* Kill broadcast/multicast entries very aggresively, if they
642 collide in hash table with more useful entries */
643 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
644 rt_is_input_route(rth) && rth->dst.rt_next;
645}
646
647static inline int rt_valuable(struct rtable *rth)
648{
649 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
650 rth->dst.expires;
651}
652
653static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
654{
655 unsigned long age;
656 int ret = 0;
657
658 if (atomic_read(&rth->dst.__refcnt))
659 goto out;
660
661 age = jiffies - rth->dst.lastuse;
662 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
663 (age <= tmo2 && rt_valuable(rth)))
664 goto out;
665 ret = 1;
666out: return ret;
667}
668
669/* Bits of score are:
670 * 31: very valuable
671 * 30: not quite useless
672 * 29..0: usage counter
673 */
674static inline u32 rt_score(struct rtable *rt)
675{
676 u32 score = jiffies - rt->dst.lastuse;
677
678 score = ~score & ~(3<<30);
679
680 if (rt_valuable(rt))
681 score |= (1<<31);
682
683 if (rt_is_output_route(rt) ||
684 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
685 score |= (1<<30);
686
687 return score;
688}
689
690static inline bool rt_caching(const struct net *net)
691{
692 return net->ipv4.current_rt_cache_rebuild_count <=
693 net->ipv4.sysctl_rt_cache_rebuild_count;
694}
695
696static inline bool compare_hash_inputs(const struct rtable *rt1,
697 const struct rtable *rt2)
698{
699 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
700 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
701 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
702}
703
704static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
705{
706 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
707 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
708 (rt1->rt_mark ^ rt2->rt_mark) |
709 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
710 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
711 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
712}
713
714static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
715{
716 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
717}
718
719static inline int rt_is_expired(struct rtable *rth) 447static inline int rt_is_expired(struct rtable *rth)
720{ 448{
721 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 449 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
722} 450}
723 451
724/* 452/*
725 * Perform a full scan of hash table and free all entries.
726 * Can be called by a softirq or a process.
727 * In the later case, we want to be reschedule if necessary
728 */
729static void rt_do_flush(struct net *net, int process_context)
730{
731 unsigned int i;
732 struct rtable *rth, *next;
733
734 for (i = 0; i <= rt_hash_mask; i++) {
735 struct rtable __rcu **pprev;
736 struct rtable *list;
737
738 if (process_context && need_resched())
739 cond_resched();
740 rth = rcu_access_pointer(rt_hash_table[i].chain);
741 if (!rth)
742 continue;
743
744 spin_lock_bh(rt_hash_lock_addr(i));
745
746 list = NULL;
747 pprev = &rt_hash_table[i].chain;
748 rth = rcu_dereference_protected(*pprev,
749 lockdep_is_held(rt_hash_lock_addr(i)));
750
751 while (rth) {
752 next = rcu_dereference_protected(rth->dst.rt_next,
753 lockdep_is_held(rt_hash_lock_addr(i)));
754
755 if (!net ||
756 net_eq(dev_net(rth->dst.dev), net)) {
757 rcu_assign_pointer(*pprev, next);
758 rcu_assign_pointer(rth->dst.rt_next, list);
759 list = rth;
760 } else {
761 pprev = &rth->dst.rt_next;
762 }
763 rth = next;
764 }
765
766 spin_unlock_bh(rt_hash_lock_addr(i));
767
768 for (; list; list = next) {
769 next = rcu_dereference_protected(list->dst.rt_next, 1);
770 rt_free(list);
771 }
772 }
773}
774
775/*
776 * While freeing expired entries, we compute average chain length
777 * and standard deviation, using fixed-point arithmetic.
778 * This to have an estimation of rt_chain_length_max
779 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
780 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
781 */
782
783#define FRACT_BITS 3
784#define ONE (1UL << FRACT_BITS)
785
786/*
787 * Given a hash chain and an item in this hash chain,
788 * find if a previous entry has the same hash_inputs
789 * (but differs on tos, mark or oif)
790 * Returns 0 if an alias is found.
791 * Returns ONE if rth has no alias before itself.
792 */
793static int has_noalias(const struct rtable *head, const struct rtable *rth)
794{
795 const struct rtable *aux = head;
796
797 while (aux != rth) {
798 if (compare_hash_inputs(aux, rth))
799 return 0;
800 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
801 }
802 return ONE;
803}
804
805static void rt_check_expire(void)
806{
807 static unsigned int rover;
808 unsigned int i = rover, goal;
809 struct rtable *rth;
810 struct rtable __rcu **rthp;
811 unsigned long samples = 0;
812 unsigned long sum = 0, sum2 = 0;
813 unsigned long delta;
814 u64 mult;
815
816 delta = jiffies - expires_ljiffies;
817 expires_ljiffies = jiffies;
818 mult = ((u64)delta) << rt_hash_log;
819 if (ip_rt_gc_timeout > 1)
820 do_div(mult, ip_rt_gc_timeout);
821 goal = (unsigned int)mult;
822 if (goal > rt_hash_mask)
823 goal = rt_hash_mask + 1;
824 for (; goal > 0; goal--) {
825 unsigned long tmo = ip_rt_gc_timeout;
826 unsigned long length;
827
828 i = (i + 1) & rt_hash_mask;
829 rthp = &rt_hash_table[i].chain;
830
831 if (need_resched())
832 cond_resched();
833
834 samples++;
835
836 if (rcu_dereference_raw(*rthp) == NULL)
837 continue;
838 length = 0;
839 spin_lock_bh(rt_hash_lock_addr(i));
840 while ((rth = rcu_dereference_protected(*rthp,
841 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
842 prefetch(rth->dst.rt_next);
843 if (rt_is_expired(rth) ||
844 rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
845 *rthp = rth->dst.rt_next;
846 rt_free(rth);
847 continue;
848 }
849
850 /* We only count entries on a chain with equal
851 * hash inputs once so that entries for
852 * different QOS levels, and other non-hash
853 * input attributes don't unfairly skew the
854 * length computation
855 */
856 tmo >>= 1;
857 rthp = &rth->dst.rt_next;
858 length += has_noalias(rt_hash_table[i].chain, rth);
859 }
860 spin_unlock_bh(rt_hash_lock_addr(i));
861 sum += length;
862 sum2 += length*length;
863 }
864 if (samples) {
865 unsigned long avg = sum / samples;
866 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
867 rt_chain_length_max = max_t(unsigned long,
868 ip_rt_gc_elasticity,
869 (avg + 4*sd) >> FRACT_BITS);
870 }
871 rover = i;
872}
873
874/*
875 * rt_worker_func() is run in process context.
876 * we call rt_check_expire() to scan part of the hash table
877 */
878static void rt_worker_func(struct work_struct *work)
879{
880 rt_check_expire();
881 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
882}
883
884/*
885 * Perturbation of rt_genid by a small quantity [1..256] 453 * Perturbation of rt_genid by a small quantity [1..256]
886 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() 454 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
887 * many times (2^24) without giving recent rt_genid. 455 * many times (2^24) without giving recent rt_genid.
@@ -902,167 +470,6 @@ static void rt_cache_invalidate(struct net *net)
902void rt_cache_flush(struct net *net, int delay) 470void rt_cache_flush(struct net *net, int delay)
903{ 471{
904 rt_cache_invalidate(net); 472 rt_cache_invalidate(net);
905 if (delay >= 0)
906 rt_do_flush(net, !in_softirq());
907}
908
909/* Flush previous cache invalidated entries from the cache */
910void rt_cache_flush_batch(struct net *net)
911{
912 rt_do_flush(net, !in_softirq());
913}
914
915static void rt_emergency_hash_rebuild(struct net *net)
916{
917 net_warn_ratelimited("Route hash chain too long!\n");
918 rt_cache_invalidate(net);
919}
920
921/*
922 Short description of GC goals.
923
924 We want to build algorithm, which will keep routing cache
925 at some equilibrium point, when number of aged off entries
926 is kept approximately equal to newly generated ones.
927
928 Current expiration strength is variable "expire".
929 We try to adjust it dynamically, so that if networking
930 is idle expires is large enough to keep enough of warm entries,
931 and when load increases it reduces to limit cache size.
932 */
933
934static int rt_garbage_collect(struct dst_ops *ops)
935{
936 static unsigned long expire = RT_GC_TIMEOUT;
937 static unsigned long last_gc;
938 static int rover;
939 static int equilibrium;
940 struct rtable *rth;
941 struct rtable __rcu **rthp;
942 unsigned long now = jiffies;
943 int goal;
944 int entries = dst_entries_get_fast(&ipv4_dst_ops);
945
946 /*
947 * Garbage collection is pretty expensive,
948 * do not make it too frequently.
949 */
950
951 RT_CACHE_STAT_INC(gc_total);
952
953 if (now - last_gc < ip_rt_gc_min_interval &&
954 entries < ip_rt_max_size) {
955 RT_CACHE_STAT_INC(gc_ignored);
956 goto out;
957 }
958
959 entries = dst_entries_get_slow(&ipv4_dst_ops);
960 /* Calculate number of entries, which we want to expire now. */
961 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
962 if (goal <= 0) {
963 if (equilibrium < ipv4_dst_ops.gc_thresh)
964 equilibrium = ipv4_dst_ops.gc_thresh;
965 goal = entries - equilibrium;
966 if (goal > 0) {
967 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
968 goal = entries - equilibrium;
969 }
970 } else {
971 /* We are in dangerous area. Try to reduce cache really
972 * aggressively.
973 */
974 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
975 equilibrium = entries - goal;
976 }
977
978 if (now - last_gc >= ip_rt_gc_min_interval)
979 last_gc = now;
980
981 if (goal <= 0) {
982 equilibrium += goal;
983 goto work_done;
984 }
985
986 do {
987 int i, k;
988
989 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
990 unsigned long tmo = expire;
991
992 k = (k + 1) & rt_hash_mask;
993 rthp = &rt_hash_table[k].chain;
994 spin_lock_bh(rt_hash_lock_addr(k));
995 while ((rth = rcu_dereference_protected(*rthp,
996 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
997 if (!rt_is_expired(rth) &&
998 !rt_may_expire(rth, tmo, expire)) {
999 tmo >>= 1;
1000 rthp = &rth->dst.rt_next;
1001 continue;
1002 }
1003 *rthp = rth->dst.rt_next;
1004 rt_free(rth);
1005 goal--;
1006 }
1007 spin_unlock_bh(rt_hash_lock_addr(k));
1008 if (goal <= 0)
1009 break;
1010 }
1011 rover = k;
1012
1013 if (goal <= 0)
1014 goto work_done;
1015
1016 /* Goal is not achieved. We stop process if:
1017
1018 - if expire reduced to zero. Otherwise, expire is halfed.
1019 - if table is not full.
1020 - if we are called from interrupt.
1021 - jiffies check is just fallback/debug loop breaker.
1022 We will not spin here for long time in any case.
1023 */
1024
1025 RT_CACHE_STAT_INC(gc_goal_miss);
1026
1027 if (expire == 0)
1028 break;
1029
1030 expire >>= 1;
1031
1032 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1033 goto out;
1034 } while (!in_softirq() && time_before_eq(jiffies, now));
1035
1036 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1037 goto out;
1038 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1039 goto out;
1040 net_warn_ratelimited("dst cache overflow\n");
1041 RT_CACHE_STAT_INC(gc_dst_overflow);
1042 return 1;
1043
1044work_done:
1045 expire += ip_rt_gc_min_interval;
1046 if (expire > ip_rt_gc_timeout ||
1047 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1048 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1049 expire = ip_rt_gc_timeout;
1050out: return 0;
1051}
1052
1053/*
1054 * Returns number of entries in a hash chain that have different hash_inputs
1055 */
1056static int slow_chain_length(const struct rtable *head)
1057{
1058 int length = 0;
1059 const struct rtable *rth = head;
1060
1061 while (rth) {
1062 length += has_noalias(head, rth);
1063 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1064 }
1065 return length >> FRACT_BITS;
1066} 473}
1067 474
1068static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 475static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1086,139 +493,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
1086 return neigh_create(&arp_tbl, pkey, dev); 493 return neigh_create(&arp_tbl, pkey, dev);
1087} 494}
1088 495
1089static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1090 struct sk_buff *skb, int ifindex)
1091{
1092 struct rtable *rth, *cand;
1093 struct rtable __rcu **rthp, **candp;
1094 unsigned long now;
1095 u32 min_score;
1096 int chain_length;
1097
1098restart:
1099 chain_length = 0;
1100 min_score = ~(u32)0;
1101 cand = NULL;
1102 candp = NULL;
1103 now = jiffies;
1104
1105 if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
1106 /*
1107 * If we're not caching, just tell the caller we
1108 * were successful and don't touch the route. The
1109 * caller hold the sole reference to the cache entry, and
1110 * it will be released when the caller is done with it.
1111 * If we drop it here, the callers have no way to resolve routes
1112 * when we're not caching. Instead, just point *rp at rt, so
1113 * the caller gets a single use out of the route
1114 * Note that we do rt_free on this new route entry, so that
1115 * once its refcount hits zero, we are still able to reap it
1116 * (Thanks Alexey)
1117 * Note: To avoid expensive rcu stuff for this uncached dst,
1118 * we set DST_NOCACHE so that dst_release() can free dst without
1119 * waiting a grace period.
1120 */
1121
1122 rt->dst.flags |= DST_NOCACHE;
1123 goto skip_hashing;
1124 }
1125
1126 rthp = &rt_hash_table[hash].chain;
1127
1128 spin_lock_bh(rt_hash_lock_addr(hash));
1129 while ((rth = rcu_dereference_protected(*rthp,
1130 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1131 if (rt_is_expired(rth)) {
1132 *rthp = rth->dst.rt_next;
1133 rt_free(rth);
1134 continue;
1135 }
1136 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1137 /* Put it first */
1138 *rthp = rth->dst.rt_next;
1139 /*
1140 * Since lookup is lockfree, the deletion
1141 * must be visible to another weakly ordered CPU before
1142 * the insertion at the start of the hash chain.
1143 */
1144 rcu_assign_pointer(rth->dst.rt_next,
1145 rt_hash_table[hash].chain);
1146 /*
1147 * Since lookup is lockfree, the update writes
1148 * must be ordered for consistency on SMP.
1149 */
1150 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1151
1152 dst_use(&rth->dst, now);
1153 spin_unlock_bh(rt_hash_lock_addr(hash));
1154
1155 rt_drop(rt);
1156 if (skb)
1157 skb_dst_set(skb, &rth->dst);
1158 return rth;
1159 }
1160
1161 if (!atomic_read(&rth->dst.__refcnt)) {
1162 u32 score = rt_score(rth);
1163
1164 if (score <= min_score) {
1165 cand = rth;
1166 candp = rthp;
1167 min_score = score;
1168 }
1169 }
1170
1171 chain_length++;
1172
1173 rthp = &rth->dst.rt_next;
1174 }
1175
1176 if (cand) {
1177 /* ip_rt_gc_elasticity used to be average length of chain
1178 * length, when exceeded gc becomes really aggressive.
1179 *
1180 * The second limit is less certain. At the moment it allows
1181 * only 2 entries per bucket. We will see.
1182 */
1183 if (chain_length > ip_rt_gc_elasticity) {
1184 *candp = cand->dst.rt_next;
1185 rt_free(cand);
1186 }
1187 } else {
1188 if (chain_length > rt_chain_length_max &&
1189 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1190 struct net *net = dev_net(rt->dst.dev);
1191 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1192 if (!rt_caching(net)) {
1193 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1194 rt->dst.dev->name, num);
1195 }
1196 rt_emergency_hash_rebuild(net);
1197 spin_unlock_bh(rt_hash_lock_addr(hash));
1198
1199 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1200 ifindex, rt_genid(net));
1201 goto restart;
1202 }
1203 }
1204
1205 rt->dst.rt_next = rt_hash_table[hash].chain;
1206
1207 /*
1208 * Since lookup is lockfree, we must make sure
1209 * previous writes to rt are committed to memory
1210 * before making rt visible to other CPUS.
1211 */
1212 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1213
1214 spin_unlock_bh(rt_hash_lock_addr(hash));
1215
1216skip_hashing:
1217 if (skb)
1218 skb_dst_set(skb, &rt->dst);
1219 return rt;
1220}
1221
1222/* 496/*
1223 * Peer allocation may fail only in serious out-of-memory conditions. However 497 * Peer allocation may fail only in serious out-of-memory conditions. However
1224 * we still can generate some output. 498 * we still can generate some output.
@@ -1255,26 +529,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1255} 529}
1256EXPORT_SYMBOL(__ip_select_ident); 530EXPORT_SYMBOL(__ip_select_ident);
1257 531
1258static void rt_del(unsigned int hash, struct rtable *rt)
1259{
1260 struct rtable __rcu **rthp;
1261 struct rtable *aux;
1262
1263 rthp = &rt_hash_table[hash].chain;
1264 spin_lock_bh(rt_hash_lock_addr(hash));
1265 ip_rt_put(rt);
1266 while ((aux = rcu_dereference_protected(*rthp,
1267 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1268 if (aux == rt || rt_is_expired(aux)) {
1269 *rthp = aux->dst.rt_next;
1270 rt_free(aux);
1271 continue;
1272 }
1273 rthp = &aux->dst.rt_next;
1274 }
1275 spin_unlock_bh(rt_hash_lock_addr(hash));
1276}
1277
1278static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, 532static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
1279 const struct iphdr *iph, 533 const struct iphdr *iph,
1280 int oif, u8 tos, 534 int oif, u8 tos,
@@ -1417,7 +671,8 @@ out_unlock:
1417 return; 671 return;
1418} 672}
1419 673
1420static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4) 674static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
675 bool kill_route)
1421{ 676{
1422 __be32 new_gw = icmp_hdr(skb)->un.gateway; 677 __be32 new_gw = icmp_hdr(skb)->un.gateway;
1423 __be32 old_gw = ip_hdr(skb)->saddr; 678 __be32 old_gw = ip_hdr(skb)->saddr;
@@ -1472,8 +727,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
1472 update_or_create_fnhe(nh, fl4->daddr, new_gw, 727 update_or_create_fnhe(nh, fl4->daddr, new_gw,
1473 0, 0); 728 0, 0);
1474 } 729 }
1475 rt->rt_gateway = new_gw; 730 if (kill_route)
1476 rt->rt_flags |= RTCF_REDIRECTED; 731 rt->dst.obsolete = DST_OBSOLETE_KILL;
1477 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); 732 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1478 } 733 }
1479 neigh_release(n); 734 neigh_release(n);
@@ -1504,7 +759,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
1504 rt = (struct rtable *) dst; 759 rt = (struct rtable *) dst;
1505 760
1506 ip_rt_build_flow_key(&fl4, sk, skb); 761 ip_rt_build_flow_key(&fl4, sk, skb);
1507 __ip_do_redirect(rt, skb, &fl4); 762 __ip_do_redirect(rt, skb, &fl4, true);
1508} 763}
1509 764
1510static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 765static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1518,10 +773,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1518 ret = NULL; 773 ret = NULL;
1519 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 774 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1520 rt->dst.expires) { 775 rt->dst.expires) {
1521 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 776 ip_rt_put(rt);
1522 rt->rt_oif,
1523 rt_genid(dev_net(dst->dev)));
1524 rt_del(hash, rt);
1525 ret = NULL; 777 ret = NULL;
1526 } 778 }
1527 } 779 }
@@ -1597,7 +849,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
1597 peer->rate_tokens == ip_rt_redirect_number) 849 peer->rate_tokens == ip_rt_redirect_number)
1598 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", 850 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1599 &ip_hdr(skb)->saddr, rt->rt_iif, 851 &ip_hdr(skb)->saddr, rt->rt_iif,
1600 &rt->rt_dst, &rt->rt_gateway); 852 &ip_hdr(skb)->daddr, &rt->rt_gateway);
1601#endif 853#endif
1602 } 854 }
1603out_put_peer: 855out_put_peer:
@@ -1666,7 +918,7 @@ out: kfree_skb(skb);
1666 return 0; 918 return 0;
1667} 919}
1668 920
1669static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) 921static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1670{ 922{
1671 struct fib_result res; 923 struct fib_result res;
1672 924
@@ -1679,8 +931,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1679 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, 931 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1680 jiffies + ip_rt_mtu_expires); 932 jiffies + ip_rt_mtu_expires);
1681 } 933 }
1682 rt->rt_pmtu = mtu; 934 return mtu;
1683 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
1684} 935}
1685 936
1686static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 937static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -1690,7 +941,14 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1690 struct flowi4 fl4; 941 struct flowi4 fl4;
1691 942
1692 ip_rt_build_flow_key(&fl4, sk, skb); 943 ip_rt_build_flow_key(&fl4, sk, skb);
1693 __ip_rt_update_pmtu(rt, &fl4, mtu); 944 mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
945
946 if (!rt->rt_pmtu) {
947 dst->obsolete = DST_OBSOLETE_KILL;
948 } else {
949 rt->rt_pmtu = mtu;
950 dst_set_expires(&rt->dst, ip_rt_mtu_expires);
951 }
1694} 952}
1695 953
1696void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 954void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -1736,7 +994,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
1736 RT_TOS(iph->tos), protocol, mark, flow_flags); 994 RT_TOS(iph->tos), protocol, mark, flow_flags);
1737 rt = __ip_route_output_key(net, &fl4); 995 rt = __ip_route_output_key(net, &fl4);
1738 if (!IS_ERR(rt)) { 996 if (!IS_ERR(rt)) {
1739 __ip_do_redirect(rt, skb, &fl4); 997 __ip_do_redirect(rt, skb, &fl4, false);
1740 ip_rt_put(rt); 998 ip_rt_put(rt);
1741 } 999 }
1742} 1000}
@@ -1751,7 +1009,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1751 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1009 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1752 rt = __ip_route_output_key(sock_net(sk), &fl4); 1010 rt = __ip_route_output_key(sock_net(sk), &fl4);
1753 if (!IS_ERR(rt)) { 1011 if (!IS_ERR(rt)) {
1754 __ip_do_redirect(rt, skb, &fl4); 1012 __ip_do_redirect(rt, skb, &fl4, false);
1755 ip_rt_put(rt); 1013 ip_rt_put(rt);
1756 } 1014 }
1757} 1015}
@@ -1761,22 +1019,19 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1761{ 1019{
1762 struct rtable *rt = (struct rtable *) dst; 1020 struct rtable *rt = (struct rtable *) dst;
1763 1021
1764 if (rt_is_expired(rt)) 1022 /* All IPV4 dsts are created with ->obsolete set to the value
1023 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1024 * into this function always.
1025 *
1026 * When a PMTU/redirect information update invalidates a
1027 * route, this is indicated by setting obsolete to
1028 * DST_OBSOLETE_KILL.
1029 */
1030 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1765 return NULL; 1031 return NULL;
1766 return dst; 1032 return dst;
1767} 1033}
1768 1034
1769static void ipv4_dst_destroy(struct dst_entry *dst)
1770{
1771 struct rtable *rt = (struct rtable *) dst;
1772
1773 if (rt->fi) {
1774 fib_info_put(rt->fi);
1775 rt->fi = NULL;
1776 }
1777}
1778
1779
1780static void ipv4_link_failure(struct sk_buff *skb) 1035static void ipv4_link_failure(struct sk_buff *skb)
1781{ 1036{
1782 struct rtable *rt; 1037 struct rtable *rt;
@@ -1832,8 +1087,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1832 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1087 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1833 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1088 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1834 else 1089 else
1835 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1090 src = inet_select_addr(rt->dst.dev,
1836 RT_SCOPE_UNIVERSE); 1091 rt_nexthop(rt, iph->daddr),
1092 RT_SCOPE_UNIVERSE);
1837 rcu_read_unlock(); 1093 rcu_read_unlock();
1838 } 1094 }
1839 memcpy(addr, &src, 4); 1095 memcpy(addr, &src, 4);
@@ -1879,8 +1135,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1879 mtu = dst->dev->mtu; 1135 mtu = dst->dev->mtu;
1880 1136
1881 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1137 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1882 1138 if (rt->rt_gateway && mtu > 576)
1883 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1884 mtu = 576; 1139 mtu = 576;
1885 } 1140 }
1886 1141
@@ -1890,58 +1145,91 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1890 return mtu; 1145 return mtu;
1891} 1146}
1892 1147
1893static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1148static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1894 struct fib_info *fi)
1895{
1896 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1897 rt->fi = fi;
1898 atomic_inc(&fi->fib_clntref);
1899 }
1900 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1901}
1902
1903static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
1904{ 1149{
1905 struct fnhe_hash_bucket *hash = nh->nh_exceptions; 1150 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1906 struct fib_nh_exception *fnhe; 1151 struct fib_nh_exception *fnhe;
1907 u32 hval; 1152 u32 hval;
1908 1153
1154 if (!hash)
1155 return NULL;
1156
1909 hval = fnhe_hashfun(daddr); 1157 hval = fnhe_hashfun(daddr);
1910 1158
1911restart:
1912 for (fnhe = rcu_dereference(hash[hval].chain); fnhe; 1159 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1913 fnhe = rcu_dereference(fnhe->fnhe_next)) { 1160 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1914 __be32 fnhe_daddr, gw; 1161 if (fnhe->fnhe_daddr == daddr)
1915 unsigned long expires; 1162 return fnhe;
1916 unsigned int seq; 1163 }
1917 u32 pmtu; 1164 return NULL;
1918 1165}
1919 seq = read_seqbegin(&fnhe_seqlock);
1920 fnhe_daddr = fnhe->fnhe_daddr;
1921 gw = fnhe->fnhe_gw;
1922 pmtu = fnhe->fnhe_pmtu;
1923 expires = fnhe->fnhe_expires;
1924 if (read_seqretry(&fnhe_seqlock, seq))
1925 goto restart;
1926 if (daddr != fnhe_daddr)
1927 continue;
1928 if (pmtu) {
1929 unsigned long diff = expires - jiffies;
1930 1166
1931 if (time_before(jiffies, expires)) { 1167static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1932 rt->rt_pmtu = pmtu; 1168 __be32 daddr)
1933 dst_set_expires(&rt->dst, diff); 1169{
1934 } 1170 __be32 fnhe_daddr, gw;
1171 unsigned long expires;
1172 unsigned int seq;
1173 u32 pmtu;
1174
1175restart:
1176 seq = read_seqbegin(&fnhe_seqlock);
1177 fnhe_daddr = fnhe->fnhe_daddr;
1178 gw = fnhe->fnhe_gw;
1179 pmtu = fnhe->fnhe_pmtu;
1180 expires = fnhe->fnhe_expires;
1181 if (read_seqretry(&fnhe_seqlock, seq))
1182 goto restart;
1183
1184 if (daddr != fnhe_daddr)
1185 return;
1186
1187 if (pmtu) {
1188 unsigned long diff = expires - jiffies;
1189
1190 if (time_before(jiffies, expires)) {
1191 rt->rt_pmtu = pmtu;
1192 dst_set_expires(&rt->dst, diff);
1935 } 1193 }
1936 if (gw)
1937 rt->rt_gateway = gw;
1938 fnhe->fnhe_stamp = jiffies;
1939 break;
1940 } 1194 }
1195 if (gw) {
1196 rt->rt_flags |= RTCF_REDIRECTED;
1197 rt->rt_gateway = gw;
1198 }
1199 fnhe->fnhe_stamp = jiffies;
1941} 1200}
1942 1201
1943static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1202static inline void rt_release_rcu(struct rcu_head *head)
1203{
1204 struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
1205 dst_release(dst);
1206}
1207
1208static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1209{
1210 struct rtable *orig, *prev, **p = &nh->nh_rth_output;
1211
1212 if (rt_is_input_route(rt))
1213 p = &nh->nh_rth_input;
1214
1215 orig = *p;
1216
1217 prev = cmpxchg(p, orig, rt);
1218 if (prev == orig) {
1219 dst_clone(&rt->dst);
1220 if (orig)
1221 call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu);
1222 }
1223}
1224
1225static bool rt_cache_valid(struct rtable *rt)
1226{
1227 return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK);
1228}
1229
1230static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1944 const struct fib_result *res, 1231 const struct fib_result *res,
1232 struct fib_nh_exception *fnhe,
1945 struct fib_info *fi, u16 type, u32 itag) 1233 struct fib_info *fi, u16 type, u32 itag)
1946{ 1234{
1947 if (fi) { 1235 if (fi) {
@@ -1949,12 +1237,14 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1949 1237
1950 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) 1238 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1951 rt->rt_gateway = nh->nh_gw; 1239 rt->rt_gateway = nh->nh_gw;
1952 if (unlikely(nh->nh_exceptions)) 1240 if (unlikely(fnhe))
1953 rt_bind_exception(rt, nh, fl4->daddr); 1241 rt_bind_exception(rt, fnhe, daddr);
1954 rt_init_metrics(rt, fl4, fi); 1242 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1955#ifdef CONFIG_IP_ROUTE_CLASSID 1243#ifdef CONFIG_IP_ROUTE_CLASSID
1956 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1244 rt->dst.tclassid = nh->nh_tclassid;
1957#endif 1245#endif
1246 if (!(rt->dst.flags & DST_HOST))
1247 rt_cache_route(nh, rt);
1958 } 1248 }
1959 1249
1960#ifdef CONFIG_IP_ROUTE_CLASSID 1250#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1966,10 +1256,10 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1966} 1256}
1967 1257
1968static struct rtable *rt_dst_alloc(struct net_device *dev, 1258static struct rtable *rt_dst_alloc(struct net_device *dev,
1969 bool nopolicy, bool noxfrm) 1259 bool nopolicy, bool noxfrm, bool will_cache)
1970{ 1260{
1971 return dst_alloc(&ipv4_dst_ops, dev, 1, -1, 1261 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1972 DST_HOST | 1262 (will_cache ? 0 : DST_HOST) | DST_NOCACHE |
1973 (nopolicy ? DST_NOPOLICY : 0) | 1263 (nopolicy ? DST_NOPOLICY : 0) |
1974 (noxfrm ? DST_NOXFRM : 0)); 1264 (noxfrm ? DST_NOXFRM : 0));
1975} 1265}
@@ -1978,7 +1268,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
1978static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1268static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1979 u8 tos, struct net_device *dev, int our) 1269 u8 tos, struct net_device *dev, int our)
1980{ 1270{
1981 unsigned int hash;
1982 struct rtable *rth; 1271 struct rtable *rth;
1983 struct in_device *in_dev = __in_dev_get_rcu(dev); 1272 struct in_device *in_dev = __in_dev_get_rcu(dev);
1984 u32 itag = 0; 1273 u32 itag = 0;
@@ -2007,7 +1296,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2007 goto e_err; 1296 goto e_err;
2008 } 1297 }
2009 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, 1298 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2010 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1299 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
2011 if (!rth) 1300 if (!rth)
2012 goto e_nobufs; 1301 goto e_nobufs;
2013 1302
@@ -2016,21 +1305,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2016#endif 1305#endif
2017 rth->dst.output = ip_rt_bug; 1306 rth->dst.output = ip_rt_bug;
2018 1307
2019 rth->rt_key_dst = daddr;
2020 rth->rt_key_src = saddr;
2021 rth->rt_genid = rt_genid(dev_net(dev)); 1308 rth->rt_genid = rt_genid(dev_net(dev));
2022 rth->rt_flags = RTCF_MULTICAST; 1309 rth->rt_flags = RTCF_MULTICAST;
2023 rth->rt_type = RTN_MULTICAST; 1310 rth->rt_type = RTN_MULTICAST;
2024 rth->rt_key_tos = tos; 1311 rth->rt_is_input= 1;
2025 rth->rt_dst = daddr;
2026 rth->rt_src = saddr;
2027 rth->rt_route_iif = dev->ifindex;
2028 rth->rt_iif = dev->ifindex; 1312 rth->rt_iif = dev->ifindex;
2029 rth->rt_oif = 0;
2030 rth->rt_mark = skb->mark;
2031 rth->rt_pmtu = 0; 1313 rth->rt_pmtu = 0;
2032 rth->rt_gateway = daddr; 1314 rth->rt_gateway = 0;
2033 rth->fi = NULL;
2034 if (our) { 1315 if (our) {
2035 rth->dst.input= ip_local_deliver; 1316 rth->dst.input= ip_local_deliver;
2036 rth->rt_flags |= RTCF_LOCAL; 1317 rth->rt_flags |= RTCF_LOCAL;
@@ -2042,9 +1323,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2042#endif 1323#endif
2043 RT_CACHE_STAT_INC(in_slow_mc); 1324 RT_CACHE_STAT_INC(in_slow_mc);
2044 1325
2045 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1326 skb_dst_set(skb, &rth->dst);
2046 rth = rt_intern_hash(hash, rth, skb, dev->ifindex); 1327 return 0;
2047 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2048 1328
2049e_nobufs: 1329e_nobufs:
2050 return -ENOBUFS; 1330 return -ENOBUFS;
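
With no cache to insert into, ip_route_input_mc() simply attaches the route it just built to the skb; the rt_hash()/rt_intern_hash() round trip and its error handling disappear. The same build-then-skb_dst_set() shape recurs in ip_mkroute_input() and ip_route_input_slow() below. Roughly, the tail of the function now reads:

    RT_CACHE_STAT_INC(in_slow_mc);

    /* was: hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
     *      rth  = rt_intern_hash(hash, rth, skb, dev->ifindex);
     *      return IS_ERR(rth) ? PTR_ERR(rth) : 0;
     */
    skb_dst_set(skb, &rth->dst);    /* hand the fresh route straight to the packet */
    return 0;
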
@@ -2091,6 +1371,7 @@ static int __mkroute_input(struct sk_buff *skb,
2091 int err; 1371 int err;
2092 struct in_device *out_dev; 1372 struct in_device *out_dev;
2093 unsigned int flags = 0; 1373 unsigned int flags = 0;
1374 bool do_cache;
2094 u32 itag; 1375 u32 itag;
2095 1376
2096 /* get a working reference to the output device */ 1377 /* get a working reference to the output device */
@@ -2133,35 +1414,39 @@ static int __mkroute_input(struct sk_buff *skb,
2133 } 1414 }
2134 } 1415 }
2135 1416
1417 do_cache = false;
1418 if (res->fi) {
1419 if (!(flags & RTCF_DIRECTSRC) && !itag) {
1420 rth = FIB_RES_NH(*res).nh_rth_input;
1421 if (rt_cache_valid(rth)) {
1422 dst_hold(&rth->dst);
1423 goto out;
1424 }
1425 do_cache = true;
1426 }
1427 }
1428
2136 rth = rt_dst_alloc(out_dev->dev, 1429 rth = rt_dst_alloc(out_dev->dev,
2137 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1430 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2138 IN_DEV_CONF_GET(out_dev, NOXFRM)); 1431 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
2139 if (!rth) { 1432 if (!rth) {
2140 err = -ENOBUFS; 1433 err = -ENOBUFS;
2141 goto cleanup; 1434 goto cleanup;
2142 } 1435 }
2143 1436
2144 rth->rt_key_dst = daddr;
2145 rth->rt_key_src = saddr;
2146 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 1437 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2147 rth->rt_flags = flags; 1438 rth->rt_flags = flags;
2148 rth->rt_type = res->type; 1439 rth->rt_type = res->type;
2149 rth->rt_key_tos = tos; 1440 rth->rt_is_input = 1;
2150 rth->rt_dst = daddr;
2151 rth->rt_src = saddr;
2152 rth->rt_route_iif = in_dev->dev->ifindex;
2153 rth->rt_iif = in_dev->dev->ifindex; 1441 rth->rt_iif = in_dev->dev->ifindex;
2154 rth->rt_oif = 0;
2155 rth->rt_mark = skb->mark;
2156 rth->rt_pmtu = 0; 1442 rth->rt_pmtu = 0;
2157 rth->rt_gateway = daddr; 1443 rth->rt_gateway = 0;
2158 rth->fi = NULL;
2159 1444
2160 rth->dst.input = ip_forward; 1445 rth->dst.input = ip_forward;
2161 rth->dst.output = ip_output; 1446 rth->dst.output = ip_output;
2162 1447
2163 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); 1448 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
2164 1449out:
2165 *result = rth; 1450 *result = rth;
2166 err = 0; 1451 err = 0;
2167 cleanup: 1452 cleanup:
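
Forwarded routes can now be served from the nexthop's nh_rth_input slot. The check is deliberately conservative: the cached entry is reused, and do_cache is only set for a newly built one, when the lookup produced no per-packet state, i.e. neither RTCF_DIRECTSRC nor a non-zero itag, presumably because such state would have to live in the rtable and a shared copy could not carry it. A condensed sketch of the decision; the helper does not exist in the patch, it just restates the inline logic above:

    static struct rtable *cached_input_route(const struct fib_result *res,
                                             unsigned int flags, u32 itag,
                                             bool *do_cache)
    {
        struct rtable *rth;

        *do_cache = false;
        if (res->fi && !(flags & RTCF_DIRECTSRC) && !itag) {
            rth = FIB_RES_NH(*res).nh_rth_input;    /* per-nexthop cache slot */
            if (rt_cache_valid(rth)) {
                dst_hold(&rth->dst);
                return rth;                         /* reuse the cached dst */
            }
            *do_cache = true;    /* safe to cache the route we build next */
        }
        return NULL;             /* caller continues with rt_dst_alloc(..., *do_cache) */
    }
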
@@ -2176,7 +1461,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
2176{ 1461{
2177 struct rtable *rth = NULL; 1462 struct rtable *rth = NULL;
2178 int err; 1463 int err;
2179 unsigned int hash;
2180 1464
2181#ifdef CONFIG_IP_ROUTE_MULTIPATH 1465#ifdef CONFIG_IP_ROUTE_MULTIPATH
2182 if (res->fi && res->fi->fib_nhs > 1) 1466 if (res->fi && res->fi->fib_nhs > 1)
@@ -2188,12 +1472,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2188 if (err) 1472 if (err)
2189 return err; 1473 return err;
2190 1474
2191 /* put it into the cache */ 1475 skb_dst_set(skb, &rth->dst);
2192 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2193 rt_genid(dev_net(rth->dst.dev)));
2194 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2195 if (IS_ERR(rth))
2196 return PTR_ERR(rth);
2197 return 0; 1476 return 0;
2198} 1477}
2199 1478
@@ -2217,9 +1496,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2217 unsigned int flags = 0; 1496 unsigned int flags = 0;
2218 u32 itag = 0; 1497 u32 itag = 0;
2219 struct rtable *rth; 1498 struct rtable *rth;
2220 unsigned int hash;
2221 int err = -EINVAL; 1499 int err = -EINVAL;
2222 struct net *net = dev_net(dev); 1500 struct net *net = dev_net(dev);
1501 bool do_cache;
2223 1502
2224 /* IP on this device is disabled. */ 1503 /* IP on this device is disabled. */
2225 1504
@@ -2233,6 +1512,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2233 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 1512 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2234 goto martian_source; 1513 goto martian_source;
2235 1514
1515 res.fi = NULL;
2236 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 1516 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2237 goto brd_input; 1517 goto brd_input;
2238 1518
@@ -2308,8 +1588,20 @@ brd_input:
2308 RT_CACHE_STAT_INC(in_brd); 1588 RT_CACHE_STAT_INC(in_brd);
2309 1589
2310local_input: 1590local_input:
1591 do_cache = false;
1592 if (res.fi) {
1593 if (!(flags & RTCF_DIRECTSRC) && !itag) {
1594 rth = FIB_RES_NH(res).nh_rth_input;
1595 if (rt_cache_valid(rth)) {
1596 dst_hold(&rth->dst);
1597 goto set_and_out;
1598 }
1599 do_cache = true;
1600 }
1601 }
1602
2311 rth = rt_dst_alloc(net->loopback_dev, 1603 rth = rt_dst_alloc(net->loopback_dev,
2312 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1604 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2313 if (!rth) 1605 if (!rth)
2314 goto e_nobufs; 1606 goto e_nobufs;
2315 1607
@@ -2319,31 +1611,23 @@ local_input:
2319 rth->dst.tclassid = itag; 1611 rth->dst.tclassid = itag;
2320#endif 1612#endif
2321 1613
2322 rth->rt_key_dst = daddr;
2323 rth->rt_key_src = saddr;
2324 rth->rt_genid = rt_genid(net); 1614 rth->rt_genid = rt_genid(net);
2325 rth->rt_flags = flags|RTCF_LOCAL; 1615 rth->rt_flags = flags|RTCF_LOCAL;
2326 rth->rt_type = res.type; 1616 rth->rt_type = res.type;
2327 rth->rt_key_tos = tos; 1617 rth->rt_is_input = 1;
2328 rth->rt_dst = daddr;
2329 rth->rt_src = saddr;
2330 rth->rt_route_iif = dev->ifindex;
2331 rth->rt_iif = dev->ifindex; 1618 rth->rt_iif = dev->ifindex;
2332 rth->rt_oif = 0;
2333 rth->rt_mark = skb->mark;
2334 rth->rt_pmtu = 0; 1619 rth->rt_pmtu = 0;
2335 rth->rt_gateway = daddr; 1620 rth->rt_gateway = 0;
2336 rth->fi = NULL;
2337 if (res.type == RTN_UNREACHABLE) { 1621 if (res.type == RTN_UNREACHABLE) {
2338 rth->dst.input= ip_error; 1622 rth->dst.input= ip_error;
2339 rth->dst.error= -err; 1623 rth->dst.error= -err;
2340 rth->rt_flags &= ~RTCF_LOCAL; 1624 rth->rt_flags &= ~RTCF_LOCAL;
2341 } 1625 }
2342 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); 1626 if (do_cache)
2343 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); 1627 rt_cache_route(&FIB_RES_NH(res), rth);
1628set_and_out:
1629 skb_dst_set(skb, &rth->dst);
2344 err = 0; 1630 err = 0;
2345 if (IS_ERR(rth))
2346 err = PTR_ERR(rth);
2347 goto out; 1631 goto out;
2348 1632
2349no_route: 1633no_route:
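
The local-delivery path above caches the same way, but since it never calls rt_set_nexthop() it fills the slot itself: when do_cache was set, rt_cache_route() (defined earlier in the patch, not shown here) stores the new route into the nexthop before the skb gets it. Condensed, the end of local_input now looks like:

    if (do_cache)
        rt_cache_route(&FIB_RES_NH(res), rth);    /* park it in the nexthop slot */
set_and_out:
    skb_dst_set(skb, &rth->dst);                  /* cached or fresh, hand it to the skb */
    err = 0;
    goto out;
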
@@ -2379,49 +1663,13 @@ martian_source_keep_err:
2379 goto out; 1663 goto out;
2380} 1664}
2381 1665
2382int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1666int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2383 u8 tos, struct net_device *dev, bool noref) 1667 u8 tos, struct net_device *dev)
2384{ 1668{
2385 struct rtable *rth;
2386 unsigned int hash;
2387 int iif = dev->ifindex;
2388 struct net *net;
2389 int res; 1669 int res;
2390 1670
2391 net = dev_net(dev);
2392
2393 rcu_read_lock(); 1671 rcu_read_lock();
2394 1672
2395 if (!rt_caching(net))
2396 goto skip_cache;
2397
2398 tos &= IPTOS_RT_MASK;
2399 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2400
2401 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2402 rth = rcu_dereference(rth->dst.rt_next)) {
2403 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2404 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2405 (rth->rt_route_iif ^ iif) |
2406 (rth->rt_key_tos ^ tos)) == 0 &&
2407 rth->rt_mark == skb->mark &&
2408 net_eq(dev_net(rth->dst.dev), net) &&
2409 !rt_is_expired(rth)) {
2410 if (noref) {
2411 dst_use_noref(&rth->dst, jiffies);
2412 skb_dst_set_noref(skb, &rth->dst);
2413 } else {
2414 dst_use(&rth->dst, jiffies);
2415 skb_dst_set(skb, &rth->dst);
2416 }
2417 RT_CACHE_STAT_INC(in_hit);
2418 rcu_read_unlock();
2419 return 0;
2420 }
2421 RT_CACHE_STAT_INC(in_hlist_search);
2422 }
2423
2424skip_cache:
2425 /* Multicast recognition logic is moved from route cache to here. 1673 /* Multicast recognition logic is moved from route cache to here.
2426 The problem was that too many Ethernet cards have broken/missing 1674 The problem was that too many Ethernet cards have broken/missing
2427 hardware multicast filters :-( As result the host on multicasting 1675 hardware multicast filters :-( As result the host on multicasting
@@ -2459,17 +1707,16 @@ skip_cache:
2459 rcu_read_unlock(); 1707 rcu_read_unlock();
2460 return res; 1708 return res;
2461} 1709}
2462EXPORT_SYMBOL(ip_route_input_common); 1710EXPORT_SYMBOL(ip_route_input);
2463 1711
2464/* called with rcu_read_lock() */ 1712/* called with rcu_read_lock() */
2465static struct rtable *__mkroute_output(const struct fib_result *res, 1713static struct rtable *__mkroute_output(const struct fib_result *res,
2466 const struct flowi4 *fl4, 1714 const struct flowi4 *fl4, int orig_oif,
2467 __be32 orig_daddr, __be32 orig_saddr,
2468 int orig_oif, __u8 orig_rtos,
2469 struct net_device *dev_out, 1715 struct net_device *dev_out,
2470 unsigned int flags) 1716 unsigned int flags)
2471{ 1717{
2472 struct fib_info *fi = res->fi; 1718 struct fib_info *fi = res->fi;
1719 struct fib_nh_exception *fnhe;
2473 struct in_device *in_dev; 1720 struct in_device *in_dev;
2474 u16 type = res->type; 1721 u16 type = res->type;
2475 struct rtable *rth; 1722 struct rtable *rth;
@@ -2508,29 +1755,33 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2508 fi = NULL; 1755 fi = NULL;
2509 } 1756 }
2510 1757
1758 fnhe = NULL;
1759 if (fi) {
1760 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1761 if (!fnhe) {
1762 rth = FIB_RES_NH(*res).nh_rth_output;
1763 if (rt_cache_valid(rth)) {
1764 dst_hold(&rth->dst);
1765 return rth;
1766 }
1767 }
1768 }
2511 rth = rt_dst_alloc(dev_out, 1769 rth = rt_dst_alloc(dev_out,
2512 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1770 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2513 IN_DEV_CONF_GET(in_dev, NOXFRM)); 1771 IN_DEV_CONF_GET(in_dev, NOXFRM),
1772 fi && !fnhe);
2514 if (!rth) 1773 if (!rth)
2515 return ERR_PTR(-ENOBUFS); 1774 return ERR_PTR(-ENOBUFS);
2516 1775
2517 rth->dst.output = ip_output; 1776 rth->dst.output = ip_output;
2518 1777
2519 rth->rt_key_dst = orig_daddr;
2520 rth->rt_key_src = orig_saddr;
2521 rth->rt_genid = rt_genid(dev_net(dev_out)); 1778 rth->rt_genid = rt_genid(dev_net(dev_out));
2522 rth->rt_flags = flags; 1779 rth->rt_flags = flags;
2523 rth->rt_type = type; 1780 rth->rt_type = type;
2524 rth->rt_key_tos = orig_rtos; 1781 rth->rt_is_input = 0;
2525 rth->rt_dst = fl4->daddr;
2526 rth->rt_src = fl4->saddr;
2527 rth->rt_route_iif = 0;
2528 rth->rt_iif = orig_oif ? : dev_out->ifindex; 1782 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2529 rth->rt_oif = orig_oif;
2530 rth->rt_mark = fl4->flowi4_mark;
2531 rth->rt_pmtu = 0; 1783 rth->rt_pmtu = 0;
2532 rth->rt_gateway = fl4->daddr; 1784 rth->rt_gateway = 0;
2533 rth->fi = NULL;
2534 1785
2535 RT_CACHE_STAT_INC(out_slow_tot); 1786 RT_CACHE_STAT_INC(out_slow_tot);
2536 1787
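
The output side mirrors the input side with one extra wrinkle: a nexthop exception for this destination (the fnhe state used earlier in the series for learned PMTU/redirect data) overrides the shared cache, so find_exception() runs first and nh_rth_output is only reused when nothing matched. The fi && !fnhe value passed as will_cache likewise keeps exception routes out of the shared slot, and rt_set_nexthop() then binds the exception to the fresh route via rt_bind_exception(), as its hunk above shows. Annotated, the new head of __mkroute_output()'s allocation path:

    fnhe = NULL;
    if (fi) {
        fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
        if (!fnhe) {
            rth = FIB_RES_NH(*res).nh_rth_output;
            if (rt_cache_valid(rth)) {
                dst_hold(&rth->dst);
                return rth;                /* still-valid cached output route */
            }
        }
    }
    rth = rt_dst_alloc(dev_out,
                       IN_DEV_CONF_GET(in_dev, NOPOLICY),
                       IN_DEV_CONF_GET(in_dev, NOXFRM),
                       fi && !fnhe);        /* cache only exception-free routes */
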
@@ -2553,36 +1804,28 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
2553#endif 1804#endif
2554 } 1805 }
2555 1806
2556 rt_set_nexthop(rth, fl4, res, fi, type, 0); 1807 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2557
2558 if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
2559 rth->dst.flags |= DST_NOCACHE;
2560 1808
2561 return rth; 1809 return rth;
2562} 1810}
2563 1811
2564/* 1812/*
2565 * Major route resolver routine. 1813 * Major route resolver routine.
2566 * called with rcu_read_lock();
2567 */ 1814 */
2568 1815
2569static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) 1816struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2570{ 1817{
2571 struct net_device *dev_out = NULL; 1818 struct net_device *dev_out = NULL;
2572 __u8 tos = RT_FL_TOS(fl4); 1819 __u8 tos = RT_FL_TOS(fl4);
2573 unsigned int flags = 0; 1820 unsigned int flags = 0;
2574 struct fib_result res; 1821 struct fib_result res;
2575 struct rtable *rth; 1822 struct rtable *rth;
2576 __be32 orig_daddr;
2577 __be32 orig_saddr;
2578 int orig_oif; 1823 int orig_oif;
2579 1824
2580 res.tclassid = 0; 1825 res.tclassid = 0;
2581 res.fi = NULL; 1826 res.fi = NULL;
2582 res.table = NULL; 1827 res.table = NULL;
2583 1828
2584 orig_daddr = fl4->daddr;
2585 orig_saddr = fl4->saddr;
2586 orig_oif = fl4->flowi4_oif; 1829 orig_oif = fl4->flowi4_oif;
2587 1830
2588 fl4->flowi4_iif = net->loopback_dev->ifindex; 1831 fl4->flowi4_iif = net->loopback_dev->ifindex;
@@ -2744,59 +1987,12 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2744 1987
2745 1988
2746make_route: 1989make_route:
2747 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, 1990 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2748 tos, dev_out, flags);
2749 if (!IS_ERR(rth)) {
2750 unsigned int hash;
2751
2752 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2753 rt_genid(dev_net(dev_out)));
2754 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2755 }
2756 1991
2757out: 1992out:
2758 rcu_read_unlock(); 1993 rcu_read_unlock();
2759 return rth; 1994 return rth;
2760} 1995}
2761
2762struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2763{
2764 struct rtable *rth;
2765 unsigned int hash;
2766
2767 if (!rt_caching(net))
2768 goto slow_output;
2769
2770 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2771
2772 rcu_read_lock_bh();
2773 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2774 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2775 if (rth->rt_key_dst == flp4->daddr &&
2776 rth->rt_key_src == flp4->saddr &&
2777 rt_is_output_route(rth) &&
2778 rth->rt_oif == flp4->flowi4_oif &&
2779 rth->rt_mark == flp4->flowi4_mark &&
2780 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2781 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2782 net_eq(dev_net(rth->dst.dev), net) &&
2783 !rt_is_expired(rth)) {
2784 dst_use(&rth->dst, jiffies);
2785 RT_CACHE_STAT_INC(out_hit);
2786 rcu_read_unlock_bh();
2787 if (!flp4->saddr)
2788 flp4->saddr = rth->rt_src;
2789 if (!flp4->daddr)
2790 flp4->daddr = rth->rt_dst;
2791 return rth;
2792 }
2793 RT_CACHE_STAT_INC(out_hlist_search);
2794 }
2795 rcu_read_unlock_bh();
2796
2797slow_output:
2798 return ip_route_output_slow(net, flp4);
2799}
2800EXPORT_SYMBOL_GPL(__ip_route_output_key); 1996EXPORT_SYMBOL_GPL(__ip_route_output_key);
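
__ip_route_output_key() no longer has a fast path to fall back from: the former ip_route_output_slow() body simply takes over the exported name, so every output lookup is a FIB lookup followed by __mkroute_output(), which may still hand back a nexthop-cached rtable. Callers are untouched; for reference, a typical lookup (dst_ip, oif and tos are illustrative locals):

    struct flowi4 fl4 = {
        .daddr      = dst_ip,
        .flowi4_oif = oif,
        .flowi4_tos = tos,
    };
    struct rtable *rt = ip_route_output_key(net, &fl4);

    if (IS_ERR(rt))
        return PTR_ERR(rt);
    /* ... transmit using rt->dst ... */
    ip_rt_put(rt);
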
2801 1997
2802static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 1998static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -2830,7 +2026,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2830static struct dst_ops ipv4_dst_blackhole_ops = { 2026static struct dst_ops ipv4_dst_blackhole_ops = {
2831 .family = AF_INET, 2027 .family = AF_INET,
2832 .protocol = cpu_to_be16(ETH_P_IP), 2028 .protocol = cpu_to_be16(ETH_P_IP),
2833 .destroy = ipv4_dst_destroy,
2834 .check = ipv4_blackhole_dst_check, 2029 .check = ipv4_blackhole_dst_check,
2835 .mtu = ipv4_blackhole_mtu, 2030 .mtu = ipv4_blackhole_mtu,
2836 .default_advmss = ipv4_default_advmss, 2031 .default_advmss = ipv4_default_advmss,
@@ -2842,9 +2037,10 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
2842 2037
2843struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2038struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2844{ 2039{
2845 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2846 struct rtable *ort = (struct rtable *) dst_orig; 2040 struct rtable *ort = (struct rtable *) dst_orig;
2041 struct rtable *rt;
2847 2042
2043 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2848 if (rt) { 2044 if (rt) {
2849 struct dst_entry *new = &rt->dst; 2045 struct dst_entry *new = &rt->dst;
2850 2046
@@ -2856,24 +2052,14 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2856 if (new->dev) 2052 if (new->dev)
2857 dev_hold(new->dev); 2053 dev_hold(new->dev);
2858 2054
2859 rt->rt_key_dst = ort->rt_key_dst; 2055 rt->rt_is_input = ort->rt_is_input;
2860 rt->rt_key_src = ort->rt_key_src;
2861 rt->rt_key_tos = ort->rt_key_tos;
2862 rt->rt_route_iif = ort->rt_route_iif;
2863 rt->rt_iif = ort->rt_iif; 2056 rt->rt_iif = ort->rt_iif;
2864 rt->rt_oif = ort->rt_oif;
2865 rt->rt_mark = ort->rt_mark;
2866 rt->rt_pmtu = ort->rt_pmtu; 2057 rt->rt_pmtu = ort->rt_pmtu;
2867 2058
2868 rt->rt_genid = rt_genid(net); 2059 rt->rt_genid = rt_genid(net);
2869 rt->rt_flags = ort->rt_flags; 2060 rt->rt_flags = ort->rt_flags;
2870 rt->rt_type = ort->rt_type; 2061 rt->rt_type = ort->rt_type;
2871 rt->rt_dst = ort->rt_dst;
2872 rt->rt_src = ort->rt_src;
2873 rt->rt_gateway = ort->rt_gateway; 2062 rt->rt_gateway = ort->rt_gateway;
2874 rt->fi = ort->fi;
2875 if (rt->fi)
2876 atomic_inc(&rt->fi->fib_clntref);
2877 2063
2878 dst_free(new); 2064 dst_free(new);
2879 } 2065 }
@@ -2900,9 +2086,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2900} 2086}
2901EXPORT_SYMBOL_GPL(ip_route_output_flow); 2087EXPORT_SYMBOL_GPL(ip_route_output_flow);
2902 2088
2903static int rt_fill_info(struct net *net, 2089static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2904 struct sk_buff *skb, u32 pid, u32 seq, int event, 2090 struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
2905 int nowait, unsigned int flags) 2091 u32 seq, int event, int nowait, unsigned int flags)
2906{ 2092{
2907 struct rtable *rt = skb_rtable(skb); 2093 struct rtable *rt = skb_rtable(skb);
2908 struct rtmsg *r; 2094 struct rtmsg *r;
@@ -2919,7 +2105,7 @@ static int rt_fill_info(struct net *net,
2919 r->rtm_family = AF_INET; 2105 r->rtm_family = AF_INET;
2920 r->rtm_dst_len = 32; 2106 r->rtm_dst_len = 32;
2921 r->rtm_src_len = 0; 2107 r->rtm_src_len = 0;
2922 r->rtm_tos = rt->rt_key_tos; 2108 r->rtm_tos = fl4->flowi4_tos;
2923 r->rtm_table = RT_TABLE_MAIN; 2109 r->rtm_table = RT_TABLE_MAIN;
2924 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) 2110 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2925 goto nla_put_failure; 2111 goto nla_put_failure;
@@ -2930,11 +2116,11 @@ static int rt_fill_info(struct net *net,
2930 if (rt->rt_flags & RTCF_NOTIFY) 2116 if (rt->rt_flags & RTCF_NOTIFY)
2931 r->rtm_flags |= RTM_F_NOTIFY; 2117 r->rtm_flags |= RTM_F_NOTIFY;
2932 2118
2933 if (nla_put_be32(skb, RTA_DST, rt->rt_dst)) 2119 if (nla_put_be32(skb, RTA_DST, dst))
2934 goto nla_put_failure; 2120 goto nla_put_failure;
2935 if (rt->rt_key_src) { 2121 if (src) {
2936 r->rtm_src_len = 32; 2122 r->rtm_src_len = 32;
2937 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src)) 2123 if (nla_put_be32(skb, RTA_SRC, src))
2938 goto nla_put_failure; 2124 goto nla_put_failure;
2939 } 2125 }
2940 if (rt->dst.dev && 2126 if (rt->dst.dev &&
@@ -2946,11 +2132,11 @@ static int rt_fill_info(struct net *net,
2946 goto nla_put_failure; 2132 goto nla_put_failure;
2947#endif 2133#endif
2948 if (!rt_is_input_route(rt) && 2134 if (!rt_is_input_route(rt) &&
2949 rt->rt_src != rt->rt_key_src) { 2135 fl4->saddr != src) {
2950 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src)) 2136 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2951 goto nla_put_failure; 2137 goto nla_put_failure;
2952 } 2138 }
2953 if (rt->rt_dst != rt->rt_gateway && 2139 if (rt->rt_gateway &&
2954 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) 2140 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2955 goto nla_put_failure; 2141 goto nla_put_failure;
2956 2142
@@ -2960,8 +2146,8 @@ static int rt_fill_info(struct net *net,
2960 if (rtnetlink_put_metrics(skb, metrics) < 0) 2146 if (rtnetlink_put_metrics(skb, metrics) < 0)
2961 goto nla_put_failure; 2147 goto nla_put_failure;
2962 2148
2963 if (rt->rt_mark && 2149 if (fl4->flowi4_mark &&
2964 nla_put_be32(skb, RTA_MARK, rt->rt_mark)) 2150 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2965 goto nla_put_failure; 2151 goto nla_put_failure;
2966 2152
2967 error = rt->dst.error; 2153 error = rt->dst.error;
@@ -2974,29 +2160,8 @@ static int rt_fill_info(struct net *net,
2974 } 2160 }
2975 2161
2976 if (rt_is_input_route(rt)) { 2162 if (rt_is_input_route(rt)) {
2977#ifdef CONFIG_IP_MROUTE 2163 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2978 __be32 dst = rt->rt_dst; 2164 goto nla_put_failure;
2979
2980 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2981 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2982 int err = ipmr_get_route(net, skb,
2983 rt->rt_src, rt->rt_dst,
2984 r, nowait);
2985 if (err <= 0) {
2986 if (!nowait) {
2987 if (err == 0)
2988 return 0;
2989 goto nla_put_failure;
2990 } else {
2991 if (err == -EMSGSIZE)
2992 goto nla_put_failure;
2993 error = err;
2994 }
2995 }
2996 } else
2997#endif
2998 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2999 goto nla_put_failure;
3000 } 2165 }
3001 2166
3002 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2167 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
@@ -3015,6 +2180,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3015 struct rtmsg *rtm; 2180 struct rtmsg *rtm;
3016 struct nlattr *tb[RTA_MAX+1]; 2181 struct nlattr *tb[RTA_MAX+1];
3017 struct rtable *rt = NULL; 2182 struct rtable *rt = NULL;
2183 struct flowi4 fl4;
3018 __be32 dst = 0; 2184 __be32 dst = 0;
3019 __be32 src = 0; 2185 __be32 src = 0;
3020 u32 iif; 2186 u32 iif;
@@ -3049,6 +2215,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3049 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2215 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3050 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2216 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3051 2217
2218 memset(&fl4, 0, sizeof(fl4));
2219 fl4.daddr = dst;
2220 fl4.saddr = src;
2221 fl4.flowi4_tos = rtm->rtm_tos;
2222 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2223 fl4.flowi4_mark = mark;
2224
3052 if (iif) { 2225 if (iif) {
3053 struct net_device *dev; 2226 struct net_device *dev;
3054 2227
@@ -3069,13 +2242,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3069 if (err == 0 && rt->dst.error) 2242 if (err == 0 && rt->dst.error)
3070 err = -rt->dst.error; 2243 err = -rt->dst.error;
3071 } else { 2244 } else {
3072 struct flowi4 fl4 = {
3073 .daddr = dst,
3074 .saddr = src,
3075 .flowi4_tos = rtm->rtm_tos,
3076 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3077 .flowi4_mark = mark,
3078 };
3079 rt = ip_route_output_key(net, &fl4); 2245 rt = ip_route_output_key(net, &fl4);
3080 2246
3081 err = 0; 2247 err = 0;
@@ -3090,7 +2256,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
3090 if (rtm->rtm_flags & RTM_F_NOTIFY) 2256 if (rtm->rtm_flags & RTM_F_NOTIFY)
3091 rt->rt_flags |= RTCF_NOTIFY; 2257 rt->rt_flags |= RTCF_NOTIFY;
3092 2258
3093 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2259 err = rt_fill_info(net, dst, src, &fl4, skb,
2260 NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3094 RTM_NEWROUTE, 0, 0); 2261 RTM_NEWROUTE, 0, 0);
3095 if (err <= 0) 2262 if (err <= 0)
3096 goto errout_free; 2263 goto errout_free;
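
struct rtable no longer remembers the lookup key (rt_key_dst/rt_key_src, rt_key_tos, rt_mark and rt_src/rt_dst are gone), so rt_fill_info() cannot reconstruct the netlink attributes from the route alone; inet_rtm_getroute() now builds the flowi4 up front and passes it, together with the raw dst/src addresses, into the fill routine, and in this patch the CONFIG_IP_MROUTE branch that needed rt_src/rt_dst goes away with them. Condensed from the hunks above:

    memset(&fl4, 0, sizeof(fl4));
    fl4.daddr       = dst;
    fl4.saddr       = src;
    fl4.flowi4_tos  = rtm->rtm_tos;
    fl4.flowi4_oif  = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
    fl4.flowi4_mark = mark;

    /* ... input or output lookup attaches the route to skb ... */

    err = rt_fill_info(net, dst, src, &fl4, skb,
                       NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                       RTM_NEWROUTE, 0, 0);
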
@@ -3106,43 +2273,6 @@ errout_free:
3106 2273
3107int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 2274int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3108{ 2275{
3109 struct rtable *rt;
3110 int h, s_h;
3111 int idx, s_idx;
3112 struct net *net;
3113
3114 net = sock_net(skb->sk);
3115
3116 s_h = cb->args[0];
3117 if (s_h < 0)
3118 s_h = 0;
3119 s_idx = idx = cb->args[1];
3120 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3121 if (!rt_hash_table[h].chain)
3122 continue;
3123 rcu_read_lock_bh();
3124 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3125 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3126 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3127 continue;
3128 if (rt_is_expired(rt))
3129 continue;
3130 skb_dst_set_noref(skb, &rt->dst);
3131 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3132 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3133 1, NLM_F_MULTI) <= 0) {
3134 skb_dst_drop(skb);
3135 rcu_read_unlock_bh();
3136 goto done;
3137 }
3138 skb_dst_drop(skb);
3139 }
3140 rcu_read_unlock_bh();
3141 }
3142
3143done:
3144 cb->args[0] = h;
3145 cb->args[1] = idx;
3146 return skb->len; 2276 return skb->len;
3147} 2277}
3148 2278
@@ -3376,22 +2506,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3376struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 2506struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3377#endif /* CONFIG_IP_ROUTE_CLASSID */ 2507#endif /* CONFIG_IP_ROUTE_CLASSID */
3378 2508
3379static __initdata unsigned long rhash_entries;
3380static int __init set_rhash_entries(char *str)
3381{
3382 ssize_t ret;
3383
3384 if (!str)
3385 return 0;
3386
3387 ret = kstrtoul(str, 0, &rhash_entries);
3388 if (ret)
3389 return 0;
3390
3391 return 1;
3392}
3393__setup("rhash_entries=", set_rhash_entries);
3394
3395int __init ip_rt_init(void) 2509int __init ip_rt_init(void)
3396{ 2510{
3397 int rc = 0; 2511 int rc = 0;
@@ -3414,31 +2528,12 @@ int __init ip_rt_init(void)
3414 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 2528 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3415 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 2529 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3416 2530
3417 rt_hash_table = (struct rt_hash_bucket *) 2531 ipv4_dst_ops.gc_thresh = ~0;
3418 alloc_large_system_hash("IP route cache", 2532 ip_rt_max_size = INT_MAX;
3419 sizeof(struct rt_hash_bucket),
3420 rhash_entries,
3421 (totalram_pages >= 128 * 1024) ?
3422 15 : 17,
3423 0,
3424 &rt_hash_log,
3425 &rt_hash_mask,
3426 0,
3427 rhash_entries ? 0 : 512 * 1024);
3428 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3429 rt_hash_lock_init();
3430
3431 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3432 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3433 2533
3434 devinet_init(); 2534 devinet_init();
3435 ip_fib_init(); 2535 ip_fib_init();
3436 2536
3437 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3438 expires_ljiffies = jiffies;
3439 schedule_delayed_work(&expires_work,
3440 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3441
3442 if (ip_rt_proc_init()) 2537 if (ip_rt_proc_init())
3443 pr_err("Unable to create route proc files\n"); 2538 pr_err("Unable to create route proc files\n");
3444#ifdef CONFIG_XFRM 2539#ifdef CONFIG_XFRM
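
ip_rt_init() loses everything that existed only to size and maintain the hash: the rhash_entries= boot parameter, the alloc_large_system_hash() call, and the periodic expiry worker. The gc knobs that used to be derived from the hash size are pinned to values that, as far as I can tell, effectively disable allocation-pressure garbage collection of IPv4 dsts:

    ipv4_dst_ops.gc_thresh = ~0;    /* never trip gc from dst_alloc() */
    ip_rt_max_size = INT_MAX;       /* no entry-count ceiling */
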
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1d8b75a58981..59110caeb074 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -824,7 +824,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
824 struct sk_buff * skb; 824 struct sk_buff * skb;
825 825
826 /* First, grab a route. */ 826 /* First, grab a route. */
827 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req, nocache)) == NULL) 827 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
828 return -1; 828 return -1;
829 829
830 skb = tcp_make_synack(sk, dst, req, rvp); 830 skb = tcp_make_synack(sk, dst, req, rvp);
@@ -1378,7 +1378,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1378 */ 1378 */
1379 if (tmp_opt.saw_tstamp && 1379 if (tmp_opt.saw_tstamp &&
1380 tcp_death_row.sysctl_tw_recycle && 1380 tcp_death_row.sysctl_tw_recycle &&
1381 (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && 1381 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1382 fl4.daddr == saddr) { 1382 fl4.daddr == saddr) {
1383 if (!tcp_peer_is_proven(req, dst, true)) { 1383 if (!tcp_peer_is_proven(req, dst, true)) {
1384 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1384 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6216dc..58d23a572509 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
27 if (skb_dst(skb) == NULL) { 27 if (skb_dst(skb) == NULL) {
28 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
29 29
30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr, 30 if (ip_route_input(skb, iph->daddr, iph->saddr,
31 iph->tos, skb->dev)) 31 iph->tos, skb->dev))
32 goto drop; 32 goto drop;
33 } 33 }
34 return dst_input(skb); 34 return dst_input(skb);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index fcf7678bc009..c6281847f16a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,24 +79,17 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
79 struct rtable *rt = (struct rtable *)xdst->route; 79 struct rtable *rt = (struct rtable *)xdst->route;
80 const struct flowi4 *fl4 = &fl->u.ip4; 80 const struct flowi4 *fl4 = &fl->u.ip4;
81 81
82 xdst->u.rt.rt_key_dst = fl4->daddr;
83 xdst->u.rt.rt_key_src = fl4->saddr;
84 xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
85 xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
86 xdst->u.rt.rt_iif = fl4->flowi4_iif; 82 xdst->u.rt.rt_iif = fl4->flowi4_iif;
87 xdst->u.rt.rt_oif = fl4->flowi4_oif;
88 xdst->u.rt.rt_mark = fl4->flowi4_mark;
89 83
90 xdst->u.dst.dev = dev; 84 xdst->u.dst.dev = dev;
91 dev_hold(dev); 85 dev_hold(dev);
92 86
93 /* Sheit... I remember I did this right. Apparently, 87 /* Sheit... I remember I did this right. Apparently,
94 * it was magically lost, so this code needs audit */ 88 * it was magically lost, so this code needs audit */
89 xdst->u.rt.rt_is_input = rt->rt_is_input;
95 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 90 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
96 RTCF_LOCAL); 91 RTCF_LOCAL);
97 xdst->u.rt.rt_type = rt->rt_type; 92 xdst->u.rt.rt_type = rt->rt_type;
98 xdst->u.rt.rt_src = rt->rt_src;
99 xdst->u.rt.rt_dst = rt->rt_dst;
100 xdst->u.rt.rt_gateway = rt->rt_gateway; 93 xdst->u.rt.rt_gateway = rt->rt_gateway;
101 xdst->u.rt.rt_pmtu = rt->rt_pmtu; 94 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
102 95