 include/net/route.h     |   1 -
 net/ipv4/fib_frontend.c |   5 -----
 net/ipv4/route.c        | 940 +-----------------------------------------------
 3 files changed, 13 insertions(+), 933 deletions(-)
diff --git a/include/net/route.h b/include/net/route.h
index ace3cb442519..5dcfeb621e06 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 struct in_device;
 extern int ip_rt_init(void);
 extern void rt_cache_flush(struct net *net, int how);
-extern void rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
                                            struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b83203658ee3..f277cf0e6321 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1072,11 +1072,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
         rt_cache_flush(dev_net(dev), 0);
         break;
     case NETDEV_UNREGISTER_BATCH:
-        /* The batch unregister is only called on the first
-         * device in the list of devices being unregistered.
-         * Therefore we should not pass dev_net(dev) in here.
-         */
-        rt_cache_flush_batch(NULL);
         break;
     }
     return NOTIFY_DONE;
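With the hash table gone, a flush is now purely a bump of the per-netns route generation counter (see rt_cache_flush() in net/ipv4/route.c below), so there is genuinely nothing left to do for NETDEV_UNREGISTER_BATCH. A minimal sketch of the resulting notifier shape — the per-event FIB bookkeeping is elided, and the payload convention is the pre-3.11 one where ptr is the device itself:

    static int fib_netdev_event(struct notifier_block *this,
                                unsigned long event, void *ptr)
    {
        struct net_device *dev = ptr;

        switch (event) {
        case NETDEV_DOWN:
            /* ... FIB bookkeeping elided ... */
            rt_cache_flush(dev_net(dev), 0);  /* just a genid bump now */
            break;
        case NETDEV_UNREGISTER_BATCH:
            break;  /* nothing left to flush in bulk */
        }
        return NOTIFY_DONE;
    }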
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d547f6fae20d..6d6146d31f22 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly = 256;
-static int rt_chain_length_max __read_mostly = 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
 
 /*
  * Interface to generic destination cache.
@@ -152,7 +148,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu);
 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb);
-static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                             int how)
@@ -172,7 +167,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 static struct dst_ops ipv4_dst_ops = {
     .family =           AF_INET,
     .protocol =         cpu_to_be16(ETH_P_IP),
-    .gc =               rt_garbage_collect,
     .check =            ipv4_dst_check,
     .default_advmss =   ipv4_default_advmss,
     .mtu =              ipv4_mtu,
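Dropping the .gc hook is safe because dst_alloc() only calls it once the live-entry count crosses ops->gc_thresh, and this patch also lifts gc_thresh to ~0 in ip_rt_init() below. A rough paraphrase of the allocation-side check in net/core/dst.c from this era (simplified sketch, not part of the patch):

    void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                    int initial_ref, int initial_obsolete, int flags)
    {
        if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
            if (ops->gc(ops))   /* nonzero return means "table full" */
                return NULL;    /* allocation refused */
        }
        /* ... actual allocation and initialization ... */
    }

With .gc NULL the branch is never taken, so route allocation is bounded only by ip_rt_max_size.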
@@ -209,184 +203,30 @@ const __u8 ip_tos2prio[16] = {
 };
 EXPORT_SYMBOL(ip_tos2prio);
 
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- *    as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- *    they do so with atomic increments and with the
- *    lock held.
- */
-
-struct rt_hash_bucket {
-    struct rtable __rcu *chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-    defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ    256
-#else
-# if NR_CPUS >= 32
-#  define RT_HASH_LOCK_SZ   4096
-# elif NR_CPUS >= 16
-#  define RT_HASH_LOCK_SZ   2048
-# elif NR_CPUS >= 8
-#  define RT_HASH_LOCK_SZ   1024
-# elif NR_CPUS >= 4
-#  define RT_HASH_LOCK_SZ   512
-# else
-#  define RT_HASH_LOCK_SZ   256
-# endif
-#endif
-
-static spinlock_t   *rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
-    int i;
-
-    rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
-                            GFP_KERNEL);
-    if (!rt_hash_locks)
-        panic("IP: failed to allocate rt_hash_locks\n");
-
-    for (i = 0; i < RT_HASH_LOCK_SZ; i++)
-        spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket    *rt_hash_table __read_mostly;
-static unsigned int             rt_hash_mask __read_mostly;
-static unsigned int             rt_hash_log __read_mostly;
-
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-                                   int genid)
-{
-    return jhash_3words((__force u32)daddr, (__force u32)saddr,
-                        idx, genid)
-        & rt_hash_mask;
-}
-
 static inline int rt_genid(struct net *net)
 {
     return atomic_read(&net->ipv4.rt_genid);
 }
 
 #ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
-    struct seq_net_private p;
-    int bucket;
-    int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
-    struct rt_cache_iter_state *st = seq->private;
-    struct rtable *r = NULL;
-
-    for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-        if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
-            continue;
-        rcu_read_lock_bh();
-        r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-        while (r) {
-            if (dev_net(r->dst.dev) == seq_file_net(seq) &&
-                r->rt_genid == st->genid)
-                return r;
-            r = rcu_dereference_bh(r->dst.rt_next);
-        }
-        rcu_read_unlock_bh();
-    }
-    return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
-                                          struct rtable *r)
-{
-    struct rt_cache_iter_state *st = seq->private;
-
-    r = rcu_dereference_bh(r->dst.rt_next);
-    while (!r) {
-        rcu_read_unlock_bh();
-        do {
-            if (--st->bucket < 0)
-                return NULL;
-        } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
-        rcu_read_lock_bh();
-        r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-    }
-    return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
-                                        struct rtable *r)
-{
-    struct rt_cache_iter_state *st = seq->private;
-    while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-        if (dev_net(r->dst.dev) != seq_file_net(seq))
-            continue;
-        if (r->rt_genid == st->genid)
-            break;
-    }
-    return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
-    struct rtable *r = rt_cache_get_first(seq);
-
-    if (r)
-        while (pos && (r = rt_cache_get_next(seq, r)))
-            --pos;
-    return pos ? NULL : r;
-}
-
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-    struct rt_cache_iter_state *st = seq->private;
     if (*pos)
-        return rt_cache_get_idx(seq, *pos - 1);
-    st->genid = rt_genid(seq_file_net(seq));
+        return NULL;
     return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-    struct rtable *r;
-
-    if (v == SEQ_START_TOKEN)
-        r = rt_cache_get_first(seq);
-    else
-        r = rt_cache_get_next(seq, v);
     ++*pos;
-    return r;
+    return NULL;
 }
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
-    if (v && v != SEQ_START_TOKEN)
-        rcu_read_unlock_bh();
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
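With start() handing out SEQ_START_TOKEN exactly once and next() unconditionally ending the walk, /proc/net/rt_cache becomes a header-only file kept for ABI compatibility. Reading it should now yield just the banner printed by rt_cache_seq_show() below, e.g.:

    $ cat /proc/net/rt_cache
    Iface   Destination     Gateway         Flags   ...     SpecDst
    $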
@@ -396,24 +236,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
            "HHUptod\tSpecDst");
-   else {
-       struct rtable *r = v;
-       int len;
-
-       seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
-                  "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-                  r->dst.dev ? r->dst.dev->name : "*",
-                  (__force u32)r->rt_dst,
-                  (__force u32)r->rt_gateway,
-                  r->rt_flags, atomic_read(&r->dst.__refcnt),
-                  r->dst.__use, 0, (__force u32)r->rt_src,
-                  dst_metric_advmss(&r->dst) + 40,
-                  dst_metric(&r->dst, RTAX_WINDOW), 0,
-                  r->rt_key_tos,
-                  -1, 0, 0, &len);
-
-       seq_printf(seq, "%*s\n", 127 - len, "");
-   }
    return 0;
 }
 
@@ -426,8 +248,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-    return seq_open_net(inode, file, &rt_cache_seq_ops,
-            sizeof(struct rt_cache_iter_state));
+    return seq_open(file, &rt_cache_seq_ops);
 }
 
 static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +256,7 @@ static const struct file_operations rt_cache_seq_fops = {
     .open    = rt_cache_seq_open,
     .read    = seq_read,
     .llseek  = seq_lseek,
-    .release = seq_release_net,
+    .release = seq_release,
 };
 
 
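seq_open_net()/seq_release_net() existed only to allocate and free the per-net rt_cache_iter_state; with that state deleted, the plain seq_open()/seq_release() pair is the correct replacement. For reference, a self-contained sketch of the same header-only seq_file pattern (hypothetical hdr_* names, not part of the patch):

    #include <linux/seq_file.h>
    #include <linux/fs.h>

    static void *hdr_seq_start(struct seq_file *seq, loff_t *pos)
    {
        return *pos ? NULL : SEQ_START_TOKEN;   /* one token, then EOF */
    }

    static void *hdr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    {
        ++*pos;
        return NULL;                            /* nothing after the header */
    }

    static void hdr_seq_stop(struct seq_file *seq, void *v)
    {
        /* no iterator state to unlock or free */
    }

    static int hdr_seq_show(struct seq_file *seq, void *v)
    {
        if (v == SEQ_START_TOKEN)
            seq_puts(seq, "ColumnA\tColumnB\n");
        return 0;
    }

    static const struct seq_operations hdr_seq_ops = {
        .start = hdr_seq_start,
        .next  = hdr_seq_next,
        .stop  = hdr_seq_stop,
        .show  = hdr_seq_show,
    };

    static int hdr_seq_open(struct inode *inode, struct file *file)
    {
        return seq_open(file, &hdr_seq_ops);    /* paired with seq_release */
    }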
@@ -625,263 +446,12 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void rt_free(struct rtable *rt)
-{
-    call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
-    ip_rt_put(rt);
-    call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
-    /* Kill broadcast/multicast entries very aggresively, if they
-       collide in hash table with more useful entries */
-    return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-        rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
-    return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-        rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
-    unsigned long age;
-    int ret = 0;
-
-    if (atomic_read(&rth->dst.__refcnt))
-        goto out;
-
-    age = jiffies - rth->dst.lastuse;
-    if ((age <= tmo1 && !rt_fast_clean(rth)) ||
-        (age <= tmo2 && rt_valuable(rth)))
-        goto out;
-    ret = 1;
-out:    return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-    u32 score = jiffies - rt->dst.lastuse;
-
-    score = ~score & ~(3<<30);
-
-    if (rt_valuable(rt))
-        score |= (1<<31);
-
-    if (rt_is_output_route(rt) ||
-        !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-        score |= (1<<30);
-
-    return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
-    return net->ipv4.current_rt_cache_rebuild_count <=
-        net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
-                                       const struct rtable *rt2)
-{
-    return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-            ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-            (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
-    return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-            ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-            (rt1->rt_mark ^ rt2->rt_mark) |
-            (rt1->rt_key_tos ^ rt2->rt_key_tos) |
-            (rt1->rt_route_iif ^ rt2->rt_route_iif) |
-            (rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
-    return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
 static inline int rt_is_expired(struct rtable *rth)
 {
     return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
-    unsigned int i;
-    struct rtable *rth, *next;
-
-    for (i = 0; i <= rt_hash_mask; i++) {
-        struct rtable __rcu **pprev;
-        struct rtable *list;
-
-        if (process_context && need_resched())
-            cond_resched();
-        rth = rcu_access_pointer(rt_hash_table[i].chain);
-        if (!rth)
-            continue;
-
-        spin_lock_bh(rt_hash_lock_addr(i));
-
-        list = NULL;
-        pprev = &rt_hash_table[i].chain;
-        rth = rcu_dereference_protected(*pprev,
-            lockdep_is_held(rt_hash_lock_addr(i)));
-
-        while (rth) {
-            next = rcu_dereference_protected(rth->dst.rt_next,
-                lockdep_is_held(rt_hash_lock_addr(i)));
-
-            if (!net ||
-                net_eq(dev_net(rth->dst.dev), net)) {
-                rcu_assign_pointer(*pprev, next);
-                rcu_assign_pointer(rth->dst.rt_next, list);
-                list = rth;
-            } else {
-                pprev = &rth->dst.rt_next;
-            }
-            rth = next;
-        }
-
-        spin_unlock_bh(rt_hash_lock_addr(i));
-
-        for (; list; list = next) {
-            next = rcu_dereference_protected(list->dst.rt_next, 1);
-            rt_free(list);
-        }
-    }
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- * rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
-    const struct rtable *aux = head;
-
-    while (aux != rth) {
-        if (compare_hash_inputs(aux, rth))
-            return 0;
-        aux = rcu_dereference_protected(aux->dst.rt_next, 1);
-    }
-    return ONE;
-}
-
-static void rt_check_expire(void)
-{
-    static unsigned int rover;
-    unsigned int i = rover, goal;
-    struct rtable *rth;
-    struct rtable __rcu **rthp;
-    unsigned long samples = 0;
-    unsigned long sum = 0, sum2 = 0;
-    unsigned long delta;
-    u64 mult;
-
-    delta = jiffies - expires_ljiffies;
-    expires_ljiffies = jiffies;
-    mult = ((u64)delta) << rt_hash_log;
-    if (ip_rt_gc_timeout > 1)
-        do_div(mult, ip_rt_gc_timeout);
-    goal = (unsigned int)mult;
-    if (goal > rt_hash_mask)
-        goal = rt_hash_mask + 1;
-    for (; goal > 0; goal--) {
-        unsigned long tmo = ip_rt_gc_timeout;
-        unsigned long length;
-
-        i = (i + 1) & rt_hash_mask;
-        rthp = &rt_hash_table[i].chain;
-
-        if (need_resched())
-            cond_resched();
-
-        samples++;
-
-        if (rcu_dereference_raw(*rthp) == NULL)
-            continue;
-        length = 0;
-        spin_lock_bh(rt_hash_lock_addr(i));
-        while ((rth = rcu_dereference_protected(*rthp,
-                    lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-            prefetch(rth->dst.rt_next);
-            if (rt_is_expired(rth) ||
-                rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-                *rthp = rth->dst.rt_next;
-                rt_free(rth);
-                continue;
-            }
-
-            /* We only count entries on a chain with equal
-             * hash inputs once so that entries for
-             * different QOS levels, and other non-hash
-             * input attributes don't unfairly skew the
-             * length computation
-             */
-            tmo >>= 1;
-            rthp = &rth->dst.rt_next;
-            length += has_noalias(rt_hash_table[i].chain, rth);
-        }
-        spin_unlock_bh(rt_hash_lock_addr(i));
-        sum += length;
-        sum2 += length*length;
-    }
-    if (samples) {
-        unsigned long avg = sum / samples;
-        unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-        rt_chain_length_max = max_t(unsigned long,
-                                    ip_rt_gc_elasticity,
-                                    (avg + 4*sd) >> FRACT_BITS);
-    }
-    rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-    rt_check_expire();
-    schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
-/*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
  * many times (2^24) without giving recent rt_genid.
@@ -902,167 +472,6 @@ static void rt_cache_invalidate(struct net *net)
 void rt_cache_flush(struct net *net, int delay)
 {
     rt_cache_invalidate(net);
-    if (delay >= 0)
-        rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
-    rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
-    net_warn_ratelimited("Route hash chain too long!\n");
-    rt_cache_invalidate(net);
-}
-
-/*
-   Short description of GC goals.
-
-   We want to build algorithm, which will keep routing cache
-   at some equilibrium point, when number of aged off entries
-   is kept approximately equal to newly generated ones.
-
-   Current expiration strength is variable "expire".
-   We try to adjust it dynamically, so that if networking
-   is idle expires is large enough to keep enough of warm entries,
-   and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
-    static unsigned long expire = RT_GC_TIMEOUT;
-    static unsigned long last_gc;
-    static int rover;
-    static int equilibrium;
-    struct rtable *rth;
-    struct rtable __rcu **rthp;
-    unsigned long now = jiffies;
-    int goal;
-    int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
-    /*
-     * Garbage collection is pretty expensive,
-     * do not make it too frequently.
-     */
-
-    RT_CACHE_STAT_INC(gc_total);
-
-    if (now - last_gc < ip_rt_gc_min_interval &&
-        entries < ip_rt_max_size) {
-        RT_CACHE_STAT_INC(gc_ignored);
-        goto out;
-    }
-
-    entries = dst_entries_get_slow(&ipv4_dst_ops);
-    /* Calculate number of entries, which we want to expire now. */
-    goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
-    if (goal <= 0) {
-        if (equilibrium < ipv4_dst_ops.gc_thresh)
-            equilibrium = ipv4_dst_ops.gc_thresh;
-        goal = entries - equilibrium;
-        if (goal > 0) {
-            equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-            goal = entries - equilibrium;
-        }
-    } else {
-        /* We are in dangerous area. Try to reduce cache really
-         * aggressively.
-         */
-        goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-        equilibrium = entries - goal;
-    }
-
-    if (now - last_gc >= ip_rt_gc_min_interval)
-        last_gc = now;
-
-    if (goal <= 0) {
-        equilibrium += goal;
-        goto work_done;
-    }
-
-    do {
-        int i, k;
-
-        for (i = rt_hash_mask, k = rover; i >= 0; i--) {
-            unsigned long tmo = expire;
-
-            k = (k + 1) & rt_hash_mask;
-            rthp = &rt_hash_table[k].chain;
-            spin_lock_bh(rt_hash_lock_addr(k));
-            while ((rth = rcu_dereference_protected(*rthp,
-                    lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
-                if (!rt_is_expired(rth) &&
-                    !rt_may_expire(rth, tmo, expire)) {
-                    tmo >>= 1;
-                    rthp = &rth->dst.rt_next;
-                    continue;
-                }
-                *rthp = rth->dst.rt_next;
-                rt_free(rth);
-                goal--;
-            }
-            spin_unlock_bh(rt_hash_lock_addr(k));
-            if (goal <= 0)
-                break;
-        }
-        rover = k;
-
-        if (goal <= 0)
-            goto work_done;
-
-        /* Goal is not achieved. We stop process if:
-
-           - if expire reduced to zero. Otherwise, expire is halfed.
-           - if table is not full.
-           - if we are called from interrupt.
-           - jiffies check is just fallback/debug loop breaker.
-             We will not spin here for long time in any case.
-         */
-
-        RT_CACHE_STAT_INC(gc_goal_miss);
-
-        if (expire == 0)
-            break;
-
-        expire >>= 1;
-
-        if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-            goto out;
-    } while (!in_softirq() && time_before_eq(jiffies, now));
-
-    if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-        goto out;
-    if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
-        goto out;
-    net_warn_ratelimited("dst cache overflow\n");
-    RT_CACHE_STAT_INC(gc_dst_overflow);
-    return 1;
-
-work_done:
-    expire += ip_rt_gc_min_interval;
-    if (expire > ip_rt_gc_timeout ||
-        dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
-        dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
-        expire = ip_rt_gc_timeout;
-out:    return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
-    int length = 0;
-    const struct rtable *rth = head;
-
-    while (rth) {
-        length += has_noalias(head, rth);
-        rth = rcu_dereference_protected(rth->dst.rt_next, 1);
-    }
-    return length >> FRACT_BITS;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
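All that remains of a flush is the generation bump described by the comment above. For reference, rt_cache_invalidate() — which this patch leaves in place — works roughly like this (sketch; some kernels of this era also invalidate the inetpeer tree here):

    static void rt_cache_invalidate(struct net *net)
    {
        unsigned char shuffle;

        get_random_bytes(&shuffle, sizeof(shuffle));
        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);  /* bump by [1..256] */
    }

Entries stamped with an older genid then fail the rt_is_expired() test and are discarded lazily wherever they are next encountered, which is why the eager rt_do_flush() walk could be deleted.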
@@ -1086,139 +495,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
     return neigh_create(&arp_tbl, pkey, dev);
 }
 
-static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
-                                     struct sk_buff *skb, int ifindex)
-{
-    struct rtable *rth, *cand;
-    struct rtable __rcu **rthp, **candp;
-    unsigned long now;
-    u32 min_score;
-    int chain_length;
-
-restart:
-    chain_length = 0;
-    min_score = ~(u32)0;
-    cand = NULL;
-    candp = NULL;
-    now = jiffies;
-
-    if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
-        /*
-         * If we're not caching, just tell the caller we
-         * were successful and don't touch the route.  The
-         * caller hold the sole reference to the cache entry, and
-         * it will be released when the caller is done with it.
-         * If we drop it here, the callers have no way to resolve routes
-         * when we're not caching.  Instead, just point *rp at rt, so
-         * the caller gets a single use out of the route
-         * Note that we do rt_free on this new route entry, so that
-         * once its refcount hits zero, we are still able to reap it
-         * (Thanks Alexey)
-         * Note: To avoid expensive rcu stuff for this uncached dst,
-         * we set DST_NOCACHE so that dst_release() can free dst without
-         * waiting a grace period.
-         */
-
-        rt->dst.flags |= DST_NOCACHE;
-        goto skip_hashing;
-    }
-
-    rthp = &rt_hash_table[hash].chain;
-
-    spin_lock_bh(rt_hash_lock_addr(hash));
-    while ((rth = rcu_dereference_protected(*rthp,
-            lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-        if (rt_is_expired(rth)) {
-            *rthp = rth->dst.rt_next;
-            rt_free(rth);
-            continue;
-        }
-        if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
-            /* Put it first */
-            *rthp = rth->dst.rt_next;
-            /*
-             * Since lookup is lockfree, the deletion
-             * must be visible to another weakly ordered CPU before
-             * the insertion at the start of the hash chain.
-             */
-            rcu_assign_pointer(rth->dst.rt_next,
-                               rt_hash_table[hash].chain);
-            /*
-             * Since lookup is lockfree, the update writes
-             * must be ordered for consistency on SMP.
-             */
-            rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
-            dst_use(&rth->dst, now);
-            spin_unlock_bh(rt_hash_lock_addr(hash));
-
-            rt_drop(rt);
-            if (skb)
-                skb_dst_set(skb, &rth->dst);
-            return rth;
-        }
-
-        if (!atomic_read(&rth->dst.__refcnt)) {
-            u32 score = rt_score(rth);
-
-            if (score <= min_score) {
-                cand = rth;
-                candp = rthp;
-                min_score = score;
-            }
-        }
-
-        chain_length++;
-
-        rthp = &rth->dst.rt_next;
-    }
-
-    if (cand) {
-        /* ip_rt_gc_elasticity used to be average length of chain
-         * length, when exceeded gc becomes really aggressive.
-         *
-         * The second limit is less certain. At the moment it allows
-         * only 2 entries per bucket. We will see.
-         */
-        if (chain_length > ip_rt_gc_elasticity) {
-            *candp = cand->dst.rt_next;
-            rt_free(cand);
-        }
-    } else {
-        if (chain_length > rt_chain_length_max &&
-            slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-            struct net *net = dev_net(rt->dst.dev);
-            int num = ++net->ipv4.current_rt_cache_rebuild_count;
-            if (!rt_caching(net)) {
-                pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
-                        rt->dst.dev->name, num);
-            }
-            rt_emergency_hash_rebuild(net);
-            spin_unlock_bh(rt_hash_lock_addr(hash));
-
-            hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-                           ifindex, rt_genid(net));
-            goto restart;
-        }
-    }
-
-    rt->dst.rt_next = rt_hash_table[hash].chain;
-
-    /*
-     * Since lookup is lockfree, we must make sure
-     * previous writes to rt are committed to memory
-     * before making rt visible to other CPUS.
-     */
-    rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
-    spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
-    if (skb)
-        skb_dst_set(skb, &rt->dst);
-    return rt;
-}
-
 /*
  * Peer allocation may fail only in serious out-of-memory conditions.  However
  * we still can generate some output.
@@ -1255,26 +531,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned int hash, struct rtable *rt)
-{
-    struct rtable __rcu **rthp;
-    struct rtable *aux;
-
-    rthp = &rt_hash_table[hash].chain;
-    spin_lock_bh(rt_hash_lock_addr(hash));
-    ip_rt_put(rt);
-    while ((aux = rcu_dereference_protected(*rthp,
-            lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-        if (aux == rt || rt_is_expired(aux)) {
-            *rthp = aux->dst.rt_next;
-            rt_free(aux);
-            continue;
-        }
-        rthp = &aux->dst.rt_next;
-    }
-    spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                              const struct iphdr *iph,
                              int oif, u8 tos,
@@ -1518,10 +774,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
             ret = NULL;
         } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                    rt->dst.expires) {
-            unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-                        rt->rt_oif,
-                        rt_genid(dev_net(dst->dev)));
-            rt_del(hash, rt);
+            ip_rt_put(rt);
             ret = NULL;
         }
     }
@@ -1969,7 +1222,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
                             bool nopolicy, bool noxfrm)
 {
     return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-                     DST_HOST |
+                     DST_HOST | DST_NOCACHE |
                      (nopolicy ? DST_NOPOLICY : 0) |
                      (noxfrm ? DST_NOXFRM : 0));
 }
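Because every route is now born with DST_NOCACHE, dst_release() may free it as soon as its refcount hits zero, instead of parking it for RCU reaping through the deleted rt_free() path. A rough paraphrase of the release-side logic in net/core/dst.c from this era (simplified, not part of the patch):

    void dst_release(struct dst_entry *dst)
    {
        if (dst) {
            int newrefcnt = atomic_dec_return(&dst->__refcnt);

            WARN_ON(newrefcnt < 0);
            if (dst->flags & DST_NOCACHE && newrefcnt == 0) {
                dst = dst_destroy(dst);  /* never published to other CPUs */
                if (dst)                 /* still busy: defer to dst gc */
                    __dst_free(dst);
            }
        }
    }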
@@ -1978,7 +1231,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                              u8 tos, struct net_device *dev, int our)
 {
-    unsigned int hash;
     struct rtable *rth;
     struct in_device *in_dev = __in_dev_get_rcu(dev);
     u32 itag = 0;
@@ -2042,9 +1294,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
     RT_CACHE_STAT_INC(in_slow_mc);
 
-    hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-    rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
-    return IS_ERR(rth) ? PTR_ERR(rth) : 0;
+    skb_dst_set(skb, &rth->dst);
+    return 0;
 
 e_nobufs:
     return -ENOBUFS;
@@ -2176,7 +1427,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
     struct rtable *rth = NULL;
     int err;
-    unsigned int hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
     if (res->fi && res->fi->fib_nhs > 1)
@@ -2188,12 +1438,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
     if (err)
         return err;
 
-    /* put it into the cache */
-    hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
-                   rt_genid(dev_net(rth->dst.dev)));
-    rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
-    if (IS_ERR(rth))
-        return PTR_ERR(rth);
+    skb_dst_set(skb, &rth->dst);
     return 0;
 }
 
@@ -2217,7 +1462,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
     unsigned int flags = 0;
     u32 itag = 0;
     struct rtable *rth;
-    unsigned int hash;
     int err = -EINVAL;
     struct net *net = dev_net(dev);
 
@@ -2339,11 +1583,8 @@ local_input:
         rth->dst.error= -err;
         rth->rt_flags &= ~RTCF_LOCAL;
     }
-    hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
-    rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+    skb_dst_set(skb, &rth->dst);
     err = 0;
-    if (IS_ERR(rth))
-        err = PTR_ERR(rth);
     goto out;
 
 no_route:
@@ -2382,46 +1623,10 @@ martian_source_keep_err:
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                           u8 tos, struct net_device *dev, bool noref)
 {
-    struct rtable *rth;
-    unsigned int hash;
-    int iif = dev->ifindex;
-    struct net *net;
     int res;
 
-    net = dev_net(dev);
-
     rcu_read_lock();
 
-    if (!rt_caching(net))
-        goto skip_cache;
-
-    tos &= IPTOS_RT_MASK;
-    hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
-    for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-         rth = rcu_dereference(rth->dst.rt_next)) {
-        if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
-             ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
-             (rth->rt_route_iif ^ iif) |
-             (rth->rt_key_tos ^ tos)) == 0 &&
-            rth->rt_mark == skb->mark &&
-            net_eq(dev_net(rth->dst.dev), net) &&
-            !rt_is_expired(rth)) {
-            if (noref) {
-                dst_use_noref(&rth->dst, jiffies);
-                skb_dst_set_noref(skb, &rth->dst);
-            } else {
-                dst_use(&rth->dst, jiffies);
-                skb_dst_set(skb, &rth->dst);
-            }
-            RT_CACHE_STAT_INC(in_hit);
-            rcu_read_unlock();
-            return 0;
-        }
-        RT_CACHE_STAT_INC(in_hlist_search);
-    }
-
-skip_cache:
     /* Multicast recognition logic is moved from route cache to here.
        The problem was that too many Ethernet cards have broken/missing
        hardware multicast filters :-( As result the host on multicasting
@@ -2563,10 +1768,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 /*
  * Major route resolver routine.
- * called with rcu_read_lock();
  */
 
-static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
     struct net_device *dev_out = NULL;
     __u8 tos = RT_FL_TOS(fl4);
@@ -2746,57 +1950,11 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 make_route:
     rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
                            tos, dev_out, flags);
-    if (!IS_ERR(rth)) {
-        unsigned int hash;
-
-        hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
-                       rt_genid(dev_net(dev_out)));
-        rth = rt_intern_hash(hash, rth, NULL, orig_oif);
-    }
 
 out:
     rcu_read_unlock();
     return rth;
 }
-
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
-{
-    struct rtable *rth;
-    unsigned int hash;
-
-    if (!rt_caching(net))
-        goto slow_output;
-
-    hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
-
-    rcu_read_lock_bh();
-    for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-        rth = rcu_dereference_bh(rth->dst.rt_next)) {
-        if (rth->rt_key_dst == flp4->daddr &&
-            rth->rt_key_src == flp4->saddr &&
-            rt_is_output_route(rth) &&
-            rth->rt_oif == flp4->flowi4_oif &&
-            rth->rt_mark == flp4->flowi4_mark &&
-            !((rth->rt_key_tos ^ flp4->flowi4_tos) &
-                (IPTOS_RT_MASK | RTO_ONLINK)) &&
-            net_eq(dev_net(rth->dst.dev), net) &&
-            !rt_is_expired(rth)) {
-            dst_use(&rth->dst, jiffies);
-            RT_CACHE_STAT_INC(out_hit);
-            rcu_read_unlock_bh();
-            if (!flp4->saddr)
-                flp4->saddr = rth->rt_src;
-            if (!flp4->daddr)
-                flp4->daddr = rth->rt_dst;
-            return rth;
-        }
-        RT_CACHE_STAT_INC(out_hlist_search);
-    }
-    rcu_read_unlock_bh();
-
-slow_output:
-    return ip_route_output_slow(net, flp4);
-}
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
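With the hash-lookup wrapper gone, __ip_route_output_key() is the full resolver, so every output lookup now pays the FIB-lookup cost. A hypothetical caller, using this era's flowi4 API:

    struct flowi4 fl4 = {
        .flowi4_oif = 0,
        .flowi4_tos = RT_TOS(tos),
        .daddr      = daddr,
        .saddr      = saddr,
    };
    struct rtable *rt = __ip_route_output_key(net, &fl4);

    if (IS_ERR(rt))
        return PTR_ERR(rt);
    /* ... use rt ... */
    ip_rt_put(rt);  /* with DST_NOCACHE, this can free the route at once */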
@@ -3106,43 +2264,6 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
-    struct rtable *rt;
-    int h, s_h;
-    int idx, s_idx;
-    struct net *net;
-
-    net = sock_net(skb->sk);
-
-    s_h = cb->args[0];
-    if (s_h < 0)
-        s_h = 0;
-    s_idx = idx = cb->args[1];
-    for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
-        if (!rt_hash_table[h].chain)
-            continue;
-        rcu_read_lock_bh();
-        for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-             rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
-            if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
-                continue;
-            if (rt_is_expired(rt))
-                continue;
-            skb_dst_set_noref(skb, &rt->dst);
-            if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
-                             cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-                             1, NLM_F_MULTI) <= 0) {
-                skb_dst_drop(skb);
-                rcu_read_unlock_bh();
-                goto done;
-            }
-            skb_dst_drop(skb);
-        }
-        rcu_read_unlock_bh();
-    }
-
-done:
-    cb->args[0] = h;
-    cb->args[1] = idx;
     return skb->len;
 }
 
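ip_rt_dump() backs RTM_GETROUTE dumps of the cache; returning skb->len without queueing any messages is the normal way to report an empty table, so userspace sees an empty listing rather than an error:

    $ ip route show cache
    $   # no output: there are no per-flow cache entries left to dump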
@@ -3376,22 +2497,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_IP_ROUTE_CLASSID */
 
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
-    ssize_t ret;
-
-    if (!str)
-        return 0;
-
-    ret = kstrtoul(str, 0, &rhash_entries);
-    if (ret)
-        return 0;
-
-    return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
 int __init ip_rt_init(void)
 {
     int rc = 0;
@@ -3414,31 +2519,12 @@ int __init ip_rt_init(void)
     if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
         panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
 
-    rt_hash_table = (struct rt_hash_bucket *)
-        alloc_large_system_hash("IP route cache",
-                                sizeof(struct rt_hash_bucket),
-                                rhash_entries,
-                                (totalram_pages >= 128 * 1024) ?
-                                15 : 17,
-                                0,
-                                &rt_hash_log,
-                                &rt_hash_mask,
-                                0,
-                                rhash_entries ? 0 : 512 * 1024);
-    memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-    rt_hash_lock_init();
-
-    ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-    ip_rt_max_size = (rt_hash_mask + 1) * 16;
+    ipv4_dst_ops.gc_thresh = ~0;
+    ip_rt_max_size = INT_MAX;
 
     devinet_init();
     ip_fib_init();
 
-    INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-    expires_ljiffies = jiffies;
-    schedule_delayed_work(&expires_work,
-            net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
     if (ip_rt_proc_init())
         pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM