author     David S. Miller <davem@davemloft.net>    2012-07-17 14:00:09 -0400
committer  David S. Miller <davem@davemloft.net>    2012-07-20 16:30:27 -0400
commit     89aef8921bfbac22f00e04f8450f6e447db13e42 (patch)
tree       4ff3885262d0f05af367c119528780b5d8d172ff
parent     fa0afcd10951afad2022dda09777d2bf70cdab3d (diff)
ipv4: Delete routing cache.
The ipv4 routing cache is non-deterministic, performance-wise, and is subject to reasonably easy-to-launch denial of service attacks.

The routing cache works great for well-behaved traffic, and the world was a much friendlier place when the tradeoffs that led to the routing cache's design were considered.

What it boils down to is that the performance of the routing cache is a product of the traffic patterns seen by a system rather than a product of the contents of the routing tables, and the former is controllable by external entities. Even for "well-behaved" legitimate traffic, high-volume sites can see routing cache hit rates of only ~10%.

Signed-off-by: David S. Miller <davem@davemloft.net>
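Illustration (editor's note, not part of the commit): the sensitivity to traffic patterns follows from how the cache bucketed entries — the rt_hash() helper deleted below hashes the packet's destination and source addresses, so remote senders choose the hash inputs. The stand-alone C sketch below is a rough approximation only; the mixing function, table size, and addresses are assumptions standing in for the kernel's jhash_3words() and rt_hash_mask. It shows how an outside party can search for source addresses that all land in one bucket, which is what made chain lengths, and therefore lookup cost, externally controllable.

/* Illustrative sketch only -- NOT part of the patch.  The mixing function,
 * table size and addresses below are assumptions, not the kernel's code. */
#include <stdint.h>
#include <stdio.h>

#define RT_HASH_MASK 0xffffu            /* pretend the cache has 64K buckets */

/* crude stand-in for jhash_3words(): any deterministic mix works here */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
        a += seed; b ^= a; c += b;
        a = (a << 13) | (a >> 19);
        return (a ^ b) + c;
}

/* same shape as the deleted rt_hash(): the bucket index is derived from
 * fields a remote sender controls (daddr, saddr), plus ifindex and genid */
static unsigned int rt_hash(uint32_t daddr, uint32_t saddr, int idx, int genid)
{
        return mix3(daddr, saddr, (uint32_t)idx, (uint32_t)genid) & RT_HASH_MASK;
}

int main(void)
{
        unsigned int target = rt_hash(0x0a000001u, 0xc0a80001u, 2, 0x1234);
        int collisions = 0;
        uint32_t s;

        /* sweep a million candidate source addresses and count how many
         * would share the victim's bucket; each hit lengthens one chain */
        for (s = 0; s < 1000000u; s++)
                if (rt_hash(0x0a000001u, s, 2, 0x1234) == target)
                        collisions++;

        printf("sources hashing to bucket %u: %d\n", target, collisions);
        return 0;
}

The genid perturbation described in the comment retained further down changes the seed on cache flushes, but between flushes the bucket placement remained a pure function of attacker-visible header fields.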
-rw-r--r--   include/net/route.h       |   1
-rw-r--r--   net/ipv4/fib_frontend.c   |   5
-rw-r--r--   net/ipv4/route.c          | 940
3 files changed, 13 insertions, 933 deletions
diff --git a/include/net/route.h b/include/net/route.h
index ace3cb44251..5dcfeb621e0 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -109,7 +109,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
 struct in_device;
 extern int ip_rt_init(void);
 extern void rt_cache_flush(struct net *net, int how);
-extern void rt_cache_flush_batch(struct net *net);
 extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
                                            struct sock *sk);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b83203658ee..f277cf0e632 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1072,11 +1072,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
                 rt_cache_flush(dev_net(dev), 0);
                 break;
         case NETDEV_UNREGISTER_BATCH:
-                /* The batch unregister is only called on the first
-                 * device in the list of devices being unregistered.
-                 * Therefore we should not pass dev_net(dev) in here.
-                 */
-                rt_cache_flush_batch(NULL);
                 break;
         }
         return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d547f6fae20..6d6146d31f2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -133,10 +133,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
 static int ip_rt_mtu_expires __read_mostly     = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly        = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly      = 256;
-static int rt_chain_length_max __read_mostly   = 20;
-
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
 
 /*
  * Interface to generic destination cache.
@@ -152,7 +148,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb, u32 mtu);
 static void             ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
-static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                             int how)
@@ -172,7 +167,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 static struct dst_ops ipv4_dst_ops = {
         .family =               AF_INET,
         .protocol =             cpu_to_be16(ETH_P_IP),
-        .gc =                   rt_garbage_collect,
         .check =                ipv4_dst_check,
         .default_advmss =       ipv4_default_advmss,
         .mtu =                  ipv4_mtu,
@@ -209,184 +203,30 @@ const __u8 ip_tos2prio[16] = {
 };
 EXPORT_SYMBOL(ip_tos2prio);
 
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- *    as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- *    they do so with atomic increments and with the
- *    lock held.
- */
-
-struct rt_hash_bucket {
-        struct rtable __rcu *chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-        defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ        256
-#else
-# if NR_CPUS >= 32
-#  define RT_HASH_LOCK_SZ       4096
-# elif NR_CPUS >= 16
-#  define RT_HASH_LOCK_SZ       2048
-# elif NR_CPUS >= 8
-#  define RT_HASH_LOCK_SZ       1024
-# elif NR_CPUS >= 4
-#  define RT_HASH_LOCK_SZ       512
-# else
-#  define RT_HASH_LOCK_SZ       256
-# endif
-#endif
-
-static spinlock_t *rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
-        int i;
-
-        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
-                        GFP_KERNEL);
-        if (!rt_hash_locks)
-                panic("IP: failed to allocate rt_hash_locks\n");
-
-        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
-                spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket *rt_hash_table __read_mostly;
-static unsigned int rt_hash_mask __read_mostly;
-static unsigned int rt_hash_log __read_mostly;
-
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-                                   int genid)
-{
-        return jhash_3words((__force u32)daddr, (__force u32)saddr,
-                            idx, genid)
-                & rt_hash_mask;
-}
-
 static inline int rt_genid(struct net *net)
 {
         return atomic_read(&net->ipv4.rt_genid);
 }
 
 #ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
-        struct seq_net_private p;
-        int bucket;
-        int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
-        struct rt_cache_iter_state *st = seq->private;
-        struct rtable *r = NULL;
-
-        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-                if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
-                        continue;
-                rcu_read_lock_bh();
-                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-                while (r) {
-                        if (dev_net(r->dst.dev) == seq_file_net(seq) &&
-                            r->rt_genid == st->genid)
-                                return r;
-                        r = rcu_dereference_bh(r->dst.rt_next);
-                }
-                rcu_read_unlock_bh();
-        }
-        return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
-                                          struct rtable *r)
-{
-        struct rt_cache_iter_state *st = seq->private;
-
-        r = rcu_dereference_bh(r->dst.rt_next);
-        while (!r) {
-                rcu_read_unlock_bh();
-                do {
-                        if (--st->bucket < 0)
-                                return NULL;
-                } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
-                rcu_read_lock_bh();
-                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-        }
-        return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
-                                        struct rtable *r)
-{
-        struct rt_cache_iter_state *st = seq->private;
-        while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-                if (dev_net(r->dst.dev) != seq_file_net(seq))
-                        continue;
-                if (r->rt_genid == st->genid)
-                        break;
-        }
-        return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
-        struct rtable *r = rt_cache_get_first(seq);
-
-        if (r)
-                while (pos && (r = rt_cache_get_next(seq, r)))
-                        --pos;
-        return pos ? NULL : r;
-}
-
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-        struct rt_cache_iter_state *st = seq->private;
         if (*pos)
-                return rt_cache_get_idx(seq, *pos - 1);
-        st->genid = rt_genid(seq_file_net(seq));
+                return NULL;
         return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-        struct rtable *r;
-
-        if (v == SEQ_START_TOKEN)
-                r = rt_cache_get_first(seq);
-        else
-                r = rt_cache_get_next(seq, v);
         ++*pos;
-        return r;
+        return NULL;
 }
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
-        if (v && v != SEQ_START_TOKEN)
-                rcu_read_unlock_bh();
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -396,24 +236,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
396 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 236 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
397 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 237 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
398 "HHUptod\tSpecDst"); 238 "HHUptod\tSpecDst");
399 else {
400 struct rtable *r = v;
401 int len;
402
403 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
404 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
405 r->dst.dev ? r->dst.dev->name : "*",
406 (__force u32)r->rt_dst,
407 (__force u32)r->rt_gateway,
408 r->rt_flags, atomic_read(&r->dst.__refcnt),
409 r->dst.__use, 0, (__force u32)r->rt_src,
410 dst_metric_advmss(&r->dst) + 40,
411 dst_metric(&r->dst, RTAX_WINDOW), 0,
412 r->rt_key_tos,
413 -1, 0, 0, &len);
414
415 seq_printf(seq, "%*s\n", 127 - len, "");
416 }
417 return 0; 239 return 0;
418} 240}
419 241
@@ -426,8 +248,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-        return seq_open_net(inode, file, &rt_cache_seq_ops,
-                        sizeof(struct rt_cache_iter_state));
+        return seq_open(file, &rt_cache_seq_ops);
 }
 
 static const struct file_operations rt_cache_seq_fops = {
@@ -435,7 +256,7 @@ static const struct file_operations rt_cache_seq_fops = {
         .open    = rt_cache_seq_open,
         .read    = seq_read,
         .llseek  = seq_lseek,
-        .release = seq_release_net,
+        .release = seq_release,
 };
 
 
@@ -625,263 +446,12 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void rt_free(struct rtable *rt)
-{
-        call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
-        ip_rt_put(rt);
-        call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
-        /* Kill broadcast/multicast entries very aggresively, if they
-           collide in hash table with more useful entries */
-        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-                rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
-        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-                rth->dst.expires;
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
-        unsigned long age;
-        int ret = 0;
-
-        if (atomic_read(&rth->dst.__refcnt))
-                goto out;
-
-        age = jiffies - rth->dst.lastuse;
-        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
-            (age <= tmo2 && rt_valuable(rth)))
-                goto out;
-        ret = 1;
-out:    return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-        u32 score = jiffies - rt->dst.lastuse;
-
-        score = ~score & ~(3<<30);
-
-        if (rt_valuable(rt))
-                score |= (1<<31);
-
-        if (rt_is_output_route(rt) ||
-            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-                score |= (1<<30);
-
-        return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
-        return net->ipv4.current_rt_cache_rebuild_count <=
-                net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct rtable *rt1,
-                                       const struct rtable *rt2)
-{
-        return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-                ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-                (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
-}
-
-static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
-{
-        return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
-                ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
-                (rt1->rt_mark ^ rt2->rt_mark) |
-                (rt1->rt_key_tos ^ rt2->rt_key_tos) |
-                (rt1->rt_route_iif ^ rt2->rt_route_iif) |
-                (rt1->rt_oif ^ rt2->rt_oif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
-        return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
 static inline int rt_is_expired(struct rtable *rth)
 {
         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
-        unsigned int i;
-        struct rtable *rth, *next;
-
-        for (i = 0; i <= rt_hash_mask; i++) {
-                struct rtable __rcu **pprev;
-                struct rtable *list;
-
-                if (process_context && need_resched())
-                        cond_resched();
-                rth = rcu_access_pointer(rt_hash_table[i].chain);
-                if (!rth)
-                        continue;
-
-                spin_lock_bh(rt_hash_lock_addr(i));
-
-                list = NULL;
-                pprev = &rt_hash_table[i].chain;
-                rth = rcu_dereference_protected(*pprev,
-                        lockdep_is_held(rt_hash_lock_addr(i)));
-
-                while (rth) {
-                        next = rcu_dereference_protected(rth->dst.rt_next,
-                                lockdep_is_held(rt_hash_lock_addr(i)));
-
-                        if (!net ||
-                            net_eq(dev_net(rth->dst.dev), net)) {
-                                rcu_assign_pointer(*pprev, next);
-                                rcu_assign_pointer(rth->dst.rt_next, list);
-                                list = rth;
-                        } else {
-                                pprev = &rth->dst.rt_next;
-                        }
-                        rth = next;
-                }
-
-                spin_unlock_bh(rt_hash_lock_addr(i));
-
-                for (; list; list = next) {
-                        next = rcu_dereference_protected(list->dst.rt_next, 1);
-                        rt_free(list);
-                }
-        }
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
-        const struct rtable *aux = head;
-
-        while (aux != rth) {
-                if (compare_hash_inputs(aux, rth))
-                        return 0;
-                aux = rcu_dereference_protected(aux->dst.rt_next, 1);
-        }
-        return ONE;
-}
-
-static void rt_check_expire(void)
-{
-        static unsigned int rover;
-        unsigned int i = rover, goal;
-        struct rtable *rth;
-        struct rtable __rcu **rthp;
-        unsigned long samples = 0;
-        unsigned long sum = 0, sum2 = 0;
-        unsigned long delta;
-        u64 mult;
-
-        delta = jiffies - expires_ljiffies;
-        expires_ljiffies = jiffies;
-        mult = ((u64)delta) << rt_hash_log;
-        if (ip_rt_gc_timeout > 1)
-                do_div(mult, ip_rt_gc_timeout);
-        goal = (unsigned int)mult;
-        if (goal > rt_hash_mask)
-                goal = rt_hash_mask + 1;
-        for (; goal > 0; goal--) {
-                unsigned long tmo = ip_rt_gc_timeout;
-                unsigned long length;
-
-                i = (i + 1) & rt_hash_mask;
-                rthp = &rt_hash_table[i].chain;
-
-                if (need_resched())
-                        cond_resched();
-
-                samples++;
-
-                if (rcu_dereference_raw(*rthp) == NULL)
-                        continue;
-                length = 0;
-                spin_lock_bh(rt_hash_lock_addr(i));
-                while ((rth = rcu_dereference_protected(*rthp,
-                        lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-                        prefetch(rth->dst.rt_next);
-                        if (rt_is_expired(rth) ||
-                            rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-                                *rthp = rth->dst.rt_next;
-                                rt_free(rth);
-                                continue;
-                        }
-
-                        /* We only count entries on a chain with equal
-                         * hash inputs once so that entries for
-                         * different QOS levels, and other non-hash
-                         * input attributes don't unfairly skew the
-                         * length computation
-                         */
-                        tmo >>= 1;
-                        rthp = &rth->dst.rt_next;
-                        length += has_noalias(rt_hash_table[i].chain, rth);
-                }
-                spin_unlock_bh(rt_hash_lock_addr(i));
-                sum += length;
-                sum2 += length*length;
-        }
-        if (samples) {
-                unsigned long avg = sum / samples;
-                unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-                rt_chain_length_max = max_t(unsigned long,
-                                        ip_rt_gc_elasticity,
-                                        (avg + 4*sd) >> FRACT_BITS);
-        }
-        rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-        rt_check_expire();
-        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
-/*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
  * many times (2^24) without giving recent rt_genid.
@@ -902,167 +472,6 @@ static void rt_cache_invalidate(struct net *net)
 void rt_cache_flush(struct net *net, int delay)
 {
         rt_cache_invalidate(net);
-        if (delay >= 0)
-                rt_do_flush(net, !in_softirq());
-}
-
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
-        rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
-        net_warn_ratelimited("Route hash chain too long!\n");
-        rt_cache_invalidate(net);
-}
-
-/*
-   Short description of GC goals.
-
-   We want to build algorithm, which will keep routing cache
-   at some equilibrium point, when number of aged off entries
-   is kept approximately equal to newly generated ones.
-
-   Current expiration strength is variable "expire".
-   We try to adjust it dynamically, so that if networking
-   is idle expires is large enough to keep enough of warm entries,
-   and when load increases it reduces to limit cache size.
- */
-
-static int rt_garbage_collect(struct dst_ops *ops)
-{
-        static unsigned long expire = RT_GC_TIMEOUT;
-        static unsigned long last_gc;
-        static int rover;
-        static int equilibrium;
-        struct rtable *rth;
-        struct rtable __rcu **rthp;
-        unsigned long now = jiffies;
-        int goal;
-        int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
-        /*
-         * Garbage collection is pretty expensive,
-         * do not make it too frequently.
-         */
-
-        RT_CACHE_STAT_INC(gc_total);
-
-        if (now - last_gc < ip_rt_gc_min_interval &&
-            entries < ip_rt_max_size) {
-                RT_CACHE_STAT_INC(gc_ignored);
-                goto out;
-        }
-
-        entries = dst_entries_get_slow(&ipv4_dst_ops);
-        /* Calculate number of entries, which we want to expire now. */
-        goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
-        if (goal <= 0) {
-                if (equilibrium < ipv4_dst_ops.gc_thresh)
-                        equilibrium = ipv4_dst_ops.gc_thresh;
-                goal = entries - equilibrium;
-                if (goal > 0) {
-                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-                        goal = entries - equilibrium;
-                }
-        } else {
-                /* We are in dangerous area. Try to reduce cache really
-                 * aggressively.
-                 */
-                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-                equilibrium = entries - goal;
-        }
-
-        if (now - last_gc >= ip_rt_gc_min_interval)
-                last_gc = now;
-
-        if (goal <= 0) {
-                equilibrium += goal;
-                goto work_done;
-        }
-
-        do {
-                int i, k;
-
-                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
-                        unsigned long tmo = expire;
-
-                        k = (k + 1) & rt_hash_mask;
-                        rthp = &rt_hash_table[k].chain;
-                        spin_lock_bh(rt_hash_lock_addr(k));
-                        while ((rth = rcu_dereference_protected(*rthp,
-                                        lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
-                                if (!rt_is_expired(rth) &&
-                                        !rt_may_expire(rth, tmo, expire)) {
-                                        tmo >>= 1;
-                                        rthp = &rth->dst.rt_next;
-                                        continue;
-                                }
-                                *rthp = rth->dst.rt_next;
-                                rt_free(rth);
-                                goal--;
-                        }
-                        spin_unlock_bh(rt_hash_lock_addr(k));
-                        if (goal <= 0)
-                                break;
-                }
-                rover = k;
-
-                if (goal <= 0)
-                        goto work_done;
-
-                /* Goal is not achieved. We stop process if:
-
-                   - if expire reduced to zero. Otherwise, expire is halfed.
-                   - if table is not full.
-                   - if we are called from interrupt.
-                   - jiffies check is just fallback/debug loop breaker.
-                     We will not spin here for long time in any case.
-                 */
-
-                RT_CACHE_STAT_INC(gc_goal_miss);
-
-                if (expire == 0)
-                        break;
-
-                expire >>= 1;
-
-                if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-                        goto out;
-        } while (!in_softirq() && time_before_eq(jiffies, now));
-
-        if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-                goto out;
-        if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
-                goto out;
-        net_warn_ratelimited("dst cache overflow\n");
-        RT_CACHE_STAT_INC(gc_dst_overflow);
-        return 1;
-
-work_done:
-        expire += ip_rt_gc_min_interval;
-        if (expire > ip_rt_gc_timeout ||
-            dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
-            dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
-                expire = ip_rt_gc_timeout;
-out:    return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
-        int length = 0;
-        const struct rtable *rth = head;
-
-        while (rth) {
-                length += has_noalias(head, rth);
-                rth = rcu_dereference_protected(rth->dst.rt_next, 1);
-        }
-        return length >> FRACT_BITS;
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -1086,139 +495,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
         return neigh_create(&arp_tbl, pkey, dev);
 }
 
-static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
-                                     struct sk_buff *skb, int ifindex)
-{
-        struct rtable *rth, *cand;
-        struct rtable __rcu **rthp, **candp;
-        unsigned long now;
-        u32 min_score;
-        int chain_length;
-
-restart:
-        chain_length = 0;
-        min_score = ~(u32)0;
-        cand = NULL;
-        candp = NULL;
-        now = jiffies;
-
-        if (!rt_caching(dev_net(rt->dst.dev)) || (rt->dst.flags & DST_NOCACHE)) {
-                /*
-                 * If we're not caching, just tell the caller we
-                 * were successful and don't touch the route.  The
-                 * caller hold the sole reference to the cache entry, and
-                 * it will be released when the caller is done with it.
-                 * If we drop it here, the callers have no way to resolve routes
-                 * when we're not caching.  Instead, just point *rp at rt, so
-                 * the caller gets a single use out of the route
-                 * Note that we do rt_free on this new route entry, so that
-                 * once its refcount hits zero, we are still able to reap it
-                 * (Thanks Alexey)
-                 * Note: To avoid expensive rcu stuff for this uncached dst,
-                 * we set DST_NOCACHE so that dst_release() can free dst without
-                 * waiting a grace period.
-                 */
-
-                rt->dst.flags |= DST_NOCACHE;
-                goto skip_hashing;
-        }
-
-        rthp = &rt_hash_table[hash].chain;
-
-        spin_lock_bh(rt_hash_lock_addr(hash));
-        while ((rth = rcu_dereference_protected(*rthp,
-                lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-                if (rt_is_expired(rth)) {
-                        *rthp = rth->dst.rt_next;
-                        rt_free(rth);
-                        continue;
-                }
-                if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
-                        /* Put it first */
-                        *rthp = rth->dst.rt_next;
-                        /*
-                         * Since lookup is lockfree, the deletion
-                         * must be visible to another weakly ordered CPU before
-                         * the insertion at the start of the hash chain.
-                         */
-                        rcu_assign_pointer(rth->dst.rt_next,
-                                           rt_hash_table[hash].chain);
-                        /*
-                         * Since lookup is lockfree, the update writes
-                         * must be ordered for consistency on SMP.
-                         */
-                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
-                        dst_use(&rth->dst, now);
-                        spin_unlock_bh(rt_hash_lock_addr(hash));
-
-                        rt_drop(rt);
-                        if (skb)
-                                skb_dst_set(skb, &rth->dst);
-                        return rth;
-                }
-
-                if (!atomic_read(&rth->dst.__refcnt)) {
-                        u32 score = rt_score(rth);
-
-                        if (score <= min_score) {
-                                cand = rth;
-                                candp = rthp;
-                                min_score = score;
-                        }
-                }
-
-                chain_length++;
-
-                rthp = &rth->dst.rt_next;
-        }
-
-        if (cand) {
-                /* ip_rt_gc_elasticity used to be average length of chain
-                 * length, when exceeded gc becomes really aggressive.
-                 *
-                 * The second limit is less certain. At the moment it allows
-                 * only 2 entries per bucket. We will see.
-                 */
-                if (chain_length > ip_rt_gc_elasticity) {
-                        *candp = cand->dst.rt_next;
-                        rt_free(cand);
-                }
-        } else {
-                if (chain_length > rt_chain_length_max &&
-                    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-                        struct net *net = dev_net(rt->dst.dev);
-                        int num = ++net->ipv4.current_rt_cache_rebuild_count;
-                        if (!rt_caching(net)) {
-                                pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
-                                        rt->dst.dev->name, num);
-                        }
-                        rt_emergency_hash_rebuild(net);
-                        spin_unlock_bh(rt_hash_lock_addr(hash));
-
-                        hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-                                        ifindex, rt_genid(net));
-                        goto restart;
-                }
-        }
-
-        rt->dst.rt_next = rt_hash_table[hash].chain;
-
-        /*
-         * Since lookup is lockfree, we must make sure
-         * previous writes to rt are committed to memory
-         * before making rt visible to other CPUS.
-         */
-        rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
-        spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
-        if (skb)
-                skb_dst_set(skb, &rt->dst);
-        return rt;
-}
-
 /*
  * Peer allocation may fail only in serious out-of-memory conditions. However
  * we still can generate some output.
@@ -1255,26 +531,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned int hash, struct rtable *rt)
-{
-        struct rtable __rcu **rthp;
-        struct rtable *aux;
-
-        rthp = &rt_hash_table[hash].chain;
-        spin_lock_bh(rt_hash_lock_addr(hash));
-        ip_rt_put(rt);
-        while ((aux = rcu_dereference_protected(*rthp,
-                        lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-                if (aux == rt || rt_is_expired(aux)) {
-                        *rthp = aux->dst.rt_next;
-                        rt_free(aux);
-                        continue;
-                }
-                rthp = &aux->dst.rt_next;
-        }
-        spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                              const struct iphdr *iph,
                              int oif, u8 tos,
@@ -1518,10 +774,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
                         ret = NULL;
                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                            rt->dst.expires) {
-                        unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
-                                                rt->rt_oif,
-                                                rt_genid(dev_net(dst->dev)));
-                        rt_del(hash, rt);
+                        ip_rt_put(rt);
                         ret = NULL;
                 }
         }
@@ -1969,7 +1222,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
                                     bool nopolicy, bool noxfrm)
 {
         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-                         DST_HOST |
+                         DST_HOST | DST_NOCACHE |
                          (nopolicy ? DST_NOPOLICY : 0) |
                          (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1978,7 +1231,6 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                              u8 tos, struct net_device *dev, int our)
 {
-        unsigned int hash;
         struct rtable *rth;
         struct in_device *in_dev = __in_dev_get_rcu(dev);
         u32 itag = 0;
@@ -2042,9 +1294,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
         RT_CACHE_STAT_INC(in_slow_mc);
 
-        hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-        rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
-        return IS_ERR(rth) ? PTR_ERR(rth) : 0;
+        skb_dst_set(skb, &rth->dst);
+        return 0;
 
 e_nobufs:
         return -ENOBUFS;
@@ -2176,7 +1427,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
         struct rtable *rth = NULL;
         int err;
-        unsigned int hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
         if (res->fi && res->fi->fib_nhs > 1)
@@ -2188,12 +1438,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
         if (err)
                 return err;
 
-        /* put it into the cache */
-        hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
-                       rt_genid(dev_net(rth->dst.dev)));
-        rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
-        if (IS_ERR(rth))
-                return PTR_ERR(rth);
+        skb_dst_set(skb, &rth->dst);
         return 0;
 }
 
@@ -2217,7 +1462,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
         unsigned int flags = 0;
         u32 itag = 0;
         struct rtable *rth;
-        unsigned int hash;
         int err = -EINVAL;
         struct net *net = dev_net(dev);
 
@@ -2339,11 +1583,8 @@ local_input:
                 rth->dst.error= -err;
                 rth->rt_flags   &= ~RTCF_LOCAL;
         }
-        hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
-        rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
+        skb_dst_set(skb, &rth->dst);
         err = 0;
-        if (IS_ERR(rth))
-                err = PTR_ERR(rth);
         goto out;
 
 no_route:
@@ -2382,46 +1623,10 @@ martian_source_keep_err:
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                           u8 tos, struct net_device *dev, bool noref)
 {
-        struct rtable *rth;
-        unsigned int hash;
-        int iif = dev->ifindex;
-        struct net *net;
         int res;
 
-        net = dev_net(dev);
-
         rcu_read_lock();
 
-        if (!rt_caching(net))
-                goto skip_cache;
-
-        tos &= IPTOS_RT_MASK;
-        hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
-        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-             rth = rcu_dereference(rth->dst.rt_next)) {
-                if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
-                     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
-                     (rth->rt_route_iif ^ iif) |
-                     (rth->rt_key_tos ^ tos)) == 0 &&
-                    rth->rt_mark == skb->mark &&
-                    net_eq(dev_net(rth->dst.dev), net) &&
-                    !rt_is_expired(rth)) {
-                        if (noref) {
-                                dst_use_noref(&rth->dst, jiffies);
-                                skb_dst_set_noref(skb, &rth->dst);
-                        } else {
-                                dst_use(&rth->dst, jiffies);
-                                skb_dst_set(skb, &rth->dst);
-                        }
-                        RT_CACHE_STAT_INC(in_hit);
-                        rcu_read_unlock();
-                        return 0;
-                }
-                RT_CACHE_STAT_INC(in_hlist_search);
-        }
-
-skip_cache:
         /* Multicast recognition logic is moved from route cache to here.
            The problem was that too many Ethernet cards have broken/missing
            hardware multicast filters :-( As result the host on multicasting
@@ -2563,10 +1768,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 /*
  * Major route resolver routine.
- * called with rcu_read_lock();
  */
 
-static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 {
         struct net_device *dev_out = NULL;
         __u8 tos = RT_FL_TOS(fl4);
@@ -2746,57 +1950,11 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 make_route:
         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
                                tos, dev_out, flags);
-        if (!IS_ERR(rth)) {
-                unsigned int hash;
-
-                hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
-                               rt_genid(dev_net(dev_out)));
-                rth = rt_intern_hash(hash, rth, NULL, orig_oif);
-        }
 
 out:
         rcu_read_unlock();
         return rth;
 }
-
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
-{
-        struct rtable *rth;
-        unsigned int hash;
-
-        if (!rt_caching(net))
-                goto slow_output;
-
-        hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
-
-        rcu_read_lock_bh();
-        for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-                rth = rcu_dereference_bh(rth->dst.rt_next)) {
-                if (rth->rt_key_dst == flp4->daddr &&
-                    rth->rt_key_src == flp4->saddr &&
-                    rt_is_output_route(rth) &&
-                    rth->rt_oif == flp4->flowi4_oif &&
-                    rth->rt_mark == flp4->flowi4_mark &&
-                    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
-                            (IPTOS_RT_MASK | RTO_ONLINK)) &&
-                    net_eq(dev_net(rth->dst.dev), net) &&
-                    !rt_is_expired(rth)) {
-                        dst_use(&rth->dst, jiffies);
-                        RT_CACHE_STAT_INC(out_hit);
-                        rcu_read_unlock_bh();
-                        if (!flp4->saddr)
-                                flp4->saddr = rth->rt_src;
-                        if (!flp4->daddr)
-                                flp4->daddr = rth->rt_dst;
-                        return rth;
-                }
-                RT_CACHE_STAT_INC(out_hlist_search);
-        }
-        rcu_read_unlock_bh();
-
-slow_output:
-        return ip_route_output_slow(net, flp4);
-}
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -3106,43 +2264,6 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
-        struct rtable *rt;
-        int h, s_h;
-        int idx, s_idx;
-        struct net *net;
-
-        net = sock_net(skb->sk);
-
-        s_h = cb->args[0];
-        if (s_h < 0)
-                s_h = 0;
-        s_idx = idx = cb->args[1];
-        for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
-                if (!rt_hash_table[h].chain)
-                        continue;
-                rcu_read_lock_bh();
-                for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-                     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
-                        if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
-                                continue;
-                        if (rt_is_expired(rt))
-                                continue;
-                        skb_dst_set_noref(skb, &rt->dst);
-                        if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
-                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-                                         1, NLM_F_MULTI) <= 0) {
-                                skb_dst_drop(skb);
-                                rcu_read_unlock_bh();
-                                goto done;
-                        }
-                        skb_dst_drop(skb);
-                }
-                rcu_read_unlock_bh();
-        }
-
-done:
-        cb->args[0] = h;
-        cb->args[1] = idx;
         return skb->len;
 }
 
@@ -3376,22 +2497,6 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_IP_ROUTE_CLASSID */
 
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
-        ssize_t ret;
-
-        if (!str)
-                return 0;
-
-        ret = kstrtoul(str, 0, &rhash_entries);
-        if (ret)
-                return 0;
-
-        return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
 int __init ip_rt_init(void)
 {
         int rc = 0;
@@ -3414,31 +2519,12 @@ int __init ip_rt_init(void)
         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
 
-        rt_hash_table = (struct rt_hash_bucket *)
-                alloc_large_system_hash("IP route cache",
-                                        sizeof(struct rt_hash_bucket),
-                                        rhash_entries,
-                                        (totalram_pages >= 128 * 1024) ?
-                                        15 : 17,
-                                        0,
-                                        &rt_hash_log,
-                                        &rt_hash_mask,
-                                        0,
-                                        rhash_entries ? 0 : 512 * 1024);
-        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-        rt_hash_lock_init();
-
-        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-        ip_rt_max_size = (rt_hash_mask + 1) * 16;
+        ipv4_dst_ops.gc_thresh = ~0;
+        ip_rt_max_size = INT_MAX;
 
         devinet_init();
         ip_fib_init();
 
-        INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-        expires_ljiffies = jiffies;
-        schedule_delayed_work(&expires_work,
-                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
         if (ip_rt_proc_init())
                 pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM