author		Eric Dumazet <dada1@cosmosbay.com>	2008-01-31 20:05:09 -0500
committer	David S. Miller <davem@davemloft.net>	2008-01-31 22:28:27 -0500
commit		29e75252da20f3ab9e132c68c9aed156b87beae6 (patch)
tree		affd152c959eede937b50f6054a303a388a88545
parent		174ce0483198b9dffd712fdd7d53635954fddffe (diff)
[IPV4] route cache: Introduce rt_genid for smooth cache invalidation
The current IP route cache implementation is not suited to large caches. We can consume a lot of CPU when the cache must be invalidated, since we currently need to evict all cache entries, and this eviction is sometimes asynchronous. min_delay & max_delay can somewhat control this asynchronous behavior, but the whole thing is a kludge that regularly triggers the infamous soft lockup messages. When entries are still in use, this also consumes a lot of RAM, filling dst_garbage.list.

A better scheme is to put a generation identifier on each entry, so that the cache can be invalidated by changing the table identifier alone, without having to scan all entries. No more delayed flushing, no more stalling when secret_interval expires.

Invalidated entries are then freed at GC time (controlled by ip_rt_gc_timeout or stress), or when an invalidated entry is found in a chain while an insert is done. Thus we keep a normal equilibrium.

This patch:
- renames rt_hash_rnd to rt_genid (and makes it an atomic_t)
- adds a new rt_genid field to 'struct rtable' (filling a hole on 64bit)
- checks entry->rt_genid at the appropriate places:
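For readers unfamiliar with the pattern, here is a minimal, single-threaded userspace sketch of the generation-counter scheme described above; it is not code from the patch. Each entry records the generation it was inserted under, a flush only bumps the global counter, and stale entries are skipped at lookup time and reclaimed lazily while a chain is walked. All names (cache_genid, cache_entry, cache_lookup, ...) are made up for the illustration, and the sketch deliberately ignores the RCU and per-bucket locking the real route cache relies on.

#include <stdio.h>
#include <stdlib.h>

static unsigned int cache_genid;	/* plays the role of rt_genid */

struct cache_entry {
	struct cache_entry *next;
	unsigned int genid;		/* generation at insert time */
	int key;
	int value;
};

static struct cache_entry *bucket;	/* a single hash chain, for brevity */

static void cache_insert(int key, int value)
{
	struct cache_entry *e = malloc(sizeof(*e));

	e->genid = cache_genid;
	e->key = key;
	e->value = value;
	e->next = bucket;
	bucket = e;
}

/* Lookup skips stale entries and frees them on the way (lazy cleanup). */
static struct cache_entry *cache_lookup(int key)
{
	struct cache_entry **pprev = &bucket, *e;

	while ((e = *pprev) != NULL) {
		if (e->genid != cache_genid) {
			*pprev = e->next;	/* unlink and reclaim stale entry */
			free(e);
			continue;
		}
		if (e->key == key)
			return e;
		pprev = &e->next;
	}
	return NULL;
}

/* "Flushing" the whole cache is O(1): no chain walk, just a new generation. */
static void cache_invalidate(void)
{
	cache_genid++;
}

int main(void)
{
	cache_insert(1, 100);
	cache_insert(2, 200);
	cache_invalidate();	/* both entries are now logically gone */
	cache_insert(2, 201);	/* re-inserted under the new generation */

	printf("key 1: %s\n", cache_lookup(1) ? "hit" : "miss");	/* miss */
	printf("key 2: %d\n", cache_lookup(2)->value);			/* 201 */
	return 0;
}

This O(1) invalidation is why the patch can drop min_delay/max_delay, rt_flush_timer and the delayed-flush machinery: nothing ever has to walk the whole table just to invalidate it, and stale entries are picked up later by rt_check_expire(), rt_garbage_collect() or an insert into the same chain.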
-rw-r--r--	Documentation/filesystems/proc.txt	5
-rw-r--r--	include/linux/sysctl.h	4
-rw-r--r--	include/net/route.h	1
-rw-r--r--	net/ipv4/route.c	209
4 files changed, 92 insertions, 127 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 4413a2d4646f..11fe51c036bf 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1919,11 +1919,6 @@ max_size
 Maximum size of the routing cache. Old entries will be purged once the cache
 reached has this size.
 
-max_delay, min_delay
---------------------
-
-Delays for flushing the routing cache.
-
 redirect_load, redirect_number
 ------------------------------
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 89faebfe48b8..bf4ae4e138f7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -440,8 +440,8 @@ enum
 
 enum {
 	NET_IPV4_ROUTE_FLUSH=1,
-	NET_IPV4_ROUTE_MIN_DELAY=2,
-	NET_IPV4_ROUTE_MAX_DELAY=3,
+	NET_IPV4_ROUTE_MIN_DELAY=2, /* obsolete since 2.6.25 */
+	NET_IPV4_ROUTE_MAX_DELAY=3, /* obsolete since 2.6.25 */
 	NET_IPV4_ROUTE_GC_THRESH=4,
 	NET_IPV4_ROUTE_MAX_SIZE=5,
 	NET_IPV4_ROUTE_GC_MIN_INTERVAL=6,
diff --git a/include/net/route.h b/include/net/route.h
index fcc6d5b35863..eadad5901429 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -62,6 +62,7 @@ struct rtable
 
 	struct in_device	*idev;
 
+	int			rt_genid;
 	unsigned		rt_flags;
 	__u16			rt_type;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 163086b2f058..8842ecb9be48 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -117,8 +117,6 @@
 
 #define RT_GC_TIMEOUT (300*HZ)
 
-static int ip_rt_min_delay = 2 * HZ;
-static int ip_rt_max_delay = 10 * HZ;
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
 static int ip_rt_gc_interval = 60 * HZ;
@@ -133,12 +131,9 @@ static int ip_rt_mtu_expires = 10 * 60 * HZ;
 static int ip_rt_min_pmtu = 512 + 20 + 20;
 static int ip_rt_min_advmss = 256;
 static int ip_rt_secret_interval = 10 * 60 * HZ;
-static int ip_rt_flush_expected;
-static unsigned long rt_deadline;
 
 #define RTprint(a...) printk(KERN_DEBUG a)
 
-static struct timer_list rt_flush_timer;
 static void rt_worker_func(struct work_struct *work);
 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 static struct timer_list rt_secret_timer;
@@ -260,19 +255,16 @@ static inline void rt_hash_lock_init(void)
 static struct rt_hash_bucket *rt_hash_table;
 static unsigned rt_hash_mask;
 static unsigned int rt_hash_log;
-static unsigned int rt_hash_rnd;
+static atomic_t rt_genid;
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) \
 	(__raw_get_cpu_var(rt_cache_stat).field++)
 
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
-			  struct rtable **res);
-
 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 {
-	return (jhash_2words(daddr, saddr, rt_hash_rnd)
-		& rt_hash_mask);
+	return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
+		& rt_hash_mask;
 }
 
 #define rt_hash(daddr, saddr, idx) \
@@ -282,27 +274,28 @@ static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
 	int bucket;
+	int genid;
 };
 
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
+static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
 {
 	struct rtable *r = NULL;
-	struct rt_cache_iter_state *st = seq->private;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
-		if (r)
-			break;
+		r = rcu_dereference(rt_hash_table[st->bucket].chain);
+		while (r) {
+			if (r->rt_genid == st->genid)
+				return r;
+			r = rcu_dereference(r->u.dst.rt_next);
+		}
 		rcu_read_unlock_bh();
 	}
-	return rcu_dereference(r);
+	return r;
 }
 
-static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
+static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
 {
-	struct rt_cache_iter_state *st = seq->private;
-
 	r = r->u.dst.rt_next;
 	while (!r) {
 		rcu_read_unlock_bh();
@@ -314,29 +307,38 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 	return rcu_dereference(r);
 }
 
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
+static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
 {
-	struct rtable *r = rt_cache_get_first(seq);
+	struct rtable *r = rt_cache_get_first(st);
 
 	if (r)
-		while (pos && (r = rt_cache_get_next(seq, r)))
+		while (pos && (r = rt_cache_get_next(st, r))) {
+			if (r->rt_genid != st->genid)
+				continue;
 			--pos;
+		}
 	return pos ? NULL : r;
 }
 
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+	struct rt_cache_iter_state *st = seq->private;
+
+	if (*pos)
+		return rt_cache_get_idx(st, *pos - 1);
+	st->genid = atomic_read(&rt_genid);
+	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r = NULL;
+	struct rtable *r;
+	struct rt_cache_iter_state *st = seq->private;
 
 	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(seq);
+		r = rt_cache_get_first(st);
 	else
-		r = rt_cache_get_next(seq, v);
+		r = rt_cache_get_next(st, v);
 	++*pos;
 	return r;
 }
@@ -709,6 +711,11 @@ static void rt_check_expire(void)
 			continue;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
+			if (rth->rt_genid != atomic_read(&rt_genid)) {
+				*rthp = rth->u.dst.rt_next;
+				rt_free(rth);
+				continue;
+			}
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
@@ -733,83 +740,45 @@ static void rt_check_expire(void)
 
 /*
  * rt_worker_func() is run in process context.
- * If a whole flush was scheduled, it is done.
- * Else, we call rt_check_expire() to scan part of the hash table
+ * we call rt_check_expire() to scan part of the hash table
  */
 static void rt_worker_func(struct work_struct *work)
 {
-	if (ip_rt_flush_expected) {
-		ip_rt_flush_expected = 0;
-		rt_do_flush(1);
-	} else
-		rt_check_expire();
+	rt_check_expire();
 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 }
 
-/* This can run from both BH and non-BH contexts, the latter
- * in the case of a forced flush event.
+/*
+ * Pertubation of rt_genid by a small quantity [1..256]
+ * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
+ * many times (2^24) without giving recent rt_genid.
+ * Jenkins hash is strong enough that litle changes of rt_genid are OK.
  */
-static void rt_run_flush(unsigned long process_context)
+static void rt_cache_invalidate(void)
 {
-	rt_deadline = 0;
-
-	get_random_bytes(&rt_hash_rnd, 4);
+	unsigned char shuffle;
 
-	rt_do_flush(process_context);
+	get_random_bytes(&shuffle, sizeof(shuffle));
+	atomic_add(shuffle + 1U, &rt_genid);
 }
 
-static DEFINE_SPINLOCK(rt_flush_lock);
-
+/*
+ * delay < 0 : invalidate cache (fast : entries will be deleted later)
+ * delay >= 0 : invalidate & flush cache (can be long)
+ */
 void rt_cache_flush(int delay)
 {
-	unsigned long now = jiffies;
-	int user_mode = !in_softirq();
-
-	if (delay < 0)
-		delay = ip_rt_min_delay;
-
-	spin_lock_bh(&rt_flush_lock);
-
-	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
-		long tmo = (long)(rt_deadline - now);
-
-		/* If flush timer is already running
-		   and flush request is not immediate (delay > 0):
-
-		   if deadline is not achieved, prolongate timer to "delay",
-		   otherwise fire it at deadline time.
-		 */
-
-		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
-			tmo = 0;
-
-		if (delay > tmo)
-			delay = tmo;
-	}
-
-	if (delay <= 0) {
-		spin_unlock_bh(&rt_flush_lock);
-		rt_run_flush(user_mode);
-		return;
-	}
-
-	if (rt_deadline == 0)
-		rt_deadline = now + ip_rt_max_delay;
-
-	mod_timer(&rt_flush_timer, now+delay);
-	spin_unlock_bh(&rt_flush_lock);
+	rt_cache_invalidate();
+	if (delay >= 0)
+		rt_do_flush(!in_softirq());
 }
 
 /*
- * We change rt_hash_rnd and ask next rt_worker_func() invocation
- * to perform a flush in process context
+ * We change rt_genid and let gc do the cleanup
  */
 static void rt_secret_rebuild(unsigned long dummy)
 {
-	get_random_bytes(&rt_hash_rnd, 4);
-	ip_rt_flush_expected = 1;
-	cancel_delayed_work(&expires_work);
-	schedule_delayed_work(&expires_work, HZ/10);
+	rt_cache_invalidate();
 	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
@@ -886,7 +855,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
 			rthp = &rt_hash_table[k].chain;
 			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
-				if (!rt_may_expire(rth, tmo, expire)) {
+				if (rth->rt_genid == atomic_read(&rt_genid) &&
+					!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
 					continue;
@@ -967,6 +937,11 @@ restart:
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
+		if (rth->rt_genid != atomic_read(&rt_genid)) {
+			*rthp = rth->u.dst.rt_next;
+			rt_free(rth);
+			continue;
+		}
 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 			/* Put it first */
 			*rthp = rth->u.dst.rt_next;
@@ -1132,17 +1107,19 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 
 static void rt_del(unsigned hash, struct rtable *rt)
 {
-	struct rtable **rthp;
+	struct rtable **rthp, *aux;
 
+	rthp = &rt_hash_table[hash].chain;
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
-	for (rthp = &rt_hash_table[hash].chain; *rthp;
-	     rthp = &(*rthp)->u.dst.rt_next)
-		if (*rthp == rt) {
-			*rthp = rt->u.dst.rt_next;
-			rt_free(rt);
-			break;
+	while ((aux = *rthp) != NULL) {
+		if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
+			*rthp = aux->u.dst.rt_next;
+			rt_free(aux);
+			continue;
 		}
+		rthp = &aux->u.dst.rt_next;
+	}
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
@@ -1187,7 +1164,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			if (rth->fl.fl4_dst != daddr ||
 			    rth->fl.fl4_src != skeys[i] ||
 			    rth->fl.oif != ikeys[k] ||
-			    rth->fl.iif != 0) {
+			    rth->fl.iif != 0 ||
+			    rth->rt_genid != atomic_read(&rt_genid)) {
 				rthp = &rth->u.dst.rt_next;
 				continue;
 			}
@@ -1225,7 +1203,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			rt->u.dst.neighbour = NULL;
 			rt->u.dst.hh = NULL;
 			rt->u.dst.xfrm = NULL;
-
+			rt->rt_genid = atomic_read(&rt_genid);
 			rt->rt_flags |= RTCF_REDIRECTED;
 
 			/* Gateway is different ... */
@@ -1446,7 +1424,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 		    rth->rt_src == iph->saddr &&
 		    rth->fl.iif == 0 &&
 		    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
-		    rth->u.dst.dev->nd_net == net) {
+		    rth->u.dst.dev->nd_net == net &&
+		    rth->rt_genid == atomic_read(&rt_genid)) {
 			unsigned short mtu = new_mtu;
 
 			if (new_mtu < 68 || new_mtu >= old_mtu) {
@@ -1681,8 +1660,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->fl.oif	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
-	rth->rt_type	= RTN_MULTICAST;
+	rth->rt_genid	= atomic_read(&rt_genid);
 	rth->rt_flags	= RTCF_MULTICAST;
+	rth->rt_type	= RTN_MULTICAST;
 	if (our) {
 		rth->u.dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1821,6 +1801,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
 
 	rth->u.dst.input = ip_forward;
 	rth->u.dst.output = ip_output;
+	rth->rt_genid = atomic_read(&rt_genid);
 
 	rt_set_nexthop(rth, res, itag);
 
@@ -1981,6 +1962,7 @@ local_input:
 		goto e_nobufs;
 
 	rth->u.dst.output= ip_rt_bug;
+	rth->rt_genid = atomic_read(&rt_genid);
 
 	atomic_set(&rth->u.dst.__refcnt, 1);
 	rth->u.dst.flags= DST_HOST;
@@ -2072,7 +2054,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		    rth->fl.oif == 0 &&
 		    rth->fl.mark == skb->mark &&
 		    rth->fl.fl4_tos == tos &&
-		    rth->u.dst.dev->nd_net == net) {
+		    rth->u.dst.dev->nd_net == net &&
+		    rth->rt_genid == atomic_read(&rt_genid)) {
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
@@ -2200,6 +2183,7 @@ static inline int __mkroute_output(struct rtable **result,
 	rth->rt_spec_dst= fl->fl4_src;
 
 	rth->u.dst.output=ip_output;
+	rth->rt_genid = atomic_read(&rt_genid);
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
@@ -2472,7 +2456,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    rth->u.dst.dev->nd_net == net) {
+		    rth->u.dst.dev->nd_net == net &&
+		    rth->rt_genid == atomic_read(&rt_genid)) {
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
@@ -2527,6 +2512,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock
 		rt->idev = ort->idev;
 		if (rt->idev)
 			in_dev_hold(rt->idev);
+		rt->rt_genid = atomic_read(&rt_genid);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_dst = ort->rt_dst;
@@ -2781,6 +2767,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
 			if (idx < s_idx)
 				continue;
+			if (rt->rt_genid != atomic_read(&rt_genid))
+				continue;
 			skb->dst = dst_clone(&rt->u.dst);
 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
@@ -2850,24 +2838,6 @@ ctl_table ipv4_route_table[] = {
 		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
 	},
 	{
-		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
-		.procname	= "min_delay",
-		.data		= &ip_rt_min_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-	{
-		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
-		.procname	= "max_delay",
-		.data		= &ip_rt_max_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-	{
 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
 		.procname	= "gc_thresh",
 		.data		= &ipv4_dst_ops.gc_thresh,
@@ -3025,8 +2995,8 @@ int __init ip_rt_init(void)
 {
 	int rc = 0;
 
-	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
-			     (jiffies ^ (jiffies >> 7)));
+	atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
+			     (jiffies ^ (jiffies >> 7))));
 
 #ifdef CONFIG_NET_CLS_ROUTE
 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
@@ -3059,7 +3029,6 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
-	setup_timer(&rt_flush_timer, rt_run_flush, 0);
 	setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
 
 	/* All the timers, started at system startup tend