Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--	net/ipv4/route.c	759
1 file changed, 359 insertions(+), 400 deletions(-)
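Note: the most substantive behavioural change in this diff is the alias-aware hash-chain accounting. rt_check_expire() and rt_intern_hash() now count chain entries that share the same hash inputs only once, via the new has_noalias() and slow_chain_length() helpers, so a chain padded with aliases that differ only in tos, mark or oif no longer triggers a needless emergency hash rebuild; when a rebuild does fire, rt_intern_hash() recomputes the bucket with its new ifindex argument and restarts. What follows is a standalone sketch of that accounting, not the kernel code itself: struct entry and same_hash_inputs() are simplified stand-ins for struct rtable and compare_hash_inputs().

/*
 * Standalone sketch of the alias-aware chain-length accounting
 * (simplified stand-in types; assumes nothing beyond libc).
 */
#include <stdio.h>

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

struct entry {				/* stand-in for struct rtable */
	unsigned int daddr, saddr, iif;	/* the hash inputs */
	struct entry *next;		/* stand-in for dst.rt_next */
};

/* stand-in for compare_hash_inputs(): true when hash inputs match */
static int same_hash_inputs(const struct entry *a, const struct entry *b)
{
	return a->daddr == b->daddr && a->saddr == b->saddr && a->iif == b->iif;
}

/* Returns ONE if no entry before e in the chain shares e's hash
 * inputs, 0 otherwise — mirrors has_noalias() in the patch. */
static unsigned long has_noalias(const struct entry *head, const struct entry *e)
{
	const struct entry *aux;

	for (aux = head; aux != e; aux = aux->next)
		if (same_hash_inputs(aux, e))
			return 0;
	return ONE;
}

/* Fixed-point sum scaled back down: the number of entries with
 * distinct hash inputs — mirrors slow_chain_length(). */
static unsigned long slow_chain_length(const struct entry *head)
{
	unsigned long length = 0;
	const struct entry *e;

	for (e = head; e; e = e->next)
		length += has_noalias(head, e);
	return length >> FRACT_BITS;
}

int main(void)
{
	struct entry c = { 1, 2, 3, NULL };
	struct entry b = { 1, 2, 3, &c };	/* alias of c: same inputs */
	struct entry a = { 9, 8, 7, &b };

	printf("distinct: %lu\n", slow_chain_length(&a));	/* prints 2 */
	return 0;
}

Against a three-entry chain whose last two entries share hash inputs, slow_chain_length() reports 2; in the patch it is this figure, not the raw chain length, that is compared with rt_chain_length_max before forcing a rebuild.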
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d62b05d33384..ac6559cb54f9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -90,6 +90,7 @@
 #include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/slab.h>
 #include <net/dst.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
@@ -128,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
-static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
 static int rt_chain_length_max __read_mostly	= 20;
 
 static struct delayed_work expires_work;
@@ -146,7 +146,6 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
-static void rt_emergency_hash_rebuild(struct net *net);
 
 
 static struct dst_ops ipv4_dst_ops = {
@@ -254,14 +253,12 @@ static unsigned rt_hash_mask __read_mostly;
 static unsigned int rt_hash_log  __read_mostly;
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) \
-	(__raw_get_cpu_var(rt_cache_stat).field++)
+#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 				   int genid)
 {
-	return jhash_3words((__force u32)(__be32)(daddr),
-			    (__force u32)(__be32)(saddr),
+	return jhash_3words((__force u32)daddr, (__force u32)saddr,
 			    idx, genid)
 		& rt_hash_mask;
 }
@@ -287,12 +284,12 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
 		if (!rt_hash_table[st->bucket].chain)
 			continue;
 		rcu_read_lock_bh();
-		r = rcu_dereference(rt_hash_table[st->bucket].chain);
+		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 		while (r) {
-			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
+			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 			    r->rt_genid == st->genid)
 				return r;
-			r = rcu_dereference(r->u.dst.rt_next);
+			r = rcu_dereference_bh(r->dst.rt_next);
 		}
 		rcu_read_unlock_bh();
 	}
@@ -304,7 +301,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 {
 	struct rt_cache_iter_state *st = seq->private;
 
-	r = r->u.dst.rt_next;
+	r = r->dst.rt_next;
 	while (!r) {
 		rcu_read_unlock_bh();
 		do {
@@ -314,7 +311,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 		rcu_read_lock_bh();
 		r = rt_hash_table[st->bucket].chain;
 	}
-	return rcu_dereference(r);
+	return rcu_dereference_bh(r);
 }
 
 static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -322,7 +319,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq,
 {
 	struct rt_cache_iter_state *st = seq->private;
 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
+		if (dev_net(r->dst.dev) != seq_file_net(seq))
 			continue;
 		if (r->rt_genid == st->genid)
 			break;
@@ -378,20 +375,21 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 	struct rtable *r = v;
 	int len;
 
-	seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
-		   "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-		   r->u.dst.dev ? r->u.dst.dev->name : "*",
-		   (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
-		   r->rt_flags, atomic_read(&r->u.dst.__refcnt),
-		   r->u.dst.__use, 0, (unsigned long)r->rt_src,
-		   (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
-		    (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
-		   dst_metric(&r->u.dst, RTAX_WINDOW),
-		   (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
-		   dst_metric(&r->u.dst, RTAX_RTTVAR)),
+	seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
+		   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+		   r->dst.dev ? r->dst.dev->name : "*",
+		   (__force u32)r->rt_dst,
+		   (__force u32)r->rt_gateway,
+		   r->rt_flags, atomic_read(&r->dst.__refcnt),
+		   r->dst.__use, 0, (__force u32)r->rt_src,
+		   (dst_metric(&r->dst, RTAX_ADVMSS) ?
+		    (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
+		   dst_metric(&r->dst, RTAX_WINDOW),
+		   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
+			 dst_metric(&r->dst, RTAX_RTTVAR)),
 		   r->fl.fl4_tos,
-		   r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
-		   r->u.dst.hh ? (r->u.dst.hh->hh_output ==
-				  dev_queue_xmit) : 0,
+		   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
+		   r->dst.hh ? (r->dst.hh->hh_output ==
+				dev_queue_xmit) : 0,
 		   r->rt_spec_dst, &len);
 
@@ -610,13 +608,13 @@ static inline int ip_rt_proc_init(void)
 
 static inline void rt_free(struct rtable *rt)
 {
-	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 }
 
 static inline void rt_drop(struct rtable *rt)
 {
 	ip_rt_put(rt);
-	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 }
 
 static inline int rt_fast_clean(struct rtable *rth)
@@ -624,13 +622,13 @@ static inline int rt_fast_clean(struct rtable *rth)
 	/* Kill broadcast/multicast entries very aggresively, if they
 	   collide in hash table with more useful entries */
 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rth->fl.iif && rth->u.dst.rt_next;
+		rth->fl.iif && rth->dst.rt_next;
 }
 
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->u.dst.expires;
+		rth->dst.expires;
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -638,15 +636,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
 	unsigned long age;
 	int ret = 0;
 
-	if (atomic_read(&rth->u.dst.__refcnt))
+	if (atomic_read(&rth->dst.__refcnt))
 		goto out;
 
 	ret = 1;
-	if (rth->u.dst.expires &&
-	    time_after_eq(jiffies, rth->u.dst.expires))
+	if (rth->dst.expires &&
+	    time_after_eq(jiffies, rth->dst.expires))
 		goto out;
 
-	age = jiffies - rth->u.dst.lastuse;
+	age = jiffies - rth->dst.lastuse;
 	ret = 0;
 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 	    (age <= tmo2 && rt_valuable(rth)))
@@ -662,7 +660,7 @@ out: return ret;
  */
 static inline u32 rt_score(struct rtable *rt)
 {
-	u32 score = jiffies - rt->u.dst.lastuse;
+	u32 score = jiffies - rt->dst.lastuse;
 
 	score = ~score & ~(3<<30);
 
@@ -685,30 +683,29 @@ static inline bool rt_caching(const struct net *net)
 static inline bool compare_hash_inputs(const struct flowi *fl1,
 					const struct flowi *fl2)
 {
-	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
-		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
+		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 		(fl1->iif ^ fl2->iif)) == 0);
 }
 
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
-	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
-		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
+	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
+		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
 		(fl1->mark ^ fl2->mark) |
-		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
-		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
+		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
 		(fl1->oif ^ fl2->oif) |
 		(fl1->iif ^ fl2->iif)) == 0;
 }
 
 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 {
-	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
+	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 }
 
 static inline int rt_is_expired(struct rtable *rth)
 {
-	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
+	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
@@ -737,7 +734,7 @@ static void rt_do_flush(int process_context)
 		rth = rt_hash_table[i].chain;
 
 		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail; tail = tail->u.dst.rt_next)
+		for (tail = rth; tail; tail = tail->dst.rt_next)
 			if (!rt_is_expired(tail))
 				break;
 		if (rth != tail)
@@ -746,9 +743,9 @@ static void rt_do_flush(int process_context)
 		/* call rt_free on entries after the tail requiring flush */
 		prev = &rt_hash_table[i].chain;
 		for (p = *prev; p; p = next) {
-			next = p->u.dst.rt_next;
+			next = p->dst.rt_next;
 			if (!rt_is_expired(p)) {
-				prev = &p->u.dst.rt_next;
+				prev = &p->dst.rt_next;
 			} else {
 				*prev = next;
 				rt_free(p);
@@ -763,7 +760,7 @@ static void rt_do_flush(int process_context)
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth != tail; rth = next) {
-			next = rth->u.dst.rt_next;
+			next = rth->dst.rt_next;
 			rt_free(rth);
 		}
 	}
@@ -780,11 +777,30 @@ static void rt_do_flush(int process_context)
 #define FRACT_BITS 3
 #define ONE (1UL << FRACT_BITS)
 
+/*
+ * Given a hash chain and an item in this hash chain,
+ * find if a previous entry has the same hash_inputs
+ * (but differs on tos, mark or oif)
+ * Returns 0 if an alias is found.
+ * Returns ONE if rth has no alias before itself.
+ */
+static int has_noalias(const struct rtable *head, const struct rtable *rth)
+{
+	const struct rtable *aux = head;
+
+	while (aux != rth) {
+		if (compare_hash_inputs(&aux->fl, &rth->fl))
+			return 0;
+		aux = aux->dst.rt_next;
+	}
+	return ONE;
+}
+
 static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
-	struct rtable *rth, *aux, **rthp;
+	struct rtable *rth, **rthp;
 	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	unsigned long delta;
@@ -815,18 +831,18 @@ static void rt_check_expire(void)
 		length = 0;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
-			prefetch(rth->u.dst.rt_next);
+			prefetch(rth->dst.rt_next);
 			if (rt_is_expired(rth)) {
-				*rthp = rth->u.dst.rt_next;
+				*rthp = rth->dst.rt_next;
 				rt_free(rth);
 				continue;
 			}
-			if (rth->u.dst.expires) {
+			if (rth->dst.expires) {
 				/* Entry is expired even if it is in use */
-				if (time_before_eq(jiffies, rth->u.dst.expires)) {
+				if (time_before_eq(jiffies, rth->dst.expires)) {
 nofree:
 					tmo >>= 1;
-					rthp = &rth->u.dst.rt_next;
+					rthp = &rth->dst.rt_next;
 					/*
 					 * We only count entries on
 					 * a chain with equal hash inputs once
@@ -835,22 +851,14 @@ nofree:
 					 * attributes don't unfairly skew
 					 * the length computation
 					 */
-					for (aux = rt_hash_table[i].chain;;) {
-						if (aux == rth) {
-							length += ONE;
-							break;
-						}
-						if (compare_hash_inputs(&aux->fl, &rth->fl))
-							break;
-						aux = aux->u.dst.rt_next;
-					}
+					length += has_noalias(rt_hash_table[i].chain, rth);
 					continue;
 				}
 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 				goto nofree;
 
 			/* Cleanup aged off entries. */
-			*rthp = rth->u.dst.rt_next;
+			*rthp = rth->dst.rt_next;
 			rt_free(rth);
 		}
 		spin_unlock_bh(rt_hash_lock_addr(i));
@@ -908,34 +916,11 @@ void rt_cache_flush_batch(void)
 	rt_do_flush(!in_softirq());
 }
 
-/*
- * We change rt_genid and let gc do the cleanup
- */
-static void rt_secret_rebuild(unsigned long __net)
-{
-	struct net *net = (struct net *)__net;
-	rt_cache_invalidate(net);
-	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
-}
-
-static void rt_secret_rebuild_oneshot(struct net *net)
-{
-	del_timer_sync(&net->ipv4.rt_secret_timer);
-	rt_cache_invalidate(net);
-	if (ip_rt_secret_interval) {
-		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
-		add_timer(&net->ipv4.rt_secret_timer);
-	}
-}
-
 static void rt_emergency_hash_rebuild(struct net *net)
 {
-	if (net_ratelimit()) {
+	if (net_ratelimit())
 		printk(KERN_WARNING "Route hash chain too long!\n");
-		printk(KERN_WARNING "Adjust your secret_interval!\n");
-	}
-
-	rt_secret_rebuild_oneshot(net);
+	rt_cache_invalidate(net);
 }
 
 /*
@@ -1014,10 +999,10 @@ static int rt_garbage_collect(struct dst_ops *ops)
 			if (!rt_is_expired(rth) &&
 			    !rt_may_expire(rth, tmo, expire)) {
 				tmo >>= 1;
-				rthp = &rth->u.dst.rt_next;
+				rthp = &rth->dst.rt_next;
 				continue;
 			}
-			*rthp = rth->u.dst.rt_next;
+			*rthp = rth->dst.rt_next;
 			rt_free(rth);
 			goal--;
 		}
@@ -1073,8 +1058,23 @@ work_done:
 out:	return 0;
 }
 
+/*
+ * Returns number of entries in a hash chain that have different hash_inputs
+ */
+static int slow_chain_length(const struct rtable *head)
+{
+	int length = 0;
+	const struct rtable *rth = head;
+
+	while (rth) {
+		length += has_noalias(head, rth);
+		rth = rth->dst.rt_next;
+	}
+	return length >> FRACT_BITS;
+}
+
 static int rt_intern_hash(unsigned hash, struct rtable *rt,
-			  struct rtable **rp, struct sk_buff *skb)
+			  struct rtable **rp, struct sk_buff *skb, int ifindex)
 {
 	struct rtable *rth, **rthp;
 	unsigned long now;
@@ -1090,7 +1090,7 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
-	if (!rt_caching(dev_net(rt->u.dst.dev))) {
+	if (!rt_caching(dev_net(rt->dst.dev))) {
 		/*
 		 * If we're not caching, just tell the caller we
 		 * were successful and don't touch the route.  The
@@ -1108,7 +1108,7 @@ restart:
 	 */
 
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
-		int err = arp_bind_neighbour(&rt->u.dst);
+		int err = arp_bind_neighbour(&rt->dst);
 		if (err) {
 			if (net_ratelimit())
 				printk(KERN_WARNING
@@ -1127,19 +1127,19 @@ restart:
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 		if (rt_is_expired(rth)) {
-			*rthp = rth->u.dst.rt_next;
+			*rthp = rth->dst.rt_next;
 			rt_free(rth);
 			continue;
 		}
 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 			/* Put it first */
-			*rthp = rth->u.dst.rt_next;
+			*rthp = rth->dst.rt_next;
 			/*
 			 * Since lookup is lockfree, the deletion
 			 * must be visible to another weakly ordered CPU before
 			 * the insertion at the start of the hash chain.
 			 */
-			rcu_assign_pointer(rth->u.dst.rt_next,
+			rcu_assign_pointer(rth->dst.rt_next,
 					   rt_hash_table[hash].chain);
 			/*
 			 * Since lookup is lockfree, the update writes
@@ -1147,18 +1147,18 @@ restart:
 			 */
 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
-			dst_use(&rth->u.dst, now);
+			dst_use(&rth->dst, now);
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			if (rp)
 				*rp = rth;
 			else
-				skb_dst_set(skb, &rth->u.dst);
+				skb_dst_set(skb, &rth->dst);
 			return 0;
 		}
 
-		if (!atomic_read(&rth->u.dst.__refcnt)) {
+		if (!atomic_read(&rth->dst.__refcnt)) {
 			u32 score = rt_score(rth);
 
 			if (score <= min_score) {
@@ -1170,7 +1170,7 @@ restart:
 
 		chain_length++;
 
-		rthp = &rth->u.dst.rt_next;
+		rthp = &rth->dst.rt_next;
 	}
 
 	if (cand) {
@@ -1181,18 +1181,24 @@ restart:
 		 * only 2 entries per bucket. We will see.
 		 */
 		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->u.dst.rt_next;
+			*candp = cand->dst.rt_next;
 			rt_free(cand);
 		}
 	} else {
-		if (chain_length > rt_chain_length_max) {
-			struct net *net = dev_net(rt->u.dst.dev);
+		if (chain_length > rt_chain_length_max &&
+		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
+			struct net *net = dev_net(rt->dst.dev);
 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
-			if (!rt_caching(dev_net(rt->u.dst.dev))) {
+			if (!rt_caching(net)) {
 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
-					rt->u.dst.dev->name, num);
+					rt->dst.dev->name, num);
 			}
-			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+			rt_emergency_hash_rebuild(net);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
+
+			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+				       ifindex, rt_genid(net));
+			goto restart;
 		}
 	}
 
@@ -1200,7 +1206,7 @@ restart:
 	   route or unicast forwarding path.
 	 */
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
-		int err = arp_bind_neighbour(&rt->u.dst);
+		int err = arp_bind_neighbour(&rt->dst);
 		if (err) {
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
@@ -1225,20 +1231,20 @@ restart:
 			}
 
 			if (net_ratelimit())
-				printk(KERN_WARNING "Neighbour table overflow.\n");
+				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
 			rt_drop(rt);
 			return -ENOBUFS;
 		}
 	}
 
-	rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	rt->dst.rt_next = rt_hash_table[hash].chain;
 
 #if RT_CACHE_DEBUG >= 2
-	if (rt->u.dst.rt_next) {
+	if (rt->dst.rt_next) {
 		struct rtable *trt;
 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
 		       hash, &rt->rt_dst);
-		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
+		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
 			printk(" . %pI4", &trt->rt_dst);
 		printk("\n");
 	}
@@ -1256,7 +1262,7 @@ skip_hashing:
 	if (rp)
 		*rp = rt;
 	else
-		skb_dst_set(skb, &rt->u.dst);
+		skb_dst_set(skb, &rt->dst);
 	return 0;
 }
 
@@ -1318,6 +1324,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 
 	ip_select_fb_ident(iph);
 }
+EXPORT_SYMBOL(__ip_select_ident);
 
 static void rt_del(unsigned hash, struct rtable *rt)
 {
@@ -1328,20 +1335,21 @@ static void rt_del(unsigned hash, struct rtable *rt)
 	ip_rt_put(rt);
 	while ((aux = *rthp) != NULL) {
 		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->u.dst.rt_next;
+			*rthp = aux->dst.rt_next;
 			rt_free(aux);
 			continue;
 		}
-		rthp = &aux->u.dst.rt_next;
+		rthp = &aux->dst.rt_next;
 	}
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
+/* called in rcu_read_lock() section */
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
 {
 	int i, k;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	struct rtable *rth, **rthp;
 	__be32 skeys[2] = { saddr, 0 };
 	int ikeys[2] = { dev->ifindex, 0 };
@@ -1377,7 +1385,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 
 			rthp=&rt_hash_table[hash].chain;
 
-			rcu_read_lock();
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
 				struct rtable *rt;
 
@@ -1386,44 +1393,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				    rth->fl.oif != ikeys[k] ||
 				    rth->fl.iif != 0 ||
 				    rt_is_expired(rth) ||
-				    !net_eq(dev_net(rth->u.dst.dev), net)) {
-					rthp = &rth->u.dst.rt_next;
+				    !net_eq(dev_net(rth->dst.dev), net)) {
+					rthp = &rth->dst.rt_next;
 					continue;
 				}
 
 				if (rth->rt_dst != daddr ||
 				    rth->rt_src != saddr ||
-				    rth->u.dst.error ||
+				    rth->dst.error ||
 				    rth->rt_gateway != old_gw ||
-				    rth->u.dst.dev != dev)
+				    rth->dst.dev != dev)
 					break;
 
-				dst_hold(&rth->u.dst);
-				rcu_read_unlock();
+				dst_hold(&rth->dst);
 
 				rt = dst_alloc(&ipv4_dst_ops);
 				if (rt == NULL) {
 					ip_rt_put(rth);
-					in_dev_put(in_dev);
 					return;
 				}
 
 				/* Copy all the information. */
 				*rt = *rth;
-				rt->u.dst.__use = 1;
-				atomic_set(&rt->u.dst.__refcnt, 1);
-				rt->u.dst.child = NULL;
-				if (rt->u.dst.dev)
-					dev_hold(rt->u.dst.dev);
+				rt->dst.__use = 1;
+				atomic_set(&rt->dst.__refcnt, 1);
+				rt->dst.child = NULL;
+				if (rt->dst.dev)
+					dev_hold(rt->dst.dev);
 				if (rt->idev)
 					in_dev_hold(rt->idev);
-				rt->u.dst.obsolete = 0;
-				rt->u.dst.lastuse = jiffies;
-				rt->u.dst.path = &rt->u.dst;
-				rt->u.dst.neighbour = NULL;
-				rt->u.dst.hh = NULL;
+				rt->dst.obsolete = -1;
+				rt->dst.lastuse = jiffies;
+				rt->dst.path = &rt->dst;
+				rt->dst.neighbour = NULL;
+				rt->dst.hh = NULL;
 #ifdef CONFIG_XFRM
-				rt->u.dst.xfrm = NULL;
+				rt->dst.xfrm = NULL;
 #endif
 				rt->rt_genid = rt_genid(net);
 				rt->rt_flags |= RTCF_REDIRECTED;
@@ -1432,37 +1437,35 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				rt->rt_gateway = new_gw;
 
 				/* Redirect received -> path was valid */
-				dst_confirm(&rth->u.dst);
+				dst_confirm(&rth->dst);
 
 				if (rt->peer)
 					atomic_inc(&rt->peer->refcnt);
 
-				if (arp_bind_neighbour(&rt->u.dst) ||
-				    !(rt->u.dst.neighbour->nud_state &
+				if (arp_bind_neighbour(&rt->dst) ||
+				    !(rt->dst.neighbour->nud_state &
 					    NUD_VALID)) {
-					if (rt->u.dst.neighbour)
-						neigh_event_send(rt->u.dst.neighbour, NULL);
+					if (rt->dst.neighbour)
+						neigh_event_send(rt->dst.neighbour, NULL);
 					ip_rt_put(rth);
 					rt_drop(rt);
 					goto do_next;
 				}
 
-				netevent.old = &rth->u.dst;
-				netevent.new = &rt->u.dst;
+				netevent.old = &rth->dst;
+				netevent.new = &rt->dst;
 				call_netevent_notifiers(NETEVENT_REDIRECT,
 							&netevent);
 
 				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt, NULL))
+				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
 					ip_rt_put(rt);
 				goto do_next;
 			}
-			rcu_read_unlock();
 		do_next:
 			;
 		}
 	}
-	in_dev_put(in_dev);
 	return;
 
 reject_redirect:
@@ -1473,7 +1476,7 @@ reject_redirect:
 			     &old_gw, dev->name, &new_gw,
 			     &saddr, &daddr);
 #endif
-	in_dev_put(in_dev);
+	;
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1482,11 +1485,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 	struct dst_entry *ret = dst;
 
 	if (rt) {
-		if (dst->obsolete) {
+		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   rt->u.dst.expires) {
+			   (rt->dst.expires &&
+			    time_after_eq(jiffies, rt->dst.expires))) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
@@ -1524,7 +1528,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	int log_martians;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 		rcu_read_unlock();
 		return;
@@ -1535,30 +1539,30 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	/* No redirected packets during ip_rt_redirect_silence;
 	 * reset the algorithm.
 	 */
-	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
-		rt->u.dst.rate_tokens = 0;
+	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
+		rt->dst.rate_tokens = 0;
 
 	/* Too many ignored redirects; do not send anything
-	 * set u.dst.rate_last to the last seen redirected packet.
+	 * set dst.rate_last to the last seen redirected packet.
 	 */
-	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
-		rt->u.dst.rate_last = jiffies;
+	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
+		rt->dst.rate_last = jiffies;
 		return;
 	}
 
 	/* Check for load limit; set rate_last to the latest sent
 	 * redirect.
 	 */
-	if (rt->u.dst.rate_tokens == 0 ||
+	if (rt->dst.rate_tokens == 0 ||
 	    time_after(jiffies,
-		       (rt->u.dst.rate_last +
-			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
+		       (rt->dst.rate_last +
+			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
-		rt->u.dst.rate_last = jiffies;
-		++rt->u.dst.rate_tokens;
+		rt->dst.rate_last = jiffies;
+		++rt->dst.rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
 		if (log_martians &&
-		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
+		    rt->dst.rate_tokens == ip_rt_redirect_number &&
 		    net_ratelimit())
 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
 				&rt->rt_src, rt->rt_iif,
@@ -1573,7 +1577,7 @@ static int ip_error(struct sk_buff *skb)
 	unsigned long now;
 	int code;
 
-	switch (rt->u.dst.error) {
+	switch (rt->dst.error) {
 	case EINVAL:
 	default:
 		goto out;
@@ -1582,7 +1586,7 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	case ENETUNREACH:
 		code = ICMP_NET_UNREACH;
-		IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+		IP_INC_STATS_BH(dev_net(rt->dst.dev),
 				IPSTATS_MIB_INNOROUTES);
 		break;
 	case EACCES:
@@ -1591,12 +1595,12 @@ static int ip_error(struct sk_buff *skb)
 	}
 
 	now = jiffies;
-	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
-	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
-		rt->u.dst.rate_tokens = ip_rt_error_burst;
-	rt->u.dst.rate_last = now;
-	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
-		rt->u.dst.rate_tokens -= ip_rt_error_cost;
+	rt->dst.rate_tokens += now - rt->dst.rate_last;
+	if (rt->dst.rate_tokens > ip_rt_error_burst)
+		rt->dst.rate_tokens = ip_rt_error_burst;
+	rt->dst.rate_last = now;
+	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
+		rt->dst.rate_tokens -= ip_rt_error_cost;
 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 	}
 
@@ -1641,7 +1645,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 
 			rcu_read_lock();
 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-			     rth = rcu_dereference(rth->u.dst.rt_next)) {
+			     rth = rcu_dereference(rth->dst.rt_next)) {
 				unsigned short mtu = new_mtu;
 
 				if (rth->fl.fl4_dst != daddr ||
@@ -1650,8 +1654,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 				    rth->rt_src != iph->saddr ||
 				    rth->fl.oif != ikeys[k] ||
 				    rth->fl.iif != 0 ||
-				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
-				    !net_eq(dev_net(rth->u.dst.dev), net) ||
+				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
+				    !net_eq(dev_net(rth->dst.dev), net) ||
 				    rt_is_expired(rth))
 					continue;
 
@@ -1659,22 +1663,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 
 					/* BSD 4.2 compatibility hack :-( */
 					if (mtu == 0 &&
-					    old_mtu >= dst_mtu(&rth->u.dst) &&
+					    old_mtu >= dst_mtu(&rth->dst) &&
 					    old_mtu >= 68 + (iph->ihl << 2))
 						old_mtu -= iph->ihl << 2;
 
 					mtu = guess_mtu(old_mtu);
 				}
-				if (mtu <= dst_mtu(&rth->u.dst)) {
-					if (mtu < dst_mtu(&rth->u.dst)) {
-						dst_confirm(&rth->u.dst);
+				if (mtu <= dst_mtu(&rth->dst)) {
+					if (mtu < dst_mtu(&rth->dst)) {
+						dst_confirm(&rth->dst);
 						if (mtu < ip_rt_min_pmtu) {
 							mtu = ip_rt_min_pmtu;
-							rth->u.dst.metrics[RTAX_LOCK-1] |=
+							rth->dst.metrics[RTAX_LOCK-1] |=
 								(1 << RTAX_MTU);
 						}
-						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
-						dst_set_expires(&rth->u.dst,
+						rth->dst.metrics[RTAX_MTU-1] = mtu;
+						dst_set_expires(&rth->dst,
 							ip_rt_mtu_expires);
 					}
 					est_mtu = mtu;
@@ -1702,7 +1706,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
-	return NULL;
+	if (rt_is_expired((struct rtable *)dst))
+		return NULL;
+	return dst;
 }
 
 static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1745,7 +1751,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
 
 	rt = skb_rtable(skb);
 	if (rt)
-		dst_set_expires(&rt->u.dst, 0);
+		dst_set_expires(&rt->dst, 0);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1773,11 +1779,11 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 
 	if (rt->fl.iif == 0)
 		src = rt->rt_src;
-	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
+	else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
 		src = FIB_RES_PREFSRC(res);
 		fib_res_put(&res);
 	} else
-		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+		src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
 					RT_SCOPE_UNIVERSE);
 	memcpy(addr, &src, 4);
 }
@@ -1785,10 +1791,10 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 #ifdef CONFIG_NET_CLS_ROUTE
 static void set_class_tag(struct rtable *rt, u32 tag)
 {
-	if (!(rt->u.dst.tclassid & 0xFFFF))
-		rt->u.dst.tclassid |= tag & 0xFFFF;
-	if (!(rt->u.dst.tclassid & 0xFFFF0000))
-		rt->u.dst.tclassid |= tag & 0xFFFF0000;
+	if (!(rt->dst.tclassid & 0xFFFF))
+		rt->dst.tclassid |= tag & 0xFFFF;
+	if (!(rt->dst.tclassid & 0xFFFF0000))
+		rt->dst.tclassid |= tag & 0xFFFF0000;
 }
 #endif
 
@@ -1800,30 +1806,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 		if (FIB_RES_GW(*res) &&
 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = FIB_RES_GW(*res);
-		memcpy(rt->u.dst.metrics, fi->fib_metrics,
-		       sizeof(rt->u.dst.metrics));
+		memcpy(rt->dst.metrics, fi->fib_metrics,
+		       sizeof(rt->dst.metrics));
 		if (fi->fib_mtu == 0) {
-			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
-			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
+			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
+			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
 			    rt->rt_gateway != rt->rt_dst &&
-			    rt->u.dst.dev->mtu > 576)
-				rt->u.dst.metrics[RTAX_MTU-1] = 576;
+			    rt->dst.dev->mtu > 576)
+				rt->dst.metrics[RTAX_MTU-1] = 576;
 		}
 #ifdef CONFIG_NET_CLS_ROUTE
-		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
 	} else
-		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
+		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
 
-	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
-		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
-		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
-	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
+		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
+		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
+		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
 						       ip_rt_min_advmss);
-	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
+		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
 
 #ifdef CONFIG_NET_CLS_ROUTE
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1834,14 +1840,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 	rt->rt_type = res->type;
 }
 
+/* called in rcu_read_lock() section */
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				u8 tos, struct net_device *dev, int our)
 {
-	unsigned hash;
+	unsigned int hash;
 	struct rtable *rth;
 	__be32 spec_dst;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
+	int err;
 
 	/* Primary sanity checks. */
 
@@ -1856,20 +1864,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-	} else if (fib_validate_source(saddr, 0, tos, 0,
-					dev, &spec_dst, &itag, 0) < 0)
-		goto e_inval;
-
+	} else {
+		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag, 0);
+		if (err < 0)
+			goto e_err;
+	}
 	rth = dst_alloc(&ipv4_dst_ops);
 	if (!rth)
 		goto e_nobufs;
 
-	rth->u.dst.output= ip_rt_bug;
+	rth->dst.output = ip_rt_bug;
+	rth->dst.obsolete = -1;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -1877,13 +1888,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
-	rth->u.dst.tclassid = itag;
+	rth->dst.tclassid = itag;
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= init_net.loopback_dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= init_net.loopback_dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->fl.oif	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
@@ -1891,27 +1902,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	if (our) {
-		rth->u.dst.input= ip_local_deliver;
+		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
 	}
 
 #ifdef CONFIG_IP_MROUTE
 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
-		rth->u.dst.input = ip_mr_input;
+		rth->dst.input = ip_mr_input;
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	in_dev_put(in_dev);
 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	return rt_intern_hash(hash, rth, NULL, skb);
+	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
 
 e_nobufs:
-	in_dev_put(in_dev);
 	return -ENOBUFS;
-
 e_inval:
-	in_dev_put(in_dev);
 	return -EINVAL;
+e_err:
+	return err;
 }
 
 
@@ -1945,22 +1954,22 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
+/* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
 			   struct fib_result *res,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos,
 			   struct rtable **result)
 {
-
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
-	unsigned flags = 0;
+	unsigned int flags = 0;
 	__be32 spec_dst;
 	u32 itag;
 
 	/* get a working reference to the output device */
-	out_dev = in_dev_get(FIB_RES_DEV(*res));
+	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
 	if (out_dev == NULL) {
 		if (net_ratelimit())
 			printk(KERN_CRIT "Bug in ip_route_input" \
@@ -1975,7 +1984,6 @@ static int __mkroute_input(struct sk_buff *skb,
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
 
-		err = -EINVAL;
 		goto cleanup;
 	}
 
@@ -1990,8 +1998,13 @@ static int __mkroute_input(struct sk_buff *skb,
 	if (skb->protocol != htons(ETH_P_IP)) {
 		/* Not IP (i.e. ARP). Do not create route, if it is
 		 * invalid for proxy arp. DNAT routes are always valid.
+		 *
+		 * Proxy arp feature have been extended to allow, ARP
+		 * replies back to the same interface, to support
+		 * Private VLAN switch technologies. See arp.c.
 		 */
-		if (out_dev == in_dev) {
+		if (out_dev == in_dev &&
+		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
 			err = -EINVAL;
 			goto cleanup;
 		}
@@ -2004,12 +2017,12 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
-		rth->u.dst.flags |= DST_NOXFRM;
+		rth->dst.flags |= DST_NOXFRM;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -2019,15 +2032,16 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= daddr;
 	rth->rt_iif 	=
 		rth->fl.iif	= in_dev->dev->ifindex;
-	rth->u.dst.dev	= (out_dev)->dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= (out_dev)->dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->fl.oif 	= 0;
 	rth->rt_spec_dst= spec_dst;
 
-	rth->u.dst.input = ip_forward;
-	rth->u.dst.output = ip_output;
-	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
+	rth->dst.obsolete = -1;
+	rth->dst.input = ip_forward;
+	rth->dst.output = ip_output;
+	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 
 	rt_set_nexthop(rth, res, itag);
 
@@ -2036,8 +2050,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	*result = rth;
 	err = 0;
  cleanup:
-	/* release the working reference to the output device */
-	in_dev_put(out_dev);
 	return err;
 }
 
@@ -2063,8 +2075,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
 
 	/* put it into the cache */
 	hash = rt_hash(daddr, saddr, fl->iif,
-		       rt_genid(dev_net(rth->u.dst.dev)));
-	return rt_intern_hash(hash, rth, NULL, skb);
+		       rt_genid(dev_net(rth->dst.dev)));
+	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
 }
 
 /*
@@ -2081,7 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			       u8 tos, struct net_device *dev)
 {
 	struct fib_result res;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	struct flowi fl = { .nl_u = { .ip4_u =
 			      { .daddr = daddr,
 				.saddr = saddr,
@@ -2141,13 +2153,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto brd_input;
 
 	if (res.type == RTN_LOCAL) {
-		int result;
-		result = fib_validate_source(saddr, daddr, tos,
+		err = fib_validate_source(saddr, daddr, tos,
 					     net->loopback_dev->ifindex,
 					     dev, &spec_dst, &itag, skb->mark);
-		if (result < 0)
-			goto martian_source;
-		if (result)
+		if (err < 0)
+			goto martian_source_keep_err;
+		if (err)
 			flags |= RTCF_DIRECTSRC;
 		spec_dst = daddr;
 		goto local_input;
@@ -2160,7 +2171,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
 done:
-	in_dev_put(in_dev);
 	if (free_res)
 		fib_res_put(&res);
 out:	return err;
@@ -2175,7 +2185,7 @@ brd_input:
 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
 					  &itag, skb->mark);
 		if (err < 0)
-			goto martian_source;
+			goto martian_source_keep_err;
 		if (err)
 			flags |= RTCF_DIRECTSRC;
 	}
@@ -2188,13 +2198,14 @@ local_input:
 	if (!rth)
 		goto e_nobufs;
 
-	rth->u.dst.output= ip_rt_bug;
+	rth->dst.output= ip_rt_bug;
+	rth->dst.obsolete = -1;
 	rth->rt_genid = rt_genid(net);
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -2202,25 +2213,25 @@ local_input:
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
-	rth->u.dst.tclassid = itag;
+	rth->dst.tclassid = itag;
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= net->loopback_dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= net->loopback_dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
-	rth->u.dst.input= ip_local_deliver;
+	rth->dst.input= ip_local_deliver;
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	if (res.type == RTN_UNREACHABLE) {
-		rth->u.dst.input= ip_error;
-		rth->u.dst.error= -err;
+		rth->dst.input= ip_error;
+		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
-	err = rt_intern_hash(hash, rth, NULL, skb);
+	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
 	goto done;
 
 no_route:
@@ -2255,46 +2266,54 @@ e_nobufs:
 	goto done;
 
 martian_source:
+	err = -EINVAL;
+martian_source_keep_err:
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
-	goto e_inval;
+	goto done;
 }
 
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		   u8 tos, struct net_device *dev)
+int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			  u8 tos, struct net_device *dev, bool noref)
 {
 	struct rtable * rth;
 	unsigned	hash;
 	int iif = dev->ifindex;
 	struct net *net;
+	int res;
 
 	net = dev_net(dev);
 
+	rcu_read_lock();
+
 	if (!rt_caching(net))
 		goto skip_cache;
 
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
-	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->u.dst.rt_next)) {
-		if (((rth->fl.fl4_dst ^ daddr) |
-		     (rth->fl.fl4_src ^ saddr) |
+	     rth = rcu_dereference(rth->dst.rt_next)) {
+		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
+		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
 		     (rth->fl.iif ^ iif) |
 		     rth->fl.oif |
 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
 		    rth->fl.mark == skb->mark &&
-		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			dst_use(&rth->u.dst, jiffies);
+			if (noref) {
+				dst_use_noref(&rth->dst, jiffies);
+				skb_dst_set_noref(skb, &rth->dst);
+			} else {
+				dst_use(&rth->dst, jiffies);
+				skb_dst_set(skb, &rth->dst);
+			}
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
 	}
-	rcu_read_unlock();
 
 skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
@@ -2309,12 +2328,11 @@ skip_cache:
 	   route cache entry is created eventually.
 	 */
 	if (ipv4_is_multicast(daddr)) {
-		struct in_device *in_dev;
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
 
-		rcu_read_lock();
-		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
+		if (in_dev) {
 			int our = ip_check_mc(in_dev, daddr, saddr,
 					      ip_hdr(skb)->protocol);
 			if (our
 #ifdef CONFIG_IP_MROUTE
 				||
@@ -2322,16 +2340,20 @@ skip_cache:
 				IN_DEV_MFORWARD(in_dev))
 #endif
 			   ) {
+				int res = ip_route_input_mc(skb, daddr, saddr,
+							    tos, dev, our);
 				rcu_read_unlock();
-				return ip_route_input_mc(skb, daddr, saddr,
-							 tos, dev, our);
+				return res;
 			}
 		}
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	rcu_read_unlock();
+	return res;
 }
+EXPORT_SYMBOL(ip_route_input_common);
 
 static int __mkroute_output(struct rtable **result,
 			    struct fib_result *res,
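ip_route_input_common() now does the whole cache probe under one RCU section and lets the caller choose between taking a real reference on a hit (skb_dst_set) and merely borrowing it for the RCU-protected receive path (skb_dst_set_noref), saving two atomic operations per packet on the hot path. The matching wrappers are not part of this file; presumably they sit in include/net/route.h with roughly this shape:

    /* Assumed wrappers (include/net/route.h); shown for orientation only. */
    static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
                                     u8 tos, struct net_device *devin)
    {
        return ip_route_input_common(skb, dst, src, tos, devin, false);
    }

    static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
                                           u8 tos, struct net_device *devin)
    {
        return ip_route_input_common(skb, dst, src, tos, devin, true);
    }

Note also the match condition: the flow keys are XORed and ORed together so that a cache hit is decided by a single comparison against zero rather than a chain of branches.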
@@ -2391,12 +2413,12 @@ static int __mkroute_output(struct rtable **result,
 		goto cleanup;
 	}
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
-		rth->u.dst.flags |= DST_NOXFRM;
+		rth->dst.flags |= DST_NOXFRM;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 
 	rth->fl.fl4_dst = oldflp->fl4_dst;
 	rth->fl.fl4_tos = tos;
@@ -2408,34 +2430,35 @@ static int __mkroute_output(struct rtable **result,
 	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
 	/* get references to the devices that are to be hold by the routing
 	   cache entry */
-	rth->u.dst.dev = dev_out;
+	rth->dst.dev = dev_out;
 	dev_hold(dev_out);
 	rth->idev = in_dev_get(dev_out);
 	rth->rt_gateway = fl->fl4_dst;
 	rth->rt_spec_dst= fl->fl4_src;
 
-	rth->u.dst.output=ip_output;
+	rth->dst.output=ip_output;
+	rth->dst.obsolete = -1;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
 	if (flags & RTCF_LOCAL) {
-		rth->u.dst.input = ip_local_deliver;
+		rth->dst.input = ip_local_deliver;
 		rth->rt_spec_dst = fl->fl4_dst;
 	}
 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
 		rth->rt_spec_dst = fl->fl4_src;
 		if (flags & RTCF_LOCAL &&
 		    !(dev_out->flags & IFF_LOOPBACK)) {
-			rth->u.dst.output = ip_mc_output;
+			rth->dst.output = ip_mc_output;
 			RT_CACHE_STAT_INC(out_slow_mc);
 		}
 #ifdef CONFIG_IP_MROUTE
 		if (res->type == RTN_MULTICAST) {
 			if (IN_DEV_MFORWARD(in_dev) &&
 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
-				rth->u.dst.input = ip_mr_input;
-				rth->u.dst.output = ip_mc_output;
+				rth->dst.input = ip_mr_input;
+				rth->dst.output = ip_mc_output;
 			}
 		}
 #endif
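A small reading aid for the rt_iif line above: `oldflp->oif ? : dev_out->ifindex` is the GNU C conditional with omitted middle operand, which yields the first operand when it is non-zero (evaluating it only once) and the second otherwise:

    /* Equivalent spelling of the GNU "?:" shorthand, modulo the
     * single-evaluation guarantee on oldflp->oif. */
    rth->rt_iif = oldflp->oif ? oldflp->oif : dev_out->ifindex;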
@@ -2466,7 +2489,7 @@ static int ip_mkroute_output(struct rtable **rp,
 	if (err == 0) {
 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
 			       rt_genid(dev_net(dev_out)));
-		err = rt_intern_hash(hash, rth, rp, NULL);
+		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
 	}
 
 	return err;
@@ -2689,8 +2712,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
 
 	rcu_read_lock_bh();
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-		rth = rcu_dereference(rth->u.dst.rt_next)) {
+	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
+		rth = rcu_dereference_bh(rth->dst.rt_next)) {
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
 		    rth->fl.fl4_src == flp->fl4_src &&
 		    rth->fl.iif == 0 &&
@@ -2698,9 +2721,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			dst_use(&rth->u.dst, jiffies);
+			dst_use(&rth->dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
 			*rp = rth;
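rcu_dereference_bh() is the flavor-matched accessor for a read side entered with rcu_read_lock_bh(); with CONFIG_PROVE_RCU enabled, a plain rcu_dereference() under a BH lock triggers a lockdep-RCU splat. The pairing, reduced to a sketch using the names from this file:

    /* Sketch: BH-flavored RCU lock and accessors must match. */
    rcu_read_lock_bh();
    for (rth = rcu_dereference_bh(rt_hash_table[hash].chain);
         rth;
         rth = rcu_dereference_bh(rth->dst.rt_next)) {
        /* ... compare flow keys ... */
    }
    rcu_read_unlock_bh();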
@@ -2713,9 +2736,13 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 slow_output:
 	return ip_route_output_slow(net, rp, flp);
 }
-
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	return NULL;
+}
+
 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 }
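A ->check() that returns NULL unconditionally means a blackhole entry can never be revalidated: every dst_check() on it fails and pushes the caller back to a full route lookup. Sketched from the caller's side (the re-lookup helper is hypothetical):

    /* Illustrative: NULL from ->check() forces a fresh lookup. */
    struct dst_entry *dst = __sk_dst_check(sk, 0); /* NULL for a blackhole dst */
    if (!dst)
        dst = relookup_route(sk);                  /* hypothetical helper */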
@@ -2724,7 +2751,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.family = AF_INET,
 	.protocol = cpu_to_be16(ETH_P_IP),
 	.destroy = ipv4_dst_destroy,
-	.check = ipv4_dst_check,
+	.check = ipv4_blackhole_dst_check,
 	.update_pmtu = ipv4_rt_blackhole_update_pmtu,
 	.entries = ATOMIC_INIT(0),
 };
@@ -2737,15 +2764,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 		dst_alloc(&ipv4_dst_blackhole_ops);
 
 	if (rt) {
-		struct dst_entry *new = &rt->u.dst;
+		struct dst_entry *new = &rt->dst;
 
 		atomic_set(&new->__refcnt, 1);
 		new->__use = 1;
 		new->input = dst_discard;
 		new->output = dst_discard;
-		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
 
-		new->dev = ort->u.dst.dev;
+		new->dev = ort->dst.dev;
 		if (new->dev)
 			dev_hold(new->dev);
 
@@ -2769,7 +2796,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 		dst_free(new);
 	}
 
-	dst_release(&(*rp)->u.dst);
+	dst_release(&(*rp)->dst);
 	*rp = rt;
 	return (rt ? 0 : -ENOMEM);
 }
@@ -2797,13 +2824,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
 
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
 {
 	return ip_route_output_flow(net, rp, flp, NULL, 0);
 }
+EXPORT_SYMBOL(ip_route_output_key);
 
 static int rt_fill_info(struct net *net,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2839,11 +2866,11 @@ static int rt_fill_info(struct net *net,
 		r->rtm_src_len = 32;
 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
 	}
-	if (rt->u.dst.dev)
-		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
+	if (rt->dst.dev)
+		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
 #ifdef CONFIG_NET_CLS_ROUTE
-	if (rt->u.dst.tclassid)
-		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
+	if (rt->dst.tclassid)
+		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
 #endif
 	if (rt->fl.iif)
 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
@@ -2853,12 +2880,16 @@ static int rt_fill_info(struct net *net,
 	if (rt->rt_dst != rt->rt_gateway)
 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
 
-	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
 		goto nla_put_failure;
 
-	error = rt->u.dst.error;
-	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
+	if (rt->fl.mark)
+		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+
+	error = rt->dst.error;
+	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
 	if (rt->peer) {
+		inet_peer_refcheck(rt->peer);
 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
 		if (rt->peer->tcp_ts_stamp) {
 			ts = rt->peer->tcp_ts;
@@ -2889,7 +2920,7 @@ static int rt_fill_info(struct net *net,
 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
 			       expires, error) < 0)
 		goto nla_put_failure;
 
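The RTA_MARK line needs no explicit error handling because the NLA_PUT_* macros of this era hide a goto to the function's nla_put_failure label when the skb runs out of tailroom. Roughly, simplified from include/net/netlink.h:

    /* Simplified shape of the macro family; illustrative, not verbatim. */
    #define NLA_PUT(skb, attrtype, attrlen, data) \
        do { \
            if (unlikely(nla_put(skb, attrtype, attrlen, data) < 0)) \
                goto nla_put_failure; \
        } while (0)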
@@ -2910,6 +2941,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	__be32 src = 0;
 	u32 iif;
 	int err;
+	int mark;
 	struct sk_buff *skb;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
@@ -2937,6 +2969,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
 	if (iif) {
 		struct net_device *dev;
@@ -2949,13 +2982,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 
 		skb->protocol = htons(ETH_P_IP);
 		skb->dev = dev;
+		skb->mark = mark;
 		local_bh_disable();
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
 
 		rt = skb_rtable(skb);
-		if (err == 0 && rt->u.dst.error)
-			err = -rt->u.dst.error;
+		if (err == 0 && rt->dst.error)
+			err = -rt->dst.error;
 	} else {
 		struct flowi fl = {
 			.nl_u = {
@@ -2966,6 +3000,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 				},
 			},
 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+			.mark = mark,
 		};
 		err = ip_route_output_key(net, &rt, &fl);
 	}
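With the mark copied into the input skb and into the output flowi, RTM_GETROUTE requests now exercise fwmark-based policy routing in both directions. The output-side key, reduced to the fields this hunk touches:

    /* Sketch of the output lookup keyed by fwmark, mirroring the code above. */
    struct flowi fl = {
        .nl_u = { .ip4_u = { .daddr = dst, .saddr = src, .tos = rtm->rtm_tos } },
        .oif  = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
        .mark = mark,
    };
    err = ip_route_output_key(net, &rt, &fl);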
@@ -2973,7 +3008,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (err)
 		goto errout_free;
 
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
@@ -3008,13 +3043,13 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (!rt_hash_table[h].chain)
 			continue;
 		rcu_read_lock_bh();
-		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
+		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
+		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
+			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb_dst_set(skb, dst_clone(&rt->u.dst));
+			skb_dst_set_noref(skb, &rt->dst);
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
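skb_dst_set_noref() stores the entry without touching its refcount, which is only safe here because the dump consumes and drops the dst before leaving the rcu_read_lock_bh() section. The lifetime rule, as a sketch:

    /* Sketch: a noref dst must not outlive the RCU-BH section that found it. */
    rcu_read_lock_bh();
    skb_dst_set_noref(skb, &rt->dst);   /* borrowed pointer, no refcount taken */
    /* ... rt_fill_info() uses the borrowed dst here ... */
    skb_dst_drop(skb);                  /* drop before unlocking */
    rcu_read_unlock_bh();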
@@ -3060,50 +3095,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 	return -EINVAL;
 }
 
-static void rt_secret_reschedule(int old)
-{
-	struct net *net;
-	int new = ip_rt_secret_interval;
-	int diff = new - old;
-
-	if (!diff)
-		return;
-
-	rtnl_lock();
-	for_each_net(net) {
-		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
-
-		if (!new)
-			continue;
-
-		if (deleted) {
-			long time = net->ipv4.rt_secret_timer.expires - jiffies;
-
-			if (time <= 0 || (time += diff) <= 0)
-				time = 0;
-
-			net->ipv4.rt_secret_timer.expires = time;
-		} else
-			net->ipv4.rt_secret_timer.expires = new;
-
-		net->ipv4.rt_secret_timer.expires += jiffies;
-		add_timer(&net->ipv4.rt_secret_timer);
-	}
-	rtnl_unlock();
-}
-
-static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
-					  void __user *buffer, size_t *lenp,
-					  loff_t *ppos)
-{
-	int old = ip_rt_secret_interval;
-	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
-
-	rt_secret_reschedule(old);
-
-	return ret;
-}
-
 static ctl_table ipv4_route_table[] = {
 	{
 		.procname = "gc_thresh",
@@ -3212,13 +3203,6 @@ static ctl_table ipv4_route_table[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec,
 	},
-	{
-		.procname = "secret_interval",
-		.data = &ip_rt_secret_interval,
-		.maxlen = sizeof(int),
-		.mode = 0644,
-		.proc_handler = ipv4_sysctl_rt_secret_interval,
-	},
 	{ }
 };
 
@@ -3297,39 +3281,20 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 };
 #endif
 
-
-static __net_init int rt_secret_timer_init(struct net *net)
+static __net_init int rt_genid_init(struct net *net)
 {
-	atomic_set(&net->ipv4.rt_genid,
-		   (int) ((num_physpages ^ (num_physpages>>8)) ^
-		   (jiffies ^ (jiffies >> 7))));
-
-	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
-	net->ipv4.rt_secret_timer.data = (unsigned long)net;
-	init_timer_deferrable(&net->ipv4.rt_secret_timer);
-
-	if (ip_rt_secret_interval) {
-		net->ipv4.rt_secret_timer.expires =
-			jiffies + net_random() % ip_rt_secret_interval +
-			ip_rt_secret_interval;
-		add_timer(&net->ipv4.rt_secret_timer);
-	}
+	get_random_bytes(&net->ipv4.rt_genid,
+			 sizeof(net->ipv4.rt_genid));
 	return 0;
 }
 
-static __net_exit void rt_secret_timer_exit(struct net *net)
-{
-	del_timer_sync(&net->ipv4.rt_secret_timer);
-}
-
-static __net_initdata struct pernet_operations rt_secret_timer_ops = {
-	.init = rt_secret_timer_init,
-	.exit = rt_secret_timer_exit,
+static __net_initdata struct pernet_operations rt_genid_ops = {
+	.init = rt_genid_init,
 };
 
 
 #ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct __read_mostly;
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_NET_CLS_ROUTE */
 
 static __initdata unsigned long rhash_entries;
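With the periodic secret timer gone, invalidation rests entirely on the generation counter: each namespace seeds rt_genid with random bytes, every cached rtable records the genid it was created under, and lookups discard mismatches. The expiry test lives earlier in this file; from memory it has roughly this shape:

    /* Sketch of the generation test (rt_is_expired() earlier in route.c). */
    static inline int rt_is_expired(struct rtable *rth)
    {
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
    }

Flushing the cache then only requires perturbing the per-namespace counter so that every existing entry fails this comparison; no hash rebuild and no timer bookkeeping.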
@@ -3385,9 +3350,6 @@ int __init ip_rt_init(void)
 	schedule_delayed_work(&expires_work,
 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
 
-	if (register_pernet_subsys(&rt_secret_timer_ops))
-		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
-
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
@@ -3399,6 +3361,7 @@ int __init ip_rt_init(void)
 #ifdef CONFIG_SYSCTL
 	register_pernet_subsys(&sysctl_route_ops);
 #endif
+	register_pernet_subsys(&rt_genid_ops);
 	return rc;
 }
 
@@ -3412,7 +3375,3 @@ void __init ip_static_sysctl_init(void)
 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
 }
 #endif
-
-EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_input);
-EXPORT_SYMBOL(ip_route_output_key);