author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /net/ipv4/route.c
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c

Diffstat (limited to 'net/ipv4/route.c'):
 net/ipv4/route.c | 1697 +++++++++++++++++++++++++-------------------------
 1 file changed, 822 insertions(+), 875 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ac6559cb54f9..aa13ef105110 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -109,8 +109,8 @@
 #include <linux/sysctl.h>
 #endif
 
-#define RT_FL_TOS(oldflp) \
-	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+#define RT_FL_TOS(oldflp4) \
+	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 
 #define IP_MAX_MTU	0xFFF0
 
@@ -131,42 +131,80 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
 static int rt_chain_length_max __read_mostly	= 20;
 
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
-
 /*
  *	Interface to generic destination cache.
  */
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
+static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
 static void		 ipv4_dst_destroy(struct dst_entry *dst);
-static void		 ipv4_dst_ifdown(struct dst_entry *dst,
-					 struct net_device *dev, int how);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
 
+static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+			    int how)
+{
+}
+
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer;
+	u32 *p = NULL;
+
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+
+	peer = rt->peer;
+	if (peer) {
+		u32 *old_p = __DST_METRICS_PTR(old);
+		unsigned long prev, new;
+
+		p = peer->metrics;
+		if (inet_metrics_new(peer))
+			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+		new = (unsigned long) p;
+		prev = cmpxchg(&dst->_metrics, old, new);
+
+		if (prev != old) {
+			p = __DST_METRICS_PTR(prev);
+			if (prev & DST_METRICS_READ_ONLY)
+				p = NULL;
+		} else {
+			if (rt->fi) {
+				fib_info_put(rt->fi);
+				rt->fi = NULL;
+			}
+		}
+	}
+	return p;
+}
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.gc =			rt_garbage_collect,
 	.check =		ipv4_dst_check,
+	.default_advmss =	ipv4_default_advmss,
+	.default_mtu =		ipv4_default_mtu,
+	.cow_metrics =		ipv4_cow_metrics,
 	.destroy =		ipv4_dst_destroy,
 	.ifdown =		ipv4_dst_ifdown,
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
 	.local_out =		__ip_local_out,
-	.entries =		ATOMIC_INIT(0),
 };
 
 #define ECN_OR_COST(class)	TC_PRIO_##class
 
 const __u8 ip_tos2prio[16] = {
 	TC_PRIO_BESTEFFORT,
-	ECN_OR_COST(FILLER),
+	ECN_OR_COST(BESTEFFORT),
 	TC_PRIO_BESTEFFORT,
 	ECN_OR_COST(BESTEFFORT),
 	TC_PRIO_BULK,
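The ipv4_cow_metrics() helper added above publishes a writable per-peer copy
of the metrics array with a single cmpxchg(), adopting the winner's copy if
another CPU races in first. A minimal userspace sketch of that copy-on-write
pattern follows; every name here is a hypothetical stand-in, and GCC's
__sync_val_compare_and_swap replaces the kernel's cmpxchg():

    #include <stdio.h>
    #include <string.h>

    #define RTAX_MAX 4
    #define METRICS_READ_ONLY 0x1UL  /* assumed tag bit, like dst->_metrics */

    static unsigned int default_metrics[RTAX_MAX] = { 1500, 0, 0, 0 };
    static unsigned long dst_metrics;        /* tagged pointer word */
    static unsigned int peer_metrics[RTAX_MAX];

    static unsigned int *cow_metrics(void)
    {
        unsigned long old = dst_metrics;
        unsigned int *old_p = (unsigned int *)(old & ~METRICS_READ_ONLY);
        unsigned long prev;

        memcpy(peer_metrics, old_p, sizeof(peer_metrics));
        prev = __sync_val_compare_and_swap(&dst_metrics, old,
                                           (unsigned long)peer_metrics);
        if (prev != old)  /* lost the race: adopt the winner's copy */
            return (prev & METRICS_READ_ONLY) ? NULL : (unsigned int *)prev;
        return peer_metrics;  /* won: the writable copy is now published */
    }

    int main(void)
    {
        dst_metrics = (unsigned long)default_metrics | METRICS_READ_ONLY;
        unsigned int *p = cow_metrics();
        if (p)
            p[0] = 1400;  /* safe: no longer the shared read-only default */
        printf("metric[0] = %u\n",
               ((unsigned int *)(dst_metrics & ~METRICS_READ_ONLY))[0]);
        return 0;
    }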
@@ -199,7 +237,7 @@ const __u8 ip_tos2prio[16] = {
  */
 
 struct rt_hash_bucket {
-	struct rtable	*chain;
+	struct rtable __rcu	*chain;
 };
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -281,7 +319,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
 	struct rtable *r = NULL;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rt_hash_table[st->bucket].chain)
+		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 			continue;
 		rcu_read_lock_bh();
 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
@@ -301,17 +339,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 {
 	struct rt_cache_iter_state *st = seq->private;
 
-	r = r->dst.rt_next;
+	r = rcu_dereference_bh(r->dst.rt_next);
 	while (!r) {
 		rcu_read_unlock_bh();
 		do {
 			if (--st->bucket < 0)
 				return NULL;
-		} while (!rt_hash_table[st->bucket].chain);
+		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
+		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 	}
-	return rcu_dereference_bh(r);
+	return r;
 }
 
 static struct rtable *rt_cache_get_next(struct seq_file *seq,
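__rt_cache_get_next() above drops and re-takes the RCU read-side lock while it
skips empty buckets, so a /proc cache dump never pins the lock across the whole
table. The same resumable-iterator shape, modeled in plain C with a mutex
standing in for rcu_read_lock_bh() and simplified stand-in types:

    #include <stddef.h>
    #include <pthread.h>

    struct rt { struct rt *next; };

    #define HASH_SIZE 256
    static struct rt *hash_table[HASH_SIZE];
    static pthread_mutex_t walk_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Advance past 'r'. The lock is held on entry and on a non-NULL
     * return, but is dropped while empty buckets are skipped. */
    static struct rt *cache_get_next(int *bucket, struct rt *r)
    {
        r = r->next;
        while (!r) {
            pthread_mutex_unlock(&walk_lock);
            do {
                if (--*bucket < 0)
                    return NULL;  /* table exhausted, lock already dropped */
            } while (!hash_table[*bucket]);
            pthread_mutex_lock(&walk_lock);
            r = hash_table[*bucket];
        }
        return r;
    }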
@@ -382,12 +420,11 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 		(__force u32)r->rt_gateway,
 		r->rt_flags, atomic_read(&r->dst.__refcnt),
 		r->dst.__use, 0, (__force u32)r->rt_src,
-		(dst_metric(&r->dst, RTAX_ADVMSS) ?
-		     (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
+		dst_metric_advmss(&r->dst) + 40,
 		dst_metric(&r->dst, RTAX_WINDOW),
 		(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 		      dst_metric(&r->dst, RTAX_RTTVAR)),
-		r->fl.fl4_tos,
+		r->rt_key_tos,
 		r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
 		r->dst.hh ? (r->dst.hh->hh_output ==
 			     dev_queue_xmit) : 0,
@@ -466,7 +503,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
-		   atomic_read(&ipv4_dst_ops.entries),
+		   dst_entries_get_slow(&ipv4_dst_ops),
 		   st->in_hit,
 		   st->in_slow_tot,
 		   st->in_slow_mc,
@@ -510,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
 	.release = seq_release,
 };
 
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 static int rt_acct_proc_show(struct seq_file *m, void *v)
 {
 	struct ip_rt_acct *dst, *src;
@@ -563,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
 	if (!pde)
 		goto err2;
 
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 	if (!pde)
 		goto err3;
 #endif
 	return 0;
 
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 err3:
 	remove_proc_entry("rt_cache", net->proc_net_stat);
 #endif
@@ -584,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
 {
 	remove_proc_entry("rt_cache", net->proc_net_stat);
 	remove_proc_entry("rt_cache", net->proc_net);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 	remove_proc_entry("rt_acct", net->proc_net);
 #endif
 }
@@ -622,13 +659,13 @@ static inline int rt_fast_clean(struct rtable *rth)
 	/* Kill broadcast/multicast entries very aggresively, if they
 	   collide in hash table with more useful entries */
 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rth->fl.iif && rth->dst.rt_next;
+		rt_is_input_route(rth) && rth->dst.rt_next;
 }
 
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->dst.expires;
+		(rth->peer && rth->peer->pmtu_expires);
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -639,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
 	if (atomic_read(&rth->dst.__refcnt))
 		goto out;
 
-	ret = 1;
-	if (rth->dst.expires &&
-	    time_after_eq(jiffies, rth->dst.expires))
-		goto out;
-
 	age = jiffies - rth->dst.lastuse;
-	ret = 0;
 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 	    (age <= tmo2 && rt_valuable(rth)))
 		goto out;
@@ -667,7 +698,7 @@ static inline u32 rt_score(struct rtable *rt)
 	if (rt_valuable(rt))
 		score |= (1<<31);
 
-	if (!rt->fl.iif ||
+	if (rt_is_output_route(rt) ||
 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 		score |= (1<<30);
 
@@ -680,22 +711,22 @@ static inline bool rt_caching(const struct net *net)
 		net->ipv4.sysctl_rt_cache_rebuild_count;
 }
 
-static inline bool compare_hash_inputs(const struct flowi *fl1,
-				       const struct flowi *fl2)
+static inline bool compare_hash_inputs(const struct rtable *rt1,
+				       const struct rtable *rt2)
 {
-	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
-		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
-		(fl1->iif ^ fl2->iif)) == 0);
+	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
+		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
+		(rt1->rt_iif ^ rt2->rt_iif)) == 0);
 }
 
-static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
+static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 {
-	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
-		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
-		(fl1->mark ^ fl2->mark) |
-		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
-		(fl1->oif ^ fl2->oif) |
-		(fl1->iif ^ fl2->iif)) == 0;
+	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
+		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
+		(rt1->rt_mark ^ rt2->rt_mark) |
+		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
+		(rt1->rt_oif ^ rt2->rt_oif) |
+		(rt1->rt_iif ^ rt2->rt_iif)) == 0;
 }
 
 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
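compare_keys() and compare_hash_inputs() above fold every field comparison into
one branch-free expression: XOR each pair of fields, OR the results together,
and test once for zero. A self-contained illustration with a hypothetical key
struct (not the kernel's rtable):

    #include <stdio.h>
    #include <stdint.h>

    struct key { uint32_t dst, src, mark; int oif, iif; };

    static int keys_equal(const struct key *a, const struct key *b)
    {
        /* Any differing field makes some XOR non-zero, so the OR of
         * all of them is zero exactly when every field matches. */
        return ((a->dst ^ b->dst) |
                (a->src ^ b->src) |
                (a->mark ^ b->mark) |
                (uint32_t)(a->oif ^ b->oif) |
                (uint32_t)(a->iif ^ b->iif)) == 0;
    }

    int main(void)
    {
        struct key a = { 0x0a000001, 0x0a000002, 0, 2, 0 };
        struct key b = a;
        printf("%d\n", keys_equal(&a, &b));   /* 1: all fields match */
        b.oif = 3;
        printf("%d\n", keys_equal(&a, &b));   /* 0: one field differs */
        return 0;
    }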
@@ -713,55 +744,48 @@ static inline int rt_is_expired(struct rtable *rth)
  * Can be called by a softirq or a process.
  * In the later case, we want to be reschedule if necessary
  */
-static void rt_do_flush(int process_context)
+static void rt_do_flush(struct net *net, int process_context)
 {
 	unsigned int i;
 	struct rtable *rth, *next;
-	struct rtable * tail;
 
 	for (i = 0; i <= rt_hash_mask; i++) {
+		struct rtable __rcu **pprev;
+		struct rtable *list;
+
 		if (process_context && need_resched())
 			cond_resched();
-		rth = rt_hash_table[i].chain;
+		rth = rcu_dereference_raw(rt_hash_table[i].chain);
 		if (!rth)
 			continue;
 
 		spin_lock_bh(rt_hash_lock_addr(i));
-#ifdef CONFIG_NET_NS
-		{
-		struct rtable ** prev, * p;
 
-		rth = rt_hash_table[i].chain;
+		list = NULL;
+		pprev = &rt_hash_table[i].chain;
+		rth = rcu_dereference_protected(*pprev,
+			lockdep_is_held(rt_hash_lock_addr(i)));
 
-		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail; tail = tail->dst.rt_next)
-			if (!rt_is_expired(tail))
-				break;
-		if (rth != tail)
-			rt_hash_table[i].chain = tail;
-
-		/* call rt_free on entries after the tail requiring flush */
-		prev = &rt_hash_table[i].chain;
-		for (p = *prev; p; p = next) {
-			next = p->dst.rt_next;
-			if (!rt_is_expired(p)) {
-				prev = &p->dst.rt_next;
+		while (rth) {
+			next = rcu_dereference_protected(rth->dst.rt_next,
+				lockdep_is_held(rt_hash_lock_addr(i)));
+
+			if (!net ||
+			    net_eq(dev_net(rth->dst.dev), net)) {
+				rcu_assign_pointer(*pprev, next);
+				rcu_assign_pointer(rth->dst.rt_next, list);
+				list = rth;
 			} else {
-				*prev = next;
-				rt_free(p);
+				pprev = &rth->dst.rt_next;
 			}
+			rth = next;
 		}
-		}
-#else
-		rth = rt_hash_table[i].chain;
-		rt_hash_table[i].chain = NULL;
-		tail = NULL;
-#endif
+
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
-		for (; rth != tail; rth = next) {
-			next = rth->dst.rt_next;
-			rt_free(rth);
+		for (; list; list = next) {
+			next = rcu_dereference_protected(list->dst.rt_next, 1);
+			rt_free(list);
 		}
 	}
 }
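The rewritten rt_do_flush() above unlinks matching entries onto a private list
while the bucket lock is held and only frees them after the unlock. The same
unlink-now/free-later shape in plain C, a sketch with a pthread mutex in place
of the per-bucket spinlock; net 0 meaning "flush all" mirrors the NULL-net
case, and all names are stand-ins:

    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>

    struct entry { int net; struct entry *next; };

    static struct entry *chain;
    static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;

    static void flush(int net)
    {
        struct entry **pprev, *e, *next, *list = NULL;

        pthread_mutex_lock(&chain_lock);
        pprev = &chain;
        for (e = chain; e; e = next) {
            next = e->next;
            if (!net || e->net == net) {
                *pprev = next;   /* unlink from the shared chain */
                e->next = list;  /* move onto the private list */
                list = e;
            } else {
                pprev = &e->next;
            }
        }
        pthread_mutex_unlock(&chain_lock);

        for (e = list; e; e = next) {  /* free outside the lock */
            next = e->next;
            free(e);
        }
    }

    int main(void)
    {
        for (int i = 0; i < 4; i++) {
            struct entry *e = malloc(sizeof(*e));
            e->net = (i & 1) + 1;  /* alternate nets 1 and 2 */
            e->next = chain;
            chain = e;
        }
        flush(1);
        for (struct entry *e = chain; e; e = e->next)
            printf("kept net=%d\n", e->net);  /* only net=2 survives */
        return 0;
    }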
@@ -789,104 +813,15 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
 	const struct rtable *aux = head;
 
 	while (aux != rth) {
-		if (compare_hash_inputs(&aux->fl, &rth->fl))
+		if (compare_hash_inputs(aux, rth))
 			return 0;
-		aux = aux->dst.rt_next;
+		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 	}
 	return ONE;
 }
 
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (*rthp == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = *rthp) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-			if (rth->dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					/*
-					 * We only count entries on
-					 * a chain with equal hash inputs once
-					 * so that entries for different QOS
-					 * levels, and other non-hash input
-					 * attributes don't unfairly skew
-					 * the length computation
-					 */
-					length += has_noalias(rt_hash_table[i].chain, rth);
-					continue;
-				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
-				goto nofree;
-
-			/* Cleanup aged off entries. */
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					ip_rt_gc_elasticity,
-					(avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
 /*
- * Pertubation of rt_genid by a small quantity [1..256]
+ * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
  * many times (2^24) without giving recent rt_genid.
  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
@@ -907,13 +842,13 @@ void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
 	if (delay >= 0)
-		rt_do_flush(!in_softirq());
+		rt_do_flush(net, !in_softirq());
 }
 
 /* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(void)
+void rt_cache_flush_batch(struct net *net)
 {
-	rt_do_flush(!in_softirq());
+	rt_do_flush(net, !in_softirq());
 }
 
 static void rt_emergency_hash_rebuild(struct net *net)
@@ -942,9 +877,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	static unsigned long last_gc;
 	static int rover;
 	static int equilibrium;
-	struct rtable *rth, **rthp;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
 	unsigned long now = jiffies;
 	int goal;
+	int entries = dst_entries_get_fast(&ipv4_dst_ops);
 
 	/*
 	 * Garbage collection is pretty expensive,
@@ -954,28 +891,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	RT_CACHE_STAT_INC(gc_total);
 
 	if (now - last_gc < ip_rt_gc_min_interval &&
-	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+	    entries < ip_rt_max_size) {
 		RT_CACHE_STAT_INC(gc_ignored);
 		goto out;
 	}
 
+	entries = dst_entries_get_slow(&ipv4_dst_ops);
 	/* Calculate number of entries, which we want to expire now. */
-	goal = atomic_read(&ipv4_dst_ops.entries) -
-		(ip_rt_gc_elasticity << rt_hash_log);
+	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 	if (goal <= 0) {
 		if (equilibrium < ipv4_dst_ops.gc_thresh)
 			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+		goal = entries - equilibrium;
 		if (goal > 0) {
 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+			goal = entries - equilibrium;
 		}
 	} else {
 		/* We are in dangerous area. Try to reduce cache really
 		 * aggressively.
 		 */
 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+		equilibrium = entries - goal;
 	}
 
 	if (now - last_gc >= ip_rt_gc_min_interval)
@@ -995,7 +932,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
 			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = *rthp) != NULL) {
+			while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 				if (!rt_is_expired(rth) &&
 				    !rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -1030,16 +968,14 @@ static int rt_garbage_collect(struct dst_ops *ops)
 			break;
 
 		expire >>= 1;
-#if RT_CACHE_DEBUG >= 2
-		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
-				atomic_read(&ipv4_dst_ops.entries), goal, i);
-#endif
 
-		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 			goto out;
 	} while (!in_softirq() && time_before_eq(jiffies, now));
 
-	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+		goto out;
+	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 		goto out;
 	if (net_ratelimit())
 		printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,12 +985,9 @@ static int rt_garbage_collect(struct dst_ops *ops)
 work_done:
 	expire += ip_rt_gc_min_interval;
 	if (expire > ip_rt_gc_timeout ||
-	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 		expire = ip_rt_gc_timeout;
-#if RT_CACHE_DEBUG >= 2
-	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
-			atomic_read(&ipv4_dst_ops.entries), goal, rover);
-#endif
 out:	return 0;
 }
 
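The garbage collector's first step in the hunks above is pure arithmetic:
goal = entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. keep roughly
ip_rt_gc_elasticity entries per hash bucket and expire the surplus. A worked
example with assumed values (real values vary per system and sysctl settings):

    #include <stdio.h>

    int main(void)
    {
        int entries = 70000;          /* assumed current cache size */
        int rt_hash_log = 13;         /* 2^13 = 8192 buckets */
        int ip_rt_gc_elasticity = 8;  /* target entries per bucket */

        /* Anything above ~8 entries per bucket is eligible to expire. */
        int goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
        printf("expire goal = %d\n", goal);  /* 70000 - 65536 = 4464 */
        return 0;
    }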
@@ -1068,17 +1001,17 @@ static int slow_chain_length(const struct rtable *head)
 
 	while (rth) {
 		length += has_noalias(head, rth);
-		rth = rth->dst.rt_next;
+		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
 	}
 	return length >> FRACT_BITS;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt,
-			  struct rtable **rp, struct sk_buff *skb, int ifindex)
+static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
+				     struct sk_buff *skb, int ifindex)
 {
-	struct rtable	*rth, **rthp;
+	struct rtable	*rth, *cand;
+	struct rtable __rcu **rthp, **candp;
 	unsigned long	now;
-	struct rtable *cand, **candp;
 	u32 		min_score;
 	int chain_length;
 	int attempts = !in_softirq();
@@ -1102,36 +1035,37 @@ restart:
 	 * Note that we do rt_free on this new route entry, so that
 	 * once its refcount hits zero, we are still able to reap it
 	 * (Thanks Alexey)
-	 * Note also the rt_free uses call_rcu. We don't actually
-	 * need rcu protection here, this is just our path to get
-	 * on the route gc list.
+	 * Note: To avoid expensive rcu stuff for this uncached dst,
+	 * we set DST_NOCACHE so that dst_release() can free dst without
+	 * waiting a grace period.
 	 */
 
-		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
+		rt->dst.flags |= DST_NOCACHE;
+		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
 			int err = arp_bind_neighbour(&rt->dst);
 			if (err) {
 				if (net_ratelimit())
 					printk(KERN_WARNING
 					    "Neighbour table failure & not caching routes.\n");
-				rt_drop(rt);
-				return err;
+				ip_rt_put(rt);
+				return ERR_PTR(err);
 			}
 		}
 
-		rt_free(rt);
 		goto skip_hashing;
 	}
 
 	rthp = &rt_hash_table[hash].chain;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = *rthp) != NULL) {
+	while ((rth = rcu_dereference_protected(*rthp,
+			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
 		if (rt_is_expired(rth)) {
 			*rthp = rth->dst.rt_next;
 			rt_free(rth);
 			continue;
 		}
-		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
+		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
 			/* Put it first */
 			*rthp = rth->dst.rt_next;
 			/*
@@ -1151,11 +1085,9 @@ restart:
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
-			if (rp)
-				*rp = rth;
-			else
+			if (skb)
 				skb_dst_set(skb, &rth->dst);
-			return 0;
+			return rth;
 		}
 
 		if (!atomic_read(&rth->dst.__refcnt)) {
@@ -1196,7 +1128,7 @@ restart:
 			rt_emergency_hash_rebuild(net);
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
-			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
 					ifindex, rt_genid(net));
 			goto restart;
 		}
@@ -1205,14 +1137,14 @@ restart:
 	/* Try to bind route to arp only if it is output
 	   route or unicast forwarding path.
 	 */
-	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
+	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
 		int err = arp_bind_neighbour(&rt->dst);
 		if (err) {
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
-				return err;
+				return ERR_PTR(err);
 			}
 
 			/* Neighbour tables are full and nothing
@@ -1233,25 +1165,15 @@ restart:
 			if (net_ratelimit())
 				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
 			rt_drop(rt);
-			return -ENOBUFS;
+			return ERR_PTR(-ENOBUFS);
 		}
 	}
 
 	rt->dst.rt_next = rt_hash_table[hash].chain;
 
-#if RT_CACHE_DEBUG >= 2
-	if (rt->dst.rt_next) {
-		struct rtable *trt;
-		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
-		       hash, &rt->rt_dst);
-		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
-			printk(" . %pI4", &trt->rt_dst);
-		printk("\n");
-	}
-#endif
 	/*
 	 * Since lookup is lockfree, we must make sure
-	 * previous writes to rt are comitted to memory
+	 * previous writes to rt are committed to memory
 	 * before making rt visible to other CPUS.
 	 */
 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
@@ -1259,28 +1181,28 @@ restart:
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 
 skip_hashing:
-	if (rp)
-		*rp = rt;
-	else
+	if (skb)
 		skb_dst_set(skb, &rt->dst);
-	return 0;
+	return rt;
+}
+
+static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
+
+static u32 rt_peer_genid(void)
+{
+	return atomic_read(&__rt_peer_genid);
 }
 
-void rt_bind_peer(struct rtable *rt, int create)
+void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
 {
-	static DEFINE_SPINLOCK(rt_peer_lock);
 	struct inet_peer *peer;
 
-	peer = inet_getpeer(rt->rt_dst, create);
+	peer = inet_getpeer_v4(daddr, create);
 
-	spin_lock_bh(&rt_peer_lock);
-	if (rt->peer == NULL) {
-		rt->peer = peer;
-		peer = NULL;
-	}
-	spin_unlock_bh(&rt_peer_lock);
-	if (peer)
+	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
+	else
+		rt->rt_peer_genid = rt_peer_genid();
 }
 
 /*
@@ -1309,7 +1231,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 
 	if (rt) {
 		if (rt->peer == NULL)
-			rt_bind_peer(rt, 1);
+			rt_bind_peer(rt, rt->rt_dst, 1);
 
 		/* If peer is attached to destination, it is never detached,
 		   so that we need not to grab a lock to dereference it.
@@ -1328,12 +1250,14 @@ EXPORT_SYMBOL(__ip_select_ident);
 
 static void rt_del(unsigned hash, struct rtable *rt)
 {
-	struct rtable **rthp, *aux;
+	struct rtable __rcu **rthp;
+	struct rtable *aux;
 
 	rthp = &rt_hash_table[hash].chain;
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
-	while ((aux = *rthp) != NULL) {
+	while ((aux = rcu_dereference_protected(*rthp,
+			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
 		if (aux == rt || rt_is_expired(aux)) {
 			*rthp = aux->dst.rt_next;
 			rt_free(aux);
@@ -1348,12 +1272,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
 {
-	int i, k;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	struct rtable *rth, **rthp;
-	__be32  skeys[2] = { saddr, 0 };
-	int  ikeys[2] = { dev->ifindex, 0 };
-	struct netevent_redirect netevent;
+	struct inet_peer *peer;
 	struct net *net;
 
 	if (!in_dev)
@@ -1365,9 +1285,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 	    ipv4_is_zeronet(new_gw))
 		goto reject_redirect;
 
-	if (!rt_caching(net))
-		goto reject_redirect;
-
 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 			goto reject_redirect;
@@ -1378,93 +1295,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		goto reject_redirect;
 	}
 
-	for (i = 0; i < 2; i++) {
-		for (k = 0; k < 2; k++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
-						rt_genid(net));
-
-			rthp=&rt_hash_table[hash].chain;
-
-			while ((rth = rcu_dereference(*rthp)) != NULL) {
-				struct rtable *rt;
-
-				if (rth->fl.fl4_dst != daddr ||
-				    rth->fl.fl4_src != skeys[i] ||
-				    rth->fl.oif != ikeys[k] ||
-				    rth->fl.iif != 0 ||
-				    rt_is_expired(rth) ||
-				    !net_eq(dev_net(rth->dst.dev), net)) {
-					rthp = &rth->dst.rt_next;
-					continue;
-				}
-
-				if (rth->rt_dst != daddr ||
-				    rth->rt_src != saddr ||
-				    rth->dst.error ||
-				    rth->rt_gateway != old_gw ||
-				    rth->dst.dev != dev)
-					break;
-
-				dst_hold(&rth->dst);
-
-				rt = dst_alloc(&ipv4_dst_ops);
-				if (rt == NULL) {
-					ip_rt_put(rth);
-					return;
-				}
-
-				/* Copy all the information. */
-				*rt = *rth;
-				rt->dst.__use = 1;
-				atomic_set(&rt->dst.__refcnt, 1);
-				rt->dst.child = NULL;
-				if (rt->dst.dev)
-					dev_hold(rt->dst.dev);
-				if (rt->idev)
-					in_dev_hold(rt->idev);
-				rt->dst.obsolete = -1;
-				rt->dst.lastuse = jiffies;
-				rt->dst.path = &rt->dst;
-				rt->dst.neighbour = NULL;
-				rt->dst.hh = NULL;
-#ifdef CONFIG_XFRM
-				rt->dst.xfrm = NULL;
-#endif
-				rt->rt_genid = rt_genid(net);
-				rt->rt_flags |= RTCF_REDIRECTED;
-
-				/* Gateway is different ... */
-				rt->rt_gateway = new_gw;
-
-				/* Redirect received -> path was valid */
-				dst_confirm(&rth->dst);
-
-				if (rt->peer)
-					atomic_inc(&rt->peer->refcnt);
-
-				if (arp_bind_neighbour(&rt->dst) ||
-				    !(rt->dst.neighbour->nud_state &
-					    NUD_VALID)) {
-					if (rt->dst.neighbour)
-						neigh_event_send(rt->dst.neighbour, NULL);
-					ip_rt_put(rth);
-					rt_drop(rt);
-					goto do_next;
-				}
-
-				netevent.old = &rth->dst;
-				netevent.new = &rt->dst;
-				call_netevent_notifiers(NETEVENT_REDIRECT,
-							&netevent);
-
-				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
-					ip_rt_put(rt);
-				goto do_next;
-			}
-		do_next:
-			;
-		}
-	}
+	peer = inet_getpeer_v4(daddr, 1);
+	if (peer) {
+		peer->redirect_learned.a4 = new_gw;
+
+		inet_putpeer(peer);
+
+		atomic_inc(&__rt_peer_genid);
+	}
 	return;
 
@@ -1479,6 +1316,23 @@ reject_redirect:
 	;
 }
 
+static bool peer_pmtu_expired(struct inet_peer *peer)
+{
+	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
+
+	return orig &&
+	       time_after_eq(jiffies, orig) &&
+	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
+}
+
+static bool peer_pmtu_cleaned(struct inet_peer *peer)
+{
+	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
+
+	return orig &&
+	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
+}
+
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *)dst;
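peer_pmtu_expired(), added just above, lets exactly one caller claim an expired
PMTU entry: only the thread whose cmpxchg() swaps pmtu_expires to zero gets a
true return and resets the MTU. A userspace model of that claim-once pattern;
the peer struct and 'now' are simplified stand-ins, __sync builtins replace
ACCESS_ONCE/cmpxchg, and jiffies wraparound handling is omitted:

    #include <stdio.h>

    struct peer { unsigned long pmtu_expires; };

    static int pmtu_expired(struct peer *p, unsigned long now)
    {
        /* atomic read of the expiry stamp (stand-in for ACCESS_ONCE) */
        unsigned long orig = __sync_fetch_and_add(&p->pmtu_expires, 0);

        return orig && now >= orig &&
               __sync_val_compare_and_swap(&p->pmtu_expires, orig, 0) == orig;
    }

    int main(void)
    {
        struct peer p = { .pmtu_expires = 100 };
        printf("%d\n", pmtu_expired(&p, 99));   /* 0: not yet due */
        printf("%d\n", pmtu_expired(&p, 200));  /* 1: this caller claims it */
        printf("%d\n", pmtu_expired(&p, 200));  /* 0: already claimed */
        return 0;
    }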
@@ -1488,18 +1342,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
-		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   (rt->dst.expires &&
-			    time_after_eq(jiffies, rt->dst.expires))) {
-			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
-						rt->fl.oif,
+		} else if (rt->rt_flags & RTCF_REDIRECTED) {
+			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
+						rt->rt_oif,
 						rt_genid(dev_net(dst->dev)));
-#if RT_CACHE_DEBUG >= 1
-			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
-				&rt->rt_dst, rt->fl.fl4_tos);
-#endif
 			rt_del(hash, rt);
 			ret = NULL;
+		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
+			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
 		}
 	}
 	return ret;
@@ -1525,6 +1375,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
 	struct in_device *in_dev;
+	struct inet_peer *peer;
 	int log_martians;
 
 	rcu_read_lock();
@@ -1536,36 +1387,44 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 	rcu_read_unlock();
 
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+	peer = rt->peer;
+	if (!peer) {
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		return;
+	}
+
 	/* No redirected packets during ip_rt_redirect_silence;
 	 * reset the algorithm.
 	 */
-	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
-		rt->dst.rate_tokens = 0;
+	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+		peer->rate_tokens = 0;
 
 	/* Too many ignored redirects; do not send anything
 	 * set dst.rate_last to the last seen redirected packet.
 	 */
-	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
-		rt->dst.rate_last = jiffies;
+	if (peer->rate_tokens >= ip_rt_redirect_number) {
+		peer->rate_last = jiffies;
 		return;
 	}
 
 	/* Check for load limit; set rate_last to the latest sent
 	 * redirect.
 	 */
-	if (rt->dst.rate_tokens == 0 ||
+	if (peer->rate_tokens == 0 ||
 	    time_after(jiffies,
-		       (rt->dst.rate_last +
-			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
+		       (peer->rate_last +
+			(ip_rt_redirect_load << peer->rate_tokens)))) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
-		rt->dst.rate_last = jiffies;
-		++rt->dst.rate_tokens;
+		peer->rate_last = jiffies;
+		++peer->rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
 		if (log_martians &&
-		    rt->dst.rate_tokens == ip_rt_redirect_number &&
+		    peer->rate_tokens == ip_rt_redirect_number &&
 		    net_ratelimit())
 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
-				&rt->rt_src, rt->rt_iif,
+				&ip_hdr(skb)->saddr, rt->rt_iif,
 				&rt->rt_dst, &rt->rt_gateway);
 #endif
 	}
@@ -1574,7 +1433,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 static int ip_error(struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
+	struct inet_peer *peer;
 	unsigned long now;
+	bool send;
 	int code;
 
 	switch (rt->dst.error) {
@@ -1594,15 +1455,24 @@ static int ip_error(struct sk_buff *skb)
 		break;
 	}
 
-	now = jiffies;
-	rt->dst.rate_tokens += now - rt->dst.rate_last;
-	if (rt->dst.rate_tokens > ip_rt_error_burst)
-		rt->dst.rate_tokens = ip_rt_error_burst;
-	rt->dst.rate_last = now;
-	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
-		rt->dst.rate_tokens -= ip_rt_error_cost;
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+	peer = rt->peer;
+
+	send = true;
+	if (peer) {
+		now = jiffies;
+		peer->rate_tokens += now - peer->rate_last;
+		if (peer->rate_tokens > ip_rt_error_burst)
+			peer->rate_tokens = ip_rt_error_burst;
+		peer->rate_last = now;
+		if (peer->rate_tokens >= ip_rt_error_cost)
+			peer->rate_tokens -= ip_rt_error_cost;
+		else
+			send = false;
+	}
+	if (send)
 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
-	}
 
 out:	kfree_skb(skb);
 	return 0;
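The limiter that ip_error() now keeps per peer is a classic token bucket:
tokens accumulate one per jiffy up to a burst cap, and each ICMP error costs a
fixed amount. A standalone model; the constants mimic the shape of the
ip_rt_error_burst/ip_rt_error_cost sysctls but are not their real values:

    #include <stdio.h>

    #define ERROR_BURST 500   /* assumed cap on accumulated tokens */
    #define ERROR_COST  100   /* assumed cost of sending one error */

    struct peer { unsigned long rate_tokens, rate_last; };

    static int allow_send(struct peer *p, unsigned long now)
    {
        p->rate_tokens += now - p->rate_last;  /* refill since last visit */
        if (p->rate_tokens > ERROR_BURST)
            p->rate_tokens = ERROR_BURST;      /* clamp to the burst cap */
        p->rate_last = now;
        if (p->rate_tokens >= ERROR_COST) {
            p->rate_tokens -= ERROR_COST;      /* pay for this error */
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct peer p = { 0, 0 };
        for (unsigned long t = 0; t <= 240; t += 60)
            printf("t=%lu send=%d\n", t, allow_send(&p, t));
        return 0;  /* sends are spaced out once tokens run low */
    }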
@@ -1626,88 +1496,148 @@ static inline unsigned short guess_mtu(unsigned short old_mtu)
 		return 68;
 }
 
-unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
+unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
 				 unsigned short new_mtu,
 				 struct net_device *dev)
 {
-	int i, k;
 	unsigned short old_mtu = ntohs(iph->tot_len);
-	struct rtable *rth;
-	int  ikeys[2] = { dev->ifindex, 0 };
-	__be32  skeys[2] = { iph->saddr, 0, };
-	__be32  daddr = iph->daddr;
 	unsigned short est_mtu = 0;
+	struct inet_peer *peer;
 
-	for (k = 0; k < 2; k++) {
-		for (i = 0; i < 2; i++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
-						rt_genid(net));
-
-			rcu_read_lock();
-			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-			     rth = rcu_dereference(rth->dst.rt_next)) {
-				unsigned short mtu = new_mtu;
-
-				if (rth->fl.fl4_dst != daddr ||
-				    rth->fl.fl4_src != skeys[i] ||
-				    rth->rt_dst != daddr ||
-				    rth->rt_src != iph->saddr ||
-				    rth->fl.oif != ikeys[k] ||
-				    rth->fl.iif != 0 ||
-				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
-				    !net_eq(dev_net(rth->dst.dev), net) ||
-				    rt_is_expired(rth))
-					continue;
-
-				if (new_mtu < 68 || new_mtu >= old_mtu) {
-
-					/* BSD 4.2 compatibility hack :-( */
-					if (mtu == 0 &&
-					    old_mtu >= dst_mtu(&rth->dst) &&
-					    old_mtu >= 68 + (iph->ihl << 2))
-						old_mtu -= iph->ihl << 2;
-
-					mtu = guess_mtu(old_mtu);
-				}
-				if (mtu <= dst_mtu(&rth->dst)) {
-					if (mtu < dst_mtu(&rth->dst)) {
-						dst_confirm(&rth->dst);
-						if (mtu < ip_rt_min_pmtu) {
-							mtu = ip_rt_min_pmtu;
-							rth->dst.metrics[RTAX_LOCK-1] |=
-								(1 << RTAX_MTU);
-						}
-						rth->dst.metrics[RTAX_MTU-1] = mtu;
-						dst_set_expires(&rth->dst,
-							ip_rt_mtu_expires);
-					}
-					est_mtu = mtu;
-				}
-			}
-			rcu_read_unlock();
-		}
+	peer = inet_getpeer_v4(iph->daddr, 1);
+	if (peer) {
+		unsigned short mtu = new_mtu;
+
+		if (new_mtu < 68 || new_mtu >= old_mtu) {
+			/* BSD 4.2 derived systems incorrectly adjust
+			 * tot_len by the IP header length, and report
+			 * a zero MTU in the ICMP message.
+			 */
+			if (mtu == 0 &&
+			    old_mtu >= 68 + (iph->ihl << 2))
+				old_mtu -= iph->ihl << 2;
+			mtu = guess_mtu(old_mtu);
+		}
+
+		if (mtu < ip_rt_min_pmtu)
+			mtu = ip_rt_min_pmtu;
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			unsigned long pmtu_expires;
+
+			pmtu_expires = jiffies + ip_rt_mtu_expires;
+			if (!pmtu_expires)
+				pmtu_expires = 1UL;
+
+			est_mtu = mtu;
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = pmtu_expires;
+		}
+
+		inet_putpeer(peer);
+
+		atomic_inc(&__rt_peer_genid);
 	}
 	return est_mtu ? : new_mtu;
 }
 
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
+
+	if (!expires)
+		return;
+	if (time_before(jiffies, expires)) {
+		u32 orig_dst_mtu = dst_mtu(dst);
+		if (peer->pmtu_learned < orig_dst_mtu) {
+			if (!peer->pmtu_orig)
+				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+		}
+	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-	if (dst_mtu(dst) > mtu && mtu >= 68 &&
-	    !(dst_metric_locked(dst, RTAX_MTU))) {
-		if (mtu < ip_rt_min_pmtu) {
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer;
+
+	dst_confirm(dst);
+
+	if (!rt->peer)
+		rt_bind_peer(rt, rt->rt_dst, 1);
+	peer = rt->peer;
+	if (peer) {
+		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
+
+		if (mtu < ip_rt_min_pmtu)
 			mtu = ip_rt_min_pmtu;
-			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
+		if (!pmtu_expires || mtu < peer->pmtu_learned) {
+
+			pmtu_expires = jiffies + ip_rt_mtu_expires;
+			if (!pmtu_expires)
+				pmtu_expires = 1UL;
+
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = pmtu_expires;
+
+			atomic_inc(&__rt_peer_genid);
+			rt->rt_peer_genid = rt_peer_genid();
 		}
-		dst->metrics[RTAX_MTU-1] = mtu;
-		dst_set_expires(dst, ip_rt_mtu_expires);
-		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
+		check_peer_pmtu(dst, peer);
+	}
+}
+
+static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	__be32 orig_gw = rt->rt_gateway;
+
+	dst_confirm(&rt->dst);
+
+	neigh_release(rt->dst.neighbour);
+	rt->dst.neighbour = NULL;
+
+	rt->rt_gateway = peer->redirect_learned.a4;
+	if (arp_bind_neighbour(&rt->dst) ||
+	    !(rt->dst.neighbour->nud_state & NUD_VALID)) {
+		if (rt->dst.neighbour)
+			neigh_event_send(rt->dst.neighbour, NULL);
+		rt->rt_gateway = orig_gw;
+		return -EAGAIN;
+	} else {
+		rt->rt_flags |= RTCF_REDIRECTED;
+		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
+					rt->dst.neighbour);
 	}
+	return 0;
 }
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
-	if (rt_is_expired((struct rtable *)dst))
+	struct rtable *rt = (struct rtable *) dst;
+
+	if (rt_is_expired(rt))
 		return NULL;
+	if (rt->rt_peer_genid != rt_peer_genid()) {
+		struct inet_peer *peer;
+
+		if (!rt->peer)
+			rt_bind_peer(rt, rt->rt_dst, 0);
+
+		peer = rt->peer;
+		if (peer) {
+			check_peer_pmtu(dst, peer);
+
+			if (peer->redirect_learned.a4 &&
+			    peer->redirect_learned.a4 != rt->rt_gateway) {
+				if (check_peer_redir(dst, peer))
+					return NULL;
+			}
+		}
+
+		rt->rt_peer_genid = rt_peer_genid();
+	}
 	return dst;
 }
 
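The new ipv4_dst_check() validates cached routes with a generation counter:
each route remembers the peer genid it last synced against, and a mismatch
with the global counter (bumped on every learned redirect or PMTU) triggers a
re-sync. A simplified standalone model of that pattern, with hypothetical
names and no real peer state:

    #include <stdio.h>

    static int peer_genid;  /* bumped whenever redirect/PMTU info changes */

    struct rt { int rt_peer_genid; };

    static int dst_check(struct rt *rt)
    {
        if (rt->rt_peer_genid != peer_genid) {
            /* ...re-pull PMTU/redirect state from the peer here... */
            rt->rt_peer_genid = peer_genid;
        }
        return 1;  /* entry still usable */
    }

    int main(void)
    {
        struct rt rt = { .rt_peer_genid = 0 };
        __sync_fetch_and_add(&peer_genid, 1);  /* e.g. a redirect arrived */
        dst_check(&rt);
        printf("resynced to genid %d\n", rt.rt_peer_genid);
        return 0;
    }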
@@ -1715,33 +1645,17 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *) dst;
 	struct inet_peer *peer = rt->peer;
-	struct in_device *idev = rt->idev;
 
+	if (rt->fi) {
+		fib_info_put(rt->fi);
+		rt->fi = NULL;
+	}
 	if (peer) {
 		rt->peer = NULL;
 		inet_putpeer(peer);
 	}
-
-	if (idev) {
-		rt->idev = NULL;
-		in_dev_put(idev);
-	}
 }
 
-static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
-			    int how)
-{
-	struct rtable *rt = (struct rtable *) dst;
-	struct in_device *idev = rt->idev;
-	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
-		struct in_device *loopback_idev =
-			in_dev_get(dev_net(dev)->loopback_dev);
-		if (loopback_idev) {
-			rt->idev = loopback_idev;
-			in_dev_put(idev);
-		}
-	}
-}
 
 static void ipv4_link_failure(struct sk_buff *skb)
 {
@@ -1750,8 +1664,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
 	rt = skb_rtable(skb);
-	if (rt)
-		dst_set_expires(&rt->dst, 0);
+	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
+		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1760,6 +1674,7 @@ static int ip_rt_bug(struct sk_buff *skb)
 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
 		skb->dev ? skb->dev->name : "?");
 	kfree_skb(skb);
+	WARN_ON(1);
 	return 0;
 }
 
@@ -1772,23 +1687,39 @@ static int ip_rt_bug(struct sk_buff *skb)
    in IP options!
  */
 
-void ip_rt_get_source(u8 *addr, struct rtable *rt)
+void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
 {
 	__be32 src;
-	struct fib_result res;
 
-	if (rt->fl.iif == 0)
-		src = rt->rt_src;
-	else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
-		src = FIB_RES_PREFSRC(res);
-		fib_res_put(&res);
-	} else
-		src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
+	if (rt_is_output_route(rt))
+		src = ip_hdr(skb)->saddr;
+	else {
+		struct fib_result res;
+		struct flowi4 fl4;
+		struct iphdr *iph;
+
+		iph = ip_hdr(skb);
+
+		memset(&fl4, 0, sizeof(fl4));
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+		fl4.flowi4_tos = iph->tos;
+		fl4.flowi4_oif = rt->dst.dev->ifindex;
+		fl4.flowi4_iif = skb->dev->ifindex;
+		fl4.flowi4_mark = skb->mark;
+
+		rcu_read_lock();
+		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
+		else
+			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
 					RT_SCOPE_UNIVERSE);
+		rcu_read_unlock();
+	}
 	memcpy(addr, &src, 4);
 }
 
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
 static void set_class_tag(struct rtable *rt, u32 tag)
 {
 	if (!(rt->dst.tclassid & 0xFFFF))
@@ -1798,46 +1729,107 @@ static void set_class_tag(struct rtable *rt, u32 tag)
1798} 1729}
1799#endif 1730#endif
1800 1731
1801static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1732static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1802{ 1733{
1803 struct fib_info *fi = res->fi; 1734 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1735
1736 if (advmss == 0) {
1737 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1738 ip_rt_min_advmss);
1739 if (advmss > 65535 - 40)
1740 advmss = 65535 - 40;
1741 }
1742 return advmss;
1743}
1744
1745static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1746{
1747 unsigned int mtu = dst->dev->mtu;
1748
1749 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1750 const struct rtable *rt = (const struct rtable *) dst;
1751
1752 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1753 mtu = 576;
1754 }
1755
1756 if (mtu > IP_MAX_MTU)
1757 mtu = IP_MAX_MTU;
1758
1759 return mtu;
1760}
1761
1762static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1763 struct fib_info *fi)
1764{
1765 struct inet_peer *peer;
1766 int create = 0;
1767
1768 /* If a peer entry exists for this destination, we must hook
1769 * it up in order to get at cached metrics.
1770 */
1771 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1772 create = 1;
1773
1774 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1775 if (peer) {
1776 rt->rt_peer_genid = rt_peer_genid();
1777 if (inet_metrics_new(peer))
1778 memcpy(peer->metrics, fi->fib_metrics,
1779 sizeof(u32) * RTAX_MAX);
1780 dst_init_metrics(&rt->dst, peer->metrics, false);
1781
1782 check_peer_pmtu(&rt->dst, peer);
1783 if (peer->redirect_learned.a4 &&
1784 peer->redirect_learned.a4 != rt->rt_gateway) {
1785 rt->rt_gateway = peer->redirect_learned.a4;
1786 rt->rt_flags |= RTCF_REDIRECTED;
1787 }
1788 } else {
1789 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1790 rt->fi = fi;
1791 atomic_inc(&fi->fib_clntref);
1792 }
1793 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1794 }
1795}
1796
1797static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1798 const struct fib_result *res,
1799 struct fib_info *fi, u16 type, u32 itag)
1800{
1801 struct dst_entry *dst = &rt->dst;
1804 1802
1805 if (fi) { 1803 if (fi) {
1806 if (FIB_RES_GW(*res) && 1804 if (FIB_RES_GW(*res) &&
1807 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1805 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1808 rt->rt_gateway = FIB_RES_GW(*res); 1806 rt->rt_gateway = FIB_RES_GW(*res);
1809 memcpy(rt->dst.metrics, fi->fib_metrics, 1807 rt_init_metrics(rt, fl4, fi);
1810 sizeof(rt->dst.metrics)); 1808#ifdef CONFIG_IP_ROUTE_CLASSID
1811 if (fi->fib_mtu == 0) { 1809 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1812 rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
1813 if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
1814 rt->rt_gateway != rt->rt_dst &&
1815 rt->dst.dev->mtu > 576)
1816 rt->dst.metrics[RTAX_MTU-1] = 576;
1817 }
1818#ifdef CONFIG_NET_CLS_ROUTE
1819 rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1820#endif 1810#endif
1821 } else 1811 }
1822 rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu; 1812
1823 1813 if (dst_mtu(dst) > IP_MAX_MTU)
1824 if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0) 1814 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1825 rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1815 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1826 if (dst_mtu(&rt->dst) > IP_MAX_MTU) 1816 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1827 rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; 1817
1828 if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0) 1818#ifdef CONFIG_IP_ROUTE_CLASSID
1829 rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
1830 ip_rt_min_advmss);
1831 if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
1832 rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1833
1834#ifdef CONFIG_NET_CLS_ROUTE
1835#ifdef CONFIG_IP_MULTIPLE_TABLES 1819#ifdef CONFIG_IP_MULTIPLE_TABLES
1836 set_class_tag(rt, fib_rules_tclass(res)); 1820 set_class_tag(rt, fib_rules_tclass(res));
1837#endif 1821#endif
1838 set_class_tag(rt, itag); 1822 set_class_tag(rt, itag);
1839#endif 1823#endif
1840 rt->rt_type = res->type; 1824}
1825
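
Editorial note: rt_set_nexthop layers routing-realm tags: the fib-rules realm first, then the input itag. set_class_tag's body is mostly elided by the hunk above; the sketch below assumes the conventional behaviour of filling only the 16-bit halves of tclassid that are still zero, so earlier tags win:

#include <stdio.h>
#include <stdint.h>

/* fill only the still-empty 16-bit halves of the class id */
static void set_class_tag(uint32_t *tclassid, uint32_t tag)
{
	if (!(*tclassid & 0x0000FFFF))
		*tclassid |= tag & 0x0000FFFF;
	if (!(*tclassid & 0xFFFF0000))
		*tclassid |= tag & 0xFFFF0000;
}

int main(void)
{
	uint32_t tclassid = 0;

	set_class_tag(&tclassid, 0x00020005);	/* e.g. fib-rules realms */
	set_class_tag(&tclassid, 0x00070009);	/* itag: both halves taken */
	printf("%08x\n", (unsigned)tclassid);	/* 00020005 */
	return 0;
}
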
1826static struct rtable *rt_dst_alloc(struct net_device *dev,
1827 bool nopolicy, bool noxfrm)
1828{
1829 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1830 DST_HOST |
1831 (nopolicy ? DST_NOPOLICY : 0) |
1832 (noxfrm ? DST_NOXFRM : 0));
1841} 1833}
1842 1834
1843/* called in rcu_read_lock() section */ 1835/* called in rcu_read_lock() section */
@@ -1865,42 +1857,38 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1865 goto e_inval; 1857 goto e_inval;
1866 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1858 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1867 } else { 1859 } else {
1868 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 1860 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1869 &itag, 0); 1861 &itag);
1870 if (err < 0) 1862 if (err < 0)
1871 goto e_err; 1863 goto e_err;
1872 } 1864 }
1873 rth = dst_alloc(&ipv4_dst_ops); 1865 rth = rt_dst_alloc(init_net.loopback_dev,
1866 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1874 if (!rth) 1867 if (!rth)
1875 goto e_nobufs; 1868 goto e_nobufs;
1876 1869
1870#ifdef CONFIG_IP_ROUTE_CLASSID
1871 rth->dst.tclassid = itag;
1872#endif
1877 rth->dst.output = ip_rt_bug; 1873 rth->dst.output = ip_rt_bug;
1878 rth->dst.obsolete = -1;
1879 1874
1880 atomic_set(&rth->dst.__refcnt, 1); 1875 rth->rt_key_dst = daddr;
1881 rth->dst.flags= DST_HOST; 1876 rth->rt_key_src = saddr;
1882 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 1877 rth->rt_genid = rt_genid(dev_net(dev));
1883 rth->dst.flags |= DST_NOPOLICY; 1878 rth->rt_flags = RTCF_MULTICAST;
1884 rth->fl.fl4_dst = daddr; 1879 rth->rt_type = RTN_MULTICAST;
1880 rth->rt_key_tos = tos;
1885 rth->rt_dst = daddr; 1881 rth->rt_dst = daddr;
1886 rth->fl.fl4_tos = tos;
1887 rth->fl.mark = skb->mark;
1888 rth->fl.fl4_src = saddr;
1889 rth->rt_src = saddr; 1882 rth->rt_src = saddr;
1890#ifdef CONFIG_NET_CLS_ROUTE 1883 rth->rt_route_iif = dev->ifindex;
1891 rth->dst.tclassid = itag; 1884 rth->rt_iif = dev->ifindex;
1892#endif 1885 rth->rt_oif = 0;
1893 rth->rt_iif = 1886 rth->rt_mark = skb->mark;
1894 rth->fl.iif = dev->ifindex;
1895 rth->dst.dev = init_net.loopback_dev;
1896 dev_hold(rth->dst.dev);
1897 rth->idev = in_dev_get(rth->dst.dev);
1898 rth->fl.oif = 0;
1899 rth->rt_gateway = daddr; 1887 rth->rt_gateway = daddr;
1900 rth->rt_spec_dst= spec_dst; 1888 rth->rt_spec_dst= spec_dst;
1901 rth->rt_genid = rt_genid(dev_net(dev)); 1889 rth->rt_peer_genid = 0;
1902 rth->rt_flags = RTCF_MULTICAST; 1890 rth->peer = NULL;
1903 rth->rt_type = RTN_MULTICAST; 1891 rth->fi = NULL;
1904 if (our) { 1892 if (our) {
1905 rth->dst.input= ip_local_deliver; 1893 rth->dst.input= ip_local_deliver;
1906 rth->rt_flags |= RTCF_LOCAL; 1894 rth->rt_flags |= RTCF_LOCAL;
@@ -1913,7 +1901,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1913 RT_CACHE_STAT_INC(in_slow_mc); 1901 RT_CACHE_STAT_INC(in_slow_mc);
1914 1902
1915 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1903 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1916 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); 1904 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1905 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1917 1906
1918e_nobufs: 1907e_nobufs:
1919 return -ENOBUFS; 1908 return -ENOBUFS;
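
Editorial note: this hunk shows the API shift running through the whole file: rt_intern_hash no longer takes a struct rtable ** plus an int status; it returns either the interned route or an errno encoded in the pointer, which callers unpack with IS_ERR/PTR_ERR. A self-contained model of that idiom; the kernel provides these helpers, and the definitions here are simplified stand-ins:

#include <stdio.h>
#include <errno.h>

/* simplified stand-ins for the kernel's ERR_PTR/PTR_ERR/IS_ERR */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-4095;
}

struct rtable { int dummy; };

static struct rtable *intern(struct rtable *rt, int fail)
{
	if (fail)
		return ERR_PTR(-ENOBUFS);	/* error rides in the pointer */
	return rt;				/* success: a usable route */
}

int main(void)
{
	struct rtable rt = { 0 };
	struct rtable *res = intern(&rt, 1);

	/* the pattern used at the call sites above */
	int err = IS_ERR(res) ? (int)PTR_ERR(res) : 0;

	printf("err=%d\n", err);	/* -105 (ENOBUFS on Linux) */
	return 0;
}
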
@@ -1956,7 +1945,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1956 1945
1957/* called in rcu_read_lock() section */ 1946/* called in rcu_read_lock() section */
1958static int __mkroute_input(struct sk_buff *skb, 1947static int __mkroute_input(struct sk_buff *skb,
1959 struct fib_result *res, 1948 const struct fib_result *res,
1960 struct in_device *in_dev, 1949 struct in_device *in_dev,
1961 __be32 daddr, __be32 saddr, u32 tos, 1950 __be32 daddr, __be32 saddr, u32 tos,
1962 struct rtable **result) 1951 struct rtable **result)
@@ -1978,8 +1967,8 @@ static int __mkroute_input(struct sk_buff *skb,
1978 } 1967 }
1979 1968
1980 1969
1981 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 1970 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1982 in_dev->dev, &spec_dst, &itag, skb->mark); 1971 in_dev->dev, &spec_dst, &itag);
1983 if (err < 0) { 1972 if (err < 0) {
1984 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1973 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1985 saddr); 1974 saddr);
@@ -2010,42 +1999,36 @@ static int __mkroute_input(struct sk_buff *skb,
2010 } 1999 }
2011 } 2000 }
2012 2001
2013 2002 rth = rt_dst_alloc(out_dev->dev,
2014 rth = dst_alloc(&ipv4_dst_ops); 2003 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2004 IN_DEV_CONF_GET(out_dev, NOXFRM));
2015 if (!rth) { 2005 if (!rth) {
2016 err = -ENOBUFS; 2006 err = -ENOBUFS;
2017 goto cleanup; 2007 goto cleanup;
2018 } 2008 }
2019 2009
2020 atomic_set(&rth->dst.__refcnt, 1); 2010 rth->rt_key_dst = daddr;
2021 rth->dst.flags= DST_HOST; 2011 rth->rt_key_src = saddr;
2022 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2012 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2023 rth->dst.flags |= DST_NOPOLICY; 2013 rth->rt_flags = flags;
2024 if (IN_DEV_CONF_GET(out_dev, NOXFRM)) 2014 rth->rt_type = res->type;
2025 rth->dst.flags |= DST_NOXFRM; 2015 rth->rt_key_tos = tos;
2026 rth->fl.fl4_dst = daddr;
2027 rth->rt_dst = daddr; 2016 rth->rt_dst = daddr;
2028 rth->fl.fl4_tos = tos;
2029 rth->fl.mark = skb->mark;
2030 rth->fl.fl4_src = saddr;
2031 rth->rt_src = saddr; 2017 rth->rt_src = saddr;
2018 rth->rt_route_iif = in_dev->dev->ifindex;
2019 rth->rt_iif = in_dev->dev->ifindex;
2020 rth->rt_oif = 0;
2021 rth->rt_mark = skb->mark;
2032 rth->rt_gateway = daddr; 2022 rth->rt_gateway = daddr;
2033 rth->rt_iif =
2034 rth->fl.iif = in_dev->dev->ifindex;
2035 rth->dst.dev = (out_dev)->dev;
2036 dev_hold(rth->dst.dev);
2037 rth->idev = in_dev_get(rth->dst.dev);
2038 rth->fl.oif = 0;
2039 rth->rt_spec_dst= spec_dst; 2023 rth->rt_spec_dst= spec_dst;
2024 rth->rt_peer_genid = 0;
2025 rth->peer = NULL;
2026 rth->fi = NULL;
2040 2027
2041 rth->dst.obsolete = -1;
2042 rth->dst.input = ip_forward; 2028 rth->dst.input = ip_forward;
2043 rth->dst.output = ip_output; 2029 rth->dst.output = ip_output;
2044 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2045
2046 rt_set_nexthop(rth, res, itag);
2047 2030
2048 rth->rt_flags = flags; 2031 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2049 2032
2050 *result = rth; 2033 *result = rth;
2051 err = 0; 2034 err = 0;
@@ -2055,7 +2038,7 @@ static int __mkroute_input(struct sk_buff *skb,
2055 2038
2056static int ip_mkroute_input(struct sk_buff *skb, 2039static int ip_mkroute_input(struct sk_buff *skb,
2057 struct fib_result *res, 2040 struct fib_result *res,
2058 const struct flowi *fl, 2041 const struct flowi4 *fl4,
2059 struct in_device *in_dev, 2042 struct in_device *in_dev,
2060 __be32 daddr, __be32 saddr, u32 tos) 2043 __be32 daddr, __be32 saddr, u32 tos)
2061{ 2044{
@@ -2064,8 +2047,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
2064 unsigned hash; 2047 unsigned hash;
2065 2048
2066#ifdef CONFIG_IP_ROUTE_MULTIPATH 2049#ifdef CONFIG_IP_ROUTE_MULTIPATH
2067 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) 2050 if (res->fi && res->fi->fib_nhs > 1)
2068 fib_select_multipath(fl, res); 2051 fib_select_multipath(res);
2069#endif 2052#endif
2070 2053
2071 /* create a routing cache entry */ 2054 /* create a routing cache entry */
@@ -2074,9 +2057,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
2074 return err; 2057 return err;
2075 2058
2076 /* put it into the cache */ 2059 /* put it into the cache */
2077 hash = rt_hash(daddr, saddr, fl->iif, 2060 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2078 rt_genid(dev_net(rth->dst.dev))); 2061 rt_genid(dev_net(rth->dst.dev)));
2079 return rt_intern_hash(hash, rth, NULL, skb, fl->iif); 2062 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2063 if (IS_ERR(rth))
2064 return PTR_ERR(rth);
2065 return 0;
2080} 2066}
2081 2067
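
Editorial note: ip_mkroute_input buckets the new entry by hashing the (daddr, saddr, iif, genid) tuple. A toy hash-then-chain insertion in the same shape; the mixing function below is a placeholder, since the kernel's rt_hash is jhash-based and differs:

#include <stdio.h>
#include <stdint.h>

#define RT_HASH_BUCKETS 256

struct entry {
	uint32_t daddr, saddr;
	int iif;
	struct entry *next;
};

static struct entry *buckets[RT_HASH_BUCKETS];

/* placeholder mixer; the real code hashes the same tuple with jhash */
static unsigned int rt_hash(uint32_t daddr, uint32_t saddr, int iif,
			    uint32_t genid)
{
	uint32_t h = daddr ^ (saddr * 2654435761u) ^ (uint32_t)iif ^ genid;

	h ^= h >> 16;
	return h & (RT_HASH_BUCKETS - 1);
}

static void intern(struct entry *e, uint32_t genid)
{
	unsigned int h = rt_hash(e->daddr, e->saddr, e->iif, genid);

	e->next = buckets[h];	/* insert at chain head, like the cache */
	buckets[h] = e;
}

int main(void)
{
	struct entry e = { 0x0a000001, 0x0a000002, 3, NULL };

	intern(&e, 42);
	printf("bucket=%u\n", rt_hash(e.daddr, e.saddr, e.iif, 42));
	return 0;
}
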
2082/* 2068/*
@@ -2087,6 +2073,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
2087 * This approach solves two big problems: 2073 * This approach solves two big problems:
2088 * 1. Non-simplex devices are handled properly. 2074 * 1. Non-simplex devices are handled properly.
2089 * 2. IP spoofing attempts are filtered with a 100% guarantee. 2075 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2076 * called with rcu_read_lock()
2090 */ 2077 */
2091 2078
2092static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2079static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2094,21 +2081,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2094{ 2081{
2095 struct fib_result res; 2082 struct fib_result res;
2096 struct in_device *in_dev = __in_dev_get_rcu(dev); 2083 struct in_device *in_dev = __in_dev_get_rcu(dev);
2097 struct flowi fl = { .nl_u = { .ip4_u = 2084 struct flowi4 fl4;
2098 { .daddr = daddr,
2099 .saddr = saddr,
2100 .tos = tos,
2101 .scope = RT_SCOPE_UNIVERSE,
2102 } },
2103 .mark = skb->mark,
2104 .iif = dev->ifindex };
2105 unsigned flags = 0; 2085 unsigned flags = 0;
2106 u32 itag = 0; 2086 u32 itag = 0;
2107 struct rtable * rth; 2087 struct rtable * rth;
2108 unsigned hash; 2088 unsigned hash;
2109 __be32 spec_dst; 2089 __be32 spec_dst;
2110 int err = -EINVAL; 2090 int err = -EINVAL;
2111 int free_res = 0;
2112 struct net * net = dev_net(dev); 2091 struct net * net = dev_net(dev);
2113 2092
2114 /* IP on this device is disabled. */ 2093 /* IP on this device is disabled. */
@@ -2124,7 +2103,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2124 ipv4_is_loopback(saddr)) 2103 ipv4_is_loopback(saddr))
2125 goto martian_source; 2104 goto martian_source;
2126 2105
2127 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) 2106 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2128 goto brd_input; 2107 goto brd_input;
2129 2108
2130 /* Accept zero addresses only to limited broadcast; 2109 /* Accept zero addresses only to limited broadcast;
@@ -2133,19 +2112,25 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2133 if (ipv4_is_zeronet(saddr)) 2112 if (ipv4_is_zeronet(saddr))
2134 goto martian_source; 2113 goto martian_source;
2135 2114
2136 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || 2115 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2137 ipv4_is_loopback(daddr))
2138 goto martian_destination; 2116 goto martian_destination;
2139 2117
2140 /* 2118 /*
2141 * Now we are ready to route the packet. 2119 * Now we are ready to route the packet.
2142 */ 2120 */
2143 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2121 fl4.flowi4_oif = 0;
2122 fl4.flowi4_iif = dev->ifindex;
2123 fl4.flowi4_mark = skb->mark;
2124 fl4.flowi4_tos = tos;
2125 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2126 fl4.daddr = daddr;
2127 fl4.saddr = saddr;
2128 err = fib_lookup(net, &fl4, &res);
2129 if (err != 0) {
2144 if (!IN_DEV_FORWARD(in_dev)) 2130 if (!IN_DEV_FORWARD(in_dev))
2145 goto e_hostunreach; 2131 goto e_hostunreach;
2146 goto no_route; 2132 goto no_route;
2147 } 2133 }
2148 free_res = 1;
2149 2134
2150 RT_CACHE_STAT_INC(in_slow_tot); 2135 RT_CACHE_STAT_INC(in_slow_tot);
2151 2136
@@ -2153,9 +2138,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2153 goto brd_input; 2138 goto brd_input;
2154 2139
2155 if (res.type == RTN_LOCAL) { 2140 if (res.type == RTN_LOCAL) {
2156 err = fib_validate_source(saddr, daddr, tos, 2141 err = fib_validate_source(skb, saddr, daddr, tos,
2157 net->loopback_dev->ifindex, 2142 net->loopback_dev->ifindex,
2158 dev, &spec_dst, &itag, skb->mark); 2143 dev, &spec_dst, &itag);
2159 if (err < 0) 2144 if (err < 0)
2160 goto martian_source_keep_err; 2145 goto martian_source_keep_err;
2161 if (err) 2146 if (err)
@@ -2169,10 +2154,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2169 if (res.type != RTN_UNICAST) 2154 if (res.type != RTN_UNICAST)
2170 goto martian_destination; 2155 goto martian_destination;
2171 2156
2172 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2157 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2173done:
2174 if (free_res)
2175 fib_res_put(&res);
2176out: return err; 2158out: return err;
2177 2159
2178brd_input: 2160brd_input:
@@ -2182,8 +2164,8 @@ brd_input:
2182 if (ipv4_is_zeronet(saddr)) 2164 if (ipv4_is_zeronet(saddr))
2183 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2165 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2184 else { 2166 else {
2185 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 2167 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2186 &itag, skb->mark); 2168 &itag);
2187 if (err < 0) 2169 if (err < 0)
2188 goto martian_source_keep_err; 2170 goto martian_source_keep_err;
2189 if (err) 2171 if (err)
@@ -2194,45 +2176,48 @@ brd_input:
2194 RT_CACHE_STAT_INC(in_brd); 2176 RT_CACHE_STAT_INC(in_brd);
2195 2177
2196local_input: 2178local_input:
2197 rth = dst_alloc(&ipv4_dst_ops); 2179 rth = rt_dst_alloc(net->loopback_dev,
2180 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2198 if (!rth) 2181 if (!rth)
2199 goto e_nobufs; 2182 goto e_nobufs;
2200 2183
2184 rth->dst.input= ip_local_deliver;
2201 rth->dst.output= ip_rt_bug; 2185 rth->dst.output= ip_rt_bug;
2202 rth->dst.obsolete = -1; 2186#ifdef CONFIG_IP_ROUTE_CLASSID
2203 rth->rt_genid = rt_genid(net); 2187 rth->dst.tclassid = itag;
2188#endif
2204 2189
2205 atomic_set(&rth->dst.__refcnt, 1); 2190 rth->rt_key_dst = daddr;
2206 rth->dst.flags= DST_HOST; 2191 rth->rt_key_src = saddr;
2207 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2192 rth->rt_genid = rt_genid(net);
2208 rth->dst.flags |= DST_NOPOLICY; 2193 rth->rt_flags = flags|RTCF_LOCAL;
2209 rth->fl.fl4_dst = daddr; 2194 rth->rt_type = res.type;
2195 rth->rt_key_tos = tos;
2210 rth->rt_dst = daddr; 2196 rth->rt_dst = daddr;
2211 rth->fl.fl4_tos = tos;
2212 rth->fl.mark = skb->mark;
2213 rth->fl.fl4_src = saddr;
2214 rth->rt_src = saddr; 2197 rth->rt_src = saddr;
2215#ifdef CONFIG_NET_CLS_ROUTE 2198#ifdef CONFIG_IP_ROUTE_CLASSID
2216 rth->dst.tclassid = itag; 2199 rth->dst.tclassid = itag;
2217#endif 2200#endif
2218 rth->rt_iif = 2201 rth->rt_route_iif = dev->ifindex;
2219 rth->fl.iif = dev->ifindex; 2202 rth->rt_iif = dev->ifindex;
2220 rth->dst.dev = net->loopback_dev; 2203 rth->rt_oif = 0;
2221 dev_hold(rth->dst.dev); 2204 rth->rt_mark = skb->mark;
2222 rth->idev = in_dev_get(rth->dst.dev);
2223 rth->rt_gateway = daddr; 2205 rth->rt_gateway = daddr;
2224 rth->rt_spec_dst= spec_dst; 2206 rth->rt_spec_dst= spec_dst;
2225 rth->dst.input= ip_local_deliver; 2207 rth->rt_peer_genid = 0;
2226 rth->rt_flags = flags|RTCF_LOCAL; 2208 rth->peer = NULL;
2209 rth->fi = NULL;
2227 if (res.type == RTN_UNREACHABLE) { 2210 if (res.type == RTN_UNREACHABLE) {
2228 rth->dst.input= ip_error; 2211 rth->dst.input= ip_error;
2229 rth->dst.error= -err; 2212 rth->dst.error= -err;
2230 rth->rt_flags &= ~RTCF_LOCAL; 2213 rth->rt_flags &= ~RTCF_LOCAL;
2231 } 2214 }
2232 rth->rt_type = res.type; 2215 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2233 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); 2216 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2234 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); 2217 err = 0;
2235 goto done; 2218 if (IS_ERR(rth))
2219 err = PTR_ERR(rth);
2220 goto out;
2236 2221
2237no_route: 2222no_route:
2238 RT_CACHE_STAT_INC(in_no_route); 2223 RT_CACHE_STAT_INC(in_no_route);
@@ -2255,21 +2240,21 @@ martian_destination:
2255 2240
2256e_hostunreach: 2241e_hostunreach:
2257 err = -EHOSTUNREACH; 2242 err = -EHOSTUNREACH;
2258 goto done; 2243 goto out;
2259 2244
2260e_inval: 2245e_inval:
2261 err = -EINVAL; 2246 err = -EINVAL;
2262 goto done; 2247 goto out;
2263 2248
2264e_nobufs: 2249e_nobufs:
2265 err = -ENOBUFS; 2250 err = -ENOBUFS;
2266 goto done; 2251 goto out;
2267 2252
2268martian_source: 2253martian_source:
2269 err = -EINVAL; 2254 err = -EINVAL;
2270martian_source_keep_err: 2255martian_source_keep_err:
2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2256 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2272 goto done; 2257 goto out;
2273} 2258}
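
Editorial note: throughout ip_route_input_slow the nested struct flowi initializer is replaced by filling a flat flowi4 key field by field before fib_lookup, and the old done/free_res unwinding disappears because lookup results no longer need an explicit fib_res_put under RCU. A compact stand-in for that flat lookup key; the struct below imitates flowi4 and is not the kernel definition:

#include <stdio.h>
#include <stdint.h>

enum { RT_SCOPE_UNIVERSE = 0 };

/* imitation of struct flowi4: the flat key fib_lookup consumes */
struct fl4_key {
	int oif, iif;
	uint32_t mark;
	uint8_t tos;
	int scope;
	uint32_t daddr, saddr;
};

static struct fl4_key input_key(int ifindex, uint32_t mark, uint8_t tos,
				uint32_t daddr, uint32_t saddr)
{
	struct fl4_key fl4 = {
		.oif   = 0,		/* input lookups have no output device */
		.iif   = ifindex,	/* arriving interface */
		.mark  = mark,		/* skb->mark */
		.tos   = tos,
		.scope = RT_SCOPE_UNIVERSE,
		.daddr = daddr,
		.saddr = saddr,
	};
	return fl4;
}

int main(void)
{
	struct fl4_key k = input_key(2, 0, 0x10, 0x0a000001, 0x0a000002);

	printf("iif=%d tos=%#x\n", k.iif, k.tos);
	return 0;
}
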
2274 2259
2275int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2260int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2293,12 +2278,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2293 2278
2294 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2279 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2295 rth = rcu_dereference(rth->dst.rt_next)) { 2280 rth = rcu_dereference(rth->dst.rt_next)) {
2296 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | 2281 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2297 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | 2282 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2298 (rth->fl.iif ^ iif) | 2283 (rth->rt_iif ^ iif) |
2299 rth->fl.oif | 2284 rth->rt_oif |
2300 (rth->fl.fl4_tos ^ tos)) == 0 && 2285 (rth->rt_key_tos ^ tos)) == 0 &&
2301 rth->fl.mark == skb->mark && 2286 rth->rt_mark == skb->mark &&
2302 net_eq(dev_net(rth->dst.dev), net) && 2287 net_eq(dev_net(rth->dst.dev), net) &&
2303 !rt_is_expired(rth)) { 2288 !rt_is_expired(rth)) {
2304 if (noref) { 2289 if (noref) {
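
Editorial note: the cache probe above compares five key fields with a single branch by OR-ing XOR differences; the combined value is zero only when every field matches (the input path additionally demands that rt_oif itself be zero). The same trick in isolation:

#include <stdio.h>
#include <stdint.h>

struct key { uint32_t dst, src, tos; int iif, oif; };

/* zero iff all fields are equal: one test instead of five branches */
static int key_match(const struct key *a, const struct key *b)
{
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(uint32_t)(a->iif ^ b->iif) |
		(uint32_t)(a->oif ^ b->oif) |
		(a->tos ^ b->tos)) == 0;
}

int main(void)
{
	struct key a = { 1, 2, 0x10, 3, 0 };
	struct key b = a;

	printf("%d\n", key_match(&a, &b));	/* 1 */
	b.tos = 0x08;
	printf("%d\n", key_match(&a, &b));	/* 0 */
	return 0;
}
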
@@ -2331,8 +2316,8 @@ skip_cache:
2331 struct in_device *in_dev = __in_dev_get_rcu(dev); 2316 struct in_device *in_dev = __in_dev_get_rcu(dev);
2332 2317
2333 if (in_dev) { 2318 if (in_dev) {
2334 int our = ip_check_mc(in_dev, daddr, saddr, 2319 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2335 ip_hdr(skb)->protocol); 2320 ip_hdr(skb)->protocol);
2336 if (our 2321 if (our
2337#ifdef CONFIG_IP_MROUTE 2322#ifdef CONFIG_IP_MROUTE
2338 || 2323 ||
@@ -2355,108 +2340,95 @@ skip_cache:
2355} 2340}
2356EXPORT_SYMBOL(ip_route_input_common); 2341EXPORT_SYMBOL(ip_route_input_common);
2357 2342
2358static int __mkroute_output(struct rtable **result, 2343/* called with rcu_read_lock() */
2359 struct fib_result *res, 2344static struct rtable *__mkroute_output(const struct fib_result *res,
2360 const struct flowi *fl, 2345 const struct flowi4 *fl4,
2361 const struct flowi *oldflp, 2346 __be32 orig_daddr, __be32 orig_saddr,
2362 struct net_device *dev_out, 2347 int orig_oif, struct net_device *dev_out,
2363 unsigned flags) 2348 unsigned int flags)
2364{ 2349{
2365 struct rtable *rth; 2350 struct fib_info *fi = res->fi;
2351 u32 tos = RT_FL_TOS(fl4);
2366 struct in_device *in_dev; 2352 struct in_device *in_dev;
2367 u32 tos = RT_FL_TOS(oldflp); 2353 u16 type = res->type;
2368 int err = 0; 2354 struct rtable *rth;
2369 2355
2370 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2356 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2371 return -EINVAL; 2357 return ERR_PTR(-EINVAL);
2372 2358
2373 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2359 if (ipv4_is_lbcast(fl4->daddr))
2374 res->type = RTN_BROADCAST; 2360 type = RTN_BROADCAST;
2375 else if (ipv4_is_multicast(fl->fl4_dst)) 2361 else if (ipv4_is_multicast(fl4->daddr))
2376 res->type = RTN_MULTICAST; 2362 type = RTN_MULTICAST;
2377 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) 2363 else if (ipv4_is_zeronet(fl4->daddr))
2378 return -EINVAL; 2364 return ERR_PTR(-EINVAL);
2379 2365
2380 if (dev_out->flags & IFF_LOOPBACK) 2366 if (dev_out->flags & IFF_LOOPBACK)
2381 flags |= RTCF_LOCAL; 2367 flags |= RTCF_LOCAL;
2382 2368
2383 /* get work reference to inet device */ 2369 in_dev = __in_dev_get_rcu(dev_out);
2384 in_dev = in_dev_get(dev_out);
2385 if (!in_dev) 2370 if (!in_dev)
2386 return -EINVAL; 2371 return ERR_PTR(-EINVAL);
2387 2372
2388 if (res->type == RTN_BROADCAST) { 2373 if (type == RTN_BROADCAST) {
2389 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2374 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2390 if (res->fi) { 2375 fi = NULL;
2391 fib_info_put(res->fi); 2376 } else if (type == RTN_MULTICAST) {
2392 res->fi = NULL; 2377 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2393 } 2378 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2394 } else if (res->type == RTN_MULTICAST) { 2379 fl4->flowi4_proto))
2395 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2396 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2397 oldflp->proto))
2398 flags &= ~RTCF_LOCAL; 2380 flags &= ~RTCF_LOCAL;
2399 /* If a multicast route does not exist, use 2381 /* If a multicast route does not exist, use
2400 the default one, but do not gateway in this case. 2382 * the default one, but do not gateway in this case.
2401 Yes, it is a hack. 2383 * Yes, it is a hack.
2402 */ 2384 */
2403 if (res->fi && res->prefixlen < 4) { 2385 if (fi && res->prefixlen < 4)
2404 fib_info_put(res->fi); 2386 fi = NULL;
2405 res->fi = NULL;
2406 }
2407 } 2387 }
2408 2388
2389 rth = rt_dst_alloc(dev_out,
2390 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2391 IN_DEV_CONF_GET(in_dev, NOXFRM));
2392 if (!rth)
2393 return ERR_PTR(-ENOBUFS);
2409 2394
2410 rth = dst_alloc(&ipv4_dst_ops); 2395 rth->dst.output = ip_output;
2411 if (!rth) {
2412 err = -ENOBUFS;
2413 goto cleanup;
2414 }
2415 2396
2416 atomic_set(&rth->dst.__refcnt, 1); 2397 rth->rt_key_dst = orig_daddr;
2417 rth->dst.flags= DST_HOST; 2398 rth->rt_key_src = orig_saddr;
2418 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2419 rth->dst.flags |= DST_NOXFRM;
2420 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2421 rth->dst.flags |= DST_NOPOLICY;
2422
2423 rth->fl.fl4_dst = oldflp->fl4_dst;
2424 rth->fl.fl4_tos = tos;
2425 rth->fl.fl4_src = oldflp->fl4_src;
2426 rth->fl.oif = oldflp->oif;
2427 rth->fl.mark = oldflp->mark;
2428 rth->rt_dst = fl->fl4_dst;
2429 rth->rt_src = fl->fl4_src;
2430 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2431 /* get references to the devices that are to be hold by the routing
2432 cache entry */
2433 rth->dst.dev = dev_out;
2434 dev_hold(dev_out);
2435 rth->idev = in_dev_get(dev_out);
2436 rth->rt_gateway = fl->fl4_dst;
2437 rth->rt_spec_dst= fl->fl4_src;
2438
2439 rth->dst.output=ip_output;
2440 rth->dst.obsolete = -1;
2441 rth->rt_genid = rt_genid(dev_net(dev_out)); 2399 rth->rt_genid = rt_genid(dev_net(dev_out));
2400 rth->rt_flags = flags;
2401 rth->rt_type = type;
2402 rth->rt_key_tos = tos;
2403 rth->rt_dst = fl4->daddr;
2404 rth->rt_src = fl4->saddr;
2405 rth->rt_route_iif = 0;
2406 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2407 rth->rt_oif = orig_oif;
2408 rth->rt_mark = fl4->flowi4_mark;
2409 rth->rt_gateway = fl4->daddr;
2410 rth->rt_spec_dst= fl4->saddr;
2411 rth->rt_peer_genid = 0;
2412 rth->peer = NULL;
2413 rth->fi = NULL;
2442 2414
2443 RT_CACHE_STAT_INC(out_slow_tot); 2415 RT_CACHE_STAT_INC(out_slow_tot);
2444 2416
2445 if (flags & RTCF_LOCAL) { 2417 if (flags & RTCF_LOCAL) {
2446 rth->dst.input = ip_local_deliver; 2418 rth->dst.input = ip_local_deliver;
2447 rth->rt_spec_dst = fl->fl4_dst; 2419 rth->rt_spec_dst = fl4->daddr;
2448 } 2420 }
2449 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2421 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2450 rth->rt_spec_dst = fl->fl4_src; 2422 rth->rt_spec_dst = fl4->saddr;
2451 if (flags & RTCF_LOCAL && 2423 if (flags & RTCF_LOCAL &&
2452 !(dev_out->flags & IFF_LOOPBACK)) { 2424 !(dev_out->flags & IFF_LOOPBACK)) {
2453 rth->dst.output = ip_mc_output; 2425 rth->dst.output = ip_mc_output;
2454 RT_CACHE_STAT_INC(out_slow_mc); 2426 RT_CACHE_STAT_INC(out_slow_mc);
2455 } 2427 }
2456#ifdef CONFIG_IP_MROUTE 2428#ifdef CONFIG_IP_MROUTE
2457 if (res->type == RTN_MULTICAST) { 2429 if (type == RTN_MULTICAST) {
2458 if (IN_DEV_MFORWARD(in_dev) && 2430 if (IN_DEV_MFORWARD(in_dev) &&
2459 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2431 !ipv4_is_local_multicast(fl4->daddr)) {
2460 rth->dst.input = ip_mr_input; 2432 rth->dst.input = ip_mr_input;
2461 rth->dst.output = ip_mc_output; 2433 rth->dst.output = ip_mc_output;
2462 } 2434 }
@@ -2464,73 +2436,47 @@ static int __mkroute_output(struct rtable **result,
2464#endif 2436#endif
2465 } 2437 }
2466 2438
2467 rt_set_nexthop(rth, res, 0); 2439 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2468
2469 rth->rt_flags = flags;
2470
2471 *result = rth;
2472 cleanup:
2473 /* release work reference to inet device */
2474 in_dev_put(in_dev);
2475
2476 return err;
2477}
2478
2479static int ip_mkroute_output(struct rtable **rp,
2480 struct fib_result *res,
2481 const struct flowi *fl,
2482 const struct flowi *oldflp,
2483 struct net_device *dev_out,
2484 unsigned flags)
2485{
2486 struct rtable *rth = NULL;
2487 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2488 unsigned hash;
2489 if (err == 0) {
2490 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2491 rt_genid(dev_net(dev_out)));
2492 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2493 }
2494 2440
2495 return err; 2441 return rth;
2496} 2442}
2497 2443
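
Editorial note: before allocating anything, __mkroute_output reclassifies the destination address itself: limited broadcast forces RTN_BROADCAST, multicast forces RTN_MULTICAST, and a zeronet destination is rejected. That decision lifted out as a pure function; the address-class tests are written inline here as stand-ins for the kernel's ipv4_is_* helpers:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

enum rtn_type { RTN_UNICAST, RTN_BROADCAST, RTN_MULTICAST, RTN_INVALID };

/* address-class tests on host-byte-order addresses */
static int is_lbcast(uint32_t a)    { return a == 0xFFFFFFFFu; }
static int is_multicast(uint32_t a) { return (a & 0xF0000000u) == 0xE0000000u; }
static int is_zeronet(uint32_t a)   { return (a & 0xFF000000u) == 0; }

static enum rtn_type classify_output(uint32_t daddr, enum rtn_type fib_type)
{
	if (is_lbcast(daddr))
		return RTN_BROADCAST;	/* 255.255.255.255 */
	if (is_multicast(daddr))
		return RTN_MULTICAST;	/* 224.0.0.0/4 */
	if (is_zeronet(daddr))
		return RTN_INVALID;	/* caller maps this to -EINVAL */
	return fib_type;		/* otherwise keep fib_lookup's verdict */
}

int main(void)
{
	printf("%d\n", classify_output(ntohl(inet_addr("255.255.255.255")),
				       RTN_UNICAST));	/* 1 = broadcast */
	printf("%d\n", classify_output(ntohl(inet_addr("224.0.0.1")),
				       RTN_UNICAST));	/* 2 = multicast */
	return 0;
}
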
2498/* 2444/*
2499 * Major route resolver routine. 2445 * Major route resolver routine.
2446 * called with rcu_read_lock();
2500 */ 2447 */
2501 2448
2502static int ip_route_output_slow(struct net *net, struct rtable **rp, 2449static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2503 const struct flowi *oldflp) 2450{
2504{
2505 u32 tos = RT_FL_TOS(oldflp);
2506 struct flowi fl = { .nl_u = { .ip4_u =
2507 { .daddr = oldflp->fl4_dst,
2508 .saddr = oldflp->fl4_src,
2509 .tos = tos & IPTOS_RT_MASK,
2510 .scope = ((tos & RTO_ONLINK) ?
2511 RT_SCOPE_LINK :
2512 RT_SCOPE_UNIVERSE),
2513 } },
2514 .mark = oldflp->mark,
2515 .iif = net->loopback_dev->ifindex,
2516 .oif = oldflp->oif };
2517 struct fib_result res;
2518 unsigned flags = 0;
2519 struct net_device *dev_out = NULL; 2451 struct net_device *dev_out = NULL;
2520 int free_res = 0; 2452 u32 tos = RT_FL_TOS(fl4);
2521 int err; 2453 unsigned int flags = 0;
2522 2454 struct fib_result res;
2455 struct rtable *rth;
2456 __be32 orig_daddr;
2457 __be32 orig_saddr;
2458 int orig_oif;
2523 2459
2524 res.fi = NULL; 2460 res.fi = NULL;
2525#ifdef CONFIG_IP_MULTIPLE_TABLES 2461#ifdef CONFIG_IP_MULTIPLE_TABLES
2526 res.r = NULL; 2462 res.r = NULL;
2527#endif 2463#endif
2528 2464
2529 if (oldflp->fl4_src) { 2465 orig_daddr = fl4->daddr;
2530 err = -EINVAL; 2466 orig_saddr = fl4->saddr;
2531 if (ipv4_is_multicast(oldflp->fl4_src) || 2467 orig_oif = fl4->flowi4_oif;
2532 ipv4_is_lbcast(oldflp->fl4_src) || 2468
2533 ipv4_is_zeronet(oldflp->fl4_src)) 2469 fl4->flowi4_iif = net->loopback_dev->ifindex;
2470 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2471 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2472 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2473
2474 rcu_read_lock();
2475 if (fl4->saddr) {
2476 rth = ERR_PTR(-EINVAL);
2477 if (ipv4_is_multicast(fl4->saddr) ||
2478 ipv4_is_lbcast(fl4->saddr) ||
2479 ipv4_is_zeronet(fl4->saddr))
2534 goto out; 2480 goto out;
2535 2481
2536 /* I removed check for oif == dev_out->oif here. 2482 /* I removed check for oif == dev_out->oif here.
@@ -2541,11 +2487,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2541 of another iface. --ANK 2487 of another iface. --ANK
2542 */ 2488 */
2543 2489
2544 if (oldflp->oif == 0 && 2490 if (fl4->flowi4_oif == 0 &&
2545 (ipv4_is_multicast(oldflp->fl4_dst) || 2491 (ipv4_is_multicast(fl4->daddr) ||
2546 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2492 ipv4_is_lbcast(fl4->daddr))) {
2547 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2493 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 dev_out = ip_dev_find(net, oldflp->fl4_src); 2494 dev_out = __ip_dev_find(net, fl4->saddr, false);
2549 if (dev_out == NULL) 2495 if (dev_out == NULL)
2550 goto out; 2496 goto out;
2551 2497
@@ -2564,67 +2510,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2564 Luckily, this hack is a good workaround. 2510 Luckily, this hack is a good workaround.
2565 */ 2511 */
2566 2512
2567 fl.oif = dev_out->ifindex; 2513 fl4->flowi4_oif = dev_out->ifindex;
2568 goto make_route; 2514 goto make_route;
2569 } 2515 }
2570 2516
2571 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { 2517 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2572 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2518 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2573 dev_out = ip_dev_find(net, oldflp->fl4_src); 2519 if (!__ip_dev_find(net, fl4->saddr, false))
2574 if (dev_out == NULL)
2575 goto out; 2520 goto out;
2576 dev_put(dev_out);
2577 dev_out = NULL;
2578 } 2521 }
2579 } 2522 }
2580 2523
2581 2524
2582 if (oldflp->oif) { 2525 if (fl4->flowi4_oif) {
2583 dev_out = dev_get_by_index(net, oldflp->oif); 2526 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2584 err = -ENODEV; 2527 rth = ERR_PTR(-ENODEV);
2585 if (dev_out == NULL) 2528 if (dev_out == NULL)
2586 goto out; 2529 goto out;
2587 2530
2588 /* RACE: Check return value of inet_select_addr instead. */ 2531 /* RACE: Check return value of inet_select_addr instead. */
2589 if (__in_dev_get_rtnl(dev_out) == NULL) { 2532 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2590 dev_put(dev_out); 2533 rth = ERR_PTR(-ENETUNREACH);
2591 goto out; /* Wrong error code */ 2534 goto out;
2592 } 2535 }
2593 2536 if (ipv4_is_local_multicast(fl4->daddr) ||
2594 if (ipv4_is_local_multicast(oldflp->fl4_dst) || 2537 ipv4_is_lbcast(fl4->daddr)) {
2595 oldflp->fl4_dst == htonl(0xFFFFFFFF)) { 2538 if (!fl4->saddr)
2596 if (!fl.fl4_src) 2539 fl4->saddr = inet_select_addr(dev_out, 0,
2597 fl.fl4_src = inet_select_addr(dev_out, 0,
2598 RT_SCOPE_LINK); 2540 RT_SCOPE_LINK);
2599 goto make_route; 2541 goto make_route;
2600 } 2542 }
2601 if (!fl.fl4_src) { 2543 if (fl4->saddr) {
2602 if (ipv4_is_multicast(oldflp->fl4_dst)) 2544 if (ipv4_is_multicast(fl4->daddr))
2603 fl.fl4_src = inet_select_addr(dev_out, 0, 2545 fl4->saddr = inet_select_addr(dev_out, 0,
2604 fl.fl4_scope); 2546 fl4->flowi4_scope);
2605 else if (!oldflp->fl4_dst) 2547 else if (!fl4->daddr)
2606 fl.fl4_src = inet_select_addr(dev_out, 0, 2548 fl4->saddr = inet_select_addr(dev_out, 0,
2607 RT_SCOPE_HOST); 2549 RT_SCOPE_HOST);
2608 } 2550 }
2609 } 2551 }
2610 2552
2611 if (!fl.fl4_dst) { 2553 if (!fl4->daddr) {
2612 fl.fl4_dst = fl.fl4_src; 2554 fl4->daddr = fl4->saddr;
2613 if (!fl.fl4_dst) 2555 if (!fl4->daddr)
2614 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2556 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2615 if (dev_out)
2616 dev_put(dev_out);
2617 dev_out = net->loopback_dev; 2557 dev_out = net->loopback_dev;
2618 dev_hold(dev_out); 2558 fl4->flowi4_oif = net->loopback_dev->ifindex;
2619 fl.oif = net->loopback_dev->ifindex;
2620 res.type = RTN_LOCAL; 2559 res.type = RTN_LOCAL;
2621 flags |= RTCF_LOCAL; 2560 flags |= RTCF_LOCAL;
2622 goto make_route; 2561 goto make_route;
2623 } 2562 }
2624 2563
2625 if (fib_lookup(net, &fl, &res)) { 2564 if (fib_lookup(net, fl4, &res)) {
2626 res.fi = NULL; 2565 res.fi = NULL;
2627 if (oldflp->oif) { 2566 if (fl4->flowi4_oif) {
2628 /* Apparently, routing tables are wrong. Assume 2567 /* Apparently, routing tables are wrong. Assume
2629 that the destination is on link. 2568 that the destination is on link.
2630 2569
@@ -2643,98 +2582,100 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2643 likely IPv6, but we do not. 2582 likely IPv6, but we do not.
2644 */ 2583 */
2645 2584
2646 if (fl.fl4_src == 0) 2585 if (fl4->saddr == 0)
2647 fl.fl4_src = inet_select_addr(dev_out, 0, 2586 fl4->saddr = inet_select_addr(dev_out, 0,
2648 RT_SCOPE_LINK); 2587 RT_SCOPE_LINK);
2649 res.type = RTN_UNICAST; 2588 res.type = RTN_UNICAST;
2650 goto make_route; 2589 goto make_route;
2651 } 2590 }
2652 if (dev_out) 2591 rth = ERR_PTR(-ENETUNREACH);
2653 dev_put(dev_out);
2654 err = -ENETUNREACH;
2655 goto out; 2592 goto out;
2656 } 2593 }
2657 free_res = 1;
2658 2594
2659 if (res.type == RTN_LOCAL) { 2595 if (res.type == RTN_LOCAL) {
2660 if (!fl.fl4_src) 2596 if (!fl4->saddr) {
2661 fl.fl4_src = fl.fl4_dst; 2597 if (res.fi->fib_prefsrc)
2662 if (dev_out) 2598 fl4->saddr = res.fi->fib_prefsrc;
2663 dev_put(dev_out); 2599 else
2600 fl4->saddr = fl4->daddr;
2601 }
2664 dev_out = net->loopback_dev; 2602 dev_out = net->loopback_dev;
2665 dev_hold(dev_out); 2603 fl4->flowi4_oif = dev_out->ifindex;
2666 fl.oif = dev_out->ifindex;
2667 if (res.fi)
2668 fib_info_put(res.fi);
2669 res.fi = NULL; 2604 res.fi = NULL;
2670 flags |= RTCF_LOCAL; 2605 flags |= RTCF_LOCAL;
2671 goto make_route; 2606 goto make_route;
2672 } 2607 }
2673 2608
2674#ifdef CONFIG_IP_ROUTE_MULTIPATH 2609#ifdef CONFIG_IP_ROUTE_MULTIPATH
2675 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2610 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2676 fib_select_multipath(&fl, &res); 2611 fib_select_multipath(&res);
2677 else 2612 else
2678#endif 2613#endif
2679 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2614 if (!res.prefixlen &&
2680 fib_select_default(net, &fl, &res); 2615 res.table->tb_num_default > 1 &&
2616 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2617 fib_select_default(&res);
2681 2618
2682 if (!fl.fl4_src) 2619 if (!fl4->saddr)
2683 fl.fl4_src = FIB_RES_PREFSRC(res); 2620 fl4->saddr = FIB_RES_PREFSRC(net, res);
2684 2621
2685 if (dev_out)
2686 dev_put(dev_out);
2687 dev_out = FIB_RES_DEV(res); 2622 dev_out = FIB_RES_DEV(res);
2688 dev_hold(dev_out); 2623 fl4->flowi4_oif = dev_out->ifindex;
2689 fl.oif = dev_out->ifindex;
2690 2624
2691 2625
2692make_route: 2626make_route:
2693 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2627 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2628 dev_out, flags);
2629 if (!IS_ERR(rth)) {
2630 unsigned int hash;
2694 2631
2632 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2633 rt_genid(dev_net(dev_out)));
2634 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2635 }
2695 2636
2696 if (free_res) 2637out:
2697 fib_res_put(&res); 2638 rcu_read_unlock();
2698 if (dev_out) 2639 return rth;
2699 dev_put(dev_out);
2700out: return err;
2701} 2640}
2702 2641
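
Editorial note: ip_route_output_slow still splits the caller's TOS into a lookup key and a scope: the RTO_ONLINK bit narrows the search to directly connected routes, while IPTOS_RT_MASK keeps only the real TOS bits in the key. A sketch of that split; the two constant values are assumed from their usual route.h definitions:

#include <stdio.h>
#include <stdint.h>

/* assumed values of the route.h constants used by the code above */
#define RTO_ONLINK	0x01u
#define IPTOS_RT_MASK	0x1Cu

enum { RT_SCOPE_UNIVERSE = 0, RT_SCOPE_LINK = 253 };

static void split_tos(uint32_t tos, uint8_t *key_tos, int *scope)
{
	*key_tos = tos & IPTOS_RT_MASK;		/* real TOS bits for the key */
	*scope = (tos & RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
}

int main(void)
{
	uint8_t tos;
	int scope;

	split_tos(0x10 | RTO_ONLINK, &tos, &scope);
	printf("tos=%#x scope=%d\n", tos, scope);	/* 0x10, link scope */
	return 0;
}
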
2703int __ip_route_output_key(struct net *net, struct rtable **rp, 2642struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2704 const struct flowi *flp)
2705{ 2643{
2706 unsigned hash;
2707 struct rtable *rth; 2644 struct rtable *rth;
2645 unsigned int hash;
2708 2646
2709 if (!rt_caching(net)) 2647 if (!rt_caching(net))
2710 goto slow_output; 2648 goto slow_output;
2711 2649
2712 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2650 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2713 2651
2714 rcu_read_lock_bh(); 2652 rcu_read_lock_bh();
2715 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2653 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2716 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2654 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2717 if (rth->fl.fl4_dst == flp->fl4_dst && 2655 if (rth->rt_key_dst == flp4->daddr &&
2718 rth->fl.fl4_src == flp->fl4_src && 2656 rth->rt_key_src == flp4->saddr &&
2719 rth->fl.iif == 0 && 2657 rt_is_output_route(rth) &&
2720 rth->fl.oif == flp->oif && 2658 rth->rt_oif == flp4->flowi4_oif &&
2721 rth->fl.mark == flp->mark && 2659 rth->rt_mark == flp4->flowi4_mark &&
2722 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2660 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2723 (IPTOS_RT_MASK | RTO_ONLINK)) && 2661 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2724 net_eq(dev_net(rth->dst.dev), net) && 2662 net_eq(dev_net(rth->dst.dev), net) &&
2725 !rt_is_expired(rth)) { 2663 !rt_is_expired(rth)) {
2726 dst_use(&rth->dst, jiffies); 2664 dst_use(&rth->dst, jiffies);
2727 RT_CACHE_STAT_INC(out_hit); 2665 RT_CACHE_STAT_INC(out_hit);
2728 rcu_read_unlock_bh(); 2666 rcu_read_unlock_bh();
2729 *rp = rth; 2667 if (!flp4->saddr)
2730 return 0; 2668 flp4->saddr = rth->rt_src;
2669 if (!flp4->daddr)
2670 flp4->daddr = rth->rt_dst;
2671 return rth;
2731 } 2672 }
2732 RT_CACHE_STAT_INC(out_hlist_search); 2673 RT_CACHE_STAT_INC(out_hlist_search);
2733 } 2674 }
2734 rcu_read_unlock_bh(); 2675 rcu_read_unlock_bh();
2735 2676
2736slow_output: 2677slow_output:
2737 return ip_route_output_slow(net, rp, flp); 2678 return ip_route_output_slow(net, flp4);
2738} 2679}
2739EXPORT_SYMBOL_GPL(__ip_route_output_key); 2680EXPORT_SYMBOL_GPL(__ip_route_output_key);
2740 2681
@@ -2743,95 +2684,96 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo
2743 return NULL; 2684 return NULL;
2744} 2685}
2745 2686
2687static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2688{
2689 return 0;
2690}
2691
2746static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2692static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2747{ 2693{
2748} 2694}
2749 2695
2696static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2697 unsigned long old)
2698{
2699 return NULL;
2700}
2701
2750static struct dst_ops ipv4_dst_blackhole_ops = { 2702static struct dst_ops ipv4_dst_blackhole_ops = {
2751 .family = AF_INET, 2703 .family = AF_INET,
2752 .protocol = cpu_to_be16(ETH_P_IP), 2704 .protocol = cpu_to_be16(ETH_P_IP),
2753 .destroy = ipv4_dst_destroy, 2705 .destroy = ipv4_dst_destroy,
2754 .check = ipv4_blackhole_dst_check, 2706 .check = ipv4_blackhole_dst_check,
2707 .default_mtu = ipv4_blackhole_default_mtu,
2708 .default_advmss = ipv4_default_advmss,
2755 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2709 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2756 .entries = ATOMIC_INIT(0), 2710 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2757}; 2711};
2758 2712
2759 2713struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2760static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2761{ 2714{
2762 struct rtable *ort = *rp; 2715 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2763 struct rtable *rt = (struct rtable *) 2716 struct rtable *ort = (struct rtable *) dst_orig;
2764 dst_alloc(&ipv4_dst_blackhole_ops);
2765 2717
2766 if (rt) { 2718 if (rt) {
2767 struct dst_entry *new = &rt->dst; 2719 struct dst_entry *new = &rt->dst;
2768 2720
2769 atomic_set(&new->__refcnt, 1);
2770 new->__use = 1; 2721 new->__use = 1;
2771 new->input = dst_discard; 2722 new->input = dst_discard;
2772 new->output = dst_discard; 2723 new->output = dst_discard;
2773 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32)); 2724 dst_copy_metrics(new, &ort->dst);
2774 2725
2775 new->dev = ort->dst.dev; 2726 new->dev = ort->dst.dev;
2776 if (new->dev) 2727 if (new->dev)
2777 dev_hold(new->dev); 2728 dev_hold(new->dev);
2778 2729
2779 rt->fl = ort->fl; 2730 rt->rt_key_dst = ort->rt_key_dst;
2731 rt->rt_key_src = ort->rt_key_src;
2732 rt->rt_key_tos = ort->rt_key_tos;
2733 rt->rt_route_iif = ort->rt_route_iif;
2734 rt->rt_iif = ort->rt_iif;
2735 rt->rt_oif = ort->rt_oif;
2736 rt->rt_mark = ort->rt_mark;
2780 2737
2781 rt->idev = ort->idev;
2782 if (rt->idev)
2783 in_dev_hold(rt->idev);
2784 rt->rt_genid = rt_genid(net); 2738 rt->rt_genid = rt_genid(net);
2785 rt->rt_flags = ort->rt_flags; 2739 rt->rt_flags = ort->rt_flags;
2786 rt->rt_type = ort->rt_type; 2740 rt->rt_type = ort->rt_type;
2787 rt->rt_dst = ort->rt_dst; 2741 rt->rt_dst = ort->rt_dst;
2788 rt->rt_src = ort->rt_src; 2742 rt->rt_src = ort->rt_src;
2789 rt->rt_iif = ort->rt_iif;
2790 rt->rt_gateway = ort->rt_gateway; 2743 rt->rt_gateway = ort->rt_gateway;
2791 rt->rt_spec_dst = ort->rt_spec_dst; 2744 rt->rt_spec_dst = ort->rt_spec_dst;
2792 rt->peer = ort->peer; 2745 rt->peer = ort->peer;
2793 if (rt->peer) 2746 if (rt->peer)
2794 atomic_inc(&rt->peer->refcnt); 2747 atomic_inc(&rt->peer->refcnt);
2748 rt->fi = ort->fi;
2749 if (rt->fi)
2750 atomic_inc(&rt->fi->fib_clntref);
2795 2751
2796 dst_free(new); 2752 dst_free(new);
2797 } 2753 }
2798 2754
2799 dst_release(&(*rp)->dst); 2755 dst_release(dst_orig);
2800 *rp = rt; 2756
2801 return (rt ? 0 : -ENOMEM); 2757 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2802} 2758}
2803 2759
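
Editorial note: ipv4_blackhole_route clones a live route into one that can never transmit: input and output both discard, default_mtu reports 0, and cow_metrics refuses to hand out writable metrics, which lets xfrm park flows safely. The essential shape as a tiny function-pointer sketch:

#include <stdio.h>

struct pkt;

struct dst {
	int (*input)(struct pkt *);
	int (*output)(struct pkt *);
};

/* both directions resolve to the same sink, like dst_discard */
static int discard(struct pkt *p)
{
	(void)p;
	return 0;
}

static void make_blackhole(struct dst *d)
{
	d->input = discard;
	d->output = discard;
}

int main(void)
{
	struct dst d;

	make_blackhole(&d);
	printf("dropped: %d %d\n", d.input(NULL), d.output(NULL));
	return 0;
}
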
2804int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2760struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2805 struct sock *sk, int flags) 2761 struct sock *sk)
2806{ 2762{
2807 int err; 2763 struct rtable *rt = __ip_route_output_key(net, flp4);
2808
2809 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2810 return err;
2811 2764
2812 if (flp->proto) { 2765 if (IS_ERR(rt))
2813 if (!flp->fl4_src) 2766 return rt;
2814 flp->fl4_src = (*rp)->rt_src;
2815 if (!flp->fl4_dst)
2816 flp->fl4_dst = (*rp)->rt_dst;
2817 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2818 flags ? XFRM_LOOKUP_WAIT : 0);
2819 if (err == -EREMOTE)
2820 err = ipv4_dst_blackhole(net, rp, flp);
2821 2767
2822 return err; 2768 if (flp4->flowi4_proto)
2823 } 2769 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2770 flowi4_to_flowi(flp4),
2771 sk, 0);
2824 2772
2825 return 0; 2773 return rt;
2826} 2774}
2827EXPORT_SYMBOL_GPL(ip_route_output_flow); 2775EXPORT_SYMBOL_GPL(ip_route_output_flow);
2828 2776
2829int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2830{
2831 return ip_route_output_flow(net, rp, flp, NULL, 0);
2832}
2833EXPORT_SYMBOL(ip_route_output_key);
2834
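
Editorial note: the exported ip_route_output_key wrapper is deleted here even though inet_rtm_getroute below still calls a function of that name with the new rtable-returning signature, so the wrapper presumably survives as a static inline in a header (an assumption; only the removal is visible in this diff). A self-contained sketch of what such a wrapper reduces to, with hypothetical *_sketch stand-ins for the real functions:

#include <stdio.h>

struct net;
struct sock;
struct flowi4 { int dummy; };
struct rtable { int dummy; };

/* hypothetical stand-in for the real lookup */
static struct rtable *ip_route_output_flow_sketch(struct net *net,
						  struct flowi4 *flp4,
						  struct sock *sk)
{
	static struct rtable rt;

	(void)net; (void)flp4; (void)sk;
	return &rt;
}

/* the deleted wrapper plausibly reduces to this: same lookup, no socket */
static struct rtable *ip_route_output_key_sketch(struct net *net,
						 struct flowi4 *flp4)
{
	return ip_route_output_flow_sketch(net, flp4, NULL);
}

int main(void)
{
	struct flowi4 fl4 = { 0 };

	printf("%p\n", (void *)ip_route_output_key_sketch(NULL, &fl4));
	return 0;
}
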
2835static int rt_fill_info(struct net *net, 2777static int rt_fill_info(struct net *net,
2836 struct sk_buff *skb, u32 pid, u32 seq, int event, 2778 struct sk_buff *skb, u32 pid, u32 seq, int event,
2837 int nowait, unsigned int flags) 2779 int nowait, unsigned int flags)
@@ -2839,7 +2781,8 @@ static int rt_fill_info(struct net *net,
2839 struct rtable *rt = skb_rtable(skb); 2781 struct rtable *rt = skb_rtable(skb);
2840 struct rtmsg *r; 2782 struct rtmsg *r;
2841 struct nlmsghdr *nlh; 2783 struct nlmsghdr *nlh;
2842 long expires; 2784 long expires = 0;
2785 const struct inet_peer *peer = rt->peer;
2843 u32 id = 0, ts = 0, tsage = 0, error; 2786 u32 id = 0, ts = 0, tsage = 0, error;
2844 2787
2845 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2788 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
@@ -2850,7 +2793,7 @@ static int rt_fill_info(struct net *net,
2850 r->rtm_family = AF_INET; 2793 r->rtm_family = AF_INET;
2851 r->rtm_dst_len = 32; 2794 r->rtm_dst_len = 32;
2852 r->rtm_src_len = 0; 2795 r->rtm_src_len = 0;
2853 r->rtm_tos = rt->fl.fl4_tos; 2796 r->rtm_tos = rt->rt_key_tos;
2854 r->rtm_table = RT_TABLE_MAIN; 2797 r->rtm_table = RT_TABLE_MAIN;
2855 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2798 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2856 r->rtm_type = rt->rt_type; 2799 r->rtm_type = rt->rt_type;
@@ -2862,48 +2805,52 @@ static int rt_fill_info(struct net *net,
2862 2805
2863 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2806 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2864 2807
2865 if (rt->fl.fl4_src) { 2808 if (rt->rt_key_src) {
2866 r->rtm_src_len = 32; 2809 r->rtm_src_len = 32;
2867 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2810 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2868 } 2811 }
2869 if (rt->dst.dev) 2812 if (rt->dst.dev)
2870 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2813 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2871#ifdef CONFIG_NET_CLS_ROUTE 2814#ifdef CONFIG_IP_ROUTE_CLASSID
2872 if (rt->dst.tclassid) 2815 if (rt->dst.tclassid)
2873 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2816 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2874#endif 2817#endif
2875 if (rt->fl.iif) 2818 if (rt_is_input_route(rt))
2876 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2819 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2877 else if (rt->rt_src != rt->fl.fl4_src) 2820 else if (rt->rt_src != rt->rt_key_src)
2878 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2821 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2879 2822
2880 if (rt->rt_dst != rt->rt_gateway) 2823 if (rt->rt_dst != rt->rt_gateway)
2881 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2824 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2882 2825
2883 if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0) 2826 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2884 goto nla_put_failure; 2827 goto nla_put_failure;
2885 2828
2886 if (rt->fl.mark) 2829 if (rt->rt_mark)
2887 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); 2830 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2888 2831
2889 error = rt->dst.error; 2832 error = rt->dst.error;
2890 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 2833 if (peer) {
2891 if (rt->peer) {
2892 inet_peer_refcheck(rt->peer); 2834 inet_peer_refcheck(rt->peer);
2893 id = atomic_read(&rt->peer->ip_id_count) & 0xffff; 2835 id = atomic_read(&peer->ip_id_count) & 0xffff;
2894 if (rt->peer->tcp_ts_stamp) { 2836 if (peer->tcp_ts_stamp) {
2895 ts = rt->peer->tcp_ts; 2837 ts = peer->tcp_ts;
2896 tsage = get_seconds() - rt->peer->tcp_ts_stamp; 2838 tsage = get_seconds() - peer->tcp_ts_stamp;
2897 } 2839 }
2840 expires = ACCESS_ONCE(peer->pmtu_expires);
2841 if (expires)
2842 expires -= jiffies;
2898 } 2843 }
2899 2844
2900 if (rt->fl.iif) { 2845 if (rt_is_input_route(rt)) {
2901#ifdef CONFIG_IP_MROUTE 2846#ifdef CONFIG_IP_MROUTE
2902 __be32 dst = rt->rt_dst; 2847 __be32 dst = rt->rt_dst;
2903 2848
2904 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2849 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2905 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2850 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2906 int err = ipmr_get_route(net, skb, r, nowait); 2851 int err = ipmr_get_route(net, skb,
2852 rt->rt_src, rt->rt_dst,
2853 r, nowait);
2907 if (err <= 0) { 2854 if (err <= 0) {
2908 if (!nowait) { 2855 if (!nowait) {
2909 if (err == 0) 2856 if (err == 0)
@@ -2917,7 +2864,7 @@ static int rt_fill_info(struct net *net,
2917 } 2864 }
2918 } else 2865 } else
2919#endif 2866#endif
2920 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2867 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2921 } 2868 }
2922 2869
2923 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2870 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
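
Editorial note: rt_fill_info now snapshots the peer's pmtu_expires exactly once via ACCESS_ONCE before testing it and converting it to a relative value, so a concurrent update cannot make the zero-check and the subtraction disagree. The hazard in miniature, with a volatile read standing in for ACCESS_ONCE:

#include <stdio.h>

static unsigned long jiffies = 1000;
static volatile unsigned long pmtu_expires = 1500;	/* shared, racy */

static long relative_expiry(void)
{
	unsigned long snap = pmtu_expires;	/* one read, like ACCESS_ONCE */

	/* test and subtract the same snapshot; re-reading the shared
	 * variable between the two steps could report "never expires"
	 * by accident or produce a bogus delta */
	return snap ? (long)(snap - jiffies) : 0;
}

int main(void)
{
	printf("expires in %ld jiffies\n", relative_expiry());	/* 500 */
	return 0;
}
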
@@ -2991,18 +2938,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
2991 if (err == 0 && rt->dst.error) 2938 if (err == 0 && rt->dst.error)
2992 err = -rt->dst.error; 2939 err = -rt->dst.error;
2993 } else { 2940 } else {
2994 struct flowi fl = { 2941 struct flowi4 fl4 = {
2995 .nl_u = { 2942 .daddr = dst,
2996 .ip4_u = { 2943 .saddr = src,
2997 .daddr = dst, 2944 .flowi4_tos = rtm->rtm_tos,
2998 .saddr = src, 2945 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2999 .tos = rtm->rtm_tos, 2946 .flowi4_mark = mark,
3000 },
3001 },
3002 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3003 .mark = mark,
3004 }; 2947 };
3005 err = ip_route_output_key(net, &rt, &fl); 2948 rt = ip_route_output_key(net, &fl4);
2949
2950 err = 0;
2951 if (IS_ERR(rt))
2952 err = PTR_ERR(rt);
3006 } 2953 }
3007 2954
3008 if (err) 2955 if (err)
@@ -3285,6 +3232,8 @@ static __net_init int rt_genid_init(struct net *net)
3285{ 3232{
3286 get_random_bytes(&net->ipv4.rt_genid, 3233 get_random_bytes(&net->ipv4.rt_genid,
3287 sizeof(net->ipv4.rt_genid)); 3234 sizeof(net->ipv4.rt_genid));
3235 get_random_bytes(&net->ipv4.dev_addr_genid,
3236 sizeof(net->ipv4.dev_addr_genid));
3288 return 0; 3237 return 0;
3289} 3238}
3290 3239
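
Editorial note: rt_genid_init seeds the per-namespace generation counters (now including dev_addr_genid) with random values; cache entries stamp the generation at creation, and bumping the counter lazily invalidates every entry instead of walking the hash. The assumed comparison, since rt_is_expired's body is not part of this hunk:

#include <stdio.h>

struct netns { unsigned int rt_genid; };
struct cached_rt { unsigned int rt_genid; };

/* a stale generation stamp means the entry is dead */
static int rt_is_expired(const struct cached_rt *rt, const struct netns *net)
{
	return rt->rt_genid != net->rt_genid;
}

int main(void)
{
	struct netns net = { .rt_genid = 7 };
	struct cached_rt rt = { .rt_genid = 7 };

	printf("%d\n", rt_is_expired(&rt, &net));	/* 0: still valid */
	net.rt_genid++;		/* flush: every cached entry now expired */
	printf("%d\n", rt_is_expired(&rt, &net));	/* 1 */
	return 0;
}
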
@@ -3293,9 +3242,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3293}; 3242};
3294 3243
3295 3244
3296#ifdef CONFIG_NET_CLS_ROUTE 3245#ifdef CONFIG_IP_ROUTE_CLASSID
3297struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3246struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3298#endif /* CONFIG_NET_CLS_ROUTE */ 3247#endif /* CONFIG_IP_ROUTE_CLASSID */
3299 3248
3300static __initdata unsigned long rhash_entries; 3249static __initdata unsigned long rhash_entries;
3301static int __init set_rhash_entries(char *str) 3250static int __init set_rhash_entries(char *str)
@@ -3311,7 +3260,7 @@ int __init ip_rt_init(void)
3311{ 3260{
3312 int rc = 0; 3261 int rc = 0;
3313 3262
3314#ifdef CONFIG_NET_CLS_ROUTE 3263#ifdef CONFIG_IP_ROUTE_CLASSID
3315 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3264 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3316 if (!ip_rt_acct) 3265 if (!ip_rt_acct)
3317 panic("IP: failed to allocate ip_rt_acct\n"); 3266 panic("IP: failed to allocate ip_rt_acct\n");
@@ -3323,6 +3272,12 @@ int __init ip_rt_init(void)
3323 3272
3324 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3273 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3325 3274
3275 if (dst_entries_init(&ipv4_dst_ops) < 0)
3276 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3277
3278 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3279 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3280
3326 rt_hash_table = (struct rt_hash_bucket *) 3281 rt_hash_table = (struct rt_hash_bucket *)
3327 alloc_large_system_hash("IP route cache", 3282 alloc_large_system_hash("IP route cache",
3328 sizeof(struct rt_hash_bucket), 3283 sizeof(struct rt_hash_bucket),
@@ -3342,14 +3297,6 @@ int __init ip_rt_init(void)
3342 devinet_init(); 3297 devinet_init();
3343 ip_fib_init(); 3298 ip_fib_init();
3344 3299
3345 /* All the timers, started at system startup tend
3346 to synchronize. Perturb it a bit.
3347 */
3348 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3349 expires_ljiffies = jiffies;
3350 schedule_delayed_work(&expires_work,
3351 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3352
3353 if (ip_rt_proc_init()) 3300 if (ip_rt_proc_init())
3354 printk(KERN_ERR "Unable to create route proc files\n"); 3301 printk(KERN_ERR "Unable to create route proc files\n");
3355#ifdef CONFIG_XFRM 3302#ifdef CONFIG_XFRM