path: root/net/ipv4/route.c
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--	net/ipv4/route.c	210
1 file changed, 107 insertions(+), 103 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f70ae1bccb8a..3e5b7cc2db4f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -139,6 +139,8 @@ static unsigned long expires_ljiffies;
  */
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
+static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
 static void ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void ipv4_link_failure(struct sk_buff *skb);
@@ -155,6 +157,8 @@ static struct dst_ops ipv4_dst_ops = {
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.gc =			rt_garbage_collect,
 	.check =		ipv4_dst_check,
+	.default_advmss =	ipv4_default_advmss,
+	.default_mtu =		ipv4_default_mtu,
 	.destroy =		ipv4_dst_destroy,
 	.ifdown =		ipv4_dst_ifdown,
 	.negative_advice =	ipv4_negative_advice,
@@ -383,8 +387,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			(__force u32)r->rt_gateway,
 			r->rt_flags, atomic_read(&r->dst.__refcnt),
 			r->dst.__use, 0, (__force u32)r->rt_src,
-			(dst_metric(&r->dst, RTAX_ADVMSS) ?
-			 (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
+			dst_metric_advmss(&r->dst) + 40,
 			dst_metric(&r->dst, RTAX_WINDOW),
 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 			      dst_metric(&r->dst, RTAX_RTTVAR)),
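Note: dst_metric_advmss() can drop the old "metric unset" ternary because the
unset case now falls through to the new default_advmss callback registered
above. A minimal sketch of the fallback shape, assuming the dst.h helper of
this era looks roughly like this:

	/* Return the stored metric, or ask the owning protocol's
	 * dst_ops for a sensible default when none was configured. */
	static inline u32 dst_metric_advmss(const struct dst_entry *dst)
	{
		u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS);

		if (!advmss)
			advmss = dst->ops->default_advmss(dst);
		return advmss;
	}

dst_mtu() presumably gains the same shape via ops->default_mtu(), which is
what lets rt_set_nexthop() further down stop pre-seeding those metrics.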
@@ -684,17 +687,17 @@ static inline bool rt_caching(const struct net *net)
 static inline bool compare_hash_inputs(const struct flowi *fl1,
 				       const struct flowi *fl2)
 {
-	return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
-		 ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
+	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
+		 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
 		 (fl1->iif ^ fl2->iif)) == 0);
 }
 
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
-	return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
-		((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
+	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
+		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
 		(fl1->mark ^ fl2->mark) |
-		(*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
+		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
 		(fl1->oif ^ fl2->oif) |
 		(fl1->iif ^ fl2->iif)) == 0;
 }
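Note: both helpers use the same branch-free idiom: XOR each pair of fields,
OR the per-field results together, and test once against zero. The OR is
zero exactly when every XOR is zero, i.e. when all fields match, so the whole
key comparison compiles to straight-line code with a single branch. A minimal
self-contained illustration (illustrative names, not kernel code):

	/* Compare three field pairs with one final branch. */
	static inline int keys_equal(unsigned a1, unsigned b1,
				     unsigned a2, unsigned b2,
				     unsigned a3, unsigned b3)
	{
		return ((a1 ^ b1) | (a2 ^ b2) | (a3 ^ b3)) == 0;
	}

This hunk only flattens the accessor names (nl_u.ip4_u.daddr becomes
fl4_dst, and so on); the comparison logic is unchanged.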
@@ -714,13 +717,15 @@ static inline int rt_is_expired(struct rtable *rth)
  * Can be called by a softirq or a process.
  * In the later case, we want to be reschedule if necessary
  */
-static void rt_do_flush(int process_context)
+static void rt_do_flush(struct net *net, int process_context)
 {
 	unsigned int i;
 	struct rtable *rth, *next;
-	struct rtable * tail;
 
 	for (i = 0; i <= rt_hash_mask; i++) {
+		struct rtable __rcu **pprev;
+		struct rtable *list;
+
 		if (process_context && need_resched())
 			cond_resched();
 		rth = rcu_dereference_raw(rt_hash_table[i].chain);
@@ -728,50 +733,32 @@ static void rt_do_flush(int process_context)
 			continue;
 
 		spin_lock_bh(rt_hash_lock_addr(i));
-#ifdef CONFIG_NET_NS
-		{
-		struct rtable __rcu **prev;
-		struct rtable *p;
 
-		rth = rcu_dereference_protected(rt_hash_table[i].chain,
+		list = NULL;
+		pprev = &rt_hash_table[i].chain;
+		rth = rcu_dereference_protected(*pprev,
 			lockdep_is_held(rt_hash_lock_addr(i)));
 
-		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail;
-		     tail = rcu_dereference_protected(tail->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i))))
-			if (!rt_is_expired(tail))
-				break;
-		if (rth != tail)
-			rt_hash_table[i].chain = tail;
-
-		/* call rt_free on entries after the tail requiring flush */
-		prev = &rt_hash_table[i].chain;
-		for (p = rcu_dereference_protected(*prev,
+		while (rth) {
+			next = rcu_dereference_protected(rth->dst.rt_next,
 				lockdep_is_held(rt_hash_lock_addr(i)));
-		     p != NULL;
-		     p = next) {
-			next = rcu_dereference_protected(p->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i)));
-			if (!rt_is_expired(p)) {
-				prev = &p->dst.rt_next;
+
+			if (!net ||
+			    net_eq(dev_net(rth->dst.dev), net)) {
+				rcu_assign_pointer(*pprev, next);
+				rcu_assign_pointer(rth->dst.rt_next, list);
+				list = rth;
 			} else {
-				*prev = next;
-				rt_free(p);
+				pprev = &rth->dst.rt_next;
 			}
+			rth = next;
 		}
-		}
-#else
-		rth = rcu_dereference_protected(rt_hash_table[i].chain,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-		rcu_assign_pointer(rt_hash_table[i].chain, NULL);
-		tail = NULL;
-#endif
+
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
-		for (; rth != tail; rth = next) {
-			next = rcu_dereference_protected(rth->dst.rt_next, 1);
-			rt_free(rth);
+		for (; list; list = next) {
+			next = rcu_dereference_protected(list->dst.rt_next, 1);
+			rt_free(list);
 		}
 	}
 }
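Note: the rewrite collapses the old CONFIG_NET_NS and non-NET_NS paths into
one loop: entries belonging to the target namespace (or all entries when net
is NULL) are unlinked onto a private "list" while the per-bucket lock is
held, and rt_free() runs only after spin_unlock_bh(), keeping the free path
out of the critical section. A generic sketch of the pattern, with
illustrative names (node, head, bucket_lock, node_free) rather than kernel
APIs:

	struct node { struct node *next; int dead; };

	struct node **pprev = &head, *n, *reap = NULL;

	spin_lock_bh(&bucket_lock);
	while ((n = *pprev) != NULL) {
		if (n->dead) {
			*pprev = n->next;	/* unlink from the chain */
			n->next = reap;		/* push onto private list */
			reap = n;
		} else {
			pprev = &n->next;
		}
	}
	spin_unlock_bh(&bucket_lock);

	while (reap) {				/* free with no lock held */
		n = reap;
		reap = n->next;
		node_free(n);
	}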
@@ -919,13 +906,13 @@ void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
 	if (delay >= 0)
-		rt_do_flush(!in_softirq());
+		rt_do_flush(net, !in_softirq());
 }
 
 /* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(void)
+void rt_cache_flush_batch(struct net *net)
 {
-	rt_do_flush(!in_softirq());
+	rt_do_flush(net, !in_softirq());
 }
 
 static void rt_emergency_hash_rebuild(struct net *net)
@@ -1289,7 +1276,7 @@ void rt_bind_peer(struct rtable *rt, int create)
 {
 	struct inet_peer *peer;
 
-	peer = inet_getpeer(rt->rt_dst, create);
+	peer = inet_getpeer_v4(rt->rt_dst, create);
 
 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
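Note: this rename tracks the inetpeer cache becoming address-family
agnostic. Assuming the inetpeer.h wrapper of this era, inet_getpeer_v4() is
likely just a thin shim that wraps the IPv4 address in the generic key type,
something like:

	static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr,
							int create)
	{
		struct inetpeer_addr daddr;

		daddr.a4 = v4daddr;
		daddr.family = AF_INET;
		return inet_getpeer(&daddr, create);
	}

so the call here keeps its old semantics.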
@@ -1686,11 +1673,14 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 				if (mtu < dst_mtu(&rth->dst)) {
 					dst_confirm(&rth->dst);
 					if (mtu < ip_rt_min_pmtu) {
+						u32 lock = dst_metric(&rth->dst,
+								      RTAX_LOCK);
 						mtu = ip_rt_min_pmtu;
-						rth->dst.metrics[RTAX_LOCK-1] |=
-							(1 << RTAX_MTU);
+						lock |= (1 << RTAX_MTU);
+						dst_metric_set(&rth->dst, RTAX_LOCK,
+							       lock);
 					}
-					rth->dst.metrics[RTAX_MTU-1] = mtu;
+					dst_metric_set(&rth->dst, RTAX_MTU, mtu);
 					dst_set_expires(&rth->dst,
 							ip_rt_mtu_expires);
 				}
@@ -1708,10 +1698,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 	if (dst_mtu(dst) > mtu && mtu >= 68 &&
 	    !(dst_metric_locked(dst, RTAX_MTU))) {
 		if (mtu < ip_rt_min_pmtu) {
+			u32 lock = dst_metric(dst, RTAX_LOCK);
 			mtu = ip_rt_min_pmtu;
-			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
+			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
 		}
-		dst->metrics[RTAX_MTU-1] = mtu;
+		dst_metric_set(dst, RTAX_MTU, mtu);
 		dst_set_expires(dst, ip_rt_mtu_expires);
 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
 	}
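Note: both PMTU paths now go through dst_metric_set() instead of poking
dst->metrics[] directly; funnelling every access through accessors is what
later allows the metrics storage to be shared copy-on-write between routes.
Assuming the transitional dst.h, the accessors are thin wrappers along these
lines:

	static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
	{
		return dst->_metrics;
	}

	static inline void dst_metric_set(struct dst_entry *dst,
					  int metric, u32 val)
	{
		dst_metrics_ptr(dst)[metric - 1] = val;
	}

RTAX_LOCK itself is a bitmask metric: setting bit RTAX_MTU in it marks the
MTU as locked, which the dst_metric_locked() test above consults before
applying further PMTU updates.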
@@ -1794,38 +1785,55 @@ static void set_class_tag(struct rtable *rt, u32 tag)
 }
 #endif
 
+static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
+{
+	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
+
+	if (advmss == 0) {
+		advmss = max_t(unsigned int, dst->dev->mtu - 40,
+			       ip_rt_min_advmss);
+		if (advmss > 65535 - 40)
+			advmss = 65535 - 40;
+	}
+	return advmss;
+}
+
+static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
+{
+	unsigned int mtu = dst->dev->mtu;
+
+	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+		const struct rtable *rt = (const struct rtable *) dst;
+
+		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
+			mtu = 576;
+	}
+
+	if (mtu > IP_MAX_MTU)
+		mtu = IP_MAX_MTU;
+
+	return mtu;
+}
+
 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 {
+	struct dst_entry *dst = &rt->dst;
 	struct fib_info *fi = res->fi;
 
 	if (fi) {
 		if (FIB_RES_GW(*res) &&
 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = FIB_RES_GW(*res);
-		memcpy(rt->dst.metrics, fi->fib_metrics,
-		       sizeof(rt->dst.metrics));
-		if (fi->fib_mtu == 0) {
-			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
-			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
-			    rt->rt_gateway != rt->rt_dst &&
-			    rt->dst.dev->mtu > 576)
-				rt->dst.metrics[RTAX_MTU-1] = 576;
-		}
+		dst_import_metrics(dst, fi->fib_metrics);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
-	} else
-		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
-
-	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
-		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
-		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
-	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
-		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
-						       ip_rt_min_advmss);
-	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
-		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+	}
+
+	if (dst_mtu(dst) > IP_MAX_MTU)
+		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
+	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
+		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 #ifdef CONFIG_IP_MULTIPLE_TABLES
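Note: rt_set_nexthop() no longer pre-seeds default metrics. The MTU and
advmss defaults move into ipv4_default_mtu()/ipv4_default_advmss() above and
are computed on demand through the new dst_ops callbacks, and the
RTAX_HOPLIMIT seeding can disappear because readers are expected to fall
back to the sysctl at use time, presumably along these lines:

	static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
	{
		int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);

		if (hoplimit == 0)
			hoplimit = sysctl_ip_default_ttl;
		return hoplimit;
	}

dst_import_metrics() stands in for the removed memcpy() of fi->fib_metrics,
hiding the layout of the metrics array behind an accessor.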
@@ -2089,12 +2097,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 {
 	struct fib_result res;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	struct flowi fl = { .nl_u = { .ip4_u =
-				      { .daddr = daddr,
-					.saddr = saddr,
-					.tos = tos,
-					.scope = RT_SCOPE_UNIVERSE,
-				      } },
+	struct flowi fl = { .fl4_dst = daddr,
+			    .fl4_src = saddr,
+			    .fl4_tos = tos,
+			    .fl4_scope = RT_SCOPE_UNIVERSE,
 			    .mark = skb->mark,
 			    .iif = dev->ifindex };
 	unsigned	flags = 0;
@@ -2480,14 +2486,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			      const struct flowi *oldflp)
 {
 	u32 tos	= RT_FL_TOS(oldflp);
-	struct flowi fl = { .nl_u = { .ip4_u =
-				      { .daddr = oldflp->fl4_dst,
-					.saddr = oldflp->fl4_src,
-					.tos = tos & IPTOS_RT_MASK,
-					.scope = ((tos & RTO_ONLINK) ?
-						  RT_SCOPE_LINK :
-						  RT_SCOPE_UNIVERSE),
-				      } },
+	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
+			    .fl4_src = oldflp->fl4_src,
+			    .fl4_tos = tos & IPTOS_RT_MASK,
+			    .fl4_scope = ((tos & RTO_ONLINK) ?
+					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
 			    .mark = oldflp->mark,
 			    .iif = net->loopback_dev->ifindex,
 			    .oif = oldflp->oif };
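Note: the flat .fl4_dst/.fl4_src/.fl4_tos/.fl4_scope designated initializers
can only work if those names alias the old nested union members. Assuming
flow.h provides them as plain macros, roughly:

	#define fl4_dst		nl_u.ip4_u.daddr
	#define fl4_src		nl_u.ip4_u.saddr
	#define fl4_tos		nl_u.ip4_u.tos
	#define fl4_scope	nl_u.ip4_u.scope

the initializers expand to exactly the fields the old code named, so struct
layout and behaviour are untouched; this is a readability cleanup ahead of
splitting flowi per address family.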
@@ -2559,9 +2562,10 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			goto out;
 
 		/* RACE: Check return value of inet_select_addr instead. */
-		if (rcu_dereference(dev_out->ip_ptr) == NULL)
-			goto out;	/* Wrong error code */
-
+		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
+			err = -ENETUNREACH;
+			goto out;
+		}
 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
 		    ipv4_is_lbcast(oldflp->fl4_dst)) {
 			if (!fl.fl4_src)
@@ -2622,8 +2626,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 	}
 
 	if (res.type == RTN_LOCAL) {
-		if (!fl.fl4_src)
-			fl.fl4_src = fl.fl4_dst;
+		if (!fl.fl4_src) {
+			if (res.fi->fib_prefsrc)
+				fl.fl4_src = res.fi->fib_prefsrc;
+			else
+				fl.fl4_src = fl.fl4_dst;
+		}
 		dev_out = net->loopback_dev;
 		fl.oif = dev_out->ifindex;
 		res.fi = NULL;
@@ -2725,7 +2733,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 		new->__use = 1;
 		new->input = dst_discard;
 		new->output = dst_discard;
-		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
+		dst_copy_metrics(new, &ort->dst);
 
 		new->dev = ort->dst.dev;
 		if (new->dev)
@@ -2832,7 +2840,7 @@ static int rt_fill_info(struct net *net,
 	if (rt->rt_dst != rt->rt_gateway)
 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
 
-	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
+	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
 		goto nla_put_failure;
 
 	if (rt->fl.mark)
@@ -2944,13 +2952,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 		err = -rt->dst.error;
 	} else {
 		struct flowi fl = {
-			.nl_u = {
-				.ip4_u = {
-					.daddr = dst,
-					.saddr = src,
-					.tos = rtm->rtm_tos,
-				},
-			},
+			.fl4_dst = dst,
+			.fl4_src = src,
+			.fl4_tos = rtm->rtm_tos,
 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
 			.mark = mark,
 		};