author     David S. Miller <davem@davemloft.net>    2011-02-09 23:42:07 -0500
committer  David S. Miller <davem@davemloft.net>    2011-02-15 00:33:07 -0500
commit     2c8cec5c10bced2408082a6656170e74ac17231c (patch)
tree       56f7d290b0c9f305267eeac915ad28f90c10aba2 /net/ipv4/route.c
parent     d606ef3fe0c57504b8e534c58498f73a6abc049a (diff)
ipv4: Cache learned PMTU information in inetpeer.
The general idea is that if we learn new PMTU information, we
bump the peer genid.
This triggers the dst_ops->check() code to revalidate the entry
and, if necessary, propagate the new PMTU value into the metrics.
Learned PMTU information self-expires.
This means that it is not necessary to kill a cached route
entry just because the PMTU information is too old.
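
[Editorial note, not part of the commit: the mechanism described above is a
shared generation counter plus a per-peer expiry timestamp. Below is a minimal
userspace C sketch of that pattern. Every name in it (peer_info,
route_cache_entry, learn_pmtu, route_check, genid_seen) is a hypothetical
stand-in for the kernel's inetpeer/rtable machinery, and wall-clock seconds
stand in for jiffies; it illustrates the idea, not the kernel API.]

/*
 * Illustrative userspace sketch, NOT the kernel code: a shared
 * generation counter makes cached routes lazily revalidate, and a
 * learned PMTU carries its own expiry so stale data unwinds itself.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_uint peer_genid;		/* analogue of __rt_peer_genid */

struct peer_info {			/* stand-in for struct inet_peer */
	unsigned int pmtu_learned;	/* PMTU learned from ICMP */
	time_t pmtu_expires;		/* 0 means "nothing learned" */
};

struct route_cache_entry {		/* stand-in for struct rtable */
	struct peer_info *peer;
	unsigned int mtu;		/* metric used for output */
	unsigned int mtu_orig;		/* value to restore on expiry */
	unsigned int genid_seen;	/* analogue of rt_peer_genid */
};

/* Learning side: record the PMTU, arm its expiry, bump the genid. */
static void learn_pmtu(struct peer_info *peer, unsigned int mtu, int lifetime)
{
	peer->pmtu_learned = mtu;
	peer->pmtu_expires = time(NULL) + lifetime;
	atomic_fetch_add(&peer_genid, 1);
}

/* Checking side: the analogue of the dst_ops->check() revalidation. */
static unsigned int route_check(struct route_cache_entry *rt)
{
	unsigned int cur = atomic_load(&peer_genid);
	struct peer_info *peer = rt->peer;

	if (rt->genid_seen != cur) {
		if (peer && peer->pmtu_expires) {
			if (time(NULL) < peer->pmtu_expires) {
				if (peer->pmtu_learned < rt->mtu)
					rt->mtu = peer->pmtu_learned;
			} else {		/* self-expired: unwind */
				peer->pmtu_expires = 0;
				rt->mtu = rt->mtu_orig;
			}
		}
		rt->genid_seen = cur;
	}
	return rt->mtu;
}

int main(void)
{
	struct peer_info peer = { 0, 0 };
	struct route_cache_entry rt = { &peer, 1500, 1500, 0 };

	learn_pmtu(&peer, 1400, 600);	/* e.g. from an ICMP frag-needed */
	printf("mtu after check: %u\n", route_check(&rt));	/* 1400 */
	return 0;
}

[The key property: readers pay for revalidation only when the counter has
moved, and stale PMTU data ages out on its own, with no periodic scan of the
route hash.]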
As a consequence:
1) When the path appears unreachable (dst_ops->link_failure
or dst_ops->negative_advice) we unwind the PMTU state if
it is out of date, instead of killing the cached route.
A redirected route will still be invalidated in these
situations.
2) rt_check_expire(), rt_worker_func(), et al. are no longer
necessary at all.
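
[Editorial note, not part of the commit: consequence (1) relies on a
compare-and-swap against peer->pmtu_expires so that when several contexts race
to notice stale PMTU state (link failure vs. negative advice, say), exactly
one of them restores the original MTU metric — see the ipv4_negative_advice()
and ipv4_link_failure() hunks below. A minimal C11 sketch of that one-shot
unwind idiom, with hypothetical names throughout:]

/*
 * Illustrative C11 sketch, NOT the kernel code: whichever context
 * wins the compare-and-swap on pmtu_expires performs the unwind;
 * every loser sees 0 and does nothing.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct peer_pmtu_state {
	_Atomic unsigned long pmtu_expires;	/* deadline, 0 = unarmed */
	unsigned int pmtu_orig;			/* metric before learning */
};

/* Returns true for exactly one caller per armed expiry. */
static bool unwind_pmtu_once(struct peer_pmtu_state *p, unsigned int *mtu)
{
	unsigned long orig = atomic_load(&p->pmtu_expires);

	/* analogue of: cmpxchg(&peer->pmtu_expires, orig, 0) == orig */
	if (orig != 0 &&
	    atomic_compare_exchange_strong(&p->pmtu_expires, &orig, 0)) {
		*mtu = p->pmtu_orig;	/* dst_metric_set(..., RTAX_MTU, ...) */
		return true;
	}
	return false;
}

int main(void)
{
	struct peer_pmtu_state peer = { 4242UL, 1500 };
	unsigned int mtu = 1400;

	printf("%d\n", unwind_pmtu_once(&peer, &mtu));	/* 1: unwound */
	printf("%d\n", unwind_pmtu_once(&peer, &mtu));	/* 0: already done */
	printf("mtu = %u\n", mtu);			/* 1500 */
	return 0;
}

[Clearing pmtu_expires also makes rt_valuable() stop treating the entry as
worth keeping (see the first hunk below), which is what lets rt_check_expire()
and rt_worker_func() disappear entirely.]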
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--  net/ipv4/route.c  260
1 files changed, 86 insertions, 174 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0979e039104a..11faf14c7430 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly = 256;
 static int rt_chain_length_max __read_mostly = 20;
 
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
-
 /*
  * Interface to generic destination cache.
  */
@@ -668,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->dst.expires;
+		(rth->peer && rth->peer->pmtu_expires);
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -679,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 	if (atomic_read(&rth->dst.__refcnt))
 		goto out;
 
-	ret = 1;
-	if (rth->dst.expires &&
-	    time_after_eq(jiffies, rth->dst.expires))
-		goto out;
-
 	age = jiffies - rth->dst.lastuse;
-	ret = 0;
 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 	    (age <= tmo2 && rt_valuable(rth)))
 		goto out;
@@ -829,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
 	return ONE;
 }
 
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (rcu_dereference_raw(*rthp) == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-			if (rth->dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					/*
-					 * We only count entries on
-					 * a chain with equal hash inputs once
-					 * so that entries for different QOS
-					 * levels, and other non-hash input
-					 * attributes don't unfairly skew
-					 * the length computation
-					 */
-					length += has_noalias(rt_hash_table[i].chain, rth);
-					continue;
-				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
-				goto nofree;
-
-			/* Cleanup aged off entries. */
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					    ip_rt_gc_elasticity,
-					    (avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
 /*
  * Pertubation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1535,9 +1435,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
-		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   (rt->dst.expires &&
-			    time_after_eq(jiffies, rt->dst.expires))) {
+		} else if (rt->rt_flags & RTCF_REDIRECTED) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
@@ -1547,6 +1445,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 #endif
 			rt_del(hash, rt);
 			ret = NULL;
+		} else if (rt->peer &&
+			   rt->peer->pmtu_expires &&
+			   time_after_eq(jiffies, rt->peer->pmtu_expires)) {
+			unsigned long orig = rt->peer->pmtu_expires;
+
+			if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+				dst_metric_set(dst, RTAX_MTU,
+					       rt->peer->pmtu_orig);
 		}
 	}
 	return ret;
@@ -1697,80 +1603,78 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 				 unsigned short new_mtu,
 				 struct net_device *dev)
 {
-	int i, k;
 	unsigned short old_mtu = ntohs(iph->tot_len);
-	struct rtable *rth;
-	int ikeys[2] = { dev->ifindex, 0 };
-	__be32 skeys[2] = { iph->saddr, 0, };
-	__be32 daddr = iph->daddr;
 	unsigned short est_mtu = 0;
+	struct inet_peer *peer;
 
-	for (k = 0; k < 2; k++) {
-		for (i = 0; i < 2; i++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
-						rt_genid(net));
-
-			rcu_read_lock();
-			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-			     rth = rcu_dereference(rth->dst.rt_next)) {
-				unsigned short mtu = new_mtu;
+	peer = inet_getpeer_v4(iph->daddr, 1);
+	if (peer) {
+		unsigned short mtu = new_mtu;
 
-				if (rth->fl.fl4_dst != daddr ||
-				    rth->fl.fl4_src != skeys[i] ||
-				    rth->rt_dst != daddr ||
-				    rth->rt_src != iph->saddr ||
-				    rth->fl.oif != ikeys[k] ||
-				    rt_is_input_route(rth) ||
-				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
-				    !net_eq(dev_net(rth->dst.dev), net) ||
-				    rt_is_expired(rth))
-					continue;
+		if (new_mtu < 68 || new_mtu >= old_mtu) {
+			/* BSD 4.2 derived systems incorrectly adjust
+			 * tot_len by the IP header length, and report
+			 * a zero MTU in the ICMP message.
+			 */
+			if (mtu == 0 &&
+			    old_mtu >= 68 + (iph->ihl << 2))
+				old_mtu -= iph->ihl << 2;
+			mtu = guess_mtu(old_mtu);
+		}
 
-				if (new_mtu < 68 || new_mtu >= old_mtu) {
+		if (mtu < ip_rt_min_pmtu)
+			mtu = ip_rt_min_pmtu;
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			est_mtu = mtu;
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+		}
 
-					/* BSD 4.2 compatibility hack :-( */
-					if (mtu == 0 &&
-					    old_mtu >= dst_mtu(&rth->dst) &&
-					    old_mtu >= 68 + (iph->ihl << 2))
-						old_mtu -= iph->ihl << 2;
+		inet_putpeer(peer);
 
-					mtu = guess_mtu(old_mtu);
-				}
-				if (mtu <= dst_mtu(&rth->dst)) {
-					if (mtu < dst_mtu(&rth->dst)) {
-						dst_confirm(&rth->dst);
-						if (mtu < ip_rt_min_pmtu) {
-							u32 lock = dst_metric(&rth->dst,
-									      RTAX_LOCK);
-							mtu = ip_rt_min_pmtu;
-							lock |= (1 << RTAX_MTU);
-							dst_metric_set(&rth->dst, RTAX_LOCK,
-								       lock);
-						}
-						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
-						dst_set_expires(&rth->dst,
-								ip_rt_mtu_expires);
-					}
-					est_mtu = mtu;
-				}
-			}
-			rcu_read_unlock();
-		}
+		atomic_inc(&__rt_peer_genid);
 	}
 	return est_mtu ? : new_mtu;
 }
 
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+	unsigned long expires = peer->pmtu_expires;
+
+	if (time_before(expires, jiffies)) {
+		u32 orig_dst_mtu = dst_mtu(dst);
+		if (peer->pmtu_learned < orig_dst_mtu) {
+			if (!peer->pmtu_orig)
+				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+		}
+	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-	if (dst_mtu(dst) > mtu && mtu >= 68 &&
-	    !(dst_metric_locked(dst, RTAX_MTU))) {
-		if (mtu < ip_rt_min_pmtu) {
-			u32 lock = dst_metric(dst, RTAX_LOCK);
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer;
+
+	dst_confirm(dst);
+
+	if (!rt->peer)
+		rt_bind_peer(rt, 1);
+	peer = rt->peer;
+	if (peer) {
+		if (mtu < ip_rt_min_pmtu)
 			mtu = ip_rt_min_pmtu;
-			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+
+			atomic_inc(&__rt_peer_genid);
+			rt->rt_peer_genid = rt_peer_genid();
+
+			check_peer_pmtu(dst, peer);
 		}
-		dst_metric_set(dst, RTAX_MTU, mtu);
-		dst_set_expires(dst, ip_rt_mtu_expires);
+		inet_putpeer(peer);
 	}
 }
@@ -1781,9 +1685,15 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 	if (rt_is_expired(rt))
 		return NULL;
 	if (rt->rt_peer_genid != rt_peer_genid()) {
+		struct inet_peer *peer;
+
 		if (!rt->peer)
 			rt_bind_peer(rt, 0);
 
+		peer = rt->peer;
+		if (peer && peer->pmtu_expires)
+			check_peer_pmtu(dst, peer);
+
 		rt->rt_peer_genid = rt_peer_genid();
 	}
 	return dst;
@@ -1812,8 +1722,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
 	rt = skb_rtable(skb);
-	if (rt)
-		dst_set_expires(&rt->dst, 0);
+	if (rt &&
+	    rt->peer &&
+	    rt->peer->pmtu_expires) {
+		unsigned long orig = rt->peer->pmtu_expires;
+
+		if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+			dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+	}
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1911,6 +1827,9 @@ static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
 		memcpy(peer->metrics, fi->fib_metrics,
 		       sizeof(u32) * RTAX_MAX);
 		dst_init_metrics(&rt->dst, peer->metrics, false);
+
+		if (peer->pmtu_expires)
+			check_peer_pmtu(&rt->dst, peer);
 	} else {
 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
 			rt->fi = fi;
@@ -2961,7 +2880,8 @@ static int rt_fill_info(struct net *net,
 		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
 
 	error = rt->dst.error;
-	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+	expires = (rt->peer && rt->peer->pmtu_expires) ?
+		rt->peer->pmtu_expires - jiffies : 0;
 	if (rt->peer) {
 		inet_peer_refcheck(rt->peer);
 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -3418,14 +3338,6 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
-	/* All the timers, started at system startup tend
-	   to synchronize. Perturb it a bit.
-	 */
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM