author    David S. Miller <davem@davemloft.net>  2011-02-09 23:42:07 -0500
committer David S. Miller <davem@davemloft.net>  2011-02-15 00:33:07 -0500
commit    2c8cec5c10bced2408082a6656170e74ac17231c
tree      56f7d290b0c9f305267eeac915ad28f90c10aba2 /net/ipv4/route.c
parent    d606ef3fe0c57504b8e534c58498f73a6abc049a
ipv4: Cache learned PMTU information in inetpeer.

The general idea is that if we learn new PMTU information, we
bump the peer genid.  This triggers the dst_ops->check() code
to validate and if necessary propagate the new PMTU value into
the metrics.

Learned PMTU information self-expires.  This means that it is
not necessary to kill a cached route entry just because the
PMTU information is too old.

As a consequence:

1) When the path appears unreachable (dst_ops->link_failure or
   dst_ops->negative_advice) we unwind the PMTU state if it is
   out of date, instead of killing the cached route.  A redirected
   route will still be invalidated in these situations.

2) rt_check_expire(), rt_worker_func(), et al. are no longer
   necessary at all.

Signed-off-by: David S. Miller <davem@davemloft.net>
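[Editor's note] The invalidation scheme above is a generation-counter
pattern: any path that learns new PMTU state bumps a shared genid, and a
cached route revalidates lazily the next time it is used and notices its
recorded genid is behind.  As a rough illustration only, here is a
self-contained userspace sketch of that pattern, not the kernel code;
every name in it (peer_genid, learn_pmtu, revalidate) is hypothetical,
and C11 atomics stand in for the kernel's atomic_inc() and cmpxchg().

/*
 * Minimal userspace sketch of the peer-genid pattern in this patch.
 * Illustrative names only, not kernel APIs.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint peer_genid;  /* plays the role of __rt_peer_genid */

struct peer {
	_Atomic unsigned long pmtu_expires; /* 0 means "no learned PMTU" */
	unsigned int pmtu_learned;
	unsigned int pmtu_orig;
};

struct route {
	struct peer *peer;
	unsigned int mtu;
	unsigned int peer_genid_seen; /* plays the role of rt->rt_peer_genid */
};

/* Writer side: record a learned PMTU and bump the generation counter. */
static void learn_pmtu(struct peer *p, unsigned int mtu, unsigned long now)
{
	p->pmtu_learned = mtu;
	atomic_store(&p->pmtu_expires, now + 600); /* self-expiring state */
	atomic_fetch_add(&peer_genid, 1);
}

/*
 * Reader side: on lookup, revalidate only if the genid moved.  Expired
 * state is unwound with a compare-and-swap so that, when several readers
 * race on the same peer, exactly one restores the original MTU (this
 * mirrors the cmpxchg calls in the patch).
 */
static void revalidate(struct route *rt, unsigned long now)
{
	unsigned int cur = atomic_load(&peer_genid);
	unsigned long expires;

	if (rt->peer_genid_seen == cur)
		return; /* cache still fresh, nothing to do */

	expires = atomic_load(&rt->peer->pmtu_expires);
	if (expires && now < expires) {
		if (!rt->peer->pmtu_orig)
			rt->peer->pmtu_orig = rt->mtu; /* what to unwind to */
		rt->mtu = rt->peer->pmtu_learned;      /* propagate PMTU */
	} else if (expires &&
		   atomic_compare_exchange_strong(&rt->peer->pmtu_expires,
						  &expires, 0)) {
		rt->mtu = rt->peer->pmtu_orig;         /* unwind stale PMTU */
	}
	rt->peer_genid_seen = cur;
}

int main(void)
{
	struct peer p = { .pmtu_expires = 0, .pmtu_learned = 0, .pmtu_orig = 0 };
	struct route rt = { .peer = &p, .mtu = 1500, .peer_genid_seen = 0 };

	learn_pmtu(&p, 1400, 100); /* ICMP FRAG_NEEDED arrives at t=100 */
	revalidate(&rt, 150);      /* still fresh: MTU drops to 1400 */
	printf("mtu=%u\n", rt.mtu);

	learn_pmtu(&p, 1400, 100); /* bump the genid again... */
	p.pmtu_expires = 100;      /* ...but make the state already stale */
	revalidate(&rt, 800);      /* expired: unwound back to 1500 */
	printf("mtu=%u\n", rt.mtu);
	return 0;
}

Compiled with a C11 compiler this prints mtu=1400 while the learned PMTU
is live and mtu=1500 after it has expired, which corresponds to the
unwind behavior point 1) above describes.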
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--  net/ipv4/route.c | 260
1 file changed, 86 insertions(+), 174 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0979e039104a..11faf14c7430 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
 static int rt_chain_length_max __read_mostly	= 20;
 
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
-
 /*
  *	Interface to generic destination cache.
  */
@@ -668,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->dst.expires;
+		(rth->peer && rth->peer->pmtu_expires);
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -679,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 	if (atomic_read(&rth->dst.__refcnt))
 		goto out;
 
-	ret = 1;
-	if (rth->dst.expires &&
-	    time_after_eq(jiffies, rth->dst.expires))
-		goto out;
-
 	age = jiffies - rth->dst.lastuse;
-	ret = 0;
 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 	    (age <= tmo2 && rt_valuable(rth)))
 		goto out;
@@ -829,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
 	return ONE;
 }
 
-static void rt_check_expire(void)
-{
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long samples = 0;
-	unsigned long sum = 0, sum2 = 0;
-	unsigned long delta;
-	u64 mult;
-
-	delta = jiffies - expires_ljiffies;
-	expires_ljiffies = jiffies;
-	mult = ((u64)delta) << rt_hash_log;
-	if (ip_rt_gc_timeout > 1)
-		do_div(mult, ip_rt_gc_timeout);
-	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-		unsigned long length;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
-
-		if (need_resched())
-			cond_resched();
-
-		samples++;
-
-		if (rcu_dereference_raw(*rthp) == NULL)
-			continue;
-		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
-			prefetch(rth->dst.rt_next);
-			if (rt_is_expired(rth)) {
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				continue;
-			}
-			if (rth->dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					/*
-					 * We only count entries on
-					 * a chain with equal hash inputs once
-					 * so that entries for different QOS
-					 * levels, and other non-hash input
-					 * attributes don't unfairly skew
-					 * the length computation
-					 */
-					length += has_noalias(rt_hash_table[i].chain, rth);
-					continue;
-				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
-				goto nofree;
-
-			/* Cleanup aged off entries. */
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
-		sum += length;
-		sum2 += length*length;
-	}
-	if (samples) {
-		unsigned long avg = sum / samples;
-		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
-		rt_chain_length_max = max_t(unsigned long,
-					    ip_rt_gc_elasticity,
-					    (avg + 4*sd) >> FRACT_BITS);
-	}
-	rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
-	rt_check_expire();
-	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
 /*
  * Pertubation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1535,9 +1435,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		if (dst->obsolete > 0) {
 			ip_rt_put(rt);
 			ret = NULL;
-		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   (rt->dst.expires &&
-			    time_after_eq(jiffies, rt->dst.expires))) {
+		} else if (rt->rt_flags & RTCF_REDIRECTED) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
@@ -1547,6 +1445,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 #endif
 			rt_del(hash, rt);
 			ret = NULL;
+		} else if (rt->peer &&
+			   rt->peer->pmtu_expires &&
+			   time_after_eq(jiffies, rt->peer->pmtu_expires)) {
+			unsigned long orig = rt->peer->pmtu_expires;
+
+			if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+				dst_metric_set(dst, RTAX_MTU,
+					       rt->peer->pmtu_orig);
 		}
 	}
 	return ret;
@@ -1697,80 +1603,78 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 			    unsigned short new_mtu,
 			    struct net_device *dev)
 {
-	int i, k;
 	unsigned short old_mtu = ntohs(iph->tot_len);
-	struct rtable *rth;
-	int  ikeys[2] = { dev->ifindex, 0 };
-	__be32  skeys[2] = { iph->saddr, 0, };
-	__be32 daddr = iph->daddr;
 	unsigned short est_mtu = 0;
+	struct inet_peer *peer;
 
-	for (k = 0; k < 2; k++) {
-		for (i = 0; i < 2; i++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
-						rt_genid(net));
-
-			rcu_read_lock();
-			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-			     rth = rcu_dereference(rth->dst.rt_next)) {
-				unsigned short mtu = new_mtu;
+	peer = inet_getpeer_v4(iph->daddr, 1);
+	if (peer) {
+		unsigned short mtu = new_mtu;
 
-				if (rth->fl.fl4_dst != daddr ||
-				    rth->fl.fl4_src != skeys[i] ||
-				    rth->rt_dst != daddr ||
-				    rth->rt_src != iph->saddr ||
-				    rth->fl.oif != ikeys[k] ||
-				    rt_is_input_route(rth) ||
-				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
-				    !net_eq(dev_net(rth->dst.dev), net) ||
-				    rt_is_expired(rth))
-					continue;
-
-				if (new_mtu < 68 || new_mtu >= old_mtu) {
+		if (new_mtu < 68 || new_mtu >= old_mtu) {
+			/* BSD 4.2 derived systems incorrectly adjust
+			 * tot_len by the IP header length, and report
+			 * a zero MTU in the ICMP message.
+			 */
+			if (mtu == 0 &&
+			    old_mtu >= 68 + (iph->ihl << 2))
+				old_mtu -= iph->ihl << 2;
+			mtu = guess_mtu(old_mtu);
+		}
+
+		if (mtu < ip_rt_min_pmtu)
+			mtu = ip_rt_min_pmtu;
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			est_mtu = mtu;
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+		}
 
-					/* BSD 4.2 compatibility hack :-( */
-					if (mtu == 0 &&
-					    old_mtu >= dst_mtu(&rth->dst) &&
-					    old_mtu >= 68 + (iph->ihl << 2))
-						old_mtu -= iph->ihl << 2;
+		inet_putpeer(peer);
 
-					mtu = guess_mtu(old_mtu);
-				}
-				if (mtu <= dst_mtu(&rth->dst)) {
-					if (mtu < dst_mtu(&rth->dst)) {
-						dst_confirm(&rth->dst);
-						if (mtu < ip_rt_min_pmtu) {
-							u32 lock = dst_metric(&rth->dst,
-									      RTAX_LOCK);
-							mtu = ip_rt_min_pmtu;
-							lock |= (1 << RTAX_MTU);
-							dst_metric_set(&rth->dst, RTAX_LOCK,
-								       lock);
-						}
-						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
-						dst_set_expires(&rth->dst,
-							ip_rt_mtu_expires);
-					}
-					est_mtu = mtu;
-				}
-			}
-			rcu_read_unlock();
-		}
+		atomic_inc(&__rt_peer_genid);
 	}
 	return est_mtu ? : new_mtu;
 }
 
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+	unsigned long expires = peer->pmtu_expires;
+
+	if (time_before(expires, jiffies)) {
+		u32 orig_dst_mtu = dst_mtu(dst);
+		if (peer->pmtu_learned < orig_dst_mtu) {
+			if (!peer->pmtu_orig)
+				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+		}
+	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-	if (dst_mtu(dst) > mtu && mtu >= 68 &&
-	    !(dst_metric_locked(dst, RTAX_MTU))) {
-		if (mtu < ip_rt_min_pmtu) {
-			u32 lock = dst_metric(dst, RTAX_LOCK);
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer;
+
+	dst_confirm(dst);
+
+	if (!rt->peer)
+		rt_bind_peer(rt, 1);
+	peer = rt->peer;
+	if (peer) {
+		if (mtu < ip_rt_min_pmtu)
 			mtu = ip_rt_min_pmtu;
-			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
+		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+			peer->pmtu_learned = mtu;
+			peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+
+			atomic_inc(&__rt_peer_genid);
+			rt->rt_peer_genid = rt_peer_genid();
+
+			check_peer_pmtu(dst, peer);
 		}
-		dst_metric_set(dst, RTAX_MTU, mtu);
-		dst_set_expires(dst, ip_rt_mtu_expires);
+		inet_putpeer(peer);
 	}
 }
@@ -1781,9 +1685,15 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 	if (rt_is_expired(rt))
 		return NULL;
 	if (rt->rt_peer_genid != rt_peer_genid()) {
+		struct inet_peer *peer;
+
 		if (!rt->peer)
 			rt_bind_peer(rt, 0);
 
+		peer = rt->peer;
+		if (peer && peer->pmtu_expires)
+			check_peer_pmtu(dst, peer);
+
 		rt->rt_peer_genid = rt_peer_genid();
 	}
 	return dst;
@@ -1812,8 +1722,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
 	rt = skb_rtable(skb);
-	if (rt)
-		dst_set_expires(&rt->dst, 0);
+	if (rt &&
+	    rt->peer &&
+	    rt->peer->pmtu_expires) {
+		unsigned long orig = rt->peer->pmtu_expires;
+
+		if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+			dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+	}
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1911,6 +1827,9 @@ static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
 		memcpy(peer->metrics, fi->fib_metrics,
 		       sizeof(u32) * RTAX_MAX);
 		dst_init_metrics(&rt->dst, peer->metrics, false);
+
+		if (peer->pmtu_expires)
+			check_peer_pmtu(&rt->dst, peer);
 	} else {
 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
 			rt->fi = fi;
@@ -2961,7 +2880,8 @@ static int rt_fill_info(struct net *net,
 		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
 
 	error = rt->dst.error;
-	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+	expires = (rt->peer && rt->peer->pmtu_expires) ?
+		rt->peer->pmtu_expires - jiffies : 0;
 	if (rt->peer) {
 		inet_peer_refcheck(rt->peer);
 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -3418,14 +3338,6 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
-	/* All the timers, started at system startup tend
-	   to synchronize. Perturb it a bit.
-	 */
-	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
-	expires_ljiffies = jiffies;
-	schedule_delayed_work(&expires_work,
-		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM