diff options
author | David S. Miller <davem@davemloft.net> | 2012-06-15 17:54:11 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-06-15 17:54:11 -0400 |
commit | 81aded24675ebda5de8a68843250ad15584ac38a (patch) | |
tree | 84f7bd5cf86cf010394de92efd5e4c5b636b3d20 /net/ipv6 | |
parent | 36393395536064e483b73d173f6afc103eadfbc4 (diff) |
ipv6: Handle PMTU in ICMP error handlers.
One tricky issue on the ipv6 side vs. ipv4 is that the ICMP callouts
to handle the error pass the 32-bit info cookie in network byte order
whereas ipv4 passes it around in host byte order.
Like the ipv4 side, we have two helper functions: one for when we
have a socket context and one for when we do not.
ip6ip6 tunnels are not handled here, because they handle PMTU events
by essentially relaying another ICMP packet-too-big message back to
the original sender.
This patch allows us to get rid of rt6_do_pmtu_disc(). That function
handles all kinds of situations that simply cannot happen when we do the
PMTU update directly using a fully resolved route.
In fact, the "plen == 128" check in ip6_rt_update_pmtu() can very
likely be removed or changed into a BUG_ON() check. We should never
have a prefixed ipv6 route when we get there.
Another piece of strange history here is that TCP and DCCP, unlike in
ipv4, never invoke the update_pmtu() method from their ICMP error
handlers. This is incredibly astonishing, since the ICMP error handler
is where we have the most accurate context in which to make a PMTU
update: namely, a fully connected socket and its associated cached
socket route.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6')
-rw-r--r-- | net/ipv6/ah6.c | 3 | ||||
-rw-r--r-- | net/ipv6/esp6.c | 2 | ||||
-rw-r--r-- | net/ipv6/icmp.c | 6 | ||||
-rw-r--r-- | net/ipv6/ipcomp6.c | 2 | ||||
-rw-r--r-- | net/ipv6/raw.c | 5 | ||||
-rw-r--r-- | net/ipv6/route.c | 143 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 2 | ||||
-rw-r--r-- | net/ipv6/udp.c | 3 |
8 files changed, 48 insertions, 118 deletions
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index f1a4a2c28ed3..49d4d26bda88 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/pfkeyv2.h> | 35 | #include <linux/pfkeyv2.h> |
36 | #include <linux/string.h> | 36 | #include <linux/string.h> |
37 | #include <linux/scatterlist.h> | 37 | #include <linux/scatterlist.h> |
38 | #include <net/ip6_route.h> | ||
38 | #include <net/icmp.h> | 39 | #include <net/icmp.h> |
39 | #include <net/ipv6.h> | 40 | #include <net/ipv6.h> |
40 | #include <net/protocol.h> | 41 | #include <net/protocol.h> |
@@ -621,7 +622,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
621 | 622 | ||
622 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n", | 623 | NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n", |
623 | ntohl(ah->spi), &iph->daddr); | 624 | ntohl(ah->spi), &iph->daddr); |
624 | 625 | ip6_update_pmtu(skb, net, info, 0, 0); | |
625 | xfrm_state_put(x); | 626 | xfrm_state_put(x); |
626 | } | 627 | } |
627 | 628 | ||
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index db1521fcda5b..89a615ba84f8 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/random.h> | 39 | #include <linux/random.h> |
40 | #include <linux/slab.h> | 40 | #include <linux/slab.h> |
41 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <net/ip6_route.h> | ||
42 | #include <net/icmp.h> | 43 | #include <net/icmp.h> |
43 | #include <net/ipv6.h> | 44 | #include <net/ipv6.h> |
44 | #include <net/protocol.h> | 45 | #include <net/protocol.h> |
@@ -442,6 +443,7 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
442 | return; | 443 | return; |
443 | pr_debug("pmtu discovery on SA ESP/%08x/%pI6\n", | 444 | pr_debug("pmtu discovery on SA ESP/%08x/%pI6\n", |
444 | ntohl(esph->spi), &iph->daddr); | 445 | ntohl(esph->spi), &iph->daddr); |
446 | ip6_update_pmtu(skb, net, info, 0, 0); | ||
445 | xfrm_state_put(x); | 447 | xfrm_state_put(x); |
446 | } | 448 | } |
447 | 449 | ||
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ed89bba745a1..5247d5c211f9 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c | |||
@@ -649,7 +649,6 @@ static int icmpv6_rcv(struct sk_buff *skb) | |||
649 | struct net_device *dev = skb->dev; | 649 | struct net_device *dev = skb->dev; |
650 | struct inet6_dev *idev = __in6_dev_get(dev); | 650 | struct inet6_dev *idev = __in6_dev_get(dev); |
651 | const struct in6_addr *saddr, *daddr; | 651 | const struct in6_addr *saddr, *daddr; |
652 | const struct ipv6hdr *orig_hdr; | ||
653 | struct icmp6hdr *hdr; | 652 | struct icmp6hdr *hdr; |
654 | u8 type; | 653 | u8 type; |
655 | 654 | ||
@@ -661,7 +660,7 @@ static int icmpv6_rcv(struct sk_buff *skb) | |||
661 | XFRM_STATE_ICMP)) | 660 | XFRM_STATE_ICMP)) |
662 | goto drop_no_count; | 661 | goto drop_no_count; |
663 | 662 | ||
664 | if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(*orig_hdr))) | 663 | if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr))) |
665 | goto drop_no_count; | 664 | goto drop_no_count; |
666 | 665 | ||
667 | nh = skb_network_offset(skb); | 666 | nh = skb_network_offset(skb); |
@@ -722,9 +721,6 @@ static int icmpv6_rcv(struct sk_buff *skb) | |||
722 | if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) | 721 | if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) |
723 | goto discard_it; | 722 | goto discard_it; |
724 | hdr = icmp6_hdr(skb); | 723 | hdr = icmp6_hdr(skb); |
725 | orig_hdr = (struct ipv6hdr *) (hdr + 1); | ||
726 | rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, | ||
727 | ntohl(hdr->icmp6_mtu)); | ||
728 | 724 | ||
729 | /* | 725 | /* |
730 | * Drop through to notify | 726 | * Drop through to notify |
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 5cb75bfe45b1..92832385a8ef 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/list.h> | 46 | #include <linux/list.h> |
47 | #include <linux/vmalloc.h> | 47 | #include <linux/vmalloc.h> |
48 | #include <linux/rtnetlink.h> | 48 | #include <linux/rtnetlink.h> |
49 | #include <net/ip6_route.h> | ||
49 | #include <net/icmp.h> | 50 | #include <net/icmp.h> |
50 | #include <net/ipv6.h> | 51 | #include <net/ipv6.h> |
51 | #include <net/protocol.h> | 52 | #include <net/protocol.h> |
@@ -74,6 +75,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
74 | 75 | ||
75 | pr_debug("pmtu discovery on SA IPCOMP/%08x/%pI6\n", | 76 | pr_debug("pmtu discovery on SA IPCOMP/%08x/%pI6\n", |
76 | spi, &iph->daddr); | 77 | spi, &iph->daddr); |
78 | ip6_update_pmtu(skb, net, info, 0, 0); | ||
77 | xfrm_state_put(x); | 79 | xfrm_state_put(x); |
78 | } | 80 | } |
79 | 81 | ||
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 93d69836fded..43b0042f15f4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c | |||
@@ -328,9 +328,10 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb, | |||
328 | return; | 328 | return; |
329 | 329 | ||
330 | harderr = icmpv6_err_convert(type, code, &err); | 330 | harderr = icmpv6_err_convert(type, code, &err); |
331 | if (type == ICMPV6_PKT_TOOBIG) | 331 | if (type == ICMPV6_PKT_TOOBIG) { |
332 | ip6_sk_update_pmtu(skb, sk, info); | ||
332 | harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); | 333 | harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); |
333 | 334 | } | |
334 | if (np->recverr) { | 335 | if (np->recverr) { |
335 | u8 *payload = skb->data; | 336 | u8 *payload = skb->data; |
336 | if (!inet->hdrincl) | 337 | if (!inet->hdrincl) |
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 58a3ec23da2f..0d41f68daff2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c | |||
@@ -1049,7 +1049,10 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) | |||
1049 | { | 1049 | { |
1050 | struct rt6_info *rt6 = (struct rt6_info*)dst; | 1050 | struct rt6_info *rt6 = (struct rt6_info*)dst; |
1051 | 1051 | ||
1052 | dst_confirm(dst); | ||
1052 | if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { | 1053 | if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { |
1054 | struct net *net = dev_net(dst->dev); | ||
1055 | |||
1053 | rt6->rt6i_flags |= RTF_MODIFIED; | 1056 | rt6->rt6i_flags |= RTF_MODIFIED; |
1054 | if (mtu < IPV6_MIN_MTU) { | 1057 | if (mtu < IPV6_MIN_MTU) { |
1055 | u32 features = dst_metric(dst, RTAX_FEATURES); | 1058 | u32 features = dst_metric(dst, RTAX_FEATURES); |
@@ -1058,9 +1061,39 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) | |||
1058 | dst_metric_set(dst, RTAX_FEATURES, features); | 1061 | dst_metric_set(dst, RTAX_FEATURES, features); |
1059 | } | 1062 | } |
1060 | dst_metric_set(dst, RTAX_MTU, mtu); | 1063 | dst_metric_set(dst, RTAX_MTU, mtu); |
1064 | rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); | ||
1061 | } | 1065 | } |
1062 | } | 1066 | } |
1063 | 1067 | ||
1068 | void ip6_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, | ||
1069 | int oif, __be32 mark) | ||
1070 | { | ||
1071 | const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; | ||
1072 | struct dst_entry *dst; | ||
1073 | struct flowi6 fl6; | ||
1074 | |||
1075 | memset(&fl6, 0, sizeof(fl6)); | ||
1076 | fl6.flowi6_oif = oif; | ||
1077 | fl6.flowi6_mark = mark; | ||
1078 | fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS; | ||
1079 | fl6.daddr = iph->daddr; | ||
1080 | fl6.saddr = iph->saddr; | ||
1081 | fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; | ||
1082 | |||
1083 | dst = ip6_route_output(net, NULL, &fl6); | ||
1084 | if (!dst->error) | ||
1085 | ip6_rt_update_pmtu(dst, ntohl(mtu)); | ||
1086 | dst_release(dst); | ||
1087 | } | ||
1088 | EXPORT_SYMBOL_GPL(ip6_update_pmtu); | ||
1089 | |||
1090 | void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) | ||
1091 | { | ||
1092 | ip6_update_pmtu(skb, sock_net(sk), mtu, | ||
1093 | sk->sk_bound_dev_if, sk->sk_mark); | ||
1094 | } | ||
1095 | EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); | ||
1096 | |||
1064 | static unsigned int ip6_default_advmss(const struct dst_entry *dst) | 1097 | static unsigned int ip6_default_advmss(const struct dst_entry *dst) |
1065 | { | 1098 | { |
1066 | struct net_device *dev = dst->dev; | 1099 | struct net_device *dev = dst->dev; |
@@ -1704,116 +1737,6 @@ out: | |||
1704 | } | 1737 | } |
1705 | 1738 | ||
1706 | /* | 1739 | /* |
1707 | * Handle ICMP "packet too big" messages | ||
1708 | * i.e. Path MTU discovery | ||
1709 | */ | ||
1710 | |||
1711 | static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr, | ||
1712 | struct net *net, u32 pmtu, int ifindex) | ||
1713 | { | ||
1714 | struct rt6_info *rt, *nrt; | ||
1715 | int allfrag = 0; | ||
1716 | again: | ||
1717 | rt = rt6_lookup(net, daddr, saddr, ifindex, 0); | ||
1718 | if (!rt) | ||
1719 | return; | ||
1720 | |||
1721 | if (rt6_check_expired(rt)) { | ||
1722 | ip6_del_rt(rt); | ||
1723 | goto again; | ||
1724 | } | ||
1725 | |||
1726 | if (pmtu >= dst_mtu(&rt->dst)) | ||
1727 | goto out; | ||
1728 | |||
1729 | if (pmtu < IPV6_MIN_MTU) { | ||
1730 | /* | ||
1731 | * According to RFC2460, PMTU is set to the IPv6 Minimum Link | ||
1732 | * MTU (1280) and a fragment header should always be included | ||
1733 | * after a node receiving Too Big message reporting PMTU is | ||
1734 | * less than the IPv6 Minimum Link MTU. | ||
1735 | */ | ||
1736 | pmtu = IPV6_MIN_MTU; | ||
1737 | allfrag = 1; | ||
1738 | } | ||
1739 | |||
1740 | /* New mtu received -> path was valid. | ||
1741 | They are sent only in response to data packets, | ||
1742 | so that this nexthop apparently is reachable. --ANK | ||
1743 | */ | ||
1744 | dst_confirm(&rt->dst); | ||
1745 | |||
1746 | /* Host route. If it is static, it would be better | ||
1747 | not to override it, but add new one, so that | ||
1748 | when cache entry will expire old pmtu | ||
1749 | would return automatically. | ||
1750 | */ | ||
1751 | if (rt->rt6i_flags & RTF_CACHE) { | ||
1752 | dst_metric_set(&rt->dst, RTAX_MTU, pmtu); | ||
1753 | if (allfrag) { | ||
1754 | u32 features = dst_metric(&rt->dst, RTAX_FEATURES); | ||
1755 | features |= RTAX_FEATURE_ALLFRAG; | ||
1756 | dst_metric_set(&rt->dst, RTAX_FEATURES, features); | ||
1757 | } | ||
1758 | rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); | ||
1759 | rt->rt6i_flags |= RTF_MODIFIED; | ||
1760 | goto out; | ||
1761 | } | ||
1762 | |||
1763 | /* Network route. | ||
1764 | Two cases are possible: | ||
1765 | 1. It is connected route. Action: COW | ||
1766 | 2. It is gatewayed route or NONEXTHOP route. Action: clone it. | ||
1767 | */ | ||
1768 | if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) | ||
1769 | nrt = rt6_alloc_cow(rt, daddr, saddr); | ||
1770 | else | ||
1771 | nrt = rt6_alloc_clone(rt, daddr); | ||
1772 | |||
1773 | if (nrt) { | ||
1774 | dst_metric_set(&nrt->dst, RTAX_MTU, pmtu); | ||
1775 | if (allfrag) { | ||
1776 | u32 features = dst_metric(&nrt->dst, RTAX_FEATURES); | ||
1777 | features |= RTAX_FEATURE_ALLFRAG; | ||
1778 | dst_metric_set(&nrt->dst, RTAX_FEATURES, features); | ||
1779 | } | ||
1780 | |||
1781 | /* According to RFC 1981, detecting PMTU increase shouldn't be | ||
1782 | * happened within 5 mins, the recommended timer is 10 mins. | ||
1783 | * Here this route expiration time is set to ip6_rt_mtu_expires | ||
1784 | * which is 10 mins. After 10 mins the decreased pmtu is expired | ||
1785 | * and detecting PMTU increase will be automatically happened. | ||
1786 | */ | ||
1787 | rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires); | ||
1788 | nrt->rt6i_flags |= RTF_DYNAMIC; | ||
1789 | ip6_ins_rt(nrt); | ||
1790 | } | ||
1791 | out: | ||
1792 | dst_release(&rt->dst); | ||
1793 | } | ||
1794 | |||
1795 | void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr, | ||
1796 | struct net_device *dev, u32 pmtu) | ||
1797 | { | ||
1798 | struct net *net = dev_net(dev); | ||
1799 | |||
1800 | /* | ||
1801 | * RFC 1981 states that a node "MUST reduce the size of the packets it | ||
1802 | * is sending along the path" that caused the Packet Too Big message. | ||
1803 | * Since it's not possible in the general case to determine which | ||
1804 | * interface was used to send the original packet, we update the MTU | ||
1805 | * on the interface that will be used to send future packets. We also | ||
1806 | * update the MTU on the interface that received the Packet Too Big in | ||
1807 | * case the original packet was forced out that interface with | ||
1808 | * SO_BINDTODEVICE or similar. This is the next best thing to the | ||
1809 | * correct behaviour, which would be to update the MTU on all | ||
1810 | * interfaces. | ||
1811 | */ | ||
1812 | rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0); | ||
1813 | rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex); | ||
1814 | } | ||
1815 | |||
1816 | /* | ||
1817 | * Misc support functions | 1740 | * Misc support functions |
1818 | */ | 1741 | */ |
1819 | 1742 | ||
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f91b0bfd12d5..26a88623940b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -415,6 +415,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
415 | } else | 415 | } else |
416 | dst_hold(dst); | 416 | dst_hold(dst); |
417 | 417 | ||
418 | dst->ops->update_pmtu(dst, ntohl(info)); | ||
419 | |||
418 | if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { | 420 | if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { |
419 | tcp_sync_mss(sk, dst_mtu(dst)); | 421 | tcp_sync_mss(sk, dst_mtu(dst)); |
420 | tcp_simple_retransmit(sk); | 422 | tcp_simple_retransmit(sk); |
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f05099fc5901..051ad481973f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c | |||
@@ -479,6 +479,9 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | |||
479 | if (sk == NULL) | 479 | if (sk == NULL) |
480 | return; | 480 | return; |
481 | 481 | ||
482 | if (type == ICMPV6_PKT_TOOBIG) | ||
483 | ip6_sk_update_pmtu(skb, sk, info); | ||
484 | |||
482 | np = inet6_sk(sk); | 485 | np = inet6_sk(sk); |
483 | 486 | ||
484 | if (!icmpv6_err_convert(type, code, &err) && !np->recverr) | 487 | if (!icmpv6_err_convert(type, code, &err) && !np->recverr) |