aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorJulian Anastasov <ja@ssi.bg>2012-07-20 04:59:52 -0400
committerSimon Horman <horms@verge.net.au>2012-08-09 21:35:03 -0400
commitf2edb9f7706dcb2c0d9a362b2ba849efe3a97f5e (patch)
treea7e92aae534c38c59e3a08a8497ba80e764d436a /net
parent2b2d280817bd576e97ccd243b9b3a344d11ddd11 (diff)
ipvs: implement passive PMTUD for IPIP packets
IPVS is missing the logic to update the PMTU in routing for its IPIP packets. We monitor the dst_mtu and can return FRAG_NEEDED messages, but if the tunneled packets get an ICMP error we cannot rely on other traffic to save the lowest MTU. The following patch adds ICMP handling for IPIP packets in the incoming direction, from some remote host to our local IP used as saddr in the outer header. This way we can forward any related ICMP traffic if it is for an IPVS TUN connection. For the special case of PMTUD we update the routing and, if the client requested DF, we can forward the error. To properly update the routing we have to bind the cached route (dest->dst_cache) to the selected saddr because ipv4_update_pmtu uses saddr for the dst lookup. Add the IP_VS_RT_MODE_CONNECT flag to force such binding with a second route. Update ip_vs_tunnel_xmit to provide IP_VS_RT_MODE_CONNECT and change the code to copy DF. For now we prefer not to force PMTU discovery (outer DF=1) because we don't have a configuration option to enable or disable PMTUD. As we do not keep any packets to resend, we prefer not to play games with packets without the DF bit because the sender is not informed when they are rejected. Also, change ops->update_pmtu to be called only for local clients because there is no point in updating the MTU for input routes; in our case skb->dst->dev is lo. It seems the code was copied from ipip.c, where the skb dst points to the tunnel device. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
Diffstat (limited to 'net')
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c76
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c79
2 files changed, 128 insertions, 27 deletions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b54eccef40b5..58918e20f9d5 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1303,7 +1303,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1303 struct ip_vs_conn *cp; 1303 struct ip_vs_conn *cp;
1304 struct ip_vs_protocol *pp; 1304 struct ip_vs_protocol *pp;
1305 struct ip_vs_proto_data *pd; 1305 struct ip_vs_proto_data *pd;
1306 unsigned int offset, ihl, verdict; 1306 unsigned int offset, offset2, ihl, verdict;
1307 bool ipip;
1307 1308
1308 *related = 1; 1309 *related = 1;
1309 1310
@@ -1345,6 +1346,21 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1345 1346
1346 net = skb_net(skb); 1347 net = skb_net(skb);
1347 1348
1349 /* Special case for errors for IPIP packets */
1350 ipip = false;
1351 if (cih->protocol == IPPROTO_IPIP) {
1352 if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1353 return NF_ACCEPT;
1354 /* Error for our IPIP must arrive at LOCAL_IN */
1355 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
1356 return NF_ACCEPT;
1357 offset += cih->ihl * 4;
1358 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1359 if (cih == NULL)
1360 return NF_ACCEPT; /* The packet looks wrong, ignore */
1361 ipip = true;
1362 }
1363
1348 pd = ip_vs_proto_data_get(net, cih->protocol); 1364 pd = ip_vs_proto_data_get(net, cih->protocol);
1349 if (!pd) 1365 if (!pd)
1350 return NF_ACCEPT; 1366 return NF_ACCEPT;
@@ -1358,11 +1374,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1358 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1374 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1359 "Checking incoming ICMP for"); 1375 "Checking incoming ICMP for");
1360 1376
1377 offset2 = offset;
1361 offset += cih->ihl * 4; 1378 offset += cih->ihl * 4;
1362 1379
1363 ip_vs_fill_iphdr(AF_INET, cih, &ciph); 1380 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1364 /* The embedded headers contain source and dest in reverse order */ 1381 /* The embedded headers contain source and dest in reverse order.
1365 cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); 1382 * For IPIP this is error for request, not for reply.
1383 */
1384 cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, ipip ? 0 : 1);
1366 if (!cp) 1385 if (!cp)
1367 return NF_ACCEPT; 1386 return NF_ACCEPT;
1368 1387
@@ -1376,6 +1395,57 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1376 goto out; 1395 goto out;
1377 } 1396 }
1378 1397
1398 if (ipip) {
1399 __be32 info = ic->un.gateway;
1400
1401 /* Update the MTU */
1402 if (ic->type == ICMP_DEST_UNREACH &&
1403 ic->code == ICMP_FRAG_NEEDED) {
1404 struct ip_vs_dest *dest = cp->dest;
1405 u32 mtu = ntohs(ic->un.frag.mtu);
1406
1407 /* Strip outer IP and ICMP, go to IPIP header */
1408 __skb_pull(skb, ihl + sizeof(_icmph));
1409 offset2 -= ihl + sizeof(_icmph);
1410 skb_reset_network_header(skb);
1411 IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
1412 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
1413 rcu_read_lock();
1414 ipv4_update_pmtu(skb, dev_net(skb->dev),
1415 mtu, 0, 0, 0, 0);
1416 rcu_read_unlock();
1417 /* Client uses PMTUD? */
1418 if (!(cih->frag_off & htons(IP_DF)))
1419 goto ignore_ipip;
1420 /* Prefer the resulting PMTU */
1421 if (dest) {
1422 spin_lock(&dest->dst_lock);
1423 if (dest->dst_cache)
1424 mtu = dst_mtu(dest->dst_cache);
1425 spin_unlock(&dest->dst_lock);
1426 }
1427 if (mtu > 68 + sizeof(struct iphdr))
1428 mtu -= sizeof(struct iphdr);
1429 info = htonl(mtu);
1430 }
1431 /* Strip outer IP, ICMP and IPIP, go to IP header of
1432 * original request.
1433 */
1434 __skb_pull(skb, offset2);
1435 skb_reset_network_header(skb);
1436 IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
1437 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1438 ic->type, ic->code, ntohl(info));
1439 icmp_send(skb, ic->type, ic->code, info);
1440 /* ICMP can be shorter but anyways, account it */
1441 ip_vs_out_stats(cp, skb);
1442
1443ignore_ipip:
1444 consume_skb(skb);
1445 verdict = NF_STOLEN;
1446 goto out;
1447 }
1448
1379 /* do the statistics and put it back */ 1449 /* do the statistics and put it back */
1380 ip_vs_in_stats(cp, skb); 1450 ip_vs_in_stats(cp, skb);
1381 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) 1451 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 65b616ae1716..c2275ba048e9 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -49,6 +49,7 @@ enum {
49 IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to 49 IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
50 * local 50 * local
51 */ 51 */
52 IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
52}; 53};
53 54
54/* 55/*
@@ -84,6 +85,42 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
84 return dst; 85 return dst;
85} 86}
86 87
88/* Get route to daddr, update *saddr, optionally bind route to saddr */
89static struct rtable *do_output_route4(struct net *net, __be32 daddr,
90 u32 rtos, int rt_mode, __be32 *saddr)
91{
92 struct flowi4 fl4;
93 struct rtable *rt;
94 int loop = 0;
95
96 memset(&fl4, 0, sizeof(fl4));
97 fl4.daddr = daddr;
98 fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
99 fl4.flowi4_tos = rtos;
100
101retry:
102 rt = ip_route_output_key(net, &fl4);
103 if (IS_ERR(rt)) {
104 /* Invalid saddr ? */
105 if (PTR_ERR(rt) == -EINVAL && *saddr &&
106 rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
107 *saddr = 0;
108 flowi4_update_output(&fl4, 0, rtos, daddr, 0);
109 goto retry;
110 }
111 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
112 return NULL;
113 } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
114 ip_rt_put(rt);
115 *saddr = fl4.saddr;
116 flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr);
117 loop++;
118 goto retry;
119 }
120 *saddr = fl4.saddr;
121 return rt;
122}
123
87/* Get route to destination or remote server */ 124/* Get route to destination or remote server */
88static struct rtable * 125static struct rtable *
89__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, 126__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
@@ -98,20 +135,13 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
98 spin_lock(&dest->dst_lock); 135 spin_lock(&dest->dst_lock);
99 if (!(rt = (struct rtable *) 136 if (!(rt = (struct rtable *)
100 __ip_vs_dst_check(dest, rtos))) { 137 __ip_vs_dst_check(dest, rtos))) {
101 struct flowi4 fl4; 138 rt = do_output_route4(net, dest->addr.ip, rtos,
102 139 rt_mode, &dest->dst_saddr.ip);
103 memset(&fl4, 0, sizeof(fl4)); 140 if (!rt) {
104 fl4.daddr = dest->addr.ip;
105 fl4.flowi4_tos = rtos;
106 rt = ip_route_output_key(net, &fl4);
107 if (IS_ERR(rt)) {
108 spin_unlock(&dest->dst_lock); 141 spin_unlock(&dest->dst_lock);
109 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
110 &dest->addr.ip);
111 return NULL; 142 return NULL;
112 } 143 }
113 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); 144 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
114 dest->dst_saddr.ip = fl4.saddr;
115 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " 145 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
116 "rtos=%X\n", 146 "rtos=%X\n",
117 &dest->addr.ip, &dest->dst_saddr.ip, 147 &dest->addr.ip, &dest->dst_saddr.ip,
@@ -122,19 +152,17 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
122 *ret_saddr = dest->dst_saddr.ip; 152 *ret_saddr = dest->dst_saddr.ip;
123 spin_unlock(&dest->dst_lock); 153 spin_unlock(&dest->dst_lock);
124 } else { 154 } else {
125 struct flowi4 fl4; 155 __be32 saddr = htonl(INADDR_ANY);
126 156
127 memset(&fl4, 0, sizeof(fl4)); 157 /* For such unconfigured boxes avoid many route lookups
128 fl4.daddr = daddr; 158 * for performance reasons because we do not remember saddr
129 fl4.flowi4_tos = rtos; 159 */
130 rt = ip_route_output_key(net, &fl4); 160 rt_mode &= ~IP_VS_RT_MODE_CONNECT;
131 if (IS_ERR(rt)) { 161 rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr);
132 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 162 if (!rt)
133 &daddr);
134 return NULL; 163 return NULL;
135 }
136 if (ret_saddr) 164 if (ret_saddr)
137 *ret_saddr = fl4.saddr; 165 *ret_saddr = saddr;
138 } 166 }
139 167
140 local = rt->rt_flags & RTCF_LOCAL; 168 local = rt->rt_flags & RTCF_LOCAL;
@@ -331,6 +359,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
331 old_dst = dest->dst_cache; 359 old_dst = dest->dst_cache;
332 dest->dst_cache = NULL; 360 dest->dst_cache = NULL;
333 dst_release(old_dst); 361 dst_release(old_dst);
362 dest->dst_saddr.ip = 0;
334} 363}
335 364
336#define IP_VS_XMIT_TUNNEL(skb, cp) \ 365#define IP_VS_XMIT_TUNNEL(skb, cp) \
@@ -771,7 +800,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
771 struct net_device *tdev; /* Device to other host */ 800 struct net_device *tdev; /* Device to other host */
772 struct iphdr *old_iph = ip_hdr(skb); 801 struct iphdr *old_iph = ip_hdr(skb);
773 u8 tos = old_iph->tos; 802 u8 tos = old_iph->tos;
774 __be16 df = old_iph->frag_off; 803 __be16 df;
775 struct iphdr *iph; /* Our new IP header */ 804 struct iphdr *iph; /* Our new IP header */
776 unsigned int max_headroom; /* The extra header space needed */ 805 unsigned int max_headroom; /* The extra header space needed */
777 int mtu; 806 int mtu;
@@ -781,7 +810,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
781 810
782 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, 811 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
783 RT_TOS(tos), IP_VS_RT_MODE_LOCAL | 812 RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
784 IP_VS_RT_MODE_NON_LOCAL, 813 IP_VS_RT_MODE_NON_LOCAL |
814 IP_VS_RT_MODE_CONNECT,
785 &saddr))) 815 &saddr)))
786 goto tx_error_icmp; 816 goto tx_error_icmp;
787 if (rt->rt_flags & RTCF_LOCAL) { 817 if (rt->rt_flags & RTCF_LOCAL) {
@@ -796,10 +826,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
796 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); 826 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
797 goto tx_error_put; 827 goto tx_error_put;
798 } 828 }
799 if (skb_dst(skb)) 829 if (rt_is_output_route(skb_rtable(skb)))
800 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 830 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
801 831
802 df |= (old_iph->frag_off & htons(IP_DF)); 832 /* Copy DF, reset fragment offset and MF */
833 df = old_iph->frag_off & htons(IP_DF);
803 834
804 if ((old_iph->frag_off & htons(IP_DF) && 835 if ((old_iph->frag_off & htons(IP_DF) &&
805 mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { 836 mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {