aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- net/netfilter/ipvs/ip_vs_core.c 76
-rw-r--r-- net/netfilter/ipvs/ip_vs_xmit.c 79
2 files changed, 128 insertions, 27 deletions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b54eccef40b5..58918e20f9d5 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1303,7 +1303,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1303 struct ip_vs_conn *cp; 1303 struct ip_vs_conn *cp;
1304 struct ip_vs_protocol *pp; 1304 struct ip_vs_protocol *pp;
1305 struct ip_vs_proto_data *pd; 1305 struct ip_vs_proto_data *pd;
1306 unsigned int offset, ihl, verdict; 1306 unsigned int offset, offset2, ihl, verdict;
1307 bool ipip;
1307 1308
1308 *related = 1; 1309 *related = 1;
1309 1310
@@ -1345,6 +1346,21 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1345 1346
1346 net = skb_net(skb); 1347 net = skb_net(skb);
1347 1348
1349 /* Special case for errors for IPIP packets */
1350 ipip = false;
1351 if (cih->protocol == IPPROTO_IPIP) {
1352 if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1353 return NF_ACCEPT;
1354 /* Error for our IPIP must arrive at LOCAL_IN */
1355 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
1356 return NF_ACCEPT;
1357 offset += cih->ihl * 4;
1358 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1359 if (cih == NULL)
1360 return NF_ACCEPT; /* The packet looks wrong, ignore */
1361 ipip = true;
1362 }
1363
1348 pd = ip_vs_proto_data_get(net, cih->protocol); 1364 pd = ip_vs_proto_data_get(net, cih->protocol);
1349 if (!pd) 1365 if (!pd)
1350 return NF_ACCEPT; 1366 return NF_ACCEPT;
@@ -1358,11 +1374,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1358 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1374 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1359 "Checking incoming ICMP for"); 1375 "Checking incoming ICMP for");
1360 1376
1377 offset2 = offset;
1361 offset += cih->ihl * 4; 1378 offset += cih->ihl * 4;
1362 1379
1363 ip_vs_fill_iphdr(AF_INET, cih, &ciph); 1380 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1364 /* The embedded headers contain source and dest in reverse order */ 1381 /* The embedded headers contain source and dest in reverse order.
1365 cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); 1382 * For IPIP this is error for request, not for reply.
1383 */
1384 cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, ipip ? 0 : 1);
1366 if (!cp) 1385 if (!cp)
1367 return NF_ACCEPT; 1386 return NF_ACCEPT;
1368 1387
@@ -1376,6 +1395,57 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1376 goto out; 1395 goto out;
1377 } 1396 }
1378 1397
1398 if (ipip) {
1399 __be32 info = ic->un.gateway;
1400
1401 /* Update the MTU */
1402 if (ic->type == ICMP_DEST_UNREACH &&
1403 ic->code == ICMP_FRAG_NEEDED) {
1404 struct ip_vs_dest *dest = cp->dest;
1405 u32 mtu = ntohs(ic->un.frag.mtu);
1406
1407 /* Strip outer IP and ICMP, go to IPIP header */
1408 __skb_pull(skb, ihl + sizeof(_icmph));
1409 offset2 -= ihl + sizeof(_icmph);
1410 skb_reset_network_header(skb);
1411 IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
1412 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
1413 rcu_read_lock();
1414 ipv4_update_pmtu(skb, dev_net(skb->dev),
1415 mtu, 0, 0, 0, 0);
1416 rcu_read_unlock();
1417 /* Client uses PMTUD? */
1418 if (!(cih->frag_off & htons(IP_DF)))
1419 goto ignore_ipip;
1420 /* Prefer the resulting PMTU */
1421 if (dest) {
1422 spin_lock(&dest->dst_lock);
1423 if (dest->dst_cache)
1424 mtu = dst_mtu(dest->dst_cache);
1425 spin_unlock(&dest->dst_lock);
1426 }
1427 if (mtu > 68 + sizeof(struct iphdr))
1428 mtu -= sizeof(struct iphdr);
1429 info = htonl(mtu);
1430 }
1431 /* Strip outer IP, ICMP and IPIP, go to IP header of
1432 * original request.
1433 */
1434 __skb_pull(skb, offset2);
1435 skb_reset_network_header(skb);
1436 IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
1437 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1438 ic->type, ic->code, ntohl(info));
1439 icmp_send(skb, ic->type, ic->code, info);
1440 /* ICMP can be shorter but anyways, account it */
1441 ip_vs_out_stats(cp, skb);
1442
1443ignore_ipip:
1444 consume_skb(skb);
1445 verdict = NF_STOLEN;
1446 goto out;
1447 }
1448
1379 /* do the statistics and put it back */ 1449 /* do the statistics and put it back */
1380 ip_vs_in_stats(cp, skb); 1450 ip_vs_in_stats(cp, skb);
1381 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) 1451 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 65b616ae1716..c2275ba048e9 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -49,6 +49,7 @@ enum {
49 IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to 49 IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
50 * local 50 * local
51 */ 51 */
52 IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
52}; 53};
53 54
54/* 55/*
@@ -84,6 +85,42 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
84 return dst; 85 return dst;
85} 86}
86 87
88/* Get route to daddr, update *saddr, optionally bind route to saddr.
 *
 * @net:     namespace passed through to ip_route_output_key()
 * @daddr:   destination address to route to
 * @rtos:    value stored in the flow key's flowi4_tos
 * @rt_mode: IP_VS_RT_MODE_* flags; only IP_VS_RT_MODE_CONNECT is tested here
 * @saddr:   in/out source address. Read as the preferred source only when
 *           IP_VS_RT_MODE_CONNECT is set; on success it is overwritten with
 *           the source address left in the flow key after the lookup.
 *
 * Returns the route from ip_route_output_key(), or NULL when the lookup
 * fails (the failure is logged via IP_VS_DBG_RL). Note that *saddr may
 * already have been zeroed or replaced by the retry paths when NULL is
 * returned.
 * NOTE(review): the returned route presumably carries a reference the
 * caller must drop (the discarded route below is released with
 * ip_rt_put()) - confirm against callers.
 */
89static struct rtable *do_output_route4(struct net *net, __be32 daddr,
90 u32 rtos, int rt_mode, __be32 *saddr)
91{
92 struct flowi4 fl4;
93 struct rtable *rt;
 /* Set once the lookup has been redone with a freshly chosen saddr;
  * blocks the "drop saddr and retry" fallback after that point.
  */
94 int loop = 0;
95
96 memset(&fl4, 0, sizeof(fl4));
97 fl4.daddr = daddr;
 /* Without CONNECT mode the caller's saddr is ignored for the lookup */
98 fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
99 fl4.flowi4_tos = rtos;
100
101retry:
102 rt = ip_route_output_key(net, &fl4);
103 if (IS_ERR(rt)) {
104 /* Invalid saddr ? */
 /* -EINVAL with a non-zero remembered saddr in CONNECT mode: forget
  * the saddr and redo the lookup unbound. Only done while loop == 0,
  * i.e. not after the rebinding retry below.
  */
105 if (PTR_ERR(rt) == -EINVAL && *saddr &&
106 rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
107 *saddr = 0;
 /* Reset the flow key: same tos/daddr, source address cleared */
108 flowi4_update_output(&fl4, 0, rtos, daddr, 0);
109 goto retry;
110 }
111 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
112 return NULL;
113 } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
 /* CONNECT mode, no saddr remembered yet, and the flow key now holds
  * a source address (presumably filled in by the lookup): drop this
  * route, remember the address and look up again with the key bound
  * to it. *saddr is non-zero afterwards, so this branch runs at most
  * once per remembered address.
  */
114 ip_rt_put(rt);
115 *saddr = fl4.saddr;
116 flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr);
117 loop++;
118 goto retry;
119 }
 /* Success: report the source address that ended up in the flow key */
120 *saddr = fl4.saddr;
121 return rt;
122}
123
87/* Get route to destination or remote server */ 124/* Get route to destination or remote server */
88static struct rtable * 125static struct rtable *
89__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, 126__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
@@ -98,20 +135,13 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
98 spin_lock(&dest->dst_lock); 135 spin_lock(&dest->dst_lock);
99 if (!(rt = (struct rtable *) 136 if (!(rt = (struct rtable *)
100 __ip_vs_dst_check(dest, rtos))) { 137 __ip_vs_dst_check(dest, rtos))) {
101 struct flowi4 fl4; 138 rt = do_output_route4(net, dest->addr.ip, rtos,
102 139 rt_mode, &dest->dst_saddr.ip);
103 memset(&fl4, 0, sizeof(fl4)); 140 if (!rt) {
104 fl4.daddr = dest->addr.ip;
105 fl4.flowi4_tos = rtos;
106 rt = ip_route_output_key(net, &fl4);
107 if (IS_ERR(rt)) {
108 spin_unlock(&dest->dst_lock); 141 spin_unlock(&dest->dst_lock);
109 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
110 &dest->addr.ip);
111 return NULL; 142 return NULL;
112 } 143 }
113 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); 144 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
114 dest->dst_saddr.ip = fl4.saddr;
115 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " 145 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
116 "rtos=%X\n", 146 "rtos=%X\n",
117 &dest->addr.ip, &dest->dst_saddr.ip, 147 &dest->addr.ip, &dest->dst_saddr.ip,
@@ -122,19 +152,17 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
122 *ret_saddr = dest->dst_saddr.ip; 152 *ret_saddr = dest->dst_saddr.ip;
123 spin_unlock(&dest->dst_lock); 153 spin_unlock(&dest->dst_lock);
124 } else { 154 } else {
125 struct flowi4 fl4; 155 __be32 saddr = htonl(INADDR_ANY);
126 156
127 memset(&fl4, 0, sizeof(fl4)); 157 /* For such unconfigured boxes avoid many route lookups
128 fl4.daddr = daddr; 158 * for performance reasons because we do not remember saddr
129 fl4.flowi4_tos = rtos; 159 */
130 rt = ip_route_output_key(net, &fl4); 160 rt_mode &= ~IP_VS_RT_MODE_CONNECT;
131 if (IS_ERR(rt)) { 161 rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr);
132 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 162 if (!rt)
133 &daddr);
134 return NULL; 163 return NULL;
135 }
136 if (ret_saddr) 164 if (ret_saddr)
137 *ret_saddr = fl4.saddr; 165 *ret_saddr = saddr;
138 } 166 }
139 167
140 local = rt->rt_flags & RTCF_LOCAL; 168 local = rt->rt_flags & RTCF_LOCAL;
@@ -331,6 +359,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
331 old_dst = dest->dst_cache; 359 old_dst = dest->dst_cache;
332 dest->dst_cache = NULL; 360 dest->dst_cache = NULL;
333 dst_release(old_dst); 361 dst_release(old_dst);
362 dest->dst_saddr.ip = 0;
334} 363}
335 364
336#define IP_VS_XMIT_TUNNEL(skb, cp) \ 365#define IP_VS_XMIT_TUNNEL(skb, cp) \
@@ -771,7 +800,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
771 struct net_device *tdev; /* Device to other host */ 800 struct net_device *tdev; /* Device to other host */
772 struct iphdr *old_iph = ip_hdr(skb); 801 struct iphdr *old_iph = ip_hdr(skb);
773 u8 tos = old_iph->tos; 802 u8 tos = old_iph->tos;
774 __be16 df = old_iph->frag_off; 803 __be16 df;
775 struct iphdr *iph; /* Our new IP header */ 804 struct iphdr *iph; /* Our new IP header */
776 unsigned int max_headroom; /* The extra header space needed */ 805 unsigned int max_headroom; /* The extra header space needed */
777 int mtu; 806 int mtu;
@@ -781,7 +810,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
781 810
782 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, 811 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
783 RT_TOS(tos), IP_VS_RT_MODE_LOCAL | 812 RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
784 IP_VS_RT_MODE_NON_LOCAL, 813 IP_VS_RT_MODE_NON_LOCAL |
814 IP_VS_RT_MODE_CONNECT,
785 &saddr))) 815 &saddr)))
786 goto tx_error_icmp; 816 goto tx_error_icmp;
787 if (rt->rt_flags & RTCF_LOCAL) { 817 if (rt->rt_flags & RTCF_LOCAL) {
@@ -796,10 +826,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
796 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); 826 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
797 goto tx_error_put; 827 goto tx_error_put;
798 } 828 }
799 if (skb_dst(skb)) 829 if (rt_is_output_route(skb_rtable(skb)))
800 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 830 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
801 831
802 df |= (old_iph->frag_off & htons(IP_DF)); 832 /* Copy DF, reset fragment offset and MF */
833 df = old_iph->frag_off & htons(IP_DF);
803 834
804 if ((old_iph->frag_off & htons(IP_DF) && 835 if ((old_iph->frag_off & htons(IP_DF) &&
805 mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { 836 mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {