diff options
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 76 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_xmit.c | 79 |
2 files changed, 128 insertions, 27 deletions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b54eccef40b5..58918e20f9d5 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1303,7 +1303,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1303 | struct ip_vs_conn *cp; | 1303 | struct ip_vs_conn *cp; |
1304 | struct ip_vs_protocol *pp; | 1304 | struct ip_vs_protocol *pp; |
1305 | struct ip_vs_proto_data *pd; | 1305 | struct ip_vs_proto_data *pd; |
1306 | unsigned int offset, ihl, verdict; | 1306 | unsigned int offset, offset2, ihl, verdict; |
1307 | bool ipip; | ||
1307 | 1308 | ||
1308 | *related = 1; | 1309 | *related = 1; |
1309 | 1310 | ||
@@ -1345,6 +1346,21 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1345 | 1346 | ||
1346 | net = skb_net(skb); | 1347 | net = skb_net(skb); |
1347 | 1348 | ||
1349 | /* Special case for errors for IPIP packets */ | ||
1350 | ipip = false; | ||
1351 | if (cih->protocol == IPPROTO_IPIP) { | ||
1352 | if (unlikely(cih->frag_off & htons(IP_OFFSET))) | ||
1353 | return NF_ACCEPT; | ||
1354 | /* Error for our IPIP must arrive at LOCAL_IN */ | ||
1355 | if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) | ||
1356 | return NF_ACCEPT; | ||
1357 | offset += cih->ihl * 4; | ||
1358 | cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); | ||
1359 | if (cih == NULL) | ||
1360 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | ||
1361 | ipip = true; | ||
1362 | } | ||
1363 | |||
1348 | pd = ip_vs_proto_data_get(net, cih->protocol); | 1364 | pd = ip_vs_proto_data_get(net, cih->protocol); |
1349 | if (!pd) | 1365 | if (!pd) |
1350 | return NF_ACCEPT; | 1366 | return NF_ACCEPT; |
@@ -1358,11 +1374,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1358 | IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, | 1374 | IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, |
1359 | "Checking incoming ICMP for"); | 1375 | "Checking incoming ICMP for"); |
1360 | 1376 | ||
1377 | offset2 = offset; | ||
1361 | offset += cih->ihl * 4; | 1378 | offset += cih->ihl * 4; |
1362 | 1379 | ||
1363 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); | 1380 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); |
1364 | /* The embedded headers contain source and dest in reverse order */ | 1381 | /* The embedded headers contain source and dest in reverse order. |
1365 | cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); | 1382 | * For IPIP this is error for request, not for reply. |
1383 | */ | ||
1384 | cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, ipip ? 0 : 1); | ||
1366 | if (!cp) | 1385 | if (!cp) |
1367 | return NF_ACCEPT; | 1386 | return NF_ACCEPT; |
1368 | 1387 | ||
@@ -1376,6 +1395,57 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1376 | goto out; | 1395 | goto out; |
1377 | } | 1396 | } |
1378 | 1397 | ||
1398 | if (ipip) { | ||
1399 | __be32 info = ic->un.gateway; | ||
1400 | |||
1401 | /* Update the MTU */ | ||
1402 | if (ic->type == ICMP_DEST_UNREACH && | ||
1403 | ic->code == ICMP_FRAG_NEEDED) { | ||
1404 | struct ip_vs_dest *dest = cp->dest; | ||
1405 | u32 mtu = ntohs(ic->un.frag.mtu); | ||
1406 | |||
1407 | /* Strip outer IP and ICMP, go to IPIP header */ | ||
1408 | __skb_pull(skb, ihl + sizeof(_icmph)); | ||
1409 | offset2 -= ihl + sizeof(_icmph); | ||
1410 | skb_reset_network_header(skb); | ||
1411 | IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", | ||
1412 | &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); | ||
1413 | rcu_read_lock(); | ||
1414 | ipv4_update_pmtu(skb, dev_net(skb->dev), | ||
1415 | mtu, 0, 0, 0, 0); | ||
1416 | rcu_read_unlock(); | ||
1417 | /* Client uses PMTUD? */ | ||
1418 | if (!(cih->frag_off & htons(IP_DF))) | ||
1419 | goto ignore_ipip; | ||
1420 | /* Prefer the resulting PMTU */ | ||
1421 | if (dest) { | ||
1422 | spin_lock(&dest->dst_lock); | ||
1423 | if (dest->dst_cache) | ||
1424 | mtu = dst_mtu(dest->dst_cache); | ||
1425 | spin_unlock(&dest->dst_lock); | ||
1426 | } | ||
1427 | if (mtu > 68 + sizeof(struct iphdr)) | ||
1428 | mtu -= sizeof(struct iphdr); | ||
1429 | info = htonl(mtu); | ||
1430 | } | ||
1431 | /* Strip outer IP, ICMP and IPIP, go to IP header of | ||
1432 | * original request. | ||
1433 | */ | ||
1434 | __skb_pull(skb, offset2); | ||
1435 | skb_reset_network_header(skb); | ||
1436 | IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", | ||
1437 | &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, | ||
1438 | ic->type, ic->code, ntohl(info)); | ||
1439 | icmp_send(skb, ic->type, ic->code, info); | ||
1440 | /* ICMP can be shorter but anyways, account it */ | ||
1441 | ip_vs_out_stats(cp, skb); | ||
1442 | |||
1443 | ignore_ipip: | ||
1444 | consume_skb(skb); | ||
1445 | verdict = NF_STOLEN; | ||
1446 | goto out; | ||
1447 | } | ||
1448 | |||
1379 | /* do the statistics and put it back */ | 1449 | /* do the statistics and put it back */ |
1380 | ip_vs_in_stats(cp, skb); | 1450 | ip_vs_in_stats(cp, skb); |
1381 | if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) | 1451 | if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) |
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 65b616ae1716..c2275ba048e9 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -49,6 +49,7 @@ enum { | |||
49 | IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to | 49 | IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to |
50 | * local | 50 | * local |
51 | */ | 51 | */ |
52 | IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ | ||
52 | }; | 53 | }; |
53 | 54 | ||
54 | /* | 55 | /* |
@@ -84,6 +85,42 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) | |||
84 | return dst; | 85 | return dst; |
85 | } | 86 | } |
86 | 87 | ||
/*
 * do_output_route4() - look up an IPv4 output route to daddr, optionally
 * pinned to a source address.
 *
 * net:     namespace handed to ip_route_output_key().
 * daddr:   destination address placed in the flow key.
 * rtos:    TOS value placed in the flow key.
 * rt_mode: IP_VS_RT_MODE_* bits; only IP_VS_RT_MODE_CONNECT is examined
 *          here.
 * saddr:   in/out. With IP_VS_RT_MODE_CONNECT set, a non-zero *saddr is
 *          used as the source address of the first lookup. On success
 *          *saddr is overwritten with fl4.saddr.
 *
 * Returns the route from ip_route_output_key(), or NULL when the lookup
 * fails (the failure is logged through IP_VS_DBG_RL).
 *
 * With IP_VS_RT_MODE_CONNECT the function performs at most three lookups:
 *   1. bound to the caller's *saddr; if that fails with -EINVAL,
 *   2. unbound (saddr 0) to obtain a usable source address, then
 *   3. bound again to the source address found in step 2.
 * If lookup 3 fails, NULL is returned while *saddr still holds the
 * address stored before the retry.
 */
88 | /* Get route to daddr, update *saddr, optionally bind route to saddr */ | |
89 | static struct rtable *do_output_route4(struct net *net, __be32 daddr, | |
90 | u32 rtos, int rt_mode, __be32 *saddr) | |
91 | { | |
92 | struct flowi4 fl4; | |
93 | struct rtable *rt; | |
/* loop becomes non-zero once the route has been re-bound to a freshly
 * chosen source address; it stops the error path below from clearing
 * *saddr and retrying again.
 */
94 | int loop = 0; | |
95 | |
/* Build the flow key; the source address is only pre-set in CONNECT mode. */
96 | memset(&fl4, 0, sizeof(fl4)); | |
97 | fl4.daddr = daddr; | |
98 | fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; | |
99 | fl4.flowi4_tos = rtos; | |
100 | |
101 | retry: | |
102 | rt = ip_route_output_key(net, &fl4); | |
103 | if (IS_ERR(rt)) { | |
/* -EINVAL with a non-zero bound saddr is treated as "saddr is no longer
 * valid": forget it and retry once with an unbound flow key.
 */
104 | /* Invalid saddr ? */ | |
105 | if (PTR_ERR(rt) == -EINVAL && *saddr && | |
106 | rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { | |
107 | *saddr = 0; | |
108 | flowi4_update_output(&fl4, 0, rtos, daddr, 0); | |
109 | goto retry; | |
110 | } | |
111 | IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); | |
112 | return NULL; | |
/* Unbound lookup succeeded in CONNECT mode: drop that route, remember the
 * source address now in fl4.saddr (presumably filled in by the lookup --
 * confirm against ip_route_output_key()) and look up again bound to it.
 */
113 | } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { | |
114 | ip_rt_put(rt); | |
115 | *saddr = fl4.saddr; | |
116 | flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr); | |
117 | loop++; | |
118 | goto retry; | |
119 | } | |
/* Report the source address of the flow key back to the caller. */
120 | *saddr = fl4.saddr; | |
121 | return rt; | |
122 | } | |
123 | |||
87 | /* Get route to destination or remote server */ | 124 | /* Get route to destination or remote server */ |
88 | static struct rtable * | 125 | static struct rtable * |
89 | __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, | 126 | __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, |
@@ -98,20 +135,13 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, | |||
98 | spin_lock(&dest->dst_lock); | 135 | spin_lock(&dest->dst_lock); |
99 | if (!(rt = (struct rtable *) | 136 | if (!(rt = (struct rtable *) |
100 | __ip_vs_dst_check(dest, rtos))) { | 137 | __ip_vs_dst_check(dest, rtos))) { |
101 | struct flowi4 fl4; | 138 | rt = do_output_route4(net, dest->addr.ip, rtos, |
102 | 139 | rt_mode, &dest->dst_saddr.ip); | |
103 | memset(&fl4, 0, sizeof(fl4)); | 140 | if (!rt) { |
104 | fl4.daddr = dest->addr.ip; | ||
105 | fl4.flowi4_tos = rtos; | ||
106 | rt = ip_route_output_key(net, &fl4); | ||
107 | if (IS_ERR(rt)) { | ||
108 | spin_unlock(&dest->dst_lock); | 141 | spin_unlock(&dest->dst_lock); |
109 | IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", | ||
110 | &dest->addr.ip); | ||
111 | return NULL; | 142 | return NULL; |
112 | } | 143 | } |
113 | __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); | 144 | __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); |
114 | dest->dst_saddr.ip = fl4.saddr; | ||
115 | IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " | 145 | IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " |
116 | "rtos=%X\n", | 146 | "rtos=%X\n", |
117 | &dest->addr.ip, &dest->dst_saddr.ip, | 147 | &dest->addr.ip, &dest->dst_saddr.ip, |
@@ -122,19 +152,17 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, | |||
122 | *ret_saddr = dest->dst_saddr.ip; | 152 | *ret_saddr = dest->dst_saddr.ip; |
123 | spin_unlock(&dest->dst_lock); | 153 | spin_unlock(&dest->dst_lock); |
124 | } else { | 154 | } else { |
125 | struct flowi4 fl4; | 155 | __be32 saddr = htonl(INADDR_ANY); |
126 | 156 | ||
127 | memset(&fl4, 0, sizeof(fl4)); | 157 | /* For such unconfigured boxes avoid many route lookups |
128 | fl4.daddr = daddr; | 158 | * for performance reasons because we do not remember saddr |
129 | fl4.flowi4_tos = rtos; | 159 | */ |
130 | rt = ip_route_output_key(net, &fl4); | 160 | rt_mode &= ~IP_VS_RT_MODE_CONNECT; |
131 | if (IS_ERR(rt)) { | 161 | rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr); |
132 | IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", | 162 | if (!rt) |
133 | &daddr); | ||
134 | return NULL; | 163 | return NULL; |
135 | } | ||
136 | if (ret_saddr) | 164 | if (ret_saddr) |
137 | *ret_saddr = fl4.saddr; | 165 | *ret_saddr = saddr; |
138 | } | 166 | } |
139 | 167 | ||
140 | local = rt->rt_flags & RTCF_LOCAL; | 168 | local = rt->rt_flags & RTCF_LOCAL; |
@@ -331,6 +359,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) | |||
331 | old_dst = dest->dst_cache; | 359 | old_dst = dest->dst_cache; |
332 | dest->dst_cache = NULL; | 360 | dest->dst_cache = NULL; |
333 | dst_release(old_dst); | 361 | dst_release(old_dst); |
362 | dest->dst_saddr.ip = 0; | ||
334 | } | 363 | } |
335 | 364 | ||
336 | #define IP_VS_XMIT_TUNNEL(skb, cp) \ | 365 | #define IP_VS_XMIT_TUNNEL(skb, cp) \ |
@@ -771,7 +800,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |||
771 | struct net_device *tdev; /* Device to other host */ | 800 | struct net_device *tdev; /* Device to other host */ |
772 | struct iphdr *old_iph = ip_hdr(skb); | 801 | struct iphdr *old_iph = ip_hdr(skb); |
773 | u8 tos = old_iph->tos; | 802 | u8 tos = old_iph->tos; |
774 | __be16 df = old_iph->frag_off; | 803 | __be16 df; |
775 | struct iphdr *iph; /* Our new IP header */ | 804 | struct iphdr *iph; /* Our new IP header */ |
776 | unsigned int max_headroom; /* The extra header space needed */ | 805 | unsigned int max_headroom; /* The extra header space needed */ |
777 | int mtu; | 806 | int mtu; |
@@ -781,7 +810,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |||
781 | 810 | ||
782 | if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, | 811 | if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, |
783 | RT_TOS(tos), IP_VS_RT_MODE_LOCAL | | 812 | RT_TOS(tos), IP_VS_RT_MODE_LOCAL | |
784 | IP_VS_RT_MODE_NON_LOCAL, | 813 | IP_VS_RT_MODE_NON_LOCAL | |
814 | IP_VS_RT_MODE_CONNECT, | ||
785 | &saddr))) | 815 | &saddr))) |
786 | goto tx_error_icmp; | 816 | goto tx_error_icmp; |
787 | if (rt->rt_flags & RTCF_LOCAL) { | 817 | if (rt->rt_flags & RTCF_LOCAL) { |
@@ -796,10 +826,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |||
796 | IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); | 826 | IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); |
797 | goto tx_error_put; | 827 | goto tx_error_put; |
798 | } | 828 | } |
799 | if (skb_dst(skb)) | 829 | if (rt_is_output_route(skb_rtable(skb))) |
800 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); | 830 | skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); |
801 | 831 | ||
802 | df |= (old_iph->frag_off & htons(IP_DF)); | 832 | /* Copy DF, reset fragment offset and MF */ |
833 | df = old_iph->frag_off & htons(IP_DF); | ||
803 | 834 | ||
804 | if ((old_iph->frag_off & htons(IP_DF) && | 835 | if ((old_iph->frag_off & htons(IP_DF) && |
805 | mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { | 836 | mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { |