about summary refs log tree commit diff stats
path: root/net
diff options
context:
space:
mode:
author Patrick McHardy <kaber@trash.net> 2010-10-21 10:25:51 -0400
committer Patrick McHardy <kaber@trash.net> 2010-10-21 10:25:51 -0400
commit 3b1a1ce6f418cb7ab35eb55c8a6575987a524e30 (patch)
tree a3ebee69d6370631746a348f5852eeb955df5bd3 /net
parent cc6eb433856983e91071469c4ce57accb6947ccb (diff)
parent b0aeef30433ea6854e985c2e9842fa19f51b95cc (diff)
Merge branch 'for-patrick' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-test-2.6
Diffstat (limited to 'net')
-rw-r--r-- net/ipv4/netfilter/nf_nat_core.c 29
-rw-r--r-- net/netfilter/ipvs/ip_vs_conn.c 2
-rw-r--r-- net/netfilter/ipvs/ip_vs_core.c 586
-rw-r--r-- net/netfilter/ipvs/ip_vs_ctl.c 18
-rw-r--r-- net/netfilter/ipvs/ip_vs_ftp.c 7
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto.c 8
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_ah_esp.c 52
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_sctp.c 8
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_tcp.c 52
-rw-r--r-- net/netfilter/ipvs/ip_vs_proto_udp.c 51
-rw-r--r-- net/netfilter/ipvs/ip_vs_xmit.c 503
11 files changed, 918 insertions, 398 deletions
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index e2e00c4da883..0047923c1f22 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -462,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
462 return 0; 462 return 0;
463 } 463 }
464 464
465 if (manip == IP_NAT_MANIP_SRC)
466 statusbit = IPS_SRC_NAT;
467 else
468 statusbit = IPS_DST_NAT;
469
470 /* Invert if this is reply dir. */
471 if (dir == IP_CT_DIR_REPLY)
472 statusbit ^= IPS_NAT_MASK;
473
474 if (!(ct->status & statusbit))
475 return 1;
476
465 pr_debug("icmp_reply_translation: translating error %p manip %u " 477 pr_debug("icmp_reply_translation: translating error %p manip %u "
466 "dir %s\n", skb, manip, 478 "dir %s\n", skb, manip,
467 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); 479 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -496,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
496 508
497 /* Change outer to look the reply to an incoming packet 509 /* Change outer to look the reply to an incoming packet
498 * (proto 0 means don't invert per-proto part). */ 510 * (proto 0 means don't invert per-proto part). */
499 if (manip == IP_NAT_MANIP_SRC) 511 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
500 statusbit = IPS_SRC_NAT; 512 if (!manip_pkt(0, skb, 0, &target, manip))
501 else 513 return 0;
502 statusbit = IPS_DST_NAT;
503
504 /* Invert if this is reply dir. */
505 if (dir == IP_CT_DIR_REPLY)
506 statusbit ^= IPS_NAT_MASK;
507
508 if (ct->status & statusbit) {
509 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
510 if (!manip_pkt(0, skb, 0, &target, manip))
511 return 0;
512 }
513 514
514 return 1; 515 return 1;
515} 516}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 1d1a529dbe24..e9adecdc8ca4 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -563,6 +563,8 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
563 */ 563 */
564 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) 564 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
565 conn_flags &= ~IP_VS_CONN_F_INACTIVE; 565 conn_flags &= ~IP_VS_CONN_F_INACTIVE;
566 /* connections inherit forwarding method from dest */
567 cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
566 } 568 }
567 cp->flags |= conn_flags; 569 cp->flags |= conn_flags;
568 cp->dest = dest; 570 cp->dest = dest;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e5fef7aef0d4..b4e51e9c5a04 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -48,6 +48,7 @@
48#ifdef CONFIG_IP_VS_IPV6 48#ifdef CONFIG_IP_VS_IPV6
49#include <net/ipv6.h> 49#include <net/ipv6.h>
50#include <linux/netfilter_ipv6.h> 50#include <linux/netfilter_ipv6.h>
51#include <net/ip6_route.h>
51#endif 52#endif
52 53
53#include <net/ip_vs.h> 54#include <net/ip_vs.h>
@@ -342,7 +343,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
342 * Protocols supported: TCP, UDP 343 * Protocols supported: TCP, UDP
343 */ 344 */
344struct ip_vs_conn * 345struct ip_vs_conn *
345ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) 346ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
347 struct ip_vs_protocol *pp, int *ignored)
346{ 348{
347 struct ip_vs_conn *cp = NULL; 349 struct ip_vs_conn *cp = NULL;
348 struct ip_vs_iphdr iph; 350 struct ip_vs_iphdr iph;
@@ -350,16 +352,44 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
350 __be16 _ports[2], *pptr; 352 __be16 _ports[2], *pptr;
351 unsigned int flags; 353 unsigned int flags;
352 354
355 *ignored = 1;
353 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); 356 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
354 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); 357 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
355 if (pptr == NULL) 358 if (pptr == NULL)
356 return NULL; 359 return NULL;
357 360
358 /* 361 /*
362 * FTPDATA needs this check when using local real server.
363 * Never schedule Active FTPDATA connections from real server.
364 * For LVS-NAT they must be already created. For other methods
365 * with persistence the connection is created on SYN+ACK.
366 */
367 if (pptr[0] == FTPDATA) {
368 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
369 "Not scheduling FTPDATA");
370 return NULL;
371 }
372
373 /*
374 * Do not schedule replies from local real server. It is risky
375 * for fwmark services but mostly for persistent services.
376 */
377 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
378 (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
379 (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
380 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
381 "Not scheduling reply for existing connection");
382 __ip_vs_conn_put(cp);
383 return NULL;
384 }
385
386 /*
359 * Persistent service 387 * Persistent service
360 */ 388 */
361 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 389 if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
390 *ignored = 0;
362 return ip_vs_sched_persist(svc, skb, pptr); 391 return ip_vs_sched_persist(svc, skb, pptr);
392 }
363 393
364 /* 394 /*
365 * Non-persistent service 395 * Non-persistent service
@@ -372,6 +402,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb)
372 return NULL; 402 return NULL;
373 } 403 }
374 404
405 *ignored = 0;
406
375 dest = svc->scheduler->schedule(svc, skb); 407 dest = svc->scheduler->schedule(svc, skb);
376 if (dest == NULL) { 408 if (dest == NULL) {
377 IP_VS_DBG(1, "Schedule: no dest found.\n"); 409 IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -498,35 +530,32 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
498 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ 530 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
499 */ 531 */
500#ifdef CONFIG_IP_VS_IPV6 532#ifdef CONFIG_IP_VS_IPV6
501 if (svc->af == AF_INET6) 533 if (svc->af == AF_INET6) {
534 if (!skb->dev) {
535 struct net *net = dev_net(skb_dst(skb)->dev);
536
537 skb->dev = net->loopback_dev;
538 }
502 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); 539 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
503 else 540 } else
504#endif 541#endif
505 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 542 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
506 543
507 return NF_DROP; 544 return NF_DROP;
508} 545}
509 546
510/* 547__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
511 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
512 * chain and is used to avoid double NAT and confirmation when we do
513 * not want to keep the conntrack structure
514 */
515static unsigned int ip_vs_post_routing(unsigned int hooknum,
516 struct sk_buff *skb,
517 const struct net_device *in,
518 const struct net_device *out,
519 int (*okfn)(struct sk_buff *))
520{ 548{
521 if (!skb->ipvs_property) 549 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
522 return NF_ACCEPT;
523 /* The packet was sent from IPVS, exit this chain */
524 return NF_STOP;
525} 550}
526 551
527__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) 552static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
528{ 553{
529 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); 554 if (NF_INET_LOCAL_IN == hooknum)
555 return IP_DEFRAG_VS_IN;
556 if (NF_INET_FORWARD == hooknum)
557 return IP_DEFRAG_VS_FWD;
558 return IP_DEFRAG_VS_OUT;
530} 559}
531 560
532static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) 561static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -589,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
589 skb->ip_summed = CHECKSUM_UNNECESSARY; 618 skb->ip_summed = CHECKSUM_UNNECESSARY;
590 619
591 if (inout) 620 if (inout)
592 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 621 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
593 "Forwarding altered outgoing ICMP"); 622 "Forwarding altered outgoing ICMP");
594 else 623 else
595 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 624 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
596 "Forwarding altered incoming ICMP"); 625 "Forwarding altered incoming ICMP");
597} 626}
598 627
@@ -634,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
634 skb->ip_summed = CHECKSUM_PARTIAL; 663 skb->ip_summed = CHECKSUM_PARTIAL;
635 664
636 if (inout) 665 if (inout)
637 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 666 IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
638 "Forwarding altered outgoing ICMPv6"); 667 (void *)ciph - (void *)iph,
668 "Forwarding altered outgoing ICMPv6");
639 else 669 else
640 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, 670 IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
641 "Forwarding altered incoming ICMPv6"); 671 (void *)ciph - (void *)iph,
672 "Forwarding altered incoming ICMPv6");
642} 673}
643#endif 674#endif
644 675
@@ -679,11 +710,23 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
679#endif 710#endif
680 ip_vs_nat_icmp(skb, pp, cp, 1); 711 ip_vs_nat_icmp(skb, pp, cp, 1);
681 712
713#ifdef CONFIG_IP_VS_IPV6
714 if (af == AF_INET6) {
715 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
716 goto out;
717 } else
718#endif
719 if ((sysctl_ip_vs_snat_reroute ||
720 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
721 ip_route_me_harder(skb, RTN_LOCAL) != 0)
722 goto out;
723
682 /* do the statistics and put it back */ 724 /* do the statistics and put it back */
683 ip_vs_out_stats(cp, skb); 725 ip_vs_out_stats(cp, skb);
684 726
727 skb->ipvs_property = 1;
685 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 728 if (!(cp->flags & IP_VS_CONN_F_NFCT))
686 skb->ipvs_property = 1; 729 ip_vs_notrack(skb);
687 else 730 else
688 ip_vs_update_conntrack(skb, cp, 0); 731 ip_vs_update_conntrack(skb, cp, 0);
689 verdict = NF_ACCEPT; 732 verdict = NF_ACCEPT;
@@ -699,7 +742,8 @@ out:
699 * Find any that might be relevant, check against existing connections. 742 * Find any that might be relevant, check against existing connections.
700 * Currently handles error types - unreachable, quench, ttl exceeded. 743 * Currently handles error types - unreachable, quench, ttl exceeded.
701 */ 744 */
702static int ip_vs_out_icmp(struct sk_buff *skb, int *related) 745static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
746 unsigned int hooknum)
703{ 747{
704 struct iphdr *iph; 748 struct iphdr *iph;
705 struct icmphdr _icmph, *ic; 749 struct icmphdr _icmph, *ic;
@@ -714,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
714 758
715 /* reassemble IP fragments */ 759 /* reassemble IP fragments */
716 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 760 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
717 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) 761 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
718 return NF_STOLEN; 762 return NF_STOLEN;
719 } 763 }
720 764
@@ -757,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
757 pp->dont_defrag)) 801 pp->dont_defrag))
758 return NF_ACCEPT; 802 return NF_ACCEPT;
759 803
760 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); 804 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
805 "Checking outgoing ICMP for");
761 806
762 offset += cih->ihl * 4; 807 offset += cih->ihl * 4;
763 808
@@ -773,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
773} 818}
774 819
775#ifdef CONFIG_IP_VS_IPV6 820#ifdef CONFIG_IP_VS_IPV6
776static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) 821static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
822 unsigned int hooknum)
777{ 823{
778 struct ipv6hdr *iph; 824 struct ipv6hdr *iph;
779 struct icmp6hdr _icmph, *ic; 825 struct icmp6hdr _icmph, *ic;
@@ -789,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
789 835
790 /* reassemble IP fragments */ 836 /* reassemble IP fragments */
791 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { 837 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
792 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) 838 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
793 return NF_STOLEN; 839 return NF_STOLEN;
794 } 840 }
795 841
@@ -832,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
832 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) 878 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
833 return NF_ACCEPT; 879 return NF_ACCEPT;
834 880
835 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); 881 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
882 "Checking outgoing ICMPv6 for");
836 883
837 offset += sizeof(struct ipv6hdr); 884 offset += sizeof(struct ipv6hdr);
838 885
@@ -880,7 +927,7 @@ static unsigned int
880handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 927handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
881 struct ip_vs_conn *cp, int ihl) 928 struct ip_vs_conn *cp, int ihl)
882{ 929{
883 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); 930 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
884 931
885 if (!skb_make_writable(skb, ihl)) 932 if (!skb_make_writable(skb, ihl))
886 goto drop; 933 goto drop;
@@ -914,23 +961,24 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
914 * if it came from this machine itself. So re-compute 961 * if it came from this machine itself. So re-compute
915 * the routing information. 962 * the routing information.
916 */ 963 */
917 if (sysctl_ip_vs_snat_reroute) {
918#ifdef CONFIG_IP_VS_IPV6 964#ifdef CONFIG_IP_VS_IPV6
919 if (af == AF_INET6) { 965 if (af == AF_INET6) {
920 if (ip6_route_me_harder(skb) != 0) 966 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
921 goto drop; 967 goto drop;
922 } else 968 } else
923#endif 969#endif
924 if (ip_route_me_harder(skb, RTN_LOCAL) != 0) 970 if ((sysctl_ip_vs_snat_reroute ||
925 goto drop; 971 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
926 } 972 ip_route_me_harder(skb, RTN_LOCAL) != 0)
973 goto drop;
927 974
928 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); 975 IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
929 976
930 ip_vs_out_stats(cp, skb); 977 ip_vs_out_stats(cp, skb);
931 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); 978 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
979 skb->ipvs_property = 1;
932 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 980 if (!(cp->flags & IP_VS_CONN_F_NFCT))
933 skb->ipvs_property = 1; 981 ip_vs_notrack(skb);
934 else 982 else
935 ip_vs_update_conntrack(skb, cp, 0); 983 ip_vs_update_conntrack(skb, cp, 0);
936 ip_vs_conn_put(cp); 984 ip_vs_conn_put(cp);
@@ -946,53 +994,54 @@ drop:
946} 994}
947 995
948/* 996/*
949 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
950 * Check if outgoing packet belongs to the established ip_vs_conn. 997 * Check if outgoing packet belongs to the established ip_vs_conn.
951 */ 998 */
952static unsigned int 999static unsigned int
953ip_vs_out(unsigned int hooknum, struct sk_buff *skb, 1000ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
954 const struct net_device *in, const struct net_device *out,
955 int (*okfn)(struct sk_buff *))
956{ 1001{
957 struct ip_vs_iphdr iph; 1002 struct ip_vs_iphdr iph;
958 struct ip_vs_protocol *pp; 1003 struct ip_vs_protocol *pp;
959 struct ip_vs_conn *cp; 1004 struct ip_vs_conn *cp;
960 int af;
961 1005
962 EnterFunction(11); 1006 EnterFunction(11);
963 1007
964 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; 1008 /* Already marked as IPVS request or reply? */
965
966 if (skb->ipvs_property) 1009 if (skb->ipvs_property)
967 return NF_ACCEPT; 1010 return NF_ACCEPT;
968 1011
1012 /* Bad... Do not break raw sockets */
1013 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1014 af == AF_INET)) {
1015 struct sock *sk = skb->sk;
1016 struct inet_sock *inet = inet_sk(skb->sk);
1017
1018 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1019 return NF_ACCEPT;
1020 }
1021
1022 if (unlikely(!skb_dst(skb)))
1023 return NF_ACCEPT;
1024
969 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1025 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
970#ifdef CONFIG_IP_VS_IPV6 1026#ifdef CONFIG_IP_VS_IPV6
971 if (af == AF_INET6) { 1027 if (af == AF_INET6) {
972 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1028 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
973 int related, verdict = ip_vs_out_icmp_v6(skb, &related); 1029 int related;
1030 int verdict = ip_vs_out_icmp_v6(skb, &related,
1031 hooknum);
974 1032
975 if (related) { 1033 if (related)
976 if (sysctl_ip_vs_snat_reroute &&
977 NF_ACCEPT == verdict &&
978 ip6_route_me_harder(skb))
979 verdict = NF_DROP;
980 return verdict; 1034 return verdict;
981 }
982 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1035 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
983 } 1036 }
984 } else 1037 } else
985#endif 1038#endif
986 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1039 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
987 int related, verdict = ip_vs_out_icmp(skb, &related); 1040 int related;
1041 int verdict = ip_vs_out_icmp(skb, &related, hooknum);
988 1042
989 if (related) { 1043 if (related)
990 if (sysctl_ip_vs_snat_reroute &&
991 NF_ACCEPT == verdict &&
992 ip_route_me_harder(skb, RTN_LOCAL))
993 verdict = NF_DROP;
994 return verdict; 1044 return verdict;
995 }
996 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1045 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
997 } 1046 }
998 1047
@@ -1003,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
1003 /* reassemble IP fragments */ 1052 /* reassemble IP fragments */
1004#ifdef CONFIG_IP_VS_IPV6 1053#ifdef CONFIG_IP_VS_IPV6
1005 if (af == AF_INET6) { 1054 if (af == AF_INET6) {
1006 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1055 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1007 int related, verdict = ip_vs_out_icmp_v6(skb, &related); 1056 if (ip_vs_gather_frags_v6(skb,
1008 1057 ip_vs_defrag_user(hooknum)))
1009 if (related) 1058 return NF_STOLEN;
1010 return verdict;
1011
1012 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1013 } 1059 }
1060
1061 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1014 } else 1062 } else
1015#endif 1063#endif
1016 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && 1064 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1017 !pp->dont_defrag)) { 1065 !pp->dont_defrag)) {
1018 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) 1066 if (ip_vs_gather_frags(skb,
1067 ip_vs_defrag_user(hooknum)))
1019 return NF_STOLEN; 1068 return NF_STOLEN;
1020 1069
1021 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1070 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1026,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
1026 */ 1075 */
1027 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); 1076 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1028 1077
1029 if (unlikely(!cp)) { 1078 if (likely(cp))
1030 if (sysctl_ip_vs_nat_icmp_send && 1079 return handle_response(af, skb, pp, cp, iph.len);
1031 (pp->protocol == IPPROTO_TCP || 1080 if (sysctl_ip_vs_nat_icmp_send &&
1032 pp->protocol == IPPROTO_UDP || 1081 (pp->protocol == IPPROTO_TCP ||
1033 pp->protocol == IPPROTO_SCTP)) { 1082 pp->protocol == IPPROTO_UDP ||
1034 __be16 _ports[2], *pptr; 1083 pp->protocol == IPPROTO_SCTP)) {
1035 1084 __be16 _ports[2], *pptr;
1036 pptr = skb_header_pointer(skb, iph.len, 1085
1037 sizeof(_ports), _ports); 1086 pptr = skb_header_pointer(skb, iph.len,
1038 if (pptr == NULL) 1087 sizeof(_ports), _ports);
1039 return NF_ACCEPT; /* Not for me */ 1088 if (pptr == NULL)
1040 if (ip_vs_lookup_real_service(af, iph.protocol, 1089 return NF_ACCEPT; /* Not for me */
1041 &iph.saddr, 1090 if (ip_vs_lookup_real_service(af, iph.protocol,
1042 pptr[0])) { 1091 &iph.saddr,
1043 /* 1092 pptr[0])) {
1044 * Notify the real server: there is no 1093 /*
1045 * existing entry if it is not RST 1094 * Notify the real server: there is no
1046 * packet or not TCP packet. 1095 * existing entry if it is not RST
1047 */ 1096 * packet or not TCP packet.
1048 if ((iph.protocol != IPPROTO_TCP && 1097 */
1049 iph.protocol != IPPROTO_SCTP) 1098 if ((iph.protocol != IPPROTO_TCP &&
1050 || ((iph.protocol == IPPROTO_TCP 1099 iph.protocol != IPPROTO_SCTP)
1051 && !is_tcp_reset(skb, iph.len)) 1100 || ((iph.protocol == IPPROTO_TCP
1052 || (iph.protocol == IPPROTO_SCTP 1101 && !is_tcp_reset(skb, iph.len))
1053 && !is_sctp_abort(skb, 1102 || (iph.protocol == IPPROTO_SCTP
1054 iph.len)))) { 1103 && !is_sctp_abort(skb,
1104 iph.len)))) {
1055#ifdef CONFIG_IP_VS_IPV6 1105#ifdef CONFIG_IP_VS_IPV6
1056 if (af == AF_INET6) 1106 if (af == AF_INET6) {
1057 icmpv6_send(skb, 1107 struct net *net =
1058 ICMPV6_DEST_UNREACH, 1108 dev_net(skb_dst(skb)->dev);
1059 ICMPV6_PORT_UNREACH, 1109
1060 0); 1110 if (!skb->dev)
1061 else 1111 skb->dev = net->loopback_dev;
1112 icmpv6_send(skb,
1113 ICMPV6_DEST_UNREACH,
1114 ICMPV6_PORT_UNREACH,
1115 0);
1116 } else
1062#endif 1117#endif
1063 icmp_send(skb, 1118 icmp_send(skb,
1064 ICMP_DEST_UNREACH, 1119 ICMP_DEST_UNREACH,
1065 ICMP_PORT_UNREACH, 0); 1120 ICMP_PORT_UNREACH, 0);
1066 return NF_DROP; 1121 return NF_DROP;
1067 }
1068 } 1122 }
1069 } 1123 }
1070 IP_VS_DBG_PKT(12, pp, skb, 0,
1071 "packet continues traversal as normal");
1072 return NF_ACCEPT;
1073 } 1124 }
1125 IP_VS_DBG_PKT(12, af, pp, skb, 0,
1126 "ip_vs_out: packet continues traversal as normal");
1127 return NF_ACCEPT;
1128}
1129
1130/*
1131 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1132 * used only for VS/NAT.
1133 * Check if packet is reply for established ip_vs_conn.
1134 */
1135static unsigned int
1136ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1137 const struct net_device *in, const struct net_device *out,
1138 int (*okfn)(struct sk_buff *))
1139{
1140 return ip_vs_out(hooknum, skb, AF_INET);
1141}
1142
1143/*
1144 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1145 * Check if packet is reply for established ip_vs_conn.
1146 */
1147static unsigned int
1148ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1149 const struct net_device *in, const struct net_device *out,
1150 int (*okfn)(struct sk_buff *))
1151{
1152 unsigned int verdict;
1074 1153
1075 return handle_response(af, skb, pp, cp, iph.len); 1154 /* Disable BH in LOCAL_OUT until all places are fixed */
1155 local_bh_disable();
1156 verdict = ip_vs_out(hooknum, skb, AF_INET);
1157 local_bh_enable();
1158 return verdict;
1076} 1159}
1077 1160
1161#ifdef CONFIG_IP_VS_IPV6
1162
1163/*
1164 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1165 * used only for VS/NAT.
1166 * Check if packet is reply for established ip_vs_conn.
1167 */
1168static unsigned int
1169ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1170 const struct net_device *in, const struct net_device *out,
1171 int (*okfn)(struct sk_buff *))
1172{
1173 return ip_vs_out(hooknum, skb, AF_INET6);
1174}
1175
1176/*
1177 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1178 * Check if packet is reply for established ip_vs_conn.
1179 */
1180static unsigned int
1181ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1182 const struct net_device *in, const struct net_device *out,
1183 int (*okfn)(struct sk_buff *))
1184{
1185 unsigned int verdict;
1186
1187 /* Disable BH in LOCAL_OUT until all places are fixed */
1188 local_bh_disable();
1189 verdict = ip_vs_out(hooknum, skb, AF_INET6);
1190 local_bh_enable();
1191 return verdict;
1192}
1193
1194#endif
1078 1195
1079/* 1196/*
1080 * Handle ICMP messages in the outside-to-inside direction (incoming). 1197 * Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1098,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1098 1215
1099 /* reassemble IP fragments */ 1216 /* reassemble IP fragments */
1100 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { 1217 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1101 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? 1218 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1102 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1103 return NF_STOLEN; 1219 return NF_STOLEN;
1104 } 1220 }
1105 1221
@@ -1142,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1142 pp->dont_defrag)) 1258 pp->dont_defrag))
1143 return NF_ACCEPT; 1259 return NF_ACCEPT;
1144 1260
1145 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); 1261 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1262 "Checking incoming ICMP for");
1146 1263
1147 offset += cih->ihl * 4; 1264 offset += cih->ihl * 4;
1148 1265
@@ -1176,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1176 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) 1293 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1177 offset += 2 * sizeof(__u16); 1294 offset += 2 * sizeof(__u16);
1178 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); 1295 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1179 /* do not touch skb anymore */ 1296 /* LOCALNODE from FORWARD hook is not supported */
1297 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1298 skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
1299 IP_VS_DBG(1, "%s(): "
1300 "local delivery to %pI4 but in FORWARD\n",
1301 __func__, &skb_rtable(skb)->rt_dst);
1302 verdict = NF_DROP;
1303 }
1180 1304
1181 out: 1305 out:
1182 __ip_vs_conn_put(cp); 1306 __ip_vs_conn_put(cp);
@@ -1197,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1197 struct ip_vs_protocol *pp; 1321 struct ip_vs_protocol *pp;
1198 unsigned int offset, verdict; 1322 unsigned int offset, verdict;
1199 union nf_inet_addr snet; 1323 union nf_inet_addr snet;
1324 struct rt6_info *rt;
1200 1325
1201 *related = 1; 1326 *related = 1;
1202 1327
1203 /* reassemble IP fragments */ 1328 /* reassemble IP fragments */
1204 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { 1329 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1205 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? 1330 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
1206 IP_DEFRAG_VS_IN :
1207 IP_DEFRAG_VS_FWD))
1208 return NF_STOLEN; 1331 return NF_STOLEN;
1209 } 1332 }
1210 1333
@@ -1247,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1247 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) 1370 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1248 return NF_ACCEPT; 1371 return NF_ACCEPT;
1249 1372
1250 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); 1373 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
1374 "Checking incoming ICMPv6 for");
1251 1375
1252 offset += sizeof(struct ipv6hdr); 1376 offset += sizeof(struct ipv6hdr);
1253 1377
@@ -1275,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1275 IPPROTO_SCTP == cih->nexthdr) 1399 IPPROTO_SCTP == cih->nexthdr)
1276 offset += 2 * sizeof(__u16); 1400 offset += 2 * sizeof(__u16);
1277 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); 1401 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1278 /* do not touch skb anymore */ 1402 /* LOCALNODE from FORWARD hook is not supported */
1403 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1404 (rt = (struct rt6_info *) skb_dst(skb)) &&
1405 rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
1406 IP_VS_DBG(1, "%s(): "
1407 "local delivery to %pI6 but in FORWARD\n",
1408 __func__, &rt->rt6i_dst);
1409 verdict = NF_DROP;
1410 }
1279 1411
1280 __ip_vs_conn_put(cp); 1412 __ip_vs_conn_put(cp);
1281 1413
@@ -1289,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1289 * and send it on its way... 1421 * and send it on its way...
1290 */ 1422 */
1291static unsigned int 1423static unsigned int
1292ip_vs_in(unsigned int hooknum, struct sk_buff *skb, 1424ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1293 const struct net_device *in, const struct net_device *out,
1294 int (*okfn)(struct sk_buff *))
1295{ 1425{
1296 struct ip_vs_iphdr iph; 1426 struct ip_vs_iphdr iph;
1297 struct ip_vs_protocol *pp; 1427 struct ip_vs_protocol *pp;
1298 struct ip_vs_conn *cp; 1428 struct ip_vs_conn *cp;
1299 int ret, restart, af, pkts; 1429 int ret, restart, pkts;
1300
1301 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1302 1430
1303 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1431 /* Already marked as IPVS request or reply? */
1432 if (skb->ipvs_property)
1433 return NF_ACCEPT;
1304 1434
1305 /* 1435 /*
1306 * Big tappo: only PACKET_HOST, including loopback for local client 1436 * Big tappo:
1307 * Don't handle local packets on IPv6 for now 1437 * - remote client: only PACKET_HOST
1438 * - route: used for struct net when skb->dev is unset
1308 */ 1439 */
1309 if (unlikely(skb->pkt_type != PACKET_HOST)) { 1440 if (unlikely((skb->pkt_type != PACKET_HOST &&
1310 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", 1441 hooknum != NF_INET_LOCAL_OUT) ||
1311 skb->pkt_type, 1442 !skb_dst(skb))) {
1312 iph.protocol, 1443 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1313 IP_VS_DBG_ADDR(af, &iph.daddr)); 1444 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1445 " ignored in hook %u\n",
1446 skb->pkt_type, iph.protocol,
1447 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1314 return NF_ACCEPT; 1448 return NF_ACCEPT;
1315 } 1449 }
1450 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1451
1452 /* Bad... Do not break raw sockets */
1453 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1454 af == AF_INET)) {
1455 struct sock *sk = skb->sk;
1456 struct inet_sock *inet = inet_sk(skb->sk);
1457
1458 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1459 return NF_ACCEPT;
1460 }
1316 1461
1317#ifdef CONFIG_IP_VS_IPV6 1462#ifdef CONFIG_IP_VS_IPV6
1318 if (af == AF_INET6) { 1463 if (af == AF_INET6) {
1319 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1464 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1320 int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); 1465 int related;
1466 int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1321 1467
1322 if (related) 1468 if (related)
1323 return verdict; 1469 return verdict;
@@ -1326,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1326 } else 1472 } else
1327#endif 1473#endif
1328 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1474 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1329 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); 1475 int related;
1476 int verdict = ip_vs_in_icmp(skb, &related, hooknum);
1330 1477
1331 if (related) 1478 if (related)
1332 return verdict; 1479 return verdict;
@@ -1346,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1346 if (unlikely(!cp)) { 1493 if (unlikely(!cp)) {
1347 int v; 1494 int v;
1348 1495
1349 /* For local client packets, it could be a response */
1350 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1351 if (cp)
1352 return handle_response(af, skb, pp, cp, iph.len);
1353
1354 if (!pp->conn_schedule(af, skb, pp, &v, &cp)) 1496 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1355 return v; 1497 return v;
1356 } 1498 }
1357 1499
1358 if (unlikely(!cp)) { 1500 if (unlikely(!cp)) {
1359 /* sorry, all this trouble for a no-hit :) */ 1501 /* sorry, all this trouble for a no-hit :) */
1360 IP_VS_DBG_PKT(12, pp, skb, 0, 1502 IP_VS_DBG_PKT(12, af, pp, skb, 0,
1361 "packet continues traversal as normal"); 1503 "ip_vs_in: packet continues traversal as normal");
1362 return NF_ACCEPT; 1504 return NF_ACCEPT;
1363 } 1505 }
1364 1506
1365 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); 1507 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1366 1508
1367 /* Check the server status */ 1509 /* Check the server status */
1368 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 1510 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1429,6 +1571,72 @@ out:
1429 return ret; 1571 return ret;
1430} 1572}
1431 1573
1574/*
1575 * AF_INET handler in NF_INET_LOCAL_IN chain
1576 * Schedule and forward packets from remote clients
1577 */
1578static unsigned int
1579ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1580 const struct net_device *in,
1581 const struct net_device *out,
1582 int (*okfn)(struct sk_buff *))
1583{
1584 return ip_vs_in(hooknum, skb, AF_INET);
1585}
1586
1587/*
1588 * AF_INET handler in NF_INET_LOCAL_OUT chain
1589 * Schedule and forward packets from local clients
1590 */
1591static unsigned int
1592ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1593 const struct net_device *in, const struct net_device *out,
1594 int (*okfn)(struct sk_buff *))
1595{
1596 unsigned int verdict;
1597
1598 /* Disable BH in LOCAL_OUT until all places are fixed */
1599 local_bh_disable();
1600 verdict = ip_vs_in(hooknum, skb, AF_INET);
1601 local_bh_enable();
1602 return verdict;
1603}
1604
1605#ifdef CONFIG_IP_VS_IPV6
1606
1607/*
1608 * AF_INET6 handler in NF_INET_LOCAL_IN chain
1609 * Schedule and forward packets from remote clients
1610 */
1611static unsigned int
1612ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1613 const struct net_device *in,
1614 const struct net_device *out,
1615 int (*okfn)(struct sk_buff *))
1616{
1617 return ip_vs_in(hooknum, skb, AF_INET6);
1618}
1619
1620/*
1621 * AF_INET6 handler in NF_INET_LOCAL_OUT chain
1622 * Schedule and forward packets from local clients
1623 */
1624static unsigned int
1625ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1626 const struct net_device *in, const struct net_device *out,
1627 int (*okfn)(struct sk_buff *))
1628{
1629 unsigned int verdict;
1630
1631 /* Disable BH in LOCAL_OUT until all places are fixed */
1632 local_bh_disable();
1633 verdict = ip_vs_in(hooknum, skb, AF_INET6);
1634 local_bh_enable();
1635 return verdict;
1636}
1637
1638#endif
1639
1432 1640
1433/* 1641/*
1434 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP 1642 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1469,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1469 1677
1470 1678
1471static struct nf_hook_ops ip_vs_ops[] __read_mostly = { 1679static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1680 /* After packet filtering, change source only for VS/NAT */
1681 {
1682 .hook = ip_vs_reply4,
1683 .owner = THIS_MODULE,
1684 .pf = PF_INET,
1685 .hooknum = NF_INET_LOCAL_IN,
1686 .priority = 99,
1687 },
1472 /* After packet filtering, forward packet through VS/DR, VS/TUN, 1688 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1473 * or VS/NAT(change destination), so that filtering rules can be 1689 * or VS/NAT(change destination), so that filtering rules can be
1474 * applied to IPVS. */ 1690 * applied to IPVS. */
1475 { 1691 {
1476 .hook = ip_vs_in, 1692 .hook = ip_vs_remote_request4,
1477 .owner = THIS_MODULE, 1693 .owner = THIS_MODULE,
1478 .pf = PF_INET, 1694 .pf = PF_INET,
1479 .hooknum = NF_INET_LOCAL_IN, 1695 .hooknum = NF_INET_LOCAL_IN,
1480 .priority = 100, 1696 .priority = 101,
1481 }, 1697 },
1482 /* After packet filtering, change source only for VS/NAT */ 1698 /* Before ip_vs_in, change source only for VS/NAT */
1699 {
1700 .hook = ip_vs_local_reply4,
1701 .owner = THIS_MODULE,
1702 .pf = PF_INET,
1703 .hooknum = NF_INET_LOCAL_OUT,
1704 .priority = -99,
1705 },
1706 /* After mangle, schedule and forward local requests */
1483 { 1707 {
1484 .hook = ip_vs_out, 1708 .hook = ip_vs_local_request4,
1485 .owner = THIS_MODULE, 1709 .owner = THIS_MODULE,
1486 .pf = PF_INET, 1710 .pf = PF_INET,
1487 .hooknum = NF_INET_FORWARD, 1711 .hooknum = NF_INET_LOCAL_OUT,
1488 .priority = 100, 1712 .priority = -98,
1489 }, 1713 },
1490 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 1714 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1491 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 1715 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1493,35 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1493 .hook = ip_vs_forward_icmp, 1717 .hook = ip_vs_forward_icmp,
1494 .owner = THIS_MODULE, 1718 .owner = THIS_MODULE,
1495 .pf = PF_INET, 1719 .pf = PF_INET,
1496 .hooknum = NF_INET_FORWARD, 1720 .hooknum = NF_INET_FORWARD,
1497 .priority = 99, 1721 .priority = 99,
1498 }, 1722 },
1499 /* Before the netfilter connection tracking, exit from POST_ROUTING */ 1723 /* After packet filtering, change source only for VS/NAT */
1500 { 1724 {
1501 .hook = ip_vs_post_routing, 1725 .hook = ip_vs_reply4,
1502 .owner = THIS_MODULE, 1726 .owner = THIS_MODULE,
1503 .pf = PF_INET, 1727 .pf = PF_INET,
1504 .hooknum = NF_INET_POST_ROUTING, 1728 .hooknum = NF_INET_FORWARD,
1505 .priority = NF_IP_PRI_NAT_SRC-1, 1729 .priority = 100,
1506 }, 1730 },
1507#ifdef CONFIG_IP_VS_IPV6 1731#ifdef CONFIG_IP_VS_IPV6
1732 /* After packet filtering, change source only for VS/NAT */
1733 {
1734 .hook = ip_vs_reply6,
1735 .owner = THIS_MODULE,
1736 .pf = PF_INET6,
1737 .hooknum = NF_INET_LOCAL_IN,
1738 .priority = 99,
1739 },
1508 /* After packet filtering, forward packet through VS/DR, VS/TUN, 1740 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1509 * or VS/NAT(change destination), so that filtering rules can be 1741 * or VS/NAT(change destination), so that filtering rules can be
1510 * applied to IPVS. */ 1742 * applied to IPVS. */
1511 { 1743 {
1512 .hook = ip_vs_in, 1744 .hook = ip_vs_remote_request6,
1513 .owner = THIS_MODULE, 1745 .owner = THIS_MODULE,
1514 .pf = PF_INET6, 1746 .pf = PF_INET6,
1515 .hooknum = NF_INET_LOCAL_IN, 1747 .hooknum = NF_INET_LOCAL_IN,
1516 .priority = 100, 1748 .priority = 101,
1517 }, 1749 },
1518 /* After packet filtering, change source only for VS/NAT */ 1750 /* Before ip_vs_in, change source only for VS/NAT */
1519 { 1751 {
1520 .hook = ip_vs_out, 1752 .hook = ip_vs_local_reply6,
1753 .owner = THIS_MODULE,
1754 .pf = PF_INET,
1755 .hooknum = NF_INET_LOCAL_OUT,
1756 .priority = -99,
1757 },
1758 /* After mangle, schedule and forward local requests */
1759 {
1760 .hook = ip_vs_local_request6,
1521 .owner = THIS_MODULE, 1761 .owner = THIS_MODULE,
1522 .pf = PF_INET6, 1762 .pf = PF_INET6,
1523 .hooknum = NF_INET_FORWARD, 1763 .hooknum = NF_INET_LOCAL_OUT,
1524 .priority = 100, 1764 .priority = -98,
1525 }, 1765 },
1526 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 1766 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1527 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 1767 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1529,16 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1529 .hook = ip_vs_forward_icmp_v6, 1769 .hook = ip_vs_forward_icmp_v6,
1530 .owner = THIS_MODULE, 1770 .owner = THIS_MODULE,
1531 .pf = PF_INET6, 1771 .pf = PF_INET6,
1532 .hooknum = NF_INET_FORWARD, 1772 .hooknum = NF_INET_FORWARD,
1533 .priority = 99, 1773 .priority = 99,
1534 }, 1774 },
1535 /* Before the netfilter connection tracking, exit from POST_ROUTING */ 1775 /* After packet filtering, change source only for VS/NAT */
1536 { 1776 {
1537 .hook = ip_vs_post_routing, 1777 .hook = ip_vs_reply6,
1538 .owner = THIS_MODULE, 1778 .owner = THIS_MODULE,
1539 .pf = PF_INET6, 1779 .pf = PF_INET6,
1540 .hooknum = NF_INET_POST_ROUTING, 1780 .hooknum = NF_INET_FORWARD,
1541 .priority = NF_IP6_PRI_NAT_SRC-1, 1781 .priority = 100,
1542 }, 1782 },
1543#endif 1783#endif
1544}; 1784};
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 0b884d3e192f..5f5daa30b0af 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -777,20 +777,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
777 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; 777 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
778 conn_flags |= IP_VS_CONN_F_INACTIVE; 778 conn_flags |= IP_VS_CONN_F_INACTIVE;
779 779
780 /* check if local node and update the flags */
781#ifdef CONFIG_IP_VS_IPV6
782 if (svc->af == AF_INET6) {
783 if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
784 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
785 | IP_VS_CONN_F_LOCALNODE;
786 }
787 } else
788#endif
789 if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
790 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
791 | IP_VS_CONN_F_LOCALNODE;
792 }
793
794 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ 780 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
795 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { 781 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
796 conn_flags |= IP_VS_CONN_F_NOOUTPUT; 782 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
@@ -824,6 +810,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
824 dest->u_threshold = udest->u_threshold; 810 dest->u_threshold = udest->u_threshold;
825 dest->l_threshold = udest->l_threshold; 811 dest->l_threshold = udest->l_threshold;
826 812
813 spin_lock(&dest->dst_lock);
814 ip_vs_dst_reset(dest);
815 spin_unlock(&dest->dst_lock);
816
827 if (add) 817 if (add)
828 ip_vs_new_estimator(&dest->stats); 818 ip_vs_new_estimator(&dest->stats);
829 819
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 090889a3b3af..75455000ad1c 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -242,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
242 ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 242 ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
243 start-data, end-start, 243 start-data, end-start,
244 buf, buf_len); 244 buf, buf_len);
245 if (ret) 245 if (ret) {
246 ip_vs_nfct_expect_related(skb, ct, n_cp, 246 ip_vs_nfct_expect_related(skb, ct, n_cp,
247 IPPROTO_TCP, 0, 0); 247 IPPROTO_TCP, 0, 0);
248 if (skb->ip_summed == CHECKSUM_COMPLETE)
249 skb->ip_summed = CHECKSUM_UNNECESSARY;
250 /* csum is updated */
251 ret = 1;
252 }
248 } 253 }
249 254
250 /* 255 /*
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 027f654799fe..c53998390877 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
172 else if (ih->frag_off & htons(IP_OFFSET)) 172 else if (ih->frag_off & htons(IP_OFFSET))
173 sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); 173 sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
174 else { 174 else {
175 __be16 _ports[2], *pptr 175 __be16 _ports[2], *pptr;
176; 176
177 pptr = skb_header_pointer(skb, offset + ih->ihl*4, 177 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
178 sizeof(_ports), _ports); 178 sizeof(_ports), _ports);
179 if (pptr == NULL) 179 if (pptr == NULL)
@@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
223 223
224 224
225void 225void
226ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, 226ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
227 const struct sk_buff *skb, 227 const struct sk_buff *skb,
228 int offset, 228 int offset,
229 const char *msg) 229 const char *msg)
230{ 230{
231#ifdef CONFIG_IP_VS_IPV6 231#ifdef CONFIG_IP_VS_IPV6
232 if (skb->protocol == htons(ETH_P_IPV6)) 232 if (af == AF_INET6)
233 ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); 233 ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
234 else 234 else
235#endif 235#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 8956ef33ea6c..3a0461117d3f 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -117,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
117 return 0; 117 return 0;
118} 118}
119 119
120
121static void
122ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
123 int offset, const char *msg)
124{
125 char buf[256];
126 struct iphdr _iph, *ih;
127
128 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
129 if (ih == NULL)
130 sprintf(buf, "TRUNCATED");
131 else
132 sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
133
134 pr_debug("%s: %s %s\n", msg, pp->name, buf);
135}
136
137#ifdef CONFIG_IP_VS_IPV6
138static void
139ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
140 int offset, const char *msg)
141{
142 char buf[256];
143 struct ipv6hdr _iph, *ih;
144
145 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
146 if (ih == NULL)
147 sprintf(buf, "TRUNCATED");
148 else
149 sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
150
151 pr_debug("%s: %s %s\n", msg, pp->name, buf);
152}
153#endif
154
155static void
156ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
157 int offset, const char *msg)
158{
159#ifdef CONFIG_IP_VS_IPV6
160 if (skb->protocol == htons(ETH_P_IPV6))
161 ah_esp_debug_packet_v6(pp, skb, offset, msg);
162 else
163#endif
164 ah_esp_debug_packet_v4(pp, skb, offset, msg);
165}
166
167
168static void ah_esp_init(struct ip_vs_protocol *pp) 120static void ah_esp_init(struct ip_vs_protocol *pp)
169{ 121{
170 /* nothing to do now */ 122 /* nothing to do now */
@@ -195,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
195 .register_app = NULL, 147 .register_app = NULL,
196 .unregister_app = NULL, 148 .unregister_app = NULL,
197 .app_conn_bind = NULL, 149 .app_conn_bind = NULL,
198 .debug_packet = ah_esp_debug_packet, 150 .debug_packet = ip_vs_tcpudp_debug_packet,
199 .timeout_change = NULL, /* ISAKMP */ 151 .timeout_change = NULL, /* ISAKMP */
200 .set_state_timeout = NULL, 152 .set_state_timeout = NULL,
201}; 153};
@@ -219,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
219 .register_app = NULL, 171 .register_app = NULL,
220 .unregister_app = NULL, 172 .unregister_app = NULL,
221 .app_conn_bind = NULL, 173 .app_conn_bind = NULL,
222 .debug_packet = ah_esp_debug_packet, 174 .debug_packet = ip_vs_tcpudp_debug_packet,
223 .timeout_change = NULL, /* ISAKMP */ 175 .timeout_change = NULL, /* ISAKMP */
224}; 176};
225#endif 177#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 4c0855cb006e..d254345bfda7 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
31 if ((sch->type == SCTP_CID_INIT) && 31 if ((sch->type == SCTP_CID_INIT) &&
32 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, 32 (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
33 &iph.daddr, sh->dest))) { 33 &iph.daddr, sh->dest))) {
34 int ignored;
35
34 if (ip_vs_todrop()) { 36 if (ip_vs_todrop()) {
35 /* 37 /*
36 * It seems that we are very loaded. 38 * It seems that we are very loaded.
@@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
44 * Let the virtual server select a real server for the 46 * Let the virtual server select a real server for the
45 * incoming connection, and create a connection entry. 47 * incoming connection, and create a connection entry.
46 */ 48 */
47 *cpp = ip_vs_schedule(svc, skb); 49 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
48 if (!*cpp) { 50 if (!*cpp && !ignored) {
49 *verdict = ip_vs_leave(svc, skb, pp); 51 *verdict = ip_vs_leave(svc, skb, pp);
50 return 0; 52 return 0;
51 } 53 }
@@ -174,7 +176,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
174 176
175 if (val != cmp) { 177 if (val != cmp) {
176 /* CRC failure, dump it. */ 178 /* CRC failure, dump it. */
177 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 179 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
178 "Failed checksum for"); 180 "Failed checksum for");
179 return 0; 181 return 0;
180 } 182 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 282d24de8592..f6c5200e2146 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
43 return 0; 43 return 0;
44 } 44 }
45 45
46 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
46 if (th->syn && 47 if (th->syn &&
47 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, 48 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
48 th->dest))) { 49 th->dest))) {
50 int ignored;
51
49 if (ip_vs_todrop()) { 52 if (ip_vs_todrop()) {
50 /* 53 /*
51 * It seems that we are very loaded. 54 * It seems that we are very loaded.
@@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
60 * Let the virtual server select a real server for the 63 * Let the virtual server select a real server for the
61 * incoming connection, and create a connection entry. 64 * incoming connection, and create a connection entry.
62 */ 65 */
63 *cpp = ip_vs_schedule(svc, skb); 66 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
64 if (!*cpp) { 67 if (!*cpp && !ignored) {
65 *verdict = ip_vs_leave(svc, skb, pp); 68 *verdict = ip_vs_leave(svc, skb, pp);
66 return 0; 69 return 0;
67 } 70 }
@@ -101,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
101#ifdef CONFIG_IP_VS_IPV6 104#ifdef CONFIG_IP_VS_IPV6
102 if (af == AF_INET6) 105 if (af == AF_INET6)
103 tcph->check = 106 tcph->check =
104 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 107 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
105 ip_vs_check_diff2(oldlen, newlen, 108 ip_vs_check_diff2(oldlen, newlen,
106 ~csum_unfold(tcph->check)))); 109 csum_unfold(tcph->check))));
107 else 110 else
108#endif 111#endif
109 tcph->check = 112 tcph->check =
110 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 113 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
111 ip_vs_check_diff2(oldlen, newlen, 114 ip_vs_check_diff2(oldlen, newlen,
112 ~csum_unfold(tcph->check)))); 115 csum_unfold(tcph->check))));
113} 116}
114 117
115 118
@@ -120,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
120 struct tcphdr *tcph; 123 struct tcphdr *tcph;
121 unsigned int tcphoff; 124 unsigned int tcphoff;
122 int oldlen; 125 int oldlen;
126 int payload_csum = 0;
123 127
124#ifdef CONFIG_IP_VS_IPV6 128#ifdef CONFIG_IP_VS_IPV6
125 if (cp->af == AF_INET6) 129 if (cp->af == AF_INET6)
@@ -134,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
134 return 0; 138 return 0;
135 139
136 if (unlikely(cp->app != NULL)) { 140 if (unlikely(cp->app != NULL)) {
141 int ret;
142
137 /* Some checks before mangling */ 143 /* Some checks before mangling */
138 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 144 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
139 return 0; 145 return 0;
140 146
141 /* Call application helper if needed */ 147 /* Call application helper if needed */
142 if (!ip_vs_app_pkt_out(cp, skb)) 148 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
143 return 0; 149 return 0;
150 /* ret=2: csum update is needed after payload mangling */
151 if (ret == 1)
152 oldlen = skb->len - tcphoff;
153 else
154 payload_csum = 1;
144 } 155 }
145 156
146 tcph = (void *)skb_network_header(skb) + tcphoff; 157 tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -151,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
151 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 162 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
152 htons(oldlen), 163 htons(oldlen),
153 htons(skb->len - tcphoff)); 164 htons(skb->len - tcphoff));
154 } else if (!cp->app) { 165 } else if (!payload_csum) {
155 /* Only port and addr are changed, do fast csum update */ 166 /* Only port and addr are changed, do fast csum update */
156 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 167 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
157 cp->dport, cp->vport); 168 cp->dport, cp->vport);
158 if (skb->ip_summed == CHECKSUM_COMPLETE) 169 if (skb->ip_summed == CHECKSUM_COMPLETE)
159 skb->ip_summed = CHECKSUM_NONE; 170 skb->ip_summed = (cp->app && pp->csum_check) ?
171 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
160 } else { 172 } else {
161 /* full checksum calculation */ 173 /* full checksum calculation */
162 tcph->check = 0; 174 tcph->check = 0;
@@ -174,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
174 skb->len - tcphoff, 186 skb->len - tcphoff,
175 cp->protocol, 187 cp->protocol,
176 skb->csum); 188 skb->csum);
189 skb->ip_summed = CHECKSUM_UNNECESSARY;
177 190
178 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 191 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
179 pp->name, tcph->check, 192 pp->name, tcph->check,
@@ -190,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
190 struct tcphdr *tcph; 203 struct tcphdr *tcph;
191 unsigned int tcphoff; 204 unsigned int tcphoff;
192 int oldlen; 205 int oldlen;
206 int payload_csum = 0;
193 207
194#ifdef CONFIG_IP_VS_IPV6 208#ifdef CONFIG_IP_VS_IPV6
195 if (cp->af == AF_INET6) 209 if (cp->af == AF_INET6)
@@ -204,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
204 return 0; 218 return 0;
205 219
206 if (unlikely(cp->app != NULL)) { 220 if (unlikely(cp->app != NULL)) {
221 int ret;
222
207 /* Some checks before mangling */ 223 /* Some checks before mangling */
208 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 224 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
209 return 0; 225 return 0;
@@ -212,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
212 * Attempt ip_vs_app call. 228 * Attempt ip_vs_app call.
213 * It will fix ip_vs_conn and iph ack_seq stuff 229 * It will fix ip_vs_conn and iph ack_seq stuff
214 */ 230 */
215 if (!ip_vs_app_pkt_in(cp, skb)) 231 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
216 return 0; 232 return 0;
233 /* ret=2: csum update is needed after payload mangling */
234 if (ret == 1)
235 oldlen = skb->len - tcphoff;
236 else
237 payload_csum = 1;
217 } 238 }
218 239
219 tcph = (void *)skb_network_header(skb) + tcphoff; 240 tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -223,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
223 * Adjust TCP checksums 244 * Adjust TCP checksums
224 */ 245 */
225 if (skb->ip_summed == CHECKSUM_PARTIAL) { 246 if (skb->ip_summed == CHECKSUM_PARTIAL) {
226 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, 247 tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
227 htons(oldlen), 248 htons(oldlen),
228 htons(skb->len - tcphoff)); 249 htons(skb->len - tcphoff));
229 } else if (!cp->app) { 250 } else if (!payload_csum) {
230 /* Only port and addr are changed, do fast csum update */ 251 /* Only port and addr are changed, do fast csum update */
231 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, 252 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
232 cp->vport, cp->dport); 253 cp->vport, cp->dport);
233 if (skb->ip_summed == CHECKSUM_COMPLETE) 254 if (skb->ip_summed == CHECKSUM_COMPLETE)
234 skb->ip_summed = CHECKSUM_NONE; 255 skb->ip_summed = (cp->app && pp->csum_check) ?
256 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
235 } else { 257 } else {
236 /* full checksum calculation */ 258 /* full checksum calculation */
237 tcph->check = 0; 259 tcph->check = 0;
@@ -278,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
278 skb->len - tcphoff, 300 skb->len - tcphoff,
279 ipv6_hdr(skb)->nexthdr, 301 ipv6_hdr(skb)->nexthdr,
280 skb->csum)) { 302 skb->csum)) {
281 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 303 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
282 "Failed checksum for"); 304 "Failed checksum for");
283 return 0; 305 return 0;
284 } 306 }
@@ -289,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
289 skb->len - tcphoff, 311 skb->len - tcphoff,
290 ip_hdr(skb)->protocol, 312 ip_hdr(skb)->protocol,
291 skb->csum)) { 313 skb->csum)) {
292 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 314 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
293 "Failed checksum for"); 315 "Failed checksum for");
294 return 0; 316 return 0;
295 } 317 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 8553231b5d41..9d106a06bb0a 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
46 svc = ip_vs_service_get(af, skb->mark, iph.protocol, 46 svc = ip_vs_service_get(af, skb->mark, iph.protocol,
47 &iph.daddr, uh->dest); 47 &iph.daddr, uh->dest);
48 if (svc) { 48 if (svc) {
49 int ignored;
50
49 if (ip_vs_todrop()) { 51 if (ip_vs_todrop()) {
50 /* 52 /*
51 * It seems that we are very loaded. 53 * It seems that we are very loaded.
@@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
60 * Let the virtual server select a real server for the 62 * Let the virtual server select a real server for the
61 * incoming connection, and create a connection entry. 63 * incoming connection, and create a connection entry.
62 */ 64 */
63 *cpp = ip_vs_schedule(svc, skb); 65 *cpp = ip_vs_schedule(svc, skb, pp, &ignored);
64 if (!*cpp) { 66 if (!*cpp && !ignored) {
65 *verdict = ip_vs_leave(svc, skb, pp); 67 *verdict = ip_vs_leave(svc, skb, pp);
66 return 0; 68 return 0;
67 } 69 }
@@ -102,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
102#ifdef CONFIG_IP_VS_IPV6 104#ifdef CONFIG_IP_VS_IPV6
103 if (af == AF_INET6) 105 if (af == AF_INET6)
104 uhdr->check = 106 uhdr->check =
105 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, 107 ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
106 ip_vs_check_diff2(oldlen, newlen, 108 ip_vs_check_diff2(oldlen, newlen,
107 ~csum_unfold(uhdr->check)))); 109 csum_unfold(uhdr->check))));
108 else 110 else
109#endif 111#endif
110 uhdr->check = 112 uhdr->check =
111 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, 113 ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
112 ip_vs_check_diff2(oldlen, newlen, 114 ip_vs_check_diff2(oldlen, newlen,
113 ~csum_unfold(uhdr->check)))); 115 csum_unfold(uhdr->check))));
114} 116}
115 117
116 118
@@ -121,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
121 struct udphdr *udph; 123 struct udphdr *udph;
122 unsigned int udphoff; 124 unsigned int udphoff;
123 int oldlen; 125 int oldlen;
126 int payload_csum = 0;
124 127
125#ifdef CONFIG_IP_VS_IPV6 128#ifdef CONFIG_IP_VS_IPV6
126 if (cp->af == AF_INET6) 129 if (cp->af == AF_INET6)
@@ -135,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
135 return 0; 138 return 0;
136 139
137 if (unlikely(cp->app != NULL)) { 140 if (unlikely(cp->app != NULL)) {
141 int ret;
142
138 /* Some checks before mangling */ 143 /* Some checks before mangling */
139 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 144 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
140 return 0; 145 return 0;
@@ -142,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
142 /* 147 /*
143 * Call application helper if needed 148 * Call application helper if needed
144 */ 149 */
145 if (!ip_vs_app_pkt_out(cp, skb)) 150 if (!(ret = ip_vs_app_pkt_out(cp, skb)))
146 return 0; 151 return 0;
152 /* ret=2: csum update is needed after payload mangling */
153 if (ret == 1)
154 oldlen = skb->len - udphoff;
155 else
156 payload_csum = 1;
147 } 157 }
148 158
149 udph = (void *)skb_network_header(skb) + udphoff; 159 udph = (void *)skb_network_header(skb) + udphoff;
@@ -156,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
156 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 166 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
157 htons(oldlen), 167 htons(oldlen),
158 htons(skb->len - udphoff)); 168 htons(skb->len - udphoff));
159 } else if (!cp->app && (udph->check != 0)) { 169 } else if (!payload_csum && (udph->check != 0)) {
160 /* Only port and addr are changed, do fast csum update */ 170 /* Only port and addr are changed, do fast csum update */
161 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 171 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
162 cp->dport, cp->vport); 172 cp->dport, cp->vport);
163 if (skb->ip_summed == CHECKSUM_COMPLETE) 173 if (skb->ip_summed == CHECKSUM_COMPLETE)
164 skb->ip_summed = CHECKSUM_NONE; 174 skb->ip_summed = (cp->app && pp->csum_check) ?
175 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
165 } else { 176 } else {
166 /* full checksum calculation */ 177 /* full checksum calculation */
167 udph->check = 0; 178 udph->check = 0;
@@ -181,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
181 skb->csum); 192 skb->csum);
182 if (udph->check == 0) 193 if (udph->check == 0)
183 udph->check = CSUM_MANGLED_0; 194 udph->check = CSUM_MANGLED_0;
195 skb->ip_summed = CHECKSUM_UNNECESSARY;
184 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 196 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
185 pp->name, udph->check, 197 pp->name, udph->check,
186 (char*)&(udph->check) - (char*)udph); 198 (char*)&(udph->check) - (char*)udph);
@@ -196,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
196 struct udphdr *udph; 208 struct udphdr *udph;
197 unsigned int udphoff; 209 unsigned int udphoff;
198 int oldlen; 210 int oldlen;
211 int payload_csum = 0;
199 212
200#ifdef CONFIG_IP_VS_IPV6 213#ifdef CONFIG_IP_VS_IPV6
201 if (cp->af == AF_INET6) 214 if (cp->af == AF_INET6)
@@ -210,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
210 return 0; 223 return 0;
211 224
212 if (unlikely(cp->app != NULL)) { 225 if (unlikely(cp->app != NULL)) {
226 int ret;
227
213 /* Some checks before mangling */ 228 /* Some checks before mangling */
214 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) 229 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
215 return 0; 230 return 0;
@@ -218,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
218 * Attempt ip_vs_app call. 233 * Attempt ip_vs_app call.
219 * It will fix ip_vs_conn 234 * It will fix ip_vs_conn
220 */ 235 */
221 if (!ip_vs_app_pkt_in(cp, skb)) 236 if (!(ret = ip_vs_app_pkt_in(cp, skb)))
222 return 0; 237 return 0;
238 /* ret=2: csum update is needed after payload mangling */
239 if (ret == 1)
240 oldlen = skb->len - udphoff;
241 else
242 payload_csum = 1;
223 } 243 }
224 244
225 udph = (void *)skb_network_header(skb) + udphoff; 245 udph = (void *)skb_network_header(skb) + udphoff;
@@ -229,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
229 * Adjust UDP checksums 249 * Adjust UDP checksums
230 */ 250 */
231 if (skb->ip_summed == CHECKSUM_PARTIAL) { 251 if (skb->ip_summed == CHECKSUM_PARTIAL) {
232 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, 252 udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
233 htons(oldlen), 253 htons(oldlen),
234 htons(skb->len - udphoff)); 254 htons(skb->len - udphoff));
235 } else if (!cp->app && (udph->check != 0)) { 255 } else if (!payload_csum && (udph->check != 0)) {
236 /* Only port and addr are changed, do fast csum update */ 256 /* Only port and addr are changed, do fast csum update */
237 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, 257 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
238 cp->vport, cp->dport); 258 cp->vport, cp->dport);
239 if (skb->ip_summed == CHECKSUM_COMPLETE) 259 if (skb->ip_summed == CHECKSUM_COMPLETE)
240 skb->ip_summed = CHECKSUM_NONE; 260 skb->ip_summed = (cp->app && pp->csum_check) ?
261 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
241 } else { 262 } else {
242 /* full checksum calculation */ 263 /* full checksum calculation */
243 udph->check = 0; 264 udph->check = 0;
@@ -293,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
293 skb->len - udphoff, 314 skb->len - udphoff,
294 ipv6_hdr(skb)->nexthdr, 315 ipv6_hdr(skb)->nexthdr,
295 skb->csum)) { 316 skb->csum)) {
296 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 317 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
297 "Failed checksum for"); 318 "Failed checksum for");
298 return 0; 319 return 0;
299 } 320 }
@@ -304,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
304 skb->len - udphoff, 325 skb->len - udphoff,
305 ip_hdr(skb)->protocol, 326 ip_hdr(skb)->protocol,
306 skb->csum)) { 327 skb->csum)) {
307 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 328 IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
308 "Failed checksum for"); 329 "Failed checksum for");
309 return 0; 330 return 0;
310 } 331 }
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b0bd8afbf368..de04ea39cde8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -11,6 +11,16 @@
11 * 11 *
12 * Changes: 12 * Changes:
13 * 13 *
14 * Description of forwarding methods:
15 * - all transmitters are called from LOCAL_IN (remote clients) and
16 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
17 * - not all connections have destination server, for example,
18 * connections in backup server when fwmark is used
19 * - bypass connections use daddr from packet
20 * LOCAL_OUT rules:
21 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
22 * - skb->pkt_type is not set yet
23 * - the only place where we can see skb->sk != NULL
14 */ 24 */
15 25
16#define KMSG_COMPONENT "IPVS" 26#define KMSG_COMPONENT "IPVS"
@@ -67,12 +77,19 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
67 return dst; 77 return dst;
68} 78}
69 79
80/*
81 * Get route to destination or remote server
82 * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
83 * &4=Allow redirect from remote daddr to local
84 */
70static struct rtable * 85static struct rtable *
71__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) 86__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
87 __be32 daddr, u32 rtos, int rt_mode)
72{ 88{
73 struct net *net = dev_net(skb->dev); 89 struct net *net = dev_net(skb_dst(skb)->dev);
74 struct rtable *rt; /* Route to the other host */ 90 struct rtable *rt; /* Route to the other host */
75 struct ip_vs_dest *dest = cp->dest; 91 struct rtable *ort; /* Original route */
92 int local;
76 93
77 if (dest) { 94 if (dest) {
78 spin_lock(&dest->dst_lock); 95 spin_lock(&dest->dst_lock);
@@ -104,23 +121,95 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
104 .oif = 0, 121 .oif = 0,
105 .nl_u = { 122 .nl_u = {
106 .ip4_u = { 123 .ip4_u = {
107 .daddr = cp->daddr.ip, 124 .daddr = daddr,
108 .saddr = 0, 125 .saddr = 0,
109 .tos = rtos, } }, 126 .tos = rtos, } },
110 }; 127 };
111 128
112 if (ip_route_output_key(net, &rt, &fl)) { 129 if (ip_route_output_key(net, &rt, &fl)) {
113 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", 130 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
114 &cp->daddr.ip); 131 &daddr);
115 return NULL; 132 return NULL;
116 } 133 }
117 } 134 }
118 135
136 local = rt->rt_flags & RTCF_LOCAL;
137 if (!((local ? 1 : 2) & rt_mode)) {
138 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
139 (rt->rt_flags & RTCF_LOCAL) ?
140 "local":"non-local", &rt->rt_dst);
141 ip_rt_put(rt);
142 return NULL;
143 }
144 if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
145 ort->rt_flags & RTCF_LOCAL)) {
146 IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
147 "requires NAT method, dest: %pI4\n",
148 &ip_hdr(skb)->daddr, &rt->rt_dst);
149 ip_rt_put(rt);
150 return NULL;
151 }
152 if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
153 IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
154 "to non-local address, dest: %pI4\n",
155 &ip_hdr(skb)->saddr, &rt->rt_dst);
156 ip_rt_put(rt);
157 return NULL;
158 }
159
119 return rt; 160 return rt;
120} 161}
121 162
163/* Reroute packet to local IPv4 stack after DNAT */
164static int
165__ip_vs_reroute_locally(struct sk_buff *skb)
166{
167 struct rtable *rt = skb_rtable(skb);
168 struct net_device *dev = rt->dst.dev;
169 struct net *net = dev_net(dev);
170 struct iphdr *iph = ip_hdr(skb);
171
172 if (rt->fl.iif) {
173 unsigned long orefdst = skb->_skb_refdst;
174
175 if (ip_route_input(skb, iph->daddr, iph->saddr,
176 iph->tos, skb->dev))
177 return 0;
178 refdst_drop(orefdst);
179 } else {
180 struct flowi fl = {
181 .oif = 0,
182 .nl_u = {
183 .ip4_u = {
184 .daddr = iph->daddr,
185 .saddr = iph->saddr,
186 .tos = RT_TOS(iph->tos),
187 }
188 },
189 .mark = skb->mark,
190 };
191 struct rtable *rt;
192
193 if (ip_route_output_key(net, &rt, &fl))
194 return 0;
195 if (!(rt->rt_flags & RTCF_LOCAL)) {
196 ip_rt_put(rt);
197 return 0;
198 }
199 /* Drop old route. */
200 skb_dst_drop(skb);
201 skb_dst_set(skb, &rt->dst);
202 }
203 return 1;
204}
205
122#ifdef CONFIG_IP_VS_IPV6 206#ifdef CONFIG_IP_VS_IPV6
123 207
208static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
209{
210 return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
211}
212
124static struct dst_entry * 213static struct dst_entry *
125__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, 214__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
126 struct in6_addr *ret_saddr, int do_xfrm) 215 struct in6_addr *ret_saddr, int do_xfrm)
@@ -155,14 +244,21 @@ out_err:
155 return NULL; 244 return NULL;
156} 245}
157 246
247/*
248 * Get route to destination or remote server
249 * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
250 * &4=Allow redirect from remote daddr to local
251 */
158static struct rt6_info * 252static struct rt6_info *
159__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 253__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
160 struct in6_addr *ret_saddr, int do_xfrm) 254 struct in6_addr *daddr, struct in6_addr *ret_saddr,
255 int do_xfrm, int rt_mode)
161{ 256{
162 struct net *net = dev_net(skb->dev); 257 struct net *net = dev_net(skb_dst(skb)->dev);
163 struct rt6_info *rt; /* Route to the other host */ 258 struct rt6_info *rt; /* Route to the other host */
164 struct ip_vs_dest *dest = cp->dest; 259 struct rt6_info *ort; /* Original route */
165 struct dst_entry *dst; 260 struct dst_entry *dst;
261 int local;
166 262
167 if (dest) { 263 if (dest) {
168 spin_lock(&dest->dst_lock); 264 spin_lock(&dest->dst_lock);
@@ -188,13 +284,38 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
188 ipv6_addr_copy(ret_saddr, &dest->dst_saddr); 284 ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
189 spin_unlock(&dest->dst_lock); 285 spin_unlock(&dest->dst_lock);
190 } else { 286 } else {
191 dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr, 287 dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
192 do_xfrm);
193 if (!dst) 288 if (!dst)
194 return NULL; 289 return NULL;
195 rt = (struct rt6_info *) dst; 290 rt = (struct rt6_info *) dst;
196 } 291 }
197 292
293 local = __ip_vs_is_local_route6(rt);
294 if (!((local ? 1 : 2) & rt_mode)) {
295 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
296 local ? "local":"non-local", daddr);
297 dst_release(&rt->dst);
298 return NULL;
299 }
300 if (local && !(rt_mode & 4) &&
301 !((ort = (struct rt6_info *) skb_dst(skb)) &&
302 __ip_vs_is_local_route6(ort))) {
303 IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
304 "requires NAT method, dest: %pI6\n",
305 &ipv6_hdr(skb)->daddr, daddr);
306 dst_release(&rt->dst);
307 return NULL;
308 }
309 if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
310 ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
311 IPV6_ADDR_LOOPBACK)) {
312 IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
313 "to non-local address, dest: %pI6\n",
314 &ipv6_hdr(skb)->saddr, daddr);
315 dst_release(&rt->dst);
316 return NULL;
317 }
318
198 return rt; 319 return rt;
199} 320}
200#endif 321#endif
@@ -217,30 +338,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
217({ \ 338({ \
218 int __ret = NF_ACCEPT; \ 339 int __ret = NF_ACCEPT; \
219 \ 340 \
341 (skb)->ipvs_property = 1; \
220 if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ 342 if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
221 __ret = ip_vs_confirm_conntrack(skb, cp); \ 343 __ret = ip_vs_confirm_conntrack(skb, cp); \
222 if (__ret == NF_ACCEPT) { \ 344 if (__ret == NF_ACCEPT) { \
223 nf_reset(skb); \ 345 nf_reset(skb); \
224 (skb)->ip_summed = CHECKSUM_NONE; \ 346 skb_forward_csum(skb); \
225 } \ 347 } \
226 __ret; \ 348 __ret; \
227}) 349})
228 350
229#define IP_VS_XMIT_NAT(pf, skb, cp) \ 351#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
230do { \ 352do { \
353 (skb)->ipvs_property = 1; \
231 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ 354 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
232 (skb)->ipvs_property = 1; \ 355 ip_vs_notrack(skb); \
233 else \ 356 else \
234 ip_vs_update_conntrack(skb, cp, 1); \ 357 ip_vs_update_conntrack(skb, cp, 1); \
358 if (local) \
359 return NF_ACCEPT; \
235 skb_forward_csum(skb); \ 360 skb_forward_csum(skb); \
236 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ 361 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
237 skb_dst(skb)->dev, dst_output); \ 362 skb_dst(skb)->dev, dst_output); \
238} while (0) 363} while (0)
239 364
240#define IP_VS_XMIT(pf, skb, cp) \ 365#define IP_VS_XMIT(pf, skb, cp, local) \
241do { \ 366do { \
367 (skb)->ipvs_property = 1; \
242 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ 368 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
243 (skb)->ipvs_property = 1; \ 369 ip_vs_notrack(skb); \
370 if (local) \
371 return NF_ACCEPT; \
244 skb_forward_csum(skb); \ 372 skb_forward_csum(skb); \
245 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ 373 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
246 skb_dst(skb)->dev, dst_output); \ 374 skb_dst(skb)->dev, dst_output); \
@@ -255,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
255 struct ip_vs_protocol *pp) 383 struct ip_vs_protocol *pp)
256{ 384{
257 /* we do not touch skb and do not need pskb ptr */ 385 /* we do not touch skb and do not need pskb ptr */
258 return NF_ACCEPT; 386 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
259} 387}
260 388
261 389
@@ -268,27 +396,15 @@ int
268ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, 396ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
269 struct ip_vs_protocol *pp) 397 struct ip_vs_protocol *pp)
270{ 398{
271 struct net *net = dev_net(skb->dev);
272 struct rtable *rt; /* Route to the other host */ 399 struct rtable *rt; /* Route to the other host */
273 struct iphdr *iph = ip_hdr(skb); 400 struct iphdr *iph = ip_hdr(skb);
274 u8 tos = iph->tos;
275 int mtu; 401 int mtu;
276 struct flowi fl = {
277 .oif = 0,
278 .nl_u = {
279 .ip4_u = {
280 .daddr = iph->daddr,
281 .saddr = 0,
282 .tos = RT_TOS(tos), } },
283 };
284 402
285 EnterFunction(10); 403 EnterFunction(10);
286 404
287 if (ip_route_output_key(net, &rt, &fl)) { 405 if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
288 IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", 406 RT_TOS(iph->tos), 2)))
289 __func__, &iph->daddr);
290 goto tx_error_icmp; 407 goto tx_error_icmp;
291 }
292 408
293 /* MTU checking */ 409 /* MTU checking */
294 mtu = dst_mtu(&rt->dst); 410 mtu = dst_mtu(&rt->dst);
@@ -316,7 +432,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
316 /* Another hack: avoid icmp_send in ip_fragment */ 432 /* Another hack: avoid icmp_send in ip_fragment */
317 skb->local_df = 1; 433 skb->local_df = 1;
318 434
319 IP_VS_XMIT(NFPROTO_IPV4, skb, cp); 435 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
320 436
321 LeaveFunction(10); 437 LeaveFunction(10);
322 return NF_STOLEN; 438 return NF_STOLEN;
@@ -334,24 +450,25 @@ int
334ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 450ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
335 struct ip_vs_protocol *pp) 451 struct ip_vs_protocol *pp)
336{ 452{
337 struct net *net = dev_net(skb->dev);
338 struct dst_entry *dst;
339 struct rt6_info *rt; /* Route to the other host */ 453 struct rt6_info *rt; /* Route to the other host */
340 struct ipv6hdr *iph = ipv6_hdr(skb); 454 struct ipv6hdr *iph = ipv6_hdr(skb);
341 int mtu; 455 int mtu;
342 456
343 EnterFunction(10); 457 EnterFunction(10);
344 458
345 dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0); 459 if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
346 if (!dst)
347 goto tx_error_icmp; 460 goto tx_error_icmp;
348 rt = (struct rt6_info *) dst;
349 461
350 /* MTU checking */ 462 /* MTU checking */
351 mtu = dst_mtu(&rt->dst); 463 mtu = dst_mtu(&rt->dst);
352 if (skb->len > mtu) { 464 if (skb->len > mtu) {
353 dst_release(&rt->dst); 465 if (!skb->dev) {
466 struct net *net = dev_net(skb_dst(skb)->dev);
467
468 skb->dev = net->loopback_dev;
469 }
354 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 470 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
471 dst_release(&rt->dst);
355 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 472 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
356 goto tx_error; 473 goto tx_error;
357 } 474 }
@@ -373,7 +490,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
373 /* Another hack: avoid icmp_send in ip_fragment */ 490 /* Another hack: avoid icmp_send in ip_fragment */
374 skb->local_df = 1; 491 skb->local_df = 1;
375 492
376 IP_VS_XMIT(NFPROTO_IPV6, skb, cp); 493 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
377 494
378 LeaveFunction(10); 495 LeaveFunction(10);
379 return NF_STOLEN; 496 return NF_STOLEN;
@@ -398,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
398 struct rtable *rt; /* Route to the other host */ 515 struct rtable *rt; /* Route to the other host */
399 int mtu; 516 int mtu;
400 struct iphdr *iph = ip_hdr(skb); 517 struct iphdr *iph = ip_hdr(skb);
518 int local;
401 519
402 EnterFunction(10); 520 EnterFunction(10);
403 521
@@ -411,16 +529,42 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
411 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 529 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
412 } 530 }
413 531
414 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) 532 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
533 RT_TOS(iph->tos), 1|2|4)))
415 goto tx_error_icmp; 534 goto tx_error_icmp;
535 local = rt->rt_flags & RTCF_LOCAL;
536 /*
537 * Avoid duplicate tuple in reply direction for NAT traffic
538 * to local address when connection is sync-ed
539 */
540#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
541 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
542 enum ip_conntrack_info ctinfo;
543 struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
544
545 if (ct && !nf_ct_is_untracked(ct)) {
546 IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
547 "ip_vs_nat_xmit(): "
548 "stopping DNAT to local address");
549 goto tx_error_put;
550 }
551 }
552#endif
553
554 /* From world but DNAT to loopback address? */
555 if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
556 IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
557 "stopping DNAT to loopback address");
558 goto tx_error_put;
559 }
416 560
417 /* MTU checking */ 561 /* MTU checking */
418 mtu = dst_mtu(&rt->dst); 562 mtu = dst_mtu(&rt->dst);
419 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 563 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
420 ip_rt_put(rt);
421 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 564 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
422 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); 565 IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
423 goto tx_error; 566 "ip_vs_nat_xmit(): frag needed for");
567 goto tx_error_put;
424 } 568 }
425 569
426 /* copy-on-write the packet before mangling it */ 570 /* copy-on-write the packet before mangling it */
@@ -430,17 +574,28 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
430 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 574 if (skb_cow(skb, rt->dst.dev->hard_header_len))
431 goto tx_error_put; 575 goto tx_error_put;
432 576
433 /* drop old route */
434 skb_dst_drop(skb);
435 skb_dst_set(skb, &rt->dst);
436
437 /* mangle the packet */ 577 /* mangle the packet */
438 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) 578 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
439 goto tx_error; 579 goto tx_error_put;
440 ip_hdr(skb)->daddr = cp->daddr.ip; 580 ip_hdr(skb)->daddr = cp->daddr.ip;
441 ip_send_check(ip_hdr(skb)); 581 ip_send_check(ip_hdr(skb));
442 582
443 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 583 if (!local) {
584 /* drop old route */
585 skb_dst_drop(skb);
586 skb_dst_set(skb, &rt->dst);
587 } else {
588 ip_rt_put(rt);
589 /*
590 * Some IPv4 replies get local address from routes,
591 * not from iph, so while we DNAT after routing
592 * we need this second input/output route.
593 */
594 if (!__ip_vs_reroute_locally(skb))
595 goto tx_error;
596 }
597
598 IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
444 599
445 /* FIXME: when application helper enlarges the packet and the length 600 /* FIXME: when application helper enlarges the packet and the length
446 is larger than the MTU of outgoing device, there will be still 601 is larger than the MTU of outgoing device, there will be still
@@ -449,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
449 /* Another hack: avoid icmp_send in ip_fragment */ 604 /* Another hack: avoid icmp_send in ip_fragment */
450 skb->local_df = 1; 605 skb->local_df = 1;
451 606
452 IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp); 607 IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
453 608
454 LeaveFunction(10); 609 LeaveFunction(10);
455 return NF_STOLEN; 610 return NF_STOLEN;
@@ -472,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
472{ 627{
473 struct rt6_info *rt; /* Route to the other host */ 628 struct rt6_info *rt; /* Route to the other host */
474 int mtu; 629 int mtu;
630 int local;
475 631
476 EnterFunction(10); 632 EnterFunction(10);
477 633
@@ -486,18 +642,49 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
486 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); 642 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
487 } 643 }
488 644
489 rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); 645 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
490 if (!rt) 646 0, 1|2|4)))
491 goto tx_error_icmp; 647 goto tx_error_icmp;
648 local = __ip_vs_is_local_route6(rt);
649 /*
650 * Avoid duplicate tuple in reply direction for NAT traffic
651 * to local address when connection is sync-ed
652 */
653#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
654 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
655 enum ip_conntrack_info ctinfo;
656 struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
657
658 if (ct && !nf_ct_is_untracked(ct)) {
659 IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
660 "ip_vs_nat_xmit_v6(): "
661 "stopping DNAT to local address");
662 goto tx_error_put;
663 }
664 }
665#endif
666
667 /* From world but DNAT to loopback address? */
668 if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
669 ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
670 IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
671 "ip_vs_nat_xmit_v6(): "
672 "stopping DNAT to loopback address");
673 goto tx_error_put;
674 }
492 675
493 /* MTU checking */ 676 /* MTU checking */
494 mtu = dst_mtu(&rt->dst); 677 mtu = dst_mtu(&rt->dst);
495 if (skb->len > mtu) { 678 if (skb->len > mtu) {
496 dst_release(&rt->dst); 679 if (!skb->dev) {
680 struct net *net = dev_net(skb_dst(skb)->dev);
681
682 skb->dev = net->loopback_dev;
683 }
497 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 684 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
498 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 685 IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
499 "ip_vs_nat_xmit_v6(): frag needed for"); 686 "ip_vs_nat_xmit_v6(): frag needed for");
500 goto tx_error; 687 goto tx_error_put;
501 } 688 }
502 689
503 /* copy-on-write the packet before mangling it */ 690 /* copy-on-write the packet before mangling it */
@@ -507,16 +694,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
507 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 694 if (skb_cow(skb, rt->dst.dev->hard_header_len))
508 goto tx_error_put; 695 goto tx_error_put;
509 696
510 /* drop old route */
511 skb_dst_drop(skb);
512 skb_dst_set(skb, &rt->dst);
513
514 /* mangle the packet */ 697 /* mangle the packet */
515 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) 698 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
516 goto tx_error; 699 goto tx_error;
517 ipv6_hdr(skb)->daddr = cp->daddr.in6; 700 ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
518 701
519 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 702 if (!local || !skb->dev) {
703 /* drop the old route when skb is not shared */
704 skb_dst_drop(skb);
705 skb_dst_set(skb, &rt->dst);
706 } else {
707 /* destined to loopback, do we need to change route? */
708 dst_release(&rt->dst);
709 }
710
711 IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
520 712
521 /* FIXME: when application helper enlarges the packet and the length 713 /* FIXME: when application helper enlarges the packet and the length
522 is larger than the MTU of outgoing device, there will be still 714 is larger than the MTU of outgoing device, there will be still
@@ -525,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
525 /* Another hack: avoid icmp_send in ip_fragment */ 717 /* Another hack: avoid icmp_send in ip_fragment */
526 skb->local_df = 1; 718 skb->local_df = 1;
527 719
528 IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp); 720 IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
529 721
530 LeaveFunction(10); 722 LeaveFunction(10);
531 return NF_STOLEN; 723 return NF_STOLEN;
@@ -578,23 +770,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
578 770
579 EnterFunction(10); 771 EnterFunction(10);
580 772
581 if (skb->protocol != htons(ETH_P_IP)) { 773 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
582 IP_VS_DBG_RL("%s(): protocol error, " 774 RT_TOS(tos), 1|2)))
583 "ETH_P_IP: %d, skb protocol: %d\n",
584 __func__, htons(ETH_P_IP), skb->protocol);
585 goto tx_error;
586 }
587
588 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
589 goto tx_error_icmp; 775 goto tx_error_icmp;
776 if (rt->rt_flags & RTCF_LOCAL) {
777 ip_rt_put(rt);
778 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
779 }
590 780
591 tdev = rt->dst.dev; 781 tdev = rt->dst.dev;
592 782
593 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); 783 mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
594 if (mtu < 68) { 784 if (mtu < 68) {
595 ip_rt_put(rt);
596 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); 785 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
597 goto tx_error; 786 goto tx_error_put;
598 } 787 }
599 if (skb_dst(skb)) 788 if (skb_dst(skb))
600 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 789 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -604,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
604 if ((old_iph->frag_off & htons(IP_DF)) 793 if ((old_iph->frag_off & htons(IP_DF))
605 && mtu < ntohs(old_iph->tot_len)) { 794 && mtu < ntohs(old_iph->tot_len)) {
606 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 795 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
607 ip_rt_put(rt);
608 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 796 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
609 goto tx_error; 797 goto tx_error_put;
610 } 798 }
611 799
612 /* 800 /*
@@ -675,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
675 kfree_skb(skb); 863 kfree_skb(skb);
676 LeaveFunction(10); 864 LeaveFunction(10);
677 return NF_STOLEN; 865 return NF_STOLEN;
866tx_error_put:
867 ip_rt_put(rt);
868 goto tx_error;
678} 869}
679 870
680#ifdef CONFIG_IP_VS_IPV6 871#ifdef CONFIG_IP_VS_IPV6
@@ -693,34 +884,34 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
693 884
694 EnterFunction(10); 885 EnterFunction(10);
695 886
696 if (skb->protocol != htons(ETH_P_IPV6)) { 887 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
697 IP_VS_DBG_RL("%s(): protocol error, " 888 &saddr, 1, 1|2)))
698 "ETH_P_IPV6: %d, skb protocol: %d\n",
699 __func__, htons(ETH_P_IPV6), skb->protocol);
700 goto tx_error;
701 }
702
703 rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
704 if (!rt)
705 goto tx_error_icmp; 889 goto tx_error_icmp;
890 if (__ip_vs_is_local_route6(rt)) {
891 dst_release(&rt->dst);
892 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
893 }
706 894
707 tdev = rt->dst.dev; 895 tdev = rt->dst.dev;
708 896
709 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); 897 mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
710 if (mtu < IPV6_MIN_MTU) { 898 if (mtu < IPV6_MIN_MTU) {
711 dst_release(&rt->dst);
712 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, 899 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
713 IPV6_MIN_MTU); 900 IPV6_MIN_MTU);
714 goto tx_error; 901 goto tx_error_put;
715 } 902 }
716 if (skb_dst(skb)) 903 if (skb_dst(skb))
717 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 904 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
718 905
719 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { 906 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
907 if (!skb->dev) {
908 struct net *net = dev_net(skb_dst(skb)->dev);
909
910 skb->dev = net->loopback_dev;
911 }
720 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 912 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
721 dst_release(&rt->dst);
722 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 913 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
723 goto tx_error; 914 goto tx_error_put;
724 } 915 }
725 916
726 /* 917 /*
@@ -786,6 +977,9 @@ tx_error:
786 kfree_skb(skb); 977 kfree_skb(skb);
787 LeaveFunction(10); 978 LeaveFunction(10);
788 return NF_STOLEN; 979 return NF_STOLEN;
980tx_error_put:
981 dst_release(&rt->dst);
982 goto tx_error;
789} 983}
790#endif 984#endif
791 985
@@ -804,8 +998,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
804 998
805 EnterFunction(10); 999 EnterFunction(10);
806 1000
807 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) 1001 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
1002 RT_TOS(iph->tos), 1|2)))
808 goto tx_error_icmp; 1003 goto tx_error_icmp;
1004 if (rt->rt_flags & RTCF_LOCAL) {
1005 ip_rt_put(rt);
1006 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
1007 }
809 1008
810 /* MTU checking */ 1009 /* MTU checking */
811 mtu = dst_mtu(&rt->dst); 1010 mtu = dst_mtu(&rt->dst);
@@ -833,7 +1032,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
833 /* Another hack: avoid icmp_send in ip_fragment */ 1032 /* Another hack: avoid icmp_send in ip_fragment */
834 skb->local_df = 1; 1033 skb->local_df = 1;
835 1034
836 IP_VS_XMIT(NFPROTO_IPV4, skb, cp); 1035 IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
837 1036
838 LeaveFunction(10); 1037 LeaveFunction(10);
839 return NF_STOLEN; 1038 return NF_STOLEN;
@@ -856,13 +1055,22 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
856 1055
857 EnterFunction(10); 1056 EnterFunction(10);
858 1057
859 rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); 1058 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
860 if (!rt) 1059 0, 1|2)))
861 goto tx_error_icmp; 1060 goto tx_error_icmp;
1061 if (__ip_vs_is_local_route6(rt)) {
1062 dst_release(&rt->dst);
1063 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
1064 }
862 1065
863 /* MTU checking */ 1066 /* MTU checking */
864 mtu = dst_mtu(&rt->dst); 1067 mtu = dst_mtu(&rt->dst);
865 if (skb->len > mtu) { 1068 if (skb->len > mtu) {
1069 if (!skb->dev) {
1070 struct net *net = dev_net(skb_dst(skb)->dev);
1071
1072 skb->dev = net->loopback_dev;
1073 }
866 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1074 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
867 dst_release(&rt->dst); 1075 dst_release(&rt->dst);
868 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1076 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -886,7 +1094,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
886 /* Another hack: avoid icmp_send in ip_fragment */ 1094 /* Another hack: avoid icmp_send in ip_fragment */
887 skb->local_df = 1; 1095 skb->local_df = 1;
888 1096
889 IP_VS_XMIT(NFPROTO_IPV6, skb, cp); 1097 IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
890 1098
891 LeaveFunction(10); 1099 LeaveFunction(10);
892 return NF_STOLEN; 1100 return NF_STOLEN;
@@ -912,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
912 struct rtable *rt; /* Route to the other host */ 1120 struct rtable *rt; /* Route to the other host */
913 int mtu; 1121 int mtu;
914 int rc; 1122 int rc;
1123 int local;
915 1124
916 EnterFunction(10); 1125 EnterFunction(10);
917 1126
@@ -932,16 +1141,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
932 * mangle and send the packet here (only for VS/NAT) 1141 * mangle and send the packet here (only for VS/NAT)
933 */ 1142 */
934 1143
935 if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos)))) 1144 if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
1145 RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
936 goto tx_error_icmp; 1146 goto tx_error_icmp;
1147 local = rt->rt_flags & RTCF_LOCAL;
1148
1149 /*
1150 * Avoid duplicate tuple in reply direction for NAT traffic
1151 * to local address when connection is sync-ed
1152 */
1153#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
1154 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1155 enum ip_conntrack_info ctinfo;
1156 struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
1157
1158 if (ct && !nf_ct_is_untracked(ct)) {
1159 IP_VS_DBG(10, "%s(): "
1160 "stopping DNAT to local address %pI4\n",
1161 __func__, &cp->daddr.ip);
1162 goto tx_error_put;
1163 }
1164 }
1165#endif
1166
1167 /* From world but DNAT to loopback address? */
1168 if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
1169 IP_VS_DBG(1, "%s(): "
1170 "stopping DNAT to loopback %pI4\n",
1171 __func__, &cp->daddr.ip);
1172 goto tx_error_put;
1173 }
937 1174
938 /* MTU checking */ 1175 /* MTU checking */
939 mtu = dst_mtu(&rt->dst); 1176 mtu = dst_mtu(&rt->dst);
940 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { 1177 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
941 ip_rt_put(rt);
942 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 1178 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
943 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1179 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
944 goto tx_error; 1180 goto tx_error_put;
945 } 1181 }
946 1182
947 /* copy-on-write the packet before mangling it */ 1183 /* copy-on-write the packet before mangling it */
@@ -951,16 +1187,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
951 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 1187 if (skb_cow(skb, rt->dst.dev->hard_header_len))
952 goto tx_error_put; 1188 goto tx_error_put;
953 1189
954 /* drop the old route when skb is not shared */
955 skb_dst_drop(skb);
956 skb_dst_set(skb, &rt->dst);
957
958 ip_vs_nat_icmp(skb, pp, cp, 0); 1190 ip_vs_nat_icmp(skb, pp, cp, 0);
959 1191
1192 if (!local) {
1193 /* drop the old route when skb is not shared */
1194 skb_dst_drop(skb);
1195 skb_dst_set(skb, &rt->dst);
1196 } else {
1197 ip_rt_put(rt);
1198 /*
1199 * Some IPv4 replies get local address from routes,
1200 * not from iph, so while we DNAT after routing
1201 * we need this second input/output route.
1202 */
1203 if (!__ip_vs_reroute_locally(skb))
1204 goto tx_error;
1205 }
1206
960 /* Another hack: avoid icmp_send in ip_fragment */ 1207 /* Another hack: avoid icmp_send in ip_fragment */
961 skb->local_df = 1; 1208 skb->local_df = 1;
962 1209
963 IP_VS_XMIT(NFPROTO_IPV4, skb, cp); 1210 IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
964 1211
965 rc = NF_STOLEN; 1212 rc = NF_STOLEN;
966 goto out; 1213 goto out;
@@ -986,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
986 struct rt6_info *rt; /* Route to the other host */ 1233 struct rt6_info *rt; /* Route to the other host */
987 int mtu; 1234 int mtu;
988 int rc; 1235 int rc;
1236 int local;
989 1237
990 EnterFunction(10); 1238 EnterFunction(10);
991 1239
@@ -1006,17 +1254,49 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1006 * mangle and send the packet here (only for VS/NAT) 1254 * mangle and send the packet here (only for VS/NAT)
1007 */ 1255 */
1008 1256
1009 rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); 1257 if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
1010 if (!rt) 1258 0, 1|2|4)))
1011 goto tx_error_icmp; 1259 goto tx_error_icmp;
1012 1260
1261 local = __ip_vs_is_local_route6(rt);
1262 /*
1263 * Avoid duplicate tuple in reply direction for NAT traffic
1264 * to local address when connection is sync-ed
1265 */
1266#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
1267 if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1268 enum ip_conntrack_info ctinfo;
1269 struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
1270
1271 if (ct && !nf_ct_is_untracked(ct)) {
1272 IP_VS_DBG(10, "%s(): "
1273 "stopping DNAT to local address %pI6\n",
1274 __func__, &cp->daddr.in6);
1275 goto tx_error_put;
1276 }
1277 }
1278#endif
1279
1280 /* From world but DNAT to loopback address? */
1281 if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
1282 ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
1283 IP_VS_DBG(1, "%s(): "
1284 "stopping DNAT to loopback %pI6\n",
1285 __func__, &cp->daddr.in6);
1286 goto tx_error_put;
1287 }
1288
1013 /* MTU checking */ 1289 /* MTU checking */
1014 mtu = dst_mtu(&rt->dst); 1290 mtu = dst_mtu(&rt->dst);
1015 if (skb->len > mtu) { 1291 if (skb->len > mtu) {
1016 dst_release(&rt->dst); 1292 if (!skb->dev) {
1293 struct net *net = dev_net(skb_dst(skb)->dev);
1294
1295 skb->dev = net->loopback_dev;
1296 }
1017 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1297 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1018 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1298 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1019 goto tx_error; 1299 goto tx_error_put;
1020 } 1300 }
1021 1301
1022 /* copy-on-write the packet before mangling it */ 1302 /* copy-on-write the packet before mangling it */
@@ -1026,16 +1306,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1026 if (skb_cow(skb, rt->dst.dev->hard_header_len)) 1306 if (skb_cow(skb, rt->dst.dev->hard_header_len))
1027 goto tx_error_put; 1307 goto tx_error_put;
1028 1308
1029 /* drop the old route when skb is not shared */
1030 skb_dst_drop(skb);
1031 skb_dst_set(skb, &rt->dst);
1032
1033 ip_vs_nat_icmp_v6(skb, pp, cp, 0); 1309 ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1034 1310
1311 if (!local || !skb->dev) {
1312 /* drop the old route when skb is not shared */
1313 skb_dst_drop(skb);
1314 skb_dst_set(skb, &rt->dst);
1315 } else {
1316 /* destined to loopback, do we need to change route? */
1317 dst_release(&rt->dst);
1318 }
1319
1035 /* Another hack: avoid icmp_send in ip_fragment */ 1320 /* Another hack: avoid icmp_send in ip_fragment */
1036 skb->local_df = 1; 1321 skb->local_df = 1;
1037 1322
1038 IP_VS_XMIT(NFPROTO_IPV6, skb, cp); 1323 IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
1039 1324
1040 rc = NF_STOLEN; 1325 rc = NF_STOLEN;
1041 goto out; 1326 goto out;