aboutsummaryrefslogtreecommitdiffstats
path: root/net/openvswitch
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-03-14 22:10:25 -0400
committerDavid S. Miller <davem@davemloft.net>2016-03-14 22:10:25 -0400
commit1cdba550555561201398f6eb81c52d5bc511f1ad (patch)
tree3b7e171cf656ec1c38301e7fd38ed7d7e322dc2b /net/openvswitch
parentacffb584cda7069b0c2c83045503ccd07516a891 (diff)
parente39365be031e37b229f745ea49db0b25e82436fa (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
Pablo Neira Ayuso says: ==================== Netfilter/IPVS/OVS updates for net-next The following patchset contains Netfilter/IPVS fixes and OVS NAT support, more specifically this batch is composed of: 1) Fix a crash in ipset when performing a parallel flush/dump with set:list type, from Jozsef Kadlecsik. 2) Make sure NFACCT_FILTER_* netlink attributes are in place before accessing them, from Phil Turnbull. 3) Check return error code from ip_vs_fill_iph_skb_off() in IPVS SIP helper, from Arnd Bergmann. 4) Add workaround to IPVS to reschedule existing connections to new destination server by dropping the packet and wait for retransmission of TCP syn packet, from Julian Anastasov. 5) Allow connection rescheduling in IPVS when in CLOSE state, also from Julian. 6) Fix wrong offset of SIP Call-ID in IPVS helper, from Marco Angaroni. 7) Validate IPSET_ATTR_ETHER netlink attribute length, from Jozsef. 8) Check match/targetinfo netlink attribute size in nft_compat, patch from Florian Westphal. 9) Check for integer overflow on 32-bit systems in x_tables, from Florian Westphal. Several patches from Jarno Rajahalme to prepare the introduction of NAT support to OVS based on the Netfilter infrastructure: 10) Schedule IP_CT_NEW_REPLY definition for removal in nf_conntrack_common.h. 11) Simplify checksumming recalculation in nf_nat. 12) Add comments to the openvswitch conntrack code, from Jarno. 13) Update the CT state key only after successful nf_conntrack_in() invocation. 14) Find existing conntrack entry after upcall. 15) Handle NF_REPEAT case due to templates in nf_conntrack_in(). 16) Call the conntrack helper functions once the conntrack has been confirmed. 17) And finally, add the NAT interface to OVS. The batch closes with: 18) Cleanup to use spin_unlock_wait() instead of spin_lock()/spin_unlock(), from Nicholas Mc Guire. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/openvswitch')
-rw-r--r--net/openvswitch/Kconfig3
-rw-r--r--net/openvswitch/conntrack.c660
-rw-r--r--net/openvswitch/conntrack.h3
3 files changed, 626 insertions, 40 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index cd5fd9d728a7..234a73344c6e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,7 +6,8 @@ config OPENVSWITCH
6 tristate "Open vSwitch" 6 tristate "Open vSwitch"
7 depends on INET 7 depends on INET
8 depends on !NF_CONNTRACK || \ 8 depends on !NF_CONNTRACK || \
9 (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) 9 (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
10 (!NF_NAT || NF_NAT)))
10 select LIBCRC32C 11 select LIBCRC32C
11 select MPLS 12 select MPLS
12 select NET_MPLS_GSO 13 select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ee6ff8ffc12d..dc5eb29fe7d6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/openvswitch.h> 15#include <linux/openvswitch.h>
16#include <linux/tcp.h>
17#include <linux/udp.h>
18#include <linux/sctp.h>
16#include <net/ip.h> 19#include <net/ip.h>
17#include <net/netfilter/nf_conntrack_core.h> 20#include <net/netfilter/nf_conntrack_core.h>
18#include <net/netfilter/nf_conntrack_helper.h> 21#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_labels.h> 22#include <net/netfilter/nf_conntrack_labels.h>
23#include <net/netfilter/nf_conntrack_seqadj.h>
20#include <net/netfilter/nf_conntrack_zones.h> 24#include <net/netfilter/nf_conntrack_zones.h>
21#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 25#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
22 26
27#ifdef CONFIG_NF_NAT_NEEDED
28#include <linux/netfilter/nf_nat.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_l3proto.h>
31#endif
32
23#include "datapath.h" 33#include "datapath.h"
24#include "conntrack.h" 34#include "conntrack.h"
25#include "flow.h" 35#include "flow.h"
26#include "flow_netlink.h" 36#include "flow_netlink.h"
27 37
28struct ovs_ct_len_tbl { 38struct ovs_ct_len_tbl {
29 size_t maxlen; 39 int maxlen;
30 size_t minlen; 40 int minlen;
31}; 41};
32 42
33/* Metadata mark for masked write to conntrack mark */ 43/* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
42 struct ovs_key_ct_labels mask; 52 struct ovs_key_ct_labels mask;
43}; 53};
44 54
55enum ovs_ct_nat {
56 OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */
57 OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
58 OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
59};
60
45/* Conntrack action context for execution. */ 61/* Conntrack action context for execution. */
46struct ovs_conntrack_info { 62struct ovs_conntrack_info {
47 struct nf_conntrack_helper *helper; 63 struct nf_conntrack_helper *helper;
48 struct nf_conntrack_zone zone; 64 struct nf_conntrack_zone zone;
49 struct nf_conn *ct; 65 struct nf_conn *ct;
50 u8 commit : 1; 66 u8 commit : 1;
67 u8 nat : 3; /* enum ovs_ct_nat */
51 u16 family; 68 u16 family;
52 struct md_mark mark; 69 struct md_mark mark;
53 struct md_labels labels; 70 struct md_labels labels;
71#ifdef CONFIG_NF_NAT_NEEDED
72 struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */
73#endif
54}; 74};
55 75
56static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 76static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -75,7 +95,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
75 switch (ctinfo) { 95 switch (ctinfo) {
76 case IP_CT_ESTABLISHED_REPLY: 96 case IP_CT_ESTABLISHED_REPLY:
77 case IP_CT_RELATED_REPLY: 97 case IP_CT_RELATED_REPLY:
78 case IP_CT_NEW_REPLY:
79 ct_state |= OVS_CS_F_REPLY_DIR; 98 ct_state |= OVS_CS_F_REPLY_DIR;
80 break; 99 break;
81 default: 100 default:
@@ -92,7 +111,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
92 ct_state |= OVS_CS_F_RELATED; 111 ct_state |= OVS_CS_F_RELATED;
93 break; 112 break;
94 case IP_CT_NEW: 113 case IP_CT_NEW:
95 case IP_CT_NEW_REPLY:
96 ct_state |= OVS_CS_F_NEW; 114 ct_state |= OVS_CS_F_NEW;
97 break; 115 break;
98 default: 116 default:
@@ -139,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
139 ovs_ct_get_labels(ct, &key->ct.labels); 157 ovs_ct_get_labels(ct, &key->ct.labels);
140} 158}
141 159
142/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has 160/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
143 * previously sent the packet to conntrack via the ct action. 161 * previously sent the packet to conntrack via the ct action. If
162 * 'keep_nat_flags' is true, the existing NAT flags are retained, else they are
163 * initialized from the connection status.
144 */ 164 */
145static void ovs_ct_update_key(const struct sk_buff *skb, 165static void ovs_ct_update_key(const struct sk_buff *skb,
146 const struct ovs_conntrack_info *info, 166 const struct ovs_conntrack_info *info,
147 struct sw_flow_key *key, bool post_ct) 167 struct sw_flow_key *key, bool post_ct,
168 bool keep_nat_flags)
148{ 169{
149 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 170 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
150 enum ip_conntrack_info ctinfo; 171 enum ip_conntrack_info ctinfo;
@@ -154,10 +175,22 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
154 ct = nf_ct_get(skb, &ctinfo); 175 ct = nf_ct_get(skb, &ctinfo);
155 if (ct) { 176 if (ct) {
156 state = ovs_ct_get_state(ctinfo); 177 state = ovs_ct_get_state(ctinfo);
178 /* All unconfirmed entries are NEW connections. */
157 if (!nf_ct_is_confirmed(ct)) 179 if (!nf_ct_is_confirmed(ct))
158 state |= OVS_CS_F_NEW; 180 state |= OVS_CS_F_NEW;
181 /* OVS persists the related flag for the duration of the
182 * connection.
183 */
159 if (ct->master) 184 if (ct->master)
160 state |= OVS_CS_F_RELATED; 185 state |= OVS_CS_F_RELATED;
186 if (keep_nat_flags) {
187 state |= key->ct.state & OVS_CS_F_NAT_MASK;
188 } else {
189 if (ct->status & IPS_SRC_NAT)
190 state |= OVS_CS_F_SRC_NAT;
191 if (ct->status & IPS_DST_NAT)
192 state |= OVS_CS_F_DST_NAT;
193 }
161 zone = nf_ct_zone(ct); 194 zone = nf_ct_zone(ct);
162 } else if (post_ct) { 195 } else if (post_ct) {
163 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; 196 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -167,9 +200,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
167 __ovs_ct_update_key(key, state, zone, ct); 200 __ovs_ct_update_key(key, state, zone, ct);
168} 201}
169 202
203/* This is called to initialize CT key fields possibly coming in from the local
204 * stack.
205 */
170void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) 206void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
171{ 207{
172 ovs_ct_update_key(skb, NULL, key, false); 208 ovs_ct_update_key(skb, NULL, key, false, false);
173} 209}
174 210
175int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) 211int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -201,7 +237,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
201 struct nf_conn *ct; 237 struct nf_conn *ct;
202 u32 new_mark; 238 u32 new_mark;
203 239
204
205 /* The connection could be invalid, in which case set_mark is no-op. */ 240 /* The connection could be invalid, in which case set_mark is no-op. */
206 ct = nf_ct_get(skb, &ctinfo); 241 ct = nf_ct_get(skb, &ctinfo);
207 if (!ct) 242 if (!ct)
@@ -259,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
259 enum ip_conntrack_info ctinfo; 294 enum ip_conntrack_info ctinfo;
260 unsigned int protoff; 295 unsigned int protoff;
261 struct nf_conn *ct; 296 struct nf_conn *ct;
297 int err;
262 298
263 ct = nf_ct_get(skb, &ctinfo); 299 ct = nf_ct_get(skb, &ctinfo);
264 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 300 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -295,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
295 return NF_DROP; 331 return NF_DROP;
296 } 332 }
297 333
298 return helper->help(skb, protoff, ct, ctinfo); 334 err = helper->help(skb, protoff, ct, ctinfo);
335 if (err != NF_ACCEPT)
336 return err;
337
338 /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
339 * FTP with NAT) adjusting the TCP payload size when mangling IP
340 * addresses and/or port numbers in the text-based control connection.
341 */
342 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
343 !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
344 return NF_DROP;
345 return NF_ACCEPT;
299} 346}
300 347
301/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero 348/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -352,14 +399,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
352 return __nf_ct_expect_find(net, zone, &tuple); 399 return __nf_ct_expect_find(net, zone, &tuple);
353} 400}
354 401
402/* This replicates logic from nf_conntrack_core.c that is not exported. */
403static enum ip_conntrack_info
404ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
405{
406 const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
407
408 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
409 return IP_CT_ESTABLISHED_REPLY;
410 /* Once we've had two way comms, always ESTABLISHED. */
411 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
412 return IP_CT_ESTABLISHED;
413 if (test_bit(IPS_EXPECTED_BIT, &ct->status))
414 return IP_CT_RELATED;
415 return IP_CT_NEW;
416}
417
418/* Find an existing connection which this packet belongs to without
419 * re-attributing statistics or modifying the connection state. This allows an
420 * skb->nfct lost due to an upcall to be recovered during actions execution.
421 *
422 * Must be called with rcu_read_lock.
423 *
424 * On success, populates skb->nfct and skb->nfctinfo, and returns the
425 * connection. Returns NULL if there is no existing entry.
426 */
427static struct nf_conn *
428ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
429 u8 l3num, struct sk_buff *skb)
430{
431 struct nf_conntrack_l3proto *l3proto;
432 struct nf_conntrack_l4proto *l4proto;
433 struct nf_conntrack_tuple tuple;
434 struct nf_conntrack_tuple_hash *h;
435 enum ip_conntrack_info ctinfo;
436 struct nf_conn *ct;
437 unsigned int dataoff;
438 u8 protonum;
439
440 l3proto = __nf_ct_l3proto_find(l3num);
441 if (!l3proto) {
442 pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
443 return NULL;
444 }
445 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
446 &protonum) <= 0) {
447 pr_debug("ovs_ct_find_existing: Can't get protonum\n");
448 return NULL;
449 }
450 l4proto = __nf_ct_l4proto_find(l3num, protonum);
451 if (!l4proto) {
452 pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
453 return NULL;
454 }
455 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
456 protonum, net, &tuple, l3proto, l4proto)) {
457 pr_debug("ovs_ct_find_existing: Can't get tuple\n");
458 return NULL;
459 }
460
461 /* look for tuple match */
462 h = nf_conntrack_find_get(net, zone, &tuple);
463 if (!h)
464 return NULL; /* Not found. */
465
466 ct = nf_ct_tuplehash_to_ctrack(h);
467
468 ctinfo = ovs_ct_get_info(h);
469 if (ctinfo == IP_CT_NEW) {
470 /* This should not happen. */
471 WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
472 }
473 skb->nfct = &ct->ct_general;
474 skb->nfctinfo = ctinfo;
475 return ct;
476}
477
355/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ 478/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
356static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, 479static bool skb_nfct_cached(struct net *net,
357 const struct ovs_conntrack_info *info) 480 const struct sw_flow_key *key,
481 const struct ovs_conntrack_info *info,
482 struct sk_buff *skb)
358{ 483{
359 enum ip_conntrack_info ctinfo; 484 enum ip_conntrack_info ctinfo;
360 struct nf_conn *ct; 485 struct nf_conn *ct;
361 486
362 ct = nf_ct_get(skb, &ctinfo); 487 ct = nf_ct_get(skb, &ctinfo);
488 /* If no ct, check if we have evidence that an existing conntrack entry
489 * might be found for this skb. This happens when we lose a skb->nfct
490 * due to an upcall. If the connection was not confirmed, it is not
491 * cached and needs to be run through conntrack again.
492 */
493 if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
494 !(key->ct.state & OVS_CS_F_INVALID) &&
495 key->ct.zone == info->zone.id)
496 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
363 if (!ct) 497 if (!ct)
364 return false; 498 return false;
365 if (!net_eq(net, read_pnet(&ct->ct_net))) 499 if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -377,6 +511,206 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
377 return true; 511 return true;
378} 512}
379 513
514#ifdef CONFIG_NF_NAT_NEEDED
515/* Modelled after nf_nat_ipv[46]_fn().
516 * range is only used for new, uninitialized NAT state.
517 * Returns either NF_ACCEPT or NF_DROP.
518 */
519static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
520 enum ip_conntrack_info ctinfo,
521 const struct nf_nat_range *range,
522 enum nf_nat_manip_type maniptype)
523{
524 int hooknum, nh_off, err = NF_ACCEPT;
525
526 nh_off = skb_network_offset(skb);
527 skb_pull(skb, nh_off);
528
529 /* See HOOK2MANIP(). */
530 if (maniptype == NF_NAT_MANIP_SRC)
531 hooknum = NF_INET_LOCAL_IN; /* Source NAT */
532 else
533 hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
534
535 switch (ctinfo) {
536 case IP_CT_RELATED:
537 case IP_CT_RELATED_REPLY:
538 if (skb->protocol == htons(ETH_P_IP) &&
539 ip_hdr(skb)->protocol == IPPROTO_ICMP) {
540 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
541 hooknum))
542 err = NF_DROP;
543 goto push;
544#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
545 } else if (skb->protocol == htons(ETH_P_IPV6)) {
546 __be16 frag_off;
547 u8 nexthdr = ipv6_hdr(skb)->nexthdr;
548 int hdrlen = ipv6_skip_exthdr(skb,
549 sizeof(struct ipv6hdr),
550 &nexthdr, &frag_off);
551
552 if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
553 if (!nf_nat_icmpv6_reply_translation(skb, ct,
554 ctinfo,
555 hooknum,
556 hdrlen))
557 err = NF_DROP;
558 goto push;
559 }
560#endif
561 }
562 /* Non-ICMP, fall thru to initialize if needed. */
563 case IP_CT_NEW:
564 /* Seen it before? This can happen for loopback, retrans,
565 * or local packets.
566 */
567 if (!nf_nat_initialized(ct, maniptype)) {
568 /* Initialize according to the NAT action. */
569 err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
570 /* Action is set up to establish a new
571 * mapping.
572 */
573 ? nf_nat_setup_info(ct, range, maniptype)
574 : nf_nat_alloc_null_binding(ct, hooknum);
575 if (err != NF_ACCEPT)
576 goto push;
577 }
578 break;
579
580 case IP_CT_ESTABLISHED:
581 case IP_CT_ESTABLISHED_REPLY:
582 break;
583
584 default:
585 err = NF_DROP;
586 goto push;
587 }
588
589 err = nf_nat_packet(ct, ctinfo, hooknum, skb);
590push:
591 skb_push(skb, nh_off);
592
593 return err;
594}
595
596static void ovs_nat_update_key(struct sw_flow_key *key,
597 const struct sk_buff *skb,
598 enum nf_nat_manip_type maniptype)
599{
600 if (maniptype == NF_NAT_MANIP_SRC) {
601 __be16 src;
602
603 key->ct.state |= OVS_CS_F_SRC_NAT;
604 if (key->eth.type == htons(ETH_P_IP))
605 key->ipv4.addr.src = ip_hdr(skb)->saddr;
606 else if (key->eth.type == htons(ETH_P_IPV6))
607 memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
608 sizeof(key->ipv6.addr.src));
609 else
610 return;
611
612 if (key->ip.proto == IPPROTO_UDP)
613 src = udp_hdr(skb)->source;
614 else if (key->ip.proto == IPPROTO_TCP)
615 src = tcp_hdr(skb)->source;
616 else if (key->ip.proto == IPPROTO_SCTP)
617 src = sctp_hdr(skb)->source;
618 else
619 return;
620
621 key->tp.src = src;
622 } else {
623 __be16 dst;
624
625 key->ct.state |= OVS_CS_F_DST_NAT;
626 if (key->eth.type == htons(ETH_P_IP))
627 key->ipv4.addr.dst = ip_hdr(skb)->daddr;
628 else if (key->eth.type == htons(ETH_P_IPV6))
629 memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
630 sizeof(key->ipv6.addr.dst));
631 else
632 return;
633
634 if (key->ip.proto == IPPROTO_UDP)
635 dst = udp_hdr(skb)->dest;
636 else if (key->ip.proto == IPPROTO_TCP)
637 dst = tcp_hdr(skb)->dest;
638 else if (key->ip.proto == IPPROTO_SCTP)
639 dst = sctp_hdr(skb)->dest;
640 else
641 return;
642
643 key->tp.dst = dst;
644 }
645}
646
647/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
648static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
649 const struct ovs_conntrack_info *info,
650 struct sk_buff *skb, struct nf_conn *ct,
651 enum ip_conntrack_info ctinfo)
652{
653 enum nf_nat_manip_type maniptype;
654 int err;
655
656 if (nf_ct_is_untracked(ct)) {
657 /* A NAT action may only be performed on tracked packets. */
658 return NF_ACCEPT;
659 }
660
661 /* Add NAT extension if not confirmed yet. */
662 if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
663 return NF_ACCEPT; /* Can't NAT. */
664
665 /* Determine NAT type.
666 * Check if the NAT type can be deduced from the tracked connection.
667 * Make sure expected traffic is NATted only when committing.
668 */
669 if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
670 ct->status & IPS_NAT_MASK &&
671 (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) {
672 /* NAT an established or related connection like before. */
673 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
674 /* This is the REPLY direction for a connection
675 * for which NAT was applied in the forward
676 * direction. Do the reverse NAT.
677 */
678 maniptype = ct->status & IPS_SRC_NAT
679 ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
680 else
681 maniptype = ct->status & IPS_SRC_NAT
682 ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
683 } else if (info->nat & OVS_CT_SRC_NAT) {
684 maniptype = NF_NAT_MANIP_SRC;
685 } else if (info->nat & OVS_CT_DST_NAT) {
686 maniptype = NF_NAT_MANIP_DST;
687 } else {
688 return NF_ACCEPT; /* Connection is not NATed. */
689 }
690 err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
691
692 /* Mark NAT done if successful and update the flow key. */
693 if (err == NF_ACCEPT)
694 ovs_nat_update_key(key, skb, maniptype);
695
696 return err;
697}
698#else /* !CONFIG_NF_NAT_NEEDED */
699static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
700 const struct ovs_conntrack_info *info,
701 struct sk_buff *skb, struct nf_conn *ct,
702 enum ip_conntrack_info ctinfo)
703{
704 return NF_ACCEPT;
705}
706#endif
707
708/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
709 * not done already. Update key with new CT state after passing the packet
710 * through conntrack.
711 * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
712 * set to NULL and 0 will be returned.
713 */
380static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 714static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
381 const struct ovs_conntrack_info *info, 715 const struct ovs_conntrack_info *info,
382 struct sk_buff *skb) 716 struct sk_buff *skb)
@@ -386,8 +720,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
386 * actually run the packet through conntrack twice unless it's for a 720 * actually run the packet through conntrack twice unless it's for a
387 * different zone. 721 * different zone.
388 */ 722 */
389 if (!skb_nfct_cached(net, skb, info)) { 723 bool cached = skb_nfct_cached(net, key, info, skb);
724 enum ip_conntrack_info ctinfo;
725 struct nf_conn *ct;
726
727 if (!cached) {
390 struct nf_conn *tmpl = info->ct; 728 struct nf_conn *tmpl = info->ct;
729 int err;
391 730
392 /* Associate skb with specified zone. */ 731 /* Associate skb with specified zone. */
393 if (tmpl) { 732 if (tmpl) {
@@ -398,17 +737,53 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
398 skb->nfctinfo = IP_CT_NEW; 737 skb->nfctinfo = IP_CT_NEW;
399 } 738 }
400 739
401 if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, 740 /* Repeat if requested, see nf_iterate(). */
402 skb) != NF_ACCEPT) 741 do {
742 err = nf_conntrack_in(net, info->family,
743 NF_INET_PRE_ROUTING, skb);
744 } while (err == NF_REPEAT);
745
746 if (err != NF_ACCEPT)
403 return -ENOENT; 747 return -ENOENT;
404 748
405 if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { 749 /* Clear CT state NAT flags to mark that we have not yet done
406 WARN_ONCE(1, "helper rejected packet"); 750 * NAT after the nf_conntrack_in() call. We can actually clear
751 * the whole state, as it will be re-initialized below.
752 */
753 key->ct.state = 0;
754
755 /* Update the key, but keep the NAT flags. */
756 ovs_ct_update_key(skb, info, key, true, true);
757 }
758
759 ct = nf_ct_get(skb, &ctinfo);
760 if (ct) {
761 /* Packets starting a new connection must be NATted before the
762 * helper, so that the helper knows about the NAT. We enforce
763 * this by delaying both NAT and helper calls for unconfirmed
764 * connections until the committing CT action. For later
765 * packets NAT and Helper may be called in either order.
766 *
767 * NAT will be done only if the CT action has NAT, and only
768 * once per packet (per zone), as guarded by the NAT bits in
769 * the key->ct.state.
770 */
771 if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
772 (nf_ct_is_confirmed(ct) || info->commit) &&
773 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
407 return -EINVAL; 774 return -EINVAL;
408 } 775 }
409 }
410 776
411 ovs_ct_update_key(skb, info, key, true); 777 /* Call the helper only if:
778 * - nf_conntrack_in() was executed above ("!cached") for a
779 * confirmed connection, or
780 * - When committing an unconfirmed connection.
781 */
782 if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
783 ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
784 return -EINVAL;
785 }
786 }
412 787
413 return 0; 788 return 0;
414} 789}
@@ -420,19 +795,24 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
420{ 795{
421 struct nf_conntrack_expect *exp; 796 struct nf_conntrack_expect *exp;
422 797
798 /* If we pass an expected packet through nf_conntrack_in() the
799 * expectation is typically removed, but the packet could still be
800 * lost in upcall processing. To prevent this from happening we
801 * perform an explicit expectation lookup. Expected connections are
802 * always new, and will be passed through conntrack only when they are
803 * committed, as it is OK to remove the expectation at that time.
804 */
423 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); 805 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
424 if (exp) { 806 if (exp) {
425 u8 state; 807 u8 state;
426 808
809 /* NOTE: New connections are NATted and Helped only when
810 * committed, so we are not calling into NAT here.
811 */
427 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; 812 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
428 __ovs_ct_update_key(key, state, &info->zone, exp->master); 813 __ovs_ct_update_key(key, state, &info->zone, exp->master);
429 } else { 814 } else
430 int err; 815 return __ovs_ct_lookup(net, key, info, skb);
431
432 err = __ovs_ct_lookup(net, key, info, skb);
433 if (err)
434 return err;
435 }
436 816
437 return 0; 817 return 0;
438} 818}
@@ -442,21 +822,12 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
442 const struct ovs_conntrack_info *info, 822 const struct ovs_conntrack_info *info,
443 struct sk_buff *skb) 823 struct sk_buff *skb)
444{ 824{
445 u8 state;
446 int err; 825 int err;
447 826
448 state = key->ct.state;
449 if (key->ct.zone == info->zone.id &&
450 ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
451 /* Previous lookup has shown that this connection is already
452 * tracked and committed. Skip committing.
453 */
454 return 0;
455 }
456
457 err = __ovs_ct_lookup(net, key, info, skb); 827 err = __ovs_ct_lookup(net, key, info, skb);
458 if (err) 828 if (err)
459 return err; 829 return err;
830 /* This is a no-op if the connection has already been confirmed. */
460 if (nf_conntrack_confirm(skb) != NF_ACCEPT) 831 if (nf_conntrack_confirm(skb) != NF_ACCEPT)
461 return -EINVAL; 832 return -EINVAL;
462 833
@@ -541,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
541 return 0; 912 return 0;
542} 913}
543 914
915#ifdef CONFIG_NF_NAT_NEEDED
916static int parse_nat(const struct nlattr *attr,
917 struct ovs_conntrack_info *info, bool log)
918{
919 struct nlattr *a;
920 int rem;
921 bool have_ip_max = false;
922 bool have_proto_max = false;
923 bool ip_vers = (info->family == NFPROTO_IPV6);
924
925 nla_for_each_nested(a, attr, rem) {
926 static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
927 [OVS_NAT_ATTR_SRC] = {0, 0},
928 [OVS_NAT_ATTR_DST] = {0, 0},
929 [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
930 sizeof(struct in6_addr)},
931 [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
932 sizeof(struct in6_addr)},
933 [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
934 [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
935 [OVS_NAT_ATTR_PERSISTENT] = {0, 0},
936 [OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
937 [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
938 };
939 int type = nla_type(a);
940
941 if (type > OVS_NAT_ATTR_MAX) {
942 OVS_NLERR(log,
943 "Unknown NAT attribute (type=%d, max=%d).\n",
944 type, OVS_NAT_ATTR_MAX);
945 return -EINVAL;
946 }
947
948 if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
949 OVS_NLERR(log,
950 "NAT attribute type %d has unexpected length (%d != %d).\n",
951 type, nla_len(a),
952 ovs_nat_attr_lens[type][ip_vers]);
953 return -EINVAL;
954 }
955
956 switch (type) {
957 case OVS_NAT_ATTR_SRC:
958 case OVS_NAT_ATTR_DST:
959 if (info->nat) {
960 OVS_NLERR(log,
961 "Only one type of NAT may be specified.\n"
962 );
963 return -ERANGE;
964 }
965 info->nat |= OVS_CT_NAT;
966 info->nat |= ((type == OVS_NAT_ATTR_SRC)
967 ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
968 break;
969
970 case OVS_NAT_ATTR_IP_MIN:
971 nla_memcpy(&info->range.min_addr, a, nla_len(a));
972 info->range.flags |= NF_NAT_RANGE_MAP_IPS;
973 break;
974
975 case OVS_NAT_ATTR_IP_MAX:
976 have_ip_max = true;
977 nla_memcpy(&info->range.max_addr, a,
978 sizeof(info->range.max_addr));
979 info->range.flags |= NF_NAT_RANGE_MAP_IPS;
980 break;
981
982 case OVS_NAT_ATTR_PROTO_MIN:
983 info->range.min_proto.all = htons(nla_get_u16(a));
984 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
985 break;
986
987 case OVS_NAT_ATTR_PROTO_MAX:
988 have_proto_max = true;
989 info->range.max_proto.all = htons(nla_get_u16(a));
990 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
991 break;
992
993 case OVS_NAT_ATTR_PERSISTENT:
994 info->range.flags |= NF_NAT_RANGE_PERSISTENT;
995 break;
996
997 case OVS_NAT_ATTR_PROTO_HASH:
998 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
999 break;
1000
1001 case OVS_NAT_ATTR_PROTO_RANDOM:
1002 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
1003 break;
1004
1005 default:
1006 OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
1007 return -EINVAL;
1008 }
1009 }
1010
1011 if (rem > 0) {
1012 OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
1013 return -EINVAL;
1014 }
1015 if (!info->nat) {
1016 /* Do not allow flags if no type is given. */
1017 if (info->range.flags) {
1018 OVS_NLERR(log,
1019 "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
1020 );
1021 return -EINVAL;
1022 }
1023 info->nat = OVS_CT_NAT; /* NAT existing connections. */
1024 } else if (!info->commit) {
1025 OVS_NLERR(log,
1026 "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
1027 );
1028 return -EINVAL;
1029 }
1030 /* Allow missing IP_MAX. */
1031 if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
1032 memcpy(&info->range.max_addr, &info->range.min_addr,
1033 sizeof(info->range.max_addr));
1034 }
1035 /* Allow missing PROTO_MAX. */
1036 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
1037 !have_proto_max) {
1038 info->range.max_proto.all = info->range.min_proto.all;
1039 }
1040 return 0;
1041}
1042#endif
1043
544static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { 1044static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
545 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1045 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
546 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), 1046 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
@@ -550,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
550 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), 1050 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels),
551 .maxlen = sizeof(struct md_labels) }, 1051 .maxlen = sizeof(struct md_labels) },
552 [OVS_CT_ATTR_HELPER] = { .minlen = 1, 1052 [OVS_CT_ATTR_HELPER] = { .minlen = 1,
553 .maxlen = NF_CT_HELPER_NAME_LEN } 1053 .maxlen = NF_CT_HELPER_NAME_LEN },
1054#ifdef CONFIG_NF_NAT_NEEDED
1055 /* NAT length is checked when parsing the nested attributes. */
1056 [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
1057#endif
554}; 1058};
555 1059
556static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, 1060static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -617,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
617 return -EINVAL; 1121 return -EINVAL;
618 } 1122 }
619 break; 1123 break;
1124#ifdef CONFIG_NF_NAT_NEEDED
1125 case OVS_CT_ATTR_NAT: {
1126 int err = parse_nat(a, info, log);
1127
1128 if (err)
1129 return err;
1130 break;
1131 }
1132#endif
620 default: 1133 default:
621 OVS_NLERR(log, "Unknown conntrack attr (%d)", 1134 OVS_NLERR(log, "Unknown conntrack attr (%d)",
622 type); 1135 type);
@@ -704,6 +1217,74 @@ err_free_ct:
704 return err; 1217 return err;
705} 1218}
706 1219
1220#ifdef CONFIG_NF_NAT_NEEDED
1221static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
1222 struct sk_buff *skb)
1223{
1224 struct nlattr *start;
1225
1226 start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
1227 if (!start)
1228 return false;
1229
1230 if (info->nat & OVS_CT_SRC_NAT) {
1231 if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
1232 return false;
1233 } else if (info->nat & OVS_CT_DST_NAT) {
1234 if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
1235 return false;
1236 } else {
1237 goto out;
1238 }
1239
1240 if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
1241 if (info->family == NFPROTO_IPV4) {
1242 if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
1243 info->range.min_addr.ip) ||
1244 (info->range.max_addr.ip
1245 != info->range.min_addr.ip &&
1246 (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
1247 info->range.max_addr.ip))))
1248 return false;
1249#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
1250 } else if (info->family == NFPROTO_IPV6) {
1251 if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
1252 &info->range.min_addr.in6) ||
1253 (memcmp(&info->range.max_addr.in6,
1254 &info->range.min_addr.in6,
1255 sizeof(info->range.max_addr.in6)) &&
1256 (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
1257 &info->range.max_addr.in6))))
1258 return false;
1259#endif
1260 } else {
1261 return false;
1262 }
1263 }
1264 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
1265 (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
1266 ntohs(info->range.min_proto.all)) ||
1267 (info->range.max_proto.all != info->range.min_proto.all &&
1268 nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
1269 ntohs(info->range.max_proto.all)))))
1270 return false;
1271
1272 if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
1273 nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
1274 return false;
1275 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
1276 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
1277 return false;
1278 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
1279 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
1280 return false;
1281out:
1282 nla_nest_end(skb, start);
1283
1284 return true;
1285}
1286#endif
1287
707int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, 1288int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
708 struct sk_buff *skb) 1289 struct sk_buff *skb)
709{ 1290{
@@ -732,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
732 ct_info->helper->name)) 1313 ct_info->helper->name))
733 return -EMSGSIZE; 1314 return -EMSGSIZE;
734 } 1315 }
735 1316#ifdef CONFIG_NF_NAT_NEEDED
1317 if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
1318 return -EMSGSIZE;
1319#endif
736 nla_nest_end(skb, start); 1320 nla_nest_end(skb, start);
737 1321
738 return 0; 1322 return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
37 37
38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ 38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
39 OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ 39 OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
40 OVS_CS_F_INVALID | OVS_CS_F_TRACKED) 40 OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
41 OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
41#else 42#else
42#include <linux/errno.h> 43#include <linux/errno.h>
43 44