about summary refs log tree commit diff stats
path: root/net/openvswitch
diff options
context:
space:
mode:
Diffstat (limited to 'net/openvswitch')
-rw-r--r-- net/openvswitch/Kconfig 6
-rw-r--r-- net/openvswitch/actions.c 12
-rw-r--r-- net/openvswitch/conntrack.c 663
-rw-r--r-- net/openvswitch/conntrack.h 3
-rw-r--r-- net/openvswitch/datapath.c 108
-rw-r--r-- net/openvswitch/datapath.h 4
-rw-r--r-- net/openvswitch/flow.h 2
-rw-r--r-- net/openvswitch/flow_netlink.c 9
-rw-r--r-- net/openvswitch/vport-geneve.c 2
-rw-r--r-- net/openvswitch/vport-internal_dev.c 10
-rw-r--r-- net/openvswitch/vport-netdev.c 2
-rw-r--r-- net/openvswitch/vport-vxlan.c 2
-rw-r--r-- net/openvswitch/vport.h 7
13 files changed, 735 insertions, 95 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index d143aa9f6654..ce947292ae77 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,10 +6,14 @@ config OPENVSWITCH
6 tristate "Open vSwitch" 6 tristate "Open vSwitch"
7 depends on INET 7 depends on INET
8 depends on !NF_CONNTRACK || \ 8 depends on !NF_CONNTRACK || \
9 (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) 9 (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
10 (!NF_NAT || NF_NAT) && \
11 (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
12 (!NF_NAT_IPV6 || NF_NAT_IPV6)))
10 select LIBCRC32C 13 select LIBCRC32C
11 select MPLS 14 select MPLS
12 select NET_MPLS_GSO 15 select NET_MPLS_GSO
16 select DST_CACHE
13 ---help--- 17 ---help---
14 Open vSwitch is a multilayer Ethernet switch targeted at virtualized 18 Open vSwitch is a multilayer Ethernet switch targeted at virtualized
15 environments. In addition to supporting a variety of features 19 environments. In addition to supporting a variety of features
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2d59df521915..879185fe183f 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
158 new_mpls_lse = (__be32 *)skb_mpls_header(skb); 158 new_mpls_lse = (__be32 *)skb_mpls_header(skb);
159 *new_mpls_lse = mpls->mpls_lse; 159 *new_mpls_lse = mpls->mpls_lse;
160 160
161 if (skb->ip_summed == CHECKSUM_COMPLETE) 161 skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
162 skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
163 MPLS_HLEN, 0));
164 162
165 hdr = eth_hdr(skb); 163 hdr = eth_hdr(skb);
166 hdr->h_proto = mpls->mpls_ethertype; 164 hdr->h_proto = mpls->mpls_ethertype;
@@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
280 ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, 278 ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
281 mask->eth_dst); 279 mask->eth_dst);
282 280
283 ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); 281 skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
284 282
285 ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); 283 ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
286 ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); 284 ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
@@ -463,7 +461,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
463 mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked); 461 mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked);
464 462
465 if (unlikely(memcmp(saddr, masked, sizeof(masked)))) { 463 if (unlikely(memcmp(saddr, masked, sizeof(masked)))) {
466 set_ipv6_addr(skb, key->ipv6_proto, saddr, masked, 464 set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked,
467 true); 465 true);
468 memcpy(&flow_key->ipv6.addr.src, masked, 466 memcpy(&flow_key->ipv6.addr.src, masked,
469 sizeof(flow_key->ipv6.addr.src)); 467 sizeof(flow_key->ipv6.addr.src));
@@ -485,7 +483,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
485 NULL, &flags) 483 NULL, &flags)
486 != NEXTHDR_ROUTING); 484 != NEXTHDR_ROUTING);
487 485
488 set_ipv6_addr(skb, key->ipv6_proto, daddr, masked, 486 set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked,
489 recalc_csum); 487 recalc_csum);
490 memcpy(&flow_key->ipv6.addr.dst, masked, 488 memcpy(&flow_key->ipv6.addr.dst, masked,
491 sizeof(flow_key->ipv6.addr.dst)); 489 sizeof(flow_key->ipv6.addr.dst));
@@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
639 /* Reconstruct the MAC header. */ 637 /* Reconstruct the MAC header. */
640 skb_push(skb, data->l2_len); 638 skb_push(skb, data->l2_len);
641 memcpy(skb->data, &data->l2_data, data->l2_len); 639 memcpy(skb->data, &data->l2_data, data->l2_len);
642 ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); 640 skb_postpush_rcsum(skb, skb->data, data->l2_len);
643 skb_reset_mac_header(skb); 641 skb_reset_mac_header(skb);
644 642
645 ovs_vport_send(vport, skb); 643 ovs_vport_send(vport, skb);
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ee6ff8ffc12d..b5fea1101faa 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/openvswitch.h> 15#include <linux/openvswitch.h>
16#include <linux/tcp.h>
17#include <linux/udp.h>
18#include <linux/sctp.h>
16#include <net/ip.h> 19#include <net/ip.h>
17#include <net/netfilter/nf_conntrack_core.h> 20#include <net/netfilter/nf_conntrack_core.h>
18#include <net/netfilter/nf_conntrack_helper.h> 21#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_labels.h> 22#include <net/netfilter/nf_conntrack_labels.h>
23#include <net/netfilter/nf_conntrack_seqadj.h>
20#include <net/netfilter/nf_conntrack_zones.h> 24#include <net/netfilter/nf_conntrack_zones.h>
21#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 25#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
22 26
27#ifdef CONFIG_NF_NAT_NEEDED
28#include <linux/netfilter/nf_nat.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_l3proto.h>
31#endif
32
23#include "datapath.h" 33#include "datapath.h"
24#include "conntrack.h" 34#include "conntrack.h"
25#include "flow.h" 35#include "flow.h"
26#include "flow_netlink.h" 36#include "flow_netlink.h"
27 37
28struct ovs_ct_len_tbl { 38struct ovs_ct_len_tbl {
29 size_t maxlen; 39 int maxlen;
30 size_t minlen; 40 int minlen;
31}; 41};
32 42
33/* Metadata mark for masked write to conntrack mark */ 43/* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
42 struct ovs_key_ct_labels mask; 52 struct ovs_key_ct_labels mask;
43}; 53};
44 54
55enum ovs_ct_nat {
56 OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */
57 OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
58 OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
59};
60
45/* Conntrack action context for execution. */ 61/* Conntrack action context for execution. */
46struct ovs_conntrack_info { 62struct ovs_conntrack_info {
47 struct nf_conntrack_helper *helper; 63 struct nf_conntrack_helper *helper;
48 struct nf_conntrack_zone zone; 64 struct nf_conntrack_zone zone;
49 struct nf_conn *ct; 65 struct nf_conn *ct;
50 u8 commit : 1; 66 u8 commit : 1;
67 u8 nat : 3; /* enum ovs_ct_nat */
51 u16 family; 68 u16 family;
52 struct md_mark mark; 69 struct md_mark mark;
53 struct md_labels labels; 70 struct md_labels labels;
71#ifdef CONFIG_NF_NAT_NEEDED
72 struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */
73#endif
54}; 74};
55 75
56static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 76static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -75,7 +95,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
75 switch (ctinfo) { 95 switch (ctinfo) {
76 case IP_CT_ESTABLISHED_REPLY: 96 case IP_CT_ESTABLISHED_REPLY:
77 case IP_CT_RELATED_REPLY: 97 case IP_CT_RELATED_REPLY:
78 case IP_CT_NEW_REPLY:
79 ct_state |= OVS_CS_F_REPLY_DIR; 98 ct_state |= OVS_CS_F_REPLY_DIR;
80 break; 99 break;
81 default: 100 default:
@@ -92,7 +111,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
92 ct_state |= OVS_CS_F_RELATED; 111 ct_state |= OVS_CS_F_RELATED;
93 break; 112 break;
94 case IP_CT_NEW: 113 case IP_CT_NEW:
95 case IP_CT_NEW_REPLY:
96 ct_state |= OVS_CS_F_NEW; 114 ct_state |= OVS_CS_F_NEW;
97 break; 115 break;
98 default: 116 default:
@@ -139,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
139 ovs_ct_get_labels(ct, &key->ct.labels); 157 ovs_ct_get_labels(ct, &key->ct.labels);
140} 158}
141 159
142/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has 160/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
143 * previously sent the packet to conntrack via the ct action. 161 * previously sent the packet to conntrack via the ct action. If
162 * 'keep_nat_flags' is true, the existing NAT flags are retained, else they are
163 * initialized from the connection status.
144 */ 164 */
145static void ovs_ct_update_key(const struct sk_buff *skb, 165static void ovs_ct_update_key(const struct sk_buff *skb,
146 const struct ovs_conntrack_info *info, 166 const struct ovs_conntrack_info *info,
147 struct sw_flow_key *key, bool post_ct) 167 struct sw_flow_key *key, bool post_ct,
168 bool keep_nat_flags)
148{ 169{
149 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 170 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
150 enum ip_conntrack_info ctinfo; 171 enum ip_conntrack_info ctinfo;
@@ -154,10 +175,22 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
154 ct = nf_ct_get(skb, &ctinfo); 175 ct = nf_ct_get(skb, &ctinfo);
155 if (ct) { 176 if (ct) {
156 state = ovs_ct_get_state(ctinfo); 177 state = ovs_ct_get_state(ctinfo);
178 /* All unconfirmed entries are NEW connections. */
157 if (!nf_ct_is_confirmed(ct)) 179 if (!nf_ct_is_confirmed(ct))
158 state |= OVS_CS_F_NEW; 180 state |= OVS_CS_F_NEW;
181 /* OVS persists the related flag for the duration of the
182 * connection.
183 */
159 if (ct->master) 184 if (ct->master)
160 state |= OVS_CS_F_RELATED; 185 state |= OVS_CS_F_RELATED;
186 if (keep_nat_flags) {
187 state |= key->ct.state & OVS_CS_F_NAT_MASK;
188 } else {
189 if (ct->status & IPS_SRC_NAT)
190 state |= OVS_CS_F_SRC_NAT;
191 if (ct->status & IPS_DST_NAT)
192 state |= OVS_CS_F_DST_NAT;
193 }
161 zone = nf_ct_zone(ct); 194 zone = nf_ct_zone(ct);
162 } else if (post_ct) { 195 } else if (post_ct) {
163 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; 196 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -167,9 +200,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
167 __ovs_ct_update_key(key, state, zone, ct); 200 __ovs_ct_update_key(key, state, zone, ct);
168} 201}
169 202
203/* This is called to initialize CT key fields possibly coming in from the local
204 * stack.
205 */
170void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) 206void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
171{ 207{
172 ovs_ct_update_key(skb, NULL, key, false); 208 ovs_ct_update_key(skb, NULL, key, false, false);
173} 209}
174 210
175int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) 211int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -201,7 +237,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
201 struct nf_conn *ct; 237 struct nf_conn *ct;
202 u32 new_mark; 238 u32 new_mark;
203 239
204
205 /* The connection could be invalid, in which case set_mark is no-op. */ 240 /* The connection could be invalid, in which case set_mark is no-op. */
206 ct = nf_ct_get(skb, &ctinfo); 241 ct = nf_ct_get(skb, &ctinfo);
207 if (!ct) 242 if (!ct)
@@ -259,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
259 enum ip_conntrack_info ctinfo; 294 enum ip_conntrack_info ctinfo;
260 unsigned int protoff; 295 unsigned int protoff;
261 struct nf_conn *ct; 296 struct nf_conn *ct;
297 int err;
262 298
263 ct = nf_ct_get(skb, &ctinfo); 299 ct = nf_ct_get(skb, &ctinfo);
264 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 300 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -295,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
295 return NF_DROP; 331 return NF_DROP;
296 } 332 }
297 333
298 return helper->help(skb, protoff, ct, ctinfo); 334 err = helper->help(skb, protoff, ct, ctinfo);
335 if (err != NF_ACCEPT)
336 return err;
337
338 /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
339 * FTP with NAT) adusting the TCP payload size when mangling IP
340 * addresses and/or port numbers in the text-based control connection.
341 */
342 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
343 !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
344 return NF_DROP;
345 return NF_ACCEPT;
299} 346}
300 347
301/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero 348/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -320,6 +367,7 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
320 } else if (key->eth.type == htons(ETH_P_IPV6)) { 367 } else if (key->eth.type == htons(ETH_P_IPV6)) {
321 enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; 368 enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
322 369
370 skb_orphan(skb);
323 memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); 371 memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
324 err = nf_ct_frag6_gather(net, skb, user); 372 err = nf_ct_frag6_gather(net, skb, user);
325 if (err) 373 if (err)
@@ -352,14 +400,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
352 return __nf_ct_expect_find(net, zone, &tuple); 400 return __nf_ct_expect_find(net, zone, &tuple);
353} 401}
354 402
403/* This replicates logic from nf_conntrack_core.c that is not exported. */
404static enum ip_conntrack_info
405ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
406{
407 const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
408
409 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
410 return IP_CT_ESTABLISHED_REPLY;
411 /* Once we've had two way comms, always ESTABLISHED. */
412 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
413 return IP_CT_ESTABLISHED;
414 if (test_bit(IPS_EXPECTED_BIT, &ct->status))
415 return IP_CT_RELATED;
416 return IP_CT_NEW;
417}
418
419/* Find an existing connection which this packet belongs to without
420 * re-attributing statistics or modifying the connection state. This allows an
421 * skb->nfct lost due to an upcall to be recovered during actions execution.
422 *
423 * Must be called with rcu_read_lock.
424 *
425 * On success, populates skb->nfct and skb->nfctinfo, and returns the
426 * connection. Returns NULL if there is no existing entry.
427 */
428static struct nf_conn *
429ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
430 u8 l3num, struct sk_buff *skb)
431{
432 struct nf_conntrack_l3proto *l3proto;
433 struct nf_conntrack_l4proto *l4proto;
434 struct nf_conntrack_tuple tuple;
435 struct nf_conntrack_tuple_hash *h;
436 enum ip_conntrack_info ctinfo;
437 struct nf_conn *ct;
438 unsigned int dataoff;
439 u8 protonum;
440
441 l3proto = __nf_ct_l3proto_find(l3num);
442 if (!l3proto) {
443 pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
444 return NULL;
445 }
446 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
447 &protonum) <= 0) {
448 pr_debug("ovs_ct_find_existing: Can't get protonum\n");
449 return NULL;
450 }
451 l4proto = __nf_ct_l4proto_find(l3num, protonum);
452 if (!l4proto) {
453 pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
454 return NULL;
455 }
456 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
457 protonum, net, &tuple, l3proto, l4proto)) {
458 pr_debug("ovs_ct_find_existing: Can't get tuple\n");
459 return NULL;
460 }
461
462 /* look for tuple match */
463 h = nf_conntrack_find_get(net, zone, &tuple);
464 if (!h)
465 return NULL; /* Not found. */
466
467 ct = nf_ct_tuplehash_to_ctrack(h);
468
469 ctinfo = ovs_ct_get_info(h);
470 if (ctinfo == IP_CT_NEW) {
471 /* This should not happen. */
472 WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
473 }
474 skb->nfct = &ct->ct_general;
475 skb->nfctinfo = ctinfo;
476 return ct;
477}
478
355/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ 479/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
356static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, 480static bool skb_nfct_cached(struct net *net,
357 const struct ovs_conntrack_info *info) 481 const struct sw_flow_key *key,
482 const struct ovs_conntrack_info *info,
483 struct sk_buff *skb)
358{ 484{
359 enum ip_conntrack_info ctinfo; 485 enum ip_conntrack_info ctinfo;
360 struct nf_conn *ct; 486 struct nf_conn *ct;
361 487
362 ct = nf_ct_get(skb, &ctinfo); 488 ct = nf_ct_get(skb, &ctinfo);
489 /* If no ct, check if we have evidence that an existing conntrack entry
490 * might be found for this skb. This happens when we lose a skb->nfct
491 * due to an upcall. If the connection was not confirmed, it is not
492 * cached and needs to be run through conntrack again.
493 */
494 if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
495 !(key->ct.state & OVS_CS_F_INVALID) &&
496 key->ct.zone == info->zone.id)
497 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
363 if (!ct) 498 if (!ct)
364 return false; 499 return false;
365 if (!net_eq(net, read_pnet(&ct->ct_net))) 500 if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -377,6 +512,207 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
377 return true; 512 return true;
378} 513}
379 514
515#ifdef CONFIG_NF_NAT_NEEDED
516/* Modelled after nf_nat_ipv[46]_fn().
517 * range is only used for new, uninitialized NAT state.
518 * Returns either NF_ACCEPT or NF_DROP.
519 */
520static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
521 enum ip_conntrack_info ctinfo,
522 const struct nf_nat_range *range,
523 enum nf_nat_manip_type maniptype)
524{
525 int hooknum, nh_off, err = NF_ACCEPT;
526
527 nh_off = skb_network_offset(skb);
528 skb_pull(skb, nh_off);
529
530 /* See HOOK2MANIP(). */
531 if (maniptype == NF_NAT_MANIP_SRC)
532 hooknum = NF_INET_LOCAL_IN; /* Source NAT */
533 else
534 hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
535
536 switch (ctinfo) {
537 case IP_CT_RELATED:
538 case IP_CT_RELATED_REPLY:
539 if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
540 skb->protocol == htons(ETH_P_IP) &&
541 ip_hdr(skb)->protocol == IPPROTO_ICMP) {
542 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
543 hooknum))
544 err = NF_DROP;
545 goto push;
546 } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
547 skb->protocol == htons(ETH_P_IPV6)) {
548 __be16 frag_off;
549 u8 nexthdr = ipv6_hdr(skb)->nexthdr;
550 int hdrlen = ipv6_skip_exthdr(skb,
551 sizeof(struct ipv6hdr),
552 &nexthdr, &frag_off);
553
554 if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
555 if (!nf_nat_icmpv6_reply_translation(skb, ct,
556 ctinfo,
557 hooknum,
558 hdrlen))
559 err = NF_DROP;
560 goto push;
561 }
562 }
563 /* Non-ICMP, fall thru to initialize if needed. */
564 case IP_CT_NEW:
565 /* Seen it before? This can happen for loopback, retrans,
566 * or local packets.
567 */
568 if (!nf_nat_initialized(ct, maniptype)) {
569 /* Initialize according to the NAT action. */
570 err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
571 /* Action is set up to establish a new
572 * mapping.
573 */
574 ? nf_nat_setup_info(ct, range, maniptype)
575 : nf_nat_alloc_null_binding(ct, hooknum);
576 if (err != NF_ACCEPT)
577 goto push;
578 }
579 break;
580
581 case IP_CT_ESTABLISHED:
582 case IP_CT_ESTABLISHED_REPLY:
583 break;
584
585 default:
586 err = NF_DROP;
587 goto push;
588 }
589
590 err = nf_nat_packet(ct, ctinfo, hooknum, skb);
591push:
592 skb_push(skb, nh_off);
593
594 return err;
595}
596
597static void ovs_nat_update_key(struct sw_flow_key *key,
598 const struct sk_buff *skb,
599 enum nf_nat_manip_type maniptype)
600{
601 if (maniptype == NF_NAT_MANIP_SRC) {
602 __be16 src;
603
604 key->ct.state |= OVS_CS_F_SRC_NAT;
605 if (key->eth.type == htons(ETH_P_IP))
606 key->ipv4.addr.src = ip_hdr(skb)->saddr;
607 else if (key->eth.type == htons(ETH_P_IPV6))
608 memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
609 sizeof(key->ipv6.addr.src));
610 else
611 return;
612
613 if (key->ip.proto == IPPROTO_UDP)
614 src = udp_hdr(skb)->source;
615 else if (key->ip.proto == IPPROTO_TCP)
616 src = tcp_hdr(skb)->source;
617 else if (key->ip.proto == IPPROTO_SCTP)
618 src = sctp_hdr(skb)->source;
619 else
620 return;
621
622 key->tp.src = src;
623 } else {
624 __be16 dst;
625
626 key->ct.state |= OVS_CS_F_DST_NAT;
627 if (key->eth.type == htons(ETH_P_IP))
628 key->ipv4.addr.dst = ip_hdr(skb)->daddr;
629 else if (key->eth.type == htons(ETH_P_IPV6))
630 memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
631 sizeof(key->ipv6.addr.dst));
632 else
633 return;
634
635 if (key->ip.proto == IPPROTO_UDP)
636 dst = udp_hdr(skb)->dest;
637 else if (key->ip.proto == IPPROTO_TCP)
638 dst = tcp_hdr(skb)->dest;
639 else if (key->ip.proto == IPPROTO_SCTP)
640 dst = sctp_hdr(skb)->dest;
641 else
642 return;
643
644 key->tp.dst = dst;
645 }
646}
647
648/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
649static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
650 const struct ovs_conntrack_info *info,
651 struct sk_buff *skb, struct nf_conn *ct,
652 enum ip_conntrack_info ctinfo)
653{
654 enum nf_nat_manip_type maniptype;
655 int err;
656
657 if (nf_ct_is_untracked(ct)) {
658 /* A NAT action may only be performed on tracked packets. */
659 return NF_ACCEPT;
660 }
661
662 /* Add NAT extension if not confirmed yet. */
663 if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
664 return NF_ACCEPT; /* Can't NAT. */
665
666 /* Determine NAT type.
667 * Check if the NAT type can be deduced from the tracked connection.
668 * Make sure new expected connections (IP_CT_RELATED) are NATted only
669 * when committing.
670 */
671 if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
672 ct->status & IPS_NAT_MASK &&
673 (ctinfo != IP_CT_RELATED || info->commit)) {
674 /* NAT an established or related connection like before. */
675 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
676 /* This is the REPLY direction for a connection
677 * for which NAT was applied in the forward
678 * direction. Do the reverse NAT.
679 */
680 maniptype = ct->status & IPS_SRC_NAT
681 ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
682 else
683 maniptype = ct->status & IPS_SRC_NAT
684 ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
685 } else if (info->nat & OVS_CT_SRC_NAT) {
686 maniptype = NF_NAT_MANIP_SRC;
687 } else if (info->nat & OVS_CT_DST_NAT) {
688 maniptype = NF_NAT_MANIP_DST;
689 } else {
690 return NF_ACCEPT; /* Connection is not NATed. */
691 }
692 err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
693
694 /* Mark NAT done if successful and update the flow key. */
695 if (err == NF_ACCEPT)
696 ovs_nat_update_key(key, skb, maniptype);
697
698 return err;
699}
700#else /* !CONFIG_NF_NAT_NEEDED */
701static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
702 const struct ovs_conntrack_info *info,
703 struct sk_buff *skb, struct nf_conn *ct,
704 enum ip_conntrack_info ctinfo)
705{
706 return NF_ACCEPT;
707}
708#endif
709
710/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
711 * not done already. Update key with new CT state after passing the packet
712 * through conntrack.
713 * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
714 * set to NULL and 0 will be returned.
715 */
380static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 716static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
381 const struct ovs_conntrack_info *info, 717 const struct ovs_conntrack_info *info,
382 struct sk_buff *skb) 718 struct sk_buff *skb)
@@ -386,8 +722,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
386 * actually run the packet through conntrack twice unless it's for a 722 * actually run the packet through conntrack twice unless it's for a
387 * different zone. 723 * different zone.
388 */ 724 */
389 if (!skb_nfct_cached(net, skb, info)) { 725 bool cached = skb_nfct_cached(net, key, info, skb);
726 enum ip_conntrack_info ctinfo;
727 struct nf_conn *ct;
728
729 if (!cached) {
390 struct nf_conn *tmpl = info->ct; 730 struct nf_conn *tmpl = info->ct;
731 int err;
391 732
392 /* Associate skb with specified zone. */ 733 /* Associate skb with specified zone. */
393 if (tmpl) { 734 if (tmpl) {
@@ -398,17 +739,53 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
398 skb->nfctinfo = IP_CT_NEW; 739 skb->nfctinfo = IP_CT_NEW;
399 } 740 }
400 741
401 if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, 742 /* Repeat if requested, see nf_iterate(). */
402 skb) != NF_ACCEPT) 743 do {
744 err = nf_conntrack_in(net, info->family,
745 NF_INET_PRE_ROUTING, skb);
746 } while (err == NF_REPEAT);
747
748 if (err != NF_ACCEPT)
403 return -ENOENT; 749 return -ENOENT;
404 750
405 if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { 751 /* Clear CT state NAT flags to mark that we have not yet done
406 WARN_ONCE(1, "helper rejected packet"); 752 * NAT after the nf_conntrack_in() call. We can actually clear
753 * the whole state, as it will be re-initialized below.
754 */
755 key->ct.state = 0;
756
757 /* Update the key, but keep the NAT flags. */
758 ovs_ct_update_key(skb, info, key, true, true);
759 }
760
761 ct = nf_ct_get(skb, &ctinfo);
762 if (ct) {
763 /* Packets starting a new connection must be NATted before the
764 * helper, so that the helper knows about the NAT. We enforce
765 * this by delaying both NAT and helper calls for unconfirmed
766 * connections until the committing CT action. For later
767 * packets NAT and Helper may be called in either order.
768 *
769 * NAT will be done only if the CT action has NAT, and only
770 * once per packet (per zone), as guarded by the NAT bits in
771 * the key->ct.state.
772 */
773 if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
774 (nf_ct_is_confirmed(ct) || info->commit) &&
775 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
407 return -EINVAL; 776 return -EINVAL;
408 } 777 }
409 }
410 778
411 ovs_ct_update_key(skb, info, key, true); 779 /* Call the helper only if:
780 * - nf_conntrack_in() was executed above ("!cached") for a
781 * confirmed connection, or
782 * - When committing an unconfirmed connection.
783 */
784 if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
785 ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
786 return -EINVAL;
787 }
788 }
412 789
413 return 0; 790 return 0;
414} 791}
@@ -420,19 +797,24 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
420{ 797{
421 struct nf_conntrack_expect *exp; 798 struct nf_conntrack_expect *exp;
422 799
800 /* If we pass an expected packet through nf_conntrack_in() the
801 * expectation is typically removed, but the packet could still be
802 * lost in upcall processing. To prevent this from happening we
803 * perform an explicit expectation lookup. Expected connections are
804 * always new, and will be passed through conntrack only when they are
805 * committed, as it is OK to remove the expectation at that time.
806 */
423 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); 807 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
424 if (exp) { 808 if (exp) {
425 u8 state; 809 u8 state;
426 810
811 /* NOTE: New connections are NATted and Helped only when
812 * committed, so we are not calling into NAT here.
813 */
427 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; 814 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
428 __ovs_ct_update_key(key, state, &info->zone, exp->master); 815 __ovs_ct_update_key(key, state, &info->zone, exp->master);
429 } else { 816 } else
430 int err; 817 return __ovs_ct_lookup(net, key, info, skb);
431
432 err = __ovs_ct_lookup(net, key, info, skb);
433 if (err)
434 return err;
435 }
436 818
437 return 0; 819 return 0;
438} 820}
@@ -442,21 +824,12 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
442 const struct ovs_conntrack_info *info, 824 const struct ovs_conntrack_info *info,
443 struct sk_buff *skb) 825 struct sk_buff *skb)
444{ 826{
445 u8 state;
446 int err; 827 int err;
447 828
448 state = key->ct.state;
449 if (key->ct.zone == info->zone.id &&
450 ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
451 /* Previous lookup has shown that this connection is already
452 * tracked and committed. Skip committing.
453 */
454 return 0;
455 }
456
457 err = __ovs_ct_lookup(net, key, info, skb); 829 err = __ovs_ct_lookup(net, key, info, skb);
458 if (err) 830 if (err)
459 return err; 831 return err;
832 /* This is a no-op if the connection has already been confirmed. */
460 if (nf_conntrack_confirm(skb) != NF_ACCEPT) 833 if (nf_conntrack_confirm(skb) != NF_ACCEPT)
461 return -EINVAL; 834 return -EINVAL;
462 835
@@ -541,6 +914,136 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
541 return 0; 914 return 0;
542} 915}
543 916
917#ifdef CONFIG_NF_NAT_NEEDED
918static int parse_nat(const struct nlattr *attr,
919 struct ovs_conntrack_info *info, bool log)
920{
921 struct nlattr *a;
922 int rem;
923 bool have_ip_max = false;
924 bool have_proto_max = false;
925 bool ip_vers = (info->family == NFPROTO_IPV6);
926
927 nla_for_each_nested(a, attr, rem) {
928 static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
929 [OVS_NAT_ATTR_SRC] = {0, 0},
930 [OVS_NAT_ATTR_DST] = {0, 0},
931 [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
932 sizeof(struct in6_addr)},
933 [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
934 sizeof(struct in6_addr)},
935 [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
936 [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
937 [OVS_NAT_ATTR_PERSISTENT] = {0, 0},
938 [OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
939 [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
940 };
941 int type = nla_type(a);
942
943 if (type > OVS_NAT_ATTR_MAX) {
944 OVS_NLERR(log,
945 "Unknown NAT attribute (type=%d, max=%d).\n",
946 type, OVS_NAT_ATTR_MAX);
947 return -EINVAL;
948 }
949
950 if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
951 OVS_NLERR(log,
952 "NAT attribute type %d has unexpected length (%d != %d).\n",
953 type, nla_len(a),
954 ovs_nat_attr_lens[type][ip_vers]);
955 return -EINVAL;
956 }
957
958 switch (type) {
959 case OVS_NAT_ATTR_SRC:
960 case OVS_NAT_ATTR_DST:
961 if (info->nat) {
962 OVS_NLERR(log,
963 "Only one type of NAT may be specified.\n"
964 );
965 return -ERANGE;
966 }
967 info->nat |= OVS_CT_NAT;
968 info->nat |= ((type == OVS_NAT_ATTR_SRC)
969 ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
970 break;
971
972 case OVS_NAT_ATTR_IP_MIN:
973 nla_memcpy(&info->range.min_addr, a,
974 sizeof(info->range.min_addr));
975 info->range.flags |= NF_NAT_RANGE_MAP_IPS;
976 break;
977
978 case OVS_NAT_ATTR_IP_MAX:
979 have_ip_max = true;
980 nla_memcpy(&info->range.max_addr, a,
981 sizeof(info->range.max_addr));
982 info->range.flags |= NF_NAT_RANGE_MAP_IPS;
983 break;
984
985 case OVS_NAT_ATTR_PROTO_MIN:
986 info->range.min_proto.all = htons(nla_get_u16(a));
987 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
988 break;
989
990 case OVS_NAT_ATTR_PROTO_MAX:
991 have_proto_max = true;
992 info->range.max_proto.all = htons(nla_get_u16(a));
993 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
994 break;
995
996 case OVS_NAT_ATTR_PERSISTENT:
997 info->range.flags |= NF_NAT_RANGE_PERSISTENT;
998 break;
999
1000 case OVS_NAT_ATTR_PROTO_HASH:
1001 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
1002 break;
1003
1004 case OVS_NAT_ATTR_PROTO_RANDOM:
1005 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
1006 break;
1007
1008 default:
1009 OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
1010 return -EINVAL;
1011 }
1012 }
1013
1014 if (rem > 0) {
1015 OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
1016 return -EINVAL;
1017 }
1018 if (!info->nat) {
1019 /* Do not allow flags if no type is given. */
1020 if (info->range.flags) {
1021 OVS_NLERR(log,
1022 "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
1023 );
1024 return -EINVAL;
1025 }
1026 info->nat = OVS_CT_NAT; /* NAT existing connections. */
1027 } else if (!info->commit) {
1028 OVS_NLERR(log,
1029 "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
1030 );
1031 return -EINVAL;
1032 }
1033 /* Allow missing IP_MAX. */
1034 if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
1035 memcpy(&info->range.max_addr, &info->range.min_addr,
1036 sizeof(info->range.max_addr));
1037 }
1038 /* Allow missing PROTO_MAX. */
1039 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
1040 !have_proto_max) {
1041 info->range.max_proto.all = info->range.min_proto.all;
1042 }
1043 return 0;
1044}
1045#endif
1046
544static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { 1047static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
545 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1048 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
546 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), 1049 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
@@ -550,7 +1053,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
550 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), 1053 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels),
551 .maxlen = sizeof(struct md_labels) }, 1054 .maxlen = sizeof(struct md_labels) },
552 [OVS_CT_ATTR_HELPER] = { .minlen = 1, 1055 [OVS_CT_ATTR_HELPER] = { .minlen = 1,
553 .maxlen = NF_CT_HELPER_NAME_LEN } 1056 .maxlen = NF_CT_HELPER_NAME_LEN },
1057#ifdef CONFIG_NF_NAT_NEEDED
1058 /* NAT length is checked when parsing the nested attributes. */
1059 [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
1060#endif
554}; 1061};
555 1062
556static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, 1063static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -617,6 +1124,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
617 return -EINVAL; 1124 return -EINVAL;
618 } 1125 }
619 break; 1126 break;
1127#ifdef CONFIG_NF_NAT_NEEDED
1128 case OVS_CT_ATTR_NAT: {
1129 int err = parse_nat(a, info, log);
1130
1131 if (err)
1132 return err;
1133 break;
1134 }
1135#endif
620 default: 1136 default:
621 OVS_NLERR(log, "Unknown conntrack attr (%d)", 1137 OVS_NLERR(log, "Unknown conntrack attr (%d)",
622 type); 1138 type);
@@ -704,6 +1220,74 @@ err_free_ct:
704 return err; 1220 return err;
705} 1221}
706 1222
1223#ifdef CONFIG_NF_NAT_NEEDED
1224static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
1225 struct sk_buff *skb)
1226{
1227 struct nlattr *start;
1228
1229 start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
1230 if (!start)
1231 return false;
1232
1233 if (info->nat & OVS_CT_SRC_NAT) {
1234 if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
1235 return false;
1236 } else if (info->nat & OVS_CT_DST_NAT) {
1237 if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
1238 return false;
1239 } else {
1240 goto out;
1241 }
1242
1243 if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
1244 if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
1245 info->family == NFPROTO_IPV4) {
1246 if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
1247 info->range.min_addr.ip) ||
1248 (info->range.max_addr.ip
1249 != info->range.min_addr.ip &&
1250 (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
1251 info->range.max_addr.ip))))
1252 return false;
1253 } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
1254 info->family == NFPROTO_IPV6) {
1255 if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
1256 &info->range.min_addr.in6) ||
1257 (memcmp(&info->range.max_addr.in6,
1258 &info->range.min_addr.in6,
1259 sizeof(info->range.max_addr.in6)) &&
1260 (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
1261 &info->range.max_addr.in6))))
1262 return false;
1263 } else {
1264 return false;
1265 }
1266 }
1267 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
1268 (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
1269 ntohs(info->range.min_proto.all)) ||
1270 (info->range.max_proto.all != info->range.min_proto.all &&
1271 nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
1272 ntohs(info->range.max_proto.all)))))
1273 return false;
1274
1275 if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
1276 nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
1277 return false;
1278 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
1279 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
1280 return false;
1281 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
1282 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
1283 return false;
1284out:
1285 nla_nest_end(skb, start);
1286
1287 return true;
1288}
1289#endif
1290
707int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, 1291int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
708 struct sk_buff *skb) 1292 struct sk_buff *skb)
709{ 1293{
@@ -732,7 +1316,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
732 ct_info->helper->name)) 1316 ct_info->helper->name))
733 return -EMSGSIZE; 1317 return -EMSGSIZE;
734 } 1318 }
735 1319#ifdef CONFIG_NF_NAT_NEEDED
1320 if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
1321 return -EMSGSIZE;
1322#endif
736 nla_nest_end(skb, start); 1323 nla_nest_end(skb, start);
737 1324
738 return 0; 1325 return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
37 37
38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ 38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
39 OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ 39 OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
40 OVS_CS_F_INVALID | OVS_CS_F_TRACKED) 40 OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
41 OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
41#else 42#else
42#include <linux/errno.h> 43#include <linux/errno.h>
43 44
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index deadfdab1bc3..0cc66a4e492d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -422,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
422 struct sk_buff *nskb = NULL; 422 struct sk_buff *nskb = NULL;
423 struct sk_buff *user_skb = NULL; /* to be queued to userspace */ 423 struct sk_buff *user_skb = NULL; /* to be queued to userspace */
424 struct nlattr *nla; 424 struct nlattr *nla;
425 struct genl_info info = {
426 .dst_sk = ovs_dp_get_net(dp)->genl_sock,
427 .snd_portid = upcall_info->portid,
428 };
429 size_t len; 425 size_t len;
430 unsigned int hlen; 426 unsigned int hlen;
431 int err, dp_ifindex; 427 int err, dp_ifindex;
@@ -466,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
466 hlen = skb->len; 462 hlen = skb->len;
467 463
468 len = upcall_msg_size(upcall_info, hlen); 464 len = upcall_msg_size(upcall_info, hlen);
469 user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); 465 user_skb = genlmsg_new(len, GFP_ATOMIC);
470 if (!user_skb) { 466 if (!user_skb) {
471 err = -ENOMEM; 467 err = -ENOMEM;
472 goto out; 468 goto out;
@@ -654,7 +650,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
654 650
655static const struct genl_ops dp_packet_genl_ops[] = { 651static const struct genl_ops dp_packet_genl_ops[] = {
656 { .cmd = OVS_PACKET_CMD_EXECUTE, 652 { .cmd = OVS_PACKET_CMD_EXECUTE,
657 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 653 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
658 .policy = packet_policy, 654 .policy = packet_policy,
659 .doit = ovs_packet_cmd_execute 655 .doit = ovs_packet_cmd_execute
660 } 656 }
@@ -876,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act
876 return NULL; 872 return NULL;
877 873
878 len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); 874 len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
879 skb = genlmsg_new_unicast(len, info, GFP_KERNEL); 875 skb = genlmsg_new(len, GFP_KERNEL);
880 if (!skb) 876 if (!skb)
881 return ERR_PTR(-ENOMEM); 877 return ERR_PTR(-ENOMEM);
882 878
@@ -1100,26 +1096,32 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
1100 struct sw_flow_match match; 1096 struct sw_flow_match match;
1101 struct sw_flow_id sfid; 1097 struct sw_flow_id sfid;
1102 u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); 1098 u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
1103 int error; 1099 int error = 0;
1104 bool log = !a[OVS_FLOW_ATTR_PROBE]; 1100 bool log = !a[OVS_FLOW_ATTR_PROBE];
1105 bool ufid_present; 1101 bool ufid_present;
1106 1102
1107 /* Extract key. */
1108 error = -EINVAL;
1109 if (!a[OVS_FLOW_ATTR_KEY]) {
1110 OVS_NLERR(log, "Flow key attribute not present in set flow.");
1111 goto error;
1112 }
1113
1114 ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); 1103 ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
1115 ovs_match_init(&match, &key, &mask); 1104 if (a[OVS_FLOW_ATTR_KEY]) {
1116 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], 1105 ovs_match_init(&match, &key, &mask);
1117 a[OVS_FLOW_ATTR_MASK], log); 1106 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
1107 a[OVS_FLOW_ATTR_MASK], log);
1108 } else if (!ufid_present) {
1109 OVS_NLERR(log,
1110 "Flow set message rejected, Key attribute missing.");
1111 error = -EINVAL;
1112 }
1118 if (error) 1113 if (error)
1119 goto error; 1114 goto error;
1120 1115
1121 /* Validate actions. */ 1116 /* Validate actions. */
1122 if (a[OVS_FLOW_ATTR_ACTIONS]) { 1117 if (a[OVS_FLOW_ATTR_ACTIONS]) {
1118 if (!a[OVS_FLOW_ATTR_KEY]) {
1119 OVS_NLERR(log,
1120 "Flow key attribute not present in set flow.");
1121 error = -EINVAL;
1122 goto error;
1123 }
1124
1123 acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, 1125 acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
1124 &mask, log); 1126 &mask, log);
1125 if (IS_ERR(acts)) { 1127 if (IS_ERR(acts)) {
@@ -1391,12 +1393,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
1391 1393
1392static const struct genl_ops dp_flow_genl_ops[] = { 1394static const struct genl_ops dp_flow_genl_ops[] = {
1393 { .cmd = OVS_FLOW_CMD_NEW, 1395 { .cmd = OVS_FLOW_CMD_NEW,
1394 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1396 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1395 .policy = flow_policy, 1397 .policy = flow_policy,
1396 .doit = ovs_flow_cmd_new 1398 .doit = ovs_flow_cmd_new
1397 }, 1399 },
1398 { .cmd = OVS_FLOW_CMD_DEL, 1400 { .cmd = OVS_FLOW_CMD_DEL,
1399 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1401 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1400 .policy = flow_policy, 1402 .policy = flow_policy,
1401 .doit = ovs_flow_cmd_del 1403 .doit = ovs_flow_cmd_del
1402 }, 1404 },
@@ -1407,7 +1409,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
1407 .dumpit = ovs_flow_cmd_dump 1409 .dumpit = ovs_flow_cmd_dump
1408 }, 1410 },
1409 { .cmd = OVS_FLOW_CMD_SET, 1411 { .cmd = OVS_FLOW_CMD_SET,
1410 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1412 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1411 .policy = flow_policy, 1413 .policy = flow_policy,
1412 .doit = ovs_flow_cmd_set, 1414 .doit = ovs_flow_cmd_set,
1413 }, 1415 },
@@ -1481,9 +1483,9 @@ error:
1481 return -EMSGSIZE; 1483 return -EMSGSIZE;
1482} 1484}
1483 1485
1484static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) 1486static struct sk_buff *ovs_dp_cmd_alloc_info(void)
1485{ 1487{
1486 return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); 1488 return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
1487} 1489}
1488 1490
1489/* Called with rcu_read_lock or ovs_mutex. */ 1491/* Called with rcu_read_lock or ovs_mutex. */
@@ -1536,7 +1538,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
1536 if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) 1538 if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
1537 goto err; 1539 goto err;
1538 1540
1539 reply = ovs_dp_cmd_alloc_info(info); 1541 reply = ovs_dp_cmd_alloc_info();
1540 if (!reply) 1542 if (!reply)
1541 return -ENOMEM; 1543 return -ENOMEM;
1542 1544
@@ -1657,7 +1659,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
1657 struct datapath *dp; 1659 struct datapath *dp;
1658 int err; 1660 int err;
1659 1661
1660 reply = ovs_dp_cmd_alloc_info(info); 1662 reply = ovs_dp_cmd_alloc_info();
1661 if (!reply) 1663 if (!reply)
1662 return -ENOMEM; 1664 return -ENOMEM;
1663 1665
@@ -1690,7 +1692,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1690 struct datapath *dp; 1692 struct datapath *dp;
1691 int err; 1693 int err;
1692 1694
1693 reply = ovs_dp_cmd_alloc_info(info); 1695 reply = ovs_dp_cmd_alloc_info();
1694 if (!reply) 1696 if (!reply)
1695 return -ENOMEM; 1697 return -ENOMEM;
1696 1698
@@ -1723,7 +1725,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1723 struct datapath *dp; 1725 struct datapath *dp;
1724 int err; 1726 int err;
1725 1727
1726 reply = ovs_dp_cmd_alloc_info(info); 1728 reply = ovs_dp_cmd_alloc_info();
1727 if (!reply) 1729 if (!reply)
1728 return -ENOMEM; 1730 return -ENOMEM;
1729 1731
@@ -1777,12 +1779,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
1777 1779
1778static const struct genl_ops dp_datapath_genl_ops[] = { 1780static const struct genl_ops dp_datapath_genl_ops[] = {
1779 { .cmd = OVS_DP_CMD_NEW, 1781 { .cmd = OVS_DP_CMD_NEW,
1780 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1782 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1781 .policy = datapath_policy, 1783 .policy = datapath_policy,
1782 .doit = ovs_dp_cmd_new 1784 .doit = ovs_dp_cmd_new
1783 }, 1785 },
1784 { .cmd = OVS_DP_CMD_DEL, 1786 { .cmd = OVS_DP_CMD_DEL,
1785 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1787 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1786 .policy = datapath_policy, 1788 .policy = datapath_policy,
1787 .doit = ovs_dp_cmd_del 1789 .doit = ovs_dp_cmd_del
1788 }, 1790 },
@@ -1793,7 +1795,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
1793 .dumpit = ovs_dp_cmd_dump 1795 .dumpit = ovs_dp_cmd_dump
1794 }, 1796 },
1795 { .cmd = OVS_DP_CMD_SET, 1797 { .cmd = OVS_DP_CMD_SET,
1796 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1798 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1797 .policy = datapath_policy, 1799 .policy = datapath_policy,
1798 .doit = ovs_dp_cmd_set, 1800 .doit = ovs_dp_cmd_set,
1799 }, 1801 },
@@ -1912,6 +1914,29 @@ static struct vport *lookup_vport(struct net *net,
1912 return ERR_PTR(-EINVAL); 1914 return ERR_PTR(-EINVAL);
1913} 1915}
1914 1916
1917/* Called with ovs_mutex */
1918static void update_headroom(struct datapath *dp)
1919{
1920 unsigned dev_headroom, max_headroom = 0;
1921 struct net_device *dev;
1922 struct vport *vport;
1923 int i;
1924
1925 for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1926 hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
1927 dev = vport->dev;
1928 dev_headroom = netdev_get_fwd_headroom(dev);
1929 if (dev_headroom > max_headroom)
1930 max_headroom = dev_headroom;
1931 }
1932 }
1933
1934 dp->max_headroom = max_headroom;
1935 for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1936 hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
1937 netdev_set_rx_headroom(vport->dev, max_headroom);
1938}
1939
1915static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) 1940static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
1916{ 1941{
1917 struct nlattr **a = info->attrs; 1942 struct nlattr **a = info->attrs;
@@ -1977,6 +2002,12 @@ restart:
1977 2002
1978 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, 2003 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
1979 info->snd_seq, 0, OVS_VPORT_CMD_NEW); 2004 info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2005
2006 if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
2007 update_headroom(dp);
2008 else
2009 netdev_set_rx_headroom(vport->dev, dp->max_headroom);
2010
1980 BUG_ON(err < 0); 2011 BUG_ON(err < 0);
1981 ovs_unlock(); 2012 ovs_unlock();
1982 2013
@@ -2043,8 +2074,10 @@ exit_unlock_free:
2043 2074
2044static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) 2075static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
2045{ 2076{
2077 bool must_update_headroom = false;
2046 struct nlattr **a = info->attrs; 2078 struct nlattr **a = info->attrs;
2047 struct sk_buff *reply; 2079 struct sk_buff *reply;
2080 struct datapath *dp;
2048 struct vport *vport; 2081 struct vport *vport;
2049 int err; 2082 int err;
2050 2083
@@ -2066,7 +2099,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
2066 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, 2099 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2067 info->snd_seq, 0, OVS_VPORT_CMD_DEL); 2100 info->snd_seq, 0, OVS_VPORT_CMD_DEL);
2068 BUG_ON(err < 0); 2101 BUG_ON(err < 0);
2102
2103 /* the vport deletion may trigger dp headroom update */
2104 dp = vport->dp;
2105 if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
2106 must_update_headroom = true;
2107 netdev_reset_rx_headroom(vport->dev);
2069 ovs_dp_detach_port(vport); 2108 ovs_dp_detach_port(vport);
2109
2110 if (must_update_headroom)
2111 update_headroom(dp);
2070 ovs_unlock(); 2112 ovs_unlock();
2071 2113
2072 ovs_notify(&dp_vport_genl_family, reply, info); 2114 ovs_notify(&dp_vport_genl_family, reply, info);
@@ -2158,12 +2200,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
2158 2200
2159static const struct genl_ops dp_vport_genl_ops[] = { 2201static const struct genl_ops dp_vport_genl_ops[] = {
2160 { .cmd = OVS_VPORT_CMD_NEW, 2202 { .cmd = OVS_VPORT_CMD_NEW,
2161 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 2203 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2162 .policy = vport_policy, 2204 .policy = vport_policy,
2163 .doit = ovs_vport_cmd_new 2205 .doit = ovs_vport_cmd_new
2164 }, 2206 },
2165 { .cmd = OVS_VPORT_CMD_DEL, 2207 { .cmd = OVS_VPORT_CMD_DEL,
2166 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 2208 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2167 .policy = vport_policy, 2209 .policy = vport_policy,
2168 .doit = ovs_vport_cmd_del 2210 .doit = ovs_vport_cmd_del
2169 }, 2211 },
@@ -2174,7 +2216,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
2174 .dumpit = ovs_vport_cmd_dump 2216 .dumpit = ovs_vport_cmd_dump
2175 }, 2217 },
2176 { .cmd = OVS_VPORT_CMD_SET, 2218 { .cmd = OVS_VPORT_CMD_SET,
2177 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 2219 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2178 .policy = vport_policy, 2220 .policy = vport_policy,
2179 .doit = ovs_vport_cmd_set, 2221 .doit = ovs_vport_cmd_set,
2180 }, 2222 },
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 67bdecd9fdc1..427e39a045cf 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -68,6 +68,8 @@ struct dp_stats_percpu {
68 * ovs_mutex and RCU. 68 * ovs_mutex and RCU.
69 * @stats_percpu: Per-CPU datapath statistics. 69 * @stats_percpu: Per-CPU datapath statistics.
70 * @net: Reference to net namespace. 70 * @net: Reference to net namespace.
71 * @max_headroom: the maximum headroom of all vports in this datapath; it will
72 * be used by all the internal vports in this dp.
71 * 73 *
72 * Context: See the comment on locking at the top of datapath.c for additional 74 * Context: See the comment on locking at the top of datapath.c for additional
73 * locking information. 75 * locking information.
@@ -89,6 +91,8 @@ struct datapath {
89 possible_net_t net; 91 possible_net_t net;
90 92
91 u32 user_features; 93 u32 user_features;
94
95 u32 max_headroom;
92}; 96};
93 97
94/** 98/**
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1d055c559eaf..03378e75a67c 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -55,7 +55,7 @@ struct ovs_tunnel_info {
55 FIELD_SIZEOF(struct sw_flow_key, recirc_id)) 55 FIELD_SIZEOF(struct sw_flow_key, recirc_id))
56 56
57struct sw_flow_key { 57struct sw_flow_key {
58 u8 tun_opts[255]; 58 u8 tun_opts[IP_TUNNEL_OPTS_MAX];
59 u8 tun_opts_len; 59 u8 tun_opts_len;
60 struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ 60 struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
61 struct { 61 struct {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d1bd4a45ca2d..689c17264221 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
1959 if (!tun_dst) 1959 if (!tun_dst)
1960 return -ENOMEM; 1960 return -ENOMEM;
1961 1961
1962 err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL);
1963 if (err) {
1964 dst_release((struct dst_entry *)tun_dst);
1965 return err;
1966 }
1967
1962 a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, 1968 a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
1963 sizeof(*ovs_tun), log); 1969 sizeof(*ovs_tun), log);
1964 if (IS_ERR(a)) { 1970 if (IS_ERR(a)) {
@@ -2038,9 +2044,6 @@ static int validate_set(const struct nlattr *a,
2038 break; 2044 break;
2039 2045
2040 case OVS_KEY_ATTR_TUNNEL: 2046 case OVS_KEY_ATTR_TUNNEL:
2041 if (eth_p_mpls(eth_type))
2042 return -EINVAL;
2043
2044 if (masked) 2047 if (masked)
2045 return -EINVAL; /* Masked tunnel set not supported. */ 2048 return -EINVAL; /* Masked tunnel set not supported. */
2046 2049
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 30ab8e127288..1a1fcec88695 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -132,6 +132,6 @@ static void __exit ovs_geneve_tnl_exit(void)
132module_init(ovs_geneve_tnl_init); 132module_init(ovs_geneve_tnl_init);
133module_exit(ovs_geneve_tnl_exit); 133module_exit(ovs_geneve_tnl_exit);
134 134
135MODULE_DESCRIPTION("OVS: Geneve swiching port"); 135MODULE_DESCRIPTION("OVS: Geneve switching port");
136MODULE_LICENSE("GPL"); 136MODULE_LICENSE("GPL");
137MODULE_ALIAS("vport-type-5"); 137MODULE_ALIAS("vport-type-5");
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index ec76398a792f..7c8b90bf0e54 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
138 return stats; 138 return stats;
139} 139}
140 140
141static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
142{
143 dev->needed_headroom = new_hr;
144}
145
141static const struct net_device_ops internal_dev_netdev_ops = { 146static const struct net_device_ops internal_dev_netdev_ops = {
142 .ndo_open = internal_dev_open, 147 .ndo_open = internal_dev_open,
143 .ndo_stop = internal_dev_stop, 148 .ndo_stop = internal_dev_stop,
@@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
145 .ndo_set_mac_address = eth_mac_addr, 150 .ndo_set_mac_address = eth_mac_addr,
146 .ndo_change_mtu = internal_dev_change_mtu, 151 .ndo_change_mtu = internal_dev_change_mtu,
147 .ndo_get_stats64 = internal_get_stats, 152 .ndo_get_stats64 = internal_get_stats,
153 .ndo_set_rx_headroom = internal_set_rx_headroom,
148}; 154};
149 155
150static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { 156static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev)
158 netdev->netdev_ops = &internal_dev_netdev_ops; 164 netdev->netdev_ops = &internal_dev_netdev_ops;
159 165
160 netdev->priv_flags &= ~IFF_TX_SKB_SHARING; 166 netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
161 netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; 167 netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
168 IFF_PHONY_HEADROOM;
162 netdev->destructor = internal_dev_destructor; 169 netdev->destructor = internal_dev_destructor;
163 netdev->ethtool_ops = &internal_dev_ethtool_ops; 170 netdev->ethtool_ops = &internal_dev_ethtool_ops;
164 netdev->rtnl_link_ops = &internal_dev_link_ops; 171 netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
199 err = -ENOMEM; 206 err = -ENOMEM;
200 goto error_free_netdev; 207 goto error_free_netdev;
201 } 208 }
209 vport->dev->needed_headroom = vport->dp->max_headroom;
202 210
203 dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); 211 dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
204 internal_dev = internal_dev_priv(vport->dev); 212 internal_dev = internal_dev_priv(vport->dev);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 6a6adf314363..4e3972344aa6 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb)
58 return; 58 return;
59 59
60 skb_push(skb, ETH_HLEN); 60 skb_push(skb, ETH_HLEN);
61 ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); 61 skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
62 ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); 62 ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
63 return; 63 return;
64error: 64error:
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index de9cb19efb6a..5eb7694348b5 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -90,7 +90,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
90 int err; 90 int err;
91 struct vxlan_config conf = { 91 struct vxlan_config conf = {
92 .no_share = true, 92 .no_share = true,
93 .flags = VXLAN_F_COLLECT_METADATA, 93 .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX,
94 /* Don't restrict the packets that can be sent by MTU */ 94 /* Don't restrict the packets that can be sent by MTU */
95 .mtu = IP_MAX_MTU, 95 .mtu = IP_MAX_MTU,
96 }; 96 };
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index c10899cb9040..f01f28a567ad 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv)
185int ovs_vport_receive(struct vport *, struct sk_buff *, 185int ovs_vport_receive(struct vport *, struct sk_buff *,
186 const struct ip_tunnel_info *); 186 const struct ip_tunnel_info *);
187 187
188static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
189 const void *start, unsigned int len)
190{
191 if (skb->ip_summed == CHECKSUM_COMPLETE)
192 skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
193}
194
195static inline const char *ovs_vport_name(struct vport *vport) 188static inline const char *ovs_vport_name(struct vport *vport)
196{ 189{
197 return vport->dev->name; 190 return vport->dev->name;