aboutsummaryrefslogtreecommitdiffstats
path: root/net/openvswitch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 13:05:34 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-19 13:05:34 -0400
commit1200b6809dfd9d73bc4c7db76d288c35fa4b2ebe (patch)
tree552e03de245cdbd0780ca1215914edc4a26540f7 /net/openvswitch
parent6b5f04b6cf8ebab9a65d9c0026c650bb2538fd0f (diff)
parentfe30937b65354c7fec244caebbdaae68e28ca797 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: 1) Support more Realtek wireless chips, from Jes Sorensen. 2) New BPF types for per-cpu hash and array maps, from Alexei Starovoitov. 3) Make several TCP sysctls per-namespace, from Nikolay Borisov. 4) Allow the use of SO_REUSEPORT in order to do per-thread processing of incoming TCP/UDP connections. The muxing can be done using a BPF program which hashes the incoming packet. From Craig Gallek. 5) Add a multiplexer for TCP streams, to provide a message-based interface. BPF programs can be used to determine the message boundaries. From Tom Herbert. 6) Add 802.1AE MACSEC support, from Sabrina Dubroca. 7) Avoid factorial complexity when taking down an inetdev interface with lots of configured addresses. We were doing things like traversing the entire address list for each address removed, and flushing the entire netfilter conntrack table for every address as well. 8) Add and use SKB bulk free infrastructure, from Jesper Brouer. 9) Allow offloading u32 classifiers to hardware, and implement for ixgbe, from John Fastabend. 10) Allow configuring IRQ coalescing parameters on a per-queue basis, from Kan Liang. 11) Extend ethtool so that larger link mode masks can be supported. From David Decotigny. 12) Introduce devlink, which can be used to configure port link types (ethernet vs Infiniband, etc.), port splitting, and switch device level attributes as a whole. From Jiri Pirko. 13) Hardware offload support for flower classifiers, from Amir Vadai. 14) Add "Local Checksum Offload". Basically, for a tunneled packet the checksum of the outer header is 'constant' (because with the checksum field filled into the inner protocol header, the payload of the outer frame checksums to 'zero'), and we can take advantage of that in various ways. 
From Edward Cree" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1548 commits) bonding: fix bond_get_stats() net: bcmgenet: fix dma api length mismatch net/mlx4_core: Fix backward compatibility on VFs phy: mdio-thunder: Fix some Kconfig typos lan78xx: add ndo_get_stats64 lan78xx: handle statistics counter rollover RDS: TCP: Remove unused constant RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket net: smc911x: convert pxa dma to dmaengine team: remove duplicate set of flag IFF_MULTICAST bonding: remove duplicate set of flag IFF_MULTICAST net: fix a comment typo ethernet: micrel: fix some error codes ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it bpf, dst: add and use dst_tclassid helper bpf: make skb->tc_classid also readable net: mvneta: bm: clarify dependencies cls_bpf: reset class and reuse major in da ldmvsw: Checkpatch sunvnet.c and sunvnet_common.c ldmvsw: Add ldmvsw.c driver code ...
Diffstat (limited to 'net/openvswitch')
-rw-r--r--net/openvswitch/Kconfig4
-rw-r--r--net/openvswitch/actions.c8
-rw-r--r--net/openvswitch/conntrack.c660
-rw-r--r--net/openvswitch/conntrack.h3
-rw-r--r--net/openvswitch/datapath.c108
-rw-r--r--net/openvswitch/datapath.h4
-rw-r--r--net/openvswitch/flow.h2
-rw-r--r--net/openvswitch/flow_netlink.c9
-rw-r--r--net/openvswitch/vport-internal_dev.c10
-rw-r--r--net/openvswitch/vport-netdev.c2
-rw-r--r--net/openvswitch/vport.h7
11 files changed, 726 insertions, 91 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index d143aa9f6654..234a73344c6e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,10 +6,12 @@ config OPENVSWITCH
6 tristate "Open vSwitch" 6 tristate "Open vSwitch"
7 depends on INET 7 depends on INET
8 depends on !NF_CONNTRACK || \ 8 depends on !NF_CONNTRACK || \
9 (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) 9 (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
10 (!NF_NAT || NF_NAT)))
10 select LIBCRC32C 11 select LIBCRC32C
11 select MPLS 12 select MPLS
12 select NET_MPLS_GSO 13 select NET_MPLS_GSO
14 select DST_CACHE
13 ---help--- 15 ---help---
14 Open vSwitch is a multilayer Ethernet switch targeted at virtualized 16 Open vSwitch is a multilayer Ethernet switch targeted at virtualized
15 environments. In addition to supporting a variety of features 17 environments. In addition to supporting a variety of features
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2d59df521915..e9dd47b2a85b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
158 new_mpls_lse = (__be32 *)skb_mpls_header(skb); 158 new_mpls_lse = (__be32 *)skb_mpls_header(skb);
159 *new_mpls_lse = mpls->mpls_lse; 159 *new_mpls_lse = mpls->mpls_lse;
160 160
161 if (skb->ip_summed == CHECKSUM_COMPLETE) 161 skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
162 skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
163 MPLS_HLEN, 0));
164 162
165 hdr = eth_hdr(skb); 163 hdr = eth_hdr(skb);
166 hdr->h_proto = mpls->mpls_ethertype; 164 hdr->h_proto = mpls->mpls_ethertype;
@@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
280 ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, 278 ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
281 mask->eth_dst); 279 mask->eth_dst);
282 280
283 ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); 281 skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
284 282
285 ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); 283 ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
286 ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); 284 ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
@@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
639 /* Reconstruct the MAC header. */ 637 /* Reconstruct the MAC header. */
640 skb_push(skb, data->l2_len); 638 skb_push(skb, data->l2_len);
641 memcpy(skb->data, &data->l2_data, data->l2_len); 639 memcpy(skb->data, &data->l2_data, data->l2_len);
642 ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); 640 skb_postpush_rcsum(skb, skb->data, data->l2_len);
643 skb_reset_mac_header(skb); 641 skb_reset_mac_header(skb);
644 642
645 ovs_vport_send(vport, skb); 643 ovs_vport_send(vport, skb);
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ee6ff8ffc12d..dc5eb29fe7d6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/openvswitch.h> 15#include <linux/openvswitch.h>
16#include <linux/tcp.h>
17#include <linux/udp.h>
18#include <linux/sctp.h>
16#include <net/ip.h> 19#include <net/ip.h>
17#include <net/netfilter/nf_conntrack_core.h> 20#include <net/netfilter/nf_conntrack_core.h>
18#include <net/netfilter/nf_conntrack_helper.h> 21#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_labels.h> 22#include <net/netfilter/nf_conntrack_labels.h>
23#include <net/netfilter/nf_conntrack_seqadj.h>
20#include <net/netfilter/nf_conntrack_zones.h> 24#include <net/netfilter/nf_conntrack_zones.h>
21#include <net/netfilter/ipv6/nf_defrag_ipv6.h> 25#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
22 26
27#ifdef CONFIG_NF_NAT_NEEDED
28#include <linux/netfilter/nf_nat.h>
29#include <net/netfilter/nf_nat_core.h>
30#include <net/netfilter/nf_nat_l3proto.h>
31#endif
32
23#include "datapath.h" 33#include "datapath.h"
24#include "conntrack.h" 34#include "conntrack.h"
25#include "flow.h" 35#include "flow.h"
26#include "flow_netlink.h" 36#include "flow_netlink.h"
27 37
28struct ovs_ct_len_tbl { 38struct ovs_ct_len_tbl {
29 size_t maxlen; 39 int maxlen;
30 size_t minlen; 40 int minlen;
31}; 41};
32 42
33/* Metadata mark for masked write to conntrack mark */ 43/* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
42 struct ovs_key_ct_labels mask; 52 struct ovs_key_ct_labels mask;
43}; 53};
44 54
55enum ovs_ct_nat {
56 OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */
57 OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
58 OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
59};
60
45/* Conntrack action context for execution. */ 61/* Conntrack action context for execution. */
46struct ovs_conntrack_info { 62struct ovs_conntrack_info {
47 struct nf_conntrack_helper *helper; 63 struct nf_conntrack_helper *helper;
48 struct nf_conntrack_zone zone; 64 struct nf_conntrack_zone zone;
49 struct nf_conn *ct; 65 struct nf_conn *ct;
50 u8 commit : 1; 66 u8 commit : 1;
67 u8 nat : 3; /* enum ovs_ct_nat */
51 u16 family; 68 u16 family;
52 struct md_mark mark; 69 struct md_mark mark;
53 struct md_labels labels; 70 struct md_labels labels;
71#ifdef CONFIG_NF_NAT_NEEDED
72 struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */
73#endif
54}; 74};
55 75
56static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 76static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -75,7 +95,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
75 switch (ctinfo) { 95 switch (ctinfo) {
76 case IP_CT_ESTABLISHED_REPLY: 96 case IP_CT_ESTABLISHED_REPLY:
77 case IP_CT_RELATED_REPLY: 97 case IP_CT_RELATED_REPLY:
78 case IP_CT_NEW_REPLY:
79 ct_state |= OVS_CS_F_REPLY_DIR; 98 ct_state |= OVS_CS_F_REPLY_DIR;
80 break; 99 break;
81 default: 100 default:
@@ -92,7 +111,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
92 ct_state |= OVS_CS_F_RELATED; 111 ct_state |= OVS_CS_F_RELATED;
93 break; 112 break;
94 case IP_CT_NEW: 113 case IP_CT_NEW:
95 case IP_CT_NEW_REPLY:
96 ct_state |= OVS_CS_F_NEW; 114 ct_state |= OVS_CS_F_NEW;
97 break; 115 break;
98 default: 116 default:
@@ -139,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
139 ovs_ct_get_labels(ct, &key->ct.labels); 157 ovs_ct_get_labels(ct, &key->ct.labels);
140} 158}
141 159
142/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has 160/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
143 * previously sent the packet to conntrack via the ct action. 161 * previously sent the packet to conntrack via the ct action. If
162 * 'keep_nat_flags' is true, the existing NAT flags are retained, else they are
163 * initialized from the connection status.
144 */ 164 */
145static void ovs_ct_update_key(const struct sk_buff *skb, 165static void ovs_ct_update_key(const struct sk_buff *skb,
146 const struct ovs_conntrack_info *info, 166 const struct ovs_conntrack_info *info,
147 struct sw_flow_key *key, bool post_ct) 167 struct sw_flow_key *key, bool post_ct,
168 bool keep_nat_flags)
148{ 169{
149 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 170 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
150 enum ip_conntrack_info ctinfo; 171 enum ip_conntrack_info ctinfo;
@@ -154,10 +175,22 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
154 ct = nf_ct_get(skb, &ctinfo); 175 ct = nf_ct_get(skb, &ctinfo);
155 if (ct) { 176 if (ct) {
156 state = ovs_ct_get_state(ctinfo); 177 state = ovs_ct_get_state(ctinfo);
178 /* All unconfirmed entries are NEW connections. */
157 if (!nf_ct_is_confirmed(ct)) 179 if (!nf_ct_is_confirmed(ct))
158 state |= OVS_CS_F_NEW; 180 state |= OVS_CS_F_NEW;
181 /* OVS persists the related flag for the duration of the
182 * connection.
183 */
159 if (ct->master) 184 if (ct->master)
160 state |= OVS_CS_F_RELATED; 185 state |= OVS_CS_F_RELATED;
186 if (keep_nat_flags) {
187 state |= key->ct.state & OVS_CS_F_NAT_MASK;
188 } else {
189 if (ct->status & IPS_SRC_NAT)
190 state |= OVS_CS_F_SRC_NAT;
191 if (ct->status & IPS_DST_NAT)
192 state |= OVS_CS_F_DST_NAT;
193 }
161 zone = nf_ct_zone(ct); 194 zone = nf_ct_zone(ct);
162 } else if (post_ct) { 195 } else if (post_ct) {
163 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; 196 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -167,9 +200,12 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
167 __ovs_ct_update_key(key, state, zone, ct); 200 __ovs_ct_update_key(key, state, zone, ct);
168} 201}
169 202
203/* This is called to initialize CT key fields possibly coming in from the local
204 * stack.
205 */
170void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) 206void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
171{ 207{
172 ovs_ct_update_key(skb, NULL, key, false); 208 ovs_ct_update_key(skb, NULL, key, false, false);
173} 209}
174 210
175int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) 211int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -201,7 +237,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
201 struct nf_conn *ct; 237 struct nf_conn *ct;
202 u32 new_mark; 238 u32 new_mark;
203 239
204
205 /* The connection could be invalid, in which case set_mark is no-op. */ 240 /* The connection could be invalid, in which case set_mark is no-op. */
206 ct = nf_ct_get(skb, &ctinfo); 241 ct = nf_ct_get(skb, &ctinfo);
207 if (!ct) 242 if (!ct)
@@ -259,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
259 enum ip_conntrack_info ctinfo; 294 enum ip_conntrack_info ctinfo;
260 unsigned int protoff; 295 unsigned int protoff;
261 struct nf_conn *ct; 296 struct nf_conn *ct;
297 int err;
262 298
263 ct = nf_ct_get(skb, &ctinfo); 299 ct = nf_ct_get(skb, &ctinfo);
264 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 300 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -295,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
295 return NF_DROP; 331 return NF_DROP;
296 } 332 }
297 333
298 return helper->help(skb, protoff, ct, ctinfo); 334 err = helper->help(skb, protoff, ct, ctinfo);
335 if (err != NF_ACCEPT)
336 return err;
337
338 /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
339 * FTP with NAT) adusting the TCP payload size when mangling IP
340 * addresses and/or port numbers in the text-based control connection.
341 */
342 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
343 !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
344 return NF_DROP;
345 return NF_ACCEPT;
299} 346}
300 347
301/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero 348/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -352,14 +399,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
352 return __nf_ct_expect_find(net, zone, &tuple); 399 return __nf_ct_expect_find(net, zone, &tuple);
353} 400}
354 401
402/* This replicates logic from nf_conntrack_core.c that is not exported. */
403static enum ip_conntrack_info
404ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
405{
406 const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
407
408 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
409 return IP_CT_ESTABLISHED_REPLY;
410 /* Once we've had two way comms, always ESTABLISHED. */
411 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
412 return IP_CT_ESTABLISHED;
413 if (test_bit(IPS_EXPECTED_BIT, &ct->status))
414 return IP_CT_RELATED;
415 return IP_CT_NEW;
416}
417
418/* Find an existing connection which this packet belongs to without
419 * re-attributing statistics or modifying the connection state. This allows an
420 * skb->nfct lost due to an upcall to be recovered during actions execution.
421 *
422 * Must be called with rcu_read_lock.
423 *
424 * On success, populates skb->nfct and skb->nfctinfo, and returns the
425 * connection. Returns NULL if there is no existing entry.
426 */
427static struct nf_conn *
428ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
429 u8 l3num, struct sk_buff *skb)
430{
431 struct nf_conntrack_l3proto *l3proto;
432 struct nf_conntrack_l4proto *l4proto;
433 struct nf_conntrack_tuple tuple;
434 struct nf_conntrack_tuple_hash *h;
435 enum ip_conntrack_info ctinfo;
436 struct nf_conn *ct;
437 unsigned int dataoff;
438 u8 protonum;
439
440 l3proto = __nf_ct_l3proto_find(l3num);
441 if (!l3proto) {
442 pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
443 return NULL;
444 }
445 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
446 &protonum) <= 0) {
447 pr_debug("ovs_ct_find_existing: Can't get protonum\n");
448 return NULL;
449 }
450 l4proto = __nf_ct_l4proto_find(l3num, protonum);
451 if (!l4proto) {
452 pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
453 return NULL;
454 }
455 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
456 protonum, net, &tuple, l3proto, l4proto)) {
457 pr_debug("ovs_ct_find_existing: Can't get tuple\n");
458 return NULL;
459 }
460
461 /* look for tuple match */
462 h = nf_conntrack_find_get(net, zone, &tuple);
463 if (!h)
464 return NULL; /* Not found. */
465
466 ct = nf_ct_tuplehash_to_ctrack(h);
467
468 ctinfo = ovs_ct_get_info(h);
469 if (ctinfo == IP_CT_NEW) {
470 /* This should not happen. */
471 WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
472 }
473 skb->nfct = &ct->ct_general;
474 skb->nfctinfo = ctinfo;
475 return ct;
476}
477
355/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ 478/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
356static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, 479static bool skb_nfct_cached(struct net *net,
357 const struct ovs_conntrack_info *info) 480 const struct sw_flow_key *key,
481 const struct ovs_conntrack_info *info,
482 struct sk_buff *skb)
358{ 483{
359 enum ip_conntrack_info ctinfo; 484 enum ip_conntrack_info ctinfo;
360 struct nf_conn *ct; 485 struct nf_conn *ct;
361 486
362 ct = nf_ct_get(skb, &ctinfo); 487 ct = nf_ct_get(skb, &ctinfo);
488 /* If no ct, check if we have evidence that an existing conntrack entry
489 * might be found for this skb. This happens when we lose a skb->nfct
490 * due to an upcall. If the connection was not confirmed, it is not
491 * cached and needs to be run through conntrack again.
492 */
493 if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
494 !(key->ct.state & OVS_CS_F_INVALID) &&
495 key->ct.zone == info->zone.id)
496 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
363 if (!ct) 497 if (!ct)
364 return false; 498 return false;
365 if (!net_eq(net, read_pnet(&ct->ct_net))) 499 if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -377,6 +511,206 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
377 return true; 511 return true;
378} 512}
379 513
514#ifdef CONFIG_NF_NAT_NEEDED
515/* Modelled after nf_nat_ipv[46]_fn().
516 * range is only used for new, uninitialized NAT state.
517 * Returns either NF_ACCEPT or NF_DROP.
518 */
519static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
520 enum ip_conntrack_info ctinfo,
521 const struct nf_nat_range *range,
522 enum nf_nat_manip_type maniptype)
523{
524 int hooknum, nh_off, err = NF_ACCEPT;
525
526 nh_off = skb_network_offset(skb);
527 skb_pull(skb, nh_off);
528
529 /* See HOOK2MANIP(). */
530 if (maniptype == NF_NAT_MANIP_SRC)
531 hooknum = NF_INET_LOCAL_IN; /* Source NAT */
532 else
533 hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
534
535 switch (ctinfo) {
536 case IP_CT_RELATED:
537 case IP_CT_RELATED_REPLY:
538 if (skb->protocol == htons(ETH_P_IP) &&
539 ip_hdr(skb)->protocol == IPPROTO_ICMP) {
540 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
541 hooknum))
542 err = NF_DROP;
543 goto push;
544#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
545 } else if (skb->protocol == htons(ETH_P_IPV6)) {
546 __be16 frag_off;
547 u8 nexthdr = ipv6_hdr(skb)->nexthdr;
548 int hdrlen = ipv6_skip_exthdr(skb,
549 sizeof(struct ipv6hdr),
550 &nexthdr, &frag_off);
551
552 if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
553 if (!nf_nat_icmpv6_reply_translation(skb, ct,
554 ctinfo,
555 hooknum,
556 hdrlen))
557 err = NF_DROP;
558 goto push;
559 }
560#endif
561 }
562 /* Non-ICMP, fall thru to initialize if needed. */
563 case IP_CT_NEW:
564 /* Seen it before? This can happen for loopback, retrans,
565 * or local packets.
566 */
567 if (!nf_nat_initialized(ct, maniptype)) {
568 /* Initialize according to the NAT action. */
569 err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
570 /* Action is set up to establish a new
571 * mapping.
572 */
573 ? nf_nat_setup_info(ct, range, maniptype)
574 : nf_nat_alloc_null_binding(ct, hooknum);
575 if (err != NF_ACCEPT)
576 goto push;
577 }
578 break;
579
580 case IP_CT_ESTABLISHED:
581 case IP_CT_ESTABLISHED_REPLY:
582 break;
583
584 default:
585 err = NF_DROP;
586 goto push;
587 }
588
589 err = nf_nat_packet(ct, ctinfo, hooknum, skb);
590push:
591 skb_push(skb, nh_off);
592
593 return err;
594}
595
596static void ovs_nat_update_key(struct sw_flow_key *key,
597 const struct sk_buff *skb,
598 enum nf_nat_manip_type maniptype)
599{
600 if (maniptype == NF_NAT_MANIP_SRC) {
601 __be16 src;
602
603 key->ct.state |= OVS_CS_F_SRC_NAT;
604 if (key->eth.type == htons(ETH_P_IP))
605 key->ipv4.addr.src = ip_hdr(skb)->saddr;
606 else if (key->eth.type == htons(ETH_P_IPV6))
607 memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
608 sizeof(key->ipv6.addr.src));
609 else
610 return;
611
612 if (key->ip.proto == IPPROTO_UDP)
613 src = udp_hdr(skb)->source;
614 else if (key->ip.proto == IPPROTO_TCP)
615 src = tcp_hdr(skb)->source;
616 else if (key->ip.proto == IPPROTO_SCTP)
617 src = sctp_hdr(skb)->source;
618 else
619 return;
620
621 key->tp.src = src;
622 } else {
623 __be16 dst;
624
625 key->ct.state |= OVS_CS_F_DST_NAT;
626 if (key->eth.type == htons(ETH_P_IP))
627 key->ipv4.addr.dst = ip_hdr(skb)->daddr;
628 else if (key->eth.type == htons(ETH_P_IPV6))
629 memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
630 sizeof(key->ipv6.addr.dst));
631 else
632 return;
633
634 if (key->ip.proto == IPPROTO_UDP)
635 dst = udp_hdr(skb)->dest;
636 else if (key->ip.proto == IPPROTO_TCP)
637 dst = tcp_hdr(skb)->dest;
638 else if (key->ip.proto == IPPROTO_SCTP)
639 dst = sctp_hdr(skb)->dest;
640 else
641 return;
642
643 key->tp.dst = dst;
644 }
645}
646
647/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
648static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
649 const struct ovs_conntrack_info *info,
650 struct sk_buff *skb, struct nf_conn *ct,
651 enum ip_conntrack_info ctinfo)
652{
653 enum nf_nat_manip_type maniptype;
654 int err;
655
656 if (nf_ct_is_untracked(ct)) {
657 /* A NAT action may only be performed on tracked packets. */
658 return NF_ACCEPT;
659 }
660
661 /* Add NAT extension if not confirmed yet. */
662 if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
663 return NF_ACCEPT; /* Can't NAT. */
664
665 /* Determine NAT type.
666 * Check if the NAT type can be deduced from the tracked connection.
667 * Make sure expected traffic is NATted only when committing.
668 */
669 if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
670 ct->status & IPS_NAT_MASK &&
671 (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) {
672 /* NAT an established or related connection like before. */
673 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
674 /* This is the REPLY direction for a connection
675 * for which NAT was applied in the forward
676 * direction. Do the reverse NAT.
677 */
678 maniptype = ct->status & IPS_SRC_NAT
679 ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
680 else
681 maniptype = ct->status & IPS_SRC_NAT
682 ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
683 } else if (info->nat & OVS_CT_SRC_NAT) {
684 maniptype = NF_NAT_MANIP_SRC;
685 } else if (info->nat & OVS_CT_DST_NAT) {
686 maniptype = NF_NAT_MANIP_DST;
687 } else {
688 return NF_ACCEPT; /* Connection is not NATed. */
689 }
690 err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
691
692 /* Mark NAT done if successful and update the flow key. */
693 if (err == NF_ACCEPT)
694 ovs_nat_update_key(key, skb, maniptype);
695
696 return err;
697}
698#else /* !CONFIG_NF_NAT_NEEDED */
699static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
700 const struct ovs_conntrack_info *info,
701 struct sk_buff *skb, struct nf_conn *ct,
702 enum ip_conntrack_info ctinfo)
703{
704 return NF_ACCEPT;
705}
706#endif
707
708/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
709 * not done already. Update key with new CT state after passing the packet
710 * through conntrack.
711 * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
712 * set to NULL and 0 will be returned.
713 */
380static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 714static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
381 const struct ovs_conntrack_info *info, 715 const struct ovs_conntrack_info *info,
382 struct sk_buff *skb) 716 struct sk_buff *skb)
@@ -386,8 +720,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
386 * actually run the packet through conntrack twice unless it's for a 720 * actually run the packet through conntrack twice unless it's for a
387 * different zone. 721 * different zone.
388 */ 722 */
389 if (!skb_nfct_cached(net, skb, info)) { 723 bool cached = skb_nfct_cached(net, key, info, skb);
724 enum ip_conntrack_info ctinfo;
725 struct nf_conn *ct;
726
727 if (!cached) {
390 struct nf_conn *tmpl = info->ct; 728 struct nf_conn *tmpl = info->ct;
729 int err;
391 730
392 /* Associate skb with specified zone. */ 731 /* Associate skb with specified zone. */
393 if (tmpl) { 732 if (tmpl) {
@@ -398,17 +737,53 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
398 skb->nfctinfo = IP_CT_NEW; 737 skb->nfctinfo = IP_CT_NEW;
399 } 738 }
400 739
401 if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, 740 /* Repeat if requested, see nf_iterate(). */
402 skb) != NF_ACCEPT) 741 do {
742 err = nf_conntrack_in(net, info->family,
743 NF_INET_PRE_ROUTING, skb);
744 } while (err == NF_REPEAT);
745
746 if (err != NF_ACCEPT)
403 return -ENOENT; 747 return -ENOENT;
404 748
405 if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { 749 /* Clear CT state NAT flags to mark that we have not yet done
406 WARN_ONCE(1, "helper rejected packet"); 750 * NAT after the nf_conntrack_in() call. We can actually clear
751 * the whole state, as it will be re-initialized below.
752 */
753 key->ct.state = 0;
754
755 /* Update the key, but keep the NAT flags. */
756 ovs_ct_update_key(skb, info, key, true, true);
757 }
758
759 ct = nf_ct_get(skb, &ctinfo);
760 if (ct) {
761 /* Packets starting a new connection must be NATted before the
762 * helper, so that the helper knows about the NAT. We enforce
763 * this by delaying both NAT and helper calls for unconfirmed
764 * connections until the committing CT action. For later
765 * packets NAT and Helper may be called in either order.
766 *
767 * NAT will be done only if the CT action has NAT, and only
768 * once per packet (per zone), as guarded by the NAT bits in
769 * the key->ct.state.
770 */
771 if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
772 (nf_ct_is_confirmed(ct) || info->commit) &&
773 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
407 return -EINVAL; 774 return -EINVAL;
408 } 775 }
409 }
410 776
411 ovs_ct_update_key(skb, info, key, true); 777 /* Call the helper only if:
778 * - nf_conntrack_in() was executed above ("!cached") for a
779 * confirmed connection, or
780 * - When committing an unconfirmed connection.
781 */
782 if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
783 ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
784 return -EINVAL;
785 }
786 }
412 787
413 return 0; 788 return 0;
414} 789}
@@ -420,19 +795,24 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
420{ 795{
421 struct nf_conntrack_expect *exp; 796 struct nf_conntrack_expect *exp;
422 797
798 /* If we pass an expected packet through nf_conntrack_in() the
799 * expectation is typically removed, but the packet could still be
800 * lost in upcall processing. To prevent this from happening we
801 * perform an explicit expectation lookup. Expected connections are
802 * always new, and will be passed through conntrack only when they are
803 * committed, as it is OK to remove the expectation at that time.
804 */
423 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); 805 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
424 if (exp) { 806 if (exp) {
425 u8 state; 807 u8 state;
426 808
809 /* NOTE: New connections are NATted and Helped only when
810 * committed, so we are not calling into NAT here.
811 */
427 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; 812 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
428 __ovs_ct_update_key(key, state, &info->zone, exp->master); 813 __ovs_ct_update_key(key, state, &info->zone, exp->master);
429 } else { 814 } else
430 int err; 815 return __ovs_ct_lookup(net, key, info, skb);
431
432 err = __ovs_ct_lookup(net, key, info, skb);
433 if (err)
434 return err;
435 }
436 816
437 return 0; 817 return 0;
438} 818}
@@ -442,21 +822,12 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
442 const struct ovs_conntrack_info *info, 822 const struct ovs_conntrack_info *info,
443 struct sk_buff *skb) 823 struct sk_buff *skb)
444{ 824{
445 u8 state;
446 int err; 825 int err;
447 826
448 state = key->ct.state;
449 if (key->ct.zone == info->zone.id &&
450 ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
451 /* Previous lookup has shown that this connection is already
452 * tracked and committed. Skip committing.
453 */
454 return 0;
455 }
456
457 err = __ovs_ct_lookup(net, key, info, skb); 827 err = __ovs_ct_lookup(net, key, info, skb);
458 if (err) 828 if (err)
459 return err; 829 return err;
830 /* This is a no-op if the connection has already been confirmed. */
460 if (nf_conntrack_confirm(skb) != NF_ACCEPT) 831 if (nf_conntrack_confirm(skb) != NF_ACCEPT)
461 return -EINVAL; 832 return -EINVAL;
462 833
@@ -541,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
541 return 0; 912 return 0;
542} 913}
543 914
915#ifdef CONFIG_NF_NAT_NEEDED
916static int parse_nat(const struct nlattr *attr,
917 struct ovs_conntrack_info *info, bool log)
918{
919 struct nlattr *a;
920 int rem;
921 bool have_ip_max = false;
922 bool have_proto_max = false;
923 bool ip_vers = (info->family == NFPROTO_IPV6);
924
925 nla_for_each_nested(a, attr, rem) {
926 static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
927 [OVS_NAT_ATTR_SRC] = {0, 0},
928 [OVS_NAT_ATTR_DST] = {0, 0},
929 [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
930 sizeof(struct in6_addr)},
931 [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
932 sizeof(struct in6_addr)},
933 [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
934 [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
935 [OVS_NAT_ATTR_PERSISTENT] = {0, 0},
936 [OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
937 [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
938 };
939 int type = nla_type(a);
940
941 if (type > OVS_NAT_ATTR_MAX) {
942 OVS_NLERR(log,
943 "Unknown NAT attribute (type=%d, max=%d).\n",
944 type, OVS_NAT_ATTR_MAX);
945 return -EINVAL;
946 }
947
948 if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
949 OVS_NLERR(log,
950 "NAT attribute type %d has unexpected length (%d != %d).\n",
951 type, nla_len(a),
952 ovs_nat_attr_lens[type][ip_vers]);
953 return -EINVAL;
954 }
955
956 switch (type) {
957 case OVS_NAT_ATTR_SRC:
958 case OVS_NAT_ATTR_DST:
959 if (info->nat) {
960 OVS_NLERR(log,
961 "Only one type of NAT may be specified.\n"
962 );
963 return -ERANGE;
964 }
965 info->nat |= OVS_CT_NAT;
966 info->nat |= ((type == OVS_NAT_ATTR_SRC)
967 ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
968 break;
969
970 case OVS_NAT_ATTR_IP_MIN:
971 nla_memcpy(&info->range.min_addr, a, nla_len(a));
972 info->range.flags |= NF_NAT_RANGE_MAP_IPS;
973 break;
974
975 case OVS_NAT_ATTR_IP_MAX:
976 have_ip_max = true;
977 nla_memcpy(&info->range.max_addr, a,
978 sizeof(info->range.max_addr));
979 info->range.flags |= NF_NAT_RANGE_MAP_IPS;
980 break;
981
982 case OVS_NAT_ATTR_PROTO_MIN:
983 info->range.min_proto.all = htons(nla_get_u16(a));
984 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
985 break;
986
987 case OVS_NAT_ATTR_PROTO_MAX:
988 have_proto_max = true;
989 info->range.max_proto.all = htons(nla_get_u16(a));
990 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
991 break;
992
993 case OVS_NAT_ATTR_PERSISTENT:
994 info->range.flags |= NF_NAT_RANGE_PERSISTENT;
995 break;
996
997 case OVS_NAT_ATTR_PROTO_HASH:
998 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
999 break;
1000
1001 case OVS_NAT_ATTR_PROTO_RANDOM:
1002 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
1003 break;
1004
1005 default:
1006 OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
1007 return -EINVAL;
1008 }
1009 }
1010
1011 if (rem > 0) {
1012 OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
1013 return -EINVAL;
1014 }
1015 if (!info->nat) {
1016 /* Do not allow flags if no type is given. */
1017 if (info->range.flags) {
1018 OVS_NLERR(log,
1019 "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
1020 );
1021 return -EINVAL;
1022 }
1023 info->nat = OVS_CT_NAT; /* NAT existing connections. */
1024 } else if (!info->commit) {
1025 OVS_NLERR(log,
1026 "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
1027 );
1028 return -EINVAL;
1029 }
1030 /* Allow missing IP_MAX. */
1031 if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
1032 memcpy(&info->range.max_addr, &info->range.min_addr,
1033 sizeof(info->range.max_addr));
1034 }
1035 /* Allow missing PROTO_MAX. */
1036 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
1037 !have_proto_max) {
1038 info->range.max_proto.all = info->range.min_proto.all;
1039 }
1040 return 0;
1041}
1042#endif
1043
544static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { 1044static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
545 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1045 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
546 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), 1046 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
@@ -550,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
550 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), 1050 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels),
551 .maxlen = sizeof(struct md_labels) }, 1051 .maxlen = sizeof(struct md_labels) },
552 [OVS_CT_ATTR_HELPER] = { .minlen = 1, 1052 [OVS_CT_ATTR_HELPER] = { .minlen = 1,
553 .maxlen = NF_CT_HELPER_NAME_LEN } 1053 .maxlen = NF_CT_HELPER_NAME_LEN },
1054#ifdef CONFIG_NF_NAT_NEEDED
1055 /* NAT length is checked when parsing the nested attributes. */
1056 [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
1057#endif
554}; 1058};
555 1059
556static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, 1060static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -617,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
617 return -EINVAL; 1121 return -EINVAL;
618 } 1122 }
619 break; 1123 break;
1124#ifdef CONFIG_NF_NAT_NEEDED
1125 case OVS_CT_ATTR_NAT: {
1126 int err = parse_nat(a, info, log);
1127
1128 if (err)
1129 return err;
1130 break;
1131 }
1132#endif
620 default: 1133 default:
621 OVS_NLERR(log, "Unknown conntrack attr (%d)", 1134 OVS_NLERR(log, "Unknown conntrack attr (%d)",
622 type); 1135 type);
@@ -704,6 +1217,74 @@ err_free_ct:
704 return err; 1217 return err;
705} 1218}
706 1219
1220#ifdef CONFIG_NF_NAT_NEEDED
1221static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
1222 struct sk_buff *skb)
1223{
1224 struct nlattr *start;
1225
1226 start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
1227 if (!start)
1228 return false;
1229
1230 if (info->nat & OVS_CT_SRC_NAT) {
1231 if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
1232 return false;
1233 } else if (info->nat & OVS_CT_DST_NAT) {
1234 if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
1235 return false;
1236 } else {
1237 goto out;
1238 }
1239
1240 if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
1241 if (info->family == NFPROTO_IPV4) {
1242 if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
1243 info->range.min_addr.ip) ||
1244 (info->range.max_addr.ip
1245 != info->range.min_addr.ip &&
1246 (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
1247 info->range.max_addr.ip))))
1248 return false;
1249#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
1250 } else if (info->family == NFPROTO_IPV6) {
1251 if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
1252 &info->range.min_addr.in6) ||
1253 (memcmp(&info->range.max_addr.in6,
1254 &info->range.min_addr.in6,
1255 sizeof(info->range.max_addr.in6)) &&
1256 (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
1257 &info->range.max_addr.in6))))
1258 return false;
1259#endif
1260 } else {
1261 return false;
1262 }
1263 }
1264 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
1265 (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
1266 ntohs(info->range.min_proto.all)) ||
1267 (info->range.max_proto.all != info->range.min_proto.all &&
1268 nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
1269 ntohs(info->range.max_proto.all)))))
1270 return false;
1271
1272 if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
1273 nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
1274 return false;
1275 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
1276 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
1277 return false;
1278 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
1279 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
1280 return false;
1281out:
1282 nla_nest_end(skb, start);
1283
1284 return true;
1285}
1286#endif
1287
707int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, 1288int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
708 struct sk_buff *skb) 1289 struct sk_buff *skb)
709{ 1290{
@@ -732,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
732 ct_info->helper->name)) 1313 ct_info->helper->name))
733 return -EMSGSIZE; 1314 return -EMSGSIZE;
734 } 1315 }
735 1316#ifdef CONFIG_NF_NAT_NEEDED
1317 if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
1318 return -EMSGSIZE;
1319#endif
736 nla_nest_end(skb, start); 1320 nla_nest_end(skb, start);
737 1321
738 return 0; 1322 return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
37 37
38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ 38#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
39 OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ 39 OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
40 OVS_CS_F_INVALID | OVS_CS_F_TRACKED) 40 OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
41 OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
41#else 42#else
42#include <linux/errno.h> 43#include <linux/errno.h>
43 44
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index deadfdab1bc3..0cc66a4e492d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -422,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
422 struct sk_buff *nskb = NULL; 422 struct sk_buff *nskb = NULL;
423 struct sk_buff *user_skb = NULL; /* to be queued to userspace */ 423 struct sk_buff *user_skb = NULL; /* to be queued to userspace */
424 struct nlattr *nla; 424 struct nlattr *nla;
425 struct genl_info info = {
426 .dst_sk = ovs_dp_get_net(dp)->genl_sock,
427 .snd_portid = upcall_info->portid,
428 };
429 size_t len; 425 size_t len;
430 unsigned int hlen; 426 unsigned int hlen;
431 int err, dp_ifindex; 427 int err, dp_ifindex;
@@ -466,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
466 hlen = skb->len; 462 hlen = skb->len;
467 463
468 len = upcall_msg_size(upcall_info, hlen); 464 len = upcall_msg_size(upcall_info, hlen);
469 user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); 465 user_skb = genlmsg_new(len, GFP_ATOMIC);
470 if (!user_skb) { 466 if (!user_skb) {
471 err = -ENOMEM; 467 err = -ENOMEM;
472 goto out; 468 goto out;
@@ -654,7 +650,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
654 650
655static const struct genl_ops dp_packet_genl_ops[] = { 651static const struct genl_ops dp_packet_genl_ops[] = {
656 { .cmd = OVS_PACKET_CMD_EXECUTE, 652 { .cmd = OVS_PACKET_CMD_EXECUTE,
657 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 653 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
658 .policy = packet_policy, 654 .policy = packet_policy,
659 .doit = ovs_packet_cmd_execute 655 .doit = ovs_packet_cmd_execute
660 } 656 }
@@ -876,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act
876 return NULL; 872 return NULL;
877 873
878 len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); 874 len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
879 skb = genlmsg_new_unicast(len, info, GFP_KERNEL); 875 skb = genlmsg_new(len, GFP_KERNEL);
880 if (!skb) 876 if (!skb)
881 return ERR_PTR(-ENOMEM); 877 return ERR_PTR(-ENOMEM);
882 878
@@ -1100,26 +1096,32 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
1100 struct sw_flow_match match; 1096 struct sw_flow_match match;
1101 struct sw_flow_id sfid; 1097 struct sw_flow_id sfid;
1102 u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); 1098 u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
1103 int error; 1099 int error = 0;
1104 bool log = !a[OVS_FLOW_ATTR_PROBE]; 1100 bool log = !a[OVS_FLOW_ATTR_PROBE];
1105 bool ufid_present; 1101 bool ufid_present;
1106 1102
1107 /* Extract key. */
1108 error = -EINVAL;
1109 if (!a[OVS_FLOW_ATTR_KEY]) {
1110 OVS_NLERR(log, "Flow key attribute not present in set flow.");
1111 goto error;
1112 }
1113
1114 ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); 1103 ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
1115 ovs_match_init(&match, &key, &mask); 1104 if (a[OVS_FLOW_ATTR_KEY]) {
1116 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], 1105 ovs_match_init(&match, &key, &mask);
1117 a[OVS_FLOW_ATTR_MASK], log); 1106 error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
1107 a[OVS_FLOW_ATTR_MASK], log);
1108 } else if (!ufid_present) {
1109 OVS_NLERR(log,
1110 "Flow set message rejected, Key attribute missing.");
1111 error = -EINVAL;
1112 }
1118 if (error) 1113 if (error)
1119 goto error; 1114 goto error;
1120 1115
1121 /* Validate actions. */ 1116 /* Validate actions. */
1122 if (a[OVS_FLOW_ATTR_ACTIONS]) { 1117 if (a[OVS_FLOW_ATTR_ACTIONS]) {
1118 if (!a[OVS_FLOW_ATTR_KEY]) {
1119 OVS_NLERR(log,
1120 "Flow key attribute not present in set flow.");
1121 error = -EINVAL;
1122 goto error;
1123 }
1124
1123 acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, 1125 acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
1124 &mask, log); 1126 &mask, log);
1125 if (IS_ERR(acts)) { 1127 if (IS_ERR(acts)) {
@@ -1391,12 +1393,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
1391 1393
1392static const struct genl_ops dp_flow_genl_ops[] = { 1394static const struct genl_ops dp_flow_genl_ops[] = {
1393 { .cmd = OVS_FLOW_CMD_NEW, 1395 { .cmd = OVS_FLOW_CMD_NEW,
1394 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1396 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1395 .policy = flow_policy, 1397 .policy = flow_policy,
1396 .doit = ovs_flow_cmd_new 1398 .doit = ovs_flow_cmd_new
1397 }, 1399 },
1398 { .cmd = OVS_FLOW_CMD_DEL, 1400 { .cmd = OVS_FLOW_CMD_DEL,
1399 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1401 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1400 .policy = flow_policy, 1402 .policy = flow_policy,
1401 .doit = ovs_flow_cmd_del 1403 .doit = ovs_flow_cmd_del
1402 }, 1404 },
@@ -1407,7 +1409,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
1407 .dumpit = ovs_flow_cmd_dump 1409 .dumpit = ovs_flow_cmd_dump
1408 }, 1410 },
1409 { .cmd = OVS_FLOW_CMD_SET, 1411 { .cmd = OVS_FLOW_CMD_SET,
1410 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1412 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1411 .policy = flow_policy, 1413 .policy = flow_policy,
1412 .doit = ovs_flow_cmd_set, 1414 .doit = ovs_flow_cmd_set,
1413 }, 1415 },
@@ -1481,9 +1483,9 @@ error:
1481 return -EMSGSIZE; 1483 return -EMSGSIZE;
1482} 1484}
1483 1485
1484static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) 1486static struct sk_buff *ovs_dp_cmd_alloc_info(void)
1485{ 1487{
1486 return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); 1488 return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
1487} 1489}
1488 1490
1489/* Called with rcu_read_lock or ovs_mutex. */ 1491/* Called with rcu_read_lock or ovs_mutex. */
@@ -1536,7 +1538,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
1536 if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) 1538 if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
1537 goto err; 1539 goto err;
1538 1540
1539 reply = ovs_dp_cmd_alloc_info(info); 1541 reply = ovs_dp_cmd_alloc_info();
1540 if (!reply) 1542 if (!reply)
1541 return -ENOMEM; 1543 return -ENOMEM;
1542 1544
@@ -1657,7 +1659,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
1657 struct datapath *dp; 1659 struct datapath *dp;
1658 int err; 1660 int err;
1659 1661
1660 reply = ovs_dp_cmd_alloc_info(info); 1662 reply = ovs_dp_cmd_alloc_info();
1661 if (!reply) 1663 if (!reply)
1662 return -ENOMEM; 1664 return -ENOMEM;
1663 1665
@@ -1690,7 +1692,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1690 struct datapath *dp; 1692 struct datapath *dp;
1691 int err; 1693 int err;
1692 1694
1693 reply = ovs_dp_cmd_alloc_info(info); 1695 reply = ovs_dp_cmd_alloc_info();
1694 if (!reply) 1696 if (!reply)
1695 return -ENOMEM; 1697 return -ENOMEM;
1696 1698
@@ -1723,7 +1725,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1723 struct datapath *dp; 1725 struct datapath *dp;
1724 int err; 1726 int err;
1725 1727
1726 reply = ovs_dp_cmd_alloc_info(info); 1728 reply = ovs_dp_cmd_alloc_info();
1727 if (!reply) 1729 if (!reply)
1728 return -ENOMEM; 1730 return -ENOMEM;
1729 1731
@@ -1777,12 +1779,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
1777 1779
1778static const struct genl_ops dp_datapath_genl_ops[] = { 1780static const struct genl_ops dp_datapath_genl_ops[] = {
1779 { .cmd = OVS_DP_CMD_NEW, 1781 { .cmd = OVS_DP_CMD_NEW,
1780 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1782 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1781 .policy = datapath_policy, 1783 .policy = datapath_policy,
1782 .doit = ovs_dp_cmd_new 1784 .doit = ovs_dp_cmd_new
1783 }, 1785 },
1784 { .cmd = OVS_DP_CMD_DEL, 1786 { .cmd = OVS_DP_CMD_DEL,
1785 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1787 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1786 .policy = datapath_policy, 1788 .policy = datapath_policy,
1787 .doit = ovs_dp_cmd_del 1789 .doit = ovs_dp_cmd_del
1788 }, 1790 },
@@ -1793,7 +1795,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
1793 .dumpit = ovs_dp_cmd_dump 1795 .dumpit = ovs_dp_cmd_dump
1794 }, 1796 },
1795 { .cmd = OVS_DP_CMD_SET, 1797 { .cmd = OVS_DP_CMD_SET,
1796 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 1798 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1797 .policy = datapath_policy, 1799 .policy = datapath_policy,
1798 .doit = ovs_dp_cmd_set, 1800 .doit = ovs_dp_cmd_set,
1799 }, 1801 },
@@ -1912,6 +1914,29 @@ static struct vport *lookup_vport(struct net *net,
1912 return ERR_PTR(-EINVAL); 1914 return ERR_PTR(-EINVAL);
1913} 1915}
1914 1916
1917/* Called with ovs_mutex */
1918static void update_headroom(struct datapath *dp)
1919{
1920 unsigned dev_headroom, max_headroom = 0;
1921 struct net_device *dev;
1922 struct vport *vport;
1923 int i;
1924
1925 for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
1926 hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
1927 dev = vport->dev;
1928 dev_headroom = netdev_get_fwd_headroom(dev);
1929 if (dev_headroom > max_headroom)
1930 max_headroom = dev_headroom;
1931 }
1932 }
1933
1934 dp->max_headroom = max_headroom;
1935 for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
1936 hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
1937 netdev_set_rx_headroom(vport->dev, max_headroom);
1938}
1939
1915static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) 1940static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
1916{ 1941{
1917 struct nlattr **a = info->attrs; 1942 struct nlattr **a = info->attrs;
@@ -1977,6 +2002,12 @@ restart:
1977 2002
1978 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, 2003 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
1979 info->snd_seq, 0, OVS_VPORT_CMD_NEW); 2004 info->snd_seq, 0, OVS_VPORT_CMD_NEW);
2005
2006 if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
2007 update_headroom(dp);
2008 else
2009 netdev_set_rx_headroom(vport->dev, dp->max_headroom);
2010
1980 BUG_ON(err < 0); 2011 BUG_ON(err < 0);
1981 ovs_unlock(); 2012 ovs_unlock();
1982 2013
@@ -2043,8 +2074,10 @@ exit_unlock_free:
2043 2074
2044static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) 2075static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
2045{ 2076{
2077 bool must_update_headroom = false;
2046 struct nlattr **a = info->attrs; 2078 struct nlattr **a = info->attrs;
2047 struct sk_buff *reply; 2079 struct sk_buff *reply;
2080 struct datapath *dp;
2048 struct vport *vport; 2081 struct vport *vport;
2049 int err; 2082 int err;
2050 2083
@@ -2066,7 +2099,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
2066 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, 2099 err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
2067 info->snd_seq, 0, OVS_VPORT_CMD_DEL); 2100 info->snd_seq, 0, OVS_VPORT_CMD_DEL);
2068 BUG_ON(err < 0); 2101 BUG_ON(err < 0);
2102
2103 /* the vport deletion may trigger dp headroom update */
2104 dp = vport->dp;
2105 if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
2106 must_update_headroom = true;
2107 netdev_reset_rx_headroom(vport->dev);
2069 ovs_dp_detach_port(vport); 2108 ovs_dp_detach_port(vport);
2109
2110 if (must_update_headroom)
2111 update_headroom(dp);
2070 ovs_unlock(); 2112 ovs_unlock();
2071 2113
2072 ovs_notify(&dp_vport_genl_family, reply, info); 2114 ovs_notify(&dp_vport_genl_family, reply, info);
@@ -2158,12 +2200,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
2158 2200
2159static const struct genl_ops dp_vport_genl_ops[] = { 2201static const struct genl_ops dp_vport_genl_ops[] = {
2160 { .cmd = OVS_VPORT_CMD_NEW, 2202 { .cmd = OVS_VPORT_CMD_NEW,
2161 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 2203 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2162 .policy = vport_policy, 2204 .policy = vport_policy,
2163 .doit = ovs_vport_cmd_new 2205 .doit = ovs_vport_cmd_new
2164 }, 2206 },
2165 { .cmd = OVS_VPORT_CMD_DEL, 2207 { .cmd = OVS_VPORT_CMD_DEL,
2166 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 2208 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2167 .policy = vport_policy, 2209 .policy = vport_policy,
2168 .doit = ovs_vport_cmd_del 2210 .doit = ovs_vport_cmd_del
2169 }, 2211 },
@@ -2174,7 +2216,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
2174 .dumpit = ovs_vport_cmd_dump 2216 .dumpit = ovs_vport_cmd_dump
2175 }, 2217 },
2176 { .cmd = OVS_VPORT_CMD_SET, 2218 { .cmd = OVS_VPORT_CMD_SET,
2177 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ 2219 .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
2178 .policy = vport_policy, 2220 .policy = vport_policy,
2179 .doit = ovs_vport_cmd_set, 2221 .doit = ovs_vport_cmd_set,
2180 }, 2222 },
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 67bdecd9fdc1..427e39a045cf 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -68,6 +68,8 @@ struct dp_stats_percpu {
68 * ovs_mutex and RCU. 68 * ovs_mutex and RCU.
69 * @stats_percpu: Per-CPU datapath statistics. 69 * @stats_percpu: Per-CPU datapath statistics.
70 * @net: Reference to net namespace. 70 * @net: Reference to net namespace.
71 * @max_headroom: the maximum headroom of all vports in this datapath; it will
72 * be used by all the internal vports in this dp.
71 * 73 *
72 * Context: See the comment on locking at the top of datapath.c for additional 74 * Context: See the comment on locking at the top of datapath.c for additional
73 * locking information. 75 * locking information.
@@ -89,6 +91,8 @@ struct datapath {
89 possible_net_t net; 91 possible_net_t net;
90 92
91 u32 user_features; 93 u32 user_features;
94
95 u32 max_headroom;
92}; 96};
93 97
94/** 98/**
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1d055c559eaf..03378e75a67c 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -55,7 +55,7 @@ struct ovs_tunnel_info {
55 FIELD_SIZEOF(struct sw_flow_key, recirc_id)) 55 FIELD_SIZEOF(struct sw_flow_key, recirc_id))
56 56
57struct sw_flow_key { 57struct sw_flow_key {
58 u8 tun_opts[255]; 58 u8 tun_opts[IP_TUNNEL_OPTS_MAX];
59 u8 tun_opts_len; 59 u8 tun_opts_len;
60 struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ 60 struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
61 struct { 61 struct {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d1bd4a45ca2d..689c17264221 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
1959 if (!tun_dst) 1959 if (!tun_dst)
1960 return -ENOMEM; 1960 return -ENOMEM;
1961 1961
1962 err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL);
1963 if (err) {
1964 dst_release((struct dst_entry *)tun_dst);
1965 return err;
1966 }
1967
1962 a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, 1968 a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
1963 sizeof(*ovs_tun), log); 1969 sizeof(*ovs_tun), log);
1964 if (IS_ERR(a)) { 1970 if (IS_ERR(a)) {
@@ -2038,9 +2044,6 @@ static int validate_set(const struct nlattr *a,
2038 break; 2044 break;
2039 2045
2040 case OVS_KEY_ATTR_TUNNEL: 2046 case OVS_KEY_ATTR_TUNNEL:
2041 if (eth_p_mpls(eth_type))
2042 return -EINVAL;
2043
2044 if (masked) 2047 if (masked)
2045 return -EINVAL; /* Masked tunnel set not supported. */ 2048 return -EINVAL; /* Masked tunnel set not supported. */
2046 2049
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index ec76398a792f..7c8b90bf0e54 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
138 return stats; 138 return stats;
139} 139}
140 140
141static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
142{
143 dev->needed_headroom = new_hr;
144}
145
141static const struct net_device_ops internal_dev_netdev_ops = { 146static const struct net_device_ops internal_dev_netdev_ops = {
142 .ndo_open = internal_dev_open, 147 .ndo_open = internal_dev_open,
143 .ndo_stop = internal_dev_stop, 148 .ndo_stop = internal_dev_stop,
@@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
145 .ndo_set_mac_address = eth_mac_addr, 150 .ndo_set_mac_address = eth_mac_addr,
146 .ndo_change_mtu = internal_dev_change_mtu, 151 .ndo_change_mtu = internal_dev_change_mtu,
147 .ndo_get_stats64 = internal_get_stats, 152 .ndo_get_stats64 = internal_get_stats,
153 .ndo_set_rx_headroom = internal_set_rx_headroom,
148}; 154};
149 155
150static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { 156static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev)
158 netdev->netdev_ops = &internal_dev_netdev_ops; 164 netdev->netdev_ops = &internal_dev_netdev_ops;
159 165
160 netdev->priv_flags &= ~IFF_TX_SKB_SHARING; 166 netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
161 netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; 167 netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
168 IFF_PHONY_HEADROOM;
162 netdev->destructor = internal_dev_destructor; 169 netdev->destructor = internal_dev_destructor;
163 netdev->ethtool_ops = &internal_dev_ethtool_ops; 170 netdev->ethtool_ops = &internal_dev_ethtool_ops;
164 netdev->rtnl_link_ops = &internal_dev_link_ops; 171 netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
199 err = -ENOMEM; 206 err = -ENOMEM;
200 goto error_free_netdev; 207 goto error_free_netdev;
201 } 208 }
209 vport->dev->needed_headroom = vport->dp->max_headroom;
202 210
203 dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); 211 dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
204 internal_dev = internal_dev_priv(vport->dev); 212 internal_dev = internal_dev_priv(vport->dev);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 6a6adf314363..4e3972344aa6 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb)
58 return; 58 return;
59 59
60 skb_push(skb, ETH_HLEN); 60 skb_push(skb, ETH_HLEN);
61 ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); 61 skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
62 ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); 62 ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
63 return; 63 return;
64error: 64error:
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index c10899cb9040..f01f28a567ad 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -185,13 +185,6 @@ static inline struct vport *vport_from_priv(void *priv)
185int ovs_vport_receive(struct vport *, struct sk_buff *, 185int ovs_vport_receive(struct vport *, struct sk_buff *,
186 const struct ip_tunnel_info *); 186 const struct ip_tunnel_info *);
187 187
188static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
189 const void *start, unsigned int len)
190{
191 if (skb->ip_summed == CHECKSUM_COMPLETE)
192 skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
193}
194
195static inline const char *ovs_vport_name(struct vport *vport) 188static inline const char *ovs_vport_name(struct vport *vport)
196{ 189{
197 return vport->dev->name; 190 return vport->dev->name;