Diffstat (limited to 'net/openvswitch')
-rw-r--r--  net/openvswitch/Kconfig              |   28
-rw-r--r--  net/openvswitch/Makefile             |   14
-rw-r--r--  net/openvswitch/actions.c            |  415
-rw-r--r--  net/openvswitch/datapath.c           | 1910
-rw-r--r--  net/openvswitch/datapath.h           |  124
-rw-r--r--  net/openvswitch/dp_notify.c          |   66
-rw-r--r--  net/openvswitch/flow.c               | 1345
-rw-r--r--  net/openvswitch/flow.h               |  199
-rw-r--r--  net/openvswitch/vport-internal_dev.c |  240
-rw-r--r--  net/openvswitch/vport-internal_dev.h |   28
-rw-r--r--  net/openvswitch/vport-netdev.c       |  198
-rw-r--r--  net/openvswitch/vport-netdev.h       |   42
-rw-r--r--  net/openvswitch/vport.c              |  397
-rw-r--r--  net/openvswitch/vport.h              |  205
14 files changed, 5211 insertions, 0 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
new file mode 100644
index 000000000000..d9ea33c361be
--- /dev/null
+++ b/net/openvswitch/Kconfig
@@ -0,0 +1,28 @@
+#
+# Open vSwitch
+#
+
+config OPENVSWITCH
+        tristate "Open vSwitch"
+        ---help---
+          Open vSwitch is a multilayer Ethernet switch targeted at virtualized
+          environments.  In addition to supporting a variety of features
+          expected in a traditional hardware switch, it enables fine-grained
+          programmatic extension and flow-based control of the network.  This
+          control is useful in a wide variety of applications but is
+          particularly important in multi-server virtualization deployments,
+          which are often characterized by highly dynamic endpoints and the
+          need to maintain logical abstractions for multiple tenants.
+
+          The Open vSwitch datapath provides an in-kernel fast path for packet
+          forwarding.  It is complemented by a userspace daemon, ovs-vswitchd,
+          which is able to accept configuration from a variety of sources and
+          translate it into packet processing rules.
+
+          See http://openvswitch.org for more information and userspace
+          utilities.
+
+          To compile this code as a module, choose M here: the module will be
+          called openvswitch.
+
+          If unsure, say N.
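With this entry, the datapath is normally built as a module. A minimal sketch of the matching configuration fragment and load step (standard kbuild and modprobe usage, assumed here rather than shown by the patch):

    CONFIG_OPENVSWITCH=m    # .config: build the datapath as openvswitch.ko

    modprobe openvswitch    # load it on the running system

ovs-vswitchd then talks to the loaded module over the generic netlink families defined in datapath.c below.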
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
new file mode 100644
index 000000000000..15e7384745c1
--- /dev/null
+++ b/net/openvswitch/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for Open vSwitch.
+#
+
+obj-$(CONFIG_OPENVSWITCH) += openvswitch.o
+
+openvswitch-y := \
+        actions.o \
+        datapath.o \
+        dp_notify.o \
+        flow.o \
+        vport.o \
+        vport-internal_dev.o \
+        vport-netdev.o
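Because openvswitch-y lists every object linked into openvswitch.o, the module can also be rebuilt in isolation from an already-configured kernel tree; a sketch of the usual kbuild invocation (standard kbuild usage, not part of this patch):

    make M=net/openvswitch modules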
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
new file mode 100644
index 000000000000..2725d1bdf291
--- /dev/null
+++ b/net/openvswitch/actions.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2007-2011 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in6.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
+                              const struct nlattr *attr, int len, bool keep_skb);
+
+static int make_writable(struct sk_buff *skb, int write_len)
+{
+        if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
+                return 0;
+
+        return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+
+/* remove VLAN header from packet and update csum accordingly. */
+static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
+{
+        struct vlan_hdr *vhdr;
+        int err;
+
+        err = make_writable(skb, VLAN_ETH_HLEN);
+        if (unlikely(err))
+                return err;
+
+        if (skb->ip_summed == CHECKSUM_COMPLETE)
+                skb->csum = csum_sub(skb->csum, csum_partial(skb->data
+                                        + ETH_HLEN, VLAN_HLEN, 0));
+
+        vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+        *current_tci = vhdr->h_vlan_TCI;
+
+        memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+        __skb_pull(skb, VLAN_HLEN);
+
+        vlan_set_encap_proto(skb, vhdr);
+        skb->mac_header += VLAN_HLEN;
+        skb_reset_mac_len(skb);
+
+        return 0;
+}
+
+static int pop_vlan(struct sk_buff *skb)
+{
+        __be16 tci;
+        int err;
+
+        if (likely(vlan_tx_tag_present(skb))) {
+                skb->vlan_tci = 0;
+        } else {
+                if (unlikely(skb->protocol != htons(ETH_P_8021Q) ||
+                             skb->len < VLAN_ETH_HLEN))
+                        return 0;
+
+                err = __pop_vlan_tci(skb, &tci);
+                if (err)
+                        return err;
+        }
+        /* move next vlan tag to hw accel tag */
+        if (likely(skb->protocol != htons(ETH_P_8021Q) ||
+                   skb->len < VLAN_ETH_HLEN))
+                return 0;
+
+        err = __pop_vlan_tci(skb, &tci);
+        if (unlikely(err))
+                return err;
+
+        __vlan_hwaccel_put_tag(skb, ntohs(tci));
+        return 0;
+}
+
+static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan)
+{
+        if (unlikely(vlan_tx_tag_present(skb))) {
+                u16 current_tag;
+
+                /* push down current VLAN tag */
+                current_tag = vlan_tx_tag_get(skb);
+
+                if (!__vlan_put_tag(skb, current_tag))
+                        return -ENOMEM;
+
+                if (skb->ip_summed == CHECKSUM_COMPLETE)
+                        skb->csum = csum_add(skb->csum, csum_partial(skb->data
+                                        + ETH_HLEN, VLAN_HLEN, 0));
+
+        }
+        __vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
+        return 0;
+}
+
+static int set_eth_addr(struct sk_buff *skb,
+                        const struct ovs_key_ethernet *eth_key)
+{
+        int err;
+        err = make_writable(skb, ETH_HLEN);
+        if (unlikely(err))
+                return err;
+
+        memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN);
+        memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN);
+
+        return 0;
+}
+
+static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
+                        __be32 *addr, __be32 new_addr)
+{
+        int transport_len = skb->len - skb_transport_offset(skb);
+
+        if (nh->protocol == IPPROTO_TCP) {
+                if (likely(transport_len >= sizeof(struct tcphdr)))
+                        inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
+                                                 *addr, new_addr, 1);
+        } else if (nh->protocol == IPPROTO_UDP) {
+                if (likely(transport_len >= sizeof(struct udphdr)))
+                        inet_proto_csum_replace4(&udp_hdr(skb)->check, skb,
+                                                 *addr, new_addr, 1);
+        }
+
+        csum_replace4(&nh->check, *addr, new_addr);
+        skb->rxhash = 0;
+        *addr = new_addr;
+}
+
+static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl)
+{
+        csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
+        nh->ttl = new_ttl;
+}
+
+static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key)
+{
+        struct iphdr *nh;
+        int err;
+
+        err = make_writable(skb, skb_network_offset(skb) +
+                                 sizeof(struct iphdr));
+        if (unlikely(err))
+                return err;
+
+        nh = ip_hdr(skb);
+
+        if (ipv4_key->ipv4_src != nh->saddr)
+                set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src);
+
+        if (ipv4_key->ipv4_dst != nh->daddr)
+                set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst);
+
+        if (ipv4_key->ipv4_tos != nh->tos)
+                ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos);
+
+        if (ipv4_key->ipv4_ttl != nh->ttl)
+                set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl);
+
+        return 0;
+}
+
+/* Must follow make_writable() since that can move the skb data. */
+static void set_tp_port(struct sk_buff *skb, __be16 *port,
+                        __be16 new_port, __sum16 *check)
+{
+        inet_proto_csum_replace2(check, skb, *port, new_port, 0);
+        *port = new_port;
+        skb->rxhash = 0;
+}
+
+static int set_udp_port(struct sk_buff *skb,
+                        const struct ovs_key_udp *udp_port_key)
+{
+        struct udphdr *uh;
+        int err;
+
+        err = make_writable(skb, skb_transport_offset(skb) +
+                                 sizeof(struct udphdr));
+        if (unlikely(err))
+                return err;
+
+        uh = udp_hdr(skb);
+        if (udp_port_key->udp_src != uh->source)
+                set_tp_port(skb, &uh->source, udp_port_key->udp_src, &uh->check);
+
+        if (udp_port_key->udp_dst != uh->dest)
+                set_tp_port(skb, &uh->dest, udp_port_key->udp_dst, &uh->check);
+
+        return 0;
+}
+
+static int set_tcp_port(struct sk_buff *skb,
+                        const struct ovs_key_tcp *tcp_port_key)
+{
+        struct tcphdr *th;
+        int err;
+
+        err = make_writable(skb, skb_transport_offset(skb) +
+                                 sizeof(struct tcphdr));
+        if (unlikely(err))
+                return err;
+
+        th = tcp_hdr(skb);
+        if (tcp_port_key->tcp_src != th->source)
+                set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check);
+
+        if (tcp_port_key->tcp_dst != th->dest)
+                set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check);
+
+        return 0;
+}
+
+static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+{
+        struct vport *vport;
+
+        if (unlikely(!skb))
+                return -ENOMEM;
+
+        vport = rcu_dereference(dp->ports[out_port]);
+        if (unlikely(!vport)) {
+                kfree_skb(skb);
+                return -ENODEV;
+        }
+
+        ovs_vport_send(vport, skb);
+        return 0;
+}
+
+static int output_userspace(struct datapath *dp, struct sk_buff *skb,
+                            const struct nlattr *attr)
+{
+        struct dp_upcall_info upcall;
+        const struct nlattr *a;
+        int rem;
+
+        upcall.cmd = OVS_PACKET_CMD_ACTION;
+        upcall.key = &OVS_CB(skb)->flow->key;
+        upcall.userdata = NULL;
+        upcall.pid = 0;
+
+        for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
+             a = nla_next(a, &rem)) {
+                switch (nla_type(a)) {
+                case OVS_USERSPACE_ATTR_USERDATA:
+                        upcall.userdata = a;
+                        break;
+
+                case OVS_USERSPACE_ATTR_PID:
+                        upcall.pid = nla_get_u32(a);
+                        break;
+                }
+        }
+
+        return ovs_dp_upcall(dp, skb, &upcall);
+}
+
+static int sample(struct datapath *dp, struct sk_buff *skb,
+                  const struct nlattr *attr)
+{
+        const struct nlattr *acts_list = NULL;
+        const struct nlattr *a;
+        int rem;
+
+        for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
+             a = nla_next(a, &rem)) {
+                switch (nla_type(a)) {
+                case OVS_SAMPLE_ATTR_PROBABILITY:
+                        if (net_random() >= nla_get_u32(a))
+                                return 0;
+                        break;
+
+                case OVS_SAMPLE_ATTR_ACTIONS:
+                        acts_list = a;
+                        break;
+                }
+        }
+
+        return do_execute_actions(dp, skb, nla_data(acts_list),
+                                  nla_len(acts_list), true);
+}
+
+static int execute_set_action(struct sk_buff *skb,
+                              const struct nlattr *nested_attr)
+{
+        int err = 0;
+
+        switch (nla_type(nested_attr)) {
+        case OVS_KEY_ATTR_PRIORITY:
+                skb->priority = nla_get_u32(nested_attr);
+                break;
+
+        case OVS_KEY_ATTR_ETHERNET:
+                err = set_eth_addr(skb, nla_data(nested_attr));
+                break;
+
+        case OVS_KEY_ATTR_IPV4:
+                err = set_ipv4(skb, nla_data(nested_attr));
+                break;
+
+        case OVS_KEY_ATTR_TCP:
+                err = set_tcp_port(skb, nla_data(nested_attr));
+                break;
+
+        case OVS_KEY_ATTR_UDP:
+                err = set_udp_port(skb, nla_data(nested_attr));
+                break;
+        }
+
+        return err;
+}
+
+/* Execute a list of actions against 'skb'. */
+static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
+                              const struct nlattr *attr, int len, bool keep_skb)
+{
+        /* Every output action needs a separate clone of 'skb', but the common
+         * case is just a single output action, so doing a clone and then
+         * freeing the original skbuff is wasteful.  The following code is
+         * slightly obscure just to avoid that. */
+        int prev_port = -1;
+        const struct nlattr *a;
+        int rem;
+
+        for (a = attr, rem = len; rem > 0;
+             a = nla_next(a, &rem)) {
+                int err = 0;
+
+                if (prev_port != -1) {
+                        do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
+                        prev_port = -1;
+                }
+
+                switch (nla_type(a)) {
+                case OVS_ACTION_ATTR_OUTPUT:
+                        prev_port = nla_get_u32(a);
+                        break;
+
+                case OVS_ACTION_ATTR_USERSPACE:
+                        output_userspace(dp, skb, a);
+                        break;
+
+                case OVS_ACTION_ATTR_PUSH_VLAN:
+                        err = push_vlan(skb, nla_data(a));
+                        if (unlikely(err)) /* skb already freed. */
+                                return err;
+                        break;
+
+                case OVS_ACTION_ATTR_POP_VLAN:
+                        err = pop_vlan(skb);
+                        break;
+
+                case OVS_ACTION_ATTR_SET:
+                        err = execute_set_action(skb, nla_data(a));
+                        break;
+
+                case OVS_ACTION_ATTR_SAMPLE:
+                        err = sample(dp, skb, a);
+                        break;
+                }
+
+                if (unlikely(err)) {
+                        kfree_skb(skb);
+                        return err;
+                }
+        }
+
+        if (prev_port != -1) {
+                if (keep_skb)
+                        skb = skb_clone(skb, GFP_ATOMIC);
+
+                do_output(dp, skb, prev_port);
+        } else if (!keep_skb)
+                consume_skb(skb);
+
+        return 0;
+}
+
+/* Execute a list of actions against 'skb'. */
+int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
+{
+        struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);
+
+        return do_execute_actions(dp, skb, acts->actions,
+                                  acts->actions_len, false);
+}
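The deferred-output trick in do_execute_actions() above is easy to miss in the diff: OVS_ACTION_ATTR_OUTPUT only records the port in prev_port, and the clone is made on the next iteration, so an action list with a single output never clones at all. A minimal userspace sketch of that control flow, with a hypothetical struct pkt and clone_pkt()/emit() standing in for sk_buff, skb_clone(), and do_output():

    #include <stdio.h>

    struct pkt { int id; };

    static struct pkt clone_pkt(struct pkt p)
    {
            return p;               /* stands in for skb_clone() */
    }

    static void emit(struct pkt p, int port)
    {
            printf("pkt %d -> port %d\n", p.id, port);
    }

    static void execute_outputs(struct pkt p, const int *ports, int n)
    {
            int prev_port = -1;
            int i;

            for (i = 0; i < n; i++) {
                    if (prev_port != -1) {
                            /* another action follows: send a clone, keep 'p' */
                            emit(clone_pkt(p), prev_port);
                    }
                    prev_port = ports[i];
            }
            if (prev_port != -1)
                    emit(p, prev_port);     /* last output consumes the original */
    }

    int main(void)
    {
            int ports[] = { 1, 2, 3 };
            struct pkt p = { 42 };

            execute_outputs(p, ports, 3);
            return 0;
    }

With three output ports, only the sends to ports 1 and 2 operate on clones; the final send consumes the original, which mirrors the keep_skb handling at the end of do_execute_actions().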
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
new file mode 100644
index 000000000000..ce64c18b8c79
--- /dev/null
+++ b/net/openvswitch/datapath.c
@@ -0,0 +1,1910 @@
+/*
+ * Copyright (c) 2007-2012 Nicira Networks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/etherdevice.h>
+#include <linux/genetlink.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/ethtool.h>
+#include <linux/wait.h>
+#include <asm/system.h>
+#include <asm/div64.h>
+#include <linux/highmem.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/inetdevice.h>
+#include <linux/list.h>
+#include <linux/openvswitch.h>
+#include <linux/rculist.h>
+#include <linux/dmi.h>
+#include <linux/workqueue.h>
+#include <net/genetlink.h>
+
+#include "datapath.h"
+#include "flow.h"
+#include "vport-internal_dev.h"
+
+/**
+ * DOC: Locking:
+ *
+ * Writes to device state (add/remove datapath, port, set operations on vports,
+ * etc.) are protected by RTNL.
+ *
+ * Writes to other state (flow table modifications, set miscellaneous datapath
+ * parameters, etc.) are protected by genl_mutex.  The RTNL lock nests inside
+ * genl_mutex.
+ *
+ * Reads are protected by RCU.
+ *
+ * There are a few special cases (mostly stats) that have their own
+ * synchronization but they nest under all of above and don't interact with
+ * each other.
+ */
+
+/* Global list of datapaths to enable dumping them all out.
+ * Protected by genl_mutex.
+ */
+static LIST_HEAD(dps);
+
+#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
+static void rehash_flow_table(struct work_struct *work);
+static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);
+
+static struct vport *new_vport(const struct vport_parms *);
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
+                             const struct dp_upcall_info *);
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
+                                  const struct dp_upcall_info *);
+
+/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
+static struct datapath *get_dp(int dp_ifindex)
+{
+        struct datapath *dp = NULL;
+        struct net_device *dev;
+
+        rcu_read_lock();
+        dev = dev_get_by_index_rcu(&init_net, dp_ifindex);
+        if (dev) {
+                struct vport *vport = ovs_internal_dev_get_vport(dev);
+                if (vport)
+                        dp = vport->dp;
+        }
+        rcu_read_unlock();
+
+        return dp;
+}
+
+/* Must be called with rcu_read_lock or RTNL lock. */
+const char *ovs_dp_name(const struct datapath *dp)
+{
+        struct vport *vport = rcu_dereference_rtnl(dp->ports[OVSP_LOCAL]);
+        return vport->ops->get_name(vport);
+}
+
+static int get_dpifindex(struct datapath *dp)
+{
+        struct vport *local;
+        int ifindex;
+
+        rcu_read_lock();
+
+        local = rcu_dereference(dp->ports[OVSP_LOCAL]);
+        if (local)
+                ifindex = local->ops->get_ifindex(local);
+        else
+                ifindex = 0;
+
+        rcu_read_unlock();
+
+        return ifindex;
+}
+
+static void destroy_dp_rcu(struct rcu_head *rcu)
+{
+        struct datapath *dp = container_of(rcu, struct datapath, rcu);
+
+        ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
+        free_percpu(dp->stats_percpu);
+        kfree(dp);
+}
+
+/* Called with RTNL lock and genl_lock. */
+static struct vport *new_vport(const struct vport_parms *parms)
+{
+        struct vport *vport;
+
+        vport = ovs_vport_add(parms);
+        if (!IS_ERR(vport)) {
+                struct datapath *dp = parms->dp;
+
+                rcu_assign_pointer(dp->ports[parms->port_no], vport);
+                list_add(&vport->node, &dp->port_list);
+        }
+
+        return vport;
+}
+
+/* Called with RTNL lock. */
+void ovs_dp_detach_port(struct vport *p)
+{
+        ASSERT_RTNL();
+
+        /* First drop references to device. */
+        list_del(&p->node);
+        rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
+
+        /* Then destroy it. */
+        ovs_vport_del(p);
+}
+
+/* Must be called with rcu_read_lock. */
+void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
+{
+        struct datapath *dp = p->dp;
+        struct sw_flow *flow;
+        struct dp_stats_percpu *stats;
+        struct sw_flow_key key;
+        u64 *stats_counter;
+        int error;
+        int key_len;
+
+        stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+
+        /* Extract flow from 'skb' into 'key'. */
+        error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
+        if (unlikely(error)) {
+                kfree_skb(skb);
+                return;
+        }
+
+        /* Look up flow. */
+        flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
+        if (unlikely(!flow)) {
+                struct dp_upcall_info upcall;
+
+                upcall.cmd = OVS_PACKET_CMD_MISS;
+                upcall.key = &key;
+                upcall.userdata = NULL;
+                upcall.pid = p->upcall_pid;
+                ovs_dp_upcall(dp, skb, &upcall);
+                consume_skb(skb);
+                stats_counter = &stats->n_missed;
+                goto out;
+        }
+
+        OVS_CB(skb)->flow = flow;
+
+        stats_counter = &stats->n_hit;
+        ovs_flow_used(OVS_CB(skb)->flow, skb);
+        ovs_execute_actions(dp, skb);
+
+out:
+        /* Update datapath statistics. */
+        u64_stats_update_begin(&stats->sync);
+        (*stats_counter)++;
+        u64_stats_update_end(&stats->sync);
+}
+
+static struct genl_family dp_packet_genl_family = {
+        .id = GENL_ID_GENERATE,
+        .hdrsize = sizeof(struct ovs_header),
+        .name = OVS_PACKET_FAMILY,
+        .version = OVS_PACKET_VERSION,
+        .maxattr = OVS_PACKET_ATTR_MAX
+};
+
+int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
+                  const struct dp_upcall_info *upcall_info)
+{
+        struct dp_stats_percpu *stats;
+        int dp_ifindex;
+        int err;
+
+        if (upcall_info->pid == 0) {
+                err = -ENOTCONN;
+                goto err;
+        }
+
+        dp_ifindex = get_dpifindex(dp);
+        if (!dp_ifindex) {
+                err = -ENODEV;
+                goto err;
+        }
+
+        if (!skb_is_gso(skb))
+                err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+        else
+                err = queue_gso_packets(dp_ifindex, skb, upcall_info);
+        if (err)
+                goto err;
+
+        return 0;
+
+err:
+        stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());
+
+        u64_stats_update_begin(&stats->sync);
+        stats->n_lost++;
+        u64_stats_update_end(&stats->sync);
+
+        return err;
+}
+
+static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
+                             const struct dp_upcall_info *upcall_info)
+{
+        struct dp_upcall_info later_info;
+        struct sw_flow_key later_key;
+        struct sk_buff *segs, *nskb;
+        int err;
+
+        segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
+        if (IS_ERR(segs))
+                return PTR_ERR(segs);
+
+        /* Queue all of the segments. */
+        skb = segs;
+        do {
+                err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
+                if (err)
+                        break;
+
+                if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+                        /* The initial flow key extracted by ovs_flow_extract()
+                         * in this case is for a first fragment, so we need to
+                         * properly mark later fragments.
+                         */
+                        later_key = *upcall_info->key;
+                        later_key.ip.frag = OVS_FRAG_TYPE_LATER;
+
+                        later_info = *upcall_info;
+                        later_info.key = &later_key;
+                        upcall_info = &later_info;
+                }
+        } while ((skb = skb->next));
+
+        /* Free all of the segments. */
+        skb = segs;
+        do {
+                nskb = skb->next;
+                if (err)
+                        kfree_skb(skb);
+                else
+                        consume_skb(skb);
+        } while ((skb = nskb));
+        return err;
+}
+
+static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
+                                  const struct dp_upcall_info *upcall_info)
+{
+        struct ovs_header *upcall;
+        struct sk_buff *nskb = NULL;
+        struct sk_buff *user_skb; /* to be queued to userspace */
+        struct nlattr *nla;
+        unsigned int len;
+        int err;
+
+        if (vlan_tx_tag_present(skb)) {
+                nskb = skb_clone(skb, GFP_ATOMIC);
+                if (!nskb)
+                        return -ENOMEM;
+
+                nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb));
+                if (!nskb)
+                        return -ENOMEM;
+
+                nskb->vlan_tci = 0;
+                skb = nskb;
+        }
+
+        if (nla_attr_size(skb->len) > USHRT_MAX) {
+                err = -EFBIG;
+                goto out;
+        }
+
+        len = sizeof(struct ovs_header);
+        len += nla_total_size(skb->len);
+        len += nla_total_size(FLOW_BUFSIZE);
+        if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
+                len += nla_total_size(8);
+
+        user_skb = genlmsg_new(len, GFP_ATOMIC);
+        if (!user_skb) {
+                err = -ENOMEM;
+                goto out;
+        }
+
+        upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
+                             0, upcall_info->cmd);
+        upcall->dp_ifindex = dp_ifindex;
+
+        nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
+        ovs_flow_to_nlattrs(upcall_info->key, user_skb);
+        nla_nest_end(user_skb, nla);
+
+        if (upcall_info->userdata)
+                nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
+                            nla_get_u64(upcall_info->userdata));
+
+        nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+
+        skb_copy_and_csum_dev(skb, nla_data(nla));
+
+        err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);
+
+out:
+        kfree_skb(nskb);
+        return err;
+}
+
+/* Called with genl_mutex. */
+static int flush_flows(int dp_ifindex)
+{
+        struct flow_table *old_table;
+        struct flow_table *new_table;
+        struct datapath *dp;
+
+        dp = get_dp(dp_ifindex);
+        if (!dp)
+                return -ENODEV;
+
+        old_table = genl_dereference(dp->table);
+        new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
+        if (!new_table)
+                return -ENOMEM;
+
+        rcu_assign_pointer(dp->table, new_table);
+
+        ovs_flow_tbl_deferred_destroy(old_table);
+        return 0;
+}
+
+static int validate_actions(const struct nlattr *attr,
+                            const struct sw_flow_key *key, int depth);
+
+static int validate_sample(const struct nlattr *attr,
+                           const struct sw_flow_key *key, int depth)
+{
+        const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
+        const struct nlattr *probability, *actions;
+        const struct nlattr *a;
+        int rem;
+
+        memset(attrs, 0, sizeof(attrs));
+        nla_for_each_nested(a, attr, rem) {
+                int type = nla_type(a);
+                if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
+                        return -EINVAL;
+                attrs[type] = a;
+        }
+        if (rem)
+                return -EINVAL;
+
+        probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
+        if (!probability || nla_len(probability) != sizeof(u32))
+                return -EINVAL;
+
+        actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
+        if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
+                return -EINVAL;
+        return validate_actions(actions, key, depth + 1);
+}
+
+static int validate_set(const struct nlattr *a,
+                        const struct sw_flow_key *flow_key)
+{
+        const struct nlattr *ovs_key = nla_data(a);
+        int key_type = nla_type(ovs_key);
+
+        /* There can be only one key in an action */
+        if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
+                return -EINVAL;
+
+        if (key_type > OVS_KEY_ATTR_MAX ||
+            nla_len(ovs_key) != ovs_key_lens[key_type])
+                return -EINVAL;
+
+        switch (key_type) {
+        const struct ovs_key_ipv4 *ipv4_key;
+
+        case OVS_KEY_ATTR_PRIORITY:
+        case OVS_KEY_ATTR_ETHERNET:
+                break;
+
+        case OVS_KEY_ATTR_IPV4:
+                if (flow_key->eth.type != htons(ETH_P_IP))
+                        return -EINVAL;
+
+                if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst)
+                        return -EINVAL;
+
+                ipv4_key = nla_data(ovs_key);
+                if (ipv4_key->ipv4_proto != flow_key->ip.proto)
+                        return -EINVAL;
+
+                if (ipv4_key->ipv4_frag != flow_key->ip.frag)
+                        return -EINVAL;
+
+                break;
+
+        case OVS_KEY_ATTR_TCP:
+                if (flow_key->ip.proto != IPPROTO_TCP)
+                        return -EINVAL;
+
+                if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
+                        return -EINVAL;
+
+                break;
+
+        case OVS_KEY_ATTR_UDP:
+                if (flow_key->ip.proto != IPPROTO_UDP)
+                        return -EINVAL;
+
+                if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
+                        return -EINVAL;
+                break;
+
+        default:
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+static int validate_userspace(const struct nlattr *attr)
+{
+        static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
+                [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
+                [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 },
+        };
+        struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
+        int error;
+
+        error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
+                                 attr, userspace_policy);
+        if (error)
+                return error;
+
+        if (!a[OVS_USERSPACE_ATTR_PID] ||
+            !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
+                return -EINVAL;
+
+        return 0;
+}
+
+static int validate_actions(const struct nlattr *attr,
+                            const struct sw_flow_key *key, int depth)
+{
+        const struct nlattr *a;
+        int rem, err;
+
+        if (depth >= SAMPLE_ACTION_DEPTH)
+                return -EOVERFLOW;
+
+        nla_for_each_nested(a, attr, rem) {
+                /* Expected argument lengths, (u32)-1 for variable length. */
+                static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
+                        [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
+                        [OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+                        [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
+                        [OVS_ACTION_ATTR_POP_VLAN] = 0,
+                        [OVS_ACTION_ATTR_SET] = (u32)-1,
+                        [OVS_ACTION_ATTR_SAMPLE] = (u32)-1
+                };
+                const struct ovs_action_push_vlan *vlan;
+                int type = nla_type(a);
+
+                if (type > OVS_ACTION_ATTR_MAX ||
+                    (action_lens[type] != nla_len(a) &&
+                     action_lens[type] != (u32)-1))
+                        return -EINVAL;
+
+                switch (type) {
+                case OVS_ACTION_ATTR_UNSPEC:
+                        return -EINVAL;
+
+                case OVS_ACTION_ATTR_USERSPACE:
+                        err = validate_userspace(a);
+                        if (err)
+                                return err;
+                        break;
+
+                case OVS_ACTION_ATTR_OUTPUT:
+                        if (nla_get_u32(a) >= DP_MAX_PORTS)
+                                return -EINVAL;
+                        break;
+
+                case OVS_ACTION_ATTR_POP_VLAN:
+                        break;
+
+                case OVS_ACTION_ATTR_PUSH_VLAN:
+                        vlan = nla_data(a);
+                        if (vlan->vlan_tpid != htons(ETH_P_8021Q))
+                                return -EINVAL;
+                        if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
+                                return -EINVAL;
+                        break;
+
+                case OVS_ACTION_ATTR_SET:
+                        err = validate_set(a, key);
+                        if (err)
+                                return err;
+                        break;
+
+                case OVS_ACTION_ATTR_SAMPLE:
+                        err = validate_sample(a, key, depth);
+                        if (err)
+                                return err;
+                        break;
+
+                default:
+                        return -EINVAL;
+                }
+        }
+
+        if (rem > 0)
+                return -EINVAL;
+
+        return 0;
+}
+
+static void clear_stats(struct sw_flow *flow)
+{
+        flow->used = 0;
+        flow->tcp_flags = 0;
+        flow->packet_count = 0;
+        flow->byte_count = 0;
+}
+
+static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
+{
+        struct ovs_header *ovs_header = info->userhdr;
+        struct nlattr **a = info->attrs;
+        struct sw_flow_actions *acts;
+        struct sk_buff *packet;
+        struct sw_flow *flow;
+        struct datapath *dp;
+        struct ethhdr *eth;
+        int len;
+        int err;
+        int key_len;
+
+        err = -EINVAL;
+        if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
+            !a[OVS_PACKET_ATTR_ACTIONS] ||
+            nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN)
+                goto err;
+
+        len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
+        packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
+        err = -ENOMEM;
+        if (!packet)
+                goto err;
+        skb_reserve(packet, NET_IP_ALIGN);
+
+        memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len);
+
+        skb_reset_mac_header(packet);
+        eth = eth_hdr(packet);
+
+        /* Normally, setting the skb 'protocol' field would be handled by a
+         * call to eth_type_trans(), but it assumes there's a sending
+         * device, which we may not have. */
+        if (ntohs(eth->h_proto) >= 1536)
+                packet->protocol = eth->h_proto;
+        else
+                packet->protocol = htons(ETH_P_802_2);
+
+        /* Build an sw_flow for sending this packet. */
+        flow = ovs_flow_alloc();
+        err = PTR_ERR(flow);
+        if (IS_ERR(flow))
+                goto err_kfree_skb;
+
+        err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
+        if (err)
+                goto err_flow_free;
+
+        err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
+                                             &flow->key.phy.in_port,
+                                             a[OVS_PACKET_ATTR_KEY]);
+        if (err)
+                goto err_flow_free;
+
+        err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
+        if (err)
+                goto err_flow_free;
+
+        flow->hash = ovs_flow_hash(&flow->key, key_len);
+
+        acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
+        err = PTR_ERR(acts);
+        if (IS_ERR(acts))
+                goto err_flow_free;
+        rcu_assign_pointer(flow->sf_acts, acts);
+
+        OVS_CB(packet)->flow = flow;
+        packet->priority = flow->key.phy.priority;
+
+        rcu_read_lock();
+        dp = get_dp(ovs_header->dp_ifindex);
+        err = -ENODEV;
+        if (!dp)
+                goto err_unlock;
+
+        local_bh_disable();
+        err = ovs_execute_actions(dp, packet);
+        local_bh_enable();
+        rcu_read_unlock();
+
+        ovs_flow_free(flow);
+        return err;
+
+err_unlock:
+        rcu_read_unlock();
+err_flow_free:
+        ovs_flow_free(flow);
+err_kfree_skb:
+        kfree_skb(packet);
+err:
+        return err;
+}
+
+static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
+        [OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC },
+        [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
+        [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
+};
+
+static struct genl_ops dp_packet_genl_ops[] = {
+        { .cmd = OVS_PACKET_CMD_EXECUTE,
+          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+          .policy = packet_policy,
+          .doit = ovs_packet_cmd_execute
+        }
+};
+
+static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
+{
+        int i;
+        struct flow_table *table = genl_dereference(dp->table);
+
+        stats->n_flows = ovs_flow_tbl_count(table);
+
+        stats->n_hit = stats->n_missed = stats->n_lost = 0;
+        for_each_possible_cpu(i) {
+                const struct dp_stats_percpu *percpu_stats;
+                struct dp_stats_percpu local_stats;
+                unsigned int start;
+
+                percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
+
+                do {
+                        start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
+                        local_stats = *percpu_stats;
+                } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
+
+                stats->n_hit += local_stats.n_hit;
+                stats->n_missed += local_stats.n_missed;
+                stats->n_lost += local_stats.n_lost;
+        }
+}
+
+static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
+        [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
+        [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
+        [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
+};
+
+static struct genl_family dp_flow_genl_family = {
+        .id = GENL_ID_GENERATE,
+        .hdrsize = sizeof(struct ovs_header),
+        .name = OVS_FLOW_FAMILY,
+        .version = OVS_FLOW_VERSION,
+        .maxattr = OVS_FLOW_ATTR_MAX
+};
+
+static struct genl_multicast_group ovs_dp_flow_multicast_group = {
+        .name = OVS_FLOW_MCGROUP
+};
+
+/* Called with genl_lock. */
+static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
+                                  struct sk_buff *skb, u32 pid,
+                                  u32 seq, u32 flags, u8 cmd)
+{
+        const int skb_orig_len = skb->len;
+        const struct sw_flow_actions *sf_acts;
+        struct ovs_flow_stats stats;
+        struct ovs_header *ovs_header;
+        struct nlattr *nla;
+        unsigned long used;
+        u8 tcp_flags;
+        int err;
+
+        sf_acts = rcu_dereference_protected(flow->sf_acts,
+                                            lockdep_genl_is_held());
+
+        ovs_header = genlmsg_put(skb, pid, seq, &dp_flow_genl_family, flags, cmd);
+        if (!ovs_header)
+                return -EMSGSIZE;
+
+        ovs_header->dp_ifindex = get_dpifindex(dp);
+
+        nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
+        if (!nla)
+                goto nla_put_failure;
+        err = ovs_flow_to_nlattrs(&flow->key, skb);
+        if (err)
+                goto error;
+        nla_nest_end(skb, nla);
+
+        spin_lock_bh(&flow->lock);
+        used = flow->used;
+        stats.n_packets = flow->packet_count;
+        stats.n_bytes = flow->byte_count;
+        tcp_flags = flow->tcp_flags;
+        spin_unlock_bh(&flow->lock);
+
+        if (used)
+                NLA_PUT_U64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used));
+
+        if (stats.n_packets)
+                NLA_PUT(skb, OVS_FLOW_ATTR_STATS,
+                        sizeof(struct ovs_flow_stats), &stats);
+
+        if (tcp_flags)
+                NLA_PUT_U8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags);
+
+        /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
+         * this is the first flow to be dumped into 'skb'.  This is unusual for
+         * Netlink but individual action lists can be longer than
+         * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
+         * The userspace caller can always fetch the actions separately if it
+         * really wants them.  (Most userspace callers in fact don't care.)
+         *
+         * This can only fail for dump operations because the skb is always
+         * properly sized for single flows.
+         */
+        err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
+                      sf_acts->actions);
+        if (err < 0 && skb_orig_len)
+                goto error;
+
+        return genlmsg_end(skb, ovs_header);
+
+nla_put_failure:
+        err = -EMSGSIZE;
+error:
+        genlmsg_cancel(skb, ovs_header);
+        return err;
+}
+
+static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
+{
+        const struct sw_flow_actions *sf_acts;
+        int len;
+
+        sf_acts = rcu_dereference_protected(flow->sf_acts,
+                                            lockdep_genl_is_held());
+
+        /* OVS_FLOW_ATTR_KEY */
+        len = nla_total_size(FLOW_BUFSIZE);
+        /* OVS_FLOW_ATTR_ACTIONS */
+        len += nla_total_size(sf_acts->actions_len);
+        /* OVS_FLOW_ATTR_STATS */
+        len += nla_total_size(sizeof(struct ovs_flow_stats));
+        /* OVS_FLOW_ATTR_TCP_FLAGS */
+        len += nla_total_size(1);
+        /* OVS_FLOW_ATTR_USED */
+        len += nla_total_size(8);
+
+        len += NLMSG_ALIGN(sizeof(struct ovs_header));
+
+        return genlmsg_new(len, GFP_KERNEL);
+}
+
+static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
+                                               struct datapath *dp,
+                                               u32 pid, u32 seq, u8 cmd)
+{
+        struct sk_buff *skb;
+        int retval;
+
+        skb = ovs_flow_cmd_alloc_info(flow);
+        if (!skb)
+                return ERR_PTR(-ENOMEM);
+
+        retval = ovs_flow_cmd_fill_info(flow, dp, skb, pid, seq, 0, cmd);
+        BUG_ON(retval < 0);
+        return skb;
+}
+
+static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
+{
+        struct nlattr **a = info->attrs;
+        struct ovs_header *ovs_header = info->userhdr;
+        struct sw_flow_key key;
+        struct sw_flow *flow;
+        struct sk_buff *reply;
+        struct datapath *dp;
+        struct flow_table *table;
+        int error;
+        int key_len;
+
+        /* Extract key. */
+        error = -EINVAL;
+        if (!a[OVS_FLOW_ATTR_KEY])
+                goto error;
+        error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+        if (error)
+                goto error;
+
+        /* Validate actions. */
+        if (a[OVS_FLOW_ATTR_ACTIONS]) {
+                error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0);
+                if (error)
+                        goto error;
+        } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
+                error = -EINVAL;
+                goto error;
+        }
+
+        dp = get_dp(ovs_header->dp_ifindex);
+        error = -ENODEV;
+        if (!dp)
+                goto error;
+
+        table = genl_dereference(dp->table);
+        flow = ovs_flow_tbl_lookup(table, &key, key_len);
+        if (!flow) {
+                struct sw_flow_actions *acts;
+
+                /* Bail out if we're not allowed to create a new flow. */
+                error = -ENOENT;
+                if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
+                        goto error;
+
+                /* Expand table, if necessary, to make room. */
+                if (ovs_flow_tbl_need_to_expand(table)) {
+                        struct flow_table *new_table;
+
+                        new_table = ovs_flow_tbl_expand(table);
+                        if (!IS_ERR(new_table)) {
+                                rcu_assign_pointer(dp->table, new_table);
+                                ovs_flow_tbl_deferred_destroy(table);
+                                table = genl_dereference(dp->table);
+                        }
+                }
+
+                /* Allocate flow. */
+                flow = ovs_flow_alloc();
+                if (IS_ERR(flow)) {
+                        error = PTR_ERR(flow);
+                        goto error;
+                }
+                flow->key = key;
+                clear_stats(flow);
+
+                /* Obtain actions. */
+                acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
+                error = PTR_ERR(acts);
+                if (IS_ERR(acts))
+                        goto error_free_flow;
+                rcu_assign_pointer(flow->sf_acts, acts);
+
+                /* Put flow in bucket. */
+                flow->hash = ovs_flow_hash(&key, key_len);
+                ovs_flow_tbl_insert(table, flow);
+
+                reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
+                                                info->snd_seq,
+                                                OVS_FLOW_CMD_NEW);
+        } else {
+                /* We found a matching flow. */
+                struct sw_flow_actions *old_acts;
+                struct nlattr *acts_attrs;
+
+                /* Bail out if we're not allowed to modify an existing flow.
+                 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
+                 * because Generic Netlink treats the latter as a dump
+                 * request.  We also accept NLM_F_EXCL in case that bug ever
+                 * gets fixed.
+                 */
+                error = -EEXIST;
+                if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
+                    info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
+                        goto error;
+
+                /* Update actions. */
+                old_acts = rcu_dereference_protected(flow->sf_acts,
+                                                     lockdep_genl_is_held());
+                acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
+                if (acts_attrs &&
+                    (old_acts->actions_len != nla_len(acts_attrs) ||
+                     memcmp(old_acts->actions, nla_data(acts_attrs),
+                            old_acts->actions_len))) {
+                        struct sw_flow_actions *new_acts;
+
+                        new_acts = ovs_flow_actions_alloc(acts_attrs);
+                        error = PTR_ERR(new_acts);
+                        if (IS_ERR(new_acts))
+                                goto error;
+
+                        rcu_assign_pointer(flow->sf_acts, new_acts);
+                        ovs_flow_deferred_free_acts(old_acts);
+                }
+
+                reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
+                                                info->snd_seq, OVS_FLOW_CMD_NEW);
+
+                /* Clear stats. */
+                if (a[OVS_FLOW_ATTR_CLEAR]) {
+                        spin_lock_bh(&flow->lock);
+                        clear_stats(flow);
+                        spin_unlock_bh(&flow->lock);
+                }
+        }
+
+        if (!IS_ERR(reply))
+                genl_notify(reply, genl_info_net(info), info->snd_pid,
+                            ovs_dp_flow_multicast_group.id, info->nlhdr,
+                            GFP_KERNEL);
+        else
+                netlink_set_err(init_net.genl_sock, 0,
+                                ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
+        return 0;
+
+error_free_flow:
+        ovs_flow_free(flow);
+error:
+        return error;
+}
+
+static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+        struct nlattr **a = info->attrs;
+        struct ovs_header *ovs_header = info->userhdr;
+        struct sw_flow_key key;
+        struct sk_buff *reply;
+        struct sw_flow *flow;
+        struct datapath *dp;
+        struct flow_table *table;
+        int err;
+        int key_len;
+
+        if (!a[OVS_FLOW_ATTR_KEY])
+                return -EINVAL;
+        err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+        if (err)
+                return err;
+
+        dp = get_dp(ovs_header->dp_ifindex);
+        if (!dp)
+                return -ENODEV;
+
+        table = genl_dereference(dp->table);
+        flow = ovs_flow_tbl_lookup(table, &key, key_len);
+        if (!flow)
+                return -ENOENT;
+
+        reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
+                                        info->snd_seq, OVS_FLOW_CMD_NEW);
+        if (IS_ERR(reply))
+                return PTR_ERR(reply);
+
+        return genlmsg_reply(reply, info);
+}
+
+static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+        struct nlattr **a = info->attrs;
+        struct ovs_header *ovs_header = info->userhdr;
+        struct sw_flow_key key;
+        struct sk_buff *reply;
+        struct sw_flow *flow;
+        struct datapath *dp;
+        struct flow_table *table;
+        int err;
+        int key_len;
+
+        if (!a[OVS_FLOW_ATTR_KEY])
+                return flush_flows(ovs_header->dp_ifindex);
+        err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+        if (err)
+                return err;
+
+        dp = get_dp(ovs_header->dp_ifindex);
+        if (!dp)
+                return -ENODEV;
+
+        table = genl_dereference(dp->table);
+        flow = ovs_flow_tbl_lookup(table, &key, key_len);
+        if (!flow)
+                return -ENOENT;
+
+        reply = ovs_flow_cmd_alloc_info(flow);
+        if (!reply)
+                return -ENOMEM;
+
+        ovs_flow_tbl_remove(table, flow);
+
+        err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
+                                     info->snd_seq, 0, OVS_FLOW_CMD_DEL);
+        BUG_ON(err < 0);
+
+        ovs_flow_deferred_free(flow);
+
+        genl_notify(reply, genl_info_net(info), info->snd_pid,
+                    ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL);
+        return 0;
+}
+
+static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+        struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
+        struct datapath *dp;
+        struct flow_table *table;
+
+        dp = get_dp(ovs_header->dp_ifindex);
+        if (!dp)
+                return -ENODEV;
+
+        table = genl_dereference(dp->table);
+
+        for (;;) {
+                struct sw_flow *flow;
+                u32 bucket, obj;
+
+                bucket = cb->args[0];
+                obj = cb->args[1];
+                flow = ovs_flow_tbl_next(table, &bucket, &obj);
+                if (!flow)
+                        break;
+
+                if (ovs_flow_cmd_fill_info(flow, dp, skb,
+                                           NETLINK_CB(cb->skb).pid,
+                                           cb->nlh->nlmsg_seq, NLM_F_MULTI,
+                                           OVS_FLOW_CMD_NEW) < 0)
+                        break;
+
+                cb->args[0] = bucket;
+                cb->args[1] = obj;
+        }
+        return skb->len;
+}
+
+static struct genl_ops dp_flow_genl_ops[] = {
+        { .cmd = OVS_FLOW_CMD_NEW,
+          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+          .policy = flow_policy,
+          .doit = ovs_flow_cmd_new_or_set
+        },
+        { .cmd = OVS_FLOW_CMD_DEL,
+          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+          .policy = flow_policy,
+          .doit = ovs_flow_cmd_del
+        },
+        { .cmd = OVS_FLOW_CMD_GET,
+          .flags = 0, /* OK for unprivileged users. */
+          .policy = flow_policy,
+          .doit = ovs_flow_cmd_get,
+          .dumpit = ovs_flow_cmd_dump
+        },
+        { .cmd = OVS_FLOW_CMD_SET,
+          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+          .policy = flow_policy,
+          .doit = ovs_flow_cmd_new_or_set,
+        },
+};
+
+static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
+        [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+        [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+};
+
+static struct genl_family dp_datapath_genl_family = {
+        .id = GENL_ID_GENERATE,
+        .hdrsize = sizeof(struct ovs_header),
+        .name = OVS_DATAPATH_FAMILY,
+        .version = OVS_DATAPATH_VERSION,
+        .maxattr = OVS_DP_ATTR_MAX
+};
+
+static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
+        .name = OVS_DATAPATH_MCGROUP
+};
+
+static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
+                                u32 pid, u32 seq, u32 flags, u8 cmd)
+{
+        struct ovs_header *ovs_header;
+        struct ovs_dp_stats dp_stats;
+        int err;
+
+        ovs_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family,
+                                 flags, cmd);
+        if (!ovs_header)
+                goto error;
+
+        ovs_header->dp_ifindex = get_dpifindex(dp);
+
+        rcu_read_lock();
+        err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
+        rcu_read_unlock();
+        if (err)
+                goto nla_put_failure;
+
+        get_dp_stats(dp, &dp_stats);
+        NLA_PUT(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats);
+
+        return genlmsg_end(skb, ovs_header);
+
+nla_put_failure:
+        genlmsg_cancel(skb, ovs_header);
+error:
+        return -EMSGSIZE;
+}
+
+static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 pid,
+                                             u32 seq, u8 cmd)
+{
+        struct sk_buff *skb;
+        int retval;
+
+        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+        if (!skb)
+                return ERR_PTR(-ENOMEM);
+
+        retval = ovs_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd);
+        if (retval < 0) {
+                kfree_skb(skb);
+                return ERR_PTR(retval);
+        }
+        return skb;
+}
+
+/* Called with genl_mutex and optionally with RTNL lock also. */
+static struct datapath *lookup_datapath(struct ovs_header *ovs_header,
+                                        struct nlattr *a[OVS_DP_ATTR_MAX + 1])
+{
+        struct datapath *dp;
+
+        if (!a[OVS_DP_ATTR_NAME])
+                dp = get_dp(ovs_header->dp_ifindex);
+        else {
+                struct vport *vport;
+
+                rcu_read_lock();
+                vport = ovs_vport_locate(nla_data(a[OVS_DP_ATTR_NAME]));
+                dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
+                rcu_read_unlock();
+        }
+        return dp ? dp : ERR_PTR(-ENODEV);
+}
+
+static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
+{
+        struct nlattr **a = info->attrs;
+        struct vport_parms parms;
+        struct sk_buff *reply;
+        struct datapath *dp;
+        struct vport *vport;
+        int err;
+
+        err = -EINVAL;
+        if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
+                goto err;
+
+        rtnl_lock();
+        err = -ENODEV;
+        if (!try_module_get(THIS_MODULE))
+                goto err_unlock_rtnl;
+
+        err = -ENOMEM;
+        dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+        if (dp == NULL)
+                goto err_put_module;
+        INIT_LIST_HEAD(&dp->port_list);
+
+        /* Allocate table. */
+        err = -ENOMEM;
+        rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
+        if (!dp->table)
+                goto err_free_dp;
+
+        dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
+        if (!dp->stats_percpu) {
+                err = -ENOMEM;
+                goto err_destroy_table;
+        }
+
+        /* Set up our datapath device. */
+        parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
+        parms.type = OVS_VPORT_TYPE_INTERNAL;
+        parms.options = NULL;
+        parms.dp = dp;
+        parms.port_no = OVSP_LOCAL;
+        parms.upcall_pid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
+
+        vport = new_vport(&parms);
+        if (IS_ERR(vport)) {
+                err = PTR_ERR(vport);
+                if (err == -EBUSY)
+                        err = -EEXIST;
+
+                goto err_destroy_percpu;
+        }
+
+        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+                                      info->snd_seq, OVS_DP_CMD_NEW);
+        err = PTR_ERR(reply);
+        if (IS_ERR(reply))
+                goto err_destroy_local_port;
+
+        list_add_tail(&dp->list_node, &dps);
+        rtnl_unlock();
+
+        genl_notify(reply, genl_info_net(info), info->snd_pid,
+                    ovs_dp_datapath_multicast_group.id, info->nlhdr,
+                    GFP_KERNEL);
+        return 0;
+
+err_destroy_local_port:
+        ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
+err_destroy_percpu:
+        free_percpu(dp->stats_percpu);
+err_destroy_table:
+        ovs_flow_tbl_destroy(genl_dereference(dp->table));
+err_free_dp:
+        kfree(dp);
+err_put_module:
+        module_put(THIS_MODULE);
+err_unlock_rtnl:
+        rtnl_unlock();
+err:
+        return err;
+}
+
+static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+        struct vport *vport, *next_vport;
+        struct sk_buff *reply;
+        struct datapath *dp;
+        int err;
+
+        rtnl_lock();
+        dp = lookup_datapath(info->userhdr, info->attrs);
+        err = PTR_ERR(dp);
+        if (IS_ERR(dp))
+                goto exit_unlock;
+
+        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+                                      info->snd_seq, OVS_DP_CMD_DEL);
+        err = PTR_ERR(reply);
+        if (IS_ERR(reply))
+                goto exit_unlock;
+
+        list_for_each_entry_safe(vport, next_vport, &dp->port_list, node)
+                if (vport->port_no != OVSP_LOCAL)
+                        ovs_dp_detach_port(vport);
+
+        list_del(&dp->list_node);
+        ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
+
+        /* rtnl_unlock() will wait until all the references to devices that
+         * are pending unregistration have been dropped.  We do it here to
+         * ensure that any internal devices (which contain DP pointers) are
+         * fully destroyed before freeing the datapath.
+         */
+        rtnl_unlock();
+
+        call_rcu(&dp->rcu, destroy_dp_rcu);
+        module_put(THIS_MODULE);
+
+        genl_notify(reply, genl_info_net(info), info->snd_pid,
+                    ovs_dp_datapath_multicast_group.id, info->nlhdr,
+                    GFP_KERNEL);
+
+        return 0;
+
+exit_unlock:
+        rtnl_unlock();
+        return err;
+}
+
+static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+        struct sk_buff *reply;
+        struct datapath *dp;
+        int err;
+
+        dp = lookup_datapath(info->userhdr, info->attrs);
+        if (IS_ERR(dp))
+                return PTR_ERR(dp);
+
+        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+                                      info->snd_seq, OVS_DP_CMD_NEW);
+        if (IS_ERR(reply)) {
+                err = PTR_ERR(reply);
+                netlink_set_err(init_net.genl_sock, 0,
+                                ovs_dp_datapath_multicast_group.id, err);
+                return 0;
+        }
+
+        genl_notify(reply, genl_info_net(info), info->snd_pid,
+                    ovs_dp_datapath_multicast_group.id, info->nlhdr,
+                    GFP_KERNEL);
+
+        return 0;
+}
+
+static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+        struct sk_buff *reply;
+        struct datapath *dp;
+
+        dp = lookup_datapath(info->userhdr, info->attrs);
+        if (IS_ERR(dp))
+                return PTR_ERR(dp);
+
+        reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
+                                      info->snd_seq, OVS_DP_CMD_NEW);
+        if (IS_ERR(reply))
+                return PTR_ERR(reply);
+
+        return genlmsg_reply(reply, info);
+}
+
+static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+        struct datapath *dp;
+        int skip = cb->args[0];
+        int i = 0;
+
+        list_for_each_entry(dp, &dps, list_node) {
+                if (i >= skip &&
+                    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid,
+                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
+                                         OVS_DP_CMD_NEW) < 0)
+                        break;
+                i++;
+        }
+
+        cb->args[0] = i;
+
+        return skb->len;
+}
+
+static struct genl_ops dp_datapath_genl_ops[] = {
+        { .cmd = OVS_DP_CMD_NEW,
+          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+          .policy = datapath_policy,
+          .doit = ovs_dp_cmd_new
+        },
+        { .cmd = OVS_DP_CMD_DEL,
+          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+          .policy = datapath_policy,
+          .doit = ovs_dp_cmd_del
+        },
+        { .cmd = OVS_DP_CMD_GET,
+          .flags = 0, /* OK for unprivileged users. */
+          .policy = datapath_policy,
+          .doit = ovs_dp_cmd_get,
+          .dumpit = ovs_dp_cmd_dump
+        },
+        { .cmd = OVS_DP_CMD_SET,
+          .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+          .policy = datapath_policy,
+          .doit = ovs_dp_cmd_set,
+        },
+};
+
+static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
+        [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+        [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
+        [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
+        [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
+        [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
+        [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
+};
+
+static struct genl_family dp_vport_genl_family = {
+        .id = GENL_ID_GENERATE,
+        .hdrsize = sizeof(struct ovs_header),
+        .name = OVS_VPORT_FAMILY,
+        .version = OVS_VPORT_VERSION,
+        .maxattr = OVS_VPORT_ATTR_MAX
+};
+
+struct genl_multicast_group ovs_dp_vport_multicast_group = {
+        .name = OVS_VPORT_MCGROUP
+};
+
+/* Called with RTNL lock or RCU read lock. */
+static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
+                                   u32 pid, u32 seq, u32 flags, u8 cmd)
+{
+        struct ovs_header *ovs_header;
+        struct ovs_vport_stats vport_stats;
+        int err;
+
+        ovs_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
+                                 flags, cmd);
+        if (!ovs_header)
+                return -EMSGSIZE;
+
+        ovs_header->dp_ifindex = get_dpifindex(vport->dp);
+
+        NLA_PUT_U32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
+        NLA_PUT_U32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type);
+        NLA_PUT_STRING(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport));
+        NLA_PUT_U32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_pid);
+
+        ovs_vport_get_stats(vport, &vport_stats);
+        NLA_PUT(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
+                &vport_stats);
+
+        err = ovs_vport_get_options(vport, skb);
+        if (err == -EMSGSIZE)
+                goto error;
+
+        return genlmsg_end(skb, ovs_header);
+
+nla_put_failure:
+        err = -EMSGSIZE;
+error:
+        genlmsg_cancel(skb, ovs_header);
+        return err;
+}
+
+/* Called with RTNL lock or RCU read lock. */
+struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid,
+                                         u32 seq, u8 cmd)
+{
+        struct sk_buff *skb;
+        int retval;
+
+        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+        if (!skb)
+                return ERR_PTR(-ENOMEM);
+
+        retval = ovs_vport_cmd_fill_info(vport, skb, pid, seq, 0, cmd);
+        if (retval < 0) {
+                kfree_skb(skb);
+                return ERR_PTR(retval);
+        }
+        return skb;
+}
+
+/* Called with RTNL lock or RCU read lock. */
+static struct vport *lookup_vport(struct ovs_header *ovs_header,
+                                  struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
+{
+        struct datapath *dp;
+        struct vport *vport;
+
+        if (a[OVS_VPORT_ATTR_NAME]) {
+                vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME]));
+                if (!vport)
+                        return ERR_PTR(-ENODEV);
+                return vport;
+        } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
+                u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
+
+                if (port_no >= DP_MAX_PORTS)
+                        return ERR_PTR(-EFBIG);
+
+                dp = get_dp(ovs_header->dp_ifindex);
+                if (!dp)
+                        return ERR_PTR(-ENODEV);
+
+                vport = rcu_dereference_rtnl(dp->ports[port_no]);
+                if (!vport)
+                        return ERR_PTR(-ENOENT);
+                return vport;
+        } else
+                return ERR_PTR(-EINVAL);
+}
1542
1543static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
1544{
1545 struct nlattr **a = info->attrs;
1546 struct ovs_header *ovs_header = info->userhdr;
1547 struct vport_parms parms;
1548 struct sk_buff *reply;
1549 struct vport *vport;
1550 struct datapath *dp;
1551 u32 port_no;
1552 int err;
1553
1554 err = -EINVAL;
1555 if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
1556 !a[OVS_VPORT_ATTR_UPCALL_PID])
1557 goto exit;
1558
1559 rtnl_lock();
1560 dp = get_dp(ovs_header->dp_ifindex);
1561 err = -ENODEV;
1562 if (!dp)
1563 goto exit_unlock;
1564
1565 if (a[OVS_VPORT_ATTR_PORT_NO]) {
1566 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1567
1568 err = -EFBIG;
1569 if (port_no >= DP_MAX_PORTS)
1570 goto exit_unlock;
1571
1572 vport = rtnl_dereference(dp->ports[port_no]);
1573 err = -EBUSY;
1574 if (vport)
1575 goto exit_unlock;
1576 } else {
1577 for (port_no = 1; ; port_no++) {
1578 if (port_no >= DP_MAX_PORTS) {
1579 err = -EFBIG;
1580 goto exit_unlock;
1581 }
1582 vport = rtnl_dereference(dp->ports[port_no]);
1583 if (!vport)
1584 break;
1585 }
1586 }
1587
1588 parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
1589 parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
1590 parms.options = a[OVS_VPORT_ATTR_OPTIONS];
1591 parms.dp = dp;
1592 parms.port_no = port_no;
1593 parms.upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
1594
1595 vport = new_vport(&parms);
1596 err = IS_ERR(vport) ? PTR_ERR(vport) : 0; /* must be 0 on success */
1597 if (IS_ERR(vport))
1598 goto exit_unlock;
1599
1600 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1601 OVS_VPORT_CMD_NEW);
1602 if (IS_ERR(reply)) {
1603 err = PTR_ERR(reply);
1604 ovs_dp_detach_port(vport);
1605 goto exit_unlock;
1606 }
1607 genl_notify(reply, genl_info_net(info), info->snd_pid,
1608 ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1609
1610exit_unlock:
1611 rtnl_unlock();
1612exit:
1613 return err;
1614}
1615
1616static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
1617{
1618 struct nlattr **a = info->attrs;
1619 struct sk_buff *reply;
1620 struct vport *vport;
1621 int err;
1622
1623 rtnl_lock();
1624 vport = lookup_vport(info->userhdr, a);
1625 err = PTR_ERR(vport);
1626 if (IS_ERR(vport))
1627 goto exit_unlock;
1628
1629 err = 0;
1630 if (a[OVS_VPORT_ATTR_TYPE] &&
1631 nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type)
1632 err = -EINVAL;
1633
1634 if (!err && a[OVS_VPORT_ATTR_OPTIONS])
1635 err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
1636 if (!err && a[OVS_VPORT_ATTR_UPCALL_PID])
1637 vport->upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
1638
1639 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1640 OVS_VPORT_CMD_NEW);
1641 if (IS_ERR(reply)) {
1642 err = PTR_ERR(reply);
1643 netlink_set_err(init_net.genl_sock, 0,
1644 ovs_dp_vport_multicast_group.id, err);
1645 goto exit_unlock; /* must not return with the RTNL lock held */
1646 }
1647
1648 genl_notify(reply, genl_info_net(info), info->snd_pid,
1649 ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1650
1651exit_unlock:
1652 rtnl_unlock();
1653 return err;
1654}
1655
1656static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
1657{
1658 struct nlattr **a = info->attrs;
1659 struct sk_buff *reply;
1660 struct vport *vport;
1661 int err;
1662
1663 rtnl_lock();
1664 vport = lookup_vport(info->userhdr, a);
1665 err = PTR_ERR(vport);
1666 if (IS_ERR(vport))
1667 goto exit_unlock;
1668
1669 if (vport->port_no == OVSP_LOCAL) {
1670 err = -EINVAL;
1671 goto exit_unlock;
1672 }
1673
1674 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1675 OVS_VPORT_CMD_DEL);
1676 err = IS_ERR(reply) ? PTR_ERR(reply) : 0; /* 0 on success */
1677 if (IS_ERR(reply))
1678 goto exit_unlock;
1679
1680 ovs_dp_detach_port(vport);
1681
1682 genl_notify(reply, genl_info_net(info), info->snd_pid,
1683 ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1684
1685exit_unlock:
1686 rtnl_unlock();
1687 return err;
1688}
1689
1690static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
1691{
1692 struct nlattr **a = info->attrs;
1693 struct ovs_header *ovs_header = info->userhdr;
1694 struct sk_buff *reply;
1695 struct vport *vport;
1696 int err;
1697
1698 rcu_read_lock();
1699 vport = lookup_vport(ovs_header, a);
1700 err = PTR_ERR(vport);
1701 if (IS_ERR(vport))
1702 goto exit_unlock;
1703
1704 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1705 OVS_VPORT_CMD_NEW);
1706 err = PTR_ERR(reply);
1707 if (IS_ERR(reply))
1708 goto exit_unlock;
1709
1710 rcu_read_unlock();
1711
1712 return genlmsg_reply(reply, info);
1713
1714exit_unlock:
1715 rcu_read_unlock();
1716 return err;
1717}
1718
1719static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1720{
1721 struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1722 struct datapath *dp;
1723 u32 port_no;
1724 int retval;
1725
1726 dp = get_dp(ovs_header->dp_ifindex);
1727 if (!dp)
1728 return -ENODEV;
1729
1730 rcu_read_lock();
1731 for (port_no = cb->args[0]; port_no < DP_MAX_PORTS; port_no++) {
1732 struct vport *vport;
1733
1734 vport = rcu_dereference(dp->ports[port_no]);
1735 if (!vport)
1736 continue;
1737
1738 if (ovs_vport_cmd_fill_info(vport, skb, NETLINK_CB(cb->skb).pid,
1739 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1740 OVS_VPORT_CMD_NEW) < 0)
1741 break;
1742 }
1743 rcu_read_unlock();
1744
1745 cb->args[0] = port_no;
1746 retval = skb->len;
1747
1748 return retval;
1749}
1750
1751static void rehash_flow_table(struct work_struct *work)
1752{
1753 struct datapath *dp;
1754
1755 genl_lock();
1756
1757 list_for_each_entry(dp, &dps, list_node) {
1758 struct flow_table *old_table = genl_dereference(dp->table);
1759 struct flow_table *new_table;
1760
1761 new_table = ovs_flow_tbl_rehash(old_table);
1762 if (!IS_ERR(new_table)) {
1763 rcu_assign_pointer(dp->table, new_table);
1764 ovs_flow_tbl_deferred_destroy(old_table);
1765 }
1766 }
1767
1768 genl_unlock();
1769
1770 schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
1771}
1772
1773static struct genl_ops dp_vport_genl_ops[] = {
1774 { .cmd = OVS_VPORT_CMD_NEW,
1775 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1776 .policy = vport_policy,
1777 .doit = ovs_vport_cmd_new
1778 },
1779 { .cmd = OVS_VPORT_CMD_DEL,
1780 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1781 .policy = vport_policy,
1782 .doit = ovs_vport_cmd_del
1783 },
1784 { .cmd = OVS_VPORT_CMD_GET,
1785 .flags = 0, /* OK for unprivileged users. */
1786 .policy = vport_policy,
1787 .doit = ovs_vport_cmd_get,
1788 .dumpit = ovs_vport_cmd_dump
1789 },
1790 { .cmd = OVS_VPORT_CMD_SET,
1791 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1792 .policy = vport_policy,
1793 .doit = ovs_vport_cmd_set,
1794 },
1795};
1796
1797struct genl_family_and_ops {
1798 struct genl_family *family;
1799 struct genl_ops *ops;
1800 int n_ops;
1801 struct genl_multicast_group *group;
1802};
1803
1804static const struct genl_family_and_ops dp_genl_families[] = {
1805 { &dp_datapath_genl_family,
1806 dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
1807 &ovs_dp_datapath_multicast_group },
1808 { &dp_vport_genl_family,
1809 dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops),
1810 &ovs_dp_vport_multicast_group },
1811 { &dp_flow_genl_family,
1812 dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops),
1813 &ovs_dp_flow_multicast_group },
1814 { &dp_packet_genl_family,
1815 dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
1816 NULL },
1817};
1818
1819static void dp_unregister_genl(int n_families)
1820{
1821 int i;
1822
1823 for (i = 0; i < n_families; i++)
1824 genl_unregister_family(dp_genl_families[i].family);
1825}
1826
1827static int dp_register_genl(void)
1828{
1829 int n_registered;
1830 int err;
1831 int i;
1832
1833 n_registered = 0;
1834 for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
1835 const struct genl_family_and_ops *f = &dp_genl_families[i];
1836
1837 err = genl_register_family_with_ops(f->family, f->ops,
1838 f->n_ops);
1839 if (err)
1840 goto error;
1841 n_registered++;
1842
1843 if (f->group) {
1844 err = genl_register_mc_group(f->family, f->group);
1845 if (err)
1846 goto error;
1847 }
1848 }
1849
1850 return 0;
1851
1852error:
1853 dp_unregister_genl(n_registered);
1854 return err;
1855}
1856
1857static int __init dp_init(void)
1858{
1859 struct sk_buff *dummy_skb;
1860 int err;
1861
1862 BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));
1863
1864 pr_info("Open vSwitch switching datapath\n");
1865
1866 err = ovs_flow_init();
1867 if (err)
1868 goto error;
1869
1870 err = ovs_vport_init();
1871 if (err)
1872 goto error_flow_exit;
1873
1874 err = register_netdevice_notifier(&ovs_dp_device_notifier);
1875 if (err)
1876 goto error_vport_exit;
1877
1878 err = dp_register_genl();
1879 if (err < 0)
1880 goto error_unreg_notifier;
1881
1882 schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
1883
1884 return 0;
1885
1886error_unreg_notifier:
1887 unregister_netdevice_notifier(&ovs_dp_device_notifier);
1888error_vport_exit:
1889 ovs_vport_exit();
1890error_flow_exit:
1891 ovs_flow_exit();
1892error:
1893 return err;
1894}
1895
1896static void dp_cleanup(void)
1897{
1898 cancel_delayed_work_sync(&rehash_flow_wq);
1899 rcu_barrier();
1900 dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
1901 unregister_netdevice_notifier(&ovs_dp_device_notifier);
1902 ovs_vport_exit();
1903 ovs_flow_exit();
1904}
1905
1906module_init(dp_init);
1907module_exit(dp_cleanup);
1908
1909MODULE_DESCRIPTION("Open vSwitch switching datapath");
1910MODULE_LICENSE("GPL");
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
new file mode 100644
index 000000000000..c73370cc1f02
--- /dev/null
+++ b/net/openvswitch/datapath.h
@@ -0,0 +1,124 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef DATAPATH_H
20#define DATAPATH_H 1
21
22#include <asm/page.h>
23#include <linux/kernel.h>
24#include <linux/mutex.h>
25#include <linux/netdevice.h>
26#include <linux/skbuff.h>
27#include <linux/u64_stats_sync.h>
28
29#include "flow.h"
30
31struct vport;
32
33#define DP_MAX_PORTS 1024
34#define SAMPLE_ACTION_DEPTH 3
35
36/**
37 * struct dp_stats_percpu - per-cpu packet processing statistics for a given
38 * datapath.
39 * @n_hit: Number of received packets for which a matching flow was found in
40 * the flow table.
41 * @n_missed: Number of received packets that had no matching flow in the flow
42 * table. The sum of @n_hit and @n_missed is the number of packets that have
43 * been received by the datapath.
44 * @n_lost: Number of received packets that had no matching flow in the flow
45 * table that could not be sent to userspace (normally due to an overflow in
46 * one of the datapath's queues).
47 */
48struct dp_stats_percpu {
49 u64 n_hit;
50 u64 n_missed;
51 u64 n_lost;
52 struct u64_stats_sync sync;
53};
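The u64_stats_sync member is what makes these 64-bit counters safe to read on 32-bit hosts, where a counter update is not a single store. A minimal reader, sketched on the assumption of a summing helper living next to the datapath code (the helper name is hypothetical; the stats-reporting path in datapath.c follows the same pattern):

static void dp_sum_stats(const struct datapath *dp, struct dp_stats_percpu *sum)
{
	int cpu;

	memset(sum, 0, sizeof(*sum));
	for_each_possible_cpu(cpu) {
		const struct dp_stats_percpu *s = per_cpu_ptr(dp->stats_percpu, cpu);
		u64 hit, missed, lost;
		unsigned int start;

		/* Retry if a writer updated the counters mid-read. */
		do {
			start = u64_stats_fetch_begin(&s->sync);
			hit = s->n_hit;
			missed = s->n_missed;
			lost = s->n_lost;
		} while (u64_stats_fetch_retry(&s->sync, start));

		sum->n_hit += hit;
		sum->n_missed += missed;
		sum->n_lost += lost;
	}
}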
54
55/**
56 * struct datapath - datapath for flow-based packet switching
57 * @rcu: RCU callback head for deferred destruction.
58 * @list_node: Element in global 'dps' list.
59 * @n_flows: Number of flows currently in flow table.
60 * @table: Current flow table. Protected by genl_lock and RCU.
61 * @ports: Map from port number to &struct vport. %OVSP_LOCAL port
62 * always exists, other ports may be %NULL. Protected by RTNL and RCU.
63 * @port_list: List of all ports in @ports in arbitrary order. RTNL required
64 * to iterate or modify.
65 * @stats_percpu: Per-CPU datapath statistics.
66 *
67 * Context: See the comment on locking at the top of datapath.c for additional
68 * locking information.
69 */
70struct datapath {
71 struct rcu_head rcu;
72 struct list_head list_node;
73
74 /* Flow table. */
75 struct flow_table __rcu *table;
76
77 /* Switch ports. */
78 struct vport __rcu *ports[DP_MAX_PORTS];
79 struct list_head port_list;
80
81 /* Stats. */
82 struct dp_stats_percpu __percpu *stats_percpu;
83};
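Concretely, the "genl_lock and RCU" rule on @table means packet-path readers pin the table with rcu_read_lock() and fetch the pointer inside the read-side section, since rehash_flow_table() in datapath.c may publish a replacement at any time. An illustrative read-side helper (the function name is hypothetical; the receive path in datapath.c does the same inline):

/* Caller must hold rcu_read_lock(); the returned flow is only valid
 * inside that RCU read-side critical section. */
static struct sw_flow *dp_lookup_flow(struct datapath *dp,
				      struct sw_flow_key *key, int key_len)
{
	struct flow_table *table = rcu_dereference(dp->table);

	return ovs_flow_tbl_lookup(table, key, key_len);
}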
84
85/**
86 * struct ovs_skb_cb - OVS data in skb CB
87 * @flow: The flow associated with this packet. May be %NULL if no flow.
88 */
89struct ovs_skb_cb {
90 struct sw_flow *flow;
91};
92#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
93
94/**
95 * struct dp_upcall - metadata to include with a packet to send to userspace
96 * @cmd: One of %OVS_PACKET_CMD_*.
97 * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull.
98 * @userdata: If nonnull, its u64 value is extracted and passed to userspace as
99 * %OVS_PACKET_ATTR_USERDATA.
100 * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no
101 * packet is sent and the packet is accounted in the datapath's @n_lost
102 * counter.
103 */
104struct dp_upcall_info {
105 u8 cmd;
106 const struct sw_flow_key *key;
107 const struct nlattr *userdata;
108 u32 pid;
109};
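A sketch of how the flow-miss path fills this structure in, modeled on the miss handling in datapath.c (the wrapper itself is hypothetical): the packet and its extracted key are queued to the userspace daemon registered through OVS_VPORT_ATTR_UPCALL_PID.

static int upcall_miss(struct datapath *dp, struct sk_buff *skb,
		       const struct sw_flow_key *key, u32 upcall_pid)
{
	struct dp_upcall_info upcall = {
		.cmd		= OVS_PACKET_CMD_MISS,
		.key		= key,
		.userdata	= NULL,	/* no OVS_PACKET_ATTR_USERDATA */
		.pid		= upcall_pid,
	};

	/* Queues the packet as OVS_PACKET_CMD_MISS; a pid of 0 drops it
	 * and bumps the datapath's n_lost counter instead. */
	return ovs_dp_upcall(dp, skb, &upcall);
}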
110
111extern struct notifier_block ovs_dp_device_notifier;
112extern struct genl_multicast_group ovs_dp_vport_multicast_group;
113
114void ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
115void ovs_dp_detach_port(struct vport *);
116int ovs_dp_upcall(struct datapath *, struct sk_buff *,
117 const struct dp_upcall_info *);
118
119const char *ovs_dp_name(const struct datapath *dp);
120struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
121 u8 cmd);
122
123int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
124#endif /* datapath.h */
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
new file mode 100644
index 000000000000..46736518c453
--- /dev/null
+++ b/net/openvswitch/dp_notify.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include <linux/netdevice.h>
20#include <net/genetlink.h>
21
22#include "datapath.h"
23#include "vport-internal_dev.h"
24#include "vport-netdev.h"
25
26static int dp_device_event(struct notifier_block *unused, unsigned long event,
27 void *ptr)
28{
29 struct net_device *dev = ptr;
30 struct vport *vport;
31
32 if (ovs_is_internal_dev(dev))
33 vport = ovs_internal_dev_get_vport(dev);
34 else
35 vport = ovs_netdev_get_vport(dev);
36
37 if (!vport)
38 return NOTIFY_DONE;
39
40 switch (event) {
41 case NETDEV_UNREGISTER:
42 if (!ovs_is_internal_dev(dev)) {
43 struct sk_buff *notify;
44
45 notify = ovs_vport_cmd_build_info(vport, 0, 0,
46 OVS_VPORT_CMD_DEL);
47 ovs_dp_detach_port(vport);
48 if (IS_ERR(notify)) {
49 netlink_set_err(init_net.genl_sock, 0,
50 ovs_dp_vport_multicast_group.id,
51 PTR_ERR(notify));
52 break;
53 }
54
55 genlmsg_multicast(notify, 0, ovs_dp_vport_multicast_group.id,
56 GFP_KERNEL);
57 }
58 break;
59 }
60
61 return NOTIFY_DONE;
62}
63
64struct notifier_block ovs_dp_device_notifier = {
65 .notifier_call = dp_device_event
66};
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
new file mode 100644
index 000000000000..1252c3081ef1
--- /dev/null
+++ b/net/openvswitch/flow.c
@@ -0,0 +1,1345 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include "flow.h"
20#include "datapath.h"
21#include <linux/uaccess.h>
22#include <linux/netdevice.h>
23#include <linux/etherdevice.h>
24#include <linux/if_ether.h>
25#include <linux/if_vlan.h>
26#include <net/llc_pdu.h>
27#include <linux/kernel.h>
28#include <linux/jhash.h>
29#include <linux/jiffies.h>
30#include <linux/llc.h>
31#include <linux/module.h>
32#include <linux/in.h>
33#include <linux/rcupdate.h>
34#include <linux/if_arp.h>
35#include <linux/ip.h>
36#include <linux/ipv6.h>
37#include <linux/tcp.h>
38#include <linux/udp.h>
39#include <linux/icmp.h>
40#include <linux/icmpv6.h>
41#include <linux/rculist.h>
42#include <net/ip.h>
43#include <net/ipv6.h>
44#include <net/ndisc.h>
45
46static struct kmem_cache *flow_cache;
47
48static int check_header(struct sk_buff *skb, int len)
49{
50 if (unlikely(skb->len < len))
51 return -EINVAL;
52 if (unlikely(!pskb_may_pull(skb, len)))
53 return -ENOMEM;
54 return 0;
55}
56
57static bool arphdr_ok(struct sk_buff *skb)
58{
59 return pskb_may_pull(skb, skb_network_offset(skb) +
60 sizeof(struct arp_eth_header));
61}
62
63static int check_iphdr(struct sk_buff *skb)
64{
65 unsigned int nh_ofs = skb_network_offset(skb);
66 unsigned int ip_len;
67 int err;
68
69 err = check_header(skb, nh_ofs + sizeof(struct iphdr));
70 if (unlikely(err))
71 return err;
72
73 ip_len = ip_hdrlen(skb);
74 if (unlikely(ip_len < sizeof(struct iphdr) ||
75 skb->len < nh_ofs + ip_len))
76 return -EINVAL;
77
78 skb_set_transport_header(skb, nh_ofs + ip_len);
79 return 0;
80}
81
82static bool tcphdr_ok(struct sk_buff *skb)
83{
84 int th_ofs = skb_transport_offset(skb);
85 int tcp_len;
86
87 if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
88 return false;
89
90 tcp_len = tcp_hdrlen(skb);
91 if (unlikely(tcp_len < sizeof(struct tcphdr) ||
92 skb->len < th_ofs + tcp_len))
93 return false;
94
95 return true;
96}
97
98static bool udphdr_ok(struct sk_buff *skb)
99{
100 return pskb_may_pull(skb, skb_transport_offset(skb) +
101 sizeof(struct udphdr));
102}
103
104static bool icmphdr_ok(struct sk_buff *skb)
105{
106 return pskb_may_pull(skb, skb_transport_offset(skb) +
107 sizeof(struct icmphdr));
108}
109
110u64 ovs_flow_used_time(unsigned long flow_jiffies)
111{
112 struct timespec cur_ts;
113 u64 cur_ms, idle_ms;
114
115 ktime_get_ts(&cur_ts);
116 idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
117 cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
118 cur_ts.tv_nsec / NSEC_PER_MSEC;
119
120 return cur_ms - idle_ms;
121}
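A worked example of the arithmetic above, assuming HZ=1000: if the flow was last touched 5000 jiffies ago, idle_ms is 5000; with the monotonic clock reading 1234567 ms, the function returns 1234567 - 5000 = 1229567 ms. The result is therefore a monotonic-clock timestamp of last use rather than wall-clock time, and it is what the flow dump code exposes to userspace as OVS_FLOW_ATTR_USED.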
122
123#define SW_FLOW_KEY_OFFSET(field) \
124 (offsetof(struct sw_flow_key, field) + \
125 FIELD_SIZEOF(struct sw_flow_key, field))
126
127static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
128 int *key_lenp)
129{
130 unsigned int nh_ofs = skb_network_offset(skb);
131 unsigned int nh_len;
132 int payload_ofs;
133 struct ipv6hdr *nh;
134 uint8_t nexthdr;
135 __be16 frag_off;
136 int err;
137
138 *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
139
140 err = check_header(skb, nh_ofs + sizeof(*nh));
141 if (unlikely(err))
142 return err;
143
144 nh = ipv6_hdr(skb);
145 nexthdr = nh->nexthdr;
146 payload_ofs = (u8 *)(nh + 1) - skb->data;
147
148 key->ip.proto = NEXTHDR_NONE;
149 key->ip.tos = ipv6_get_dsfield(nh);
150 key->ip.ttl = nh->hop_limit;
151 key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
152 key->ipv6.addr.src = nh->saddr;
153 key->ipv6.addr.dst = nh->daddr;
154
155 payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
156 if (unlikely(payload_ofs < 0))
157 return -EINVAL;
158
159 if (frag_off) {
160 if (frag_off & htons(~0x7))
161 key->ip.frag = OVS_FRAG_TYPE_LATER;
162 else
163 key->ip.frag = OVS_FRAG_TYPE_FIRST;
164 }
165
166 nh_len = payload_ofs - nh_ofs;
167 skb_set_transport_header(skb, nh_ofs + nh_len);
168 key->ip.proto = nexthdr;
169 return nh_len;
170}
171
172static bool icmp6hdr_ok(struct sk_buff *skb)
173{
174 return pskb_may_pull(skb, skb_transport_offset(skb) +
175 sizeof(struct icmp6hdr));
176}
177
178#define TCP_FLAGS_OFFSET 13
179#define TCP_FLAG_MASK 0x3f
180
181void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
182{
183 u8 tcp_flags = 0;
184
185 if (flow->key.eth.type == htons(ETH_P_IP) &&
186 flow->key.ip.proto == IPPROTO_TCP) {
187 u8 *tcp = (u8 *)tcp_hdr(skb);
188 tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
189 }
190
191 spin_lock(&flow->lock);
192 flow->used = jiffies;
193 flow->packet_count++;
194 flow->byte_count += skb->len;
195 flow->tcp_flags |= tcp_flags;
196 spin_unlock(&flow->lock);
197}
198
199struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
200{
201 int actions_len = nla_len(actions);
202 struct sw_flow_actions *sfa;
203
204 /* At least DP_MAX_PORTS actions are required to be able to flood a
205 * packet to every port. Factor of 2 allows for setting VLAN tags,
206 * etc. */
207 if (actions_len > 2 * DP_MAX_PORTS * nla_total_size(4))
208 return ERR_PTR(-EINVAL);
209
210 sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
211 if (!sfa)
212 return ERR_PTR(-ENOMEM);
213
214 sfa->actions_len = actions_len;
215 memcpy(sfa->actions, nla_data(actions), actions_len);
216 return sfa;
217}
218
219struct sw_flow *ovs_flow_alloc(void)
220{
221 struct sw_flow *flow;
222
223 flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
224 if (!flow)
225 return ERR_PTR(-ENOMEM);
226
227 spin_lock_init(&flow->lock);
228 flow->sf_acts = NULL;
229
230 return flow;
231}
232
233static struct hlist_head *find_bucket(struct flow_table *table, u32 hash)
234{
235 hash = jhash_1word(hash, table->hash_seed);
236 return flex_array_get(table->buckets,
237 (hash & (table->n_buckets - 1)));
238}
239
240static struct flex_array *alloc_buckets(unsigned int n_buckets)
241{
242 struct flex_array *buckets;
243 int i, err;
244
245 buckets = flex_array_alloc(sizeof(struct hlist_head *),
246 n_buckets, GFP_KERNEL);
247 if (!buckets)
248 return NULL;
249
250 err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
251 if (err) {
252 flex_array_free(buckets);
253 return NULL;
254 }
255
256 for (i = 0; i < n_buckets; i++)
257 INIT_HLIST_HEAD((struct hlist_head *)
258 flex_array_get(buckets, i));
259
260 return buckets;
261}
262
263static void free_buckets(struct flex_array *buckets)
264{
265 flex_array_free(buckets);
266}
267
268struct flow_table *ovs_flow_tbl_alloc(int new_size)
269{
270 struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
271
272 if (!table)
273 return NULL;
274
275 table->buckets = alloc_buckets(new_size);
276
277 if (!table->buckets) {
278 kfree(table);
279 return NULL;
280 }
281 table->n_buckets = new_size;
282 table->count = 0;
283 table->node_ver = 0;
284 table->keep_flows = false;
285 get_random_bytes(&table->hash_seed, sizeof(u32));
286
287 return table;
288}
289
290void ovs_flow_tbl_destroy(struct flow_table *table)
291{
292 int i;
293
294 if (!table)
295 return;
296
297 if (table->keep_flows)
298 goto skip_flows;
299
300 for (i = 0; i < table->n_buckets; i++) {
301 struct sw_flow *flow;
302 struct hlist_head *head = flex_array_get(table->buckets, i);
303 struct hlist_node *node, *n;
304 int ver = table->node_ver;
305
306 hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) {
307 hlist_del_rcu(&flow->hash_node[ver]);
308 ovs_flow_free(flow);
309 }
310 }
311
312skip_flows:
313 free_buckets(table->buckets);
314 kfree(table);
315}
316
317static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
318{
319 struct flow_table *table = container_of(rcu, struct flow_table, rcu);
320
321 ovs_flow_tbl_destroy(table);
322}
323
324void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
325{
326 if (!table)
327 return;
328
329 call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
330}
331
332struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
333{
334 struct sw_flow *flow;
335 struct hlist_head *head;
336 struct hlist_node *n;
337 int ver;
338 int i;
339
340 ver = table->node_ver;
341 while (*bucket < table->n_buckets) {
342 i = 0;
343 head = flex_array_get(table->buckets, *bucket);
344 hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) {
345 if (i < *last) {
346 i++;
347 continue;
348 }
349 *last = i + 1;
350 return flow;
351 }
352 (*bucket)++;
353 *last = 0;
354 }
355
356 return NULL;
357}
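The (*bucket, *last) pair forms a resumable cursor over the table, which is what lets a Netlink dump continue from where the previous reply buffer filled up. A minimal walk, assuming the caller holds rcu_read_lock() (the function is illustrative; the flow dump in datapath.c stashes the cursor in cb->args[]):

static void walk_flows(struct flow_table *table)
{
	u32 bucket = 0, obj = 0;
	struct sw_flow *flow;

	/* Each call advances the cursor by one flow; the cursor can be
	 * saved and passed back later to resume the walk. */
	while ((flow = ovs_flow_tbl_next(table, &bucket, &obj)) != NULL) {
		/* e.g. serialize 'flow' into a dump message */;
	}
}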
358
359static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
360{
361 int old_ver;
362 int i;
363
364 old_ver = old->node_ver;
365 new->node_ver = !old_ver;
366
367 /* Insert in new table. */
368 for (i = 0; i < old->n_buckets; i++) {
369 struct sw_flow *flow;
370 struct hlist_head *head;
371 struct hlist_node *n;
372
373 head = flex_array_get(old->buckets, i);
374
375 hlist_for_each_entry(flow, n, head, hash_node[old_ver])
376 ovs_flow_tbl_insert(new, flow);
377 }
378 old->keep_flows = true;
379}
380
381static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets)
382{
383 struct flow_table *new_table;
384
385 new_table = ovs_flow_tbl_alloc(n_buckets);
386 if (!new_table)
387 return ERR_PTR(-ENOMEM);
388
389 flow_table_copy_flows(table, new_table);
390
391 return new_table;
392}
393
394struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table)
395{
396 return __flow_tbl_rehash(table, table->n_buckets);
397}
398
399struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
400{
401 return __flow_tbl_rehash(table, table->n_buckets * 2);
402}
403
404void ovs_flow_free(struct sw_flow *flow)
405{
406 if (unlikely(!flow))
407 return;
408
409 kfree((struct sw_flow_actions __force *)flow->sf_acts);
410 kmem_cache_free(flow_cache, flow);
411}
412
413/* RCU callback used by ovs_flow_deferred_free. */
414static void rcu_free_flow_callback(struct rcu_head *rcu)
415{
416 struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
417
418 ovs_flow_free(flow);
419}
420
421/* Schedules 'flow' to be freed after the next RCU grace period.
422 * The caller must hold rcu_read_lock for this to be sensible. */
423void ovs_flow_deferred_free(struct sw_flow *flow)
424{
425 call_rcu(&flow->rcu, rcu_free_flow_callback);
426}
427
428/* RCU callback used by ovs_flow_deferred_free_acts. */
429static void rcu_free_acts_callback(struct rcu_head *rcu)
430{
431 struct sw_flow_actions *sf_acts = container_of(rcu,
432 struct sw_flow_actions, rcu);
433 kfree(sf_acts);
434}
435
436/* Schedules 'sf_acts' to be freed after the next RCU grace period.
437 * The caller must hold rcu_read_lock for this to be sensible. */
438void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
439{
440 call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
441}
442
443static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
444{
445 struct qtag_prefix {
446 __be16 eth_type; /* ETH_P_8021Q */
447 __be16 tci;
448 };
449 struct qtag_prefix *qp;
450
451 if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
452 return 0;
453
454 if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
455 sizeof(__be16))))
456 return -ENOMEM;
457
458 qp = (struct qtag_prefix *) skb->data;
459 key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
460 __skb_pull(skb, sizeof(struct qtag_prefix));
461
462 return 0;
463}
464
465static __be16 parse_ethertype(struct sk_buff *skb)
466{
467 struct llc_snap_hdr {
468 u8 dsap; /* Always 0xAA */
469 u8 ssap; /* Always 0xAA */
470 u8 ctrl;
471 u8 oui[3];
472 __be16 ethertype;
473 };
474 struct llc_snap_hdr *llc;
475 __be16 proto;
476
477 proto = *(__be16 *) skb->data;
478 __skb_pull(skb, sizeof(__be16));
479
480 if (ntohs(proto) >= 1536)
481 return proto;
482
483 if (skb->len < sizeof(struct llc_snap_hdr))
484 return htons(ETH_P_802_2);
485
486 if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
487 return htons(0);
488
489 llc = (struct llc_snap_hdr *) skb->data;
490 if (llc->dsap != LLC_SAP_SNAP ||
491 llc->ssap != LLC_SAP_SNAP ||
492 (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
493 return htons(ETH_P_802_2);
494
495 __skb_pull(skb, sizeof(struct llc_snap_hdr));
496 return llc->ethertype;
497}
498
499static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
500 int *key_lenp, int nh_len)
501{
502 struct icmp6hdr *icmp = icmp6_hdr(skb);
503 int error = 0;
504 int key_len;
505
506 /* The ICMPv6 type and code fields use the 16-bit transport port
507 * fields, so we need to store them in 16-bit network byte order.
508 */
509 key->ipv6.tp.src = htons(icmp->icmp6_type);
510 key->ipv6.tp.dst = htons(icmp->icmp6_code);
511 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
512
513 if (icmp->icmp6_code == 0 &&
514 (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
515 icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
516 int icmp_len = skb->len - skb_transport_offset(skb);
517 struct nd_msg *nd;
518 int offset;
519
520 key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
521
522 /* In order to process neighbor discovery options, we need the
523 * entire packet.
524 */
525 if (unlikely(icmp_len < sizeof(*nd)))
526 goto out;
527 if (unlikely(skb_linearize(skb))) {
528 error = -ENOMEM;
529 goto out;
530 }
531
532 nd = (struct nd_msg *)skb_transport_header(skb);
533 key->ipv6.nd.target = nd->target;
534 key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
535
536 icmp_len -= sizeof(*nd);
537 offset = 0;
538 while (icmp_len >= 8) {
539 struct nd_opt_hdr *nd_opt =
540 (struct nd_opt_hdr *)(nd->opt + offset);
541 int opt_len = nd_opt->nd_opt_len * 8;
542
543 if (unlikely(!opt_len || opt_len > icmp_len))
544 goto invalid;
545
546 /* Store the link layer address if the appropriate
547 * option is provided. It is considered an error if
548 * the same link layer option is specified twice.
549 */
550 if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
551 && opt_len == 8) {
552 if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
553 goto invalid;
554 memcpy(key->ipv6.nd.sll,
555 &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
556 } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
557 && opt_len == 8) {
558 if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
559 goto invalid;
560 memcpy(key->ipv6.nd.tll,
561 &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
562 }
563
564 icmp_len -= opt_len;
565 offset += opt_len;
566 }
567 }
568
569 goto out;
570
571invalid:
572 memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
573 memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
574 memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
575
576out:
577 *key_lenp = key_len;
578 return error;
579}
580
581/**
582 * ovs_flow_extract - extracts a flow key from an Ethernet frame.
583 * @skb: sk_buff that contains the frame, with skb->data pointing to the
584 * Ethernet header
585 * @in_port: port number on which @skb was received.
586 * @key: output flow key
587 * @key_lenp: length of output flow key
588 *
589 * The caller must ensure that skb->len >= ETH_HLEN.
590 *
591 * Returns 0 if successful, otherwise a negative errno value.
592 *
593 * Initializes @skb header pointers as follows:
594 *
595 * - skb->mac_header: the Ethernet header.
596 *
597 * - skb->network_header: just past the Ethernet header, or just past the
598 * VLAN header, to the first byte of the Ethernet payload.
599 *
600 * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
601 * on output, then just past the IP header, if one is present and
602 * of a correct length, otherwise the same as skb->network_header.
603 * For other key->eth.type values it is left untouched.
604 */
605int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
606 int *key_lenp)
607{
608 int error = 0;
609 int key_len = SW_FLOW_KEY_OFFSET(eth);
610 struct ethhdr *eth;
611
612 memset(key, 0, sizeof(*key));
613
614 key->phy.priority = skb->priority;
615 key->phy.in_port = in_port;
616
617 skb_reset_mac_header(skb);
618
619 /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
620 * header in the linear data area.
621 */
622 eth = eth_hdr(skb);
623 memcpy(key->eth.src, eth->h_source, ETH_ALEN);
624 memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
625
626 __skb_pull(skb, 2 * ETH_ALEN);
627
628 if (vlan_tx_tag_present(skb))
629 key->eth.tci = htons(skb->vlan_tci);
630 else if (eth->h_proto == htons(ETH_P_8021Q))
631 if (unlikely(parse_vlan(skb, key)))
632 return -ENOMEM;
633
634 key->eth.type = parse_ethertype(skb);
635 if (unlikely(key->eth.type == htons(0)))
636 return -ENOMEM;
637
638 skb_reset_network_header(skb);
639 __skb_push(skb, skb->data - skb_mac_header(skb));
640
641 /* Network layer. */
642 if (key->eth.type == htons(ETH_P_IP)) {
643 struct iphdr *nh;
644 __be16 offset;
645
646 key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
647
648 error = check_iphdr(skb);
649 if (unlikely(error)) {
650 if (error == -EINVAL) {
651 skb->transport_header = skb->network_header;
652 error = 0;
653 }
654 goto out;
655 }
656
657 nh = ip_hdr(skb);
658 key->ipv4.addr.src = nh->saddr;
659 key->ipv4.addr.dst = nh->daddr;
660
661 key->ip.proto = nh->protocol;
662 key->ip.tos = nh->tos;
663 key->ip.ttl = nh->ttl;
664
665 offset = nh->frag_off & htons(IP_OFFSET);
666 if (offset) {
667 key->ip.frag = OVS_FRAG_TYPE_LATER;
668 goto out;
669 }
670 if (nh->frag_off & htons(IP_MF) ||
671 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
672 key->ip.frag = OVS_FRAG_TYPE_FIRST;
673
674 /* Transport layer. */
675 if (key->ip.proto == IPPROTO_TCP) {
676 key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
677 if (tcphdr_ok(skb)) {
678 struct tcphdr *tcp = tcp_hdr(skb);
679 key->ipv4.tp.src = tcp->source;
680 key->ipv4.tp.dst = tcp->dest;
681 }
682 } else if (key->ip.proto == IPPROTO_UDP) {
683 key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
684 if (udphdr_ok(skb)) {
685 struct udphdr *udp = udp_hdr(skb);
686 key->ipv4.tp.src = udp->source;
687 key->ipv4.tp.dst = udp->dest;
688 }
689 } else if (key->ip.proto == IPPROTO_ICMP) {
690 key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
691 if (icmphdr_ok(skb)) {
692 struct icmphdr *icmp = icmp_hdr(skb);
693 /* The ICMP type and code fields use the 16-bit
694 * transport port fields, so we need to store
695 * them in 16-bit network byte order. */
696 key->ipv4.tp.src = htons(icmp->type);
697 key->ipv4.tp.dst = htons(icmp->code);
698 }
699 }
700
701 } else if (key->eth.type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
702 struct arp_eth_header *arp;
703
704 arp = (struct arp_eth_header *)skb_network_header(skb);
705
706 if (arp->ar_hrd == htons(ARPHRD_ETHER)
707 && arp->ar_pro == htons(ETH_P_IP)
708 && arp->ar_hln == ETH_ALEN
709 && arp->ar_pln == 4) {
710
711 /* We only match on the lower 8 bits of the opcode. */
712 if (ntohs(arp->ar_op) <= 0xff)
713 key->ip.proto = ntohs(arp->ar_op);
714
715 if (key->ip.proto == ARPOP_REQUEST
716 || key->ip.proto == ARPOP_REPLY) {
717 memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
718 memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
719 memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
720 memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
721 key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
722 }
723 }
724 } else if (key->eth.type == htons(ETH_P_IPV6)) {
725 int nh_len; /* IPv6 Header + Extensions */
726
727 nh_len = parse_ipv6hdr(skb, key, &key_len);
728 if (unlikely(nh_len < 0)) {
729 if (nh_len == -EINVAL)
730 skb->transport_header = skb->network_header;
731 else
732 error = nh_len;
733 goto out;
734 }
735
736 if (key->ip.frag == OVS_FRAG_TYPE_LATER)
737 goto out;
738 if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
739 key->ip.frag = OVS_FRAG_TYPE_FIRST;
740
741 /* Transport layer. */
742 if (key->ip.proto == NEXTHDR_TCP) {
743 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
744 if (tcphdr_ok(skb)) {
745 struct tcphdr *tcp = tcp_hdr(skb);
746 key->ipv6.tp.src = tcp->source;
747 key->ipv6.tp.dst = tcp->dest;
748 }
749 } else if (key->ip.proto == NEXTHDR_UDP) {
750 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
751 if (udphdr_ok(skb)) {
752 struct udphdr *udp = udp_hdr(skb);
753 key->ipv6.tp.src = udp->source;
754 key->ipv6.tp.dst = udp->dest;
755 }
756 } else if (key->ip.proto == NEXTHDR_ICMP) {
757 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
758 if (icmp6hdr_ok(skb)) {
759 error = parse_icmpv6(skb, key, &key_len, nh_len);
760 if (error < 0)
761 goto out;
762 }
763 }
764 }
765
766out:
767 *key_lenp = key_len;
768 return error;
769}
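In the fast path, the pairing of extraction and lookup looks roughly like the sketch below, modeled on ovs_dp_process_received_packet() in datapath.c (the helper name is illustrative; the caller holds rcu_read_lock()):

static struct sw_flow *extract_and_lookup(struct datapath *dp,
					  struct sk_buff *skb, u16 in_port,
					  struct sw_flow_key *key)
{
	int key_len;

	/* Fill 'key' from the frame and set the skb header pointers. */
	if (ovs_flow_extract(skb, in_port, key, &key_len))
		return NULL;

	/* Hash the first 'key_len' bytes of the key into the table. */
	return ovs_flow_tbl_lookup(rcu_dereference(dp->table), key, key_len);
}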
770
771u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len)
772{
773 return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0);
774}
775
776struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
777 struct sw_flow_key *key, int key_len)
778{
779 struct sw_flow *flow;
780 struct hlist_node *n;
781 struct hlist_head *head;
782 u32 hash;
783
784 hash = ovs_flow_hash(key, key_len);
785
786 head = find_bucket(table, hash);
787 hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) {
788
789 if (flow->hash == hash &&
790 !memcmp(&flow->key, key, key_len)) {
791 return flow;
792 }
793 }
794 return NULL;
795}
796
797void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
798{
799 struct hlist_head *head;
800
801 head = find_bucket(table, flow->hash);
802 hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
803 table->count++;
804}
805
806void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
807{
808 hlist_del_rcu(&flow->hash_node[table->node_ver]);
809 table->count--;
810 BUG_ON(table->count < 0);
811}
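Insertion assumes flow->hash has already been computed over the same key_len that lookups will use. A sketch of installing a new flow, modeled on the OVS_FLOW_CMD_NEW handling in datapath.c (the helper name is hypothetical):

static void install_flow(struct flow_table *table, struct sw_flow *flow,
			 const struct sw_flow_key *key, int key_len)
{
	flow->key = *key;
	/* Cache the hash so lookups and bucket selection agree. */
	flow->hash = ovs_flow_hash(key, key_len);
	ovs_flow_tbl_insert(table, flow);
}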
812
813/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
814const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
815 [OVS_KEY_ATTR_ENCAP] = -1,
816 [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
817 [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
818 [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
819 [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
820 [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
821 [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
822 [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
823 [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
824 [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
825 [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
826 [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
827 [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
828 [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
829};
830
831static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
832 const struct nlattr *a[], u32 *attrs)
833{
834 const struct ovs_key_icmp *icmp_key;
835 const struct ovs_key_tcp *tcp_key;
836 const struct ovs_key_udp *udp_key;
837
838 switch (swkey->ip.proto) {
839 case IPPROTO_TCP:
840 if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
841 return -EINVAL;
842 *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
843
844 *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
845 tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
846 swkey->ipv4.tp.src = tcp_key->tcp_src;
847 swkey->ipv4.tp.dst = tcp_key->tcp_dst;
848 break;
849
850 case IPPROTO_UDP:
851 if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
852 return -EINVAL;
853 *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
854
855 *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
856 udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
857 swkey->ipv4.tp.src = udp_key->udp_src;
858 swkey->ipv4.tp.dst = udp_key->udp_dst;
859 break;
860
861 case IPPROTO_ICMP:
862 if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP)))
863 return -EINVAL;
864 *attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
865
866 *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
867 icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
868 swkey->ipv4.tp.src = htons(icmp_key->icmp_type);
869 swkey->ipv4.tp.dst = htons(icmp_key->icmp_code);
870 break;
871 }
872
873 return 0;
874}
875
876static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
877 const struct nlattr *a[], u32 *attrs)
878{
879 const struct ovs_key_icmpv6 *icmpv6_key;
880 const struct ovs_key_tcp *tcp_key;
881 const struct ovs_key_udp *udp_key;
882
883 switch (swkey->ip.proto) {
884 case IPPROTO_TCP:
885 if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
886 return -EINVAL;
887 *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
888
889 *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
890 tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
891 swkey->ipv6.tp.src = tcp_key->tcp_src;
892 swkey->ipv6.tp.dst = tcp_key->tcp_dst;
893 break;
894
895 case IPPROTO_UDP:
896 if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
897 return -EINVAL;
898 *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
899
900 *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
901 udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
902 swkey->ipv6.tp.src = udp_key->udp_src;
903 swkey->ipv6.tp.dst = udp_key->udp_dst;
904 break;
905
906 case IPPROTO_ICMPV6:
907 if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6)))
908 return -EINVAL;
909 *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
910
911 *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
912 icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
913 swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type);
914 swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code);
915
916 if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
917 swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
918 const struct ovs_key_nd *nd_key;
919
920 if (!(*attrs & (1 << OVS_KEY_ATTR_ND)))
921 return -EINVAL;
922 *attrs &= ~(1 << OVS_KEY_ATTR_ND);
923
924 *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
925 nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
926 memcpy(&swkey->ipv6.nd.target, nd_key->nd_target,
927 sizeof(swkey->ipv6.nd.target));
928 memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN);
929 memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN);
930 }
931 break;
932 }
933
934 return 0;
935}
936
937static int parse_flow_nlattrs(const struct nlattr *attr,
938 const struct nlattr *a[], u32 *attrsp)
939{
940 const struct nlattr *nla;
941 u32 attrs;
942 int rem;
943
944 attrs = 0;
945 nla_for_each_nested(nla, attr, rem) {
946 u16 type = nla_type(nla);
947 int expected_len;
948
949 if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type))
950 return -EINVAL;
951
952 expected_len = ovs_key_lens[type];
953 if (nla_len(nla) != expected_len && expected_len != -1)
954 return -EINVAL;
955
956 attrs |= 1 << type;
957 a[type] = nla;
958 }
959 if (rem)
960 return -EINVAL;
961
962 *attrsp = attrs;
963 return 0;
964}
965
966/**
967 * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
968 * @swkey: receives the extracted flow key.
969 * @key_lenp: number of bytes used in @swkey.
970 * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
971 * sequence.
972 */
973int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
974 const struct nlattr *attr)
975{
976 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
977 const struct ovs_key_ethernet *eth_key;
978 int key_len;
979 u32 attrs;
980 int err;
981
982 memset(swkey, 0, sizeof(struct sw_flow_key));
983 key_len = SW_FLOW_KEY_OFFSET(eth);
984
985 err = parse_flow_nlattrs(attr, a, &attrs);
986 if (err)
987 return err;
988
989 /* Metadata attributes. */
990 if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
991 swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]);
992 attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
993 }
994 if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
995 u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
996 if (in_port >= DP_MAX_PORTS)
997 return -EINVAL;
998 swkey->phy.in_port = in_port;
999 attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
1000 } else {
1001 swkey->phy.in_port = USHRT_MAX;
1002 }
1003
1004 /* Data attributes. */
1005 if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
1006 return -EINVAL;
1007 attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
1008
1009 eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
1010 memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN);
1011 memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN);
1012
1013 if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) &&
1014 nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) {
1015 const struct nlattr *encap;
1016 __be16 tci;
1017
1018 if (attrs != ((1 << OVS_KEY_ATTR_VLAN) |
1019 (1 << OVS_KEY_ATTR_ETHERTYPE) |
1020 (1 << OVS_KEY_ATTR_ENCAP)))
1021 return -EINVAL;
1022
1023 encap = a[OVS_KEY_ATTR_ENCAP];
1024 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
1025 if (tci & htons(VLAN_TAG_PRESENT)) {
1026 swkey->eth.tci = tci;
1027
1028 err = parse_flow_nlattrs(encap, a, &attrs);
1029 if (err)
1030 return err;
1031 } else if (!tci) {
1032 /* Corner case for truncated 802.1Q header. */
1033 if (nla_len(encap))
1034 return -EINVAL;
1035
1036 swkey->eth.type = htons(ETH_P_8021Q);
1037 *key_lenp = key_len;
1038 return 0;
1039 } else {
1040 return -EINVAL;
1041 }
1042 }
1043
1044 if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
1045 swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
1046 if (ntohs(swkey->eth.type) < 1536)
1047 return -EINVAL;
1048 attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
1049 } else {
1050 swkey->eth.type = htons(ETH_P_802_2);
1051 }
1052
1053 if (swkey->eth.type == htons(ETH_P_IP)) {
1054 const struct ovs_key_ipv4 *ipv4_key;
1055
1056 if (!(attrs & (1 << OVS_KEY_ATTR_IPV4)))
1057 return -EINVAL;
1058 attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
1059
1060 key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
1061 ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
1062 if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX)
1063 return -EINVAL;
1064 swkey->ip.proto = ipv4_key->ipv4_proto;
1065 swkey->ip.tos = ipv4_key->ipv4_tos;
1066 swkey->ip.ttl = ipv4_key->ipv4_ttl;
1067 swkey->ip.frag = ipv4_key->ipv4_frag;
1068 swkey->ipv4.addr.src = ipv4_key->ipv4_src;
1069 swkey->ipv4.addr.dst = ipv4_key->ipv4_dst;
1070
1071 if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
1072 err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs);
1073 if (err)
1074 return err;
1075 }
1076 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1077 const struct ovs_key_ipv6 *ipv6_key;
1078
1079 if (!(attrs & (1 << OVS_KEY_ATTR_IPV6)))
1080 return -EINVAL;
1081 attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
1082
1083 key_len = SW_FLOW_KEY_OFFSET(ipv6.label);
1084 ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
1085 if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX)
1086 return -EINVAL;
1087 swkey->ipv6.label = ipv6_key->ipv6_label;
1088 swkey->ip.proto = ipv6_key->ipv6_proto;
1089 swkey->ip.tos = ipv6_key->ipv6_tclass;
1090 swkey->ip.ttl = ipv6_key->ipv6_hlimit;
1091 swkey->ip.frag = ipv6_key->ipv6_frag;
1092 memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src,
1093 sizeof(swkey->ipv6.addr.src));
1094 memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst,
1095 sizeof(swkey->ipv6.addr.dst));
1096
1097 if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
1098 err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs);
1099 if (err)
1100 return err;
1101 }
1102 } else if (swkey->eth.type == htons(ETH_P_ARP)) {
1103 const struct ovs_key_arp *arp_key;
1104
1105 if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
1106 return -EINVAL;
1107 attrs &= ~(1 << OVS_KEY_ATTR_ARP);
1108
1109 key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
1110 arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
1111 swkey->ipv4.addr.src = arp_key->arp_sip;
1112 swkey->ipv4.addr.dst = arp_key->arp_tip;
1113 if (arp_key->arp_op & htons(0xff00))
1114 return -EINVAL;
1115 swkey->ip.proto = ntohs(arp_key->arp_op);
1116 memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
1117 memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
1118 }
1119
1120 if (attrs)
1121 return -EINVAL;
1122 *key_lenp = key_len;
1123
1124 return 0;
1125}
1126
1127/**
1128 * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into flow metadata.
1129 * @priority: receives the extracted QoS priority.
1130 * @in_port: receives the extracted input port.
1131 * @attr: Netlink attribute holding a nested %OVS_KEY_ATTR_* attribute sequence.
1132 *
1133 * This parses a series of Netlink attributes that form a flow key, which must
1134 * take the same form accepted by ovs_flow_from_nlattrs(), but only enough of it to
1135 * get the metadata, that is, the parts of the flow key that cannot be
1136 * extracted from the packet itself.
1137 */
1138int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
1139 const struct nlattr *attr)
1140{
1141 const struct nlattr *nla;
1142 int rem;
1143
1144 *in_port = USHRT_MAX;
1145 *priority = 0;
1146
1147 nla_for_each_nested(nla, attr, rem) {
1148 int type = nla_type(nla);
1149
1150 if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
1151 if (nla_len(nla) != ovs_key_lens[type])
1152 return -EINVAL;
1153
1154 switch (type) {
1155 case OVS_KEY_ATTR_PRIORITY:
1156 *priority = nla_get_u32(nla);
1157 break;
1158
1159 case OVS_KEY_ATTR_IN_PORT:
1160 if (nla_get_u32(nla) >= DP_MAX_PORTS)
1161 return -EINVAL;
1162 *in_port = nla_get_u32(nla);
1163 break;
1164 }
1165 }
1166 }
1167 if (rem)
1168 return -EINVAL;
1169 return 0;
1170}
1171
1172int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
1173{
1174 struct ovs_key_ethernet *eth_key;
1175 struct nlattr *nla, *encap;
1176
1177 if (swkey->phy.priority)
1178 NLA_PUT_U32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority);
1179
1180 if (swkey->phy.in_port != USHRT_MAX)
1181 NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port);
1182
1183 nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
1184 if (!nla)
1185 goto nla_put_failure;
1186 eth_key = nla_data(nla);
1187 memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN);
1188 memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN);
1189
1190 if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
1191 NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q));
1192 NLA_PUT_BE16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci);
1193 encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
1194 if (!swkey->eth.tci)
1195 goto unencap;
1196 } else {
1197 encap = NULL;
1198 }
1199
1200 if (swkey->eth.type == htons(ETH_P_802_2))
1201 goto unencap;
1202
1203 NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type);
1204
1205 if (swkey->eth.type == htons(ETH_P_IP)) {
1206 struct ovs_key_ipv4 *ipv4_key;
1207
1208 nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
1209 if (!nla)
1210 goto nla_put_failure;
1211 ipv4_key = nla_data(nla);
1212 ipv4_key->ipv4_src = swkey->ipv4.addr.src;
1213 ipv4_key->ipv4_dst = swkey->ipv4.addr.dst;
1214 ipv4_key->ipv4_proto = swkey->ip.proto;
1215 ipv4_key->ipv4_tos = swkey->ip.tos;
1216 ipv4_key->ipv4_ttl = swkey->ip.ttl;
1217 ipv4_key->ipv4_frag = swkey->ip.frag;
1218 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1219 struct ovs_key_ipv6 *ipv6_key;
1220
1221 nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
1222 if (!nla)
1223 goto nla_put_failure;
1224 ipv6_key = nla_data(nla);
1225 memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src,
1226 sizeof(ipv6_key->ipv6_src));
1227 memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst,
1228 sizeof(ipv6_key->ipv6_dst));
1229 ipv6_key->ipv6_label = swkey->ipv6.label;
1230 ipv6_key->ipv6_proto = swkey->ip.proto;
1231 ipv6_key->ipv6_tclass = swkey->ip.tos;
1232 ipv6_key->ipv6_hlimit = swkey->ip.ttl;
1233 ipv6_key->ipv6_frag = swkey->ip.frag;
1234 } else if (swkey->eth.type == htons(ETH_P_ARP)) {
1235 struct ovs_key_arp *arp_key;
1236
1237 nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
1238 if (!nla)
1239 goto nla_put_failure;
1240 arp_key = nla_data(nla);
1241 memset(arp_key, 0, sizeof(struct ovs_key_arp));
1242 arp_key->arp_sip = swkey->ipv4.addr.src;
1243 arp_key->arp_tip = swkey->ipv4.addr.dst;
1244 arp_key->arp_op = htons(swkey->ip.proto);
1245 memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
1246 memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
1247 }
1248
1249 if ((swkey->eth.type == htons(ETH_P_IP) ||
1250 swkey->eth.type == htons(ETH_P_IPV6)) &&
1251 swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
1252
1253 if (swkey->ip.proto == IPPROTO_TCP) {
1254 struct ovs_key_tcp *tcp_key;
1255
1256 nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
1257 if (!nla)
1258 goto nla_put_failure;
1259 tcp_key = nla_data(nla);
1260 if (swkey->eth.type == htons(ETH_P_IP)) {
1261 tcp_key->tcp_src = swkey->ipv4.tp.src;
1262 tcp_key->tcp_dst = swkey->ipv4.tp.dst;
1263 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1264 tcp_key->tcp_src = swkey->ipv6.tp.src;
1265 tcp_key->tcp_dst = swkey->ipv6.tp.dst;
1266 }
1267 } else if (swkey->ip.proto == IPPROTO_UDP) {
1268 struct ovs_key_udp *udp_key;
1269
1270 nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
1271 if (!nla)
1272 goto nla_put_failure;
1273 udp_key = nla_data(nla);
1274 if (swkey->eth.type == htons(ETH_P_IP)) {
1275 udp_key->udp_src = swkey->ipv4.tp.src;
1276 udp_key->udp_dst = swkey->ipv4.tp.dst;
1277 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1278 udp_key->udp_src = swkey->ipv6.tp.src;
1279 udp_key->udp_dst = swkey->ipv6.tp.dst;
1280 }
1281 } else if (swkey->eth.type == htons(ETH_P_IP) &&
1282 swkey->ip.proto == IPPROTO_ICMP) {
1283 struct ovs_key_icmp *icmp_key;
1284
1285 nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
1286 if (!nla)
1287 goto nla_put_failure;
1288 icmp_key = nla_data(nla);
1289 icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src);
1290 icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst);
1291 } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
1292 swkey->ip.proto == IPPROTO_ICMPV6) {
1293 struct ovs_key_icmpv6 *icmpv6_key;
1294
1295 nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
1296 sizeof(*icmpv6_key));
1297 if (!nla)
1298 goto nla_put_failure;
1299 icmpv6_key = nla_data(nla);
1300 icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src);
1301 icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst);
1302
1303 if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
1304 icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
1305 struct ovs_key_nd *nd_key;
1306
1307 nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
1308 if (!nla)
1309 goto nla_put_failure;
1310 nd_key = nla_data(nla);
1311 memcpy(nd_key->nd_target, &swkey->ipv6.nd.target,
1312 sizeof(nd_key->nd_target));
1313 memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN);
1314 memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN);
1315 }
1316 }
1317 }
1318
1319unencap:
1320 if (encap)
1321 nla_nest_end(skb, encap);
1322
1323 return 0;
1324
1325nla_put_failure:
1326 return -EMSGSIZE;
1327}
1328
1329/* Initializes the flow module.
1330 * Returns zero if successful or a negative error code. */
1331int ovs_flow_init(void)
1332{
1333 flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
1334 0, NULL);
1335 if (flow_cache == NULL)
1336 return -ENOMEM;
1337
1338 return 0;
1339}
1340
1341/* Uninitializes the flow module. */
1342void ovs_flow_exit(void)
1343{
1344 kmem_cache_destroy(flow_cache);
1345}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
new file mode 100644
index 000000000000..2747dc2c4ac1
--- /dev/null
+++ b/net/openvswitch/flow.h
@@ -0,0 +1,199 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef FLOW_H
20#define FLOW_H 1
21
22#include <linux/kernel.h>
23#include <linux/netlink.h>
24#include <linux/openvswitch.h>
25#include <linux/spinlock.h>
26#include <linux/types.h>
27#include <linux/rcupdate.h>
28#include <linux/if_ether.h>
29#include <linux/in6.h>
30#include <linux/jiffies.h>
31#include <linux/time.h>
32#include <linux/flex_array.h>
33#include <net/inet_ecn.h>
34
35struct sk_buff;
36
37struct sw_flow_actions {
38 struct rcu_head rcu;
39 u32 actions_len;
40 struct nlattr actions[];
41};
42
43struct sw_flow_key {
44 struct {
45 u32 priority; /* Packet QoS priority. */
46 u16 in_port; /* Input switch port (or USHRT_MAX). */
47 } phy;
48 struct {
49 u8 src[ETH_ALEN]; /* Ethernet source address. */
50 u8 dst[ETH_ALEN]; /* Ethernet destination address. */
51 __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
52 __be16 type; /* Ethernet frame type. */
53 } eth;
54 struct {
55 u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
56 u8 tos; /* IP ToS. */
57 u8 ttl; /* IP TTL/hop limit. */
58 u8 frag; /* One of OVS_FRAG_TYPE_*. */
59 } ip;
60 union {
61 struct {
62 struct {
63 __be32 src; /* IP source address. */
64 __be32 dst; /* IP destination address. */
65 } addr;
66 union {
67 struct {
68 __be16 src; /* TCP/UDP source port. */
69 __be16 dst; /* TCP/UDP destination port. */
70 } tp;
71 struct {
72 u8 sha[ETH_ALEN]; /* ARP source hardware address. */
73 u8 tha[ETH_ALEN]; /* ARP target hardware address. */
74 } arp;
75 };
76 } ipv4;
77 struct {
78 struct {
79 struct in6_addr src; /* IPv6 source address. */
80 struct in6_addr dst; /* IPv6 destination address. */
81 } addr;
82 __be32 label; /* IPv6 flow label. */
83 struct {
84 __be16 src; /* TCP/UDP source port. */
85 __be16 dst; /* TCP/UDP destination port. */
86 } tp;
87 struct {
88 struct in6_addr target; /* ND target address. */
89 u8 sll[ETH_ALEN]; /* ND source link layer address. */
90 u8 tll[ETH_ALEN]; /* ND target link layer address. */
91 } nd;
92 } ipv6;
93 };
94};
95
96struct sw_flow {
97 struct rcu_head rcu;
98 struct hlist_node hash_node[2];
99 u32 hash;
100
101 struct sw_flow_key key;
102 struct sw_flow_actions __rcu *sf_acts;
103
104 spinlock_t lock; /* Lock for values below. */
105 unsigned long used; /* Last used time (in jiffies). */
106 u64 packet_count; /* Number of packets matched. */
107 u64 byte_count; /* Number of bytes matched. */
108 u8 tcp_flags; /* Union of seen TCP flags. */
109};
110
111struct arp_eth_header {
112 __be16 ar_hrd; /* format of hardware address */
113 __be16 ar_pro; /* format of protocol address */
114 unsigned char ar_hln; /* length of hardware address */
115 unsigned char ar_pln; /* length of protocol address */
116 __be16 ar_op; /* ARP opcode (command) */
117
118 /* Ethernet+IPv4 specific members. */
119 unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */
120 unsigned char ar_sip[4]; /* sender IP address */
121 unsigned char ar_tha[ETH_ALEN]; /* target hardware address */
122 unsigned char ar_tip[4]; /* target IP address */
123} __packed;
124
125int ovs_flow_init(void);
126void ovs_flow_exit(void);
127
128struct sw_flow *ovs_flow_alloc(void);
129void ovs_flow_deferred_free(struct sw_flow *);
130void ovs_flow_free(struct sw_flow *flow);
131
132struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *);
133void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
134
135int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
136 int *key_lenp);
137void ovs_flow_used(struct sw_flow *, struct sk_buff *);
138u64 ovs_flow_used_time(unsigned long flow_jiffies);
139
140/* Upper bound on the length of a nlattr-formatted flow key. The longest
141 * nlattr-formatted flow key would be:
142 *
143 * struct pad nl hdr total
144 * ------ --- ------ -----
145 * OVS_KEY_ATTR_PRIORITY 4 -- 4 8
146 * OVS_KEY_ATTR_IN_PORT 4 -- 4 8
147 * OVS_KEY_ATTR_ETHERNET 12 -- 4 16
148 * OVS_KEY_ATTR_8021Q 4 -- 4 8
149 * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8
150 * OVS_KEY_ATTR_IPV6 40 -- 4 44
151 * OVS_KEY_ATTR_ICMPV6 2 2 4 8
152 * OVS_KEY_ATTR_ND 28 -- 4 32
153 * -------------------------------------------------
154 * total 132
155 */
156#define FLOW_BUFSIZE 132
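
/* Hedged sketch (not part of the patch): FLOW_BUFSIZE bounds the Netlink
 * buffer a caller reserves before serializing a key with
 * ovs_flow_to_nlattrs(). datapath.c sizes reply skbs along these lines;
 * example_flow_msg_size() is illustrative only and needs net/netlink.h
 * for nla_total_size(). */
static inline size_t example_flow_msg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ovs_header))
		+ nla_total_size(FLOW_BUFSIZE)			 /* OVS_FLOW_ATTR_KEY */
		+ nla_total_size(sizeof(struct ovs_flow_stats)); /* OVS_FLOW_ATTR_STATS */
}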
157
158int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
159int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
160 const struct nlattr *);
161int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
162 const struct nlattr *);
163
164#define TBL_MIN_BUCKETS 1024
165
166struct flow_table {
167 struct flex_array *buckets;
168 unsigned int count, n_buckets;
169 struct rcu_head rcu;
170 int node_ver;
171 u32 hash_seed;
172 bool keep_flows;
173};
174
175static inline int ovs_flow_tbl_count(struct flow_table *table)
176{
177 return table->count;
178}
179
180static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
181{
182 return (table->count > table->n_buckets);
183}
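
/* Hedged sketch (not part of the patch): the intended growth pattern.
 * datapath.c expands the table once the flow count outgrows the buckets,
 * swaps in the new copy, and defers destruction of the old table past an
 * RCU grace period. example_maybe_expand() is illustrative only. */
static inline struct flow_table *example_maybe_expand(struct flow_table *table)
{
	struct flow_table *new_table;

	if (!ovs_flow_tbl_need_to_expand(table))
		return table;

	new_table = ovs_flow_tbl_expand(table);
	if (IS_ERR(new_table))
		return table;	/* keep the old table on allocation failure */

	ovs_flow_tbl_deferred_destroy(table);
	return new_table;
}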
184
185struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
186 struct sw_flow_key *key, int len);
187void ovs_flow_tbl_destroy(struct flow_table *table);
188void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
189struct flow_table *ovs_flow_tbl_alloc(int new_size);
190struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
191struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
192void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow);
193void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
194u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len);
195
196struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
197extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
198
199#endif /* flow.h */
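
The ipv4/ipv6 arms of the union in struct sw_flow_key are discriminated by
key->eth.type, so consumers check the EtherType before touching either arm.
A hedged illustration (example_key_src_addr() is not part of the patch):

	static __be32 example_key_src_addr(const struct sw_flow_key *key)
	{
		if (key->eth.type == htons(ETH_P_IP))
			return key->ipv4.addr.src;
		return 0;	/* v6 addresses live in key->ipv6.addr instead */
	}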
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
new file mode 100644
index 000000000000..322b8d206693
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.c
@@ -0,0 +1,240 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include <linux/hardirq.h>
20#include <linux/if_vlan.h>
21#include <linux/kernel.h>
22#include <linux/netdevice.h>
23#include <linux/etherdevice.h>
24#include <linux/ethtool.h>
25#include <linux/skbuff.h>
26
27#include "datapath.h"
28#include "vport-internal_dev.h"
29#include "vport-netdev.h"
30
31struct internal_dev {
32 struct vport *vport;
33};
34
35static struct internal_dev *internal_dev_priv(struct net_device *netdev)
36{
37 return netdev_priv(netdev);
38}
39
40/* This function is only called by the kernel network layer. */
41static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev,
42 struct rtnl_link_stats64 *stats)
43{
44 struct vport *vport = ovs_internal_dev_get_vport(netdev);
45 struct ovs_vport_stats vport_stats;
46
47 ovs_vport_get_stats(vport, &vport_stats);
48
49 /* The tx and rx stats need to be swapped because the
50 * switch and host OS have opposite perspectives. */
51 stats->rx_packets = vport_stats.tx_packets;
52 stats->tx_packets = vport_stats.rx_packets;
53 stats->rx_bytes = vport_stats.tx_bytes;
54 stats->tx_bytes = vport_stats.rx_bytes;
55 stats->rx_errors = vport_stats.tx_errors;
56 stats->tx_errors = vport_stats.rx_errors;
57 stats->rx_dropped = vport_stats.tx_dropped;
58 stats->tx_dropped = vport_stats.rx_dropped;
59
60 return stats;
61}
62
63static int internal_dev_mac_addr(struct net_device *dev, void *p)
64{
65 struct sockaddr *addr = p;
66
67 if (!is_valid_ether_addr(addr->sa_data))
68 return -EADDRNOTAVAIL;
69 memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
70 return 0;
71}
72
73/* Called with rcu_read_lock_bh. */
74static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
75{
76 rcu_read_lock();
77 ovs_vport_receive(internal_dev_priv(netdev)->vport, skb);
78 rcu_read_unlock();
79 return 0;
80}
81
82static int internal_dev_open(struct net_device *netdev)
83{
84 netif_start_queue(netdev);
85 return 0;
86}
87
88static int internal_dev_stop(struct net_device *netdev)
89{
90 netif_stop_queue(netdev);
91 return 0;
92}
93
94static void internal_dev_getinfo(struct net_device *netdev,
95 struct ethtool_drvinfo *info)
96{
97 strcpy(info->driver, "openvswitch");
98}
99
100static const struct ethtool_ops internal_dev_ethtool_ops = {
101 .get_drvinfo = internal_dev_getinfo,
102 .get_link = ethtool_op_get_link,
103};
104
105static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
106{
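	/* 68 is the minimum MTU required of an IPv4 link (RFC 791). */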
107 if (new_mtu < 68)
108 return -EINVAL;
109
110 netdev->mtu = new_mtu;
111 return 0;
112}
113
114static void internal_dev_destructor(struct net_device *dev)
115{
116 struct vport *vport = ovs_internal_dev_get_vport(dev);
117
118 ovs_vport_free(vport);
119 free_netdev(dev);
120}
121
122static const struct net_device_ops internal_dev_netdev_ops = {
123 .ndo_open = internal_dev_open,
124 .ndo_stop = internal_dev_stop,
125 .ndo_start_xmit = internal_dev_xmit,
126 .ndo_set_mac_address = internal_dev_mac_addr,
127 .ndo_change_mtu = internal_dev_change_mtu,
128 .ndo_get_stats64 = internal_dev_get_stats,
129};
130
131static void do_setup(struct net_device *netdev)
132{
133 ether_setup(netdev);
134
135 netdev->netdev_ops = &internal_dev_netdev_ops;
136
137 netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
138 netdev->destructor = internal_dev_destructor;
139 SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops);
140 netdev->tx_queue_len = 0;
141
142 netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
143 NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
144
145 netdev->vlan_features = netdev->features;
146 netdev->features |= NETIF_F_HW_VLAN_TX;
147 netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
148 random_ether_addr(netdev->dev_addr);
149}
150
151static struct vport *internal_dev_create(const struct vport_parms *parms)
152{
153 struct vport *vport;
154 struct netdev_vport *netdev_vport;
155 struct internal_dev *internal_dev;
156 int err;
157
158 vport = ovs_vport_alloc(sizeof(struct netdev_vport),
159 &ovs_internal_vport_ops, parms);
160 if (IS_ERR(vport)) {
161 err = PTR_ERR(vport);
162 goto error;
163 }
164
165 netdev_vport = netdev_vport_priv(vport);
166
167 netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
168 parms->name, do_setup);
169 if (!netdev_vport->dev) {
170 err = -ENOMEM;
171 goto error_free_vport;
172 }
173
174 internal_dev = internal_dev_priv(netdev_vport->dev);
175 internal_dev->vport = vport;
176
177 err = register_netdevice(netdev_vport->dev);
178 if (err)
179 goto error_free_netdev;
180
181 dev_set_promiscuity(netdev_vport->dev, 1);
182 netif_start_queue(netdev_vport->dev);
183
184 return vport;
185
186error_free_netdev:
187 free_netdev(netdev_vport->dev);
188error_free_vport:
189 ovs_vport_free(vport);
190error:
191 return ERR_PTR(err);
192}
193
194static void internal_dev_destroy(struct vport *vport)
195{
196 struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
197
198 netif_stop_queue(netdev_vport->dev);
199 dev_set_promiscuity(netdev_vport->dev, -1);
200
201 /* unregister_netdevice() waits for an RCU grace period. */
202 unregister_netdevice(netdev_vport->dev);
203}
204
205static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
206{
207 struct net_device *netdev = netdev_vport_priv(vport)->dev;
208 int len;
209
210 len = skb->len;
211 skb->dev = netdev;
212 skb->pkt_type = PACKET_HOST;
213 skb->protocol = eth_type_trans(skb, netdev);
214
215 netif_rx(skb);
216
217 return len;
218}
219
220const struct vport_ops ovs_internal_vport_ops = {
221 .type = OVS_VPORT_TYPE_INTERNAL,
222 .create = internal_dev_create,
223 .destroy = internal_dev_destroy,
224 .get_name = ovs_netdev_get_name,
225 .get_ifindex = ovs_netdev_get_ifindex,
226 .send = internal_dev_recv,
227};
228
229int ovs_is_internal_dev(const struct net_device *netdev)
230{
231 return netdev->netdev_ops == &internal_dev_netdev_ops;
232}
233
234struct vport *ovs_internal_dev_get_vport(struct net_device *netdev)
235{
236 if (!ovs_is_internal_dev(netdev))
237 return NULL;
238
239 return internal_dev_priv(netdev)->vport;
240}
diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h
new file mode 100644
index 000000000000..3454447c5f11
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef VPORT_INTERNAL_DEV_H
20#define VPORT_INTERNAL_DEV_H 1
21
22#include "datapath.h"
23#include "vport.h"
24
25int ovs_is_internal_dev(const struct net_device *);
26struct vport *ovs_internal_dev_get_vport(struct net_device *);
27
28#endif /* vport-internal_dev.h */
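
A hedged usage sketch for these helpers (example_inspect() is illustrative,
not part of the patch): ovs_internal_dev_get_vport() returns NULL for any
net_device that is not an OVS internal port, so callers can probe safely.

	static void example_inspect(struct net_device *dev)
	{
		struct vport *vport = ovs_internal_dev_get_vport(dev);

		if (vport)
			pr_info("%s is an OVS internal port (port_no %u)\n",
				dev->name, vport->port_no);
	}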
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
new file mode 100644
index 000000000000..c1068aed03d1
--- /dev/null
+++ b/net/openvswitch/vport-netdev.c
@@ -0,0 +1,198 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/if_arp.h>
22#include <linux/if_bridge.h>
23#include <linux/if_vlan.h>
24#include <linux/kernel.h>
25#include <linux/llc.h>
26#include <linux/rtnetlink.h>
27#include <linux/skbuff.h>
28
29#include <net/llc.h>
30
31#include "datapath.h"
32#include "vport-internal_dev.h"
33#include "vport-netdev.h"
34
35/* Must be called with rcu_read_lock. */
36static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
37{
38 if (unlikely(!vport)) {
39 kfree_skb(skb);
40 return;
41 }
42
43 /* Make our own copy of the packet. Otherwise we will mangle the
44 * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
45 * (No one comes after us, since netdev_frame_hook() returns
46 * RX_HANDLER_CONSUMED to say we took the packet.) */
47 skb = skb_share_check(skb, GFP_ATOMIC);
48 if (unlikely(!skb))
49 return;
50
51 skb_push(skb, ETH_HLEN);
52 ovs_vport_receive(vport, skb);
53}
54
55/* Called with rcu_read_lock and bottom-halves disabled. */
56static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
57{
58 struct sk_buff *skb = *pskb;
59 struct vport *vport;
60
61 if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
62 return RX_HANDLER_PASS;
63
64 vport = ovs_netdev_get_vport(skb->dev);
65
66 netdev_port_receive(vport, skb);
67
68 return RX_HANDLER_CONSUMED;
69}
70
71static struct vport *netdev_create(const struct vport_parms *parms)
72{
73 struct vport *vport;
74 struct netdev_vport *netdev_vport;
75 int err;
76
77 vport = ovs_vport_alloc(sizeof(struct netdev_vport),
78 &ovs_netdev_vport_ops, parms);
79 if (IS_ERR(vport)) {
80 err = PTR_ERR(vport);
81 goto error;
82 }
83
84 netdev_vport = netdev_vport_priv(vport);
85
86 netdev_vport->dev = dev_get_by_name(&init_net, parms->name);
87 if (!netdev_vport->dev) {
88 err = -ENODEV;
89 goto error_free_vport;
90 }
91
92 if (netdev_vport->dev->flags & IFF_LOOPBACK ||
93 netdev_vport->dev->type != ARPHRD_ETHER ||
94 ovs_is_internal_dev(netdev_vport->dev)) {
95 err = -EINVAL;
96 goto error_put;
97 }
98
99 err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
100 vport);
101 if (err)
102 goto error_put;
103
104 dev_set_promiscuity(netdev_vport->dev, 1);
105 netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
106
107 return vport;
108
109error_put:
110 dev_put(netdev_vport->dev);
111error_free_vport:
112 ovs_vport_free(vport);
113error:
114 return ERR_PTR(err);
115}
116
117static void netdev_destroy(struct vport *vport)
118{
119 struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
120
121 netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
122 netdev_rx_handler_unregister(netdev_vport->dev);
123 dev_set_promiscuity(netdev_vport->dev, -1);
124
125 synchronize_rcu();
126
127 dev_put(netdev_vport->dev);
128 ovs_vport_free(vport);
129}
130
131const char *ovs_netdev_get_name(const struct vport *vport)
132{
133 const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
134 return netdev_vport->dev->name;
135}
136
137int ovs_netdev_get_ifindex(const struct vport *vport)
138{
139 const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
140 return netdev_vport->dev->ifindex;
141}
142
143static unsigned packet_length(const struct sk_buff *skb)
144{
145 unsigned length = skb->len - ETH_HLEN;
146
147 if (skb->protocol == htons(ETH_P_8021Q))
148 length -= VLAN_HLEN;
149
150 return length;
151}
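
/* Worked example (annotation, not part of the patch): a full-sized
 * VLAN-tagged frame arrives with skb->len == 1518 (1500 bytes payload +
 * 14 bytes Ethernet header + 4 bytes VLAN tag). packet_length() returns
 * 1518 - ETH_HLEN - VLAN_HLEN = 1500, so netdev_send() below does not
 * misclassify tagged frames as over-MTU on a 1500-byte MTU device. */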
152
153static int netdev_send(struct vport *vport, struct sk_buff *skb)
154{
155 struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
156 int mtu = netdev_vport->dev->mtu;
157 int len;
158
159 if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
160 if (net_ratelimit())
161 pr_warn("%s: dropped over-mtu packet: %d > %d\n",
162 ovs_dp_name(vport->dp), packet_length(skb), mtu);
163 goto error;
164 }
165
166 if (unlikely(skb_warn_if_lro(skb)))
167 goto error;
168
169 skb->dev = netdev_vport->dev;
170 len = skb->len;
171 dev_queue_xmit(skb);
172
173 return len;
174
175error:
176 kfree_skb(skb);
177 ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
178 return 0;
179}
180
181/* Returns NULL if this device is not attached to a datapath. */
182struct vport *ovs_netdev_get_vport(struct net_device *dev)
183{
184 if (likely(dev->priv_flags & IFF_OVS_DATAPATH))
185 return (struct vport *)
186 rcu_dereference_rtnl(dev->rx_handler_data);
187 else
188 return NULL;
189}
190
191const struct vport_ops ovs_netdev_vport_ops = {
192 .type = OVS_VPORT_TYPE_NETDEV,
193 .create = netdev_create,
194 .destroy = netdev_destroy,
195 .get_name = ovs_netdev_get_name,
196 .get_ifindex = ovs_netdev_get_ifindex,
197 .send = netdev_send,
198};
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
new file mode 100644
index 000000000000..fd9b008a0e6e
--- /dev/null
+++ b/net/openvswitch/vport-netdev.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef VPORT_NETDEV_H
20#define VPORT_NETDEV_H 1
21
22#include <linux/netdevice.h>
23
24#include "vport.h"
25
26struct vport *ovs_netdev_get_vport(struct net_device *dev);
27
28struct netdev_vport {
29 struct net_device *dev;
30};
31
32static inline struct netdev_vport *
33netdev_vport_priv(const struct vport *vport)
34{
35 return vport_priv(vport);
36}
37
38const char *ovs_netdev_get_name(const struct vport *);
40int ovs_netdev_get_ifindex(const struct vport *);
41
42#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
new file mode 100644
index 000000000000..6c066ba25dc7
--- /dev/null
+++ b/net/openvswitch/vport.c
@@ -0,0 +1,397 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include <linux/dcache.h>
20#include <linux/etherdevice.h>
21#include <linux/if.h>
22#include <linux/if_vlan.h>
23#include <linux/kernel.h>
24#include <linux/list.h>
25#include <linux/mutex.h>
26#include <linux/percpu.h>
27#include <linux/rcupdate.h>
28#include <linux/rtnetlink.h>
29#include <linux/compat.h>
30
31#include "vport.h"
32#include "vport-internal_dev.h"
33
34/* List of statically compiled vport implementations. Don't forget to also
35 * add yours to the list at the bottom of vport.h. */
36static const struct vport_ops *vport_ops_list[] = {
37 &ovs_netdev_vport_ops,
38 &ovs_internal_vport_ops,
39};
40
41/* Protected by RCU read lock for reading, RTNL lock for writing. */
42static struct hlist_head *dev_table;
43#define VPORT_HASH_BUCKETS 1024
44
45/**
46 * ovs_vport_init - initialize vport subsystem
47 *
48 * Called at module load time to initialize the vport subsystem.
49 */
50int ovs_vport_init(void)
51{
52 dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
53 GFP_KERNEL);
54 if (!dev_table)
55 return -ENOMEM;
56
57 return 0;
58}
59
60/**
61 * ovs_vport_exit - shutdown vport subsystem
62 *
63 * Called at module exit time to shutdown the vport subsystem.
64 */
65void ovs_vport_exit(void)
66{
67 kfree(dev_table);
68}
69
70static struct hlist_head *hash_bucket(const char *name)
71{
72 unsigned int hash = full_name_hash(name, strlen(name));
73 return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)];
74}
75
76/**
77 * ovs_vport_locate - find a port that has already been created
78 *
79 * @name: name of port to find
80 *
81 * Must be called with RTNL or RCU read lock.
82 */
83struct vport *ovs_vport_locate(const char *name)
84{
85 struct hlist_head *bucket = hash_bucket(name);
86 struct vport *vport;
87 struct hlist_node *node;
88
89 hlist_for_each_entry_rcu(vport, node, bucket, hash_node)
90 if (!strcmp(name, vport->ops->get_name(vport)))
91 return vport;
92
93 return NULL;
94}
95
96/**
97 * ovs_vport_alloc - allocate and initialize new vport
98 *
99 * @priv_size: Size of private data area to allocate.
100 * @ops: vport device ops
101 *
102 * Allocate and initialize a new vport defined by @ops. The vport will contain
103 * a private data area of size @priv_size that can be accessed using
104 * vport_priv(). vports that are no longer needed should be released with
105 * ovs_vport_free().
106 */
107struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
108 const struct vport_parms *parms)
109{
110 struct vport *vport;
111 size_t alloc_size;
112
113 alloc_size = sizeof(struct vport);
114 if (priv_size) {
115 alloc_size = ALIGN(alloc_size, VPORT_ALIGN);
116 alloc_size += priv_size;
117 }
118
119 vport = kzalloc(alloc_size, GFP_KERNEL);
120 if (!vport)
121 return ERR_PTR(-ENOMEM);
122
123 vport->dp = parms->dp;
124 vport->port_no = parms->port_no;
125 vport->upcall_pid = parms->upcall_pid;
126 vport->ops = ops;
127
128 vport->percpu_stats = alloc_percpu(struct vport_percpu_stats);
129 if (!vport->percpu_stats) {
130 kfree(vport);
131 return ERR_PTR(-ENOMEM);
132 }
133
134 spin_lock_init(&vport->stats_lock);
135
136 return vport;
137}
138
139/**
140 * ovs_vport_free - uninitialize and free vport
141 *
142 * @vport: vport to free
143 *
144 * Frees a vport allocated with ovs_vport_alloc() when it is no longer needed.
145 *
146 * The caller must ensure that an RCU grace period has passed since the last
147 * time @vport was in a datapath.
148 */
149void ovs_vport_free(struct vport *vport)
150{
151 free_percpu(vport->percpu_stats);
152 kfree(vport);
153}
154
155/**
156 * ovs_vport_add - add vport device (for kernel callers)
157 *
158 * @parms: Information about new vport.
159 *
160 * Creates a new vport with the specified configuration (which is dependent on
161 * device type). RTNL lock must be held.
162 */
163struct vport *ovs_vport_add(const struct vport_parms *parms)
164{
165 struct vport *vport;
166 int err = 0;
167 int i;
168
169 ASSERT_RTNL();
170
171 for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) {
172 if (vport_ops_list[i]->type == parms->type) {
173 vport = vport_ops_list[i]->create(parms);
174 if (IS_ERR(vport)) {
175 err = PTR_ERR(vport);
176 goto out;
177 }
178
179 hlist_add_head_rcu(&vport->hash_node,
180 hash_bucket(vport->ops->get_name(vport)));
181 return vport;
182 }
183 }
184
185 err = -EAFNOSUPPORT;
186
187out:
188 return ERR_PTR(err);
189}
190
191/**
192 * ovs_vport_set_options - modify existing vport device (for kernel callers)
193 *
194 * @vport: vport to modify.
195 * @options: New configuration.
196 *
197 * Modifies an existing device with the specified configuration (which is
198 * dependent on device type). RTNL lock must be held.
199 */
200int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
201{
202 ASSERT_RTNL();
203
204 if (!vport->ops->set_options)
205 return -EOPNOTSUPP;
206 return vport->ops->set_options(vport, options);
207}
208
209/**
210 * ovs_vport_del - delete existing vport device
211 *
212 * @vport: vport to delete.
213 *
214 * Detaches @vport from its datapath and destroys it. RTNL lock must be
215 * held.
216 */
217void ovs_vport_del(struct vport *vport)
218{
219 ASSERT_RTNL();
220
221 hlist_del_rcu(&vport->hash_node);
222
223 vport->ops->destroy(vport);
224}
225
226/**
227 * ovs_vport_get_stats - retrieve device stats
228 *
229 * @vport: vport from which to retrieve the stats
230 * @stats: location to store stats
231 *
232 * Retrieves transmit, receive, and error stats for the given device.
233 *
234 * Must be called with RTNL lock or rcu_read_lock.
235 */
236void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
237{
238 int i;
239
240 memset(stats, 0, sizeof(*stats));
241
242 /* We potentially have two sources of stats that need to be combined:
243 * those we have collected (split into err_stats and percpu_stats),
244 * which are updated by ovs_vport_receive(), ovs_vport_send() and
245 * ovs_vport_record_error(), plus device error stats from
246 * netdev->get_stats() for errors that happen downstream and thus
247 * aren't reported through ovs_vport_record_error().
248 * Stats from the first source are reported via OVS_VPORT_ATTR_STATS;
249 * netdev stats can be read directly over netlink or ioctl. */
250
251 spin_lock_bh(&vport->stats_lock);
252
253 stats->rx_errors = vport->err_stats.rx_errors;
254 stats->tx_errors = vport->err_stats.tx_errors;
255 stats->tx_dropped = vport->err_stats.tx_dropped;
256 stats->rx_dropped = vport->err_stats.rx_dropped;
257
258 spin_unlock_bh(&vport->stats_lock);
259
260 for_each_possible_cpu(i) {
261 const struct vport_percpu_stats *percpu_stats;
262 struct vport_percpu_stats local_stats;
263 unsigned int start;
264
265 percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
266
267 do {
268 start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
269 local_stats = *percpu_stats;
270 } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
271
272 stats->rx_bytes += local_stats.rx_bytes;
273 stats->rx_packets += local_stats.rx_packets;
274 stats->tx_bytes += local_stats.tx_bytes;
275 stats->tx_packets += local_stats.tx_packets;
276 }
277}
278
279/**
280 * ovs_vport_get_options - retrieve device options
281 *
282 * @vport: vport from which to retrieve the options.
283 * @skb: sk_buff where options should be appended.
284 *
285 * Retrieves the configuration of the given device, appending an
286 * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested
287 * vport-specific attributes to @skb.
288 *
289 * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another
290 * negative error code if a real error occurred. If an error occurs, @skb is
291 * left unmodified.
292 *
293 * Must be called with RTNL lock or rcu_read_lock.
294 */
295int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
296{
297 struct nlattr *nla;
298
299 nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS);
300 if (!nla)
301 return -EMSGSIZE;
302
303 if (vport->ops->get_options) {
304 int err = vport->ops->get_options(vport, skb);
305 if (err) {
306 nla_nest_cancel(skb, nla);
307 return err;
308 }
309 }
310
311 nla_nest_end(skb, nla);
312 return 0;
313}
314
315/**
316 * ovs_vport_receive - pass up received packet to the datapath for processing
317 *
318 * @vport: vport that received the packet
319 * @skb: skb that was received
320 *
321 * Must be called with rcu_read_lock. The packet cannot be shared,
322 * skb->data should point to the Ethernet header, and the skb's
323 * checksum state must already be initialized by the caller.
324 */
325void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
326{
327 struct vport_percpu_stats *stats;
328
329 stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
330
331 u64_stats_update_begin(&stats->sync);
332 stats->rx_packets++;
333 stats->rx_bytes += skb->len;
334 u64_stats_update_end(&stats->sync);
335
336 ovs_dp_process_received_packet(vport, skb);
337}
338
339/**
340 * ovs_vport_send - send a packet on a device
341 *
342 * @vport: vport on which to send the packet
343 * @skb: skb to send
344 *
345 * Sends the given packet and returns the length of data sent. Either RTNL
346 * lock or rcu_read_lock must be held.
347 */
348int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
349{
350 int sent = vport->ops->send(vport, skb);
351
352 if (likely(sent)) {
353 struct vport_percpu_stats *stats;
354
355 stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
356
357 u64_stats_update_begin(&stats->sync);
358 stats->tx_packets++;
359 stats->tx_bytes += sent;
360 u64_stats_update_end(&stats->sync);
361 }
362 return sent;
363}
364
365/**
366 * ovs_vport_record_error - indicate device error to generic stats layer
367 *
368 * @vport: vport that encountered the error
369 * @err_type: one of enum vport_err_type types to indicate the error type
370 *
371 * If using the vport generic stats layer, indicate that an error of the
372 * given type has occurred.
373 */
374void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type)
375{
376 spin_lock(&vport->stats_lock);
377
378 switch (err_type) {
379 case VPORT_E_RX_DROPPED:
380 vport->err_stats.rx_dropped++;
381 break;
382
383 case VPORT_E_RX_ERROR:
384 vport->err_stats.rx_errors++;
385 break;
386
387 case VPORT_E_TX_DROPPED:
388 vport->err_stats.tx_dropped++;
389 break;
390
391 case VPORT_E_TX_ERROR:
392 vport->err_stats.tx_errors++;
393 break;
394 }
395
396 spin_unlock(&vport->stats_lock);
397}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
new file mode 100644
index 000000000000..19609629dabd
--- /dev/null
+++ b/net/openvswitch/vport.h
@@ -0,0 +1,205 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef VPORT_H
20#define VPORT_H 1
21
22#include <linux/list.h>
23#include <linux/openvswitch.h>
24#include <linux/skbuff.h>
25#include <linux/spinlock.h>
26#include <linux/u64_stats_sync.h>
27
28#include "datapath.h"
29
30struct vport;
31struct vport_parms;
32
33/* The following definitions are for users of the vport subsystem: */
34
35int ovs_vport_init(void);
36void ovs_vport_exit(void);
37
38struct vport *ovs_vport_add(const struct vport_parms *);
39void ovs_vport_del(struct vport *);
40
41struct vport *ovs_vport_locate(const char *name);
42
43void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *);
44
45int ovs_vport_set_options(struct vport *, struct nlattr *options);
46int ovs_vport_get_options(const struct vport *, struct sk_buff *);
47
48int ovs_vport_send(struct vport *, struct sk_buff *);
49
50/* The following definitions are for implementers of vport devices: */
51
52struct vport_percpu_stats {
53 u64 rx_bytes;
54 u64 rx_packets;
55 u64 tx_bytes;
56 u64 tx_packets;
57 struct u64_stats_sync sync;
58};
59
60struct vport_err_stats {
61 u64 rx_dropped;
62 u64 rx_errors;
63 u64 tx_dropped;
64 u64 tx_errors;
65};
66
67/**
68 * struct vport - one port within a datapath
69 * @rcu: RCU callback head for deferred destruction.
70 * @port_no: Index into @dp's @ports array.
71 * @dp: Datapath to which this port belongs.
72 * @node: Element in @dp's @port_list.
73 * @upcall_pid: The Netlink port to use for packets received on this port that
74 * miss the flow table.
75 * @hash_node: Element in @dev_table hash table in vport.c.
76 * @ops: Class structure.
77 * @percpu_stats: Points to per-CPU statistics used and maintained by vport.
78 * @stats_lock: Protects @err_stats.
79 * @err_stats: Points to error statistics used and maintained by vport.
80 */
81struct vport {
82 struct rcu_head rcu;
83 u16 port_no;
84 struct datapath *dp;
85 struct list_head node;
86 u32 upcall_pid;
87
88 struct hlist_node hash_node;
89 const struct vport_ops *ops;
90
91 struct vport_percpu_stats __percpu *percpu_stats;
92
93 spinlock_t stats_lock;
94 struct vport_err_stats err_stats;
95};
96
97/**
98 * struct vport_parms - parameters for creating a new vport
99 *
100 * @name: New vport's name.
101 * @type: New vport's type.
102 * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
103 * none was supplied.
104 * @dp: New vport's datapath.
105 * @port_no: New vport's port number.
106 */
107struct vport_parms {
108 const char *name;
109 enum ovs_vport_type type;
110 struct nlattr *options;
111
112 /* For ovs_vport_alloc(). */
113 struct datapath *dp;
114 u16 port_no;
115 u32 upcall_pid;
116};
117
118/**
119 * struct vport_ops - definition of a type of virtual port
120 *
121 * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
122 * @create: Create a new vport configured as specified. On success returns
123 * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
124 * @destroy: Destroys a vport. Must call ovs_vport_free() on the vport but not
125 * before an RCU grace period has elapsed.
126 * @set_options: Modify the configuration of an existing vport. May be %NULL
127 * if modification is not supported.
128 * @get_options: Appends vport-specific attributes for the configuration of an
129 * existing vport to a &struct sk_buff. May be %NULL for a vport that does not
130 * have any configuration.
131 * @get_name: Get the device's name.
132 * @get_config: Get the device's configuration.
133 * @get_ifindex: Get the system interface index associated with the device.
134 * May be %NULL if the device does not have an ifindex.
135 * @send: Send a packet on the device. Returns the length of the packet sent.
136 */
137struct vport_ops {
138 enum ovs_vport_type type;
139
140 /* Called with RTNL lock. */
141 struct vport *(*create)(const struct vport_parms *);
142 void (*destroy)(struct vport *);
143
144 int (*set_options)(struct vport *, struct nlattr *);
145 int (*get_options)(const struct vport *, struct sk_buff *);
146
147 /* Called with rcu_read_lock or RTNL lock. */
148 const char *(*get_name)(const struct vport *);
149 void (*get_config)(const struct vport *, void *);
150 int (*get_ifindex)(const struct vport *);
151
152 int (*send)(struct vport *, struct sk_buff *);
153};
154
155enum vport_err_type {
156 VPORT_E_RX_DROPPED,
157 VPORT_E_RX_ERROR,
158 VPORT_E_TX_DROPPED,
159 VPORT_E_TX_ERROR,
160};
161
162struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
163 const struct vport_parms *);
164void ovs_vport_free(struct vport *);
165
166#define VPORT_ALIGN 8
167
168/**
169 * vport_priv - access private data area of vport
170 *
171 * @vport: vport to access
172 *
173 * If a nonzero size was passed in priv_size of ovs_vport_alloc(), a private data
174 * area was allocated on creation. This allows that area to be accessed and
175 * used for any purpose needed by the vport implementer.
176 */
177static inline void *vport_priv(const struct vport *vport)
178{
179 return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
180}
181
182/**
183 * vport_from_priv - lookup vport from private data pointer
184 *
185 * @priv: Start of private data area.
186 *
187 * It is sometimes useful to translate from a pointer to the private data
188 * area to the vport, such as in the case where the private data pointer is
189 * the result of a hash table lookup. @priv must point to the start of the
190 * private data area.
191 */
192static inline struct vport *vport_from_priv(const void *priv)
193{
194 return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
195}
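
/* Hedged sketch (not part of the patch): the vport_priv()/vport_from_priv()
 * round trip as a vport implementation with private state might use it.
 * struct example_priv is illustrative only. */
struct example_priv {
	int cookie;
};

static inline void example_roundtrip(struct vport *vport)
{
	struct example_priv *priv = vport_priv(vport);

	priv->cookie = 42;
	WARN_ON(vport_from_priv(priv) != vport);
}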
196
197void ovs_vport_receive(struct vport *, struct sk_buff *);
198void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
199
200/* List of statically compiled vport implementations. Don't forget to also
201 * add yours to the list at the top of vport.c. */
202extern const struct vport_ops ovs_netdev_vport_ops;
203extern const struct vport_ops ovs_internal_vport_ops;
204
205#endif /* vport.h */
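
For orientation, a hedged skeleton of what a new statically compiled vport
type would look like under this API. Per the comments above, it would be added
to vport_ops_list[] in vport.c and declared at the bottom of vport.h; every
name below is illustrative, and a real port type would define its own
OVS_VPORT_TYPE_* value rather than reusing OVS_VPORT_TYPE_UNSPEC.

	extern const struct vport_ops ovs_example_vport_ops;

	static struct vport *example_create(const struct vport_parms *parms)
	{
		/* No private data area, so priv_size is 0. */
		return ovs_vport_alloc(0, &ovs_example_vport_ops, parms);
	}

	static void example_destroy(struct vport *vport)
	{
		synchronize_rcu();	/* per @destroy: let RCU readers drain first */
		ovs_vport_free(vport);
	}

	static const char *example_get_name(const struct vport *vport)
	{
		return "ovs-example0";	/* a real type returns its device name */
	}

	static int example_send(struct vport *vport, struct sk_buff *skb)
	{
		int len = skb->len;

		kfree_skb(skb);		/* a real type would transmit here */
		return len;
	}

	const struct vport_ops ovs_example_vport_ops = {
		.type		= OVS_VPORT_TYPE_UNSPEC,
		.create		= example_create,
		.destroy	= example_destroy,
		.get_name	= example_get_name,
		.send		= example_send,
	};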