author	Jesse Gross <jesse@nicira.com>	2011-10-25 22:26:31 -0400
committer	Jesse Gross <jesse@nicira.com>	2011-12-03 12:35:17 -0500
commit	ccb1352e76cff0524e7ccb2074826a092dd13016 (patch)
tree	9122ceff5d75ec64e327a9fad4ad2013744c2999
parent	75f2811c6460ccc59d83c66059943ce9c9f81a18 (diff)
net: Add Open vSwitch kernel components.
Open vSwitch is a multilayer Ethernet switch targeted at virtualized
environments. In addition to supporting a variety of features expected
in a traditional hardware switch, it enables fine-grained programmatic
extension and flow-based control of the network. This control is useful
in a wide variety of applications but is particularly important in
multi-server virtualization deployments, which are often characterized
by highly dynamic endpoints and the need to maintain logical
abstractions for multiple tenants.

The Open vSwitch datapath provides an in-kernel fast path for packet
forwarding. It is complemented by a userspace daemon, ovs-vswitchd,
which is able to accept configuration from a variety of sources and
translate it into packet processing rules.

See http://openvswitch.org for more information and userspace
utilities.

Signed-off-by: Jesse Gross <jesse@nicira.com>
-rw-r--r--	Documentation/networking/00-INDEX	2
-rw-r--r--	Documentation/networking/openvswitch.txt	195
-rw-r--r--	MAINTAINERS	8
-rw-r--r--	include/linux/openvswitch.h	452
-rw-r--r--	net/Kconfig	1
-rw-r--r--	net/Makefile	1
-rw-r--r--	net/openvswitch/Kconfig	28
-rw-r--r--	net/openvswitch/Makefile	14
-rw-r--r--	net/openvswitch/actions.c	415
-rw-r--r--	net/openvswitch/datapath.c	1912
-rw-r--r--	net/openvswitch/datapath.h	125
-rw-r--r--	net/openvswitch/dp_notify.c	66
-rw-r--r--	net/openvswitch/flow.c	1346
-rw-r--r--	net/openvswitch/flow.h	199
-rw-r--r--	net/openvswitch/vport-internal_dev.c	241
-rw-r--r--	net/openvswitch/vport-internal_dev.h	28
-rw-r--r--	net/openvswitch/vport-netdev.c	198
-rw-r--r--	net/openvswitch/vport-netdev.h	42
-rw-r--r--	net/openvswitch/vport.c	396
-rw-r--r--	net/openvswitch/vport.h	205
20 files changed, 5874 insertions, 0 deletions
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index bbce1215434..9ad9ddeb384 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -144,6 +144,8 @@ nfc.txt
 	- The Linux Near Field Communication (NFC) subsystem.
 olympic.txt
 	- IBM PCI Pit/Pit-Phy/Olympic Token Ring driver info.
+openvswitch.txt
+	- Open vSwitch developer documentation.
 operstates.txt
 	- Overview of network interface operational states.
 packet_mmap.txt
diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt
new file mode 100644
index 00000000000..b8a048b8df3
--- /dev/null
+++ b/Documentation/networking/openvswitch.txt
@@ -0,0 +1,195 @@
Open vSwitch datapath developer documentation
=============================================

The Open vSwitch kernel module allows flexible userspace control over
flow-level packet processing on selected network devices. It can be
used to implement a plain Ethernet switch, network device bonding,
VLAN processing, network access control, flow-based network control,
and so on.

The kernel module implements multiple "datapaths" (analogous to
bridges), each of which can have multiple "vports" (analogous to ports
within a bridge). Each datapath also has associated with it a "flow
table" that userspace populates with "flows" that map from keys based
on packet headers and metadata to sets of actions. The most common
action forwards the packet to another vport; other actions are also
implemented.

When a packet arrives on a vport, the kernel module processes it by
extracting its flow key and looking it up in the flow table. If there
is a matching flow, it executes the associated actions. If there is
no match, it queues the packet to userspace for processing (as part of
its processing, userspace will likely set up a flow to handle further
packets of the same type entirely in-kernel).


Flow key compatibility
----------------------

Network protocols evolve over time. New protocols become important
and existing protocols lose their prominence. For the Open vSwitch
kernel module to remain relevant, it must be possible for newer
versions to parse additional protocols as part of the flow key. It
might even be desirable, someday, to drop support for parsing
protocols that have become obsolete. Therefore, the Netlink interface
to Open vSwitch is designed to allow carefully written userspace
applications to work with any version of the flow key, past or future.

To support this forward and backward compatibility, whenever the
kernel module passes a packet to userspace, it also passes along the
flow key that it parsed from the packet. Userspace then extracts its
own notion of a flow key from the packet and compares it against the
kernel-provided version:

  - If userspace's notion of the flow key for the packet matches the
    kernel's, then nothing special is necessary.

  - If the kernel's flow key includes more fields than the userspace
    version of the flow key, for example if the kernel decoded IPv6
    headers but userspace stopped at the Ethernet type (because it
    does not understand IPv6), then again nothing special is
    necessary. Userspace can still set up a flow in the usual way,
    as long as it uses the kernel-provided flow key to do it.

  - If the userspace flow key includes more fields than the
    kernel's, for example if userspace decoded an IPv6 header but
    the kernel stopped at the Ethernet type, then userspace can
    forward the packet manually, without setting up a flow in the
    kernel. This case is bad for performance because every packet
    that the kernel considers part of the flow must go to userspace,
    but the forwarding behavior is correct. (If userspace can
    determine that the values of the extra fields would not affect
    forwarding behavior, then it could set up a flow anyway.)

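In C-like pseudocode, a userspace miss handler could apply the rules
above roughly as follows. This is only a sketch of the decision logic;
flow_key_equal(), key_extends(), and the surrounding helpers are
hypothetical names, not part of the kernel interface:

    void handle_miss(struct packet *pkt, struct nl_key *kernel_key)
    {
            struct nl_key user_key = extract_flow_key(pkt);

            if (flow_key_equal(&user_key, kernel_key) ||
                key_extends(kernel_key, &user_key))
                    /* First two cases: set up a kernel flow, always
                     * using the kernel-provided flow key. */
                    setup_flow(kernel_key, actions_for(pkt));
            else
                    /* Third case: the kernel parsed less than we did,
                     * so keep handling these packets in userspace. */
                    forward_manually(pkt);
    }
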
How flow keys evolve over time is important to making this work, so
the following sections go into detail.


Flow key format
---------------

A flow key is passed over a Netlink socket as a sequence of Netlink
attributes. Some attributes represent packet metadata, defined as any
information about a packet that cannot be extracted from the packet
itself, e.g. the vport on which the packet was received. Most
attributes, however, are extracted from headers within the packet,
e.g. source and destination addresses from Ethernet, IP, or TCP
headers.

The <linux/openvswitch.h> header file defines the exact format of the
flow key attributes. For informal explanatory purposes here, we write
them as comma-separated strings, with parentheses indicating arguments
and nesting. For example, the following could represent a flow key
corresponding to a TCP packet that arrived on vport 1:

    in_port(1), eth(src=e0:91:f5:21:d0:b2, dst=00:02:e3:0f:80:a4),
    eth_type(0x0800), ipv4(src=172.16.0.20, dst=172.18.0.52, proto=6, tos=0,
    frag=no), tcp(src=49163, dst=80)

Often we ellipsize arguments not important to the discussion, e.g.:

    in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...)

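In terms of the actual Netlink encoding, the first example above could
be emitted with the usual netlink attribute helpers. This is a sketch
in the style of the kernel's nla_put*() functions; "msg" and the three
key structs are assumed to have been prepared by the caller:

    nla_put_u32(msg, OVS_KEY_ATTR_IN_PORT, 1);
    nla_put(msg, OVS_KEY_ATTR_ETHERNET, sizeof(struct ovs_key_ethernet), &eth);
    nla_put_be16(msg, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_IP));
    nla_put(msg, OVS_KEY_ATTR_IPV4, sizeof(struct ovs_key_ipv4), &ipv4);
    nla_put(msg, OVS_KEY_ATTR_TCP, sizeof(struct ovs_key_tcp), &tcp);
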

Basic rule for evolving flow keys
---------------------------------

Some care is needed to really maintain forward and backward
compatibility for applications that follow the rules listed under
"Flow key compatibility" above.

The basic rule is obvious:

    ------------------------------------------------------------------
    New network protocol support must only supplement existing flow
    key attributes. It must not change the meaning of already defined
    flow key attributes.
    ------------------------------------------------------------------

This rule does have less-obvious consequences so it is worth working
through a few examples. Suppose, for example, that the kernel module
did not already implement VLAN parsing. Instead, it just interpreted
the 802.1Q TPID (0x8100) as the Ethertype then stopped parsing the
packet. The flow key for any packet with an 802.1Q header would look
essentially like this, ignoring metadata:

    eth(...), eth_type(0x8100)

Naively, to add VLAN support, it makes sense to add a new "vlan" flow
key attribute to contain the VLAN tag, then continue to decode the
encapsulated headers beyond the VLAN tag using the existing field
definitions. With this change, a TCP packet in VLAN 10 would have a
flow key much like this:

    eth(...), vlan(vid=10, pcp=0), eth_type(0x0800), ip(proto=6, ...), tcp(...)

But this change would negatively affect a userspace application that
has not been updated to understand the new "vlan" flow key attribute.
The application could, following the flow compatibility rules above,
ignore the "vlan" attribute that it does not understand and therefore
assume that the flow contained IP packets. This is a bad assumption
(the flow only contains IP packets if one parses and skips over the
802.1Q header) and it could cause the application's behavior to change
across kernel versions even though it follows the compatibility rules.

The solution is to use a set of nested attributes. This is, for
example, why 802.1Q support uses nested attributes. A TCP packet in
VLAN 10 is actually expressed as:

    eth(...), eth_type(0x8100), vlan(vid=10, pcp=0), encap(eth_type(0x0800),
    ip(proto=6, ...), tcp(...))

Notice how the "eth_type", "ip", and "tcp" flow key attributes are
nested inside the "encap" attribute. Thus, an application that does
not understand the "vlan" key will not see any of those attributes
and therefore will not misinterpret them. (Also, the outer eth_type
is still 0x8100, not changed to 0x0800.)

Handling malformed packets
--------------------------

Don't drop packets in the kernel for malformed protocol headers, bad
checksums, etc. This would prevent userspace from implementing a
simple Ethernet switch that forwards every packet.

Instead, in such a case, include an attribute with "empty" content.
It doesn't matter if the empty content could be valid protocol values,
as long as those values are rarely seen in practice, because userspace
can always forward all packets with those values to userspace and
handle them individually.

For example, consider a packet that contains an IP header that
indicates protocol 6 for TCP, but which is truncated just after the IP
header, so that the TCP header is missing. The flow key for this
packet would include a tcp attribute with all-zero src and dst, like
this:

    eth(...), eth_type(0x0800), ip(proto=6, ...), tcp(src=0, dst=0)

As another example, consider a packet with an Ethernet type of 0x8100,
indicating that a VLAN TCI should follow, but which is truncated just
after the Ethernet type. The flow key for this packet would include
an all-zero-bits vlan and an empty encap attribute, like this:

    eth(...), eth_type(0x8100), vlan(0), encap()

Unlike a TCP packet with source and destination ports 0, an
all-zero-bits VLAN TCI is not that rare, so the CFI bit (aka
VLAN_TAG_PRESENT inside the kernel) is ordinarily set in a vlan
attribute expressly to allow this situation to be distinguished.
Thus, the flow key in this second example unambiguously indicates a
missing or malformed VLAN TCI.

Other rules
-----------

The other rules for flow keys are much less subtle:

  - Duplicate attributes are not allowed at a given nesting level.

  - Ordering of attributes is not significant.

  - When the kernel sends a given flow key to userspace, it always
    composes it the same way. This allows userspace to hash and
    compare entire flow keys that it may not be able to fully
    interpret.
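
Because composition is canonical, userspace can, for example, hash the
kernel-provided key attributes as raw bytes without interpreting them.
A sketch, where jhash() stands in for any hash function of the
application's choosing:

    /* Safe only because the kernel always composes a given flow key
     * the same way. */
    uint32_t flow_key_hash(const struct nlattr *key, int key_len)
    {
            return jhash(key, key_len, 0);
    }
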
diff --git a/MAINTAINERS b/MAINTAINERS
index c88eb7bb3a6..209ad0695ba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4868,6 +4868,14 @@ S: Maintained
 T:	git git://openrisc.net/~jonas/linux
 F:	arch/openrisc
 
+OPENVSWITCH
+M:	Jesse Gross <jesse@nicira.com>
+L:	dev@openvswitch.org
+W:	http://openvswitch.org
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jesse/openvswitch.git
+S:	Maintained
+F:	net/openvswitch/
+
 OPL4 DRIVER
 M:	Clemens Ladisch <clemens@ladisch.de>
 L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
new file mode 100644
index 00000000000..eb1efa54fe8
--- /dev/null
+++ b/include/linux/openvswitch.h
@@ -0,0 +1,452 @@
/*
 * Copyright (c) 2007-2011 Nicira Networks.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#ifndef _LINUX_OPENVSWITCH_H
#define _LINUX_OPENVSWITCH_H 1

#include <linux/types.h>

/**
 * struct ovs_header - header for OVS Generic Netlink messages.
 * @dp_ifindex: ifindex of local port for datapath (0 to make a request not
 * specific to a datapath).
 *
 * Attributes following the header are specific to a particular OVS Generic
 * Netlink family, but all of the OVS families use this header.
 */

struct ovs_header {
	int dp_ifindex;
};

/* Datapaths. */

#define OVS_DATAPATH_FAMILY  "ovs_datapath"
#define OVS_DATAPATH_MCGROUP "ovs_datapath"
#define OVS_DATAPATH_VERSION 0x1

enum ovs_datapath_cmd {
	OVS_DP_CMD_UNSPEC,
	OVS_DP_CMD_NEW,
	OVS_DP_CMD_DEL,
	OVS_DP_CMD_GET,
	OVS_DP_CMD_SET
};

/**
 * enum ovs_datapath_attr - attributes for %OVS_DP_* commands.
 * @OVS_DP_ATTR_NAME: Name of the network device that serves as the "local
 * port". This is the name of the network device whose dp_ifindex is given in
 * the &struct ovs_header. Always present in notifications. Required in
 * %OVS_DP_CMD_NEW requests. May be used as an alternative to specifying
 * dp_ifindex in other requests (with a dp_ifindex of 0).
 * @OVS_DP_ATTR_UPCALL_PID: The Netlink socket in userspace that is initially
 * set on the datapath port (for %OVS_PACKET_CMD_MISS upcalls). Only valid on
 * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
 * not be sent.
 * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
 * datapath. Always present in notifications.
 *
 * These attributes follow the &struct ovs_header within the Generic Netlink
 * payload for %OVS_DP_* commands.
 */
enum ovs_datapath_attr {
	OVS_DP_ATTR_UNSPEC,
	OVS_DP_ATTR_NAME,	/* name of dp_ifindex netdev */
	OVS_DP_ATTR_UPCALL_PID,	/* Netlink PID to receive upcalls */
	OVS_DP_ATTR_STATS,	/* struct ovs_dp_stats */
	__OVS_DP_ATTR_MAX
};

#define OVS_DP_ATTR_MAX (__OVS_DP_ATTR_MAX - 1)

struct ovs_dp_stats {
	__u64 n_hit;	/* Number of flow table matches. */
	__u64 n_missed;	/* Number of flow table misses. */
	__u64 n_lost;	/* Number of misses not sent to userspace. */
	__u64 n_flows;	/* Number of flows present */
};

struct ovs_vport_stats {
	__u64 rx_packets;	/* total packets received */
	__u64 tx_packets;	/* total packets transmitted */
	__u64 rx_bytes;		/* total bytes received */
	__u64 tx_bytes;		/* total bytes transmitted */
	__u64 rx_errors;	/* bad packets received */
	__u64 tx_errors;	/* packet transmit problems */
	__u64 rx_dropped;	/* no space in linux buffers */
	__u64 tx_dropped;	/* no space available in linux */
};

/* Fixed logical ports. */
#define OVSP_LOCAL ((__u16)0)

/* Packet transfer. */

#define OVS_PACKET_FAMILY "ovs_packet"
#define OVS_PACKET_VERSION 0x1

enum ovs_packet_cmd {
	OVS_PACKET_CMD_UNSPEC,

	/* Kernel-to-user notifications. */
	OVS_PACKET_CMD_MISS,	/* Flow table miss. */
	OVS_PACKET_CMD_ACTION,	/* OVS_ACTION_ATTR_USERSPACE action. */

	/* Userspace commands. */
	OVS_PACKET_CMD_EXECUTE	/* Apply actions to a packet. */
};

/**
 * enum ovs_packet_attr - attributes for %OVS_PACKET_* commands.
 * @OVS_PACKET_ATTR_PACKET: Present for all notifications. Contains the entire
 * packet as received, from the start of the Ethernet header onward. For
 * %OVS_PACKET_CMD_ACTION, %OVS_PACKET_ATTR_PACKET reflects changes made by
 * actions preceding %OVS_ACTION_ATTR_USERSPACE, but %OVS_PACKET_ATTR_KEY is
 * the flow key extracted from the packet as originally received.
 * @OVS_PACKET_ATTR_KEY: Present for all notifications. Contains the flow key
 * extracted from the packet as nested %OVS_KEY_ATTR_* attributes. This allows
 * userspace to adapt its flow setup strategy by comparing its notion of the
 * flow key against the kernel's.
 * @OVS_PACKET_ATTR_ACTIONS: Contains actions for the packet. Used
 * for %OVS_PACKET_CMD_EXECUTE. It has nested %OVS_ACTION_ATTR_* attributes.
 * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION
 * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an
 * %OVS_USERSPACE_ATTR_USERDATA attribute.
 *
 * These attributes follow the &struct ovs_header within the Generic Netlink
 * payload for %OVS_PACKET_* commands.
 */
enum ovs_packet_attr {
	OVS_PACKET_ATTR_UNSPEC,
	OVS_PACKET_ATTR_PACKET,		/* Packet data. */
	OVS_PACKET_ATTR_KEY,		/* Nested OVS_KEY_ATTR_* attributes. */
	OVS_PACKET_ATTR_ACTIONS,	/* Nested OVS_ACTION_ATTR_* attributes. */
	OVS_PACKET_ATTR_USERDATA,	/* u64 OVS_ACTION_ATTR_USERSPACE arg. */
	__OVS_PACKET_ATTR_MAX
};

#define OVS_PACKET_ATTR_MAX (__OVS_PACKET_ATTR_MAX - 1)

/* Virtual ports. */

#define OVS_VPORT_FAMILY  "ovs_vport"
#define OVS_VPORT_MCGROUP "ovs_vport"
#define OVS_VPORT_VERSION 0x1

enum ovs_vport_cmd {
	OVS_VPORT_CMD_UNSPEC,
	OVS_VPORT_CMD_NEW,
	OVS_VPORT_CMD_DEL,
	OVS_VPORT_CMD_GET,
	OVS_VPORT_CMD_SET
};

enum ovs_vport_type {
	OVS_VPORT_TYPE_UNSPEC,
	OVS_VPORT_TYPE_NETDEV,		/* network device */
	OVS_VPORT_TYPE_INTERNAL,	/* network device implemented by datapath */
	__OVS_VPORT_TYPE_MAX
};

#define OVS_VPORT_TYPE_MAX (__OVS_VPORT_TYPE_MAX - 1)

/**
 * enum ovs_vport_attr - attributes for %OVS_VPORT_* commands.
 * @OVS_VPORT_ATTR_PORT_NO: 32-bit port number within datapath.
 * @OVS_VPORT_ATTR_TYPE: 32-bit %OVS_VPORT_TYPE_* constant describing the type
 * of vport.
 * @OVS_VPORT_ATTR_NAME: Name of vport. For a vport based on a network device
 * this is the name of the network device. Maximum length %IFNAMSIZ-1 bytes
 * plus a null terminator.
 * @OVS_VPORT_ATTR_OPTIONS: Vport-specific configuration information.
 * @OVS_VPORT_ATTR_UPCALL_PID: The Netlink socket in userspace that
 * OVS_PACKET_CMD_MISS upcalls will be directed to for packets received on
 * this port. A value of zero indicates that upcalls should not be sent.
 * @OVS_VPORT_ATTR_STATS: A &struct ovs_vport_stats giving statistics for
 * packets sent or received through the vport.
 *
 * These attributes follow the &struct ovs_header within the Generic Netlink
 * payload for %OVS_VPORT_* commands.
 *
 * For %OVS_VPORT_CMD_NEW requests, the %OVS_VPORT_ATTR_TYPE and
 * %OVS_VPORT_ATTR_NAME attributes are required. %OVS_VPORT_ATTR_PORT_NO is
 * optional; if not specified a free port number is automatically selected.
 * Whether %OVS_VPORT_ATTR_OPTIONS is required or optional depends on the type
 * of vport. %OVS_VPORT_ATTR_STATS is optional and other attributes are
 * ignored.
 *
 * For other requests, if %OVS_VPORT_ATTR_NAME is specified then it is used to
 * look up the vport to operate on; otherwise dp_ifindex from the &struct
 * ovs_header plus %OVS_VPORT_ATTR_PORT_NO determine the vport.
 */
enum ovs_vport_attr {
	OVS_VPORT_ATTR_UNSPEC,
	OVS_VPORT_ATTR_PORT_NO,		/* u32 port number within datapath */
	OVS_VPORT_ATTR_TYPE,		/* u32 OVS_VPORT_TYPE_* constant. */
	OVS_VPORT_ATTR_NAME,		/* string name, up to IFNAMSIZ bytes long */
	OVS_VPORT_ATTR_OPTIONS,		/* nested attributes, varies by vport type */
	OVS_VPORT_ATTR_UPCALL_PID,	/* u32 Netlink PID to receive upcalls */
	OVS_VPORT_ATTR_STATS,		/* struct ovs_vport_stats */
	__OVS_VPORT_ATTR_MAX
};

#define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1)

/* Flows. */

#define OVS_FLOW_FAMILY  "ovs_flow"
#define OVS_FLOW_MCGROUP "ovs_flow"
#define OVS_FLOW_VERSION 0x1

enum ovs_flow_cmd {
	OVS_FLOW_CMD_UNSPEC,
	OVS_FLOW_CMD_NEW,
	OVS_FLOW_CMD_DEL,
	OVS_FLOW_CMD_GET,
	OVS_FLOW_CMD_SET
};

struct ovs_flow_stats {
	__u64 n_packets;	/* Number of matched packets. */
	__u64 n_bytes;		/* Number of matched bytes. */
};

enum ovs_key_attr {
	OVS_KEY_ATTR_UNSPEC,
	OVS_KEY_ATTR_ENCAP,	/* Nested set of encapsulated attributes. */
	OVS_KEY_ATTR_PRIORITY,	/* u32 skb->priority */
	OVS_KEY_ATTR_IN_PORT,	/* u32 OVS dp port number */
	OVS_KEY_ATTR_ETHERNET,	/* struct ovs_key_ethernet */
	OVS_KEY_ATTR_VLAN,	/* be16 VLAN TCI */
	OVS_KEY_ATTR_ETHERTYPE,	/* be16 Ethernet type */
	OVS_KEY_ATTR_IPV4,	/* struct ovs_key_ipv4 */
	OVS_KEY_ATTR_IPV6,	/* struct ovs_key_ipv6 */
	OVS_KEY_ATTR_TCP,	/* struct ovs_key_tcp */
	OVS_KEY_ATTR_UDP,	/* struct ovs_key_udp */
	OVS_KEY_ATTR_ICMP,	/* struct ovs_key_icmp */
	OVS_KEY_ATTR_ICMPV6,	/* struct ovs_key_icmpv6 */
	OVS_KEY_ATTR_ARP,	/* struct ovs_key_arp */
	OVS_KEY_ATTR_ND,	/* struct ovs_key_nd */
	__OVS_KEY_ATTR_MAX
};

#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1)

/**
 * enum ovs_frag_type - IPv4 and IPv6 fragment type
 * @OVS_FRAG_TYPE_NONE: Packet is not a fragment.
 * @OVS_FRAG_TYPE_FIRST: Packet is a fragment with offset 0.
 * @OVS_FRAG_TYPE_LATER: Packet is a fragment with nonzero offset.
 *
 * Used as the @ipv4_frag in &struct ovs_key_ipv4 and as @ipv6_frag in
 * &struct ovs_key_ipv6.
 */
enum ovs_frag_type {
	OVS_FRAG_TYPE_NONE,
	OVS_FRAG_TYPE_FIRST,
	OVS_FRAG_TYPE_LATER,
	__OVS_FRAG_TYPE_MAX
};

#define OVS_FRAG_TYPE_MAX (__OVS_FRAG_TYPE_MAX - 1)

struct ovs_key_ethernet {
	__u8 eth_src[6];
	__u8 eth_dst[6];
};

struct ovs_key_ipv4 {
	__be32 ipv4_src;
	__be32 ipv4_dst;
	__u8 ipv4_proto;
	__u8 ipv4_tos;
	__u8 ipv4_ttl;
	__u8 ipv4_frag;	/* One of OVS_FRAG_TYPE_*. */
};

struct ovs_key_ipv6 {
	__be32 ipv6_src[4];
	__be32 ipv6_dst[4];
	__be32 ipv6_label;	/* 20-bits in least-significant bits. */
	__u8 ipv6_proto;
	__u8 ipv6_tclass;
	__u8 ipv6_hlimit;
	__u8 ipv6_frag;	/* One of OVS_FRAG_TYPE_*. */
};

struct ovs_key_tcp {
	__be16 tcp_src;
	__be16 tcp_dst;
};

struct ovs_key_udp {
	__be16 udp_src;
	__be16 udp_dst;
};

struct ovs_key_icmp {
	__u8 icmp_type;
	__u8 icmp_code;
};

struct ovs_key_icmpv6 {
	__u8 icmpv6_type;
	__u8 icmpv6_code;
};

struct ovs_key_arp {
	__be32 arp_sip;
	__be32 arp_tip;
	__be16 arp_op;
	__u8 arp_sha[6];
	__u8 arp_tha[6];
};

struct ovs_key_nd {
	__u32 nd_target[4];
	__u8 nd_sll[6];
	__u8 nd_tll[6];
};

/**
 * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
 * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
 * key. Always present in notifications. Required for all requests (except
 * dumps).
 * @OVS_FLOW_ATTR_ACTIONS: Nested %OVS_ACTION_ATTR_* attributes specifying
 * the actions to take for packets that match the key. Always present in
 * notifications. Required for %OVS_FLOW_CMD_NEW requests, optional for
 * %OVS_FLOW_CMD_SET requests.
 * @OVS_FLOW_ATTR_STATS: &struct ovs_flow_stats giving statistics for this
 * flow. Present in notifications if the stats would be nonzero. Ignored in
 * requests.
 * @OVS_FLOW_ATTR_TCP_FLAGS: An 8-bit value giving the OR'd value of all of the
 * TCP flags seen on packets in this flow. Only present in notifications for
 * TCP flows, and only if it would be nonzero. Ignored in requests.
 * @OVS_FLOW_ATTR_USED: A 64-bit integer giving the time, in milliseconds on
 * the system monotonic clock, at which a packet was last processed for this
 * flow. Only present in notifications if a packet has been processed for this
 * flow. Ignored in requests.
 * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the
 * last-used time, accumulated TCP flags, and statistics for this flow.
 * Otherwise ignored in requests. Never present in notifications.
 *
 * These attributes follow the &struct ovs_header within the Generic Netlink
 * payload for %OVS_FLOW_* commands.
 */
enum ovs_flow_attr {
	OVS_FLOW_ATTR_UNSPEC,
	OVS_FLOW_ATTR_KEY,		/* Sequence of OVS_KEY_ATTR_* attributes. */
	OVS_FLOW_ATTR_ACTIONS,		/* Nested OVS_ACTION_ATTR_* attributes. */
	OVS_FLOW_ATTR_STATS,		/* struct ovs_flow_stats. */
	OVS_FLOW_ATTR_TCP_FLAGS,	/* 8-bit OR'd TCP flags. */
	OVS_FLOW_ATTR_USED,		/* u64 msecs last used in monotonic time. */
	OVS_FLOW_ATTR_CLEAR,		/* Flag to clear stats, tcp_flags, used. */
	__OVS_FLOW_ATTR_MAX
};

#define OVS_FLOW_ATTR_MAX (__OVS_FLOW_ATTR_MAX - 1)

/**
 * enum ovs_sample_attr - Attributes for %OVS_ACTION_ATTR_SAMPLE action.
 * @OVS_SAMPLE_ATTR_PROBABILITY: 32-bit fraction of packets to sample with
 * @OVS_ACTION_ATTR_SAMPLE. A value of 0 samples no packets, a value of
 * %UINT32_MAX samples all packets and intermediate values sample intermediate
 * fractions of packets.
 * @OVS_SAMPLE_ATTR_ACTIONS: Set of actions to execute in sampling event.
 * Actions are passed as nested attributes.
 *
 * Executes the specified actions with the given probability on a per-packet
 * basis.
 */
enum ovs_sample_attr {
	OVS_SAMPLE_ATTR_UNSPEC,
	OVS_SAMPLE_ATTR_PROBABILITY,	/* u32 number */
	OVS_SAMPLE_ATTR_ACTIONS,	/* Nested OVS_ACTION_ATTR_* attributes. */
	__OVS_SAMPLE_ATTR_MAX,
};

#define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1)

/**
 * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
 * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION
 * message should be sent. Required.
 * @OVS_USERSPACE_ATTR_USERDATA: If present, its u64 argument is copied to the
 * %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA.
 */
enum ovs_userspace_attr {
	OVS_USERSPACE_ATTR_UNSPEC,
	OVS_USERSPACE_ATTR_PID,		/* u32 Netlink PID to receive upcalls. */
	OVS_USERSPACE_ATTR_USERDATA,	/* u64 optional user-specified cookie. */
	__OVS_USERSPACE_ATTR_MAX
};

#define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1)

/**
 * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument.
 * @vlan_tpid: Tag protocol identifier (TPID) to push.
 * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set
 * (but it will not be set in the 802.1Q header that is pushed).
 *
 * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID
 * values are those that the kernel module also parses as 802.1Q headers, to
 * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN
 * from having surprising results.
 */
struct ovs_action_push_vlan {
	__be16 vlan_tpid;	/* 802.1Q TPID. */
	__be16 vlan_tci;	/* 802.1Q TCI (VLAN ID and priority). */
};

/**
 * enum ovs_action_attr - Action types.
 *
 * @OVS_ACTION_ATTR_OUTPUT: Output packet to port.
 * @OVS_ACTION_ATTR_USERSPACE: Send packet to userspace according to nested
 * %OVS_USERSPACE_ATTR_* attributes.
 * @OVS_ACTION_ATTR_SET: Replaces the contents of an existing header. The
 * single nested %OVS_KEY_ATTR_* attribute specifies a header to modify and its
 * value.
 * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the
 * packet.
 * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet.
 * @OVS_ACTION_ATTR_SAMPLE: Probabilistically executes actions, as specified in
 * the nested %OVS_SAMPLE_ATTR_* attributes.
 *
 * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all
 * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
 * type may not be changed.
 */

enum ovs_action_attr {
	OVS_ACTION_ATTR_UNSPEC,
	OVS_ACTION_ATTR_OUTPUT,		/* u32 port number. */
	OVS_ACTION_ATTR_USERSPACE,	/* Nested OVS_USERSPACE_ATTR_*. */
	OVS_ACTION_ATTR_SET,		/* One nested OVS_KEY_ATTR_*. */
	OVS_ACTION_ATTR_PUSH_VLAN,	/* struct ovs_action_push_vlan. */
	OVS_ACTION_ATTR_POP_VLAN,	/* No argument. */
	OVS_ACTION_ATTR_SAMPLE,		/* Nested OVS_SAMPLE_ATTR_*. */
	__OVS_ACTION_ATTR_MAX
};

#define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1)

#endif /* _LINUX_OPENVSWITCH_H */
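
As an informal illustration of how these definitions are used from
userspace, the following sketch creates a datapath over Generic
Netlink. It assumes the libnl-3 API; the datapath name "ovs-dp0" is
arbitrary and error handling is abbreviated:

    #include <netlink/netlink.h>
    #include <netlink/genl/genl.h>
    #include <netlink/genl/ctrl.h>
    #include <linux/openvswitch.h>

    int create_datapath(void)
    {
            struct nl_sock *sock = nl_socket_alloc();
            struct nl_msg *msg = nlmsg_alloc();
            struct ovs_header *hdr;
            int family, err;

            genl_connect(sock);
            family = genl_ctrl_resolve(sock, OVS_DATAPATH_FAMILY);

            hdr = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family,
                              sizeof(*hdr), NLM_F_REQUEST | NLM_F_ACK,
                              OVS_DP_CMD_NEW, OVS_DATAPATH_VERSION);
            hdr->dp_ifindex = 0;    /* Not yet specific to a datapath. */

            nla_put_string(msg, OVS_DP_ATTR_NAME, "ovs-dp0");
            /* Direct miss upcalls on the local port to this socket. */
            nla_put_u32(msg, OVS_DP_ATTR_UPCALL_PID,
                        nl_socket_get_local_port(sock));

            err = nl_send_auto(sock, msg);
            if (err >= 0)
                    err = nl_wait_for_ack(sock);

            nlmsg_free(msg);
            nl_socket_free(sock);
            return err < 0 ? err : 0;
    }
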
diff --git a/net/Kconfig b/net/Kconfig
index 2d998735c4d..e07272d0bb2 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -215,6 +215,7 @@ source "net/sched/Kconfig"
 source "net/dcb/Kconfig"
 source "net/dns_resolver/Kconfig"
 source "net/batman-adv/Kconfig"
+source "net/openvswitch/Kconfig"
 
 config RPS
 	boolean
diff --git a/net/Makefile b/net/Makefile
index acdde4950de..ad432fa4d93 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -69,3 +69,4 @@ obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
 obj-$(CONFIG_CEPH_LIB)		+= ceph/
 obj-$(CONFIG_BATMAN_ADV)	+= batman-adv/
 obj-$(CONFIG_NFC)		+= nfc/
+obj-$(CONFIG_OPENVSWITCH)	+= openvswitch/
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
new file mode 100644
index 00000000000..d9ea33c361b
--- /dev/null
+++ b/net/openvswitch/Kconfig
@@ -0,0 +1,28 @@
#
# Open vSwitch
#

config OPENVSWITCH
	tristate "Open vSwitch"
	---help---
	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
	  environments. In addition to supporting a variety of features
	  expected in a traditional hardware switch, it enables fine-grained
	  programmatic extension and flow-based control of the network. This
	  control is useful in a wide variety of applications but is
	  particularly important in multi-server virtualization deployments,
	  which are often characterized by highly dynamic endpoints and the
	  need to maintain logical abstractions for multiple tenants.

	  The Open vSwitch datapath provides an in-kernel fast path for packet
	  forwarding. It is complemented by a userspace daemon, ovs-vswitchd,
	  which is able to accept configuration from a variety of sources and
	  translate it into packet processing rules.

	  See http://openvswitch.org for more information and userspace
	  utilities.

	  To compile this code as a module, choose M here: the module will be
	  called openvswitch.

	  If unsure, say N.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
new file mode 100644
index 00000000000..15e7384745c
--- /dev/null
+++ b/net/openvswitch/Makefile
@@ -0,0 +1,14 @@
#
# Makefile for Open vSwitch.
#

obj-$(CONFIG_OPENVSWITCH) += openvswitch.o

openvswitch-y := \
	actions.o \
	datapath.o \
	dp_notify.o \
	flow.o \
	vport.o \
	vport-internal_dev.o \
	vport-netdev.o
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
new file mode 100644
index 00000000000..2725d1bdf29
--- /dev/null
+++ b/net/openvswitch/actions.c
@@ -0,0 +1,415 @@
/*
 * Copyright (c) 2007-2011 Nicira Networks.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/in6.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/checksum.h>
#include <net/dsfield.h>

#include "datapath.h"
#include "vport.h"

static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
			      const struct nlattr *attr, int len, bool keep_skb);

static int make_writable(struct sk_buff *skb, int write_len)
{
	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
		return 0;

	return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}

/* remove VLAN header from packet and update csum accordingly. */
static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
{
	struct vlan_hdr *vhdr;
	int err;

	err = make_writable(skb, VLAN_ETH_HLEN);
	if (unlikely(err))
		return err;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		skb->csum = csum_sub(skb->csum, csum_partial(skb->data
					+ ETH_HLEN, VLAN_HLEN, 0));

	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
	*current_tci = vhdr->h_vlan_TCI;

	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
	__skb_pull(skb, VLAN_HLEN);

	vlan_set_encap_proto(skb, vhdr);
	skb->mac_header += VLAN_HLEN;
	skb_reset_mac_len(skb);

	return 0;
}

static int pop_vlan(struct sk_buff *skb)
{
	__be16 tci;
	int err;

	if (likely(vlan_tx_tag_present(skb))) {
		skb->vlan_tci = 0;
	} else {
		if (unlikely(skb->protocol != htons(ETH_P_8021Q) ||
			     skb->len < VLAN_ETH_HLEN))
			return 0;

		err = __pop_vlan_tci(skb, &tci);
		if (err)
			return err;
	}
	/* move next vlan tag to hw accel tag */
	if (likely(skb->protocol != htons(ETH_P_8021Q) ||
		   skb->len < VLAN_ETH_HLEN))
		return 0;

	err = __pop_vlan_tci(skb, &tci);
	if (unlikely(err))
		return err;

	__vlan_hwaccel_put_tag(skb, ntohs(tci));
	return 0;
}

static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan)
{
	if (unlikely(vlan_tx_tag_present(skb))) {
		u16 current_tag;

		/* push down current VLAN tag */
		current_tag = vlan_tx_tag_get(skb);

		if (!__vlan_put_tag(skb, current_tag))
			return -ENOMEM;

		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->csum = csum_add(skb->csum, csum_partial(skb->data
					+ ETH_HLEN, VLAN_HLEN, 0));

	}
	__vlan_hwaccel_put_tag(skb, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
	return 0;
}

static int set_eth_addr(struct sk_buff *skb,
			const struct ovs_key_ethernet *eth_key)
{
	int err;
	err = make_writable(skb, ETH_HLEN);
	if (unlikely(err))
		return err;

	memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN);
	memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN);

	return 0;
}

static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
			__be32 *addr, __be32 new_addr)
{
	int transport_len = skb->len - skb_transport_offset(skb);

	if (nh->protocol == IPPROTO_TCP) {
		if (likely(transport_len >= sizeof(struct tcphdr)))
			inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
						 *addr, new_addr, 1);
	} else if (nh->protocol == IPPROTO_UDP) {
		if (likely(transport_len >= sizeof(struct udphdr)))
			inet_proto_csum_replace4(&udp_hdr(skb)->check, skb,
						 *addr, new_addr, 1);
	}

	csum_replace4(&nh->check, *addr, new_addr);
	skb->rxhash = 0;
	*addr = new_addr;
}

static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl)
{
	csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
	nh->ttl = new_ttl;
}

static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key)
{
	struct iphdr *nh;
	int err;

	err = make_writable(skb, skb_network_offset(skb) +
				 sizeof(struct iphdr));
	if (unlikely(err))
		return err;

	nh = ip_hdr(skb);

	if (ipv4_key->ipv4_src != nh->saddr)
		set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src);

	if (ipv4_key->ipv4_dst != nh->daddr)
		set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst);

	if (ipv4_key->ipv4_tos != nh->tos)
		ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos);

	if (ipv4_key->ipv4_ttl != nh->ttl)
		set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl);

	return 0;
}

/* Must follow make_writable() since that can move the skb data. */
static void set_tp_port(struct sk_buff *skb, __be16 *port,
			__be16 new_port, __sum16 *check)
{
	inet_proto_csum_replace2(check, skb, *port, new_port, 0);
	*port = new_port;
	skb->rxhash = 0;
}

static int set_udp_port(struct sk_buff *skb,
			const struct ovs_key_udp *udp_port_key)
{
	struct udphdr *uh;
	int err;

	err = make_writable(skb, skb_transport_offset(skb) +
				 sizeof(struct udphdr));
	if (unlikely(err))
		return err;

	uh = udp_hdr(skb);
	if (udp_port_key->udp_src != uh->source)
		set_tp_port(skb, &uh->source, udp_port_key->udp_src, &uh->check);

	if (udp_port_key->udp_dst != uh->dest)
		set_tp_port(skb, &uh->dest, udp_port_key->udp_dst, &uh->check);

	return 0;
}

static int set_tcp_port(struct sk_buff *skb,
			const struct ovs_key_tcp *tcp_port_key)
{
	struct tcphdr *th;
	int err;

	err = make_writable(skb, skb_transport_offset(skb) +
				 sizeof(struct tcphdr));
	if (unlikely(err))
		return err;

	th = tcp_hdr(skb);
	if (tcp_port_key->tcp_src != th->source)
		set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check);

	if (tcp_port_key->tcp_dst != th->dest)
		set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check);

	return 0;
}

static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
{
	struct vport *vport;

	if (unlikely(!skb))
		return -ENOMEM;

	vport = rcu_dereference(dp->ports[out_port]);
	if (unlikely(!vport)) {
		kfree_skb(skb);
		return -ENODEV;
	}

	ovs_vport_send(vport, skb);
	return 0;
}

static int output_userspace(struct datapath *dp, struct sk_buff *skb,
			    const struct nlattr *attr)
{
	struct dp_upcall_info upcall;
	const struct nlattr *a;
	int rem;

	upcall.cmd = OVS_PACKET_CMD_ACTION;
	upcall.key = &OVS_CB(skb)->flow->key;
	upcall.userdata = NULL;
	upcall.pid = 0;

	for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
	     a = nla_next(a, &rem)) {
		switch (nla_type(a)) {
		case OVS_USERSPACE_ATTR_USERDATA:
			upcall.userdata = a;
			break;

		case OVS_USERSPACE_ATTR_PID:
			upcall.pid = nla_get_u32(a);
			break;
		}
	}

	return ovs_dp_upcall(dp, skb, &upcall);
}

static int sample(struct datapath *dp, struct sk_buff *skb,
		  const struct nlattr *attr)
{
	const struct nlattr *acts_list = NULL;
	const struct nlattr *a;
	int rem;

	for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
	     a = nla_next(a, &rem)) {
		switch (nla_type(a)) {
		case OVS_SAMPLE_ATTR_PROBABILITY:
			if (net_random() >= nla_get_u32(a))
				return 0;
			break;

		case OVS_SAMPLE_ATTR_ACTIONS:
			acts_list = a;
			break;
		}
	}

	return do_execute_actions(dp, skb, nla_data(acts_list),
				  nla_len(acts_list), true);
}

static int execute_set_action(struct sk_buff *skb,
			      const struct nlattr *nested_attr)
{
	int err = 0;

	switch (nla_type(nested_attr)) {
	case OVS_KEY_ATTR_PRIORITY:
		skb->priority = nla_get_u32(nested_attr);
		break;

	case OVS_KEY_ATTR_ETHERNET:
		err = set_eth_addr(skb, nla_data(nested_attr));
		break;

	case OVS_KEY_ATTR_IPV4:
		err = set_ipv4(skb, nla_data(nested_attr));
		break;

	case OVS_KEY_ATTR_TCP:
		err = set_tcp_port(skb, nla_data(nested_attr));
		break;

	case OVS_KEY_ATTR_UDP:
		err = set_udp_port(skb, nla_data(nested_attr));
		break;
	}

	return err;
}

/* Execute a list of actions against 'skb'. */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
			      const struct nlattr *attr, int len, bool keep_skb)
{
	/* Every output action needs a separate clone of 'skb', but the common
	 * case is just a single output action, so doing a clone and then
	 * freeing the original skbuff would be wasteful. So the following
	 * code is slightly obscure just to avoid that. */
	int prev_port = -1;
	const struct nlattr *a;
	int rem;

	for (a = attr, rem = len; rem > 0;
	     a = nla_next(a, &rem)) {
		int err = 0;

		if (prev_port != -1) {
			do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
			prev_port = -1;
		}

		switch (nla_type(a)) {
		case OVS_ACTION_ATTR_OUTPUT:
			prev_port = nla_get_u32(a);
			break;

		case OVS_ACTION_ATTR_USERSPACE:
			output_userspace(dp, skb, a);
			break;

		case OVS_ACTION_ATTR_PUSH_VLAN:
			err = push_vlan(skb, nla_data(a));
			if (unlikely(err)) /* skb already freed. */
				return err;
			break;

		case OVS_ACTION_ATTR_POP_VLAN:
			err = pop_vlan(skb);
			break;

		case OVS_ACTION_ATTR_SET:
			err = execute_set_action(skb, nla_data(a));
			break;

		case OVS_ACTION_ATTR_SAMPLE:
			err = sample(dp, skb, a);
			break;
		}

		if (unlikely(err)) {
			kfree_skb(skb);
			return err;
		}
	}

	if (prev_port != -1) {
		if (keep_skb)
			skb = skb_clone(skb, GFP_ATOMIC);

		do_output(dp, skb, prev_port);
	} else if (!keep_skb)
		consume_skb(skb);

	return 0;
}

/* Execute a list of actions against 'skb'. */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb)
{
	struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts);

	return do_execute_actions(dp, skb, acts->actions,
				  acts->actions_len, false);
}
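
For reference, the nested actions list that do_execute_actions() walks
is composed by userspace with ordinary Netlink nesting. A sketch using
the libnl-3 helpers, appending "pop the VLAN tag, then output to port
2" to a flow request ("msg" and the port number are assumptions):

    struct nlattr *acts = nla_nest_start(msg, OVS_FLOW_ATTR_ACTIONS);

    nla_put_flag(msg, OVS_ACTION_ATTR_POP_VLAN);  /* no argument */
    nla_put_u32(msg, OVS_ACTION_ATTR_OUTPUT, 2);  /* u32 port number */
    nla_nest_end(msg, acts);
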
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
new file mode 100644
index 00000000000..9a2725114e9
--- /dev/null
+++ b/net/openvswitch/datapath.c
@@ -0,0 +1,1912 @@
/*
 * Copyright (c) 2007-2011 Nicira Networks.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/time.h>
#include <linux/etherdevice.h>
#include <linux/genetlink.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/version.h>
#include <linux/ethtool.h>
#include <linux/wait.h>
#include <asm/system.h>
#include <asm/div64.h>
#include <linux/highmem.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/inetdevice.h>
#include <linux/list.h>
#include <linux/openvswitch.h>
#include <linux/rculist.h>
#include <linux/dmi.h>
#include <linux/workqueue.h>
#include <net/genetlink.h>

#include "datapath.h"
#include "flow.h"
#include "vport-internal_dev.h"

/**
 * DOC: Locking:
 *
 * Writes to device state (add/remove datapath, port, set operations on vports,
 * etc.) are protected by RTNL.
 *
 * Writes to other state (flow table modifications, set miscellaneous datapath
 * parameters, etc.) are protected by genl_mutex. The RTNL lock nests inside
 * genl_mutex.
 *
 * Reads are protected by RCU.
 *
 * There are a few special cases (mostly stats) that have their own
 * synchronization but they nest under all of above and don't interact with
 * each other.
 */

/* Global list of datapaths to enable dumping them all out.
 * Protected by genl_mutex.
 */
static LIST_HEAD(dps);

#define REHASH_FLOW_INTERVAL (10 * 60 * HZ)
static void rehash_flow_table(struct work_struct *work);
static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table);

static struct vport *new_vport(const struct vport_parms *);
static int queue_gso_packets(int dp_ifindex, struct sk_buff *,
			     const struct dp_upcall_info *);
static int queue_userspace_packet(int dp_ifindex, struct sk_buff *,
				  const struct dp_upcall_info *);

/* Must be called with rcu_read_lock, genl_mutex, or RTNL lock. */
static struct datapath *get_dp(int dp_ifindex)
{
	struct datapath *dp = NULL;
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(&init_net, dp_ifindex);
	if (dev) {
		struct vport *vport = ovs_internal_dev_get_vport(dev);
		if (vport)
			dp = vport->dp;
	}
	rcu_read_unlock();

	return dp;
}

/* Must be called with rcu_read_lock or RTNL lock. */
const char *ovs_dp_name(const struct datapath *dp)
{
	struct vport *vport = rcu_dereference_rtnl(dp->ports[OVSP_LOCAL]);
	return vport->ops->get_name(vport);
}

static int get_dpifindex(struct datapath *dp)
{
	struct vport *local;
	int ifindex;

	rcu_read_lock();

	local = rcu_dereference(dp->ports[OVSP_LOCAL]);
	if (local)
		ifindex = local->ops->get_ifindex(local);
	else
		ifindex = 0;

	rcu_read_unlock();

	return ifindex;
}

static void destroy_dp_rcu(struct rcu_head *rcu)
{
	struct datapath *dp = container_of(rcu, struct datapath, rcu);

	ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
	free_percpu(dp->stats_percpu);
	kfree(dp);
}

/* Called with RTNL lock and genl_lock. */
static struct vport *new_vport(const struct vport_parms *parms)
{
	struct vport *vport;

	vport = ovs_vport_add(parms);
	if (!IS_ERR(vport)) {
		struct datapath *dp = parms->dp;

		rcu_assign_pointer(dp->ports[parms->port_no], vport);
		list_add(&vport->node, &dp->port_list);
	}

	return vport;
}

/* Called with RTNL lock. */
void ovs_dp_detach_port(struct vport *p)
{
	ASSERT_RTNL();

	/* First drop references to device. */
	list_del(&p->node);
	rcu_assign_pointer(p->dp->ports[p->port_no], NULL);

	/* Then destroy it. */
	ovs_vport_del(p);
}

/* Must be called with rcu_read_lock. */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{
	struct datapath *dp = p->dp;
	struct sw_flow *flow;
	struct dp_stats_percpu *stats;
	struct sw_flow_key key;
	u64 *stats_counter;
	int error;
	int key_len;

	stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

	/* Extract flow from 'skb' into 'key'. */
	error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
	if (unlikely(error)) {
		kfree_skb(skb);
		return;
	}

	/* Look up flow. */
	flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
	if (unlikely(!flow)) {
		struct dp_upcall_info upcall;

		upcall.cmd = OVS_PACKET_CMD_MISS;
		upcall.key = &key;
		upcall.userdata = NULL;
		upcall.pid = p->upcall_pid;
		ovs_dp_upcall(dp, skb, &upcall);
		consume_skb(skb);
		stats_counter = &stats->n_missed;
		goto out;
	}

	OVS_CB(skb)->flow = flow;

	stats_counter = &stats->n_hit;
	ovs_flow_used(OVS_CB(skb)->flow, skb);
	ovs_execute_actions(dp, skb);

out:
	/* Update datapath statistics. */
	u64_stats_update_begin(&stats->sync);
	(*stats_counter)++;
	u64_stats_update_end(&stats->sync);
}

static struct genl_family dp_packet_genl_family = {
	.id = GENL_ID_GENERATE,
	.hdrsize = sizeof(struct ovs_header),
	.name = OVS_PACKET_FAMILY,
	.version = OVS_PACKET_VERSION,
	.maxattr = OVS_PACKET_ATTR_MAX
};

int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb,
		  const struct dp_upcall_info *upcall_info)
{
	struct dp_stats_percpu *stats;
	int dp_ifindex;
	int err;

	if (upcall_info->pid == 0) {
		err = -ENOTCONN;
		goto err;
	}

	dp_ifindex = get_dpifindex(dp);
	if (!dp_ifindex) {
		err = -ENODEV;
		goto err;
	}

	if (!skb_is_gso(skb))
		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
	else
		err = queue_gso_packets(dp_ifindex, skb, upcall_info);
	if (err)
		goto err;

	return 0;

err:
	stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

	u64_stats_update_begin(&stats->sync);
	stats->n_lost++;
	u64_stats_update_end(&stats->sync);

	return err;
}

static int queue_gso_packets(int dp_ifindex, struct sk_buff *skb,
			     const struct dp_upcall_info *upcall_info)
{
	struct dp_upcall_info later_info;
	struct sw_flow_key later_key;
	struct sk_buff *segs, *nskb;
	int err;

	segs = skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM);
	if (IS_ERR(segs))
		return PTR_ERR(segs);

	/* Queue all of the segments. */
	skb = segs;
	do {
		err = queue_userspace_packet(dp_ifindex, skb, upcall_info);
		if (err)
			break;

		if (skb == segs && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
			/* The initial flow key extracted by ovs_flow_extract()
			 * in this case is for a first fragment, so we need to
			 * properly mark later fragments.
			 */
			later_key = *upcall_info->key;
			later_key.ip.frag = OVS_FRAG_TYPE_LATER;

			later_info = *upcall_info;
			later_info.key = &later_key;
			upcall_info = &later_info;
		}
	} while ((skb = skb->next));

	/* Free all of the segments. */
	skb = segs;
	do {
		nskb = skb->next;
		if (err)
			kfree_skb(skb);
		else
			consume_skb(skb);
	} while ((skb = nskb));
	return err;
}

static int queue_userspace_packet(int dp_ifindex, struct sk_buff *skb,
				  const struct dp_upcall_info *upcall_info)
{
	struct ovs_header *upcall;
	struct sk_buff *nskb = NULL;
	struct sk_buff *user_skb; /* to be queued to userspace */
	struct nlattr *nla;
	unsigned int len;
	int err;

	if (vlan_tx_tag_present(skb)) {
		nskb = skb_clone(skb, GFP_ATOMIC);
		if (!nskb)
			return -ENOMEM;

		nskb = __vlan_put_tag(nskb, vlan_tx_tag_get(nskb));
		if (!nskb)
			return -ENOMEM;

		nskb->vlan_tci = 0;
		skb = nskb;
	}

	if (nla_attr_size(skb->len) > USHRT_MAX) {
		err = -EFBIG;
		goto out;
	}

	len = sizeof(struct ovs_header);
	len += nla_total_size(skb->len);
	len += nla_total_size(FLOW_BUFSIZE);
	if (upcall_info->cmd == OVS_PACKET_CMD_ACTION)
		len += nla_total_size(8);

	user_skb = genlmsg_new(len, GFP_ATOMIC);
	if (!user_skb) {
		err = -ENOMEM;
		goto out;
	}

	upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family,
			     0, upcall_info->cmd);
	upcall->dp_ifindex = dp_ifindex;

	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
	ovs_flow_to_nlattrs(upcall_info->key, user_skb);
	nla_nest_end(user_skb, nla);

	if (upcall_info->userdata)
		nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA,
			    nla_get_u64(upcall_info->userdata));

	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);

	skb_copy_and_csum_dev(skb, nla_data(nla));

	err = genlmsg_unicast(&init_net, user_skb, upcall_info->pid);

out:
	kfree_skb(nskb);
	return err;
}

/* Called with genl_mutex. */
static int flush_flows(int dp_ifindex)
{
	struct flow_table *old_table;
	struct flow_table *new_table;
	struct datapath *dp;

	dp = get_dp(dp_ifindex);
	if (!dp)
		return -ENODEV;

	old_table = genl_dereference(dp->table);
	new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS);
	if (!new_table)
		return -ENOMEM;

	rcu_assign_pointer(dp->table, new_table);

	ovs_flow_tbl_deferred_destroy(old_table);
	return 0;
}

static int validate_actions(const struct nlattr *attr,
			    const struct sw_flow_key *key, int depth);

static int validate_sample(const struct nlattr *attr,
			   const struct sw_flow_key *key, int depth)
{
	const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
	const struct nlattr *probability, *actions;
	const struct nlattr *a;
	int rem;

	memset(attrs, 0, sizeof(attrs));
	nla_for_each_nested(a, attr, rem) {
		int type = nla_type(a);
		if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type])
			return -EINVAL;
		attrs[type] = a;
	}
	if (rem)
		return -EINVAL;

	probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY];
	if (!probability || nla_len(probability) != sizeof(u32))
		return -EINVAL;

	actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
	if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
		return -EINVAL;
	return validate_actions(actions, key, depth + 1);
}

static int validate_set(const struct nlattr *a,
			const struct sw_flow_key *flow_key)
{
	const struct nlattr *ovs_key = nla_data(a);
	int key_type = nla_type(ovs_key);

	/* There can be only one key in an action */
	if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
		return -EINVAL;

	if (key_type > OVS_KEY_ATTR_MAX ||
	    nla_len(ovs_key) != ovs_key_lens[key_type])
		return -EINVAL;

	switch (key_type) {
	const struct ovs_key_ipv4 *ipv4_key;

	case OVS_KEY_ATTR_PRIORITY:
	case OVS_KEY_ATTR_ETHERNET:
		break;

	case OVS_KEY_ATTR_IPV4:
		if (flow_key->eth.type != htons(ETH_P_IP))
			return -EINVAL;

		if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst)
			return -EINVAL;

		ipv4_key = nla_data(ovs_key);
		if (ipv4_key->ipv4_proto != flow_key->ip.proto)
			return -EINVAL;

		if (ipv4_key->ipv4_frag != flow_key->ip.frag)
			return -EINVAL;

		break;

	case OVS_KEY_ATTR_TCP:
		if (flow_key->ip.proto != IPPROTO_TCP)
			return -EINVAL;

		if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
			return -EINVAL;

		break;

	case OVS_KEY_ATTR_UDP:
		if (flow_key->ip.proto != IPPROTO_UDP)
			return -EINVAL;

		if (!flow_key->ipv4.tp.src || !flow_key->ipv4.tp.dst)
			return -EINVAL;
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

static int validate_userspace(const struct nlattr *attr)
{
	static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = {
		[OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 },
		[OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 },
	};
	struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1];
	int error;

	error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX,
				 attr, userspace_policy);
	if (error)
		return error;

	if (!a[OVS_USERSPACE_ATTR_PID] ||
	    !nla_get_u32(a[OVS_USERSPACE_ATTR_PID]))
		return -EINVAL;

	return 0;
}

static int validate_actions(const struct nlattr *attr,
			    const struct sw_flow_key *key, int depth)
{
	const struct nlattr *a;
	int rem, err;

	if (depth >= SAMPLE_ACTION_DEPTH)
		return -EOVERFLOW;

	nla_for_each_nested(a, attr, rem) {
		/* Expected argument lengths, (u32)-1 for variable length. */
		static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
			[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
			[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
			[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
			[OVS_ACTION_ATTR_POP_VLAN] = 0,
			[OVS_ACTION_ATTR_SET] = (u32)-1,
			[OVS_ACTION_ATTR_SAMPLE] = (u32)-1
		};
		const struct ovs_action_push_vlan *vlan;
		int type = nla_type(a);

		if (type > OVS_ACTION_ATTR_MAX ||
		    (action_lens[type] != nla_len(a) &&
		     action_lens[type] != (u32)-1))
			return -EINVAL;

		switch (type) {
		case OVS_ACTION_ATTR_UNSPEC:
			return -EINVAL;

		case OVS_ACTION_ATTR_USERSPACE:
			err = validate_userspace(a);
			if (err)
				return err;
			break;

		case OVS_ACTION_ATTR_OUTPUT:
			if (nla_get_u32(a) >= DP_MAX_PORTS)
				return -EINVAL;
			break;

		case OVS_ACTION_ATTR_POP_VLAN:
			break;

		case OVS_ACTION_ATTR_PUSH_VLAN:
			vlan = nla_data(a);
			if (vlan->vlan_tpid != htons(ETH_P_8021Q))
				return -EINVAL;
			if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
				return -EINVAL;
			break;

		case OVS_ACTION_ATTR_SET:
			err = validate_set(a, key);
			if (err)
				return err;
			break;

		case OVS_ACTION_ATTR_SAMPLE:
			err = validate_sample(a, key, depth);
			if (err)
				return err;
			break;

		default:
			return -EINVAL;
		}
	}

	if (rem > 0)
		return -EINVAL;

	return 0;
}

static void clear_stats(struct sw_flow *flow)
{
	flow->used = 0;
	flow->tcp_flags = 0;
	flow->packet_count = 0;
	flow->byte_count = 0;
}

static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
{
	struct ovs_header *ovs_header = info->userhdr;
	struct nlattr **a = info->attrs;
	struct sw_flow_actions *acts;
	struct sk_buff *packet;
	struct sw_flow *flow;
	struct datapath *dp;
	struct ethhdr *eth;
	int len;
	int err;
	int key_len;

	err = -EINVAL;
	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
	    !a[OVS_PACKET_ATTR_ACTIONS] ||
	    nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN)
		goto err;

	len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
	packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
	err = -ENOMEM;
	if (!packet)
		goto err;
	skb_reserve(packet, NET_IP_ALIGN);

	memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len);

	skb_reset_mac_header(packet);
	eth = eth_hdr(packet);

	/* Normally, setting the skb 'protocol' field would be handled by a
	 * call to eth_type_trans(), but it assumes there's a sending
	 * device, which we may not have. */
	if (ntohs(eth->h_proto) >= 1536)
		packet->protocol = eth->h_proto;
	else
		packet->protocol = htons(ETH_P_802_2);

	/* Build an sw_flow for sending this packet. */
	flow = ovs_flow_alloc();
	err = PTR_ERR(flow);
	if (IS_ERR(flow))
		goto err_kfree_skb;
637
638 err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
639 if (err)
640 goto err_flow_free;
641
642 err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority,
643 &flow->key.phy.in_port,
644 a[OVS_PACKET_ATTR_KEY]);
645 if (err)
646 goto err_flow_free;
647
648 err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
649 if (err)
650 goto err_flow_free;
651
652 flow->hash = ovs_flow_hash(&flow->key, key_len);
653
654 acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
655 err = PTR_ERR(acts);
656 if (IS_ERR(acts))
657 goto err_flow_free;
658 rcu_assign_pointer(flow->sf_acts, acts);
659
660 OVS_CB(packet)->flow = flow;
661 packet->priority = flow->key.phy.priority;
662
663 rcu_read_lock();
664 dp = get_dp(ovs_header->dp_ifindex);
665 err = -ENODEV;
666 if (!dp)
667 goto err_unlock;
668
669 local_bh_disable();
670 err = ovs_execute_actions(dp, packet);
671 local_bh_enable();
672 rcu_read_unlock();
673
674 ovs_flow_free(flow);
675 return err;
676
677err_unlock:
678 rcu_read_unlock();
679err_flow_free:
680 ovs_flow_free(flow);
681err_kfree_skb:
682 kfree_skb(packet);
683err:
684 return err;
685}
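
The 1536 (0x600) threshold is the IEEE rule that eth_type_trans() would otherwise apply: a two-byte value at or above 0x600 is an EtherType, anything smaller is an 802.3 frame length and the payload is 802.2 LLC. A standalone illustration (classify() is just an example helper):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    static const char *classify(uint16_t h_proto)
    {
            return ntohs(h_proto) >= 1536 ? "EtherType" : "802.3 length";
    }

    int main(void)
    {
            printf("0x0800 -> %s\n", classify(htons(0x0800))); /* IPv4 */
            printf("0x0040 -> %s\n", classify(htons(0x0040))); /* LLC  */
            return 0;
    }
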
686
687static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
688 [OVS_PACKET_ATTR_PACKET] = { .type = NLA_UNSPEC },
689 [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED },
690 [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
691};
692
693static struct genl_ops dp_packet_genl_ops[] = {
694 { .cmd = OVS_PACKET_CMD_EXECUTE,
695 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
696 .policy = packet_policy,
697 .doit = ovs_packet_cmd_execute
698 }
699};
700
701static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats)
702{
703 int i;
704 struct flow_table *table = genl_dereference(dp->table);
705
706 stats->n_flows = ovs_flow_tbl_count(table);
707
708 stats->n_hit = stats->n_missed = stats->n_lost = 0;
709 for_each_possible_cpu(i) {
710 const struct dp_stats_percpu *percpu_stats;
711 struct dp_stats_percpu local_stats;
712 unsigned int start;
713
714 percpu_stats = per_cpu_ptr(dp->stats_percpu, i);
715
716 do {
717 start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
718 local_stats = *percpu_stats;
719 } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
720
721 stats->n_hit += local_stats.n_hit;
722 stats->n_missed += local_stats.n_missed;
723 stats->n_lost += local_stats.n_lost;
724 }
725}
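
The u64_stats_fetch_begin_bh()/u64_stats_fetch_retry_bh() pair gives a lock-free consistent snapshot: the reader retries while the writer's sequence count is odd (a write is in flight) or has changed since the read began. A simplified single-writer userspace model using C11 atomics, illustrative only, since the real helpers compile to nothing on 64-bit kernels where the counters can be read atomically anyway:

    #include <stdatomic.h>
    #include <stdint.h>

    struct pcpu_stats {
            atomic_uint seq;        /* even: idle, odd: write in flight */
            uint64_t n_hit, n_missed, n_lost;
    };

    static void stats_snapshot(struct pcpu_stats *s, struct pcpu_stats *out)
    {
            unsigned int start;

            do {
                    start = atomic_load_explicit(&s->seq,
                                                 memory_order_acquire);
                    out->n_hit = s->n_hit;
                    out->n_missed = s->n_missed;
                    out->n_lost = s->n_lost;
                    atomic_thread_fence(memory_order_acquire);
            } while ((start & 1) ||
                     start != atomic_load_explicit(&s->seq,
                                                   memory_order_relaxed));
    }
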
726
727static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
728 [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
729 [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
730 [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
731};
732
733static struct genl_family dp_flow_genl_family = {
734 .id = GENL_ID_GENERATE,
735 .hdrsize = sizeof(struct ovs_header),
736 .name = OVS_FLOW_FAMILY,
737 .version = OVS_FLOW_VERSION,
738 .maxattr = OVS_FLOW_ATTR_MAX
739};
740
741static struct genl_multicast_group ovs_dp_flow_multicast_group = {
742 .name = OVS_FLOW_MCGROUP
743};
744
745/* Called with genl_lock. */
746static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
747 struct sk_buff *skb, u32 pid,
748 u32 seq, u32 flags, u8 cmd)
749{
750 const int skb_orig_len = skb->len;
751 const struct sw_flow_actions *sf_acts;
752 struct ovs_flow_stats stats;
753 struct ovs_header *ovs_header;
754 struct nlattr *nla;
755 unsigned long used;
756 u8 tcp_flags;
757 int err;
758
759 sf_acts = rcu_dereference_protected(flow->sf_acts,
760 lockdep_genl_is_held());
761
762 ovs_header = genlmsg_put(skb, pid, seq, &dp_flow_genl_family, flags, cmd);
763 if (!ovs_header)
764 return -EMSGSIZE;
765
766 ovs_header->dp_ifindex = get_dpifindex(dp);
767
768 nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
769 if (!nla)
770 goto nla_put_failure;
771 err = ovs_flow_to_nlattrs(&flow->key, skb);
772 if (err)
773 goto error;
774 nla_nest_end(skb, nla);
775
776 spin_lock_bh(&flow->lock);
777 used = flow->used;
778 stats.n_packets = flow->packet_count;
779 stats.n_bytes = flow->byte_count;
780 tcp_flags = flow->tcp_flags;
781 spin_unlock_bh(&flow->lock);
782
783 if (used)
784 NLA_PUT_U64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used));
785
786 if (stats.n_packets)
787 NLA_PUT(skb, OVS_FLOW_ATTR_STATS,
788 sizeof(struct ovs_flow_stats), &stats);
789
790 if (tcp_flags)
791 NLA_PUT_U8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags);
792
793 /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if
794 * this is the first flow to be dumped into 'skb'. This is unusual for
795 * Netlink but individual action lists can be longer than
796 * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this.
797 * The userspace caller can always fetch the actions separately if it
798 * really wants them. (Most userspace callers in fact don't care.)
799 *
800 * This can only fail for dump operations because the skb is always
801 * properly sized for single flows.
802 */
803 err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
804 sf_acts->actions);
805 if (err < 0 && skb_orig_len)
806 goto error;
807
808 return genlmsg_end(skb, ovs_header);
809
810nla_put_failure:
811 err = -EMSGSIZE;
812error:
813 genlmsg_cancel(skb, ovs_header);
814 return err;
815}
816
817static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow)
818{
819 const struct sw_flow_actions *sf_acts;
820 int len;
821
822 sf_acts = rcu_dereference_protected(flow->sf_acts,
823 lockdep_genl_is_held());
824
825 /* OVS_FLOW_ATTR_KEY */
826 len = nla_total_size(FLOW_BUFSIZE);
827 /* OVS_FLOW_ATTR_ACTIONS */
828 len += nla_total_size(sf_acts->actions_len);
829 /* OVS_FLOW_ATTR_STATS */
830 len += nla_total_size(sizeof(struct ovs_flow_stats));
831 /* OVS_FLOW_ATTR_TCP_FLAGS */
832 len += nla_total_size(1);
833 /* OVS_FLOW_ATTR_USED */
834 len += nla_total_size(8);
835
836 len += NLMSG_ALIGN(sizeof(struct ovs_header));
837
838 return genlmsg_new(len, GFP_KERNEL);
839}
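
Each nla_total_size(n) above expands to NLA_ALIGN(NLA_HDRLEN + n): a 4-byte attribute header plus the payload, rounded up to a 4-byte boundary. Summing these worst cases is what lets ovs_flow_cmd_build_info() assert (BUG_ON) that filling a single-flow reply can never overflow. Quick check of the arithmetic:

    #include <linux/netlink.h>
    #include <stdio.h>

    int main(void)
    {
            /* mirrors nla_total_size() for the fixed-size attributes above */
            printf("1-byte OVS_FLOW_ATTR_TCP_FLAGS occupies %d bytes\n",
                   NLA_ALIGN(NLA_HDRLEN + 1));        /* prints 8  */
            printf("8-byte OVS_FLOW_ATTR_USED occupies %d bytes\n",
                   NLA_ALIGN(NLA_HDRLEN + 8));        /* prints 12 */
            return 0;
    }
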
840
841static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow,
842 struct datapath *dp,
843 u32 pid, u32 seq, u8 cmd)
844{
845 struct sk_buff *skb;
846 int retval;
847
848 skb = ovs_flow_cmd_alloc_info(flow);
849 if (!skb)
850 return ERR_PTR(-ENOMEM);
851
852 retval = ovs_flow_cmd_fill_info(flow, dp, skb, pid, seq, 0, cmd);
853 BUG_ON(retval < 0);
854 return skb;
855}
856
857static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
858{
859 struct nlattr **a = info->attrs;
860 struct ovs_header *ovs_header = info->userhdr;
861 struct sw_flow_key key;
862 struct sw_flow *flow;
863 struct sk_buff *reply;
864 struct datapath *dp;
865 struct flow_table *table;
866 int error;
867 int key_len;
868
869 /* Extract key. */
870 error = -EINVAL;
871 if (!a[OVS_FLOW_ATTR_KEY])
872 goto error;
873 error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
874 if (error)
875 goto error;
876
877 /* Validate actions. */
878 if (a[OVS_FLOW_ATTR_ACTIONS]) {
879 error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0);
880 if (error)
881 goto error;
882 } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
883 error = -EINVAL;
884 goto error;
885 }
886
887 dp = get_dp(ovs_header->dp_ifindex);
888 error = -ENODEV;
889 if (!dp)
890 goto error;
891
892 table = genl_dereference(dp->table);
893 flow = ovs_flow_tbl_lookup(table, &key, key_len);
894 if (!flow) {
895 struct sw_flow_actions *acts;
896
897 /* Bail out if we're not allowed to create a new flow. */
898 error = -ENOENT;
899 if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
900 goto error;
901
902 /* Expand table, if necessary, to make room. */
903 if (ovs_flow_tbl_need_to_expand(table)) {
904 struct flow_table *new_table;
905
906 new_table = ovs_flow_tbl_expand(table);
907 if (!IS_ERR(new_table)) {
908 rcu_assign_pointer(dp->table, new_table);
909 ovs_flow_tbl_deferred_destroy(table);
910 table = genl_dereference(dp->table);
911 }
912 }
913
914 /* Allocate flow. */
915 flow = ovs_flow_alloc();
916 if (IS_ERR(flow)) {
917 error = PTR_ERR(flow);
918 goto error;
919 }
920 flow->key = key;
921 clear_stats(flow);
922
923 /* Obtain actions. */
924 acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
925 error = PTR_ERR(acts);
926 if (IS_ERR(acts))
927 goto error_free_flow;
928 rcu_assign_pointer(flow->sf_acts, acts);
929
930 /* Put flow in bucket. */
931 flow->hash = ovs_flow_hash(&key, key_len);
932 ovs_flow_tbl_insert(table, flow);
933
934 reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
935 info->snd_seq,
936 OVS_FLOW_CMD_NEW);
937 } else {
938 /* We found a matching flow. */
939 struct sw_flow_actions *old_acts;
940 struct nlattr *acts_attrs;
941
942 /* Bail out if we're not allowed to modify an existing flow.
943 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
944 * because Generic Netlink treats the latter as a dump
945 * request. We also accept NLM_F_EXCL in case that bug ever
946 * gets fixed.
947 */
948 error = -EEXIST;
949 if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW &&
950 info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
951 goto error;
952
953 /* Update actions. */
954 old_acts = rcu_dereference_protected(flow->sf_acts,
955 lockdep_genl_is_held());
956 acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
957 if (acts_attrs &&
958 (old_acts->actions_len != nla_len(acts_attrs) ||
959 memcmp(old_acts->actions, nla_data(acts_attrs),
960 old_acts->actions_len))) {
961 struct sw_flow_actions *new_acts;
962
963 new_acts = ovs_flow_actions_alloc(acts_attrs);
964 error = PTR_ERR(new_acts);
965 if (IS_ERR(new_acts))
966 goto error;
967
968 rcu_assign_pointer(flow->sf_acts, new_acts);
969 ovs_flow_deferred_free_acts(old_acts);
970 }
971
972 reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
973 info->snd_seq, OVS_FLOW_CMD_NEW);
974
975 /* Clear stats. */
976 if (a[OVS_FLOW_ATTR_CLEAR]) {
977 spin_lock_bh(&flow->lock);
978 clear_stats(flow);
979 spin_unlock_bh(&flow->lock);
980 }
981 }
982
983 if (!IS_ERR(reply))
984 genl_notify(reply, genl_info_net(info), info->snd_pid,
985 ovs_dp_flow_multicast_group.id, info->nlhdr,
986 GFP_KERNEL);
987 else
988 netlink_set_err(init_net.genl_sock, 0,
989 ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
990 return 0;
991
992error_free_flow:
993 ovs_flow_free(flow);
994error:
995 return error;
996}
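
Condensed, the create-versus-modify decision above depends only on whether the flow already exists, which command was used, and the NLM_F_* flags. A hypothetical distillation (flow_cmd_check() is not an OVS function):

    #include <linux/netlink.h>
    #include <stdbool.h>

    /* 0 = proceed, -1 = -ENOENT (SET on absent flow), -2 = -EEXIST */
    static int flow_cmd_check(bool found, bool cmd_is_new, unsigned int flags)
    {
            if (!found)
                    return cmd_is_new ? 0 : -1;  /* only NEW may create */
            if (cmd_is_new && (flags & (NLM_F_CREATE | NLM_F_EXCL)))
                    return -2;            /* exclusive create collided */
            return 0;                     /* modify the existing flow */
    }
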
997
998static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
999{
1000 struct nlattr **a = info->attrs;
1001 struct ovs_header *ovs_header = info->userhdr;
1002 struct sw_flow_key key;
1003 struct sk_buff *reply;
1004 struct sw_flow *flow;
1005 struct datapath *dp;
1006 struct flow_table *table;
1007 int err;
1008 int key_len;
1009
1010 if (!a[OVS_FLOW_ATTR_KEY])
1011 return -EINVAL;
1012 err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
1013 if (err)
1014 return err;
1015
1016 dp = get_dp(ovs_header->dp_ifindex);
1017 if (!dp)
1018 return -ENODEV;
1019
1020 table = genl_dereference(dp->table);
1021 flow = ovs_flow_tbl_lookup(table, &key, key_len);
1022 if (!flow)
1023 return -ENOENT;
1024
1025 reply = ovs_flow_cmd_build_info(flow, dp, info->snd_pid,
1026 info->snd_seq, OVS_FLOW_CMD_NEW);
1027 if (IS_ERR(reply))
1028 return PTR_ERR(reply);
1029
1030 return genlmsg_reply(reply, info);
1031}
1032
1033static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
1034{
1035 struct nlattr **a = info->attrs;
1036 struct ovs_header *ovs_header = info->userhdr;
1037 struct sw_flow_key key;
1038 struct sk_buff *reply;
1039 struct sw_flow *flow;
1040 struct datapath *dp;
1041 struct flow_table *table;
1042 int err;
1043 int key_len;
1044
1045 if (!a[OVS_FLOW_ATTR_KEY])
1046 return flush_flows(ovs_header->dp_ifindex);
1047 err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
1048 if (err)
1049 return err;
1050
1051 dp = get_dp(ovs_header->dp_ifindex);
1052 if (!dp)
1053 return -ENODEV;
1054
1055 table = genl_dereference(dp->table);
1056 flow = ovs_flow_tbl_lookup(table, &key, key_len);
1057 if (!flow)
1058 return -ENOENT;
1059
1060 reply = ovs_flow_cmd_alloc_info(flow);
1061 if (!reply)
1062 return -ENOMEM;
1063
1064 ovs_flow_tbl_remove(table, flow);
1065
1066 err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_pid,
1067 info->snd_seq, 0, OVS_FLOW_CMD_DEL);
1068 BUG_ON(err < 0);
1069
1070 ovs_flow_deferred_free(flow);
1071
1072 genl_notify(reply, genl_info_net(info), info->snd_pid,
1073 ovs_dp_flow_multicast_group.id, info->nlhdr, GFP_KERNEL);
1074 return 0;
1075}
1076
1077static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1078{
1079 struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1080 struct datapath *dp;
1081 struct flow_table *table;
1082
1083 dp = get_dp(ovs_header->dp_ifindex);
1084 if (!dp)
1085 return -ENODEV;
1086
1087 table = genl_dereference(dp->table);
1088
1089 for (;;) {
1090 struct sw_flow *flow;
1091 u32 bucket, obj;
1092
1093 bucket = cb->args[0];
1094 obj = cb->args[1];
1095 flow = ovs_flow_tbl_next(table, &bucket, &obj);
1096 if (!flow)
1097 break;
1098
1099 if (ovs_flow_cmd_fill_info(flow, dp, skb,
1100 NETLINK_CB(cb->skb).pid,
1101 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1102 OVS_FLOW_CMD_NEW) < 0)
1103 break;
1104
1105 cb->args[0] = bucket;
1106 cb->args[1] = obj;
1107 }
1108 return skb->len;
1109}
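
A Netlink dump can span many recv() calls, so the iteration position must survive in cb->args[] rather than on the stack: args[0] holds the hash bucket and args[1] the offset within it, and ovs_flow_tbl_next() (defined later in this patch, in flow.c) re-skips that many entries on every resume. A userspace model of the cursor, with hypothetical names:

    #include <stddef.h>

    #define N_BUCKETS 16

    struct item { struct item *next; };
    struct cursor { unsigned int bucket, obj; };

    static struct item *table_next(struct item *buckets[N_BUCKETS],
                                   struct cursor *c)
    {
            while (c->bucket < N_BUCKETS) {
                    unsigned int i = 0;
                    struct item *it;

                    for (it = buckets[c->bucket]; it; it = it->next) {
                            if (i < c->obj) {
                                    i++;
                                    continue;
                            }
                            c->obj = i + 1;   /* resume after this entry */
                            return it;
                    }
                    c->bucket++;
                    c->obj = 0;
            }
            return NULL;
    }
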
1110
1111static struct genl_ops dp_flow_genl_ops[] = {
1112 { .cmd = OVS_FLOW_CMD_NEW,
1113 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1114 .policy = flow_policy,
1115 .doit = ovs_flow_cmd_new_or_set
1116 },
1117 { .cmd = OVS_FLOW_CMD_DEL,
1118 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1119 .policy = flow_policy,
1120 .doit = ovs_flow_cmd_del
1121 },
1122 { .cmd = OVS_FLOW_CMD_GET,
1123 .flags = 0, /* OK for unprivileged users. */
1124 .policy = flow_policy,
1125 .doit = ovs_flow_cmd_get,
1126 .dumpit = ovs_flow_cmd_dump
1127 },
1128 { .cmd = OVS_FLOW_CMD_SET,
1129 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1130 .policy = flow_policy,
1131 .doit = ovs_flow_cmd_new_or_set,
1132 },
1133};
1134
1135static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
1136 [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1137 [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1138};
1139
1140static struct genl_family dp_datapath_genl_family = {
1141 .id = GENL_ID_GENERATE,
1142 .hdrsize = sizeof(struct ovs_header),
1143 .name = OVS_DATAPATH_FAMILY,
1144 .version = OVS_DATAPATH_VERSION,
1145 .maxattr = OVS_DP_ATTR_MAX
1146};
1147
1148static struct genl_multicast_group ovs_dp_datapath_multicast_group = {
1149 .name = OVS_DATAPATH_MCGROUP
1150};
1151
1152static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
1153 u32 pid, u32 seq, u32 flags, u8 cmd)
1154{
1155 struct ovs_header *ovs_header;
1156 struct ovs_dp_stats dp_stats;
1157 int err;
1158
1159 ovs_header = genlmsg_put(skb, pid, seq, &dp_datapath_genl_family,
1160 flags, cmd);
1161 if (!ovs_header)
1162 goto error;
1163
1164 ovs_header->dp_ifindex = get_dpifindex(dp);
1165
1166 rcu_read_lock();
1167 err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp));
1168 rcu_read_unlock();
1169 if (err)
1170 goto nla_put_failure;
1171
1172 get_dp_stats(dp, &dp_stats);
1173 NLA_PUT(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats);
1174
1175 return genlmsg_end(skb, ovs_header);
1176
1177nla_put_failure:
1178 genlmsg_cancel(skb, ovs_header);
1179error:
1180 return -EMSGSIZE;
1181}
1182
1183static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 pid,
1184 u32 seq, u8 cmd)
1185{
1186 struct sk_buff *skb;
1187 int retval;
1188
1189 skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
1190 if (!skb)
1191 return ERR_PTR(-ENOMEM);
1192
1193 retval = ovs_dp_cmd_fill_info(dp, skb, pid, seq, 0, cmd);
1194 if (retval < 0) {
1195 kfree_skb(skb);
1196 return ERR_PTR(retval);
1197 }
1198 return skb;
1199}
1200
1201/* Called with genl_mutex and optionally with RTNL lock also. */
1202static struct datapath *lookup_datapath(struct ovs_header *ovs_header,
1203 struct nlattr *a[OVS_DP_ATTR_MAX + 1])
1204{
1205 struct datapath *dp;
1206
1207 if (!a[OVS_DP_ATTR_NAME])
1208 dp = get_dp(ovs_header->dp_ifindex);
1209 else {
1210 struct vport *vport;
1211
1212 rcu_read_lock();
1213 vport = ovs_vport_locate(nla_data(a[OVS_DP_ATTR_NAME]));
1214 dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL;
1215 rcu_read_unlock();
1216 }
1217 return dp ? dp : ERR_PTR(-ENODEV);
1218}
1219
1220static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
1221{
1222 struct nlattr **a = info->attrs;
1223 struct vport_parms parms;
1224 struct sk_buff *reply;
1225 struct datapath *dp;
1226 struct vport *vport;
1227 int err;
1228
1229 err = -EINVAL;
1230 if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
1231 goto err;
1232
1233 rtnl_lock();
1234 err = -ENODEV;
1235 if (!try_module_get(THIS_MODULE))
1236 goto err_unlock_rtnl;
1237
1238 err = -ENOMEM;
1239 dp = kzalloc(sizeof(*dp), GFP_KERNEL);
1240 if (dp == NULL)
1241 goto err_put_module;
1242 INIT_LIST_HEAD(&dp->port_list);
1243
1244 /* Allocate table. */
1245 err = -ENOMEM;
1246 rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS));
1247 if (!dp->table)
1248 goto err_free_dp;
1249
1250 dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
1251 if (!dp->stats_percpu) {
1252 err = -ENOMEM;
1253 goto err_destroy_table;
1254 }
1255
1256 /* Set up our datapath device. */
1257 parms.name = nla_data(a[OVS_DP_ATTR_NAME]);
1258 parms.type = OVS_VPORT_TYPE_INTERNAL;
1259 parms.options = NULL;
1260 parms.dp = dp;
1261 parms.port_no = OVSP_LOCAL;
1262 parms.upcall_pid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]);
1263
1264 vport = new_vport(&parms);
1265 if (IS_ERR(vport)) {
1266 err = PTR_ERR(vport);
1267 if (err == -EBUSY)
1268 err = -EEXIST;
1269
1270 goto err_destroy_percpu;
1271 }
1272
1273 reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1274 info->snd_seq, OVS_DP_CMD_NEW);
1275 err = PTR_ERR(reply);
1276 if (IS_ERR(reply))
1277 goto err_destroy_local_port;
1278
1279 list_add_tail(&dp->list_node, &dps);
1280 rtnl_unlock();
1281
1282 genl_notify(reply, genl_info_net(info), info->snd_pid,
1283 ovs_dp_datapath_multicast_group.id, info->nlhdr,
1284 GFP_KERNEL);
1285 return 0;
1286
1287err_destroy_local_port:
1288 ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
1289err_destroy_percpu:
1290 free_percpu(dp->stats_percpu);
1291err_destroy_table:
1292 ovs_flow_tbl_destroy(genl_dereference(dp->table));
1293err_free_dp:
1294 kfree(dp);
1295err_put_module:
1296 module_put(THIS_MODULE);
1297err_unlock_rtnl:
1298 rtnl_unlock();
1299err:
1300 return err;
1301}
1302
1303static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
1304{
1305 struct vport *vport, *next_vport;
1306 struct sk_buff *reply;
1307 struct datapath *dp;
1308 int err;
1309
1310 rtnl_lock();
1311 dp = lookup_datapath(info->userhdr, info->attrs);
1312 err = PTR_ERR(dp);
1313 if (IS_ERR(dp))
1314 goto exit_unlock;
1315
1316 reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1317 info->snd_seq, OVS_DP_CMD_DEL);
1318 err = PTR_ERR(reply);
1319 if (IS_ERR(reply))
1320 goto exit_unlock;
1321
1322 list_for_each_entry_safe(vport, next_vport, &dp->port_list, node)
1323 if (vport->port_no != OVSP_LOCAL)
1324 ovs_dp_detach_port(vport);
1325
1326 list_del(&dp->list_node);
1327 ovs_dp_detach_port(rtnl_dereference(dp->ports[OVSP_LOCAL]));
1328
1329 /* rtnl_unlock() will wait until all the references to devices that
1330 * are pending unregistration have been dropped. We do it here to
1331 * ensure that any internal devices (which contain DP pointers) are
1332 * fully destroyed before freeing the datapath.
1333 */
1334 rtnl_unlock();
1335
1336 call_rcu(&dp->rcu, destroy_dp_rcu);
1337 module_put(THIS_MODULE);
1338
1339 genl_notify(reply, genl_info_net(info), info->snd_pid,
1340 ovs_dp_datapath_multicast_group.id, info->nlhdr,
1341 GFP_KERNEL);
1342
1343 return 0;
1344
1345exit_unlock:
1346 rtnl_unlock();
1347 return err;
1348}
1349
1350static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
1351{
1352 struct sk_buff *reply;
1353 struct datapath *dp;
1354 int err;
1355
1356 dp = lookup_datapath(info->userhdr, info->attrs);
1357 if (IS_ERR(dp))
1358 return PTR_ERR(dp);
1359
1360 reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1361 info->snd_seq, OVS_DP_CMD_NEW);
1362 if (IS_ERR(reply)) {
1363 err = PTR_ERR(reply);
1364 netlink_set_err(init_net.genl_sock, 0,
1365 ovs_dp_datapath_multicast_group.id, err);
1366 return 0;
1367 }
1368
1369 genl_notify(reply, genl_info_net(info), info->snd_pid,
1370 ovs_dp_datapath_multicast_group.id, info->nlhdr,
1371 GFP_KERNEL);
1372
1373 return 0;
1374}
1375
1376static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
1377{
1378 struct sk_buff *reply;
1379 struct datapath *dp;
1380
1381 dp = lookup_datapath(info->userhdr, info->attrs);
1382 if (IS_ERR(dp))
1383 return PTR_ERR(dp);
1384
1385 reply = ovs_dp_cmd_build_info(dp, info->snd_pid,
1386 info->snd_seq, OVS_DP_CMD_NEW);
1387 if (IS_ERR(reply))
1388 return PTR_ERR(reply);
1389
1390 return genlmsg_reply(reply, info);
1391}
1392
1393static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1394{
1395 struct datapath *dp;
1396 int skip = cb->args[0];
1397 int i = 0;
1398
1399 list_for_each_entry(dp, &dps, list_node) {
1400 if (i < skip)
1401 continue;
1402 if (ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).pid,
1403 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1404 OVS_DP_CMD_NEW) < 0)
1405 break;
1406 i++;
1407 }
1408
1409 cb->args[0] = i;
1410
1411 return skb->len;
1412}
1413
1414static struct genl_ops dp_datapath_genl_ops[] = {
1415 { .cmd = OVS_DP_CMD_NEW,
1416 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1417 .policy = datapath_policy,
1418 .doit = ovs_dp_cmd_new
1419 },
1420 { .cmd = OVS_DP_CMD_DEL,
1421 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1422 .policy = datapath_policy,
1423 .doit = ovs_dp_cmd_del
1424 },
1425 { .cmd = OVS_DP_CMD_GET,
1426 .flags = 0, /* OK for unprivileged users. */
1427 .policy = datapath_policy,
1428 .doit = ovs_dp_cmd_get,
1429 .dumpit = ovs_dp_cmd_dump
1430 },
1431 { .cmd = OVS_DP_CMD_SET,
1432 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1433 .policy = datapath_policy,
1434 .doit = ovs_dp_cmd_set,
1435 },
1436};
1437
1438static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
1439 [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
1440 [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) },
1441 [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 },
1442 [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 },
1443 [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 },
1444 [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED },
1445};
1446
1447static struct genl_family dp_vport_genl_family = {
1448 .id = GENL_ID_GENERATE,
1449 .hdrsize = sizeof(struct ovs_header),
1450 .name = OVS_VPORT_FAMILY,
1451 .version = OVS_VPORT_VERSION,
1452 .maxattr = OVS_VPORT_ATTR_MAX
1453};
1454
1455struct genl_multicast_group ovs_dp_vport_multicast_group = {
1456 .name = OVS_VPORT_MCGROUP
1457};
1458
1459/* Called with RTNL lock or RCU read lock. */
1460static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
1461 u32 pid, u32 seq, u32 flags, u8 cmd)
1462{
1463 struct ovs_header *ovs_header;
1464 struct ovs_vport_stats vport_stats;
1465 int err;
1466
1467 ovs_header = genlmsg_put(skb, pid, seq, &dp_vport_genl_family,
1468 flags, cmd);
1469 if (!ovs_header)
1470 return -EMSGSIZE;
1471
1472 ovs_header->dp_ifindex = get_dpifindex(vport->dp);
1473
1474 NLA_PUT_U32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
1475 NLA_PUT_U32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type);
1476 NLA_PUT_STRING(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport));
1477 NLA_PUT_U32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_pid);
1478
1479 ovs_vport_get_stats(vport, &vport_stats);
1480 NLA_PUT(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats),
1481 &vport_stats);
1482
1483 err = ovs_vport_get_options(vport, skb);
1484 if (err == -EMSGSIZE)
1485 goto error;
1486
1487 return genlmsg_end(skb, ovs_header);
1488
1489nla_put_failure:
1490 err = -EMSGSIZE;
1491error:
1492 genlmsg_cancel(skb, ovs_header);
1493 return err;
1494}
1495
1496/* Called with RTNL lock or RCU read lock. */
1497struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 pid,
1498 u32 seq, u8 cmd)
1499{
1500 struct sk_buff *skb;
1501 int retval;
1502
1503 skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
1504 if (!skb)
1505 return ERR_PTR(-ENOMEM);
1506
1507 retval = ovs_vport_cmd_fill_info(vport, skb, pid, seq, 0, cmd);
1508 if (retval < 0) {
1509 kfree_skb(skb);
1510 return ERR_PTR(retval);
1511 }
1512 return skb;
1513}
1514
1515/* Called with RTNL lock or RCU read lock. */
1516static struct vport *lookup_vport(struct ovs_header *ovs_header,
1517 struct nlattr *a[OVS_VPORT_ATTR_MAX + 1])
1518{
1519 struct datapath *dp;
1520 struct vport *vport;
1521
1522 if (a[OVS_VPORT_ATTR_NAME]) {
1523 vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME]));
1524 if (!vport)
1525 return ERR_PTR(-ENODEV);
1526 return vport;
1527 } else if (a[OVS_VPORT_ATTR_PORT_NO]) {
1528 u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1529
1530 if (port_no >= DP_MAX_PORTS)
1531 return ERR_PTR(-EFBIG);
1532
1533 dp = get_dp(ovs_header->dp_ifindex);
1534 if (!dp)
1535 return ERR_PTR(-ENODEV);
1536
1537 vport = rcu_dereference_rtnl(dp->ports[port_no]);
1538 if (!vport)
1539 return ERR_PTR(-ENOENT);
1540 return vport;
1541 } else
1542 return ERR_PTR(-EINVAL);
1543}
1544
1545static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
1546{
1547 struct nlattr **a = info->attrs;
1548 struct ovs_header *ovs_header = info->userhdr;
1549 struct vport_parms parms;
1550 struct sk_buff *reply;
1551 struct vport *vport;
1552 struct datapath *dp;
1553 u32 port_no;
1554 int err;
1555
1556 err = -EINVAL;
1557 if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] ||
1558 !a[OVS_VPORT_ATTR_UPCALL_PID])
1559 goto exit;
1560
1561 rtnl_lock();
1562 dp = get_dp(ovs_header->dp_ifindex);
1563 err = -ENODEV;
1564 if (!dp)
1565 goto exit_unlock;
1566
1567 if (a[OVS_VPORT_ATTR_PORT_NO]) {
1568 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
1569
1570 err = -EFBIG;
1571 if (port_no >= DP_MAX_PORTS)
1572 goto exit_unlock;
1573
1574 vport = rtnl_dereference(dp->ports[port_no]);
1575 err = -EBUSY;
1576 if (vport)
1577 goto exit_unlock;
1578 } else {
1579 for (port_no = 1; ; port_no++) {
1580 if (port_no >= DP_MAX_PORTS) {
1581 err = -EFBIG;
1582 goto exit_unlock;
1583 }
1584 vport = rtnl_dereference(dp->ports[port_no]);
1585 if (!vport)
1586 break;
1587 }
1588 }
1589
1590 parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
1591 parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
1592 parms.options = a[OVS_VPORT_ATTR_OPTIONS];
1593 parms.dp = dp;
1594 parms.port_no = port_no;
1595 parms.upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
1596
1597 vport = new_vport(&parms);
1598 err = PTR_ERR(vport);
1599 if (IS_ERR(vport))
1600 goto exit_unlock;
1601
1602 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1603 OVS_VPORT_CMD_NEW);
1604 if (IS_ERR(reply)) {
1605 err = PTR_ERR(reply);
1606 ovs_dp_detach_port(vport);
1607 goto exit_unlock;
1608 }
1609 genl_notify(reply, genl_info_net(info), info->snd_pid,
1610 ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1611
1612exit_unlock:
1613 rtnl_unlock();
1614exit:
1615 return err;
1616}
1617
1618static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
1619{
1620 struct nlattr **a = info->attrs;
1621 struct sk_buff *reply;
1622 struct vport *vport;
1623 int err;
1624
1625 rtnl_lock();
1626 vport = lookup_vport(info->userhdr, a);
1627 err = PTR_ERR(vport);
1628 if (IS_ERR(vport))
1629 goto exit_unlock;
1630
1631 err = 0;
1632 if (a[OVS_VPORT_ATTR_TYPE] &&
1633 nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type)
1634 err = -EINVAL;
1635
1636 if (!err && a[OVS_VPORT_ATTR_OPTIONS])
1637 err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
1638 if (!err && a[OVS_VPORT_ATTR_UPCALL_PID])
1639 vport->upcall_pid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
1640
1641 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1642 OVS_VPORT_CMD_NEW);
1643 if (IS_ERR(reply)) {
1644 err = PTR_ERR(reply);
1645 netlink_set_err(init_net.genl_sock, 0,
1646 ovs_dp_vport_multicast_group.id, err);
1647 return 0;
1648 }
1649
1650 genl_notify(reply, genl_info_net(info), info->snd_pid,
1651 ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1652
1653exit_unlock:
1654 rtnl_unlock();
1655 return err;
1656}
1657
1658static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
1659{
1660 struct nlattr **a = info->attrs;
1661 struct sk_buff *reply;
1662 struct vport *vport;
1663 int err;
1664
1665 rtnl_lock();
1666 vport = lookup_vport(info->userhdr, a);
1667 err = PTR_ERR(vport);
1668 if (IS_ERR(vport))
1669 goto exit_unlock;
1670
1671 if (vport->port_no == OVSP_LOCAL) {
1672 err = -EINVAL;
1673 goto exit_unlock;
1674 }
1675
1676 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1677 OVS_VPORT_CMD_DEL);
1678 err = PTR_ERR(reply);
1679 if (IS_ERR(reply))
1680 goto exit_unlock;
1681
1682 ovs_dp_detach_port(vport);
1683
1684 genl_notify(reply, genl_info_net(info), info->snd_pid,
1685 ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
1686
1687exit_unlock:
1688 rtnl_unlock();
1689 return err;
1690}
1691
1692static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
1693{
1694 struct nlattr **a = info->attrs;
1695 struct ovs_header *ovs_header = info->userhdr;
1696 struct sk_buff *reply;
1697 struct vport *vport;
1698 int err;
1699
1700 rcu_read_lock();
1701 vport = lookup_vport(ovs_header, a);
1702 err = PTR_ERR(vport);
1703 if (IS_ERR(vport))
1704 goto exit_unlock;
1705
1706 reply = ovs_vport_cmd_build_info(vport, info->snd_pid, info->snd_seq,
1707 OVS_VPORT_CMD_NEW);
1708 err = PTR_ERR(reply);
1709 if (IS_ERR(reply))
1710 goto exit_unlock;
1711
1712 rcu_read_unlock();
1713
1714 return genlmsg_reply(reply, info);
1715
1716exit_unlock:
1717 rcu_read_unlock();
1718 return err;
1719}
1720
1721static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
1722{
1723 struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh));
1724 struct datapath *dp;
1725 u32 port_no;
1726 int retval;
1727
1728 dp = get_dp(ovs_header->dp_ifindex);
1729 if (!dp)
1730 return -ENODEV;
1731
1732 rcu_read_lock();
1733 for (port_no = cb->args[0]; port_no < DP_MAX_PORTS; port_no++) {
1734 struct vport *vport;
1735
1736 vport = rcu_dereference(dp->ports[port_no]);
1737 if (!vport)
1738 continue;
1739
1740 if (ovs_vport_cmd_fill_info(vport, skb, NETLINK_CB(cb->skb).pid,
1741 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1742 OVS_VPORT_CMD_NEW) < 0)
1743 break;
1744 }
1745 rcu_read_unlock();
1746
1747 cb->args[0] = port_no;
1748 retval = skb->len;
1749
1750 return retval;
1751}
1752
1753static void rehash_flow_table(struct work_struct *work)
1754{
1755 struct datapath *dp;
1756
1757 genl_lock();
1758
1759 list_for_each_entry(dp, &dps, list_node) {
1760 struct flow_table *old_table = genl_dereference(dp->table);
1761 struct flow_table *new_table;
1762
1763 new_table = ovs_flow_tbl_rehash(old_table);
1764 if (!IS_ERR(new_table)) {
1765 rcu_assign_pointer(dp->table, new_table);
1766 ovs_flow_tbl_deferred_destroy(old_table);
1767 }
1768 }
1769
1770 genl_unlock();
1771
1772 schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
1773}
1774
1775static struct genl_ops dp_vport_genl_ops[] = {
1776 { .cmd = OVS_VPORT_CMD_NEW,
1777 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1778 .policy = vport_policy,
1779 .doit = ovs_vport_cmd_new
1780 },
1781 { .cmd = OVS_VPORT_CMD_DEL,
1782 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1783 .policy = vport_policy,
1784 .doit = ovs_vport_cmd_del
1785 },
1786 { .cmd = OVS_VPORT_CMD_GET,
1787 .flags = 0, /* OK for unprivileged users. */
1788 .policy = vport_policy,
1789 .doit = ovs_vport_cmd_get,
1790 .dumpit = ovs_vport_cmd_dump
1791 },
1792 { .cmd = OVS_VPORT_CMD_SET,
1793 .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
1794 .policy = vport_policy,
1795 .doit = ovs_vport_cmd_set,
1796 },
1797};
1798
1799struct genl_family_and_ops {
1800 struct genl_family *family;
1801 struct genl_ops *ops;
1802 int n_ops;
1803 struct genl_multicast_group *group;
1804};
1805
1806static const struct genl_family_and_ops dp_genl_families[] = {
1807 { &dp_datapath_genl_family,
1808 dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops),
1809 &ovs_dp_datapath_multicast_group },
1810 { &dp_vport_genl_family,
1811 dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops),
1812 &ovs_dp_vport_multicast_group },
1813 { &dp_flow_genl_family,
1814 dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops),
1815 &ovs_dp_flow_multicast_group },
1816 { &dp_packet_genl_family,
1817 dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops),
1818 NULL },
1819};
1820
1821static void dp_unregister_genl(int n_families)
1822{
1823 int i;
1824
1825 for (i = 0; i < n_families; i++)
1826 genl_unregister_family(dp_genl_families[i].family);
1827}
1828
1829static int dp_register_genl(void)
1830{
1831 int n_registered;
1832 int err;
1833 int i;
1834
1835 n_registered = 0;
1836 for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
1837 const struct genl_family_and_ops *f = &dp_genl_families[i];
1838
1839 err = genl_register_family_with_ops(f->family, f->ops,
1840 f->n_ops);
1841 if (err)
1842 goto error;
1843 n_registered++;
1844
1845 if (f->group) {
1846 err = genl_register_mc_group(f->family, f->group);
1847 if (err)
1848 goto error;
1849 }
1850 }
1851
1852 return 0;
1853
1854error:
1855 dp_unregister_genl(n_registered);
1856 return err;
1857}
1858
1859static int __init dp_init(void)
1860{
1861 struct sk_buff *dummy_skb;
1862 int err;
1863
1864 BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof(dummy_skb->cb));
1865
1866 pr_info("Open vSwitch switching datapath\n");
1867
1868 err = ovs_flow_init();
1869 if (err)
1870 goto error;
1871
1872 err = ovs_vport_init();
1873 if (err)
1874 goto error_flow_exit;
1875
1876 err = register_netdevice_notifier(&ovs_dp_device_notifier);
1877 if (err)
1878 goto error_vport_exit;
1879
1880 err = dp_register_genl();
1881 if (err < 0)
1882 goto error_unreg_notifier;
1883
1884 schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
1885
1886 return 0;
1887
1888error_unreg_notifier:
1889 unregister_netdevice_notifier(&ovs_dp_device_notifier);
1890error_vport_exit:
1891 ovs_vport_exit();
1892error_flow_exit:
1893 ovs_flow_exit();
1894error:
1895 return err;
1896}
1897
1898static void dp_cleanup(void)
1899{
1900 cancel_delayed_work_sync(&rehash_flow_wq);
1901 rcu_barrier();
1902 dp_unregister_genl(ARRAY_SIZE(dp_genl_families));
1903 unregister_netdevice_notifier(&ovs_dp_device_notifier);
1904 ovs_vport_exit();
1905 ovs_flow_exit();
1906}
1907
1908module_init(dp_init);
1909module_exit(dp_cleanup);
1910
1911MODULE_DESCRIPTION("Open vSwitch switching datapath");
1912MODULE_LICENSE("GPL");
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
new file mode 100644
index 00000000000..5b9f884b705
--- /dev/null
+++ b/net/openvswitch/datapath.h
@@ -0,0 +1,125 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef DATAPATH_H
20#define DATAPATH_H 1
21
22#include <asm/page.h>
23#include <linux/kernel.h>
24#include <linux/mutex.h>
25#include <linux/netdevice.h>
26#include <linux/skbuff.h>
27#include <linux/u64_stats_sync.h>
28#include <linux/version.h>
29
30#include "flow.h"
31
32struct vport;
33
34#define DP_MAX_PORTS 1024
35#define SAMPLE_ACTION_DEPTH 3
36
37/**
38 * struct dp_stats_percpu - per-cpu packet processing statistics for a given
39 * datapath.
40 * @n_hit: Number of received packets for which a matching flow was found in
41 * the flow table.
42 * @n_missed: Number of received packets that had no matching flow in the
43 * flow table. The sum of @n_hit and @n_missed is the number of packets
44 * that have been received by the datapath.
45 * @n_lost: Number of received packets with no matching flow in the flow
46 * table that also could not be sent to userspace (normally due to an
47 * overflow in one of the datapath's queues).
48 */
49struct dp_stats_percpu {
50 u64 n_hit;
51 u64 n_missed;
52 u64 n_lost;
53 struct u64_stats_sync sync;
54};
55
56/**
57 * struct datapath - datapath for flow-based packet switching
58 * @rcu: RCU callback head for deferred destruction.
59 * @list_node: Element in global 'dps' list.
61 * @table: Current flow table. Protected by genl_lock and RCU.
62 * @ports: Map from port number to &struct vport. %OVSP_LOCAL port
63 * always exists, other ports may be %NULL. Protected by RTNL and RCU.
64 * @port_list: List of all ports in @ports in arbitrary order. RTNL required
65 * to iterate or modify.
66 * @stats_percpu: Per-CPU datapath statistics.
67 *
68 * Context: See the comment on locking at the top of datapath.c for additional
69 * locking information.
70 */
71struct datapath {
72 struct rcu_head rcu;
73 struct list_head list_node;
74
75 /* Flow table. */
76 struct flow_table __rcu *table;
77
78 /* Switch ports. */
79 struct vport __rcu *ports[DP_MAX_PORTS];
80 struct list_head port_list;
81
82 /* Stats. */
83 struct dp_stats_percpu __percpu *stats_percpu;
84};
85
86/**
87 * struct ovs_skb_cb - OVS data in skb CB
88 * @flow: The flow associated with this packet. May be %NULL if no flow.
89 */
90struct ovs_skb_cb {
91 struct sw_flow *flow;
92};
93#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
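
The overlay is only safe because dp_init() checks at compile time, via BUILD_BUG_ON, that struct ovs_skb_cb fits in the 48-byte skb->cb[] scratch area. A freestanding C11 equivalent of that guard, using a stand-in struct:

    struct ovs_skb_cb_model { void *flow; };   /* stand-in for ovs_skb_cb */

    _Static_assert(sizeof(struct ovs_skb_cb_model) <= 48,
                   "must fit in skb->cb[48]");
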
94
95/**
96 * struct dp_upcall - metadata to include with a packet to send to userspace
97 * @cmd: One of %OVS_PACKET_CMD_*.
98 * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull.
99 * @userdata: If nonnull, its u64 value is extracted and passed to userspace as
100 * %OVS_PACKET_ATTR_USERDATA.
101 * @pid: Netlink PID to which the packet should be sent. If @pid is 0 then
102 * no upcall is sent and the packet is instead accounted in the datapath's
103 * @n_lost counter.
104 */
105struct dp_upcall_info {
106 u8 cmd;
107 const struct sw_flow_key *key;
108 const struct nlattr *userdata;
109 u32 pid;
110};
111
112extern struct notifier_block ovs_dp_device_notifier;
113extern struct genl_multicast_group ovs_dp_vport_multicast_group;
114
115void ovs_dp_process_received_packet(struct vport *, struct sk_buff *);
116void ovs_dp_detach_port(struct vport *);
117int ovs_dp_upcall(struct datapath *, struct sk_buff *,
118 const struct dp_upcall_info *);
119
120const char *ovs_dp_name(const struct datapath *dp);
121struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
122 u8 cmd);
123
124int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
125#endif /* datapath.h */
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
new file mode 100644
index 00000000000..46736518c45
--- /dev/null
+++ b/net/openvswitch/dp_notify.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include <linux/netdevice.h>
20#include <net/genetlink.h>
21
22#include "datapath.h"
23#include "vport-internal_dev.h"
24#include "vport-netdev.h"
25
26static int dp_device_event(struct notifier_block *unused, unsigned long event,
27 void *ptr)
28{
29 struct net_device *dev = ptr;
30 struct vport *vport;
31
32 if (ovs_is_internal_dev(dev))
33 vport = ovs_internal_dev_get_vport(dev);
34 else
35 vport = ovs_netdev_get_vport(dev);
36
37 if (!vport)
38 return NOTIFY_DONE;
39
40 switch (event) {
41 case NETDEV_UNREGISTER:
42 if (!ovs_is_internal_dev(dev)) {
43 struct sk_buff *notify;
44
45 notify = ovs_vport_cmd_build_info(vport, 0, 0,
46 OVS_VPORT_CMD_DEL);
47 ovs_dp_detach_port(vport);
48 if (IS_ERR(notify)) {
49 netlink_set_err(init_net.genl_sock, 0,
50 ovs_dp_vport_multicast_group.id,
51 PTR_ERR(notify));
52 break;
53 }
54
55 genlmsg_multicast(notify, 0, ovs_dp_vport_multicast_group.id,
56 GFP_KERNEL);
57 }
58 break;
59 }
60
61 return NOTIFY_DONE;
62}
63
64struct notifier_block ovs_dp_device_notifier = {
65 .notifier_call = dp_device_event
66};
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
new file mode 100644
index 00000000000..fe7f020a843
--- /dev/null
+++ b/net/openvswitch/flow.c
@@ -0,0 +1,1346 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include "flow.h"
20#include "datapath.h"
21#include <linux/uaccess.h>
22#include <linux/netdevice.h>
23#include <linux/etherdevice.h>
24#include <linux/if_ether.h>
25#include <linux/if_vlan.h>
26#include <net/llc_pdu.h>
27#include <linux/kernel.h>
28#include <linux/jhash.h>
29#include <linux/jiffies.h>
30#include <linux/llc.h>
31#include <linux/module.h>
32#include <linux/in.h>
33#include <linux/rcupdate.h>
34#include <linux/if_arp.h>
35#include <linux/if_ether.h>
36#include <linux/ip.h>
37#include <linux/ipv6.h>
38#include <linux/tcp.h>
39#include <linux/udp.h>
40#include <linux/icmp.h>
41#include <linux/icmpv6.h>
42#include <linux/rculist.h>
43#include <net/ip.h>
44#include <net/ipv6.h>
45#include <net/ndisc.h>
46
47static struct kmem_cache *flow_cache;
48
49static int check_header(struct sk_buff *skb, int len)
50{
51 if (unlikely(skb->len < len))
52 return -EINVAL;
53 if (unlikely(!pskb_may_pull(skb, len)))
54 return -ENOMEM;
55 return 0;
56}
57
58static bool arphdr_ok(struct sk_buff *skb)
59{
60 return pskb_may_pull(skb, skb_network_offset(skb) +
61 sizeof(struct arp_eth_header));
62}
63
64static int check_iphdr(struct sk_buff *skb)
65{
66 unsigned int nh_ofs = skb_network_offset(skb);
67 unsigned int ip_len;
68 int err;
69
70 err = check_header(skb, nh_ofs + sizeof(struct iphdr));
71 if (unlikely(err))
72 return err;
73
74 ip_len = ip_hdrlen(skb);
75 if (unlikely(ip_len < sizeof(struct iphdr) ||
76 skb->len < nh_ofs + ip_len))
77 return -EINVAL;
78
79 skb_set_transport_header(skb, nh_ofs + ip_len);
80 return 0;
81}
82
83static bool tcphdr_ok(struct sk_buff *skb)
84{
85 int th_ofs = skb_transport_offset(skb);
86 int tcp_len;
87
88 if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
89 return false;
90
91 tcp_len = tcp_hdrlen(skb);
92 if (unlikely(tcp_len < sizeof(struct tcphdr) ||
93 skb->len < th_ofs + tcp_len))
94 return false;
95
96 return true;
97}
98
99static bool udphdr_ok(struct sk_buff *skb)
100{
101 return pskb_may_pull(skb, skb_transport_offset(skb) +
102 sizeof(struct udphdr));
103}
104
105static bool icmphdr_ok(struct sk_buff *skb)
106{
107 return pskb_may_pull(skb, skb_transport_offset(skb) +
108 sizeof(struct icmphdr));
109}
110
111u64 ovs_flow_used_time(unsigned long flow_jiffies)
112{
113 struct timespec cur_ts;
114 u64 cur_ms, idle_ms;
115
116 ktime_get_ts(&cur_ts);
117 idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
118 cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC +
119 cur_ts.tv_nsec / NSEC_PER_MSEC;
120
121 return cur_ms - idle_ms;
122}
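
The flow's 'used' stamp is kept in jiffies, but userspace receives an absolute millisecond timestamp: take "now" in milliseconds and subtract how long the flow has been idle. The same arithmetic with POSIX clocks, as a sketch rather than the kernel helper:

    #include <stdint.h>
    #include <time.h>

    static uint64_t used_time_ms(uint64_t idle_ms)
    {
            struct timespec ts;
            uint64_t now_ms;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            now_ms = (uint64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
            return now_ms - idle_ms;    /* when the flow was last used */
    }
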
123
124#define SW_FLOW_KEY_OFFSET(field) \
125 (offsetof(struct sw_flow_key, field) + \
126 FIELD_SIZEOF(struct sw_flow_key, field))
127
128static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
129 int *key_lenp)
130{
131 unsigned int nh_ofs = skb_network_offset(skb);
132 unsigned int nh_len;
133 int payload_ofs;
134 struct ipv6hdr *nh;
135 uint8_t nexthdr;
136 __be16 frag_off;
137 int err;
138
139 *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
140
141 err = check_header(skb, nh_ofs + sizeof(*nh));
142 if (unlikely(err))
143 return err;
144
145 nh = ipv6_hdr(skb);
146 nexthdr = nh->nexthdr;
147 payload_ofs = (u8 *)(nh + 1) - skb->data;
148
149 key->ip.proto = NEXTHDR_NONE;
150 key->ip.tos = ipv6_get_dsfield(nh);
151 key->ip.ttl = nh->hop_limit;
152 key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
153 key->ipv6.addr.src = nh->saddr;
154 key->ipv6.addr.dst = nh->daddr;
155
156 payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
157 if (unlikely(payload_ofs < 0))
158 return -EINVAL;
159
160 if (frag_off) {
161 if (frag_off & htons(~0x7))
162 key->ip.frag = OVS_FRAG_TYPE_LATER;
163 else
164 key->ip.frag = OVS_FRAG_TYPE_FIRST;
165 }
166
167 nh_len = payload_ofs - nh_ofs;
168 skb_set_transport_header(skb, nh_ofs + nh_len);
169 key->ip.proto = nexthdr;
170 return nh_len;
171}
172
173static bool icmp6hdr_ok(struct sk_buff *skb)
174{
175 return pskb_may_pull(skb, skb_transport_offset(skb) +
176 sizeof(struct icmp6hdr));
177}
178
179#define TCP_FLAGS_OFFSET 13
180#define TCP_FLAG_MASK 0x3f
181
182void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
183{
184 u8 tcp_flags = 0;
185
186 if (flow->key.eth.type == htons(ETH_P_IP) &&
187 flow->key.ip.proto == IPPROTO_TCP) {
188 u8 *tcp = (u8 *)tcp_hdr(skb);
189 tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
190 }
191
192 spin_lock(&flow->lock);
193 flow->used = jiffies;
194 flow->packet_count++;
195 flow->byte_count += skb->len;
196 flow->tcp_flags |= tcp_flags;
197 spin_unlock(&flow->lock);
198}
199
200struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
201{
202 int actions_len = nla_len(actions);
203 struct sw_flow_actions *sfa;
204
205 /* At least DP_MAX_PORTS actions are required to be able to flood a
206 * packet to every port. Factor of 2 allows for setting VLAN tags,
207 * etc. */
208 if (actions_len > 2 * DP_MAX_PORTS * nla_total_size(4))
209 return ERR_PTR(-EINVAL);
210
211 sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
212 if (!sfa)
213 return ERR_PTR(-ENOMEM);
214
215 sfa->actions_len = actions_len;
216 memcpy(sfa->actions, nla_data(actions), actions_len);
217 return sfa;
218}
219
220struct sw_flow *ovs_flow_alloc(void)
221{
222 struct sw_flow *flow;
223
224 flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
225 if (!flow)
226 return ERR_PTR(-ENOMEM);
227
228 spin_lock_init(&flow->lock);
229 flow->sf_acts = NULL;
230
231 return flow;
232}
233
234static struct hlist_head *find_bucket(struct flow_table *table, u32 hash)
235{
236 hash = jhash_1word(hash, table->hash_seed);
237 return flex_array_get(table->buckets,
238 (hash & (table->n_buckets - 1)));
239}
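
Masking with n_buckets - 1 is only a valid substitute for a modulus because every table size here is a power of two: TBL_MIN_BUCKETS to start, then doubling via ovs_flow_tbl_expand(). The identity, demonstrated:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t n_buckets = 1024;        /* any power of two */
            uint32_t hash = 0xdeadbeef;

            assert((hash & (n_buckets - 1)) == hash % n_buckets);
            return 0;
    }
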
240
241static struct flex_array *alloc_buckets(unsigned int n_buckets)
242{
243 struct flex_array *buckets;
244 int i, err;
245
246 buckets = flex_array_alloc(sizeof(struct hlist_head *),
247 n_buckets, GFP_KERNEL);
248 if (!buckets)
249 return NULL;
250
251 err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
252 if (err) {
253 flex_array_free(buckets);
254 return NULL;
255 }
256
257 for (i = 0; i < n_buckets; i++)
258 INIT_HLIST_HEAD((struct hlist_head *)
259 flex_array_get(buckets, i));
260
261 return buckets;
262}
263
264static void free_buckets(struct flex_array *buckets)
265{
266 flex_array_free(buckets);
267}
268
269struct flow_table *ovs_flow_tbl_alloc(int new_size)
270{
271 struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
272
273 if (!table)
274 return NULL;
275
276 table->buckets = alloc_buckets(new_size);
277
278 if (!table->buckets) {
279 kfree(table);
280 return NULL;
281 }
282 table->n_buckets = new_size;
283 table->count = 0;
284 table->node_ver = 0;
285 table->keep_flows = false;
286 get_random_bytes(&table->hash_seed, sizeof(u32));
287
288 return table;
289}
290
291void ovs_flow_tbl_destroy(struct flow_table *table)
292{
293 int i;
294
295 if (!table)
296 return;
297
298 if (table->keep_flows)
299 goto skip_flows;
300
301 for (i = 0; i < table->n_buckets; i++) {
302 struct sw_flow *flow;
303 struct hlist_head *head = flex_array_get(table->buckets, i);
304 struct hlist_node *node, *n;
305 int ver = table->node_ver;
306
307 hlist_for_each_entry_safe(flow, node, n, head, hash_node[ver]) {
308 hlist_del_rcu(&flow->hash_node[ver]);
309 ovs_flow_free(flow);
310 }
311 }
312
313skip_flows:
314 free_buckets(table->buckets);
315 kfree(table);
316}
317
318static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
319{
320 struct flow_table *table = container_of(rcu, struct flow_table, rcu);
321
322 ovs_flow_tbl_destroy(table);
323}
324
325void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
326{
327 if (!table)
328 return;
329
330 call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
331}
332
333struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
334{
335 struct sw_flow *flow;
336 struct hlist_head *head;
337 struct hlist_node *n;
338 int ver;
339 int i;
340
341 ver = table->node_ver;
342 while (*bucket < table->n_buckets) {
343 i = 0;
344 head = flex_array_get(table->buckets, *bucket);
345 hlist_for_each_entry_rcu(flow, n, head, hash_node[ver]) {
346 if (i < *last) {
347 i++;
348 continue;
349 }
350 *last = i + 1;
351 return flow;
352 }
353 (*bucket)++;
354 *last = 0;
355 }
356
357 return NULL;
358}
359
360static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
361{
362 int old_ver;
363 int i;
364
365 old_ver = old->node_ver;
366 new->node_ver = !old_ver;
367
368 /* Insert in new table. */
369 for (i = 0; i < old->n_buckets; i++) {
370 struct sw_flow *flow;
371 struct hlist_head *head;
372 struct hlist_node *n;
373
374 head = flex_array_get(old->buckets, i);
375
376 hlist_for_each_entry(flow, n, head, hash_node[old_ver])
377 ovs_flow_tbl_insert(new, flow);
378 }
379 old->keep_flows = true;
380}
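
The two hash_node[] slots on each sw_flow are what make this copy safe under RCU: the new table links flows through the slot the old table is not using (node_ver flipped), so readers still traversing the old version never see its chains modified. Schematically:

    /* Schematic only: two link slots let one flow sit in two tables. */
    struct flow_model {
            struct flow_model *chain[2];   /* like sw_flow's hash_node[2] */
    };

    /* The old table threads flows via chain[ver]; the rehash builds the
     * new table via chain[!ver], leaving the old links untouched until
     * the deferred destroy runs after an RCU grace period. */
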
381
382static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets)
383{
384 struct flow_table *new_table;
385
386 new_table = ovs_flow_tbl_alloc(n_buckets);
387 if (!new_table)
388 return ERR_PTR(-ENOMEM);
389
390 flow_table_copy_flows(table, new_table);
391
392 return new_table;
393}
394
395struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table)
396{
397 return __flow_tbl_rehash(table, table->n_buckets);
398}
399
400struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
401{
402 return __flow_tbl_rehash(table, table->n_buckets * 2);
403}
404
405void ovs_flow_free(struct sw_flow *flow)
406{
407 if (unlikely(!flow))
408 return;
409
410	kfree((struct sw_flow_actions __force *)flow->sf_acts);
411 kmem_cache_free(flow_cache, flow);
412}
413
414/* RCU callback used by ovs_flow_deferred_free. */
415static void rcu_free_flow_callback(struct rcu_head *rcu)
416{
417 struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
418
419 ovs_flow_free(flow);
420}
421
422/* Schedules 'flow' to be freed after the next RCU grace period.
423 * The caller must hold rcu_read_lock for this to be sensible. */
424void ovs_flow_deferred_free(struct sw_flow *flow)
425{
426 call_rcu(&flow->rcu, rcu_free_flow_callback);
427}
428
429/* RCU callback used by ovs_flow_deferred_free_acts. */
430static void rcu_free_acts_callback(struct rcu_head *rcu)
431{
432 struct sw_flow_actions *sf_acts = container_of(rcu,
433 struct sw_flow_actions, rcu);
434 kfree(sf_acts);
435}
436
437/* Schedules 'sf_acts' to be freed after the next RCU grace period.
438 * The caller must hold rcu_read_lock for this to be sensible. */
439void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
440{
441 call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
442}
443
444static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
445{
446 struct qtag_prefix {
447 __be16 eth_type; /* ETH_P_8021Q */
448 __be16 tci;
449 };
450 struct qtag_prefix *qp;
451
452 if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)))
453 return 0;
454
455 if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
456 sizeof(__be16))))
457 return -ENOMEM;
458
459 qp = (struct qtag_prefix *) skb->data;
460 key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT);
461 __skb_pull(skb, sizeof(struct qtag_prefix));
462
463 return 0;
464}
465
466static __be16 parse_ethertype(struct sk_buff *skb)
467{
468 struct llc_snap_hdr {
469 u8 dsap; /* Always 0xAA */
470 u8 ssap; /* Always 0xAA */
471 u8 ctrl;
472 u8 oui[3];
473 __be16 ethertype;
474 };
475 struct llc_snap_hdr *llc;
476 __be16 proto;
477
478 proto = *(__be16 *) skb->data;
479 __skb_pull(skb, sizeof(__be16));
480
481 if (ntohs(proto) >= 1536)
482 return proto;
483
484 if (skb->len < sizeof(struct llc_snap_hdr))
485 return htons(ETH_P_802_2);
486
487 if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
488 return htons(0);
489
490 llc = (struct llc_snap_hdr *) skb->data;
491 if (llc->dsap != LLC_SAP_SNAP ||
492 llc->ssap != LLC_SAP_SNAP ||
493 (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
494 return htons(ETH_P_802_2);
495
496 __skb_pull(skb, sizeof(struct llc_snap_hdr));
497 return llc->ethertype;
498}
499
500static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
501 int *key_lenp, int nh_len)
502{
503 struct icmp6hdr *icmp = icmp6_hdr(skb);
504 int error = 0;
505 int key_len;
506
507 /* The ICMPv6 type and code fields use the 16-bit transport port
508 * fields, so we need to store them in 16-bit network byte order.
509 */
510 key->ipv6.tp.src = htons(icmp->icmp6_type);
511 key->ipv6.tp.dst = htons(icmp->icmp6_code);
512 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
513
514 if (icmp->icmp6_code == 0 &&
515 (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
516 icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
517 int icmp_len = skb->len - skb_transport_offset(skb);
518 struct nd_msg *nd;
519 int offset;
520
521 key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
522
523 /* In order to process neighbor discovery options, we need the
524 * entire packet.
525 */
526 if (unlikely(icmp_len < sizeof(*nd)))
527 goto out;
528 if (unlikely(skb_linearize(skb))) {
529 error = -ENOMEM;
530 goto out;
531 }
532
533 nd = (struct nd_msg *)skb_transport_header(skb);
534 key->ipv6.nd.target = nd->target;
535 key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
536
537 icmp_len -= sizeof(*nd);
538 offset = 0;
539 while (icmp_len >= 8) {
540 struct nd_opt_hdr *nd_opt =
541 (struct nd_opt_hdr *)(nd->opt + offset);
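			/* Per RFC 4861, nd_opt_len is in units of 8 octets
			 * and zero is invalid, hence the checks below. */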
542 int opt_len = nd_opt->nd_opt_len * 8;
543
544 if (unlikely(!opt_len || opt_len > icmp_len))
545 goto invalid;
546
547 /* Store the link layer address if the appropriate
548 * option is provided. It is considered an error if
549 * the same link layer option is specified twice.
550 */
551 if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
552 && opt_len == 8) {
553 if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
554 goto invalid;
555 memcpy(key->ipv6.nd.sll,
556 &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
557 } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
558 && opt_len == 8) {
559 if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
560 goto invalid;
561 memcpy(key->ipv6.nd.tll,
562 &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN);
563 }
564
565 icmp_len -= opt_len;
566 offset += opt_len;
567 }
568 }
569
570 goto out;
571
572invalid:
573 memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
574 memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
575 memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
576
577out:
578 *key_lenp = key_len;
579 return error;
580}
581
582/**
583 * ovs_flow_extract - extracts a flow key from an Ethernet frame.
584 * @skb: sk_buff that contains the frame, with skb->data pointing to the
585 * Ethernet header
586 * @in_port: port number on which @skb was received.
587 * @key: output flow key
588 * @key_lenp: length of output flow key
589 *
590 * The caller must ensure that skb->len >= ETH_HLEN.
591 *
592 * Returns 0 if successful, otherwise a negative errno value.
593 *
594 * Initializes @skb header pointers as follows:
595 *
596 * - skb->mac_header: the Ethernet header.
597 *
598 * - skb->network_header: just past the Ethernet header, or just past the
599 * VLAN header, to the first byte of the Ethernet payload.
600 *
601 * - skb->transport_header: If key->dl_type is ETH_P_IP or ETH_P_IPV6
602 * on output, then just past the IP header, if one is present and
603 * of a correct length, otherwise the same as skb->network_header.
604 * For other key->dl_type values it is left untouched.
605 */
606int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
607 int *key_lenp)
608{
609 int error = 0;
610 int key_len = SW_FLOW_KEY_OFFSET(eth);
611 struct ethhdr *eth;
612
613 memset(key, 0, sizeof(*key));
614
615 key->phy.priority = skb->priority;
616 key->phy.in_port = in_port;
617
618 skb_reset_mac_header(skb);
619
620 /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
621 * header in the linear data area.
622 */
623 eth = eth_hdr(skb);
624 memcpy(key->eth.src, eth->h_source, ETH_ALEN);
625 memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
626
627 __skb_pull(skb, 2 * ETH_ALEN);
628
629 if (vlan_tx_tag_present(skb))
630 key->eth.tci = htons(skb->vlan_tci);
631 else if (eth->h_proto == htons(ETH_P_8021Q))
632 if (unlikely(parse_vlan(skb, key)))
633 return -ENOMEM;
634
635 key->eth.type = parse_ethertype(skb);
636 if (unlikely(key->eth.type == htons(0)))
637 return -ENOMEM;
638
639 skb_reset_network_header(skb);
640 __skb_push(skb, skb->data - skb_mac_header(skb));
641
642 /* Network layer. */
643 if (key->eth.type == htons(ETH_P_IP)) {
644 struct iphdr *nh;
645 __be16 offset;
646
647 key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
648
649 error = check_iphdr(skb);
650 if (unlikely(error)) {
651 if (error == -EINVAL) {
652 skb->transport_header = skb->network_header;
653 error = 0;
654 }
655 goto out;
656 }
657
658 nh = ip_hdr(skb);
659 key->ipv4.addr.src = nh->saddr;
660 key->ipv4.addr.dst = nh->daddr;
661
662 key->ip.proto = nh->protocol;
663 key->ip.tos = nh->tos;
664 key->ip.ttl = nh->ttl;
665
666 offset = nh->frag_off & htons(IP_OFFSET);
667 if (offset) {
668 key->ip.frag = OVS_FRAG_TYPE_LATER;
669 goto out;
670 }
671 if (nh->frag_off & htons(IP_MF) ||
672 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
673 key->ip.frag = OVS_FRAG_TYPE_FIRST;
674
675 /* Transport layer. */
676 if (key->ip.proto == IPPROTO_TCP) {
677 key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
678 if (tcphdr_ok(skb)) {
679 struct tcphdr *tcp = tcp_hdr(skb);
680 key->ipv4.tp.src = tcp->source;
681 key->ipv4.tp.dst = tcp->dest;
682 }
683 } else if (key->ip.proto == IPPROTO_UDP) {
684 key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
685 if (udphdr_ok(skb)) {
686 struct udphdr *udp = udp_hdr(skb);
687 key->ipv4.tp.src = udp->source;
688 key->ipv4.tp.dst = udp->dest;
689 }
690 } else if (key->ip.proto == IPPROTO_ICMP) {
691 key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
692 if (icmphdr_ok(skb)) {
693 struct icmphdr *icmp = icmp_hdr(skb);
694 /* The ICMP type and code fields use the 16-bit
695 * transport port fields, so we need to store
696 * them in 16-bit network byte order. */
697 key->ipv4.tp.src = htons(icmp->type);
698 key->ipv4.tp.dst = htons(icmp->code);
699 }
700 }
701
702 } else if (key->eth.type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
703 struct arp_eth_header *arp;
704
705 arp = (struct arp_eth_header *)skb_network_header(skb);
706
707 if (arp->ar_hrd == htons(ARPHRD_ETHER)
708 && arp->ar_pro == htons(ETH_P_IP)
709 && arp->ar_hln == ETH_ALEN
710 && arp->ar_pln == 4) {
711
712 /* We only match on the lower 8 bits of the opcode. */
713 if (ntohs(arp->ar_op) <= 0xff)
714 key->ip.proto = ntohs(arp->ar_op);
715
716 if (key->ip.proto == ARPOP_REQUEST
717 || key->ip.proto == ARPOP_REPLY) {
718 memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
719 memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
720 memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
721 memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
722 key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
723 }
724 }
725 } else if (key->eth.type == htons(ETH_P_IPV6)) {
726 int nh_len; /* IPv6 Header + Extensions */
727
728 nh_len = parse_ipv6hdr(skb, key, &key_len);
729 if (unlikely(nh_len < 0)) {
730 if (nh_len == -EINVAL)
731 skb->transport_header = skb->network_header;
732 else
733 error = nh_len;
734 goto out;
735 }
736
737 if (key->ip.frag == OVS_FRAG_TYPE_LATER)
738 goto out;
739 if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
740 key->ip.frag = OVS_FRAG_TYPE_FIRST;
741
742 /* Transport layer. */
743 if (key->ip.proto == NEXTHDR_TCP) {
744 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
745 if (tcphdr_ok(skb)) {
746 struct tcphdr *tcp = tcp_hdr(skb);
747 key->ipv6.tp.src = tcp->source;
748 key->ipv6.tp.dst = tcp->dest;
749 }
750 } else if (key->ip.proto == NEXTHDR_UDP) {
751 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
752 if (udphdr_ok(skb)) {
753 struct udphdr *udp = udp_hdr(skb);
754 key->ipv6.tp.src = udp->source;
755 key->ipv6.tp.dst = udp->dest;
756 }
757 } else if (key->ip.proto == NEXTHDR_ICMP) {
758 key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
759 if (icmp6hdr_ok(skb)) {
760 error = parse_icmpv6(skb, key, &key_len, nh_len);
761 if (error < 0)
762 goto out;
763 }
764 }
765 }
766
767out:
768 *key_lenp = key_len;
769 return error;
770}
771
772u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len)
773{
774 return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0);
775}
776
777struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
778 struct sw_flow_key *key, int key_len)
779{
780 struct sw_flow *flow;
781 struct hlist_node *n;
782 struct hlist_head *head;
783 u32 hash;
784
785 hash = ovs_flow_hash(key, key_len);
786
787 head = find_bucket(table, hash);
788 hlist_for_each_entry_rcu(flow, n, head, hash_node[table->node_ver]) {
789
790 if (flow->hash == hash &&
791 !memcmp(&flow->key, key, key_len)) {
792 return flow;
793 }
794 }
795 return NULL;
796}
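/* Illustrative fast-path sketch: extract a key from the packet, look it up,
 * and fall back to a userspace upcall on a miss. queue_upcall() and
 * execute_actions() are hypothetical helpers standing in for the datapath's
 * real miss and execution paths.
 */
	error = ovs_flow_extract(skb, vport->port_no, &key, &key_len);
	if (unlikely(error))
		return;
	flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
	if (!flow) {
		queue_upcall(dp, skb, &key);		/* hypothetical */
		return;
	}
	execute_actions(dp, skb, flow);			/* hypothetical */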
797
798void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
799{
800 struct hlist_head *head;
801
802 head = find_bucket(table, flow->hash);
803 hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
804 table->count++;
805}
806
807void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
808{
809 hlist_del_rcu(&flow->hash_node[table->node_ver]);
810 table->count--;
811 BUG_ON(table->count < 0);
812}
813
814/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */
815const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
816 [OVS_KEY_ATTR_ENCAP] = -1,
817 [OVS_KEY_ATTR_PRIORITY] = sizeof(u32),
818 [OVS_KEY_ATTR_IN_PORT] = sizeof(u32),
819 [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet),
820 [OVS_KEY_ATTR_VLAN] = sizeof(__be16),
821 [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16),
822 [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4),
823 [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
824 [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
825 [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
826 [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
827 [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
828 [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
829 [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
830};
831
832static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
833 const struct nlattr *a[], u32 *attrs)
834{
835 const struct ovs_key_icmp *icmp_key;
836 const struct ovs_key_tcp *tcp_key;
837 const struct ovs_key_udp *udp_key;
838
839 switch (swkey->ip.proto) {
840 case IPPROTO_TCP:
841 if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
842 return -EINVAL;
843 *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
844
845 *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
846 tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
847 swkey->ipv4.tp.src = tcp_key->tcp_src;
848 swkey->ipv4.tp.dst = tcp_key->tcp_dst;
849 break;
850
851 case IPPROTO_UDP:
852 if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
853 return -EINVAL;
854 *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
855
856 *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
857 udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
858 swkey->ipv4.tp.src = udp_key->udp_src;
859 swkey->ipv4.tp.dst = udp_key->udp_dst;
860 break;
861
862 case IPPROTO_ICMP:
863 if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP)))
864 return -EINVAL;
865 *attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
866
867 *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
868 icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
869 swkey->ipv4.tp.src = htons(icmp_key->icmp_type);
870 swkey->ipv4.tp.dst = htons(icmp_key->icmp_code);
871 break;
872 }
873
874 return 0;
875}
876
877static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
878 const struct nlattr *a[], u32 *attrs)
879{
880 const struct ovs_key_icmpv6 *icmpv6_key;
881 const struct ovs_key_tcp *tcp_key;
882 const struct ovs_key_udp *udp_key;
883
884 switch (swkey->ip.proto) {
885 case IPPROTO_TCP:
886 if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
887 return -EINVAL;
888 *attrs &= ~(1 << OVS_KEY_ATTR_TCP);
889
890 *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
891 tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
892 swkey->ipv6.tp.src = tcp_key->tcp_src;
893 swkey->ipv6.tp.dst = tcp_key->tcp_dst;
894 break;
895
896 case IPPROTO_UDP:
897 if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
898 return -EINVAL;
899 *attrs &= ~(1 << OVS_KEY_ATTR_UDP);
900
901 *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
902 udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
903 swkey->ipv6.tp.src = udp_key->udp_src;
904 swkey->ipv6.tp.dst = udp_key->udp_dst;
905 break;
906
907 case IPPROTO_ICMPV6:
908 if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6)))
909 return -EINVAL;
910 *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
911
912 *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
913 icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
914 swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type);
915 swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code);
916
917 if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
918 swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
919 const struct ovs_key_nd *nd_key;
920
921 if (!(*attrs & (1 << OVS_KEY_ATTR_ND)))
922 return -EINVAL;
923 *attrs &= ~(1 << OVS_KEY_ATTR_ND);
924
925 *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
926 nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
927 memcpy(&swkey->ipv6.nd.target, nd_key->nd_target,
928 sizeof(swkey->ipv6.nd.target));
929 memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN);
930 memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN);
931 }
932 break;
933 }
934
935 return 0;
936}
937
938static int parse_flow_nlattrs(const struct nlattr *attr,
939 const struct nlattr *a[], u32 *attrsp)
940{
941 const struct nlattr *nla;
942 u32 attrs;
943 int rem;
944
945 attrs = 0;
946 nla_for_each_nested(nla, attr, rem) {
947 u16 type = nla_type(nla);
948 int expected_len;
949
950 if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type))
951 return -EINVAL;
952
953 expected_len = ovs_key_lens[type];
954 if (nla_len(nla) != expected_len && expected_len != -1)
955 return -EINVAL;
956
957 attrs |= 1 << type;
958 a[type] = nla;
959 }
960 if (rem)
961 return -EINVAL;
962
963 *attrsp = attrs;
964 return 0;
965}
966
967/**
968 * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
969 * @swkey: receives the extracted flow key.
970 * @key_lenp: number of bytes used in @swkey.
971 * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
972 * sequence.
973 */
974int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
975 const struct nlattr *attr)
976{
977 const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
978 const struct ovs_key_ethernet *eth_key;
979 int key_len;
980 u32 attrs;
981 int err;
982
983 memset(swkey, 0, sizeof(struct sw_flow_key));
984 key_len = SW_FLOW_KEY_OFFSET(eth);
985
986 err = parse_flow_nlattrs(attr, a, &attrs);
987 if (err)
988 return err;
989
990 /* Metadata attributes. */
991 if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
992 swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]);
993 attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
994 }
995 if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
996 u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
997 if (in_port >= DP_MAX_PORTS)
998 return -EINVAL;
999 swkey->phy.in_port = in_port;
1000 attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
1001 } else {
1002 swkey->phy.in_port = USHRT_MAX;
1003 }
1004
1005 /* Data attributes. */
1006 if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
1007 return -EINVAL;
1008 attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
1009
1010 eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
1011 memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN);
1012 memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN);
1013
1014 if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) &&
1015 nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) {
1016 const struct nlattr *encap;
1017 __be16 tci;
1018
1019 if (attrs != ((1 << OVS_KEY_ATTR_VLAN) |
1020 (1 << OVS_KEY_ATTR_ETHERTYPE) |
1021 (1 << OVS_KEY_ATTR_ENCAP)))
1022 return -EINVAL;
1023
1024 encap = a[OVS_KEY_ATTR_ENCAP];
1025 tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
1026 if (tci & htons(VLAN_TAG_PRESENT)) {
1027 swkey->eth.tci = tci;
1028
1029 err = parse_flow_nlattrs(encap, a, &attrs);
1030 if (err)
1031 return err;
1032 } else if (!tci) {
1033 /* Corner case for truncated 802.1Q header. */
1034 if (nla_len(encap))
1035 return -EINVAL;
1036
1037 swkey->eth.type = htons(ETH_P_8021Q);
1038 *key_lenp = key_len;
1039 return 0;
1040 } else {
1041 return -EINVAL;
1042 }
1043 }
1044
1045 if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
1046 swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
1047 if (ntohs(swkey->eth.type) < 1536)
1048 return -EINVAL;
1049 attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
1050 } else {
1051 swkey->eth.type = htons(ETH_P_802_2);
1052 }
1053
1054 if (swkey->eth.type == htons(ETH_P_IP)) {
1055 const struct ovs_key_ipv4 *ipv4_key;
1056
1057 if (!(attrs & (1 << OVS_KEY_ATTR_IPV4)))
1058 return -EINVAL;
1059 attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
1060
1061 key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
1062 ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
1063 if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX)
1064 return -EINVAL;
1065 swkey->ip.proto = ipv4_key->ipv4_proto;
1066 swkey->ip.tos = ipv4_key->ipv4_tos;
1067 swkey->ip.ttl = ipv4_key->ipv4_ttl;
1068 swkey->ip.frag = ipv4_key->ipv4_frag;
1069 swkey->ipv4.addr.src = ipv4_key->ipv4_src;
1070 swkey->ipv4.addr.dst = ipv4_key->ipv4_dst;
1071
1072 if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
1073 err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs);
1074 if (err)
1075 return err;
1076 }
1077 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1078 const struct ovs_key_ipv6 *ipv6_key;
1079
1080 if (!(attrs & (1 << OVS_KEY_ATTR_IPV6)))
1081 return -EINVAL;
1082 attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
1083
1084 key_len = SW_FLOW_KEY_OFFSET(ipv6.label);
1085 ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
1086 if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX)
1087 return -EINVAL;
1088 swkey->ipv6.label = ipv6_key->ipv6_label;
1089 swkey->ip.proto = ipv6_key->ipv6_proto;
1090 swkey->ip.tos = ipv6_key->ipv6_tclass;
1091 swkey->ip.ttl = ipv6_key->ipv6_hlimit;
1092 swkey->ip.frag = ipv6_key->ipv6_frag;
1093 memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src,
1094 sizeof(swkey->ipv6.addr.src));
1095 memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst,
1096 sizeof(swkey->ipv6.addr.dst));
1097
1098 if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
1099 err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs);
1100 if (err)
1101 return err;
1102 }
1103 } else if (swkey->eth.type == htons(ETH_P_ARP)) {
1104 const struct ovs_key_arp *arp_key;
1105
1106 if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
1107 return -EINVAL;
1108 attrs &= ~(1 << OVS_KEY_ATTR_ARP);
1109
1110 key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
1111 arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
1112 swkey->ipv4.addr.src = arp_key->arp_sip;
1113 swkey->ipv4.addr.dst = arp_key->arp_tip;
1114 if (arp_key->arp_op & htons(0xff00))
1115 return -EINVAL;
1116 swkey->ip.proto = ntohs(arp_key->arp_op);
1117 memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
1118 memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
1119 }
1120
1121 if (attrs)
1122 return -EINVAL;
1123 *key_lenp = key_len;
1124
1125 return 0;
1126}
1127
1128/**
1129 * ovs_flow_metadata_from_nlattrs - parses flow key metadata from Netlink attributes.
1130 * @priority: receives the extracted packet priority.
1131 * @in_port: receives the extracted input port.
1132 * @attr: Netlink attribute holding a nested %OVS_KEY_ATTR_* attribute sequence.
1133 *
1134 * This parses a series of Netlink attributes that form a flow key, which must
1135 * take the same form accepted by ovs_flow_from_nlattrs(), but only enough of it to
1136 * get the metadata, that is, the parts of the flow key that cannot be
1137 * extracted from the packet itself.
1138 */
1139int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
1140 const struct nlattr *attr)
1141{
1142 const struct nlattr *nla;
1143 int rem;
1144
1145 *in_port = USHRT_MAX;
1146 *priority = 0;
1147
1148 nla_for_each_nested(nla, attr, rem) {
1149 int type = nla_type(nla);
1150
1151 if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
1152 if (nla_len(nla) != ovs_key_lens[type])
1153 return -EINVAL;
1154
1155 switch (type) {
1156 case OVS_KEY_ATTR_PRIORITY:
1157 *priority = nla_get_u32(nla);
1158 break;
1159
1160 case OVS_KEY_ATTR_IN_PORT:
1161 if (nla_get_u32(nla) >= DP_MAX_PORTS)
1162 return -EINVAL;
1163 *in_port = nla_get_u32(nla);
1164 break;
1165 }
1166 }
1167 }
1168 if (rem)
1169 return -EINVAL;
1170 return 0;
1171}
1172
1173int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
1174{
1175 struct ovs_key_ethernet *eth_key;
1176 struct nlattr *nla, *encap;
1177
1178 if (swkey->phy.priority)
1179 NLA_PUT_U32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority);
1180
1181 if (swkey->phy.in_port != USHRT_MAX)
1182 NLA_PUT_U32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port);
1183
1184 nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
1185 if (!nla)
1186 goto nla_put_failure;
1187 eth_key = nla_data(nla);
1188 memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN);
1189 memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN);
1190
1191 if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
1192 NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q));
1193 NLA_PUT_BE16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci);
1194 encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
1195 if (!swkey->eth.tci)
1196 goto unencap;
1197 } else {
1198 encap = NULL;
1199 }
1200
1201 if (swkey->eth.type == htons(ETH_P_802_2))
1202 goto unencap;
1203
1204 NLA_PUT_BE16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type);
1205
1206 if (swkey->eth.type == htons(ETH_P_IP)) {
1207 struct ovs_key_ipv4 *ipv4_key;
1208
1209 nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key));
1210 if (!nla)
1211 goto nla_put_failure;
1212 ipv4_key = nla_data(nla);
1213 ipv4_key->ipv4_src = swkey->ipv4.addr.src;
1214 ipv4_key->ipv4_dst = swkey->ipv4.addr.dst;
1215 ipv4_key->ipv4_proto = swkey->ip.proto;
1216 ipv4_key->ipv4_tos = swkey->ip.tos;
1217 ipv4_key->ipv4_ttl = swkey->ip.ttl;
1218 ipv4_key->ipv4_frag = swkey->ip.frag;
1219 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1220 struct ovs_key_ipv6 *ipv6_key;
1221
1222 nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key));
1223 if (!nla)
1224 goto nla_put_failure;
1225 ipv6_key = nla_data(nla);
1226 memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src,
1227 sizeof(ipv6_key->ipv6_src));
1228 memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst,
1229 sizeof(ipv6_key->ipv6_dst));
1230 ipv6_key->ipv6_label = swkey->ipv6.label;
1231 ipv6_key->ipv6_proto = swkey->ip.proto;
1232 ipv6_key->ipv6_tclass = swkey->ip.tos;
1233 ipv6_key->ipv6_hlimit = swkey->ip.ttl;
1234 ipv6_key->ipv6_frag = swkey->ip.frag;
1235 } else if (swkey->eth.type == htons(ETH_P_ARP)) {
1236 struct ovs_key_arp *arp_key;
1237
1238 nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key));
1239 if (!nla)
1240 goto nla_put_failure;
1241 arp_key = nla_data(nla);
1242 memset(arp_key, 0, sizeof(struct ovs_key_arp));
1243 arp_key->arp_sip = swkey->ipv4.addr.src;
1244 arp_key->arp_tip = swkey->ipv4.addr.dst;
1245 arp_key->arp_op = htons(swkey->ip.proto);
1246 memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
1247 memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
1248 }
1249
1250 if ((swkey->eth.type == htons(ETH_P_IP) ||
1251 swkey->eth.type == htons(ETH_P_IPV6)) &&
1252 swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
1253
1254 if (swkey->ip.proto == IPPROTO_TCP) {
1255 struct ovs_key_tcp *tcp_key;
1256
1257 nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key));
1258 if (!nla)
1259 goto nla_put_failure;
1260 tcp_key = nla_data(nla);
1261 if (swkey->eth.type == htons(ETH_P_IP)) {
1262 tcp_key->tcp_src = swkey->ipv4.tp.src;
1263 tcp_key->tcp_dst = swkey->ipv4.tp.dst;
1264 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1265 tcp_key->tcp_src = swkey->ipv6.tp.src;
1266 tcp_key->tcp_dst = swkey->ipv6.tp.dst;
1267 }
1268 } else if (swkey->ip.proto == IPPROTO_UDP) {
1269 struct ovs_key_udp *udp_key;
1270
1271 nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key));
1272 if (!nla)
1273 goto nla_put_failure;
1274 udp_key = nla_data(nla);
1275 if (swkey->eth.type == htons(ETH_P_IP)) {
1276 udp_key->udp_src = swkey->ipv4.tp.src;
1277 udp_key->udp_dst = swkey->ipv4.tp.dst;
1278 } else if (swkey->eth.type == htons(ETH_P_IPV6)) {
1279 udp_key->udp_src = swkey->ipv6.tp.src;
1280 udp_key->udp_dst = swkey->ipv6.tp.dst;
1281 }
1282 } else if (swkey->eth.type == htons(ETH_P_IP) &&
1283 swkey->ip.proto == IPPROTO_ICMP) {
1284 struct ovs_key_icmp *icmp_key;
1285
1286 nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key));
1287 if (!nla)
1288 goto nla_put_failure;
1289 icmp_key = nla_data(nla);
1290 icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src);
1291 icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst);
1292 } else if (swkey->eth.type == htons(ETH_P_IPV6) &&
1293 swkey->ip.proto == IPPROTO_ICMPV6) {
1294 struct ovs_key_icmpv6 *icmpv6_key;
1295
1296 nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6,
1297 sizeof(*icmpv6_key));
1298 if (!nla)
1299 goto nla_put_failure;
1300 icmpv6_key = nla_data(nla);
1301 icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src);
1302 icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst);
1303
1304 if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
1305 icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
1306 struct ovs_key_nd *nd_key;
1307
1308 nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key));
1309 if (!nla)
1310 goto nla_put_failure;
1311 nd_key = nla_data(nla);
1312 memcpy(nd_key->nd_target, &swkey->ipv6.nd.target,
1313 sizeof(nd_key->nd_target));
1314 memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN);
1315 memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN);
1316 }
1317 }
1318 }
1319
1320unencap:
1321 if (encap)
1322 nla_nest_end(skb, encap);
1323
1324 return 0;
1325
1326nla_put_failure:
1327 return -EMSGSIZE;
1328}
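/* Sketch of how a dump reply presumably wraps this serializer: the key
 * attributes are nested inside OVS_FLOW_ATTR_KEY, and the nest is closed
 * only once serialization succeeds:
 */
	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
	if (!nla)
		goto nla_put_failure;
	err = ovs_flow_to_nlattrs(&flow->key, skb);
	if (err)
		goto nla_put_failure;
	nla_nest_end(skb, nla);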
1329
1330/* Initializes the flow module.
1331 * Returns zero if successful or a negative error code. */
1332int ovs_flow_init(void)
1333{
1334 flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
1335 0, NULL);
1336 if (flow_cache == NULL)
1337 return -ENOMEM;
1338
1339 return 0;
1340}
1341
1342/* Uninitializes the flow module. */
1343void ovs_flow_exit(void)
1344{
1345 kmem_cache_destroy(flow_cache);
1346}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
new file mode 100644
index 00000000000..2747dc2c4ac
--- /dev/null
+++ b/net/openvswitch/flow.h
@@ -0,0 +1,199 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef FLOW_H
20#define FLOW_H 1
21
22#include <linux/kernel.h>
23#include <linux/netlink.h>
24#include <linux/openvswitch.h>
25#include <linux/spinlock.h>
26#include <linux/types.h>
27#include <linux/rcupdate.h>
28#include <linux/if_ether.h>
29#include <linux/in6.h>
30#include <linux/jiffies.h>
31#include <linux/time.h>
32#include <linux/flex_array.h>
33#include <net/inet_ecn.h>
34
35struct sk_buff;
36
37struct sw_flow_actions {
38 struct rcu_head rcu;
39 u32 actions_len;
40 struct nlattr actions[];
41};
42
43struct sw_flow_key {
44 struct {
45 u32 priority; /* Packet QoS priority. */
46 u16 in_port; /* Input switch port (or USHRT_MAX). */
47 } phy;
48 struct {
49 u8 src[ETH_ALEN]; /* Ethernet source address. */
50 u8 dst[ETH_ALEN]; /* Ethernet destination address. */
51 __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
52 __be16 type; /* Ethernet frame type. */
53 } eth;
54 struct {
55 u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
56 u8 tos; /* IP ToS. */
57 u8 ttl; /* IP TTL/hop limit. */
58 u8 frag; /* One of OVS_FRAG_TYPE_*. */
59 } ip;
60 union {
61 struct {
62 struct {
63 __be32 src; /* IP source address. */
64 __be32 dst; /* IP destination address. */
65 } addr;
66 union {
67 struct {
68 __be16 src; /* TCP/UDP source port. */
69 __be16 dst; /* TCP/UDP destination port. */
70 } tp;
71 struct {
72 u8 sha[ETH_ALEN]; /* ARP source hardware address. */
73 u8 tha[ETH_ALEN]; /* ARP target hardware address. */
74 } arp;
75 };
76 } ipv4;
77 struct {
78 struct {
79 struct in6_addr src; /* IPv6 source address. */
80 struct in6_addr dst; /* IPv6 destination address. */
81 } addr;
82 __be32 label; /* IPv6 flow label. */
83 struct {
84 __be16 src; /* TCP/UDP source port. */
85 __be16 dst; /* TCP/UDP destination port. */
86 } tp;
87 struct {
88 struct in6_addr target; /* ND target address. */
89 u8 sll[ETH_ALEN]; /* ND source link layer address. */
90 u8 tll[ETH_ALEN]; /* ND target link layer address. */
91 } nd;
92 } ipv6;
93 };
94};
95
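/* SW_FLOW_KEY_OFFSET(field), used throughout flow.c to bound how many
 * leading bytes of a key are valid, is presumably defined along these lines
 * (its definition is not shown in this excerpt):
 *
 *	#define SW_FLOW_KEY_OFFSET(field)			\
 *		(offsetof(struct sw_flow_key, field) +		\
 *		 FIELD_SIZEOF(struct sw_flow_key, field))
 *
 * i.e. the offset of 'field' plus its size, so the memcmp() over key_len
 * bytes in ovs_flow_tbl_lookup() compares exactly the populated prefix.
 */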
96struct sw_flow {
97 struct rcu_head rcu;
98 struct hlist_node hash_node[2];
99 u32 hash;
100
101 struct sw_flow_key key;
102 struct sw_flow_actions __rcu *sf_acts;
103
104 spinlock_t lock; /* Lock for values below. */
105 unsigned long used; /* Last used time (in jiffies). */
106 u64 packet_count; /* Number of packets matched. */
107 u64 byte_count; /* Number of bytes matched. */
108 u8 tcp_flags; /* Union of seen TCP flags. */
109};
110
111struct arp_eth_header {
112 __be16 ar_hrd; /* format of hardware address */
113 __be16 ar_pro; /* format of protocol address */
114 unsigned char ar_hln; /* length of hardware address */
115 unsigned char ar_pln; /* length of protocol address */
116 __be16 ar_op; /* ARP opcode (command) */
117
118 /* Ethernet+IPv4 specific members. */
119 unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */
120 unsigned char ar_sip[4]; /* sender IP address */
121 unsigned char ar_tha[ETH_ALEN]; /* target hardware address */
122 unsigned char ar_tip[4]; /* target IP address */
123} __packed;
124
125int ovs_flow_init(void);
126void ovs_flow_exit(void);
127
128struct sw_flow *ovs_flow_alloc(void);
129void ovs_flow_deferred_free(struct sw_flow *);
130void ovs_flow_free(struct sw_flow *flow);
131
132struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *);
133void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
134
135int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
136 int *key_lenp);
137void ovs_flow_used(struct sw_flow *, struct sk_buff *);
138u64 ovs_flow_used_time(unsigned long flow_jiffies);
139
140/* Upper bound on the length of a nlattr-formatted flow key. The longest
141 * nlattr-formatted flow key would be:
142 *
143 * struct pad nl hdr total
144 * ------ --- ------ -----
145 * OVS_KEY_ATTR_PRIORITY 4 -- 4 8
146 * OVS_KEY_ATTR_IN_PORT 4 -- 4 8
147 * OVS_KEY_ATTR_ETHERNET 12 -- 4 16
148 * OVS_KEY_ATTR_VLAN 2 2 4 8
149 * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8
150 * OVS_KEY_ATTR_IPV6 40 -- 4 44
151 * OVS_KEY_ATTR_ICMPV6 2 2 4 8
152 * OVS_KEY_ATTR_ND 28 -- 4 32
153 * -------------------------------------------------
154 * total 132
155 */
156#define FLOW_BUFSIZE 132
157
158int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
159int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
160 const struct nlattr *);
161int ovs_flow_metadata_from_nlattrs(u32 *priority, u16 *in_port,
162 const struct nlattr *);
163
164#define TBL_MIN_BUCKETS 1024
165
166struct flow_table {
167 struct flex_array *buckets;
168 unsigned int count, n_buckets;
169 struct rcu_head rcu;
170 int node_ver;
171 u32 hash_seed;
172 bool keep_flows;
173};
174
175static inline int ovs_flow_tbl_count(struct flow_table *table)
176{
177 return table->count;
178}
179
180static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
181{
182 return (table->count > table->n_buckets);
183}
184
185struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
186 struct sw_flow_key *key, int len);
187void ovs_flow_tbl_destroy(struct flow_table *table);
188void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
189struct flow_table *ovs_flow_tbl_alloc(int new_size);
190struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
191struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
192void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow);
193void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
194u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len);
195
196struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
197extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
198
199#endif /* flow.h */
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
new file mode 100644
index 00000000000..8fc28b86f2b
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.c
@@ -0,0 +1,241 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include <linux/hardirq.h>
20#include <linux/if_vlan.h>
21#include <linux/kernel.h>
22#include <linux/netdevice.h>
23#include <linux/etherdevice.h>
24#include <linux/ethtool.h>
25#include <linux/skbuff.h>
26#include <linux/version.h>
27
28#include "datapath.h"
29#include "vport-internal_dev.h"
30#include "vport-netdev.h"
31
32struct internal_dev {
33 struct vport *vport;
34};
35
36static struct internal_dev *internal_dev_priv(struct net_device *netdev)
37{
38 return netdev_priv(netdev);
39}
40
41/* This function is only called by the kernel network layer. */
42static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev,
43 struct rtnl_link_stats64 *stats)
44{
45 struct vport *vport = ovs_internal_dev_get_vport(netdev);
46 struct ovs_vport_stats vport_stats;
47
48 ovs_vport_get_stats(vport, &vport_stats);
49
50 /* The tx and rx stats need to be swapped because the
51 * switch and host OS have opposite perspectives. */
52 stats->rx_packets = vport_stats.tx_packets;
53 stats->tx_packets = vport_stats.rx_packets;
54 stats->rx_bytes = vport_stats.tx_bytes;
55 stats->tx_bytes = vport_stats.rx_bytes;
56 stats->rx_errors = vport_stats.tx_errors;
57 stats->tx_errors = vport_stats.rx_errors;
58 stats->rx_dropped = vport_stats.tx_dropped;
59 stats->tx_dropped = vport_stats.rx_dropped;
60
61 return stats;
62}
63
64static int internal_dev_mac_addr(struct net_device *dev, void *p)
65{
66 struct sockaddr *addr = p;
67
68 if (!is_valid_ether_addr(addr->sa_data))
69 return -EADDRNOTAVAIL;
70 memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
71 return 0;
72}
73
74/* Called with rcu_read_lock_bh. */
75static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
76{
77 rcu_read_lock();
78 ovs_vport_receive(internal_dev_priv(netdev)->vport, skb);
79 rcu_read_unlock();
80 return 0;
81}
82
83static int internal_dev_open(struct net_device *netdev)
84{
85 netif_start_queue(netdev);
86 return 0;
87}
88
89static int internal_dev_stop(struct net_device *netdev)
90{
91 netif_stop_queue(netdev);
92 return 0;
93}
94
95static void internal_dev_getinfo(struct net_device *netdev,
96 struct ethtool_drvinfo *info)
97{
98 strcpy(info->driver, "openvswitch");
99}
100
101static const struct ethtool_ops internal_dev_ethtool_ops = {
102 .get_drvinfo = internal_dev_getinfo,
103 .get_link = ethtool_op_get_link,
104};
105
106static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu)
107{
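	/* 68 is the minimum MTU an IPv4 host must accept (RFC 791). */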
108 if (new_mtu < 68)
109 return -EINVAL;
110
111 netdev->mtu = new_mtu;
112 return 0;
113}
114
115static void internal_dev_destructor(struct net_device *dev)
116{
117 struct vport *vport = ovs_internal_dev_get_vport(dev);
118
119 ovs_vport_free(vport);
120 free_netdev(dev);
121}
122
123static const struct net_device_ops internal_dev_netdev_ops = {
124 .ndo_open = internal_dev_open,
125 .ndo_stop = internal_dev_stop,
126 .ndo_start_xmit = internal_dev_xmit,
127 .ndo_set_mac_address = internal_dev_mac_addr,
128 .ndo_change_mtu = internal_dev_change_mtu,
129 .ndo_get_stats64 = internal_dev_get_stats,
130};
131
132static void do_setup(struct net_device *netdev)
133{
134 ether_setup(netdev);
135
136 netdev->netdev_ops = &internal_dev_netdev_ops;
137
138 netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
139 netdev->destructor = internal_dev_destructor;
140 SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops);
141 netdev->tx_queue_len = 0;
142
143 netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
144 NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO;
145
146 netdev->vlan_features = netdev->features;
147 netdev->features |= NETIF_F_HW_VLAN_TX;
148 netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
149 random_ether_addr(netdev->dev_addr);
150}
151
152static struct vport *internal_dev_create(const struct vport_parms *parms)
153{
154 struct vport *vport;
155 struct netdev_vport *netdev_vport;
156 struct internal_dev *internal_dev;
157 int err;
158
159 vport = ovs_vport_alloc(sizeof(struct netdev_vport),
160 &ovs_internal_vport_ops, parms);
161 if (IS_ERR(vport)) {
162 err = PTR_ERR(vport);
163 goto error;
164 }
165
166 netdev_vport = netdev_vport_priv(vport);
167
168 netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
169 parms->name, do_setup);
170 if (!netdev_vport->dev) {
171 err = -ENOMEM;
172 goto error_free_vport;
173 }
174
175 internal_dev = internal_dev_priv(netdev_vport->dev);
176 internal_dev->vport = vport;
177
178 err = register_netdevice(netdev_vport->dev);
179 if (err)
180 goto error_free_netdev;
181
182 dev_set_promiscuity(netdev_vport->dev, 1);
183 netif_start_queue(netdev_vport->dev);
184
185 return vport;
186
187error_free_netdev:
188 free_netdev(netdev_vport->dev);
189error_free_vport:
190 ovs_vport_free(vport);
191error:
192 return ERR_PTR(err);
193}
194
195static void internal_dev_destroy(struct vport *vport)
196{
197 struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
198
199 netif_stop_queue(netdev_vport->dev);
200 dev_set_promiscuity(netdev_vport->dev, -1);
201
202 /* unregister_netdevice() waits for an RCU grace period. */
203 unregister_netdevice(netdev_vport->dev);
204}
205
206static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
207{
208 struct net_device *netdev = netdev_vport_priv(vport)->dev;
209 int len;
210
211 len = skb->len;
212 skb->dev = netdev;
213 skb->pkt_type = PACKET_HOST;
214 skb->protocol = eth_type_trans(skb, netdev);
215
216 netif_rx(skb);
217
218 return len;
219}
220
221const struct vport_ops ovs_internal_vport_ops = {
222 .type = OVS_VPORT_TYPE_INTERNAL,
223 .create = internal_dev_create,
224 .destroy = internal_dev_destroy,
225 .get_name = ovs_netdev_get_name,
226 .get_ifindex = ovs_netdev_get_ifindex,
227 .send = internal_dev_recv,
228};
229
230int ovs_is_internal_dev(const struct net_device *netdev)
231{
232 return netdev->netdev_ops == &internal_dev_netdev_ops;
233}
234
235struct vport *ovs_internal_dev_get_vport(struct net_device *netdev)
236{
237 if (!ovs_is_internal_dev(netdev))
238 return NULL;
239
240 return internal_dev_priv(netdev)->vport;
241}
diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h
new file mode 100644
index 00000000000..3454447c5f1
--- /dev/null
+++ b/net/openvswitch/vport-internal_dev.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef VPORT_INTERNAL_DEV_H
20#define VPORT_INTERNAL_DEV_H 1
21
22#include "datapath.h"
23#include "vport.h"
24
25int ovs_is_internal_dev(const struct net_device *);
26struct vport *ovs_internal_dev_get_vport(struct net_device *);
27
28#endif /* vport-internal_dev.h */
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
new file mode 100644
index 00000000000..c1068aed03d
--- /dev/null
+++ b/net/openvswitch/vport-netdev.c
@@ -0,0 +1,198 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/if_arp.h>
22#include <linux/if_bridge.h>
23#include <linux/if_vlan.h>
24#include <linux/kernel.h>
25#include <linux/llc.h>
26#include <linux/rtnetlink.h>
27#include <linux/skbuff.h>
28
29#include <net/llc.h>
30
31#include "datapath.h"
32#include "vport-internal_dev.h"
33#include "vport-netdev.h"
34
35/* Must be called with rcu_read_lock. */
36static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
37{
38 if (unlikely(!vport)) {
39 kfree_skb(skb);
40 return;
41 }
42
43	/* Make our own copy of the packet. Otherwise we will mangle the
44	 * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
45	 * (No one comes after us, since our rx_handler returns
46	 * RX_HANDLER_CONSUMED to say that we took the packet.) */
47 skb = skb_share_check(skb, GFP_ATOMIC);
48 if (unlikely(!skb))
49 return;
50
51 skb_push(skb, ETH_HLEN);
52 ovs_vport_receive(vport, skb);
53}
54
55/* Called with rcu_read_lock and bottom-halves disabled. */
56static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
57{
58 struct sk_buff *skb = *pskb;
59 struct vport *vport;
60
61 if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
62 return RX_HANDLER_PASS;
63
64 vport = ovs_netdev_get_vport(skb->dev);
65
66 netdev_port_receive(vport, skb);
67
68 return RX_HANDLER_CONSUMED;
69}
70
71static struct vport *netdev_create(const struct vport_parms *parms)
72{
73 struct vport *vport;
74 struct netdev_vport *netdev_vport;
75 int err;
76
77 vport = ovs_vport_alloc(sizeof(struct netdev_vport),
78 &ovs_netdev_vport_ops, parms);
79 if (IS_ERR(vport)) {
80 err = PTR_ERR(vport);
81 goto error;
82 }
83
84 netdev_vport = netdev_vport_priv(vport);
85
86 netdev_vport->dev = dev_get_by_name(&init_net, parms->name);
87 if (!netdev_vport->dev) {
88 err = -ENODEV;
89 goto error_free_vport;
90 }
91
92 if (netdev_vport->dev->flags & IFF_LOOPBACK ||
93 netdev_vport->dev->type != ARPHRD_ETHER ||
94 ovs_is_internal_dev(netdev_vport->dev)) {
95 err = -EINVAL;
96 goto error_put;
97 }
98
99 err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
100 vport);
101 if (err)
102 goto error_put;
103
104 dev_set_promiscuity(netdev_vport->dev, 1);
105 netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
106
107 return vport;
108
109error_put:
110 dev_put(netdev_vport->dev);
111error_free_vport:
112 ovs_vport_free(vport);
113error:
114 return ERR_PTR(err);
115}
116
117static void netdev_destroy(struct vport *vport)
118{
119 struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
120
121 netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
122 netdev_rx_handler_unregister(netdev_vport->dev);
123 dev_set_promiscuity(netdev_vport->dev, -1);
124
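	/* Wait for any in-flight netdev_frame_hook() calls to complete
	 * before releasing our reference to the device. */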
125 synchronize_rcu();
126
127 dev_put(netdev_vport->dev);
128 ovs_vport_free(vport);
129}
130
131const char *ovs_netdev_get_name(const struct vport *vport)
132{
133 const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
134 return netdev_vport->dev->name;
135}
136
137int ovs_netdev_get_ifindex(const struct vport *vport)
138{
139 const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
140 return netdev_vport->dev->ifindex;
141}
142
143static unsigned packet_length(const struct sk_buff *skb)
144{
145 unsigned length = skb->len - ETH_HLEN;
146
147 if (skb->protocol == htons(ETH_P_8021Q))
148 length -= VLAN_HLEN;
149
150 return length;
151}
152
153static int netdev_send(struct vport *vport, struct sk_buff *skb)
154{
155 struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
156 int mtu = netdev_vport->dev->mtu;
157 int len;
158
159 if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
160 if (net_ratelimit())
161 pr_warn("%s: dropped over-mtu packet: %d > %d\n",
162 ovs_dp_name(vport->dp), packet_length(skb), mtu);
163 goto error;
164 }
165
166 if (unlikely(skb_warn_if_lro(skb)))
167 goto error;
168
169 skb->dev = netdev_vport->dev;
170 len = skb->len;
171 dev_queue_xmit(skb);
172
173 return len;
174
175error:
176 kfree_skb(skb);
177 ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
178 return 0;
179}
180
181/* Returns NULL if this device is not attached to a datapath. */
182struct vport *ovs_netdev_get_vport(struct net_device *dev)
183{
184 if (likely(dev->priv_flags & IFF_OVS_DATAPATH))
185 return (struct vport *)
186 rcu_dereference_rtnl(dev->rx_handler_data);
187 else
188 return NULL;
189}
190
191const struct vport_ops ovs_netdev_vport_ops = {
192 .type = OVS_VPORT_TYPE_NETDEV,
193 .create = netdev_create,
194 .destroy = netdev_destroy,
195 .get_name = ovs_netdev_get_name,
196 .get_ifindex = ovs_netdev_get_ifindex,
197 .send = netdev_send,
198};
diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h
new file mode 100644
index 00000000000..fd9b008a0e6
--- /dev/null
+++ b/net/openvswitch/vport-netdev.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef VPORT_NETDEV_H
20#define VPORT_NETDEV_H 1
21
22#include <linux/netdevice.h>
23
24#include "vport.h"
25
26struct vport *ovs_netdev_get_vport(struct net_device *dev);
27
28struct netdev_vport {
29 struct net_device *dev;
30};
31
32static inline struct netdev_vport *
33netdev_vport_priv(const struct vport *vport)
34{
35 return vport_priv(vport);
36}
37
38const char *ovs_netdev_get_name(const struct vport *);
39const char *ovs_netdev_get_config(const struct vport *);
40int ovs_netdev_get_ifindex(const struct vport *);
41
42#endif /* vport_netdev.h */
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
new file mode 100644
index 00000000000..6cd760131f1
--- /dev/null
+++ b/net/openvswitch/vport.c
@@ -0,0 +1,396 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#include <linux/dcache.h>
20#include <linux/etherdevice.h>
21#include <linux/if.h>
22#include <linux/if_vlan.h>
23#include <linux/kernel.h>
24#include <linux/list.h>
25#include <linux/mutex.h>
26#include <linux/percpu.h>
27#include <linux/rcupdate.h>
28#include <linux/rtnetlink.h>
29#include <linux/compat.h>
30#include <linux/version.h>
31
32#include "vport.h"
33#include "vport-internal_dev.h"
34
35/* List of statically compiled vport implementations. Don't forget to also
36 * add yours to the list at the bottom of vport.h. */
37static const struct vport_ops *vport_ops_list[] = {
38 &ovs_netdev_vport_ops,
39 &ovs_internal_vport_ops,
40};
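/* Illustrative sketch: a new statically compiled vport type supplies its own
 * ops table and is added to the list above. The type value and function
 * names here are hypothetical:
 *
 *	const struct vport_ops ovs_example_vport_ops = {
 *		.type		= OVS_VPORT_TYPE_EXAMPLE,
 *		.create		= example_create,
 *		.destroy	= example_destroy,
 *		.get_name	= example_get_name,
 *		.send		= example_send,
 *	};
 */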
41
42/* Protected by RCU read lock for reading, RTNL lock for writing. */
43static struct hlist_head *dev_table;
44#define VPORT_HASH_BUCKETS 1024
45
46/**
47 * ovs_vport_init - initialize vport subsystem
48 *
49 * Called at module load time to initialize the vport subsystem.
50 */
51int ovs_vport_init(void)
52{
53 dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
54 GFP_KERNEL);
55 if (!dev_table)
56 return -ENOMEM;
57
58 return 0;
59}
60
61/**
62 * ovs_vport_exit - shutdown vport subsystem
63 *
64 * Called at module exit time to shutdown the vport subsystem.
65 */
66void ovs_vport_exit(void)
67{
68 kfree(dev_table);
69}
70
71static struct hlist_head *hash_bucket(const char *name)
72{
73 unsigned int hash = full_name_hash(name, strlen(name));
74 return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)];
75}
76
77/**
78 * ovs_vport_locate - find a port that has already been created
79 *
80 * @name: name of port to find
81 *
82 * Must be called with RTNL or RCU read lock.
83 */
84struct vport *ovs_vport_locate(const char *name)
85{
86 struct hlist_head *bucket = hash_bucket(name);
87 struct vport *vport;
88 struct hlist_node *node;
89
90 hlist_for_each_entry_rcu(vport, node, bucket, hash_node)
91 if (!strcmp(name, vport->ops->get_name(vport)))
92 return vport;
93
94 return NULL;
95}
96
97/**
98 * ovs_vport_alloc - allocate and initialize new vport
99 *
100 * @priv_size: Size of private data area to allocate.
101 * @ops: vport device ops
102 *
103 * Allocate and initialize a new vport defined by @ops. The vport will contain
104 * a private data area of size @priv_size that can be accessed using
105 * vport_priv(). vports that are no longer needed should be released with
106 * vport_free().
107 */
108struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
109 const struct vport_parms *parms)
110{
111 struct vport *vport;
112 size_t alloc_size;
113
114 alloc_size = sizeof(struct vport);
115 if (priv_size) {
116 alloc_size = ALIGN(alloc_size, VPORT_ALIGN);
117 alloc_size += priv_size;
118 }
119
120 vport = kzalloc(alloc_size, GFP_KERNEL);
121 if (!vport)
122 return ERR_PTR(-ENOMEM);
123
124 vport->dp = parms->dp;
125 vport->port_no = parms->port_no;
126 vport->upcall_pid = parms->upcall_pid;
127 vport->ops = ops;
128
129 vport->percpu_stats = alloc_percpu(struct vport_percpu_stats);
130	if (!vport->percpu_stats) {
		kfree(vport);	/* don't leak the vport on alloc failure */
		return ERR_PTR(-ENOMEM);
	}
132
133 spin_lock_init(&vport->stats_lock);
134
135 return vport;
136}
137
138/**
139 * ovs_vport_free - uninitialize and free vport
140 *
141 * @vport: vport to free
142 *
143 * Frees a vport allocated with vport_alloc() when it is no longer needed.
144 *
145 * The caller must ensure that an RCU grace period has passed since the last
146 * time @vport was in a datapath.
147 */
148void ovs_vport_free(struct vport *vport)
149{
150 free_percpu(vport->percpu_stats);
151 kfree(vport);
152}
153
154/**
155 * ovs_vport_add - add vport device (for kernel callers)
156 *
157 * @parms: Information about new vport.
158 *
159 * Creates a new vport with the specified configuration (which is dependent on
160 * device type). RTNL lock must be held.
161 */
162struct vport *ovs_vport_add(const struct vport_parms *parms)
163{
164 struct vport *vport;
165 int err = 0;
166 int i;
167
168 ASSERT_RTNL();
169
170 for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) {
171 if (vport_ops_list[i]->type == parms->type) {
172 vport = vport_ops_list[i]->create(parms);
173 if (IS_ERR(vport)) {
174 err = PTR_ERR(vport);
175 goto out;
176 }
177
178 hlist_add_head_rcu(&vport->hash_node,
179 hash_bucket(vport->ops->get_name(vport)));
180 return vport;
181 }
182 }
183
184 err = -EAFNOSUPPORT;
185
186out:
187 return ERR_PTR(err);
188}
189
190/**
191 * ovs_vport_set_options - modify existing vport device (for kernel callers)
192 *
193 * @vport: vport to modify.
194 * @port: New configuration.
195 *
196 * Modifies an existing device with the specified configuration (which is
197 * dependent on device type). RTNL lock must be held.
198 */
199int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
200{
201 ASSERT_RTNL();
202
203 if (!vport->ops->set_options)
204 return -EOPNOTSUPP;
205 return vport->ops->set_options(vport, options);
206}
207
208/**
209 * ovs_vport_del - delete existing vport device
210 *
211 * @vport: vport to delete.
212 *
213 * Detaches @vport from its datapath and destroys it. It is possible to fail
214 * for reasons such as lack of memory. RTNL lock must be held.
215 */
216void ovs_vport_del(struct vport *vport)
217{
218 ASSERT_RTNL();
219
220 hlist_del_rcu(&vport->hash_node);
221
222 vport->ops->destroy(vport);
223}
224
225/**
226 * ovs_vport_get_stats - retrieve device stats
227 *
228 * @vport: vport from which to retrieve the stats
229 * @stats: location to store stats
230 *
231 * Retrieves transmit, receive, and error stats for the given device.
232 *
233 * Must be called with RTNL lock or rcu_read_lock.
234 */
235void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
236{
237 int i;
238
239 memset(stats, 0, sizeof(*stats));
240
241	/* We potentially have two sources of stats that need to be combined:
242	 * those we have collected ourselves (split into err_stats and
243	 * percpu_stats) and device error stats from netdev->get_stats() (for
244	 * errors that happen downstream and therefore aren't reported through
245	 * our vport_record_error() function).
246	 * Stats from the first source are reported by OVS (OVS_VPORT_ATTR_STATS);
247	 * netdev stats can be read directly via netlink or ioctl.
248	 */
249
250 spin_lock_bh(&vport->stats_lock);
251
252 stats->rx_errors = vport->err_stats.rx_errors;
253 stats->tx_errors = vport->err_stats.tx_errors;
254 stats->tx_dropped = vport->err_stats.tx_dropped;
255 stats->rx_dropped = vport->err_stats.rx_dropped;
256
257 spin_unlock_bh(&vport->stats_lock);
258
259 for_each_possible_cpu(i) {
260 const struct vport_percpu_stats *percpu_stats;
261 struct vport_percpu_stats local_stats;
262 unsigned int start;
263
264 percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
265
266 do {
267 start = u64_stats_fetch_begin_bh(&percpu_stats->sync);
268 local_stats = *percpu_stats;
269 } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start));
270
271 stats->rx_bytes += local_stats.rx_bytes;
272 stats->rx_packets += local_stats.rx_packets;
273 stats->tx_bytes += local_stats.tx_bytes;
274 stats->tx_packets += local_stats.tx_packets;
275 }
276}
277
278/**
279 * ovs_vport_get_options - retrieve device options
280 *
281 * @vport: vport from which to retrieve the options.
282 * @skb: sk_buff where options should be appended.
283 *
284 * Retrieves the configuration of the given device, appending an
285 * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested
286 * vport-specific attributes to @skb.
287 *
288 * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another
289 * negative error code if a real error occurred. If an error occurs, @skb is
290 * left unmodified.
291 *
292 * Must be called with RTNL lock or rcu_read_lock.
293 */
294int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
295{
296 struct nlattr *nla;
297
298 nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS);
299 if (!nla)
300 return -EMSGSIZE;
301
302 if (vport->ops->get_options) {
303 int err = vport->ops->get_options(vport, skb);
304 if (err) {
305 nla_nest_cancel(skb, nla);
306 return err;
307 }
308 }
309
310 nla_nest_end(skb, nla);
311 return 0;
312}
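
/* A minimal sketch (not part of the original patch) of a vport type's
 * get_options() callback.  It appends only its own attributes; the
 * enclosing OVS_VPORT_ATTR_OPTIONS nest is opened and closed by
 * ovs_vport_get_options() above.  EXAMPLE_ATTR_MTU is a hypothetical,
 * type-private attribute number.
 */
static int example_get_options(const struct vport *vport, struct sk_buff *skb)
{
	if (nla_put_u32(skb, EXAMPLE_ATTR_MTU, 1500))
		return -EMSGSIZE;
	return 0;
}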
313
314/**
315 * ovs_vport_receive - pass up received packet to the datapath for processing
316 *
317 * @vport: vport that received the packet
318 * @skb: skb that was received
319 *
320 * Must be called with rcu_read_lock. The packet cannot be shared and
321 * skb->data should point to the Ethernet header. The caller must have
322 * already initialized the skb's checksum fields (skb->ip_summed).
323 */
324void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
325{
326 struct vport_percpu_stats *stats;
327
328 stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
329
330 u64_stats_update_begin(&stats->sync);
331 stats->rx_packets++;
332 stats->rx_bytes += skb->len;
333 u64_stats_update_end(&stats->sync);
334
335 ovs_dp_process_received_packet(vport, skb);
336}
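
/* A minimal sketch (not part of the original patch) of the receive
 * path of a device-backed vport.  The hook name is hypothetical; the
 * real netdev vport attaches through an rx_handler, which already runs
 * under rcu_read_lock and has pulled the Ethernet header.
 */
static void example_port_receive(struct vport *vport, struct sk_buff *skb)
{
	rcu_read_lock();
	/* Make skb->data point at the Ethernet header, as required. */
	skb_push(skb, ETH_HLEN);
	ovs_vport_receive(vport, skb);
	rcu_read_unlock();
}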
337
338/**
339 * ovs_vport_send - send a packet on a device
340 *
341 * @vport: vport on which to send the packet
342 * @skb: skb to send
343 *
344 * Sends the given packet and returns the length of data sent. Either RTNL
345 * lock or rcu_read_lock must be held.
346 */
347int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
348{
349 int sent = vport->ops->send(vport, skb);
350
351 if (likely(sent)) {
352 struct vport_percpu_stats *stats;
353
354 stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
355
356 u64_stats_update_begin(&stats->sync);
357 stats->tx_packets++;
358 stats->tx_bytes += sent;
359 u64_stats_update_end(&stats->sync);
360 }
361 return sent;
362}
363
364/**
365 * ovs_vport_record_error - indicate device error to generic stats layer
366 *
367 * @vport: vport that encountered the error
368 * @err_type: one of enum vport_err_type types to indicate the error type
369 *
370 * If using the vport generic stats layer, indicate that an error of the
371 * given type has occurred.
372 */
373void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type)
374{
375 spin_lock(&vport->stats_lock);
376
377 switch (err_type) {
378 case VPORT_E_RX_DROPPED:
379 vport->err_stats.rx_dropped++;
380 break;
381
382 case VPORT_E_RX_ERROR:
383 vport->err_stats.rx_errors++;
384 break;
385
386 case VPORT_E_TX_DROPPED:
387 vport->err_stats.tx_dropped++;
388 break;
389
390 case VPORT_E_TX_ERROR:
391 vport->err_stats.tx_errors++;
392 break;
393 }
394
395 spin_unlock(&vport->stats_lock);
396}
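
/* A minimal sketch (not part of the original patch) of a vport type's
 * send() callback pairing transmission with the error accounting above.
 * example_hw_transmit() is a hypothetical transmit primitive that
 * consumes the skb and returns nonzero on failure.
 */
static int example_send(struct vport *vport, struct sk_buff *skb)
{
	int len = skb->len;

	if (example_hw_transmit(skb)) {
		ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
		return 0;	/* nothing sent; tx stats stay untouched */
	}
	return len;	/* counted as tx_bytes by ovs_vport_send() */
}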
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
new file mode 100644
index 00000000000..19609629dab
--- /dev/null
+++ b/net/openvswitch/vport.h
@@ -0,0 +1,205 @@
1/*
2 * Copyright (c) 2007-2011 Nicira Networks.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#ifndef VPORT_H
20#define VPORT_H 1
21
22#include <linux/list.h>
23#include <linux/openvswitch.h>
24#include <linux/skbuff.h>
25#include <linux/spinlock.h>
26#include <linux/u64_stats_sync.h>
27
28#include "datapath.h"
29
30struct vport;
31struct vport_parms;
32
33/* The following definitions are for users of the vport subsystem: */
34
35int ovs_vport_init(void);
36void ovs_vport_exit(void);
37
38struct vport *ovs_vport_add(const struct vport_parms *);
39void ovs_vport_del(struct vport *);
40
41struct vport *ovs_vport_locate(const char *name);
42
43void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *);
44
45int ovs_vport_set_options(struct vport *, struct nlattr *options);
46int ovs_vport_get_options(const struct vport *, struct sk_buff *);
47
48int ovs_vport_send(struct vport *, struct sk_buff *);
49
50/* The following definitions are for implementers of vport devices: */
51
52struct vport_percpu_stats {
53 u64 rx_bytes;
54 u64 rx_packets;
55 u64 tx_bytes;
56 u64 tx_packets;
57 struct u64_stats_sync sync;
58};
59
60struct vport_err_stats {
61 u64 rx_dropped;
62 u64 rx_errors;
63 u64 tx_dropped;
64 u64 tx_errors;
65};
66
67/**
68 * struct vport - one port within a datapath
69 * @rcu: RCU callback head for deferred destruction.
70 * @port_no: Index into @dp's @ports array.
71 * @dp: Datapath to which this port belongs.
72 * @node: Element in @dp's @port_list.
73 * @upcall_pid: The Netlink port to use for packets received on this port that
74 * miss the flow table.
75 * @hash_node: Element in @dev_table hash table in vport.c.
76 * @ops: Class structure.
77 * @percpu_stats: Points to per-CPU statistics used and maintained by the vport.
78 * @stats_lock: Protects @err_stats.
79 * @err_stats: Error statistics used and maintained by the vport.
80 */
81struct vport {
82 struct rcu_head rcu;
83 u16 port_no;
84 struct datapath *dp;
85 struct list_head node;
86 u32 upcall_pid;
87
88 struct hlist_node hash_node;
89 const struct vport_ops *ops;
90
91 struct vport_percpu_stats __percpu *percpu_stats;
92
93 spinlock_t stats_lock;
94 struct vport_err_stats err_stats;
95};
96
97/**
98 * struct vport_parms - parameters for creating a new vport
99 *
100 * @name: New vport's name.
101 * @type: New vport's type.
102 * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if
103 * none was supplied.
104 * @dp: New vport's datapath.
105 * @port_no: New vport's port number.
106 * @upcall_pid: The Netlink port to use for packets that miss the flow table.
107 */
107struct vport_parms {
108 const char *name;
109 enum ovs_vport_type type;
110 struct nlattr *options;
111
112 /* For ovs_vport_alloc(). */
113 struct datapath *dp;
114 u16 port_no;
115 u32 upcall_pid;
116};
117
118/**
119 * struct vport_ops - definition of a type of virtual port
120 *
121 * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
122 * @create: Create a new vport configured as specified. On success returns
123 * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
124 * @destroy: Destroys a vport. Must call ovs_vport_free() on the vport but not
125 * before an RCU grace period has elapsed.
126 * @set_options: Modify the configuration of an existing vport. May be %NULL
127 * if modification is not supported.
128 * @get_options: Appends vport-specific attributes for the configuration of an
129 * existing vport to a &struct sk_buff. May be %NULL for a vport that does not
130 * have any configuration.
131 * @get_name: Get the device's name.
132 * @get_config: Get the device's configuration.
133 * @get_ifindex: Get the system interface index associated with the device.
134 * May be %NULL if the device does not have an ifindex.
135 * @send: Send a packet on the device. Returns the length of the packet sent.
136 */
137struct vport_ops {
138 enum ovs_vport_type type;
139
140 /* Called with RTNL lock. */
141 struct vport *(*create)(const struct vport_parms *);
142 void (*destroy)(struct vport *);
143
144 int (*set_options)(struct vport *, struct nlattr *);
145 int (*get_options)(const struct vport *, struct sk_buff *);
146
147 /* Called with rcu_read_lock or RTNL lock. */
148 const char *(*get_name)(const struct vport *);
149 void (*get_config)(const struct vport *, void *);
150 int (*get_ifindex)(const struct vport *);
151
152 int (*send)(struct vport *, struct sk_buff *);
153};
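
/* A minimal sketch (not part of the original patch) of a statically
 * defined vport type.  example_destroy() and example_send() are
 * sketched earlier; the remaining callbacks are hypothetical, and
 * optional ones are omitted.  ovs_netdev_vport_ops in vport-netdev.c
 * is a real implementation.
 */
static const struct vport_ops example_vport_ops = {
	.type		= OVS_VPORT_TYPE_NETDEV,
	.create		= example_create,
	.destroy	= example_destroy,
	.get_name	= example_get_name,
	.send		= example_send,
};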
154
155enum vport_err_type {
156 VPORT_E_RX_DROPPED,
157 VPORT_E_RX_ERROR,
158 VPORT_E_TX_DROPPED,
159 VPORT_E_TX_ERROR,
160};
161
162struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
163 const struct vport_parms *);
164void ovs_vport_free(struct vport *);
165
166#define VPORT_ALIGN 8
167
168/**
169 * vport_priv - access private data area of vport
170 *
171 * @vport: vport to access
172 *
173 * If a nonzero size was passed in priv_size of ovs_vport_alloc(), a private
174 * data area was allocated on creation. This allows that area to be accessed and
175 * used for any purpose needed by the vport implementer.
176 */
177static inline void *vport_priv(const struct vport *vport)
178{
179 return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
180}
181
182/**
183 * vport_from_priv - lookup vport from private data pointer
184 *
185 * @priv: Start of private data area.
186 *
187 * It is sometimes useful to translate from a pointer to the private data
188 * area to the vport, such as in the case where the private data pointer is
189 * the result of a hash table lookup. @priv must point to the start of the
190 * private data area.
191 */
192static inline struct vport *vport_from_priv(const void *priv)
193{
194 return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
195}
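
/* A minimal sketch (not part of the original patch) of how a vport
 * implementation might use the private data area.  struct example_priv
 * and the accessor are hypothetical; the area would be sized by passing
 * sizeof(struct example_priv) as priv_size to ovs_vport_alloc().
 */
struct example_priv {
	struct net_device *dev;
};

static inline struct net_device *example_get_dev(const struct vport *vport)
{
	struct example_priv *priv = vport_priv(vport);

	return priv->dev;
}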
196
197void ovs_vport_receive(struct vport *, struct sk_buff *);
198void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
199
200/* List of statically compiled vport implementations. Don't forget to also
201 * add yours to the list at the top of vport.c. */
202extern const struct vport_ops ovs_netdev_vport_ops;
203extern const struct vport_ops ovs_internal_vport_ops;
204
205#endif /* VPORT_H */