diff options
author | Jesse Gross <jesse@nicira.com> | 2011-10-25 22:26:31 -0400 |
---|---|---|
committer | Jesse Gross <jesse@nicira.com> | 2011-12-03 12:35:17 -0500 |
commit | ccb1352e76cff0524e7ccb2074826a092dd13016 (patch) | |
tree | 9122ceff5d75ec64e327a9fad4ad2013744c2999 /net/openvswitch/vport.c | |
parent | 75f2811c6460ccc59d83c66059943ce9c9f81a18 (diff) |
net: Add Open vSwitch kernel components.
Open vSwitch is a multilayer Ethernet switch targeted at virtualized
environments. In addition to supporting a variety of features
expected in a traditional hardware switch, it enables fine-grained
programmatic extension and flow-based control of the network.
This control is useful in a wide variety of applications but is
particularly important in multi-server virtualization deployments,
which are often characterized by highly dynamic endpoints and the need
to maintain logical abstractions for multiple tenants.
The Open vSwitch datapath provides an in-kernel fast path for packet
forwarding. It is complemented by a userspace daemon, ovs-vswitchd,
which is able to accept configuration from a variety of sources and
translate it into packet processing rules.
See http://openvswitch.org for more information and userspace
utilities.
Signed-off-by: Jesse Gross <jesse@nicira.com>
Diffstat (limited to 'net/openvswitch/vport.c')
-rw-r--r-- | net/openvswitch/vport.c | 396 |
1 files changed, 396 insertions, 0 deletions
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c new file mode 100644 index 000000000000..6cd760131f15 --- /dev/null +++ b/net/openvswitch/vport.c | |||
@@ -0,0 +1,396 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2007-2011 Nicira Networks. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of version 2 of the GNU General Public | ||
6 | * License as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but | ||
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write to the Free Software | ||
15 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | ||
16 | * 02110-1301, USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/dcache.h> | ||
20 | #include <linux/etherdevice.h> | ||
21 | #include <linux/if.h> | ||
22 | #include <linux/if_vlan.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/list.h> | ||
25 | #include <linux/mutex.h> | ||
26 | #include <linux/percpu.h> | ||
27 | #include <linux/rcupdate.h> | ||
28 | #include <linux/rtnetlink.h> | ||
29 | #include <linux/compat.h> | ||
30 | #include <linux/version.h> | ||
31 | |||
32 | #include "vport.h" | ||
33 | #include "vport-internal_dev.h" | ||
34 | |||
35 | /* List of statically compiled vport implementations. Don't forget to also | ||
36 | * add yours to the list at the bottom of vport.h. */ | ||
37 | static const struct vport_ops *vport_ops_list[] = { | ||
38 | &ovs_netdev_vport_ops, | ||
39 | &ovs_internal_vport_ops, | ||
40 | }; | ||
41 | |||
/*
 * Number of buckets in dev_table.  hash_bucket() selects a bucket by masking
 * the hash with (VPORT_HASH_BUCKETS - 1), so this must remain a power of two.
 */
#define VPORT_HASH_BUCKETS 1024

/*
 * Hash table of vports, keyed by port name.
 * Protected by RCU read lock for reading, RTNL lock for writing.
 */
static struct hlist_head *dev_table;
45 | |||
46 | /** | ||
47 | * ovs_vport_init - initialize vport subsystem | ||
48 | * | ||
49 | * Called at module load time to initialize the vport subsystem. | ||
50 | */ | ||
51 | int ovs_vport_init(void) | ||
52 | { | ||
53 | dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), | ||
54 | GFP_KERNEL); | ||
55 | if (!dev_table) | ||
56 | return -ENOMEM; | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | /** | ||
62 | * ovs_vport_exit - shutdown vport subsystem | ||
63 | * | ||
64 | * Called at module exit time to shutdown the vport subsystem. | ||
65 | */ | ||
66 | void ovs_vport_exit(void) | ||
67 | { | ||
68 | kfree(dev_table); | ||
69 | } | ||
70 | |||
71 | static struct hlist_head *hash_bucket(const char *name) | ||
72 | { | ||
73 | unsigned int hash = full_name_hash(name, strlen(name)); | ||
74 | return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; | ||
75 | } | ||
76 | |||
77 | /** | ||
78 | * ovs_vport_locate - find a port that has already been created | ||
79 | * | ||
80 | * @name: name of port to find | ||
81 | * | ||
82 | * Must be called with RTNL or RCU read lock. | ||
83 | */ | ||
84 | struct vport *ovs_vport_locate(const char *name) | ||
85 | { | ||
86 | struct hlist_head *bucket = hash_bucket(name); | ||
87 | struct vport *vport; | ||
88 | struct hlist_node *node; | ||
89 | |||
90 | hlist_for_each_entry_rcu(vport, node, bucket, hash_node) | ||
91 | if (!strcmp(name, vport->ops->get_name(vport))) | ||
92 | return vport; | ||
93 | |||
94 | return NULL; | ||
95 | } | ||
96 | |||
97 | /** | ||
98 | * ovs_vport_alloc - allocate and initialize new vport | ||
99 | * | ||
100 | * @priv_size: Size of private data area to allocate. | ||
101 | * @ops: vport device ops | ||
102 | * | ||
103 | * Allocate and initialize a new vport defined by @ops. The vport will contain | ||
104 | * a private data area of size @priv_size that can be accessed using | ||
105 | * vport_priv(). vports that are no longer needed should be released with | ||
106 | * vport_free(). | ||
107 | */ | ||
108 | struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, | ||
109 | const struct vport_parms *parms) | ||
110 | { | ||
111 | struct vport *vport; | ||
112 | size_t alloc_size; | ||
113 | |||
114 | alloc_size = sizeof(struct vport); | ||
115 | if (priv_size) { | ||
116 | alloc_size = ALIGN(alloc_size, VPORT_ALIGN); | ||
117 | alloc_size += priv_size; | ||
118 | } | ||
119 | |||
120 | vport = kzalloc(alloc_size, GFP_KERNEL); | ||
121 | if (!vport) | ||
122 | return ERR_PTR(-ENOMEM); | ||
123 | |||
124 | vport->dp = parms->dp; | ||
125 | vport->port_no = parms->port_no; | ||
126 | vport->upcall_pid = parms->upcall_pid; | ||
127 | vport->ops = ops; | ||
128 | |||
129 | vport->percpu_stats = alloc_percpu(struct vport_percpu_stats); | ||
130 | if (!vport->percpu_stats) | ||
131 | return ERR_PTR(-ENOMEM); | ||
132 | |||
133 | spin_lock_init(&vport->stats_lock); | ||
134 | |||
135 | return vport; | ||
136 | } | ||
137 | |||
138 | /** | ||
139 | * ovs_vport_free - uninitialize and free vport | ||
140 | * | ||
141 | * @vport: vport to free | ||
142 | * | ||
143 | * Frees a vport allocated with vport_alloc() when it is no longer needed. | ||
144 | * | ||
145 | * The caller must ensure that an RCU grace period has passed since the last | ||
146 | * time @vport was in a datapath. | ||
147 | */ | ||
148 | void ovs_vport_free(struct vport *vport) | ||
149 | { | ||
150 | free_percpu(vport->percpu_stats); | ||
151 | kfree(vport); | ||
152 | } | ||
153 | |||
154 | /** | ||
155 | * ovs_vport_add - add vport device (for kernel callers) | ||
156 | * | ||
157 | * @parms: Information about new vport. | ||
158 | * | ||
159 | * Creates a new vport with the specified configuration (which is dependent on | ||
160 | * device type). RTNL lock must be held. | ||
161 | */ | ||
162 | struct vport *ovs_vport_add(const struct vport_parms *parms) | ||
163 | { | ||
164 | struct vport *vport; | ||
165 | int err = 0; | ||
166 | int i; | ||
167 | |||
168 | ASSERT_RTNL(); | ||
169 | |||
170 | for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { | ||
171 | if (vport_ops_list[i]->type == parms->type) { | ||
172 | vport = vport_ops_list[i]->create(parms); | ||
173 | if (IS_ERR(vport)) { | ||
174 | err = PTR_ERR(vport); | ||
175 | goto out; | ||
176 | } | ||
177 | |||
178 | hlist_add_head_rcu(&vport->hash_node, | ||
179 | hash_bucket(vport->ops->get_name(vport))); | ||
180 | return vport; | ||
181 | } | ||
182 | } | ||
183 | |||
184 | err = -EAFNOSUPPORT; | ||
185 | |||
186 | out: | ||
187 | return ERR_PTR(err); | ||
188 | } | ||
189 | |||
190 | /** | ||
191 | * ovs_vport_set_options - modify existing vport device (for kernel callers) | ||
192 | * | ||
193 | * @vport: vport to modify. | ||
194 | * @port: New configuration. | ||
195 | * | ||
196 | * Modifies an existing device with the specified configuration (which is | ||
197 | * dependent on device type). RTNL lock must be held. | ||
198 | */ | ||
199 | int ovs_vport_set_options(struct vport *vport, struct nlattr *options) | ||
200 | { | ||
201 | ASSERT_RTNL(); | ||
202 | |||
203 | if (!vport->ops->set_options) | ||
204 | return -EOPNOTSUPP; | ||
205 | return vport->ops->set_options(vport, options); | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * ovs_vport_del - delete existing vport device | ||
210 | * | ||
211 | * @vport: vport to delete. | ||
212 | * | ||
213 | * Detaches @vport from its datapath and destroys it. It is possible to fail | ||
214 | * for reasons such as lack of memory. RTNL lock must be held. | ||
215 | */ | ||
216 | void ovs_vport_del(struct vport *vport) | ||
217 | { | ||
218 | ASSERT_RTNL(); | ||
219 | |||
220 | hlist_del_rcu(&vport->hash_node); | ||
221 | |||
222 | vport->ops->destroy(vport); | ||
223 | } | ||
224 | |||
225 | /** | ||
226 | * ovs_vport_get_stats - retrieve device stats | ||
227 | * | ||
228 | * @vport: vport from which to retrieve the stats | ||
229 | * @stats: location to store stats | ||
230 | * | ||
231 | * Retrieves transmit, receive, and error stats for the given device. | ||
232 | * | ||
233 | * Must be called with RTNL lock or rcu_read_lock. | ||
234 | */ | ||
235 | void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) | ||
236 | { | ||
237 | int i; | ||
238 | |||
239 | memset(stats, 0, sizeof(*stats)); | ||
240 | |||
241 | /* We potentially have 2 sources of stats that need to be combined: | ||
242 | * those we have collected (split into err_stats and percpu_stats) from | ||
243 | * set_stats() and device error stats from netdev->get_stats() (for | ||
244 | * errors that happen downstream and therefore aren't reported through | ||
245 | * our vport_record_error() function). | ||
246 | * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). | ||
247 | * netdev-stats can be directly read over netlink-ioctl. | ||
248 | */ | ||
249 | |||
250 | spin_lock_bh(&vport->stats_lock); | ||
251 | |||
252 | stats->rx_errors = vport->err_stats.rx_errors; | ||
253 | stats->tx_errors = vport->err_stats.tx_errors; | ||
254 | stats->tx_dropped = vport->err_stats.tx_dropped; | ||
255 | stats->rx_dropped = vport->err_stats.rx_dropped; | ||
256 | |||
257 | spin_unlock_bh(&vport->stats_lock); | ||
258 | |||
259 | for_each_possible_cpu(i) { | ||
260 | const struct vport_percpu_stats *percpu_stats; | ||
261 | struct vport_percpu_stats local_stats; | ||
262 | unsigned int start; | ||
263 | |||
264 | percpu_stats = per_cpu_ptr(vport->percpu_stats, i); | ||
265 | |||
266 | do { | ||
267 | start = u64_stats_fetch_begin_bh(&percpu_stats->sync); | ||
268 | local_stats = *percpu_stats; | ||
269 | } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); | ||
270 | |||
271 | stats->rx_bytes += local_stats.rx_bytes; | ||
272 | stats->rx_packets += local_stats.rx_packets; | ||
273 | stats->tx_bytes += local_stats.tx_bytes; | ||
274 | stats->tx_packets += local_stats.tx_packets; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | /** | ||
279 | * ovs_vport_get_options - retrieve device options | ||
280 | * | ||
281 | * @vport: vport from which to retrieve the options. | ||
282 | * @skb: sk_buff where options should be appended. | ||
283 | * | ||
284 | * Retrieves the configuration of the given device, appending an | ||
285 | * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested | ||
286 | * vport-specific attributes to @skb. | ||
287 | * | ||
288 | * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another | ||
289 | * negative error code if a real error occurred. If an error occurs, @skb is | ||
290 | * left unmodified. | ||
291 | * | ||
292 | * Must be called with RTNL lock or rcu_read_lock. | ||
293 | */ | ||
294 | int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) | ||
295 | { | ||
296 | struct nlattr *nla; | ||
297 | |||
298 | nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); | ||
299 | if (!nla) | ||
300 | return -EMSGSIZE; | ||
301 | |||
302 | if (vport->ops->get_options) { | ||
303 | int err = vport->ops->get_options(vport, skb); | ||
304 | if (err) { | ||
305 | nla_nest_cancel(skb, nla); | ||
306 | return err; | ||
307 | } | ||
308 | } | ||
309 | |||
310 | nla_nest_end(skb, nla); | ||
311 | return 0; | ||
312 | } | ||
313 | |||
314 | /** | ||
315 | * ovs_vport_receive - pass up received packet to the datapath for processing | ||
316 | * | ||
317 | * @vport: vport that received the packet | ||
318 | * @skb: skb that was received | ||
319 | * | ||
320 | * Must be called with rcu_read_lock. The packet cannot be shared and | ||
321 | * skb->data should point to the Ethernet header. The caller must have already | ||
322 | * called compute_ip_summed() to initialize the checksumming fields. | ||
323 | */ | ||
324 | void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) | ||
325 | { | ||
326 | struct vport_percpu_stats *stats; | ||
327 | |||
328 | stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id()); | ||
329 | |||
330 | u64_stats_update_begin(&stats->sync); | ||
331 | stats->rx_packets++; | ||
332 | stats->rx_bytes += skb->len; | ||
333 | u64_stats_update_end(&stats->sync); | ||
334 | |||
335 | ovs_dp_process_received_packet(vport, skb); | ||
336 | } | ||
337 | |||
338 | /** | ||
339 | * ovs_vport_send - send a packet on a device | ||
340 | * | ||
341 | * @vport: vport on which to send the packet | ||
342 | * @skb: skb to send | ||
343 | * | ||
344 | * Sends the given packet and returns the length of data sent. Either RTNL | ||
345 | * lock or rcu_read_lock must be held. | ||
346 | */ | ||
347 | int ovs_vport_send(struct vport *vport, struct sk_buff *skb) | ||
348 | { | ||
349 | int sent = vport->ops->send(vport, skb); | ||
350 | |||
351 | if (likely(sent)) { | ||
352 | struct vport_percpu_stats *stats; | ||
353 | |||
354 | stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id()); | ||
355 | |||
356 | u64_stats_update_begin(&stats->sync); | ||
357 | stats->tx_packets++; | ||
358 | stats->tx_bytes += sent; | ||
359 | u64_stats_update_end(&stats->sync); | ||
360 | } | ||
361 | return sent; | ||
362 | } | ||
363 | |||
364 | /** | ||
365 | * ovs_vport_record_error - indicate device error to generic stats layer | ||
366 | * | ||
367 | * @vport: vport that encountered the error | ||
368 | * @err_type: one of enum vport_err_type types to indicate the error type | ||
369 | * | ||
370 | * If using the vport generic stats layer indicate that an error of the given | ||
371 | * type has occured. | ||
372 | */ | ||
373 | void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) | ||
374 | { | ||
375 | spin_lock(&vport->stats_lock); | ||
376 | |||
377 | switch (err_type) { | ||
378 | case VPORT_E_RX_DROPPED: | ||
379 | vport->err_stats.rx_dropped++; | ||
380 | break; | ||
381 | |||
382 | case VPORT_E_RX_ERROR: | ||
383 | vport->err_stats.rx_errors++; | ||
384 | break; | ||
385 | |||
386 | case VPORT_E_TX_DROPPED: | ||
387 | vport->err_stats.tx_dropped++; | ||
388 | break; | ||
389 | |||
390 | case VPORT_E_TX_ERROR: | ||
391 | vport->err_stats.tx_errors++; | ||
392 | break; | ||
393 | }; | ||
394 | |||
395 | spin_unlock(&vport->stats_lock); | ||
396 | } | ||