diff options
author | Andrea Bastoni <bastoni@cs.unc.edu> | 2010-05-30 19:16:45 -0400 |
---|---|---|
committer | Andrea Bastoni <bastoni@cs.unc.edu> | 2010-05-30 19:16:45 -0400 |
commit | ada47b5fe13d89735805b566185f4885f5a3f750 (patch) | |
tree | 644b88f8a71896307d71438e9b3af49126ffb22b /drivers/net/macvtap.c | |
parent | 43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff) | |
parent | 3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff) |
Merge branch 'wip-2.6.34' into old-private-masterarchived-private-master
Diffstat (limited to 'drivers/net/macvtap.c')
-rw-r--r-- | drivers/net/macvtap.c | 804 |
1 files changed, 804 insertions, 0 deletions
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c new file mode 100644 index 000000000000..abba3cc81f12 --- /dev/null +++ b/drivers/net/macvtap.c | |||
@@ -0,0 +1,804 @@ | |||
1 | #include <linux/etherdevice.h> | ||
2 | #include <linux/if_macvlan.h> | ||
3 | #include <linux/interrupt.h> | ||
4 | #include <linux/nsproxy.h> | ||
5 | #include <linux/compat.h> | ||
6 | #include <linux/if_tun.h> | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/skbuff.h> | ||
9 | #include <linux/cache.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/types.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/wait.h> | ||
15 | #include <linux/cdev.h> | ||
16 | #include <linux/fs.h> | ||
17 | |||
18 | #include <net/net_namespace.h> | ||
19 | #include <net/rtnetlink.h> | ||
20 | #include <net/sock.h> | ||
21 | #include <linux/virtio_net.h> | ||
22 | |||
23 | /* | ||
24 | * A macvtap queue is the central object of this driver, it connects | ||
25 | * an open character device to a macvlan interface. There can be | ||
26 | * multiple queues on one interface, which map back to queues | ||
27 | * implemented in hardware on the underlying device. | ||
28 | * | ||
29 | * macvtap_proto is used to allocate queues through the sock allocation | ||
30 | * mechanism. | ||
31 | * | ||
32 | * TODO: multiqueue support is currently not implemented, even though | ||
33 | * macvtap is basically prepared for that. We will need to add this | ||
34 | * here as well as in virtio-net and qemu to get line rate on 10gbit | ||
35 | * adapters from a guest. | ||
36 | */ | ||
37 | struct macvtap_queue { | ||
38 | struct sock sk; | ||
39 | struct socket sock; | ||
40 | struct macvlan_dev *vlan; | ||
41 | struct file *file; | ||
42 | unsigned int flags; | ||
43 | }; | ||
44 | |||
45 | static struct proto macvtap_proto = { | ||
46 | .name = "macvtap", | ||
47 | .owner = THIS_MODULE, | ||
48 | .obj_size = sizeof (struct macvtap_queue), | ||
49 | }; | ||
50 | |||
51 | /* | ||
52 | * Minor number matches netdev->ifindex, so need a potentially | ||
53 | * large value. This also makes it possible to split the | ||
54 | * tap functionality out again in the future by offering it | ||
55 | * from other drivers besides macvtap. As long as every device | ||
56 | * only has one tap, the interface numbers assure that the | ||
57 | * device nodes are unique. | ||
58 | */ | ||
59 | static unsigned int macvtap_major; | ||
60 | #define MACVTAP_NUM_DEVS 65536 | ||
61 | static struct class *macvtap_class; | ||
62 | static struct cdev macvtap_cdev; | ||
63 | |||
64 | static const struct proto_ops macvtap_socket_ops; | ||
65 | |||
/*
 * RCU usage:
 * The macvtap_queue and the macvlan_dev are only loosely coupled; the
 * pointer from one to the other may only be read while rcu_read_lock
 * or macvtap_lock is held.
 *
 * Both the file and the macvlan_dev hold a reference on the
 * macvtap_queue through sock_hold(&q->sk).  When the macvlan_dev goes
 * away first, q->vlan becomes inaccessible.  When the file gets closed,
 * macvtap_get_queue() fails.
 *
 * There may still be references to the struct sock inside of the
 * queue from outbound SKBs, but these never reference back to the
 * file or the dev.  The data structure is freed through __sk_free
 * when both our references and any pending SKBs are gone.
 */
static DEFINE_SPINLOCK(macvtap_lock);
83 | |||
84 | /* | ||
85 | * Choose the next free queue, for now there is only one | ||
86 | */ | ||
87 | static int macvtap_set_queue(struct net_device *dev, struct file *file, | ||
88 | struct macvtap_queue *q) | ||
89 | { | ||
90 | struct macvlan_dev *vlan = netdev_priv(dev); | ||
91 | int err = -EBUSY; | ||
92 | |||
93 | spin_lock(&macvtap_lock); | ||
94 | if (rcu_dereference(vlan->tap)) | ||
95 | goto out; | ||
96 | |||
97 | err = 0; | ||
98 | rcu_assign_pointer(q->vlan, vlan); | ||
99 | rcu_assign_pointer(vlan->tap, q); | ||
100 | sock_hold(&q->sk); | ||
101 | |||
102 | q->file = file; | ||
103 | file->private_data = q; | ||
104 | |||
105 | out: | ||
106 | spin_unlock(&macvtap_lock); | ||
107 | return err; | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * The file owning the queue got closed, give up both | ||
112 | * the reference that the files holds as well as the | ||
113 | * one from the macvlan_dev if that still exists. | ||
114 | * | ||
115 | * Using the spinlock makes sure that we don't get | ||
116 | * to the queue again after destroying it. | ||
117 | */ | ||
118 | static void macvtap_put_queue(struct macvtap_queue *q) | ||
119 | { | ||
120 | struct macvlan_dev *vlan; | ||
121 | |||
122 | spin_lock(&macvtap_lock); | ||
123 | vlan = rcu_dereference(q->vlan); | ||
124 | if (vlan) { | ||
125 | rcu_assign_pointer(vlan->tap, NULL); | ||
126 | rcu_assign_pointer(q->vlan, NULL); | ||
127 | sock_put(&q->sk); | ||
128 | } | ||
129 | |||
130 | spin_unlock(&macvtap_lock); | ||
131 | |||
132 | synchronize_rcu(); | ||
133 | sock_put(&q->sk); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Since we only support one queue, just dereference the pointer. | ||
138 | */ | ||
139 | static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, | ||
140 | struct sk_buff *skb) | ||
141 | { | ||
142 | struct macvlan_dev *vlan = netdev_priv(dev); | ||
143 | |||
144 | return rcu_dereference(vlan->tap); | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * The net_device is going away, give up the reference | ||
149 | * that it holds on the queue (all the queues one day) | ||
150 | * and safely set the pointer from the queues to NULL. | ||
151 | */ | ||
152 | static void macvtap_del_queues(struct net_device *dev) | ||
153 | { | ||
154 | struct macvlan_dev *vlan = netdev_priv(dev); | ||
155 | struct macvtap_queue *q; | ||
156 | |||
157 | spin_lock(&macvtap_lock); | ||
158 | q = rcu_dereference(vlan->tap); | ||
159 | if (!q) { | ||
160 | spin_unlock(&macvtap_lock); | ||
161 | return; | ||
162 | } | ||
163 | |||
164 | rcu_assign_pointer(vlan->tap, NULL); | ||
165 | rcu_assign_pointer(q->vlan, NULL); | ||
166 | spin_unlock(&macvtap_lock); | ||
167 | |||
168 | synchronize_rcu(); | ||
169 | sock_put(&q->sk); | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Forward happens for data that gets sent from one macvlan | ||
174 | * endpoint to another one in bridge mode. We just take | ||
175 | * the skb and put it into the receive queue. | ||
176 | */ | ||
177 | static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) | ||
178 | { | ||
179 | struct macvtap_queue *q = macvtap_get_queue(dev, skb); | ||
180 | if (!q) | ||
181 | return -ENOLINK; | ||
182 | |||
183 | skb_queue_tail(&q->sk.sk_receive_queue, skb); | ||
184 | wake_up_interruptible_poll(q->sk.sk_sleep, POLLIN | POLLRDNORM | POLLRDBAND); | ||
185 | return 0; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * Receive is for data from the external interface (lowerdev), | ||
190 | * in case of macvtap, we can treat that the same way as | ||
191 | * forward, which macvlan cannot. | ||
192 | */ | ||
193 | static int macvtap_receive(struct sk_buff *skb) | ||
194 | { | ||
195 | skb_push(skb, ETH_HLEN); | ||
196 | return macvtap_forward(skb->dev, skb); | ||
197 | } | ||
198 | |||
199 | static int macvtap_newlink(struct net *src_net, | ||
200 | struct net_device *dev, | ||
201 | struct nlattr *tb[], | ||
202 | struct nlattr *data[]) | ||
203 | { | ||
204 | struct device *classdev; | ||
205 | dev_t devt; | ||
206 | int err; | ||
207 | |||
208 | err = macvlan_common_newlink(src_net, dev, tb, data, | ||
209 | macvtap_receive, macvtap_forward); | ||
210 | if (err) | ||
211 | goto out; | ||
212 | |||
213 | devt = MKDEV(MAJOR(macvtap_major), dev->ifindex); | ||
214 | |||
215 | classdev = device_create(macvtap_class, &dev->dev, devt, | ||
216 | dev, "tap%d", dev->ifindex); | ||
217 | if (IS_ERR(classdev)) { | ||
218 | err = PTR_ERR(classdev); | ||
219 | macvtap_del_queues(dev); | ||
220 | } | ||
221 | |||
222 | out: | ||
223 | return err; | ||
224 | } | ||
225 | |||
226 | static void macvtap_dellink(struct net_device *dev, | ||
227 | struct list_head *head) | ||
228 | { | ||
229 | device_destroy(macvtap_class, | ||
230 | MKDEV(MAJOR(macvtap_major), dev->ifindex)); | ||
231 | |||
232 | macvtap_del_queues(dev); | ||
233 | macvlan_dellink(dev, head); | ||
234 | } | ||
235 | |||
236 | static struct rtnl_link_ops macvtap_link_ops __read_mostly = { | ||
237 | .kind = "macvtap", | ||
238 | .newlink = macvtap_newlink, | ||
239 | .dellink = macvtap_dellink, | ||
240 | }; | ||
241 | |||
242 | |||
243 | static void macvtap_sock_write_space(struct sock *sk) | ||
244 | { | ||
245 | if (!sock_writeable(sk) || | ||
246 | !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) | ||
247 | return; | ||
248 | |||
249 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | ||
250 | wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | POLLWRNORM | POLLWRBAND); | ||
251 | } | ||
252 | |||
253 | static int macvtap_open(struct inode *inode, struct file *file) | ||
254 | { | ||
255 | struct net *net = current->nsproxy->net_ns; | ||
256 | struct net_device *dev = dev_get_by_index(net, iminor(inode)); | ||
257 | struct macvtap_queue *q; | ||
258 | int err; | ||
259 | |||
260 | err = -ENODEV; | ||
261 | if (!dev) | ||
262 | goto out; | ||
263 | |||
264 | /* check if this is a macvtap device */ | ||
265 | err = -EINVAL; | ||
266 | if (dev->rtnl_link_ops != &macvtap_link_ops) | ||
267 | goto out; | ||
268 | |||
269 | err = -ENOMEM; | ||
270 | q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, | ||
271 | &macvtap_proto); | ||
272 | if (!q) | ||
273 | goto out; | ||
274 | |||
275 | init_waitqueue_head(&q->sock.wait); | ||
276 | q->sock.type = SOCK_RAW; | ||
277 | q->sock.state = SS_CONNECTED; | ||
278 | q->sock.file = file; | ||
279 | q->sock.ops = &macvtap_socket_ops; | ||
280 | sock_init_data(&q->sock, &q->sk); | ||
281 | q->sk.sk_write_space = macvtap_sock_write_space; | ||
282 | q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; | ||
283 | |||
284 | err = macvtap_set_queue(dev, file, q); | ||
285 | if (err) | ||
286 | sock_put(&q->sk); | ||
287 | |||
288 | out: | ||
289 | if (dev) | ||
290 | dev_put(dev); | ||
291 | |||
292 | return err; | ||
293 | } | ||
294 | |||
295 | static int macvtap_release(struct inode *inode, struct file *file) | ||
296 | { | ||
297 | struct macvtap_queue *q = file->private_data; | ||
298 | macvtap_put_queue(q); | ||
299 | return 0; | ||
300 | } | ||
301 | |||
302 | static unsigned int macvtap_poll(struct file *file, poll_table * wait) | ||
303 | { | ||
304 | struct macvtap_queue *q = file->private_data; | ||
305 | unsigned int mask = POLLERR; | ||
306 | |||
307 | if (!q) | ||
308 | goto out; | ||
309 | |||
310 | mask = 0; | ||
311 | poll_wait(file, &q->sock.wait, wait); | ||
312 | |||
313 | if (!skb_queue_empty(&q->sk.sk_receive_queue)) | ||
314 | mask |= POLLIN | POLLRDNORM; | ||
315 | |||
316 | if (sock_writeable(&q->sk) || | ||
317 | (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && | ||
318 | sock_writeable(&q->sk))) | ||
319 | mask |= POLLOUT | POLLWRNORM; | ||
320 | |||
321 | out: | ||
322 | return mask; | ||
323 | } | ||
324 | |||
325 | static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad, | ||
326 | size_t len, size_t linear, | ||
327 | int noblock, int *err) | ||
328 | { | ||
329 | struct sk_buff *skb; | ||
330 | |||
331 | /* Under a page? Don't bother with paged skb. */ | ||
332 | if (prepad + len < PAGE_SIZE || !linear) | ||
333 | linear = len; | ||
334 | |||
335 | skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, | ||
336 | err); | ||
337 | if (!skb) | ||
338 | return NULL; | ||
339 | |||
340 | skb_reserve(skb, prepad); | ||
341 | skb_put(skb, linear); | ||
342 | skb->data_len = len - linear; | ||
343 | skb->len += len - linear; | ||
344 | |||
345 | return skb; | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should | ||
350 | * be shared with the tun/tap driver. | ||
351 | */ | ||
352 | static int macvtap_skb_from_vnet_hdr(struct sk_buff *skb, | ||
353 | struct virtio_net_hdr *vnet_hdr) | ||
354 | { | ||
355 | unsigned short gso_type = 0; | ||
356 | if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { | ||
357 | switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { | ||
358 | case VIRTIO_NET_HDR_GSO_TCPV4: | ||
359 | gso_type = SKB_GSO_TCPV4; | ||
360 | break; | ||
361 | case VIRTIO_NET_HDR_GSO_TCPV6: | ||
362 | gso_type = SKB_GSO_TCPV6; | ||
363 | break; | ||
364 | case VIRTIO_NET_HDR_GSO_UDP: | ||
365 | gso_type = SKB_GSO_UDP; | ||
366 | break; | ||
367 | default: | ||
368 | return -EINVAL; | ||
369 | } | ||
370 | |||
371 | if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) | ||
372 | gso_type |= SKB_GSO_TCP_ECN; | ||
373 | |||
374 | if (vnet_hdr->gso_size == 0) | ||
375 | return -EINVAL; | ||
376 | } | ||
377 | |||
378 | if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { | ||
379 | if (!skb_partial_csum_set(skb, vnet_hdr->csum_start, | ||
380 | vnet_hdr->csum_offset)) | ||
381 | return -EINVAL; | ||
382 | } | ||
383 | |||
384 | if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { | ||
385 | skb_shinfo(skb)->gso_size = vnet_hdr->gso_size; | ||
386 | skb_shinfo(skb)->gso_type = gso_type; | ||
387 | |||
388 | /* Header must be checked, and gso_segs computed. */ | ||
389 | skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; | ||
390 | skb_shinfo(skb)->gso_segs = 0; | ||
391 | } | ||
392 | return 0; | ||
393 | } | ||
394 | |||
395 | static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb, | ||
396 | struct virtio_net_hdr *vnet_hdr) | ||
397 | { | ||
398 | memset(vnet_hdr, 0, sizeof(*vnet_hdr)); | ||
399 | |||
400 | if (skb_is_gso(skb)) { | ||
401 | struct skb_shared_info *sinfo = skb_shinfo(skb); | ||
402 | |||
403 | /* This is a hint as to how much should be linear. */ | ||
404 | vnet_hdr->hdr_len = skb_headlen(skb); | ||
405 | vnet_hdr->gso_size = sinfo->gso_size; | ||
406 | if (sinfo->gso_type & SKB_GSO_TCPV4) | ||
407 | vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; | ||
408 | else if (sinfo->gso_type & SKB_GSO_TCPV6) | ||
409 | vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; | ||
410 | else if (sinfo->gso_type & SKB_GSO_UDP) | ||
411 | vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; | ||
412 | else | ||
413 | BUG(); | ||
414 | if (sinfo->gso_type & SKB_GSO_TCP_ECN) | ||
415 | vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; | ||
416 | } else | ||
417 | vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; | ||
418 | |||
419 | if (skb->ip_summed == CHECKSUM_PARTIAL) { | ||
420 | vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; | ||
421 | vnet_hdr->csum_start = skb->csum_start - | ||
422 | skb_headroom(skb); | ||
423 | vnet_hdr->csum_offset = skb->csum_offset; | ||
424 | } /* else everything is zero */ | ||
425 | |||
426 | return 0; | ||
427 | } | ||
428 | |||
429 | |||
430 | /* Get packet from user space buffer */ | ||
431 | static ssize_t macvtap_get_user(struct macvtap_queue *q, | ||
432 | const struct iovec *iv, size_t count, | ||
433 | int noblock) | ||
434 | { | ||
435 | struct sk_buff *skb; | ||
436 | struct macvlan_dev *vlan; | ||
437 | size_t len = count; | ||
438 | int err; | ||
439 | struct virtio_net_hdr vnet_hdr = { 0 }; | ||
440 | int vnet_hdr_len = 0; | ||
441 | |||
442 | if (q->flags & IFF_VNET_HDR) { | ||
443 | vnet_hdr_len = sizeof(vnet_hdr); | ||
444 | |||
445 | err = -EINVAL; | ||
446 | if ((len -= vnet_hdr_len) < 0) | ||
447 | goto err; | ||
448 | |||
449 | err = memcpy_fromiovecend((void *)&vnet_hdr, iv, 0, | ||
450 | vnet_hdr_len); | ||
451 | if (err < 0) | ||
452 | goto err; | ||
453 | if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && | ||
454 | vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > | ||
455 | vnet_hdr.hdr_len) | ||
456 | vnet_hdr.hdr_len = vnet_hdr.csum_start + | ||
457 | vnet_hdr.csum_offset + 2; | ||
458 | err = -EINVAL; | ||
459 | if (vnet_hdr.hdr_len > len) | ||
460 | goto err; | ||
461 | } | ||
462 | |||
463 | err = -EINVAL; | ||
464 | if (unlikely(len < ETH_HLEN)) | ||
465 | goto err; | ||
466 | |||
467 | skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len, | ||
468 | noblock, &err); | ||
469 | if (!skb) | ||
470 | goto err; | ||
471 | |||
472 | err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len); | ||
473 | if (err) | ||
474 | goto err_kfree; | ||
475 | |||
476 | skb_set_network_header(skb, ETH_HLEN); | ||
477 | skb_reset_mac_header(skb); | ||
478 | skb->protocol = eth_hdr(skb)->h_proto; | ||
479 | |||
480 | if (vnet_hdr_len) { | ||
481 | err = macvtap_skb_from_vnet_hdr(skb, &vnet_hdr); | ||
482 | if (err) | ||
483 | goto err_kfree; | ||
484 | } | ||
485 | |||
486 | rcu_read_lock_bh(); | ||
487 | vlan = rcu_dereference(q->vlan); | ||
488 | if (vlan) | ||
489 | macvlan_start_xmit(skb, vlan->dev); | ||
490 | else | ||
491 | kfree_skb(skb); | ||
492 | rcu_read_unlock_bh(); | ||
493 | |||
494 | return count; | ||
495 | |||
496 | err_kfree: | ||
497 | kfree_skb(skb); | ||
498 | |||
499 | err: | ||
500 | rcu_read_lock_bh(); | ||
501 | vlan = rcu_dereference(q->vlan); | ||
502 | if (vlan) | ||
503 | netdev_get_tx_queue(vlan->dev, 0)->tx_dropped++; | ||
504 | rcu_read_unlock_bh(); | ||
505 | |||
506 | return err; | ||
507 | } | ||
508 | |||
509 | static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, | ||
510 | unsigned long count, loff_t pos) | ||
511 | { | ||
512 | struct file *file = iocb->ki_filp; | ||
513 | ssize_t result = -ENOLINK; | ||
514 | struct macvtap_queue *q = file->private_data; | ||
515 | |||
516 | result = macvtap_get_user(q, iv, iov_length(iv, count), | ||
517 | file->f_flags & O_NONBLOCK); | ||
518 | return result; | ||
519 | } | ||
520 | |||
521 | /* Put packet to the user space buffer */ | ||
522 | static ssize_t macvtap_put_user(struct macvtap_queue *q, | ||
523 | const struct sk_buff *skb, | ||
524 | const struct iovec *iv, int len) | ||
525 | { | ||
526 | struct macvlan_dev *vlan; | ||
527 | int ret; | ||
528 | int vnet_hdr_len = 0; | ||
529 | |||
530 | if (q->flags & IFF_VNET_HDR) { | ||
531 | struct virtio_net_hdr vnet_hdr; | ||
532 | vnet_hdr_len = sizeof (vnet_hdr); | ||
533 | if ((len -= vnet_hdr_len) < 0) | ||
534 | return -EINVAL; | ||
535 | |||
536 | ret = macvtap_skb_to_vnet_hdr(skb, &vnet_hdr); | ||
537 | if (ret) | ||
538 | return ret; | ||
539 | |||
540 | if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, vnet_hdr_len)) | ||
541 | return -EFAULT; | ||
542 | } | ||
543 | |||
544 | len = min_t(int, skb->len, len); | ||
545 | |||
546 | ret = skb_copy_datagram_const_iovec(skb, 0, iv, vnet_hdr_len, len); | ||
547 | |||
548 | rcu_read_lock_bh(); | ||
549 | vlan = rcu_dereference(q->vlan); | ||
550 | if (vlan) | ||
551 | macvlan_count_rx(vlan, len, ret == 0, 0); | ||
552 | rcu_read_unlock_bh(); | ||
553 | |||
554 | return ret ? ret : (len + vnet_hdr_len); | ||
555 | } | ||
556 | |||
557 | static ssize_t macvtap_do_read(struct macvtap_queue *q, struct kiocb *iocb, | ||
558 | const struct iovec *iv, unsigned long len, | ||
559 | int noblock) | ||
560 | { | ||
561 | DECLARE_WAITQUEUE(wait, current); | ||
562 | struct sk_buff *skb; | ||
563 | ssize_t ret = 0; | ||
564 | |||
565 | add_wait_queue(q->sk.sk_sleep, &wait); | ||
566 | while (len) { | ||
567 | current->state = TASK_INTERRUPTIBLE; | ||
568 | |||
569 | /* Read frames from the queue */ | ||
570 | skb = skb_dequeue(&q->sk.sk_receive_queue); | ||
571 | if (!skb) { | ||
572 | if (noblock) { | ||
573 | ret = -EAGAIN; | ||
574 | break; | ||
575 | } | ||
576 | if (signal_pending(current)) { | ||
577 | ret = -ERESTARTSYS; | ||
578 | break; | ||
579 | } | ||
580 | /* Nothing to read, let's sleep */ | ||
581 | schedule(); | ||
582 | continue; | ||
583 | } | ||
584 | ret = macvtap_put_user(q, skb, iv, len); | ||
585 | kfree_skb(skb); | ||
586 | break; | ||
587 | } | ||
588 | |||
589 | current->state = TASK_RUNNING; | ||
590 | remove_wait_queue(q->sk.sk_sleep, &wait); | ||
591 | return ret; | ||
592 | } | ||
593 | |||
594 | static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, | ||
595 | unsigned long count, loff_t pos) | ||
596 | { | ||
597 | struct file *file = iocb->ki_filp; | ||
598 | struct macvtap_queue *q = file->private_data; | ||
599 | ssize_t len, ret = 0; | ||
600 | |||
601 | len = iov_length(iv, count); | ||
602 | if (len < 0) { | ||
603 | ret = -EINVAL; | ||
604 | goto out; | ||
605 | } | ||
606 | |||
607 | ret = macvtap_do_read(q, iocb, iv, len, file->f_flags & O_NONBLOCK); | ||
608 | ret = min_t(ssize_t, ret, len); /* XXX copied from tun.c. Why? */ | ||
609 | out: | ||
610 | return ret; | ||
611 | } | ||
612 | |||
613 | /* | ||
614 | * provide compatibility with generic tun/tap interface | ||
615 | */ | ||
616 | static long macvtap_ioctl(struct file *file, unsigned int cmd, | ||
617 | unsigned long arg) | ||
618 | { | ||
619 | struct macvtap_queue *q = file->private_data; | ||
620 | struct macvlan_dev *vlan; | ||
621 | void __user *argp = (void __user *)arg; | ||
622 | struct ifreq __user *ifr = argp; | ||
623 | unsigned int __user *up = argp; | ||
624 | unsigned int u; | ||
625 | int ret; | ||
626 | |||
627 | switch (cmd) { | ||
628 | case TUNSETIFF: | ||
629 | /* ignore the name, just look at flags */ | ||
630 | if (get_user(u, &ifr->ifr_flags)) | ||
631 | return -EFAULT; | ||
632 | |||
633 | ret = 0; | ||
634 | if ((u & ~IFF_VNET_HDR) != (IFF_NO_PI | IFF_TAP)) | ||
635 | ret = -EINVAL; | ||
636 | else | ||
637 | q->flags = u; | ||
638 | |||
639 | return ret; | ||
640 | |||
641 | case TUNGETIFF: | ||
642 | rcu_read_lock_bh(); | ||
643 | vlan = rcu_dereference(q->vlan); | ||
644 | if (vlan) | ||
645 | dev_hold(vlan->dev); | ||
646 | rcu_read_unlock_bh(); | ||
647 | |||
648 | if (!vlan) | ||
649 | return -ENOLINK; | ||
650 | |||
651 | ret = 0; | ||
652 | if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || | ||
653 | put_user(q->flags, &ifr->ifr_flags)) | ||
654 | ret = -EFAULT; | ||
655 | dev_put(vlan->dev); | ||
656 | return ret; | ||
657 | |||
658 | case TUNGETFEATURES: | ||
659 | if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR, up)) | ||
660 | return -EFAULT; | ||
661 | return 0; | ||
662 | |||
663 | case TUNSETSNDBUF: | ||
664 | if (get_user(u, up)) | ||
665 | return -EFAULT; | ||
666 | |||
667 | q->sk.sk_sndbuf = u; | ||
668 | return 0; | ||
669 | |||
670 | case TUNSETOFFLOAD: | ||
671 | /* let the user check for future flags */ | ||
672 | if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | | ||
673 | TUN_F_TSO_ECN | TUN_F_UFO)) | ||
674 | return -EINVAL; | ||
675 | |||
676 | /* TODO: only accept frames with the features that | ||
677 | got enabled for forwarded frames */ | ||
678 | if (!(q->flags & IFF_VNET_HDR)) | ||
679 | return -EINVAL; | ||
680 | return 0; | ||
681 | |||
682 | default: | ||
683 | return -EINVAL; | ||
684 | } | ||
685 | } | ||
686 | |||
#ifdef CONFIG_COMPAT
/*
 * compat ioctl entry: convert the argument with compat_ptr() and reuse
 * the native handler.
 */
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return macvtap_ioctl(file, cmd, native_arg);
}
#endif
694 | |||
695 | static const struct file_operations macvtap_fops = { | ||
696 | .owner = THIS_MODULE, | ||
697 | .open = macvtap_open, | ||
698 | .release = macvtap_release, | ||
699 | .aio_read = macvtap_aio_read, | ||
700 | .aio_write = macvtap_aio_write, | ||
701 | .poll = macvtap_poll, | ||
702 | .llseek = no_llseek, | ||
703 | .unlocked_ioctl = macvtap_ioctl, | ||
704 | #ifdef CONFIG_COMPAT | ||
705 | .compat_ioctl = macvtap_compat_ioctl, | ||
706 | #endif | ||
707 | }; | ||
708 | |||
709 | static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock, | ||
710 | struct msghdr *m, size_t total_len) | ||
711 | { | ||
712 | struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock); | ||
713 | return macvtap_get_user(q, m->msg_iov, total_len, | ||
714 | m->msg_flags & MSG_DONTWAIT); | ||
715 | } | ||
716 | |||
717 | static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock, | ||
718 | struct msghdr *m, size_t total_len, | ||
719 | int flags) | ||
720 | { | ||
721 | struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock); | ||
722 | int ret; | ||
723 | if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) | ||
724 | return -EINVAL; | ||
725 | ret = macvtap_do_read(q, iocb, m->msg_iov, total_len, | ||
726 | flags & MSG_DONTWAIT); | ||
727 | if (ret > total_len) { | ||
728 | m->msg_flags |= MSG_TRUNC; | ||
729 | ret = flags & MSG_TRUNC ? ret : total_len; | ||
730 | } | ||
731 | return ret; | ||
732 | } | ||
733 | |||
734 | /* Ops structure to mimic raw sockets with tun */ | ||
735 | static const struct proto_ops macvtap_socket_ops = { | ||
736 | .sendmsg = macvtap_sendmsg, | ||
737 | .recvmsg = macvtap_recvmsg, | ||
738 | }; | ||
739 | |||
740 | /* Get an underlying socket object from tun file. Returns error unless file is | ||
741 | * attached to a device. The returned object works like a packet socket, it | ||
742 | * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for | ||
743 | * holding a reference to the file for as long as the socket is in use. */ | ||
744 | struct socket *macvtap_get_socket(struct file *file) | ||
745 | { | ||
746 | struct macvtap_queue *q; | ||
747 | if (file->f_op != &macvtap_fops) | ||
748 | return ERR_PTR(-EINVAL); | ||
749 | q = file->private_data; | ||
750 | if (!q) | ||
751 | return ERR_PTR(-EBADFD); | ||
752 | return &q->sock; | ||
753 | } | ||
754 | EXPORT_SYMBOL_GPL(macvtap_get_socket); | ||
755 | |||
756 | static int macvtap_init(void) | ||
757 | { | ||
758 | int err; | ||
759 | |||
760 | err = alloc_chrdev_region(&macvtap_major, 0, | ||
761 | MACVTAP_NUM_DEVS, "macvtap"); | ||
762 | if (err) | ||
763 | goto out1; | ||
764 | |||
765 | cdev_init(&macvtap_cdev, &macvtap_fops); | ||
766 | err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); | ||
767 | if (err) | ||
768 | goto out2; | ||
769 | |||
770 | macvtap_class = class_create(THIS_MODULE, "macvtap"); | ||
771 | if (IS_ERR(macvtap_class)) { | ||
772 | err = PTR_ERR(macvtap_class); | ||
773 | goto out3; | ||
774 | } | ||
775 | |||
776 | err = macvlan_link_register(&macvtap_link_ops); | ||
777 | if (err) | ||
778 | goto out4; | ||
779 | |||
780 | return 0; | ||
781 | |||
782 | out4: | ||
783 | class_unregister(macvtap_class); | ||
784 | out3: | ||
785 | cdev_del(&macvtap_cdev); | ||
786 | out2: | ||
787 | unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); | ||
788 | out1: | ||
789 | return err; | ||
790 | } | ||
791 | module_init(macvtap_init); | ||
792 | |||
793 | static void macvtap_exit(void) | ||
794 | { | ||
795 | rtnl_link_unregister(&macvtap_link_ops); | ||
796 | class_unregister(macvtap_class); | ||
797 | cdev_del(&macvtap_cdev); | ||
798 | unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); | ||
799 | } | ||
800 | module_exit(macvtap_exit); | ||
801 | |||
802 | MODULE_ALIAS_RTNL_LINK("macvtap"); | ||
803 | MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); | ||
804 | MODULE_LICENSE("GPL"); | ||