diff options
author | Shirley Ma <mashirle@us.ibm.com> | 2011-07-06 08:26:11 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-07-07 07:41:24 -0400 |
commit | 97bc3633bec7ed0fdfbda6b9cf86c51e4f58f8e2 (patch) | |
tree | 4a7c3df9d75e1b3b635a91069cbaeb80c7a4fed6 /drivers/net/macvtap.c | |
parent | a6686f2f382b13f8a7253401a66690c3633b6a74 (diff) |
macvtap: macvtap TX zero-copy support
Only 128 bytes are copied; the rest of the data is DMA-mapped directly from
userspace.
Signed-off-by: Shirley Ma <xma@...ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/macvtap.c')
-rw-r--r-- | drivers/net/macvtap.c | 132 |
1 files changed, 121 insertions, 11 deletions
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index ecee0fe65a97..ab96c319a240 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c | |||
@@ -60,6 +60,7 @@ static struct proto macvtap_proto = { | |||
60 | */ | 60 | */ |
61 | static dev_t macvtap_major; | 61 | static dev_t macvtap_major; |
62 | #define MACVTAP_NUM_DEVS 65536 | 62 | #define MACVTAP_NUM_DEVS 65536 |
63 | #define GOODCOPY_LEN 128 | ||
63 | static struct class *macvtap_class; | 64 | static struct class *macvtap_class; |
64 | static struct cdev macvtap_cdev; | 65 | static struct cdev macvtap_cdev; |
65 | 66 | ||
@@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode, struct file *file) | |||
340 | { | 341 | { |
341 | struct net *net = current->nsproxy->net_ns; | 342 | struct net *net = current->nsproxy->net_ns; |
342 | struct net_device *dev = dev_get_by_index(net, iminor(inode)); | 343 | struct net_device *dev = dev_get_by_index(net, iminor(inode)); |
344 | struct macvlan_dev *vlan = netdev_priv(dev); | ||
343 | struct macvtap_queue *q; | 345 | struct macvtap_queue *q; |
344 | int err; | 346 | int err; |
345 | 347 | ||
@@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode, struct file *file) | |||
369 | q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; | 371 | q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; |
370 | q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); | 372 | q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); |
371 | 373 | ||
374 | /* | ||
375 | * so far only KVM virtio_net uses macvtap, enable zero copy between | ||
376 | * guest kernel and host kernel when lower device supports zerocopy | ||
377 | */ | ||
378 | if (vlan) { | ||
379 | if ((vlan->lowerdev->features & NETIF_F_HIGHDMA) && | ||
380 | (vlan->lowerdev->features & NETIF_F_SG)) | ||
381 | sock_set_flag(&q->sk, SOCK_ZEROCOPY); | ||
382 | } | ||
383 | |||
372 | err = macvtap_set_queue(dev, file, q); | 384 | err = macvtap_set_queue(dev, file, q); |
373 | if (err) | 385 | if (err) |
374 | sock_put(&q->sk); | 386 | sock_put(&q->sk); |
@@ -433,6 +445,80 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad, | |||
433 | return skb; | 445 | return skb; |
434 | } | 446 | } |
435 | 447 | ||
448 | /* set skb frags from iovec, this can move to core network code for reuse */ | ||
449 | static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, | ||
450 | int offset, size_t count) | ||
451 | { | ||
452 | int len = iov_length(from, count) - offset; | ||
453 | int copy = skb_headlen(skb); | ||
454 | int size, offset1 = 0; | ||
455 | int i = 0; | ||
456 | skb_frag_t *f; | ||
457 | |||
458 | /* Skip over from offset */ | ||
459 | while (count && (offset >= from->iov_len)) { | ||
460 | offset -= from->iov_len; | ||
461 | ++from; | ||
462 | --count; | ||
463 | } | ||
464 | |||
465 | /* copy up to skb headlen */ | ||
466 | while (count && (copy > 0)) { | ||
467 | size = min_t(unsigned int, copy, from->iov_len - offset); | ||
468 | if (copy_from_user(skb->data + offset1, from->iov_base + offset, | ||
469 | size)) | ||
470 | return -EFAULT; | ||
471 | if (copy > size) { | ||
472 | ++from; | ||
473 | --count; | ||
474 | } | ||
475 | copy -= size; | ||
476 | offset1 += size; | ||
477 | offset = 0; | ||
478 | } | ||
479 | |||
480 | if (len == offset1) | ||
481 | return 0; | ||
482 | |||
483 | while (count--) { | ||
484 | struct page *page[MAX_SKB_FRAGS]; | ||
485 | int num_pages; | ||
486 | unsigned long base; | ||
487 | |||
488 | len = from->iov_len - offset1; | ||
489 | if (!len) { | ||
490 | offset1 = 0; | ||
491 | ++from; | ||
492 | continue; | ||
493 | } | ||
494 | base = (unsigned long)from->iov_base + offset1; | ||
495 | size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; | ||
496 | num_pages = get_user_pages_fast(base, size, 0, &page[i]); | ||
497 | if ((num_pages != size) || | ||
498 | (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags)) | ||
499 | /* put_page is in skb free */ | ||
500 | return -EFAULT; | ||
501 | skb->data_len += len; | ||
502 | skb->len += len; | ||
503 | skb->truesize += len; | ||
504 | atomic_add(len, &skb->sk->sk_wmem_alloc); | ||
505 | while (len) { | ||
506 | f = &skb_shinfo(skb)->frags[i]; | ||
507 | f->page = page[i]; | ||
508 | f->page_offset = base & ~PAGE_MASK; | ||
509 | f->size = min_t(int, len, PAGE_SIZE - f->page_offset); | ||
510 | skb_shinfo(skb)->nr_frags++; | ||
511 | /* increase sk_wmem_alloc */ | ||
512 | base += f->size; | ||
513 | len -= f->size; | ||
514 | i++; | ||
515 | } | ||
516 | offset1 = 0; | ||
517 | ++from; | ||
518 | } | ||
519 | return 0; | ||
520 | } | ||
521 | |||
436 | /* | 522 | /* |
437 | * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should | 523 | * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should |
438 | * be shared with the tun/tap driver. | 524 | * be shared with the tun/tap driver. |
@@ -517,16 +603,18 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb, | |||
517 | 603 | ||
518 | 604 | ||
519 | /* Get packet from user space buffer */ | 605 | /* Get packet from user space buffer */ |
520 | static ssize_t macvtap_get_user(struct macvtap_queue *q, | 606 | static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, |
521 | const struct iovec *iv, size_t count, | 607 | const struct iovec *iv, unsigned long total_len, |
522 | int noblock) | 608 | size_t count, int noblock) |
523 | { | 609 | { |
524 | struct sk_buff *skb; | 610 | struct sk_buff *skb; |
525 | struct macvlan_dev *vlan; | 611 | struct macvlan_dev *vlan; |
526 | size_t len = count; | 612 | unsigned long len = total_len; |
527 | int err; | 613 | int err; |
528 | struct virtio_net_hdr vnet_hdr = { 0 }; | 614 | struct virtio_net_hdr vnet_hdr = { 0 }; |
529 | int vnet_hdr_len = 0; | 615 | int vnet_hdr_len = 0; |
616 | int copylen; | ||
617 | bool zerocopy = false; | ||
530 | 618 | ||
531 | if (q->flags & IFF_VNET_HDR) { | 619 | if (q->flags & IFF_VNET_HDR) { |
532 | vnet_hdr_len = q->vnet_hdr_sz; | 620 | vnet_hdr_len = q->vnet_hdr_sz; |
@@ -554,12 +642,31 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, | |||
554 | if (unlikely(len < ETH_HLEN)) | 642 | if (unlikely(len < ETH_HLEN)) |
555 | goto err; | 643 | goto err; |
556 | 644 | ||
557 | skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len, | 645 | if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) |
558 | noblock, &err); | 646 | zerocopy = true; |
647 | |||
648 | if (zerocopy) { | ||
649 | /* There are 256 bytes to be copied in skb, so there is enough | ||
650 | * room for skb expand head in case it is used. | ||
651 | * The rest buffer is mapped from userspace. | ||
652 | */ | ||
653 | copylen = vnet_hdr.hdr_len; | ||
654 | if (!copylen) | ||
655 | copylen = GOODCOPY_LEN; | ||
656 | } else | ||
657 | copylen = len; | ||
658 | |||
659 | skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen, | ||
660 | vnet_hdr.hdr_len, noblock, &err); | ||
559 | if (!skb) | 661 | if (!skb) |
560 | goto err; | 662 | goto err; |
561 | 663 | ||
562 | err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len); | 664 | if (zerocopy) { |
665 | err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); | ||
666 | skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; | ||
667 | } else | ||
668 | err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, | ||
669 | len); | ||
563 | if (err) | 670 | if (err) |
564 | goto err_kfree; | 671 | goto err_kfree; |
565 | 672 | ||
@@ -575,13 +682,16 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, | |||
575 | 682 | ||
576 | rcu_read_lock_bh(); | 683 | rcu_read_lock_bh(); |
577 | vlan = rcu_dereference_bh(q->vlan); | 684 | vlan = rcu_dereference_bh(q->vlan); |
685 | /* copy skb_ubuf_info for callback when skb has no error */ | ||
686 | if (zerocopy) | ||
687 | skb_shinfo(skb)->destructor_arg = m->msg_control; | ||
578 | if (vlan) | 688 | if (vlan) |
579 | macvlan_start_xmit(skb, vlan->dev); | 689 | macvlan_start_xmit(skb, vlan->dev); |
580 | else | 690 | else |
581 | kfree_skb(skb); | 691 | kfree_skb(skb); |
582 | rcu_read_unlock_bh(); | 692 | rcu_read_unlock_bh(); |
583 | 693 | ||
584 | return count; | 694 | return total_len; |
585 | 695 | ||
586 | err_kfree: | 696 | err_kfree: |
587 | kfree_skb(skb); | 697 | kfree_skb(skb); |
@@ -603,8 +713,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, | |||
603 | ssize_t result = -ENOLINK; | 713 | ssize_t result = -ENOLINK; |
604 | struct macvtap_queue *q = file->private_data; | 714 | struct macvtap_queue *q = file->private_data; |
605 | 715 | ||
606 | result = macvtap_get_user(q, iv, iov_length(iv, count), | 716 | result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count, |
607 | file->f_flags & O_NONBLOCK); | 717 | file->f_flags & O_NONBLOCK); |
608 | return result; | 718 | return result; |
609 | } | 719 | } |
610 | 720 | ||
@@ -817,7 +927,7 @@ static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
817 | struct msghdr *m, size_t total_len) | 927 | struct msghdr *m, size_t total_len) |
818 | { | 928 | { |
819 | struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock); | 929 | struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock); |
820 | return macvtap_get_user(q, m->msg_iov, total_len, | 930 | return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen, |
821 | m->msg_flags & MSG_DONTWAIT); | 931 | m->msg_flags & MSG_DONTWAIT); |
822 | } | 932 | } |
823 | 933 | ||