diff options
author | Michael S. Tsirkin <mst@redhat.com> | 2012-07-20 05:23:23 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-07-22 15:39:33 -0400 |
commit | 0690899b4d4501b3505be069b9a687e68ccbe15b (patch) | |
tree | 962c728cdc3d8027dfcd12fb3dfa522814cc92a1 /drivers/net | |
parent | dcc0fb782b3a6e2abfeaaeb45dd88ed09596be0f (diff) |
tun: experimental zero copy tx support
Let vhost-net utilize zero copy tx when used with tun.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net')
-rw-r--r-- | drivers/net/tun.c | 146 |
1 files changed, 134 insertions, 12 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b95a7f44a6d8..c62163e272cd 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c | |||
@@ -100,6 +100,8 @@ do { \ | |||
100 | } while (0) | 100 | } while (0) |
101 | #endif | 101 | #endif |
102 | 102 | ||
103 | #define GOODCOPY_LEN 128 | ||
104 | |||
103 | #define FLT_EXACT_COUNT 8 | 105 | #define FLT_EXACT_COUNT 8 |
104 | struct tap_filter { | 106 | struct tap_filter { |
105 | unsigned int count; /* Number of addrs. Zero means disabled */ | 107 | unsigned int count; /* Number of addrs. Zero means disabled */ |
@@ -604,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun, | |||
604 | return skb; | 606 | return skb; |
605 | } | 607 | } |
606 | 608 | ||
609 | /* set skb frags from iovec, this can move to core network code for reuse */ | ||
610 | static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, | ||
611 | int offset, size_t count) | ||
612 | { | ||
613 | int len = iov_length(from, count) - offset; | ||
614 | int copy = skb_headlen(skb); | ||
615 | int size, offset1 = 0; | ||
616 | int i = 0; | ||
617 | |||
618 | /* Skip over from offset */ | ||
619 | while (count && (offset >= from->iov_len)) { | ||
620 | offset -= from->iov_len; | ||
621 | ++from; | ||
622 | --count; | ||
623 | } | ||
624 | |||
625 | /* copy up to skb headlen */ | ||
626 | while (count && (copy > 0)) { | ||
627 | size = min_t(unsigned int, copy, from->iov_len - offset); | ||
628 | if (copy_from_user(skb->data + offset1, from->iov_base + offset, | ||
629 | size)) | ||
630 | return -EFAULT; | ||
631 | if (copy > size) { | ||
632 | ++from; | ||
633 | --count; | ||
634 | offset = 0; | ||
635 | } else | ||
636 | offset += size; | ||
637 | copy -= size; | ||
638 | offset1 += size; | ||
639 | } | ||
640 | |||
641 | if (len == offset1) | ||
642 | return 0; | ||
643 | |||
644 | while (count--) { | ||
645 | struct page *page[MAX_SKB_FRAGS]; | ||
646 | int num_pages; | ||
647 | unsigned long base; | ||
648 | unsigned long truesize; | ||
649 | |||
650 | len = from->iov_len - offset; | ||
651 | if (!len) { | ||
652 | offset = 0; | ||
653 | ++from; | ||
654 | continue; | ||
655 | } | ||
656 | base = (unsigned long)from->iov_base + offset; | ||
657 | size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; | ||
658 | if (i + size > MAX_SKB_FRAGS) | ||
659 | return -EMSGSIZE; | ||
660 | num_pages = get_user_pages_fast(base, size, 0, &page[i]); | ||
661 | if (num_pages != size) { | ||
662 | for (i = 0; i < num_pages; i++) | ||
663 | put_page(page[i]); | ||
664 | return -EFAULT; | ||
665 | } | ||
666 | truesize = size * PAGE_SIZE; | ||
667 | skb->data_len += len; | ||
668 | skb->len += len; | ||
669 | skb->truesize += truesize; | ||
670 | atomic_add(truesize, &skb->sk->sk_wmem_alloc); | ||
671 | while (len) { | ||
672 | int off = base & ~PAGE_MASK; | ||
673 | int size = min_t(int, len, PAGE_SIZE - off); | ||
674 | __skb_fill_page_desc(skb, i, page[i], off, size); | ||
675 | skb_shinfo(skb)->nr_frags++; | ||
676 | /* increase sk_wmem_alloc */ | ||
677 | base += size; | ||
678 | len -= size; | ||
679 | i++; | ||
680 | } | ||
681 | offset = 0; | ||
682 | ++from; | ||
683 | } | ||
684 | return 0; | ||
685 | } | ||
686 | |||
607 | /* Get packet from user space buffer */ | 687 | /* Get packet from user space buffer */ |
608 | static ssize_t tun_get_user(struct tun_struct *tun, | 688 | static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control, |
609 | const struct iovec *iv, size_t count, | 689 | const struct iovec *iv, size_t total_len, |
610 | int noblock) | 690 | size_t count, int noblock) |
611 | { | 691 | { |
612 | struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; | 692 | struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; |
613 | struct sk_buff *skb; | 693 | struct sk_buff *skb; |
614 | size_t len = count, align = NET_SKB_PAD; | 694 | size_t len = total_len, align = NET_SKB_PAD; |
615 | struct virtio_net_hdr gso = { 0 }; | 695 | struct virtio_net_hdr gso = { 0 }; |
616 | int offset = 0; | 696 | int offset = 0; |
697 | int copylen; | ||
698 | bool zerocopy = false; | ||
699 | int err; | ||
617 | 700 | ||
618 | if (!(tun->flags & TUN_NO_PI)) { | 701 | if (!(tun->flags & TUN_NO_PI)) { |
619 | if ((len -= sizeof(pi)) > count) | 702 | if ((len -= sizeof(pi)) > total_len) |
620 | return -EINVAL; | 703 | return -EINVAL; |
621 | 704 | ||
622 | if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) | 705 | if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) |
@@ -625,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, | |||
625 | } | 708 | } |
626 | 709 | ||
627 | if (tun->flags & TUN_VNET_HDR) { | 710 | if (tun->flags & TUN_VNET_HDR) { |
628 | if ((len -= tun->vnet_hdr_sz) > count) | 711 | if ((len -= tun->vnet_hdr_sz) > total_len) |
629 | return -EINVAL; | 712 | return -EINVAL; |
630 | 713 | ||
631 | if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) | 714 | if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) |
@@ -647,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun, | |||
647 | return -EINVAL; | 730 | return -EINVAL; |
648 | } | 731 | } |
649 | 732 | ||
650 | skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); | 733 | if (msg_control) |
734 | zerocopy = true; | ||
735 | |||
736 | if (zerocopy) { | ||
737 | /* Userspace may produce vectors with count greater than | ||
738 | * MAX_SKB_FRAGS, so we need to linearize parts of the skb | ||
739 | * to let the rest of data to be fit in the frags. | ||
740 | */ | ||
741 | if (count > MAX_SKB_FRAGS) { | ||
742 | copylen = iov_length(iv, count - MAX_SKB_FRAGS); | ||
743 | if (copylen < offset) | ||
744 | copylen = 0; | ||
745 | else | ||
746 | copylen -= offset; | ||
747 | } else | ||
748 | copylen = 0; | ||
749 | /* There are 256 bytes to be copied in skb, so there is enough | ||
750 | * room for skb expand head in case it is used. | ||
751 | * The rest of the buffer is mapped from userspace. | ||
752 | */ | ||
753 | if (copylen < gso.hdr_len) | ||
754 | copylen = gso.hdr_len; | ||
755 | if (!copylen) | ||
756 | copylen = GOODCOPY_LEN; | ||
757 | } else | ||
758 | copylen = len; | ||
759 | |||
760 | skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock); | ||
651 | if (IS_ERR(skb)) { | 761 | if (IS_ERR(skb)) { |
652 | if (PTR_ERR(skb) != -EAGAIN) | 762 | if (PTR_ERR(skb) != -EAGAIN) |
653 | tun->dev->stats.rx_dropped++; | 763 | tun->dev->stats.rx_dropped++; |
654 | return PTR_ERR(skb); | 764 | return PTR_ERR(skb); |
655 | } | 765 | } |
656 | 766 | ||
657 | if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { | 767 | if (zerocopy) |
768 | err = zerocopy_sg_from_iovec(skb, iv, offset, count); | ||
769 | else | ||
770 | err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); | ||
771 | |||
772 | if (err) { | ||
658 | tun->dev->stats.rx_dropped++; | 773 | tun->dev->stats.rx_dropped++; |
659 | kfree_skb(skb); | 774 | kfree_skb(skb); |
660 | return -EFAULT; | 775 | return -EFAULT; |
@@ -728,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, | |||
728 | skb_shinfo(skb)->gso_segs = 0; | 843 | skb_shinfo(skb)->gso_segs = 0; |
729 | } | 844 | } |
730 | 845 | ||
846 | /* copy skb_ubuf_info for callback when skb has no error */ | ||
847 | if (zerocopy) { | ||
848 | skb_shinfo(skb)->destructor_arg = msg_control; | ||
849 | skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; | ||
850 | } | ||
851 | |||
731 | netif_rx_ni(skb); | 852 | netif_rx_ni(skb); |
732 | 853 | ||
733 | tun->dev->stats.rx_packets++; | 854 | tun->dev->stats.rx_packets++; |
734 | tun->dev->stats.rx_bytes += len; | 855 | tun->dev->stats.rx_bytes += len; |
735 | 856 | ||
736 | return count; | 857 | return total_len; |
737 | } | 858 | } |
738 | 859 | ||
739 | static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, | 860 | static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, |
@@ -748,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, | |||
748 | 869 | ||
749 | tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); | 870 | tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); |
750 | 871 | ||
751 | result = tun_get_user(tun, iv, iov_length(iv, count), | 872 | result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count, |
752 | file->f_flags & O_NONBLOCK); | 873 | file->f_flags & O_NONBLOCK); |
753 | 874 | ||
754 | tun_put(tun); | 875 | tun_put(tun); |
@@ -962,8 +1083,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
962 | struct msghdr *m, size_t total_len) | 1083 | struct msghdr *m, size_t total_len) |
963 | { | 1084 | { |
964 | struct tun_struct *tun = container_of(sock, struct tun_struct, socket); | 1085 | struct tun_struct *tun = container_of(sock, struct tun_struct, socket); |
965 | return tun_get_user(tun, m->msg_iov, total_len, | 1086 | return tun_get_user(tun, m->msg_control, m->msg_iov, total_len, |
966 | m->msg_flags & MSG_DONTWAIT); | 1087 | m->msg_iovlen, m->msg_flags & MSG_DONTWAIT); |
967 | } | 1088 | } |
968 | 1089 | ||
969 | static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, | 1090 | static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, |
@@ -1133,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) | |||
1133 | sock_init_data(&tun->socket, sk); | 1254 | sock_init_data(&tun->socket, sk); |
1134 | sk->sk_write_space = tun_sock_write_space; | 1255 | sk->sk_write_space = tun_sock_write_space; |
1135 | sk->sk_sndbuf = INT_MAX; | 1256 | sk->sk_sndbuf = INT_MAX; |
1257 | sock_set_flag(sk, SOCK_ZEROCOPY); | ||
1136 | 1258 | ||
1137 | tun_sk(sk)->tun = tun; | 1259 | tun_sk(sk)->tun = tun; |
1138 | 1260 | ||