aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@redhat.com>2012-07-20 05:23:23 -0400
committerDavid S. Miller <davem@davemloft.net>2012-07-22 15:39:33 -0400
commit0690899b4d4501b3505be069b9a687e68ccbe15b (patch)
tree962c728cdc3d8027dfcd12fb3dfa522814cc92a1 /drivers/net
parentdcc0fb782b3a6e2abfeaaeb45dd88ed09596be0f (diff)
tun: experimental zero copy tx support
Let vhost-net utilize zero copy tx when used with tun. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net')
-rw-r--r--drivers/net/tun.c146
1 files changed, 134 insertions, 12 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b95a7f44a6d8..c62163e272cd 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -100,6 +100,8 @@ do { \
100} while (0) 100} while (0)
101#endif 101#endif
102 102
103#define GOODCOPY_LEN 128
104
103#define FLT_EXACT_COUNT 8 105#define FLT_EXACT_COUNT 8
104struct tap_filter { 106struct tap_filter {
105 unsigned int count; /* Number of addrs. Zero means disabled */ 107 unsigned int count; /* Number of addrs. Zero means disabled */
@@ -604,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
604 return skb; 606 return skb;
605} 607}
606 608
609/* set skb frags from iovec, this can move to core network code for reuse */
610static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
611 int offset, size_t count)
612{
613 int len = iov_length(from, count) - offset;
614 int copy = skb_headlen(skb);
615 int size, offset1 = 0;
616 int i = 0;
617
618 /* Skip over from offset */
619 while (count && (offset >= from->iov_len)) {
620 offset -= from->iov_len;
621 ++from;
622 --count;
623 }
624
625 /* copy up to skb headlen */
626 while (count && (copy > 0)) {
627 size = min_t(unsigned int, copy, from->iov_len - offset);
628 if (copy_from_user(skb->data + offset1, from->iov_base + offset,
629 size))
630 return -EFAULT;
631 if (copy > size) {
632 ++from;
633 --count;
634 offset = 0;
635 } else
636 offset += size;
637 copy -= size;
638 offset1 += size;
639 }
640
641 if (len == offset1)
642 return 0;
643
644 while (count--) {
645 struct page *page[MAX_SKB_FRAGS];
646 int num_pages;
647 unsigned long base;
648 unsigned long truesize;
649
650 len = from->iov_len - offset;
651 if (!len) {
652 offset = 0;
653 ++from;
654 continue;
655 }
656 base = (unsigned long)from->iov_base + offset;
657 size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
658 if (i + size > MAX_SKB_FRAGS)
659 return -EMSGSIZE;
660 num_pages = get_user_pages_fast(base, size, 0, &page[i]);
661 if (num_pages != size) {
662 for (i = 0; i < num_pages; i++)
663 put_page(page[i]);
664 return -EFAULT;
665 }
666 truesize = size * PAGE_SIZE;
667 skb->data_len += len;
668 skb->len += len;
669 skb->truesize += truesize;
670 atomic_add(truesize, &skb->sk->sk_wmem_alloc);
671 while (len) {
672 int off = base & ~PAGE_MASK;
673 int size = min_t(int, len, PAGE_SIZE - off);
674 __skb_fill_page_desc(skb, i, page[i], off, size);
675 skb_shinfo(skb)->nr_frags++;
676 /* increase sk_wmem_alloc */
677 base += size;
678 len -= size;
679 i++;
680 }
681 offset = 0;
682 ++from;
683 }
684 return 0;
685}
686
607/* Get packet from user space buffer */ 687/* Get packet from user space buffer */
608static ssize_t tun_get_user(struct tun_struct *tun, 688static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
609 const struct iovec *iv, size_t count, 689 const struct iovec *iv, size_t total_len,
610 int noblock) 690 size_t count, int noblock)
611{ 691{
612 struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; 692 struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
613 struct sk_buff *skb; 693 struct sk_buff *skb;
614 size_t len = count, align = NET_SKB_PAD; 694 size_t len = total_len, align = NET_SKB_PAD;
615 struct virtio_net_hdr gso = { 0 }; 695 struct virtio_net_hdr gso = { 0 };
616 int offset = 0; 696 int offset = 0;
697 int copylen;
698 bool zerocopy = false;
699 int err;
617 700
618 if (!(tun->flags & TUN_NO_PI)) { 701 if (!(tun->flags & TUN_NO_PI)) {
619 if ((len -= sizeof(pi)) > count) 702 if ((len -= sizeof(pi)) > total_len)
620 return -EINVAL; 703 return -EINVAL;
621 704
622 if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) 705 if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
@@ -625,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun,
625 } 708 }
626 709
627 if (tun->flags & TUN_VNET_HDR) { 710 if (tun->flags & TUN_VNET_HDR) {
628 if ((len -= tun->vnet_hdr_sz) > count) 711 if ((len -= tun->vnet_hdr_sz) > total_len)
629 return -EINVAL; 712 return -EINVAL;
630 713
631 if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) 714 if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
@@ -647,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun,
647 return -EINVAL; 730 return -EINVAL;
648 } 731 }
649 732
650 skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); 733 if (msg_control)
734 zerocopy = true;
735
736 if (zerocopy) {
737 /* Userspace may produce vectors with count greater than
738 * MAX_SKB_FRAGS, so we need to linearize part of the skb
739 * so that the rest of the data fits in the frags.
740 */
741 if (count > MAX_SKB_FRAGS) {
742 copylen = iov_length(iv, count - MAX_SKB_FRAGS);
743 if (copylen < offset)
744 copylen = 0;
745 else
746 copylen -= offset;
747 } else
748 copylen = 0;
749 /* There are 256 bytes to be copied in skb, so there is enough
750 * room for skb expand head in case it is used.
751 * The rest of the buffer is mapped from userspace.
752 */
753 if (copylen < gso.hdr_len)
754 copylen = gso.hdr_len;
755 if (!copylen)
756 copylen = GOODCOPY_LEN;
757 } else
758 copylen = len;
759
760 skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
651 if (IS_ERR(skb)) { 761 if (IS_ERR(skb)) {
652 if (PTR_ERR(skb) != -EAGAIN) 762 if (PTR_ERR(skb) != -EAGAIN)
653 tun->dev->stats.rx_dropped++; 763 tun->dev->stats.rx_dropped++;
654 return PTR_ERR(skb); 764 return PTR_ERR(skb);
655 } 765 }
656 766
657 if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { 767 if (zerocopy)
768 err = zerocopy_sg_from_iovec(skb, iv, offset, count);
769 else
770 err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
771
772 if (err) {
658 tun->dev->stats.rx_dropped++; 773 tun->dev->stats.rx_dropped++;
659 kfree_skb(skb); 774 kfree_skb(skb);
660 return -EFAULT; 775 return -EFAULT;
@@ -728,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun,
728 skb_shinfo(skb)->gso_segs = 0; 843 skb_shinfo(skb)->gso_segs = 0;
729 } 844 }
730 845
846 /* copy skb_ubuf_info for callback when skb has no error */
847 if (zerocopy) {
848 skb_shinfo(skb)->destructor_arg = msg_control;
849 skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
850 }
851
731 netif_rx_ni(skb); 852 netif_rx_ni(skb);
732 853
733 tun->dev->stats.rx_packets++; 854 tun->dev->stats.rx_packets++;
734 tun->dev->stats.rx_bytes += len; 855 tun->dev->stats.rx_bytes += len;
735 856
736 return count; 857 return total_len;
737} 858}
738 859
739static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, 860static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
@@ -748,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
748 869
749 tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); 870 tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
750 871
751 result = tun_get_user(tun, iv, iov_length(iv, count), 872 result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count,
752 file->f_flags & O_NONBLOCK); 873 file->f_flags & O_NONBLOCK);
753 874
754 tun_put(tun); 875 tun_put(tun);
@@ -962,8 +1083,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
962 struct msghdr *m, size_t total_len) 1083 struct msghdr *m, size_t total_len)
963{ 1084{
964 struct tun_struct *tun = container_of(sock, struct tun_struct, socket); 1085 struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
965 return tun_get_user(tun, m->msg_iov, total_len, 1086 return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
966 m->msg_flags & MSG_DONTWAIT); 1087 m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
967} 1088}
968 1089
969static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, 1090static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
@@ -1133,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
1133 sock_init_data(&tun->socket, sk); 1254 sock_init_data(&tun->socket, sk);
1134 sk->sk_write_space = tun_sock_write_space; 1255 sk->sk_write_space = tun_sock_write_space;
1135 sk->sk_sndbuf = INT_MAX; 1256 sk->sk_sndbuf = INT_MAX;
1257 sock_set_flag(sk, SOCK_ZEROCOPY);
1136 1258
1137 tun_sk(sk)->tun = tun; 1259 tun_sk(sk)->tun = tun;
1138 1260