diff options
Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r-- | drivers/net/tun.c | 160 |
1 files changed, 144 insertions, 16 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 987aeefbc774..3a16d4fdaa05 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c | |||
@@ -22,7 +22,7 @@ | |||
22 | * Add TUNSETLINK ioctl to set the link encapsulation | 22 | * Add TUNSETLINK ioctl to set the link encapsulation |
23 | * | 23 | * |
24 | * Mark Smith <markzzzsmith@yahoo.com.au> | 24 | * Mark Smith <markzzzsmith@yahoo.com.au> |
25 | * Use random_ether_addr() for tap MAC address. | 25 | * Use eth_random_addr() for tap MAC address. |
26 | * | 26 | * |
27 | * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 | 27 | * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 |
28 | * Fixes in packet dropping, queue length setting and queue wakeup. | 28 | * Fixes in packet dropping, queue length setting and queue wakeup. |
@@ -100,6 +100,8 @@ do { \ | |||
100 | } while (0) | 100 | } while (0) |
101 | #endif | 101 | #endif |
102 | 102 | ||
103 | #define GOODCOPY_LEN 128 | ||
104 | |||
103 | #define FLT_EXACT_COUNT 8 | 105 | #define FLT_EXACT_COUNT 8 |
104 | struct tap_filter { | 106 | struct tap_filter { |
105 | unsigned int count; /* Number of addrs. Zero means disabled */ | 107 | unsigned int count; /* Number of addrs. Zero means disabled */ |
@@ -185,7 +187,6 @@ static void __tun_detach(struct tun_struct *tun) | |||
185 | netif_tx_lock_bh(tun->dev); | 187 | netif_tx_lock_bh(tun->dev); |
186 | netif_carrier_off(tun->dev); | 188 | netif_carrier_off(tun->dev); |
187 | tun->tfile = NULL; | 189 | tun->tfile = NULL; |
188 | tun->socket.file = NULL; | ||
189 | netif_tx_unlock_bh(tun->dev); | 190 | netif_tx_unlock_bh(tun->dev); |
190 | 191 | ||
191 | /* Drop read queue */ | 192 | /* Drop read queue */ |
@@ -358,6 +359,8 @@ static void tun_free_netdev(struct net_device *dev) | |||
358 | { | 359 | { |
359 | struct tun_struct *tun = netdev_priv(dev); | 360 | struct tun_struct *tun = netdev_priv(dev); |
360 | 361 | ||
362 | BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags)); | ||
363 | |||
361 | sk_release_kernel(tun->socket.sk); | 364 | sk_release_kernel(tun->socket.sk); |
362 | } | 365 | } |
363 | 366 | ||
@@ -414,6 +417,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) | |||
414 | 417 | ||
415 | /* Orphan the skb - required as we might hang on to it | 418 | /* Orphan the skb - required as we might hang on to it |
416 | * for indefinite time. */ | 419 | * for indefinite time. */ |
420 | if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) | ||
421 | goto drop; | ||
417 | skb_orphan(skb); | 422 | skb_orphan(skb); |
418 | 423 | ||
419 | /* Enqueue packet */ | 424 | /* Enqueue packet */ |
@@ -600,19 +605,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun, | |||
600 | return skb; | 605 | return skb; |
601 | } | 606 | } |
602 | 607 | ||
608 | /* set skb frags from iovec, this can move to core network code for reuse */ | ||
609 | static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, | ||
610 | int offset, size_t count) | ||
611 | { | ||
612 | int len = iov_length(from, count) - offset; | ||
613 | int copy = skb_headlen(skb); | ||
614 | int size, offset1 = 0; | ||
615 | int i = 0; | ||
616 | |||
617 | /* Skip over from offset */ | ||
618 | while (count && (offset >= from->iov_len)) { | ||
619 | offset -= from->iov_len; | ||
620 | ++from; | ||
621 | --count; | ||
622 | } | ||
623 | |||
624 | /* copy up to skb headlen */ | ||
625 | while (count && (copy > 0)) { | ||
626 | size = min_t(unsigned int, copy, from->iov_len - offset); | ||
627 | if (copy_from_user(skb->data + offset1, from->iov_base + offset, | ||
628 | size)) | ||
629 | return -EFAULT; | ||
630 | if (copy > size) { | ||
631 | ++from; | ||
632 | --count; | ||
633 | offset = 0; | ||
634 | } else | ||
635 | offset += size; | ||
636 | copy -= size; | ||
637 | offset1 += size; | ||
638 | } | ||
639 | |||
640 | if (len == offset1) | ||
641 | return 0; | ||
642 | |||
643 | while (count--) { | ||
644 | struct page *page[MAX_SKB_FRAGS]; | ||
645 | int num_pages; | ||
646 | unsigned long base; | ||
647 | unsigned long truesize; | ||
648 | |||
649 | len = from->iov_len - offset; | ||
650 | if (!len) { | ||
651 | offset = 0; | ||
652 | ++from; | ||
653 | continue; | ||
654 | } | ||
655 | base = (unsigned long)from->iov_base + offset; | ||
656 | size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; | ||
657 | if (i + size > MAX_SKB_FRAGS) | ||
658 | return -EMSGSIZE; | ||
659 | num_pages = get_user_pages_fast(base, size, 0, &page[i]); | ||
660 | if (num_pages != size) { | ||
661 | for (i = 0; i < num_pages; i++) | ||
662 | put_page(page[i]); | ||
663 | return -EFAULT; | ||
664 | } | ||
665 | truesize = size * PAGE_SIZE; | ||
666 | skb->data_len += len; | ||
667 | skb->len += len; | ||
668 | skb->truesize += truesize; | ||
669 | atomic_add(truesize, &skb->sk->sk_wmem_alloc); | ||
670 | while (len) { | ||
671 | int off = base & ~PAGE_MASK; | ||
672 | int size = min_t(int, len, PAGE_SIZE - off); | ||
673 | __skb_fill_page_desc(skb, i, page[i], off, size); | ||
674 | skb_shinfo(skb)->nr_frags++; | ||
675 | /* increase sk_wmem_alloc */ | ||
676 | base += size; | ||
677 | len -= size; | ||
678 | i++; | ||
679 | } | ||
680 | offset = 0; | ||
681 | ++from; | ||
682 | } | ||
683 | return 0; | ||
684 | } | ||
685 | |||
603 | /* Get packet from user space buffer */ | 686 | /* Get packet from user space buffer */ |
604 | static ssize_t tun_get_user(struct tun_struct *tun, | 687 | static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control, |
605 | const struct iovec *iv, size_t count, | 688 | const struct iovec *iv, size_t total_len, |
606 | int noblock) | 689 | size_t count, int noblock) |
607 | { | 690 | { |
608 | struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; | 691 | struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; |
609 | struct sk_buff *skb; | 692 | struct sk_buff *skb; |
610 | size_t len = count, align = NET_SKB_PAD; | 693 | size_t len = total_len, align = NET_SKB_PAD; |
611 | struct virtio_net_hdr gso = { 0 }; | 694 | struct virtio_net_hdr gso = { 0 }; |
612 | int offset = 0; | 695 | int offset = 0; |
696 | int copylen; | ||
697 | bool zerocopy = false; | ||
698 | int err; | ||
613 | 699 | ||
614 | if (!(tun->flags & TUN_NO_PI)) { | 700 | if (!(tun->flags & TUN_NO_PI)) { |
615 | if ((len -= sizeof(pi)) > count) | 701 | if ((len -= sizeof(pi)) > total_len) |
616 | return -EINVAL; | 702 | return -EINVAL; |
617 | 703 | ||
618 | if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) | 704 | if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) |
@@ -621,7 +707,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, | |||
621 | } | 707 | } |
622 | 708 | ||
623 | if (tun->flags & TUN_VNET_HDR) { | 709 | if (tun->flags & TUN_VNET_HDR) { |
624 | if ((len -= tun->vnet_hdr_sz) > count) | 710 | if ((len -= tun->vnet_hdr_sz) > total_len) |
625 | return -EINVAL; | 711 | return -EINVAL; |
626 | 712 | ||
627 | if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) | 713 | if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) |
@@ -643,14 +729,46 @@ static ssize_t tun_get_user(struct tun_struct *tun, | |||
643 | return -EINVAL; | 729 | return -EINVAL; |
644 | } | 730 | } |
645 | 731 | ||
646 | skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); | 732 | if (msg_control) |
733 | zerocopy = true; | ||
734 | |||
735 | if (zerocopy) { | ||
736 | /* Userspace may produce vectors with count greater than | ||
737 | * MAX_SKB_FRAGS, so we need to linearize parts of the skb | ||
738 | * to let the rest of data to be fit in the frags. | ||
739 | */ | ||
740 | if (count > MAX_SKB_FRAGS) { | ||
741 | copylen = iov_length(iv, count - MAX_SKB_FRAGS); | ||
742 | if (copylen < offset) | ||
743 | copylen = 0; | ||
744 | else | ||
745 | copylen -= offset; | ||
746 | } else | ||
747 | copylen = 0; | ||
748 | /* By default GOODCOPY_LEN (128) bytes are copied into the skb, so | ||
749 | * there is enough room for skb expand head in case it is used. | ||
750 | * The rest of the buffer is mapped from userspace. | ||
751 | */ | ||
752 | if (copylen < gso.hdr_len) | ||
753 | copylen = gso.hdr_len; | ||
754 | if (!copylen) | ||
755 | copylen = GOODCOPY_LEN; | ||
756 | } else | ||
757 | copylen = len; | ||
758 | |||
759 | skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock); | ||
647 | if (IS_ERR(skb)) { | 760 | if (IS_ERR(skb)) { |
648 | if (PTR_ERR(skb) != -EAGAIN) | 761 | if (PTR_ERR(skb) != -EAGAIN) |
649 | tun->dev->stats.rx_dropped++; | 762 | tun->dev->stats.rx_dropped++; |
650 | return PTR_ERR(skb); | 763 | return PTR_ERR(skb); |
651 | } | 764 | } |
652 | 765 | ||
653 | if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { | 766 | if (zerocopy) |
767 | err = zerocopy_sg_from_iovec(skb, iv, offset, count); | ||
768 | else | ||
769 | err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); | ||
770 | |||
771 | if (err) { | ||
654 | tun->dev->stats.rx_dropped++; | 772 | tun->dev->stats.rx_dropped++; |
655 | kfree_skb(skb); | 773 | kfree_skb(skb); |
656 | return -EFAULT; | 774 | return -EFAULT; |
@@ -724,12 +842,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, | |||
724 | skb_shinfo(skb)->gso_segs = 0; | 842 | skb_shinfo(skb)->gso_segs = 0; |
725 | } | 843 | } |
726 | 844 | ||
845 | /* copy skb_ubuf_info for callback when skb has no error */ | ||
846 | if (zerocopy) { | ||
847 | skb_shinfo(skb)->destructor_arg = msg_control; | ||
848 | skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; | ||
849 | } | ||
850 | |||
727 | netif_rx_ni(skb); | 851 | netif_rx_ni(skb); |
728 | 852 | ||
729 | tun->dev->stats.rx_packets++; | 853 | tun->dev->stats.rx_packets++; |
730 | tun->dev->stats.rx_bytes += len; | 854 | tun->dev->stats.rx_bytes += len; |
731 | 855 | ||
732 | return count; | 856 | return total_len; |
733 | } | 857 | } |
734 | 858 | ||
735 | static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, | 859 | static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, |
@@ -744,7 +868,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, | |||
744 | 868 | ||
745 | tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); | 869 | tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); |
746 | 870 | ||
747 | result = tun_get_user(tun, iv, iov_length(iv, count), | 871 | result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count, |
748 | file->f_flags & O_NONBLOCK); | 872 | file->f_flags & O_NONBLOCK); |
749 | 873 | ||
750 | tun_put(tun); | 874 | tun_put(tun); |
@@ -958,8 +1082,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, | |||
958 | struct msghdr *m, size_t total_len) | 1082 | struct msghdr *m, size_t total_len) |
959 | { | 1083 | { |
960 | struct tun_struct *tun = container_of(sock, struct tun_struct, socket); | 1084 | struct tun_struct *tun = container_of(sock, struct tun_struct, socket); |
961 | return tun_get_user(tun, m->msg_iov, total_len, | 1085 | return tun_get_user(tun, m->msg_control, m->msg_iov, total_len, |
962 | m->msg_flags & MSG_DONTWAIT); | 1086 | m->msg_iovlen, m->msg_flags & MSG_DONTWAIT); |
963 | } | 1087 | } |
964 | 1088 | ||
965 | static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, | 1089 | static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, |
@@ -1115,6 +1239,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) | |||
1115 | tun->flags = flags; | 1239 | tun->flags = flags; |
1116 | tun->txflt.count = 0; | 1240 | tun->txflt.count = 0; |
1117 | tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); | 1241 | tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); |
1242 | set_bit(SOCK_EXTERNALLY_ALLOCATED, &tun->socket.flags); | ||
1118 | 1243 | ||
1119 | err = -ENOMEM; | 1244 | err = -ENOMEM; |
1120 | sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto); | 1245 | sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto); |
@@ -1128,6 +1253,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) | |||
1128 | sock_init_data(&tun->socket, sk); | 1253 | sock_init_data(&tun->socket, sk); |
1129 | sk->sk_write_space = tun_sock_write_space; | 1254 | sk->sk_write_space = tun_sock_write_space; |
1130 | sk->sk_sndbuf = INT_MAX; | 1255 | sk->sk_sndbuf = INT_MAX; |
1256 | sock_set_flag(sk, SOCK_ZEROCOPY); | ||
1131 | 1257 | ||
1132 | tun_sk(sk)->tun = tun; | 1258 | tun_sk(sk)->tun = tun; |
1133 | 1259 | ||
@@ -1252,10 +1378,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, | |||
1252 | int vnet_hdr_sz; | 1378 | int vnet_hdr_sz; |
1253 | int ret; | 1379 | int ret; |
1254 | 1380 | ||
1255 | if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) | 1381 | if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) { |
1256 | if (copy_from_user(&ifr, argp, ifreq_len)) | 1382 | if (copy_from_user(&ifr, argp, ifreq_len)) |
1257 | return -EFAULT; | 1383 | return -EFAULT; |
1258 | 1384 | } else { | |
1385 | memset(&ifr, 0, sizeof(ifr)); | ||
1386 | } | ||
1259 | if (cmd == TUNGETFEATURES) { | 1387 | if (cmd == TUNGETFEATURES) { |
1260 | /* Currently this just means: "what IFF flags are valid?". | 1388 | /* Currently this just means: "what IFF flags are valid?". |
1261 | * This is needed because we never checked for invalid flags on | 1389 | * This is needed because we never checked for invalid flags on |