Diffstat (limited to 'drivers/vhost/net.c')
 -rw-r--r--  drivers/vhost/net.c  661
 1 file changed, 661 insertions, 0 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
new file mode 100644
index 000000000000..4c8928319e1d
--- /dev/null
+++ b/drivers/vhost/net.c
@@ -0,0 +1,661 @@
/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/file.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>

#include <net/sock.h>

#include "vhost.h"

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others
 * (0x80000 = 512KiB). */
#define VHOST_NET_WEIGHT 0x80000

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

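/* TX backpressure state machine: DISABLED while no backend socket is
 * attached; STOPPED while the ring is being serviced normally; STARTED
 * when the socket send buffer filled and we are polling the socket for
 * write space (see tx_poll_start()/tx_poll_stop() below). */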
enum vhost_net_poll_state {
	VHOST_NET_POLL_DISABLED = 0,
	VHOST_NET_POLL_STARTED = 1,
	VHOST_NET_POLL_STOPPED = 2,
};

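/* Note on synchronization: vq->private_data holds the backend socket.
 * Writers take the vq mutex and publish with rcu_assign_pointer();
 * readers run from the vhost workqueue and use rcu_dereference(), with
 * vhost_poll_flush() standing in for a grace period (see the comments
 * above handle_tx()/handle_rx()). */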
struct vhost_net {
	struct vhost_dev dev;
	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Tells us whether we are polling a socket for TX.
	 * We only do this when socket buffer fills up.
	 * Protected by tx vq lock. */
	enum vhost_net_poll_state tx_poll_state;
};

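/* For example, with len = 10 and 'from' iovecs of 8 and 12 bytes,
 * move_iovec_hdr() below copies two segments (8 + 2 bytes) into 'to',
 * advances 'from' to start 2 bytes into its second segment, and
 * returns 2. */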
/* Pop first len bytes from iovec. Return number of segments used. */
static int move_iovec_hdr(struct iovec *from, struct iovec *to,
			  size_t len, int iov_count)
{
	int seg = 0;
	size_t size;
	while (len && seg < iov_count) {
		size = min(from->iov_len, len);
		to->iov_base = from->iov_base;
		to->iov_len = size;
		from->iov_len -= size;
		from->iov_base += size;
		len -= size;
		++from;
		++to;
		++seg;
	}
	return seg;
}

/* Caller must have TX VQ lock */
static void tx_poll_stop(struct vhost_net *net)
{
	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
		return;
	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
}

/* Caller must have TX VQ lock */
static void tx_poll_start(struct vhost_net *net, struct socket *sock)
{
	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
		return;
	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
	net->tx_poll_state = VHOST_NET_POLL_STARTED;
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
	unsigned head, out, in, s;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err, wmem;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);
	if (!sock)
		return;

	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	if (wmem >= sock->sk->sk_sndbuf)
		return;

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);

	if (wmem < sock->sk->sk_sndbuf / 2)
		tx_poll_stop(net);
	hdr_size = vq->hdr_size;

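	/* Pull available descriptors off the TX ring and hand each packet
	 * to the backend socket, until the ring drains, the send buffer
	 * fills, an error occurs, or we hit VHOST_NET_WEIGHT. */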
	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 NULL, NULL);
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			wmem = atomic_read(&sock->sk->sk_wmem_alloc);
			if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
				tx_poll_start(net, sock);
				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
				break;
			}
			if (unlikely(vhost_enable_notify(vq))) {
				vhost_disable_notify(vq);
				continue;
			}
			break;
		}
		if (in) {
			vq_err(vq, "Unexpected descriptor format for TX: "
			       "out %d, in %d\n", out, in);
			break;
		}
		/* Skip header. TODO: support TSO. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
		msg.msg_iovlen = out;
		len = iov_length(vq->iov, out);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for TX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		/* TODO: Check specific error and bomb out unless ENOBUFS? */
		err = sock->ops->sendmsg(NULL, sock, &msg, len);
		if (unlikely(err < 0)) {
			vhost_discard_vq_desc(vq);
			tx_poll_start(net, sock);
			break;
		}
		if (err != len)
			pr_err("Truncated TX packet: "
			       " len %d != %zd\n", err, len);
		vhost_add_used_and_signal(&net->dev, vq, head, 0);
		total_len += len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
	unsigned head, out, in, log, s;
	struct vhost_log *vq_log;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
		.msg_controllen = 0,
		.msg_iov = vq->iov,
		.msg_flags = MSG_DONTWAIT,
	};

	struct virtio_net_hdr hdr = {
		.flags = 0,
		.gso_type = VIRTIO_NET_HDR_GSO_NONE
	};
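	/* The guest expects a virtio_net_hdr in front of each received
	 * packet when VHOST_NET_F_VIRTIO_NET_HDR was negotiated; the
	 * socket hands us only the raw frame, so this benign header is
	 * copied into the ring ahead of the data below. */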

	size_t len, total_len = 0;
	int err;
	size_t hdr_size;
	struct socket *sock = rcu_dereference(vq->private_data);
	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
		return;

	use_mm(net->dev.mm);
	mutex_lock(&vq->mutex);
	vhost_disable_notify(vq);
	hdr_size = vq->hdr_size;

	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
		vq->log : NULL;

	for (;;) {
		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
					 ARRAY_SIZE(vq->iov),
					 &out, &in,
					 vq_log, &log);
		/* OK, now we need to know about added descriptors. */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(vq))) {
				/* They have slipped one in as we were
				 * doing that: check again. */
				vhost_disable_notify(vq);
				continue;
			}
			/* Nothing new? Wait for eventfd to tell us
			 * they refilled. */
			break;
		}
		/* We don't need to be notified again. */
		if (out) {
			vq_err(vq, "Unexpected descriptor format for RX: "
			       "out %d, in %d\n",
			       out, in);
			break;
		}
		/* Skip header. TODO: support TSO/mergeable rx buffers. */
		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
		msg.msg_iovlen = in;
		len = iov_length(vq->iov, in);
		/* Sanity check */
		if (!len) {
			vq_err(vq, "Unexpected header len for RX: "
			       "%zd expected %zd\n",
			       iov_length(vq->hdr, s), hdr_size);
			break;
		}
		err = sock->ops->recvmsg(NULL, sock, &msg,
					 len, MSG_DONTWAIT | MSG_TRUNC);
		/* TODO: Check specific error and bomb out unless EAGAIN? */
		if (err < 0) {
			vhost_discard_vq_desc(vq);
			break;
		}
		/* TODO: Should check and handle checksum. */
		if (err > len) {
			pr_err("Discarded truncated rx packet: "
			       " len %d > %zd\n", err, len);
			vhost_discard_vq_desc(vq);
			continue;
		}
		len = err;
		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
		if (err) {
			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
			       vq->iov->iov_base, err);
			break;
		}
		len += hdr_size;
		vhost_add_used_and_signal(&net->dev, vq, head, len);
		if (unlikely(vq_log))
			vhost_log_write(vq, vq_log, log, len);
		total_len += len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
			vhost_poll_queue(&vq->poll);
			break;
		}
	}

	mutex_unlock(&vq->mutex);
	unuse_mm(net->dev.mm);
}

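/* Each virtqueue has two wakeup sources: a guest kick on the ring's
 * eventfd (handle_*_kick, polled via vq->poll) and readiness of the
 * backend socket (handle_*_net, polled via net->poll). Both funnel
 * into the same handle_tx()/handle_rx() workers above. */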
static void handle_tx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq;
	struct vhost_net *net;
	vq = container_of(work, struct vhost_virtqueue, poll.work);
	net = container_of(vq->dev, struct vhost_net, dev);
	handle_tx(net);
}

static void handle_rx_kick(struct work_struct *work)
{
	struct vhost_virtqueue *vq;
	struct vhost_net *net;
	vq = container_of(work, struct vhost_virtqueue, poll.work);
	net = container_of(vq->dev, struct vhost_net, dev);
	handle_rx(net);
}

static void handle_tx_net(struct work_struct *work)
{
	struct vhost_net *net;
	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
	handle_tx(net);
}

static void handle_rx_net(struct work_struct *work)
{
	struct vhost_net *net;
	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
	handle_rx(net);
}

static int vhost_net_open(struct inode *inode, struct file *f)
{
	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
	int r;
	if (!n)
		return -ENOMEM;
	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
	if (r < 0) {
		kfree(n);
		return r;
	}

	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

	f->private_data = n;

	return 0;
}

static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	if (!vq->private_data)
		return;
	if (vq == n->vqs + VHOST_NET_VQ_TX) {
		tx_poll_stop(n);
		n->tx_poll_state = VHOST_NET_POLL_DISABLED;
	} else
		vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
}

static void vhost_net_enable_vq(struct vhost_net *n,
				struct vhost_virtqueue *vq)
{
	struct socket *sock = vq->private_data;
	if (!sock)
		return;
	if (vq == n->vqs + VHOST_NET_VQ_TX) {
		n->tx_poll_state = VHOST_NET_POLL_STOPPED;
		tx_poll_start(n, sock);
	} else
		vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
}

static struct socket *vhost_net_stop_vq(struct vhost_net *n,
					struct vhost_virtqueue *vq)
{
	struct socket *sock;

	mutex_lock(&vq->mutex);
	sock = vq->private_data;
	vhost_net_disable_vq(n, vq);
	rcu_assign_pointer(vq->private_data, NULL);
	mutex_unlock(&vq->mutex);
	return sock;
}

static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
			   struct socket **rx_sock)
{
	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
}

static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
	vhost_poll_flush(n->poll + index);
	vhost_poll_flush(&n->dev.vqs[index].poll);
}

static void vhost_net_flush(struct vhost_net *n)
{
	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
}

static int vhost_net_release(struct inode *inode, struct file *f)
{
	struct vhost_net *n = f->private_data;
	struct socket *tx_sock;
	struct socket *rx_sock;

	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	vhost_dev_cleanup(&n->dev);
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	/* We do an extra flush before freeing memory,
	 * since jobs can re-queue themselves. */
	vhost_net_flush(n);
	kfree(n);
	return 0;
}

static struct socket *get_raw_socket(int fd)
{
	struct {
		struct sockaddr_ll sa;
		char buf[MAX_ADDR_LEN];
	} uaddr;
	int uaddr_len = sizeof uaddr, r;
	struct socket *sock = sockfd_lookup(fd, &r);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	/* Parameter checking */
	if (sock->sk->sk_type != SOCK_RAW) {
		r = -ESOCKTNOSUPPORT;
		goto err;
	}

	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
			       &uaddr_len, 0);
	if (r)
		goto err;

	if (uaddr.sa.sll_family != AF_PACKET) {
		r = -EPFNOSUPPORT;
		goto err;
	}
	return sock;
err:
	fput(sock->file);
	return ERR_PTR(r);
}

static struct socket *get_tun_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;
	if (!file)
		return ERR_PTR(-EBADF);
	sock = tun_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	return sock;
}

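/* A backend fd may name either an AF_PACKET raw socket or a tun/tap
 * device; try each in turn. fd == -1 detaches the current backend. */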
static struct socket *get_socket(int fd)
{
	struct socket *sock;
	/* special case to disable backend */
	if (fd == -1)
		return NULL;
	sock = get_raw_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	sock = get_tun_socket(fd);
	if (!IS_ERR(sock))
		return sock;
	return ERR_PTR(-ENOTSOCK);
}

static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
	struct socket *sock, *oldsock;
	struct vhost_virtqueue *vq;
	int r;

	mutex_lock(&n->dev.mutex);
	r = vhost_dev_check_owner(&n->dev);
	if (r)
		goto err;

	if (index >= VHOST_NET_VQ_MAX) {
		r = -ENOBUFS;
		goto err;
	}
	vq = n->vqs + index;
	mutex_lock(&vq->mutex);

	/* Verify that ring has been setup correctly. */
	if (!vhost_vq_access_ok(vq)) {
		r = -EFAULT;
		goto err_vq;
	}
	sock = get_socket(fd);
	if (IS_ERR(sock)) {
		r = PTR_ERR(sock);
		goto err_vq;
	}

	/* start polling new socket */
	oldsock = vq->private_data;
	if (sock != oldsock) {
		vhost_net_disable_vq(n, vq);
		rcu_assign_pointer(vq->private_data, sock);
		vhost_net_enable_vq(n, vq);
	}
	mutex_unlock(&vq->mutex);

	if (oldsock) {
		vhost_net_flush_vq(n, index);
		fput(oldsock->file);
	}
	mutex_unlock(&n->dev.mutex);
	return 0;

err_vq:
	mutex_unlock(&vq->mutex);
err:
	mutex_unlock(&n->dev.mutex);
	return r;
}

static long vhost_net_reset_owner(struct vhost_net *n)
{
	struct socket *tx_sock = NULL;
	struct socket *rx_sock = NULL;
	long err;
	mutex_lock(&n->dev.mutex);
	err = vhost_dev_check_owner(&n->dev);
	if (err)
		goto done;
	vhost_net_stop(n, &tx_sock, &rx_sock);
	vhost_net_flush(n);
	err = vhost_dev_reset_owner(&n->dev);
done:
	mutex_unlock(&n->dev.mutex);
	if (tx_sock)
		fput(tx_sock->file);
	if (rx_sock)
		fput(rx_sock->file);
	return err;
}

static int vhost_net_set_features(struct vhost_net *n, u64 features)
{
	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
		sizeof(struct virtio_net_hdr) : 0;
	int i;
	mutex_lock(&n->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&n->dev)) {
		mutex_unlock(&n->dev.mutex);
		return -EFAULT;
	}
	n->dev.acked_features = features;
	smp_wmb();
	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		mutex_lock(&n->vqs[i].mutex);
		n->vqs[i].hdr_size = hdr_size;
		mutex_unlock(&n->vqs[i].mutex);
	}
	vhost_net_flush(n);
	mutex_unlock(&n->dev.mutex);
	return 0;
}

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
			    unsigned long arg)
{
	struct vhost_net *n = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 __user *featurep = argp;
	struct vhost_vring_file backend;
	u64 features;
	int r;
	switch (ioctl) {
	case VHOST_NET_SET_BACKEND:
		/* copy_from_user() returns the number of bytes left to
		 * copy, never a negative value, so test for non-zero. */
		if (copy_from_user(&backend, argp, sizeof backend))
			return -EFAULT;
		return vhost_net_set_backend(n, backend.index, backend.fd);
	case VHOST_GET_FEATURES:
		features = VHOST_FEATURES;
		if (copy_to_user(featurep, &features, sizeof features))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, featurep, sizeof features))
			return -EFAULT;
		if (features & ~VHOST_FEATURES)
			return -EOPNOTSUPP;
		return vhost_net_set_features(n, features);
	case VHOST_RESET_OWNER:
		return vhost_net_reset_owner(n);
	default:
		mutex_lock(&n->dev.mutex);
		r = vhost_dev_ioctl(&n->dev, ioctl, arg);
		vhost_net_flush(n);
		mutex_unlock(&n->dev.mutex);
		return r;
	}
}
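
/* A minimal userspace sketch of driving this ioctl interface
 * (illustrative only: error handling and ring setup via the
 * VHOST_SET_VRING_* ioctls are omitted, and tap_fd is assumed to be
 * an already-open tap device):
 *
 *	int vhost = open("/dev/vhost-net", O_RDWR);
 *	struct vhost_vring_file backend = {
 *		.index = VHOST_NET_VQ_TX, .fd = tap_fd };
 *	ioctl(vhost, VHOST_SET_OWNER, NULL);
 *	ioctl(vhost, VHOST_NET_SET_BACKEND, &backend);
 *
 * VHOST_SET_OWNER is handled by the generic vhost_dev_ioctl() in the
 * default case above. */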

#ifdef CONFIG_COMPAT
static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
				   unsigned long arg)
{
	return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
}
#endif

static const struct file_operations vhost_net_fops = {
	.owner = THIS_MODULE,
	.release = vhost_net_release,
	.unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vhost_net_compat_ioctl,
#endif
	.open = vhost_net_open,
};

static struct miscdevice vhost_net_misc = {
	.minor = VHOST_NET_MINOR,
	.name = "vhost-net",
	.fops = &vhost_net_fops,
};

static int __init vhost_net_init(void)
{
	int r = vhost_init();
	if (r)
		goto err_init;
	r = misc_register(&vhost_net_misc);
	if (r)
		goto err_reg;
	return 0;
err_reg:
	vhost_cleanup();
err_init:
	return r;
}
module_init(vhost_net_init);

static void __exit vhost_net_exit(void)
{
	misc_deregister(&vhost_net_misc);
	vhost_cleanup();
}
module_exit(vhost_net_exit);

MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio net");