vhost_net: a kernel-level virtio server

What it is: vhost net is a character device that can be used to reduce the number of system calls involved in virtio networking. Existing virtio net code is used in the guest without modification. There's similarity with vringfd, with some differences and reduced scope: - uses eventfd for signalling - structures can be moved around in memory at any time (good for migration, bug work-arounds in userspace) - write logging is supported (good for migration) - support memory table and not just an offset (needed for kvm). Common virtio related code has been put in a separate file vhost.c and can be made into a separate module if/when more backends appear. I used Rusty's lguest.c as the source for developing this part: this supplied me with witty comments I wouldn't be able to write myself. What it is not: vhost net is not a bus, and not a generic new system call. No assumptions are made on how guest performs hypercalls. Userspace hypervisors are supported as well as kvm. How it works: Basically, we connect virtio frontend (configured by userspace) to a backend. The backend could be a network device, or a tap device. Backend is also configured by userspace, including vlan/mac etc. Status: This works for me, and I haven't seen any crashes. Compared to userspace, people reported improved latency (as I save up to 4 system calls per packet), as well as better bandwidth and CPU utilization. Features that I plan to look at in the future: - mergeable buffers - zero copy - scalability tuning: figure out the best threading model to use. Note on RCU usage (this is also documented in vhost.h, near private_pointer which is the value protected by this variant of RCU): what is happening is that the rcu_dereference() is being used in a workqueue item. The role of rcu_read_lock() is taken on by the start of execution of the workqueue item, of rcu_read_unlock() by the end of execution of the workqueue item, and of synchronize_rcu() by flush_workqueue()/flush_work(). 
In the future we might need to apply some gcc attribute or sparse annotation to the function passed to INIT_WORK(). Paul's ack below is for this RCU usage. (Includes fixes by Alan Cox <alan@linux.intel.com>, David L Stevens <dlstevens@us.ibm.com>, Chris Wright <chrisw@redhat.com>) Acked-by: Rusty Russell <rusty@rustcorp.com.au> Acked-by: Arnd Bergmann <arnd@arndb.de> Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Michael S. Tsirkin <mst@redhat.com> 2010-01-14 01:17:27 -0500
committer: David S. Miller <davem@davemloft.net> 2010-01-15 04:43:29 -0500
commit: 3a4d5c94e959359ece6d6b55045c3f046677f55c (patch)
tree: 113cfe31540e3d77925837f6990c3284d425bfd1 /drivers/vhost/net.c
parent: 5da779c34ccff5e1e617892b6c8bd8260fb1f04c (diff)
1 files changed, 661 insertions, 0 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
new file mode 100644
index 000000000000..4c8928319e1d
--- /dev/null
+++ b/drivers/vhost/net.c
@@ -0,0 +1,661 @@
+/* Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * virtio-net server in host kernel.
+ */
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/rcupdate.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/if_packet.h>
+#include <linux/if_arp.h>
+#include <linux/if_tun.h>
+#include <net/sock.h>
+#include "vhost.h"
+/* Max number of bytes transferred before requeueing the job.
+ * Using this limit prevents one virtqueue from starving others. */
+#define VHOST_NET_WEIGHT 0x80000
+enum {
+        VHOST_NET_VQ_RX = 0, /* index of the receive virtqueue */
+        VHOST_NET_VQ_TX = 1, /* index of the transmit virtqueue */
+        VHOST_NET_VQ_MAX = 2, /* number of virtqueues; sizes vqs[] and poll[] */
+};
+enum vhost_net_poll_state {
+        VHOST_NET_POLL_DISABLED = 0, /* set at open and when the TX vq is disabled */
+        VHOST_NET_POLL_STARTED = 1, /* set by tx_poll_start() after vhost_poll_start() */
+        VHOST_NET_POLL_STOPPED = 2, /* set by tx_poll_stop(), and on enable before starting */
+};
+struct vhost_net {
+        struct vhost_dev dev; /* generic vhost device; handlers get back to vhost_net via container_of() */
+        struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; /* RX/TX virtqueues passed to vhost_dev_init() */
+        struct vhost_poll poll[VHOST_NET_VQ_MAX]; /* per-queue poll on the backend socket's file */
+        /* Tells us whether we are polling a socket for TX.
+         * We only do this when socket buffer fills up.
+         * Protected by tx vq lock. */
+        enum vhost_net_poll_state tx_poll_state;
+};
+/* Pop the first len bytes off the iovec array "from".
+ * Each consumed piece is recorded in the matching slot of "to" (same base,
+ * clipped length) and the source entry is advanced past it. At most
+ * iov_count entries are visited.
+ * Returns the number of segments used. */
+static int move_iovec_hdr(struct iovec *from, struct iovec *to,
+                          size_t len, int iov_count)
+{
+        int used;
+        for (used = 0; len && used < iov_count; ++used) {
+                struct iovec *src = from + used;
+                struct iovec *dst = to + used;
+                size_t chunk = min(src->iov_len, len);
+                /* Describe the popped bytes in the destination... */
+                dst->iov_base = src->iov_base;
+                dst->iov_len = chunk;
+                /* ...and strip them from the front of the source. */
+                src->iov_len -= chunk;
+                src->iov_base += chunk;
+                len -= chunk;
+        }
+        return used;
+}
+/* Stop polling the backend socket for TX, if polling is currently started.
+ * Leaves the state untouched in every other case.
+ * Caller must have TX VQ lock. */
+static void tx_poll_stop(struct vhost_net *net)
+{
+        if (unlikely(net->tx_poll_state == VHOST_NET_POLL_STARTED)) {
+                vhost_poll_stop(&net->poll[VHOST_NET_VQ_TX]);
+                net->tx_poll_state = VHOST_NET_POLL_STOPPED;
+        }
+}
+/* Start polling sock's file for TX, but only from the STOPPED state;
+ * a DISABLED or already STARTED poller is left alone.
+ * Caller must have TX VQ lock. */
+static void tx_poll_start(struct vhost_net *net, struct socket *sock)
+{
+        if (likely(net->tx_poll_state == VHOST_NET_POLL_STOPPED)) {
+                vhost_poll_start(&net->poll[VHOST_NET_VQ_TX], sock->file);
+                net->tx_poll_state = VHOST_NET_POLL_STARTED;
+        }
+}
+/* Transmit path: feed guest TX descriptors to the backend socket.
+ *
+ * Expects to be always run from workqueue - which acts as
+ * read-side critical section for our kind of RCU.
+ *
+ * Returns at once when no backend socket is attached or when the socket's
+ * send allocation has already reached sk_sndbuf. Otherwise it takes the
+ * TX vq mutex and sends one packet per descriptor chain with a
+ * non-blocking sendmsg(). The loop ends when the ring is empty, when a
+ * descriptor is malformed, when sendmsg() fails, or once VHOST_NET_WEIGHT
+ * bytes have been sent (the job is then requeued). */
+static void handle_tx(struct vhost_net *net)
+{
+        struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+        unsigned head, out, in, s;
+        struct msghdr msg = {
+                .msg_name = NULL,
+                .msg_namelen = 0,
+                .msg_control = NULL,
+                .msg_controllen = 0,
+                .msg_iov = vq->iov,
+                .msg_flags = MSG_DONTWAIT,
+        };
+        size_t len, total_len = 0;
+        int err, wmem;
+        size_t hdr_size;
+        struct socket *sock = rcu_dereference(vq->private_data);
+        if (!sock)
+                return;
+        wmem = atomic_read(&sock->sk->sk_wmem_alloc);
+        if (wmem >= sock->sk->sk_sndbuf)
+                return;
+        use_mm(net->dev.mm);
+        mutex_lock(&vq->mutex);
+        vhost_disable_notify(vq);
+        /* NOTE(review): wmem < sk_sndbuf was established just above, so for
+         * a positive sk_sndbuf this test always passes and polling is always
+         * stopped here. Possibly "/ 2" was intended - confirm before
+         * changing, as it alters when the socket poller stays armed. */
+        if (wmem < sock->sk->sk_sndbuf * 2)
+                tx_poll_stop(net);
+        hdr_size = vq->hdr_size;
+        for (;;) {
+                head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
+                                         ARRAY_SIZE(vq->iov),
+                                         &out, &in,
+                                         NULL, NULL);
+                /* Nothing new?  Wait for eventfd to tell us they refilled. */
+                if (head == vq->num) {
+                        wmem = atomic_read(&sock->sk->sk_wmem_alloc);
+                        /* Send buffer mostly full: wait for the socket
+                         * rather than for the guest. */
+                        if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
+                                tx_poll_start(net, sock);
+                                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+                                break;
+                        }
+                        /* A true return sends us round the loop again with
+                         * notifications off, to re-check the ring. */
+                        if (unlikely(vhost_enable_notify(vq))) {
+                                vhost_disable_notify(vq);
+                                continue;
+                        }
+                        break;
+                }
+                /* TX chains must be device-readable only. */
+                if (in) {
+                        vq_err(vq, "Unexpected descriptor format for TX: "
+                               "out %u, in %u\n", out, in);
+                        break;
+                }
+                /* Skip header. TODO: support TSO. */
+                s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
+                msg.msg_iovlen = out;
+                len = iov_length(vq->iov, out);
+                /* Sanity check */
+                if (!len) {
+                        vq_err(vq, "Unexpected header len for TX: "
+                               "%zu expected %zu\n",
+                               iov_length(vq->hdr, s), hdr_size);
+                        break;
+                }
+                /* TODO: Check specific error and bomb out unless ENOBUFS? */
+                err = sock->ops->sendmsg(NULL, sock, &msg, len);
+                if (unlikely(err < 0)) {
+                        /* Give the descriptor back and retry once the
+                         * socket poller fires. */
+                        vhost_discard_vq_desc(vq);
+                        tx_poll_start(net, sock);
+                        break;
+                }
+                if (err != len)
+                        pr_err("Truncated TX packet: "
+                               " len %d != %zu\n", err, len);
+                vhost_add_used_and_signal(&net->dev, vq, head, 0);
+                total_len += len;
+                /* Yield after VHOST_NET_WEIGHT bytes; the requeued job
+                 * continues where this one stopped. */
+                if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+                        vhost_poll_queue(&vq->poll);
+                        break;
+                }
+        }
+        mutex_unlock(&vq->mutex);
+        unuse_mm(net->dev.mm);
+}
+/* Receive path: fill guest RX descriptors from the backend socket.
+ *
+ * Expects to be always run from workqueue - which acts as
+ * read-side critical section for our kind of RCU.
+ *
+ * Returns at once when no backend socket is attached or its receive
+ * queue is empty. Otherwise it takes the RX vq mutex and, per descriptor
+ * chain, receives one packet with a non-blocking recvmsg() and writes a
+ * virtio_net_hdr (no flags, GSO_NONE) in front of it. The loop ends when
+ * the ring is empty, when a descriptor is malformed, when recvmsg() or
+ * the header copy fails, or once VHOST_NET_WEIGHT bytes have been
+ * delivered (the job is then requeued). */
+static void handle_rx(struct vhost_net *net)
+{
+        struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+        unsigned head, out, in, log, s;
+        struct vhost_log *vq_log;
+        struct msghdr msg = {
+                .msg_name = NULL,
+                .msg_namelen = 0,
+                .msg_control = NULL, /* FIXME: get and handle RX aux data. */
+                .msg_controllen = 0,
+                .msg_iov = vq->iov,
+                .msg_flags = MSG_DONTWAIT,
+        };
+        struct virtio_net_hdr hdr = {
+                .flags = 0,
+                .gso_type = VIRTIO_NET_HDR_GSO_NONE
+        };
+        size_t len, total_len = 0;
+        int err;
+        size_t hdr_size;
+        struct socket *sock = rcu_dereference(vq->private_data);
+        if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+                return;
+        use_mm(net->dev.mm);
+        mutex_lock(&vq->mutex);
+        vhost_disable_notify(vq);
+        hdr_size = vq->hdr_size;
+        /* Write logging is only done when VHOST_F_LOG_ALL was acked. */
+        vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+                vq->log : NULL;
+        for (;;) {
+                head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
+                                         ARRAY_SIZE(vq->iov),
+                                         &out, &in,
+                                         vq_log, &log);
+                /* OK, now we need to know about added descriptors. */
+                if (head == vq->num) {
+                        if (unlikely(vhost_enable_notify(vq))) {
+                                /* They have slipped one in as we were
+                                 * doing that: check again. */
+                                vhost_disable_notify(vq);
+                                continue;
+                        }
+                        /* Nothing new?  Wait for eventfd to tell us
+                         * they refilled. */
+                        break;
+                }
+                /* RX chains must be device-writable only. */
+                if (out) {
+                        vq_err(vq, "Unexpected descriptor format for RX: "
+                               "out %u, in %u\n",
+                               out, in);
+                        break;
+                }
+                /* Skip header. TODO: support TSO/mergeable rx buffers. */
+                s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
+                msg.msg_iovlen = in;
+                len = iov_length(vq->iov, in);
+                /* Sanity check */
+                if (!len) {
+                        vq_err(vq, "Unexpected header len for RX: "
+                               "%zu expected %zu\n",
+                               iov_length(vq->hdr, s), hdr_size);
+                        break;
+                }
+                err = sock->ops->recvmsg(NULL, sock, &msg,
+                                         len, MSG_DONTWAIT | MSG_TRUNC);
+                /* TODO: Check specific error and bomb out unless EAGAIN? */
+                if (err < 0) {
+                        vhost_discard_vq_desc(vq);
+                        break;
+                }
+                /* TODO: Should check and handle checksum. */
+                /* A result larger than the buffer is treated as a truncated
+                 * packet: drop it and reuse the descriptor. */
+                if (err > len) {
+                        pr_err("Discarded truncated rx packet: "
+                               " len %d > %zu\n", err, len);
+                        vhost_discard_vq_desc(vq);
+                        continue;
+                }
+                len = err;
+                err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
+                if (err) {
+                        vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
+                               vq->iov->iov_base, err);
+                        break;
+                }
+                /* Used length reported to the guest covers header + data. */
+                len += hdr_size;
+                vhost_add_used_and_signal(&net->dev, vq, head, len);
+                if (unlikely(vq_log))
+                        vhost_log_write(vq, vq_log, log, len);
+                total_len += len;
+                /* Yield after VHOST_NET_WEIGHT bytes; the requeued job
+                 * continues where this one stopped. */
+                if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+                        vhost_poll_queue(&vq->poll);
+                        break;
+                }
+        }
+        mutex_unlock(&vq->mutex);
+        unuse_mm(net->dev.mm);
+}
+/* Work handler installed as the TX virtqueue's handle_kick: maps the
+ * work item back to its virtqueue, then to the owning vhost_net, and
+ * runs the TX path. */
+static void handle_tx_kick(struct work_struct *work)
+{
+        struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+                                                  poll.work);
+        struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+        handle_tx(net);
+}
+/* Work handler installed as the RX virtqueue's handle_kick: maps the
+ * work item back to its virtqueue, then to the owning vhost_net, and
+ * runs the RX path. */
+static void handle_rx_kick(struct work_struct *work)
+{
+        struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+                                                  poll.work);
+        struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+        handle_rx(net);
+}
+/* Work handler for the TX socket poller (registered with POLLOUT in
+ * vhost_net_open()): recovers the vhost_net that embeds the poll entry
+ * and runs the TX path. */
+static void handle_tx_net(struct work_struct *work)
+{
+        struct vhost_net *net = container_of(work, struct vhost_net,
+                                             poll[VHOST_NET_VQ_TX].work);
+        handle_tx(net);
+}
+/* Work handler for the RX socket poller (registered with POLLIN in
+ * vhost_net_open()): recovers the vhost_net that embeds the poll entry
+ * and runs the RX path. */
+static void handle_rx_net(struct work_struct *work)
+{
+        struct vhost_net *net = container_of(work, struct vhost_net,
+                                             poll[VHOST_NET_VQ_RX].work);
+        handle_rx(net);
+}
+/* open() handler for the vhost-net character device.
+ * Allocates a vhost_net, installs the kick handlers on both virtqueues,
+ * initialises the generic vhost device and the two socket pollers, and
+ * stores the instance in f->private_data.
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the negative
+ * value returned by vhost_dev_init(). */
+static int vhost_net_open(struct inode *inode, struct file *f)
+{
+        struct vhost_net *net;
+        int ret;
+        net = kmalloc(sizeof(*net), GFP_KERNEL);
+        if (!net)
+                return -ENOMEM;
+        /* The kick handlers are set before the device is initialised. */
+        net->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
+        net->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
+        ret = vhost_dev_init(&net->dev, net->vqs, VHOST_NET_VQ_MAX);
+        if (ret < 0) {
+                kfree(net);
+                return ret;
+        }
+        vhost_poll_init(&net->poll[VHOST_NET_VQ_TX], handle_tx_net, POLLOUT);
+        vhost_poll_init(&net->poll[VHOST_NET_VQ_RX], handle_rx_net, POLLIN);
+        net->tx_poll_state = VHOST_NET_POLL_DISABLED;
+        f->private_data = net;
+        return 0;
+}
+/* Stop socket polling for vq. Does nothing when no backend is attached.
+ * For the TX queue the poll state is also forced to DISABLED. */
+static void vhost_net_disable_vq(struct vhost_net *n,
+                                 struct vhost_virtqueue *vq)
+{
+        if (!vq->private_data)
+                return;
+        if (vq != &n->vqs[VHOST_NET_VQ_TX]) {
+                vhost_poll_stop(&n->poll[VHOST_NET_VQ_RX]);
+                return;
+        }
+        tx_poll_stop(n);
+        n->tx_poll_state = VHOST_NET_POLL_DISABLED;
+}
+/* Start socket polling for vq on its attached backend socket.
+ * Does nothing when no backend is attached. For the TX queue the state
+ * is first moved to STOPPED so that tx_poll_start() will act. */
+static void vhost_net_enable_vq(struct vhost_net *n,
+                                struct vhost_virtqueue *vq)
+{
+        struct socket *backend = vq->private_data;
+        if (!backend)
+                return;
+        if (vq != &n->vqs[VHOST_NET_VQ_TX]) {
+                vhost_poll_start(&n->poll[VHOST_NET_VQ_RX], backend->file);
+                return;
+        }
+        n->tx_poll_state = VHOST_NET_POLL_STOPPED;
+        tx_poll_start(n, backend);
+}
+/* Detach the backend from vq under the vq mutex: polling is stopped and
+ * private_data is cleared.
+ * Returns the socket that was attached (NULL if none); the callers in
+ * this file fput() its file afterwards. */
+static struct socket *vhost_net_stop_vq(struct vhost_net *n,
+                                        struct vhost_virtqueue *vq)
+{
+        struct socket *detached;
+        mutex_lock(&vq->mutex);
+        detached = vq->private_data;
+        /* Must run while private_data is still set, or it is a no-op. */
+        vhost_net_disable_vq(n, vq);
+        rcu_assign_pointer(vq->private_data, NULL);
+        mutex_unlock(&vq->mutex);
+        return detached;
+}
+/* Detach the backends of both virtqueues, TX first, and hand the
+ * previously attached sockets (possibly NULL) back through
+ * tx_sock / rx_sock. */
+static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
+                           struct socket **rx_sock)
+{
+        *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX]);
+        *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX]);
+}
+/* Flush both pollers belonging to queue "index": the socket poller kept
+ * in vhost_net and the poller embedded in the virtqueue itself. */
+static void vhost_net_flush_vq(struct vhost_net *n, int index)
+{
+        vhost_poll_flush(&n->poll[index]);
+        vhost_poll_flush(&n->dev.vqs[index].poll);
+}
+/* Flush the pollers of the TX queue, then those of the RX queue. */
+static void vhost_net_flush(struct vhost_net *n)
+{
+        static const int order[] = { VHOST_NET_VQ_TX, VHOST_NET_VQ_RX };
+        int i;
+        for (i = 0; i < ARRAY_SIZE(order); ++i)
+                vhost_net_flush_vq(n, order[i]);
+}
+/* release() handler: tear down the vhost_net stored in f->private_data.
+ * Detaches both backends, flushes the handlers, cleans up the generic
+ * vhost device, drops the file references of the detached sockets and
+ * frees the instance. Always returns 0. */
+static int vhost_net_release(struct inode *inode, struct file *f)
+{
+        struct vhost_net *net = f->private_data;
+        struct socket *tx, *rx;
+        vhost_net_stop(net, &tx, &rx);
+        vhost_net_flush(net);
+        vhost_dev_cleanup(&net->dev);
+        if (tx)
+                fput(tx->file);
+        if (rx)
+                fput(rx->file);
+        /* We do an extra flush before freeing memory,
+         * since jobs can re-queue themselves. */
+        vhost_net_flush(net);
+        kfree(net);
+        return 0;
+}
+/* Look up fd as a socket and accept it only if it is a SOCK_RAW socket
+ * whose getname() reports the AF_PACKET family.
+ * Returns the socket on success; the caller is then responsible for
+ * fput() on its file. Returns ERR_PTR(-ENOTSOCK) if the lookup fails,
+ * ERR_PTR(-ESOCKTNOSUPPORT) for a non-raw socket, the getname() error,
+ * or ERR_PTR(-EPFNOSUPPORT) for another address family; in those cases
+ * the file reference has already been dropped. */
+static struct socket *get_raw_socket(int fd)
+{
+        struct {
+                struct sockaddr_ll sa;
+                char  buf[MAX_ADDR_LEN];
+        } uaddr;
+        int uaddr_len = sizeof uaddr, r;
+        struct socket *sock = sockfd_lookup(fd, &r);
+        if (!sock)
+                return ERR_PTR(-ENOTSOCK);
+        /* Parameter checking */
+        if (sock->sk->sk_type != SOCK_RAW) {
+                r = -ESOCKTNOSUPPORT;
+        } else {
+                r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
+                                       &uaddr_len, 0);
+                if (!r && uaddr.sa.sll_family != AF_PACKET)
+                        r = -EPFNOSUPPORT;
+        }
+        if (!r)
+                return sock;
+        fput(sock->file);
+        return ERR_PTR(r);
+}
+/* Resolve fd to a file and ask tun_get_socket() for its socket.
+ * Returns the socket with the file reference still held, ERR_PTR(-EBADF)
+ * if fd is not an open file, or the tun_get_socket() error (the file
+ * reference is dropped in that case). */
+static struct socket *get_tun_socket(int fd)
+{
+        struct socket *sock;
+        struct file *file = fget(fd);
+        if (!file)
+                return ERR_PTR(-EBADF);
+        sock = tun_get_socket(file);
+        if (!IS_ERR(sock))
+                return sock;
+        fput(file);
+        return sock;
+}
+/* Turn a backend fd into a socket.
+ * fd == -1 means "no backend" and yields NULL. Otherwise the fd is tried
+ * as a raw packet socket first and as a tun socket second.
+ * Returns ERR_PTR(-ENOTSOCK) when neither lookup succeeds; the specific
+ * error of the individual lookups is not propagated. */
+static struct socket *get_socket(int fd)
+{
+        struct socket *candidate;
+        /* special case to disable backend */
+        if (fd == -1)
+                return NULL;
+        candidate = get_raw_socket(fd);
+        if (IS_ERR(candidate))
+                candidate = get_tun_socket(fd);
+        if (IS_ERR(candidate))
+                return ERR_PTR(-ENOTSOCK);
+        return candidate;
+}
+/* Attach, replace or detach the backend socket of one virtqueue.
+ * @n:     device instance
+ * @index: virtqueue index, must be below VHOST_NET_VQ_MAX
+ * @fd:    raw packet socket or tun fd, or -1 to disable the backend
+ *
+ * Returns 0 on success or a negative errno: the vhost_dev_check_owner()
+ * result, -ENOBUFS for a bad index, -EFAULT if the ring fails
+ * vhost_vq_access_ok(), or the get_socket() error.
+ *
+ * Every exit taken after vq->mutex is acquired releases it again,
+ * including the failure paths and the "same socket" case. */
+static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
+{
+        struct socket *sock, *oldsock;
+        struct vhost_virtqueue *vq;
+        int r;
+        mutex_lock(&n->dev.mutex);
+        r = vhost_dev_check_owner(&n->dev);
+        if (r)
+                goto err;
+        if (index >= VHOST_NET_VQ_MAX) {
+                r = -ENOBUFS;
+                goto err;
+        }
+        vq = n->vqs + index;
+        mutex_lock(&vq->mutex);
+        /* Verify that ring has been setup correctly. */
+        if (!vhost_vq_access_ok(vq)) {
+                r = -EFAULT;
+                goto err_vq;
+        }
+        sock = get_socket(fd);
+        if (IS_ERR(sock)) {
+                r = PTR_ERR(sock);
+                goto err_vq;
+        }
+        oldsock = vq->private_data;
+        if (sock != oldsock) {
+                /* start polling new socket */
+                vhost_net_disable_vq(n, vq);
+                rcu_assign_pointer(vq->private_data, sock);
+                vhost_net_enable_vq(n, vq);
+        }
+        mutex_unlock(&vq->mutex);
+        if (oldsock) {
+                /* Let handlers that may still use the old socket finish,
+                 * then drop its file reference. When the socket did not
+                 * change this drops the extra reference get_socket()
+                 * just took. */
+                vhost_net_flush_vq(n, index);
+                fput(oldsock->file);
+        }
+        mutex_unlock(&n->dev.mutex);
+        return 0;
+err_vq:
+        mutex_unlock(&vq->mutex);
+err:
+        mutex_unlock(&n->dev.mutex);
+        return r;
+}
+/* VHOST_RESET_OWNER: detach both backends, flush the handlers and reset
+ * the owner of the generic vhost device, all under the device mutex.
+ * Returns the vhost_dev_check_owner() error if that check fails,
+ * otherwise the vhost_dev_reset_owner() result. File references of the
+ * detached sockets are dropped after the mutex is released. */
+static long vhost_net_reset_owner(struct vhost_net *n)
+{
+        struct socket *tx_sock = NULL;
+        struct socket *rx_sock = NULL;
+        long err;
+        mutex_lock(&n->dev.mutex);
+        err = vhost_dev_check_owner(&n->dev);
+        if (!err) {
+                vhost_net_stop(n, &tx_sock, &rx_sock);
+                vhost_net_flush(n);
+                err = vhost_dev_reset_owner(&n->dev);
+        }
+        mutex_unlock(&n->dev.mutex);
+        if (tx_sock)
+                fput(tx_sock->file);
+        if (rx_sock)
+                fput(rx_sock->file);
+        return err;
+}
+/* Record the feature bits acked by userspace.
+ * The per-queue header size becomes sizeof(struct virtio_net_hdr) when
+ * VHOST_NET_F_VIRTIO_NET_HDR is set and 0 otherwise.
+ * Returns 0 on success, or -EFAULT when VHOST_F_LOG_ALL is requested but
+ * vhost_log_access_ok() fails; nothing is modified in that case. */
+static int vhost_net_set_features(struct vhost_net *n, u64 features)
+{
+        size_t hdr_size = 0;
+        int ret = 0;
+        int i;
+        if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR))
+                hdr_size = sizeof(struct virtio_net_hdr);
+        mutex_lock(&n->dev.mutex);
+        if ((features & (1 << VHOST_F_LOG_ALL)) &&
+            !vhost_log_access_ok(&n->dev)) {
+                ret = -EFAULT;
+                goto unlock;
+        }
+        n->dev.acked_features = features;
+        smp_wmb();
+        /* Each queue's header size is updated under that queue's mutex. */
+        for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+                struct vhost_virtqueue *vq = &n->vqs[i];
+                mutex_lock(&vq->mutex);
+                vq->hdr_size = hdr_size;
+                mutex_unlock(&vq->mutex);
+        }
+        vhost_net_flush(n);
+unlock:
+        mutex_unlock(&n->dev.mutex);
+        return ret;
+}
+/* ioctl() entry point of the vhost-net device.
+ * @f:     file whose private_data holds the vhost_net instance
+ * @ioctl: request code
+ * @arg:   userspace pointer to the request payload
+ *
+ * VHOST_NET_SET_BACKEND, VHOST_GET_FEATURES, VHOST_SET_FEATURES and
+ * VHOST_RESET_OWNER are handled here; every other request is forwarded
+ * to vhost_dev_ioctl() under the device mutex, followed by a flush.
+ *
+ * Returns the handler's result, -EFAULT when a userspace copy fails, or
+ * -EOPNOTSUPP when VHOST_SET_FEATURES names bits outside VHOST_FEATURES.
+ *
+ * copy_from_user()/copy_to_user() return the number of bytes left
+ * uncopied, never a negative errno, so any non-zero result is mapped to
+ * -EFAULT here. */
+static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
+                            unsigned long arg)
+{
+        struct vhost_net *n = f->private_data;
+        void __user *argp = (void __user *)arg;
+        u64 __user *featurep = argp;
+        struct vhost_vring_file backend;
+        u64 features;
+        int r;
+        switch (ioctl) {
+        case VHOST_NET_SET_BACKEND:
+                if (copy_from_user(&backend, argp, sizeof backend))
+                        return -EFAULT;
+                return vhost_net_set_backend(n, backend.index, backend.fd);
+        case VHOST_GET_FEATURES:
+                features = VHOST_FEATURES;
+                if (copy_to_user(featurep, &features, sizeof features))
+                        return -EFAULT;
+                return 0;
+        case VHOST_SET_FEATURES:
+                if (copy_from_user(&features, featurep, sizeof features))
+                        return -EFAULT;
+                if (features & ~VHOST_FEATURES)
+                        return -EOPNOTSUPP;
+                return vhost_net_set_features(n, features);
+        case VHOST_RESET_OWNER:
+                return vhost_net_reset_owner(n);
+        default:
+                mutex_lock(&n->dev.mutex);
+                r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+                vhost_net_flush(n);
+                mutex_unlock(&n->dev.mutex);
+                return r;
+        }
+}
+#ifdef CONFIG_COMPAT
+/* compat ioctl() entry point: converts the argument with compat_ptr()
+ * and hands the request to vhost_net_ioctl(). */
+static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
+                                   unsigned long arg)
+{
+        unsigned long native_arg = (unsigned long)compat_ptr(arg);
+        return vhost_net_ioctl(f, ioctl, native_arg);
+}
+#endif
+/* File operations of the vhost-net character device. */
+static const struct file_operations vhost_net_fops = {
+        .owner          = THIS_MODULE,
+        .release        = vhost_net_release,
+        .unlocked_ioctl = vhost_net_ioctl,
+#ifdef CONFIG_COMPAT
+        .compat_ioctl   = vhost_net_compat_ioctl,
+#endif
+        .open           = vhost_net_open,
+};
+/* Misc device registration record for "vhost-net". */
+static struct miscdevice vhost_net_misc = {
+        .minor = VHOST_NET_MINOR,
+        .name  = "vhost-net",
+        .fops  = &vhost_net_fops,
+};
+/* Module init: run vhost_init(), then register the misc device.
+ * Returns 0 on success or the error of the step that failed; if the
+ * registration fails, vhost_cleanup() undoes the first step. */
+int vhost_net_init(void)
+{
+        int r;
+        r = vhost_init();
+        if (r)
+                return r;
+        r = misc_register(&vhost_net_misc);
+        if (r)
+                vhost_cleanup();
+        return r;
+}
+module_init(vhost_net_init);
+void vhost_net_exit(void)
+{
+        misc_deregister(&vhost_net_misc); /* undo the misc_register() done in vhost_net_init() */
+        vhost_cleanup(); /* then undo vhost_init(), mirroring the init order in reverse */
+}
+module_exit(vhost_net_exit);
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Michael S. Tsirkin");
+MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
author	Michael S. Tsirkin <mst@redhat.com>	2010-01-14 01:17:27 -0500
committer	David S. Miller <davem@davemloft.net>	2010-01-15 04:43:29 -0500
commit	3a4d5c94e959359ece6d6b55045c3f046677f55c (patch)
tree	113cfe31540e3d77925837f6990c3284d425bfd1 /drivers/vhost/net.c
parent	5da779c34ccff5e1e617892b6c8bd8260fb1f04c (diff)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c new file mode 100644 index 000000000000..4c8928319e1d --- /dev/null +++ b/drivers/vhost/net.c
@@ -0,0 +1,661 @@
	1	/* Copyright (C) 2009 Red Hat, Inc.
	2	* Author: Michael S. Tsirkin <mst@redhat.com>
	3	*
	4	* This work is licensed under the terms of the GNU GPL, version 2.
	5	*
	6	* virtio-net server in host kernel.
	7	*/
	8
	9	#include <linux/compat.h>
	10	#include <linux/eventfd.h>
	11	#include <linux/vhost.h>
	12	#include <linux/virtio_net.h>
	13	#include <linux/mmu_context.h>
	14	#include <linux/miscdevice.h>
	15	#include <linux/module.h>
	16	#include <linux/mutex.h>
	17	#include <linux/workqueue.h>
	18	#include <linux/rcupdate.h>
	19	#include <linux/file.h>
	20
	21	#include <linux/net.h>
	22	#include <linux/if_packet.h>
	23	#include <linux/if_arp.h>
	24	#include <linux/if_tun.h>
	25
	26	#include <net/sock.h>
	27
	28	#include "vhost.h"
	29
	30	/* Max number of bytes transferred before requeueing the job.
	31	* Using this limit prevents one virtqueue from starving others. */
	32	#define VHOST_NET_WEIGHT 0x80000
	33
	34	enum {
	35	VHOST_NET_VQ_RX = 0,
	36	VHOST_NET_VQ_TX = 1,
	37	VHOST_NET_VQ_MAX = 2,
	38	};
	39
	40	enum vhost_net_poll_state {
	41	VHOST_NET_POLL_DISABLED = 0,
	42	VHOST_NET_POLL_STARTED = 1,
	43	VHOST_NET_POLL_STOPPED = 2,
	44	};
	45
	46	struct vhost_net {
	47	struct vhost_dev dev;
	48	struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
	49	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	50	/* Tells us whether we are polling a socket for TX.
	51	* We only do this when socket buffer fills up.
	52	* Protected by tx vq lock. */
	53	enum vhost_net_poll_state tx_poll_state;
	54	};
	55
	56	/* Pop first len bytes from iovec. Return number of segments used. */
	57	static int move_iovec_hdr(struct iovec *from, struct iovec *to,
	58	size_t len, int iov_count)
	59	{
	60	int seg = 0;
	61	size_t size;
	62	while (len && seg < iov_count) {
	63	size = min(from->iov_len, len);
	64	to->iov_base = from->iov_base;
	65	to->iov_len = size;
	66	from->iov_len -= size;
	67	from->iov_base += size;
	68	len -= size;
	69	++from;
	70	++to;
	71	++seg;
	72	}
	73	return seg;
	74	}
	75
	76	/* Caller must have TX VQ lock */
	77	static void tx_poll_stop(struct vhost_net *net)
	78	{
	79	if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
	80	return;
	81	vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
	82	net->tx_poll_state = VHOST_NET_POLL_STOPPED;
	83	}
	84
	85	/* Caller must have TX VQ lock */
	86	static void tx_poll_start(struct vhost_net *net, struct socket *sock)
	87	{
	88	if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
	89	return;
	90	vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
	91	net->tx_poll_state = VHOST_NET_POLL_STARTED;
	92	}
	93
	94	/* Expects to be always run from workqueue - which acts as
	95	* read-side critical section for our kind of RCU. */
	96	static void handle_tx(struct vhost_net *net)
	97	{
	98	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
	99	unsigned head, out, in, s;
	100	struct msghdr msg = {
	101	.msg_name = NULL,
	102	.msg_namelen = 0,
	103	.msg_control = NULL,
	104	.msg_controllen = 0,
	105	.msg_iov = vq->iov,
	106	.msg_flags = MSG_DONTWAIT,
	107	};
	108	size_t len, total_len = 0;
	109	int err, wmem;
	110	size_t hdr_size;
	111	struct socket *sock = rcu_dereference(vq->private_data);
	112	if (!sock)
	113	return;
	114
	115	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	116	if (wmem >= sock->sk->sk_sndbuf)
	117	return;
	118
	119	use_mm(net->dev.mm);
	120	mutex_lock(&vq->mutex);
	121	vhost_disable_notify(vq);
	122
	123	if (wmem < sock->sk->sk_sndbuf * 2)
	124	tx_poll_stop(net);
	125	hdr_size = vq->hdr_size;
	126
	127	for (;;) {
	128	head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
	129	ARRAY_SIZE(vq->iov),
	130	&out, &in,
	131	NULL, NULL);
	132	/* Nothing new? Wait for eventfd to tell us they refilled. */
	133	if (head == vq->num) {
	134	wmem = atomic_read(&sock->sk->sk_wmem_alloc);
	135	if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
	136	tx_poll_start(net, sock);
	137	set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
	138	break;
	139	}
	140	if (unlikely(vhost_enable_notify(vq))) {
	141	vhost_disable_notify(vq);
	142	continue;
	143	}
	144	break;
	145	}
	146	if (in) {
	147	vq_err(vq, "Unexpected descriptor format for TX: "
	148	"out %d, int %d\n", out, in);
	149	break;
	150	}
	151	/* Skip header. TODO: support TSO. */
	152	s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
	153	msg.msg_iovlen = out;
	154	len = iov_length(vq->iov, out);
	155	/* Sanity check */
	156	if (!len) {
	157	vq_err(vq, "Unexpected header len for TX: "
	158	"%zd expected %zd\n",
	159	iov_length(vq->hdr, s), hdr_size);
	160	break;
	161	}
	162	/* TODO: Check specific error and bomb out unless ENOBUFS? */
	163	err = sock->ops->sendmsg(NULL, sock, &msg, len);
	164	if (unlikely(err < 0)) {
	165	vhost_discard_vq_desc(vq);
	166	tx_poll_start(net, sock);
	167	break;
	168	}
	169	if (err != len)
	170	pr_err("Truncated TX packet: "
	171	" len %d != %zd\n", err, len);
	172	vhost_add_used_and_signal(&net->dev, vq, head, 0);
	173	total_len += len;
	174	if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
	175	vhost_poll_queue(&vq->poll);
	176	break;
	177	}
	178	}
	179
	180	mutex_unlock(&vq->mutex);
	181	unuse_mm(net->dev.mm);
	182	}
	183
	184	/* Expects to be always run from workqueue - which acts as
	185	* read-side critical section for our kind of RCU. */
	186	static void handle_rx(struct vhost_net *net)
	187	{
	188	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
	189	unsigned head, out, in, log, s;
	190	struct vhost_log *vq_log;
	191	struct msghdr msg = {
	192	.msg_name = NULL,
	193	.msg_namelen = 0,
	194	.msg_control = NULL, /* FIXME: get and handle RX aux data. */
	195	.msg_controllen = 0,
	196	.msg_iov = vq->iov,
	197	.msg_flags = MSG_DONTWAIT,
	198	};
	199
	200	struct virtio_net_hdr hdr = {
	201	.flags = 0,
	202	.gso_type = VIRTIO_NET_HDR_GSO_NONE
	203	};
	204
	205	size_t len, total_len = 0;
	206	int err;
	207	size_t hdr_size;
	208	struct socket *sock = rcu_dereference(vq->private_data);
	209	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
	210	return;
	211
	212	use_mm(net->dev.mm);
	213	mutex_lock(&vq->mutex);
	214	vhost_disable_notify(vq);
	215	hdr_size = vq->hdr_size;
	216
	217	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
	218	vq->log : NULL;
	219
	220	for (;;) {
	221	head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
	222	ARRAY_SIZE(vq->iov),
	223	&out, &in,
	224	vq_log, &log);
	225	/* OK, now we need to know about added descriptors. */
	226	if (head == vq->num) {
	227	if (unlikely(vhost_enable_notify(vq))) {
	228	/* They have slipped one in as we were
	229	* doing that: check again. */
	230	vhost_disable_notify(vq);
	231	continue;
	232	}
	233	/* Nothing new? Wait for eventfd to tell us
	234	* they refilled. */
	235	break;
	236	}
	237	/* We don't need to be notified again. */
	238	if (out) {
	239	vq_err(vq, "Unexpected descriptor format for RX: "
	240	"out %d, int %d\n",
	241	out, in);
	242	break;
	243	}
	244	/* Skip header. TODO: support TSO/mergeable rx buffers. */
	245	s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
	246	msg.msg_iovlen = in;
	247	len = iov_length(vq->iov, in);
	248	/* Sanity check */
	249	if (!len) {
	250	vq_err(vq, "Unexpected header len for RX: "
	251	"%zd expected %zd\n",
	252	iov_length(vq->hdr, s), hdr_size);
	253	break;
	254	}
	255	err = sock->ops->recvmsg(NULL, sock, &msg,
	256	len, MSG_DONTWAIT \| MSG_TRUNC);
	257	/* TODO: Check specific error and bomb out unless EAGAIN? */
	258	if (err < 0) {
	259	vhost_discard_vq_desc(vq);
	260	break;
	261	}
	262	/* TODO: Should check and handle checksum. */
	263	if (err > len) {
	264	pr_err("Discarded truncated rx packet: "
	265	" len %d > %zd\n", err, len);
	266	vhost_discard_vq_desc(vq);
	267	continue;
	268	}
	269	len = err;
	270	err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
	271	if (err) {
	272	vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
	273	vq->iov->iov_base, err);
	274	break;
	275	}
	276	len += hdr_size;
	277	vhost_add_used_and_signal(&net->dev, vq, head, len);
	278	if (unlikely(vq_log))
	279	vhost_log_write(vq, vq_log, log, len);
	280	total_len += len;
	281	if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
	282	vhost_poll_queue(&vq->poll);
	283	break;
	284	}
	285	}
	286
	287	mutex_unlock(&vq->mutex);
	288	unuse_mm(net->dev.mm);
	289	}
	290
	291	static void handle_tx_kick(struct work_struct *work)
	292	{
	293	struct vhost_virtqueue *vq;
	294	struct vhost_net *net;
	295	vq = container_of(work, struct vhost_virtqueue, poll.work);
	296	net = container_of(vq->dev, struct vhost_net, dev);
	297	handle_tx(net);
	298	}
	299
	300	static void handle_rx_kick(struct work_struct *work)
	301	{
	302	struct vhost_virtqueue *vq;
	303	struct vhost_net *net;
	304	vq = container_of(work, struct vhost_virtqueue, poll.work);
	305	net = container_of(vq->dev, struct vhost_net, dev);
	306	handle_rx(net);
	307	}
	308
	309	static void handle_tx_net(struct work_struct *work)
	310	{
	311	struct vhost_net *net;
	312	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
	313	handle_tx(net);
	314	}
	315
	316	static void handle_rx_net(struct work_struct *work)
	317	{
	318	struct vhost_net *net;
	319	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
	320	handle_rx(net);
	321	}
	322
	323	static int vhost_net_open(struct inode inode, struct file f)
	324	{
	325	struct vhost_net n = kmalloc(sizeof n, GFP_KERNEL);
	326	int r;
	327	if (!n)
	328	return -ENOMEM;
	329	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
	330	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
	331	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
	332	if (r < 0) {
	333	kfree(n);
	334	return r;
	335	}
	336
	337	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
	338	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
	339	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
	340
	341	f->private_data = n;
	342
	343	return 0;
	344	}
	345
	346	static void vhost_net_disable_vq(struct vhost_net *n,
	347	struct vhost_virtqueue *vq)
	348	{
	349	if (!vq->private_data)
	350	return;
	351	if (vq == n->vqs + VHOST_NET_VQ_TX) {
	352	tx_poll_stop(n);
	353	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
	354	} else
	355	vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
	356	}
	357
	358	static void vhost_net_enable_vq(struct vhost_net *n,
	359	struct vhost_virtqueue *vq)
	360	{
	361	struct socket *sock = vq->private_data;
	362	if (!sock)
	363	return;
	364	if (vq == n->vqs + VHOST_NET_VQ_TX) {
	365	n->tx_poll_state = VHOST_NET_POLL_STOPPED;
	366	tx_poll_start(n, sock);
	367	} else
	368	vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
	369	}
	370
	371	static struct socket vhost_net_stop_vq(struct vhost_net n,
	372	struct vhost_virtqueue *vq)
	373	{
	374	struct socket *sock;
	375
	376	mutex_lock(&vq->mutex);
	377	sock = vq->private_data;
	378	vhost_net_disable_vq(n, vq);
	379	rcu_assign_pointer(vq->private_data, NULL);
	380	mutex_unlock(&vq->mutex);
	381	return sock;
	382	}
	383
	384	static void vhost_net_stop(struct vhost_net n, struct socket *tx_sock,
	385	struct socket **rx_sock)
	386	{
	387	*tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
	388	*rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
	389	}
	390
	391	static void vhost_net_flush_vq(struct vhost_net *n, int index)
	392	{
	393	vhost_poll_flush(n->poll + index);
	394	vhost_poll_flush(&n->dev.vqs[index].poll);
	395	}
	396
	397	static void vhost_net_flush(struct vhost_net *n)
	398	{
	399	vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
	400	vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
	401	}
	402
	403	static int vhost_net_release(struct inode inode, struct file f)
	404	{
	405	struct vhost_net *n = f->private_data;
	406	struct socket *tx_sock;
	407	struct socket *rx_sock;
	408
	409	vhost_net_stop(n, &tx_sock, &rx_sock);
	410	vhost_net_flush(n);
	411	vhost_dev_cleanup(&n->dev);
	412	if (tx_sock)
	413	fput(tx_sock->file);
	414	if (rx_sock)
	415	fput(rx_sock->file);
	416	/* We do an extra flush before freeing memory,
	417	* since jobs can re-queue themselves. */
	418	vhost_net_flush(n);
	419	kfree(n);
	420	return 0;
	421	}
	422
	423	static struct socket *get_raw_socket(int fd)
	424	{
	425	struct {
	426	struct sockaddr_ll sa;
	427	char buf[MAX_ADDR_LEN];
	428	} uaddr;
	429	int uaddr_len = sizeof uaddr, r;
	430	struct socket *sock = sockfd_lookup(fd, &r);
	431	if (!sock)
	432	return ERR_PTR(-ENOTSOCK);
	433
	434	/* Parameter checking */
	435	if (sock->sk->sk_type != SOCK_RAW) {
	436	r = -ESOCKTNOSUPPORT;
	437	goto err;
	438	}
	439
	440	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
	441	&uaddr_len, 0);
	442	if (r)
	443	goto err;
	444
	445	if (uaddr.sa.sll_family != AF_PACKET) {
	446	r = -EPFNOSUPPORT;
	447	goto err;
	448	}
	449	return sock;
	450	err:
	451	fput(sock->file);
	452	return ERR_PTR(r);
	453	}
	454
	455	static struct socket *get_tun_socket(int fd)
	456	{
	457	struct file *file = fget(fd);
	458	struct socket *sock;
	459	if (!file)
	460	return ERR_PTR(-EBADF);
	461	sock = tun_get_socket(file);
	462	if (IS_ERR(sock))
	463	fput(file);
	464	return sock;
	465	}
	466
	467	static struct socket *get_socket(int fd)
	468	{
	469	struct socket *sock;
	470	/* special case to disable backend */
	471	if (fd == -1)
	472	return NULL;
	473	sock = get_raw_socket(fd);
	474	if (!IS_ERR(sock))
	475	return sock;
	476	sock = get_tun_socket(fd);
	477	if (!IS_ERR(sock))
	478	return sock;
	479	return ERR_PTR(-ENOTSOCK);
	480	}
	481
	482	static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
	483	{
	484	struct socket sock, oldsock;
	485	struct vhost_virtqueue *vq;
	486	int r;
	487
	488	mutex_lock(&n->dev.mutex);
	489	r = vhost_dev_check_owner(&n->dev);
	490	if (r)
	491	goto err;
	492
	493	if (index >= VHOST_NET_VQ_MAX) {
	494	r = -ENOBUFS;
	495	goto err;
	496	}
	497	vq = n->vqs + index;
	498	mutex_lock(&vq->mutex);
	499
	500	/* Verify that ring has been setup correctly. */
	501	if (!vhost_vq_access_ok(vq)) {
	502	r = -EFAULT;
	503	goto err;
	504	}
	505	sock = get_socket(fd);
	506	if (IS_ERR(sock)) {
	507	r = PTR_ERR(sock);
	508	goto err;
	509	}
	510
	511	/* start polling new socket */
	512	oldsock = vq->private_data;
	513	if (sock == oldsock)
	514	goto done;
	515
	516	vhost_net_disable_vq(n, vq);
	517	rcu_assign_pointer(vq->private_data, sock);
	518	vhost_net_enable_vq(n, vq);
	519	mutex_unlock(&vq->mutex);
	520	done:
	521	if (oldsock) {
	522	vhost_net_flush_vq(n, index);
	523	fput(oldsock->file);
	524	}
	525	err:
	526	mutex_unlock(&n->dev.mutex);
	527	return r;
	528	}
	529
	530	static long vhost_net_reset_owner(struct vhost_net *n)
	531	{
	532	struct socket *tx_sock = NULL;
	533	struct socket *rx_sock = NULL;
	534	long err;
	535	mutex_lock(&n->dev.mutex);
	536	err = vhost_dev_check_owner(&n->dev);
	537	if (err)
	538	goto done;
	539	vhost_net_stop(n, &tx_sock, &rx_sock);
	540	vhost_net_flush(n);
	541	err = vhost_dev_reset_owner(&n->dev);
	542	done:
	543	mutex_unlock(&n->dev.mutex);
	544	if (tx_sock)
	545	fput(tx_sock->file);
	546	if (rx_sock)
	547	fput(rx_sock->file);
	548	return err;
	549	}
	550
	551	static int vhost_net_set_features(struct vhost_net *n, u64 features)
	552	{
	553	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
	554	sizeof(struct virtio_net_hdr) : 0;
	555	int i;
	556	mutex_lock(&n->dev.mutex);
	557	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	558	!vhost_log_access_ok(&n->dev)) {
	559	mutex_unlock(&n->dev.mutex);
	560	return -EFAULT;
	561	}
	562	n->dev.acked_features = features;
	563	smp_wmb();
	564	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
	565	mutex_lock(&n->vqs[i].mutex);
	566	n->vqs[i].hdr_size = hdr_size;
	567	mutex_unlock(&n->vqs[i].mutex);
	568	}
	569	vhost_net_flush(n);
	570	mutex_unlock(&n->dev.mutex);
	571	return 0;
	572	}
	573
	574	static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
	575	unsigned long arg)
	576	{
	577	struct vhost_net *n = f->private_data;
	578	void __user argp = (void __user )arg;
	579	u64 __user *featurep = argp;
	580	struct vhost_vring_file backend;
	581	u64 features;
	582	int r;
	583	switch (ioctl) {
	584	case VHOST_NET_SET_BACKEND:
	585	r = copy_from_user(&backend, argp, sizeof backend);
	586	if (r < 0)
	587	return r;
	588	return vhost_net_set_backend(n, backend.index, backend.fd);
	589	case VHOST_GET_FEATURES:
	590	features = VHOST_FEATURES;
	591	return copy_to_user(featurep, &features, sizeof features);
	592	case VHOST_SET_FEATURES:
	593	r = copy_from_user(&features, featurep, sizeof features);
	594	if (r < 0)
	595	return r;
	596	if (features & ~VHOST_FEATURES)
	597	return -EOPNOTSUPP;
	598	return vhost_net_set_features(n, features);
	599	case VHOST_RESET_OWNER:
	600	return vhost_net_reset_owner(n);
	601	default:
	602	mutex_lock(&n->dev.mutex);
	603	r = vhost_dev_ioctl(&n->dev, ioctl, arg);
	604	vhost_net_flush(n);
	605	mutex_unlock(&n->dev.mutex);
	606	return r;
	607	}
	608	}
	609
	#ifdef CONFIG_COMPAT
	/* Compat ioctl entry point: converts @arg with compat_ptr() and
	 * forwards the request to vhost_net_ioctl(). */
	static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
					   unsigned long arg)
	{
		unsigned long native_arg = (unsigned long)compat_ptr(arg);

		return vhost_net_ioctl(f, ioctl, native_arg);
	}
	#endif
	617
	618	const static struct file_operations vhost_net_fops = {
	619	.owner = THIS_MODULE,
	620	.release = vhost_net_release,
	621	.unlocked_ioctl = vhost_net_ioctl,
	622	#ifdef CONFIG_COMPAT
	623	.compat_ioctl = vhost_net_compat_ioctl,
	624	#endif
	625	.open = vhost_net_open,
	626	};
	627
	628	static struct miscdevice vhost_net_misc = {
	629	VHOST_NET_MINOR,
	630	"vhost-net",
	631	&vhost_net_fops,
	632	};
	633
	634	int vhost_net_init(void)
	635	{
	636	int r = vhost_init();
	637	if (r)
	638	goto err_init;
	639	r = misc_register(&vhost_net_misc);
	640	if (r)
	641	goto err_reg;
	642	return 0;
	643	err_reg:
	644	vhost_cleanup();
	645	err_init:
	646	return r;
	647
	648	}
	649	module_init(vhost_net_init);
	650
	651	void vhost_net_exit(void)
	652	{
	653	misc_deregister(&vhost_net_misc);
	654	vhost_cleanup();
	655	}
	656	module_exit(vhost_net_exit);
	657
	658	MODULE_VERSION("0.0.1");
	659	MODULE_LICENSE("GPL v2");
	660	MODULE_AUTHOR("Michael S. Tsirkin");
	661	MODULE_DESCRIPTION("Host kernel accelerator for virtio net");