aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@redhat.com>2010-01-14 01:17:27 -0500
committerDavid S. Miller <davem@davemloft.net>2010-01-15 04:43:29 -0500
commit3a4d5c94e959359ece6d6b55045c3f046677f55c (patch)
tree113cfe31540e3d77925837f6990c3284d425bfd1 /drivers
parent5da779c34ccff5e1e617892b6c8bd8260fb1f04c (diff)
vhost_net: a kernel-level virtio server
What it is: vhost net is a character device that can be used to reduce the number of system calls involved in virtio networking. Existing virtio net code is used in the guest without modification. There's similarity with vringfd, with some differences and reduced scope - uses eventfd for signalling - structures can be moved around in memory at any time (good for migration, bug work-arounds in userspace) - write logging is supported (good for migration) - support memory table and not just an offset (needed for kvm) common virtio related code has been put in a separate file vhost.c and can be made into a separate module if/when more backends appear. I used Rusty's lguest.c as the source for developing this part : this supplied me with witty comments I wouldn't be able to write myself. What it is not: vhost net is not a bus, and not a generic new system call. No assumptions are made on how guest performs hypercalls. Userspace hypervisors are supported as well as kvm. How it works: Basically, we connect virtio frontend (configured by userspace) to a backend. The backend could be a network device, or a tap device. Backend is also configured by userspace, including vlan/mac etc. Status: This works for me, and I haven't see any crashes. Compared to userspace, people reported improved latency (as I save up to 4 system calls per packet), as well as better bandwidth and CPU utilization. Features that I plan to look at in the future: - mergeable buffers - zero copy - scalability tuning: figure out the best threading model to use Note on RCU usage (this is also documented in vhost.h, near private_pointer which is the value protected by this variant of RCU): what is happening is that the rcu_dereference() is being used in a workqueue item. The role of rcu_read_lock() is taken on by the start of execution of the workqueue item, of rcu_read_unlock() by the end of execution of the workqueue item, and of synchronize_rcu() by flush_workqueue()/flush_work(). In the future we might need to apply some gcc attribute or sparse annotation to the function passed to INIT_WORK(). Paul's ack below is for this RCU usage. (Includes fixes by Alan Cox <alan@linux.intel.com>, David L Stevens <dlstevens@us.ibm.com>, Chris Wright <chrisw@redhat.com>) Acked-by: Rusty Russell <rusty@rustcorp.com.au> Acked-by: Arnd Bergmann <arnd@arndb.de> Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/Makefile1
-rw-r--r--drivers/vhost/Kconfig11
-rw-r--r--drivers/vhost/Makefile2
-rw-r--r--drivers/vhost/net.c661
-rw-r--r--drivers/vhost/vhost.c1098
-rw-r--r--drivers/vhost/vhost.h161
6 files changed, 1934 insertions, 0 deletions
diff --git a/drivers/Makefile b/drivers/Makefile
index 6ee53c7a57a1..81e36596b1e9 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_HID) += hid/
106obj-$(CONFIG_PPC_PS3) += ps3/ 106obj-$(CONFIG_PPC_PS3) += ps3/
107obj-$(CONFIG_OF) += of/ 107obj-$(CONFIG_OF) += of/
108obj-$(CONFIG_SSB) += ssb/ 108obj-$(CONFIG_SSB) += ssb/
109obj-$(CONFIG_VHOST_NET) += vhost/
109obj-$(CONFIG_VIRTIO) += virtio/ 110obj-$(CONFIG_VIRTIO) += virtio/
110obj-$(CONFIG_VLYNQ) += vlynq/ 111obj-$(CONFIG_VLYNQ) += vlynq/
111obj-$(CONFIG_STAGING) += staging/ 112obj-$(CONFIG_STAGING) += staging/
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
new file mode 100644
index 000000000000..9f409f447aea
--- /dev/null
+++ b/drivers/vhost/Kconfig
@@ -0,0 +1,11 @@
1config VHOST_NET
2 tristate "Host kernel accelerator for virtio net (EXPERIMENTAL)"
3 depends on NET && EVENTFD && EXPERIMENTAL
4 ---help---
5 This kernel module can be loaded in host kernel to accelerate
6 guest networking with virtio_net. Not to be confused with virtio_net
7 module itself which needs to be loaded in guest kernel.
8
9 To compile this driver as a module, choose M here: the module will
10 be called vhost_net.
11
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
new file mode 100644
index 000000000000..72dd02050bb9
--- /dev/null
+++ b/drivers/vhost/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_VHOST_NET) += vhost_net.o
2vhost_net-y := vhost.o net.o
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
new file mode 100644
index 000000000000..4c8928319e1d
--- /dev/null
+++ b/drivers/vhost/net.c
@@ -0,0 +1,661 @@
1/* Copyright (C) 2009 Red Hat, Inc.
2 * Author: Michael S. Tsirkin <mst@redhat.com>
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2.
5 *
6 * virtio-net server in host kernel.
7 */
8
9#include <linux/compat.h>
10#include <linux/eventfd.h>
11#include <linux/vhost.h>
12#include <linux/virtio_net.h>
13#include <linux/mmu_context.h>
14#include <linux/miscdevice.h>
15#include <linux/module.h>
16#include <linux/mutex.h>
17#include <linux/workqueue.h>
18#include <linux/rcupdate.h>
19#include <linux/file.h>
20
21#include <linux/net.h>
22#include <linux/if_packet.h>
23#include <linux/if_arp.h>
24#include <linux/if_tun.h>
25
26#include <net/sock.h>
27
28#include "vhost.h"
29
30/* Max number of bytes transferred before requeueing the job.
31 * Using this limit prevents one virtqueue from starving others. */
32#define VHOST_NET_WEIGHT 0x80000
33
34enum {
35 VHOST_NET_VQ_RX = 0,
36 VHOST_NET_VQ_TX = 1,
37 VHOST_NET_VQ_MAX = 2,
38};
39
40enum vhost_net_poll_state {
41 VHOST_NET_POLL_DISABLED = 0,
42 VHOST_NET_POLL_STARTED = 1,
43 VHOST_NET_POLL_STOPPED = 2,
44};
45
46struct vhost_net {
47 struct vhost_dev dev;
48 struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
49 struct vhost_poll poll[VHOST_NET_VQ_MAX];
50 /* Tells us whether we are polling a socket for TX.
51 * We only do this when socket buffer fills up.
52 * Protected by tx vq lock. */
53 enum vhost_net_poll_state tx_poll_state;
54};
55
56/* Pop first len bytes from iovec. Return number of segments used. */
57static int move_iovec_hdr(struct iovec *from, struct iovec *to,
58 size_t len, int iov_count)
59{
60 int seg = 0;
61 size_t size;
62 while (len && seg < iov_count) {
63 size = min(from->iov_len, len);
64 to->iov_base = from->iov_base;
65 to->iov_len = size;
66 from->iov_len -= size;
67 from->iov_base += size;
68 len -= size;
69 ++from;
70 ++to;
71 ++seg;
72 }
73 return seg;
74}
75
76/* Caller must have TX VQ lock */
77static void tx_poll_stop(struct vhost_net *net)
78{
79 if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
80 return;
81 vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
82 net->tx_poll_state = VHOST_NET_POLL_STOPPED;
83}
84
85/* Caller must have TX VQ lock */
86static void tx_poll_start(struct vhost_net *net, struct socket *sock)
87{
88 if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
89 return;
90 vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
91 net->tx_poll_state = VHOST_NET_POLL_STARTED;
92}
93
94/* Expects to be always run from workqueue - which acts as
95 * read-size critical section for our kind of RCU. */
96static void handle_tx(struct vhost_net *net)
97{
98 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
99 unsigned head, out, in, s;
100 struct msghdr msg = {
101 .msg_name = NULL,
102 .msg_namelen = 0,
103 .msg_control = NULL,
104 .msg_controllen = 0,
105 .msg_iov = vq->iov,
106 .msg_flags = MSG_DONTWAIT,
107 };
108 size_t len, total_len = 0;
109 int err, wmem;
110 size_t hdr_size;
111 struct socket *sock = rcu_dereference(vq->private_data);
112 if (!sock)
113 return;
114
115 wmem = atomic_read(&sock->sk->sk_wmem_alloc);
116 if (wmem >= sock->sk->sk_sndbuf)
117 return;
118
119 use_mm(net->dev.mm);
120 mutex_lock(&vq->mutex);
121 vhost_disable_notify(vq);
122
123 if (wmem < sock->sk->sk_sndbuf * 2)
124 tx_poll_stop(net);
125 hdr_size = vq->hdr_size;
126
127 for (;;) {
128 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
129 ARRAY_SIZE(vq->iov),
130 &out, &in,
131 NULL, NULL);
132 /* Nothing new? Wait for eventfd to tell us they refilled. */
133 if (head == vq->num) {
134 wmem = atomic_read(&sock->sk->sk_wmem_alloc);
135 if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
136 tx_poll_start(net, sock);
137 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
138 break;
139 }
140 if (unlikely(vhost_enable_notify(vq))) {
141 vhost_disable_notify(vq);
142 continue;
143 }
144 break;
145 }
146 if (in) {
147 vq_err(vq, "Unexpected descriptor format for TX: "
148 "out %d, int %d\n", out, in);
149 break;
150 }
151 /* Skip header. TODO: support TSO. */
152 s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
153 msg.msg_iovlen = out;
154 len = iov_length(vq->iov, out);
155 /* Sanity check */
156 if (!len) {
157 vq_err(vq, "Unexpected header len for TX: "
158 "%zd expected %zd\n",
159 iov_length(vq->hdr, s), hdr_size);
160 break;
161 }
162 /* TODO: Check specific error and bomb out unless ENOBUFS? */
163 err = sock->ops->sendmsg(NULL, sock, &msg, len);
164 if (unlikely(err < 0)) {
165 vhost_discard_vq_desc(vq);
166 tx_poll_start(net, sock);
167 break;
168 }
169 if (err != len)
170 pr_err("Truncated TX packet: "
171 " len %d != %zd\n", err, len);
172 vhost_add_used_and_signal(&net->dev, vq, head, 0);
173 total_len += len;
174 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
175 vhost_poll_queue(&vq->poll);
176 break;
177 }
178 }
179
180 mutex_unlock(&vq->mutex);
181 unuse_mm(net->dev.mm);
182}
183
184/* Expects to be always run from workqueue - which acts as
185 * read-size critical section for our kind of RCU. */
186static void handle_rx(struct vhost_net *net)
187{
188 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
189 unsigned head, out, in, log, s;
190 struct vhost_log *vq_log;
191 struct msghdr msg = {
192 .msg_name = NULL,
193 .msg_namelen = 0,
194 .msg_control = NULL, /* FIXME: get and handle RX aux data. */
195 .msg_controllen = 0,
196 .msg_iov = vq->iov,
197 .msg_flags = MSG_DONTWAIT,
198 };
199
200 struct virtio_net_hdr hdr = {
201 .flags = 0,
202 .gso_type = VIRTIO_NET_HDR_GSO_NONE
203 };
204
205 size_t len, total_len = 0;
206 int err;
207 size_t hdr_size;
208 struct socket *sock = rcu_dereference(vq->private_data);
209 if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
210 return;
211
212 use_mm(net->dev.mm);
213 mutex_lock(&vq->mutex);
214 vhost_disable_notify(vq);
215 hdr_size = vq->hdr_size;
216
217 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
218 vq->log : NULL;
219
220 for (;;) {
221 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
222 ARRAY_SIZE(vq->iov),
223 &out, &in,
224 vq_log, &log);
225 /* OK, now we need to know about added descriptors. */
226 if (head == vq->num) {
227 if (unlikely(vhost_enable_notify(vq))) {
228 /* They have slipped one in as we were
229 * doing that: check again. */
230 vhost_disable_notify(vq);
231 continue;
232 }
233 /* Nothing new? Wait for eventfd to tell us
234 * they refilled. */
235 break;
236 }
237 /* We don't need to be notified again. */
238 if (out) {
239 vq_err(vq, "Unexpected descriptor format for RX: "
240 "out %d, int %d\n",
241 out, in);
242 break;
243 }
244 /* Skip header. TODO: support TSO/mergeable rx buffers. */
245 s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
246 msg.msg_iovlen = in;
247 len = iov_length(vq->iov, in);
248 /* Sanity check */
249 if (!len) {
250 vq_err(vq, "Unexpected header len for RX: "
251 "%zd expected %zd\n",
252 iov_length(vq->hdr, s), hdr_size);
253 break;
254 }
255 err = sock->ops->recvmsg(NULL, sock, &msg,
256 len, MSG_DONTWAIT | MSG_TRUNC);
257 /* TODO: Check specific error and bomb out unless EAGAIN? */
258 if (err < 0) {
259 vhost_discard_vq_desc(vq);
260 break;
261 }
262 /* TODO: Should check and handle checksum. */
263 if (err > len) {
264 pr_err("Discarded truncated rx packet: "
265 " len %d > %zd\n", err, len);
266 vhost_discard_vq_desc(vq);
267 continue;
268 }
269 len = err;
270 err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
271 if (err) {
272 vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
273 vq->iov->iov_base, err);
274 break;
275 }
276 len += hdr_size;
277 vhost_add_used_and_signal(&net->dev, vq, head, len);
278 if (unlikely(vq_log))
279 vhost_log_write(vq, vq_log, log, len);
280 total_len += len;
281 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
282 vhost_poll_queue(&vq->poll);
283 break;
284 }
285 }
286
287 mutex_unlock(&vq->mutex);
288 unuse_mm(net->dev.mm);
289}
290
291static void handle_tx_kick(struct work_struct *work)
292{
293 struct vhost_virtqueue *vq;
294 struct vhost_net *net;
295 vq = container_of(work, struct vhost_virtqueue, poll.work);
296 net = container_of(vq->dev, struct vhost_net, dev);
297 handle_tx(net);
298}
299
300static void handle_rx_kick(struct work_struct *work)
301{
302 struct vhost_virtqueue *vq;
303 struct vhost_net *net;
304 vq = container_of(work, struct vhost_virtqueue, poll.work);
305 net = container_of(vq->dev, struct vhost_net, dev);
306 handle_rx(net);
307}
308
309static void handle_tx_net(struct work_struct *work)
310{
311 struct vhost_net *net;
312 net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
313 handle_tx(net);
314}
315
316static void handle_rx_net(struct work_struct *work)
317{
318 struct vhost_net *net;
319 net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
320 handle_rx(net);
321}
322
323static int vhost_net_open(struct inode *inode, struct file *f)
324{
325 struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
326 int r;
327 if (!n)
328 return -ENOMEM;
329 n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
330 n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
331 r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
332 if (r < 0) {
333 kfree(n);
334 return r;
335 }
336
337 vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
338 vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
339 n->tx_poll_state = VHOST_NET_POLL_DISABLED;
340
341 f->private_data = n;
342
343 return 0;
344}
345
346static void vhost_net_disable_vq(struct vhost_net *n,
347 struct vhost_virtqueue *vq)
348{
349 if (!vq->private_data)
350 return;
351 if (vq == n->vqs + VHOST_NET_VQ_TX) {
352 tx_poll_stop(n);
353 n->tx_poll_state = VHOST_NET_POLL_DISABLED;
354 } else
355 vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
356}
357
358static void vhost_net_enable_vq(struct vhost_net *n,
359 struct vhost_virtqueue *vq)
360{
361 struct socket *sock = vq->private_data;
362 if (!sock)
363 return;
364 if (vq == n->vqs + VHOST_NET_VQ_TX) {
365 n->tx_poll_state = VHOST_NET_POLL_STOPPED;
366 tx_poll_start(n, sock);
367 } else
368 vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
369}
370
371static struct socket *vhost_net_stop_vq(struct vhost_net *n,
372 struct vhost_virtqueue *vq)
373{
374 struct socket *sock;
375
376 mutex_lock(&vq->mutex);
377 sock = vq->private_data;
378 vhost_net_disable_vq(n, vq);
379 rcu_assign_pointer(vq->private_data, NULL);
380 mutex_unlock(&vq->mutex);
381 return sock;
382}
383
384static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
385 struct socket **rx_sock)
386{
387 *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
388 *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
389}
390
391static void vhost_net_flush_vq(struct vhost_net *n, int index)
392{
393 vhost_poll_flush(n->poll + index);
394 vhost_poll_flush(&n->dev.vqs[index].poll);
395}
396
397static void vhost_net_flush(struct vhost_net *n)
398{
399 vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
400 vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
401}
402
403static int vhost_net_release(struct inode *inode, struct file *f)
404{
405 struct vhost_net *n = f->private_data;
406 struct socket *tx_sock;
407 struct socket *rx_sock;
408
409 vhost_net_stop(n, &tx_sock, &rx_sock);
410 vhost_net_flush(n);
411 vhost_dev_cleanup(&n->dev);
412 if (tx_sock)
413 fput(tx_sock->file);
414 if (rx_sock)
415 fput(rx_sock->file);
416 /* We do an extra flush before freeing memory,
417 * since jobs can re-queue themselves. */
418 vhost_net_flush(n);
419 kfree(n);
420 return 0;
421}
422
423static struct socket *get_raw_socket(int fd)
424{
425 struct {
426 struct sockaddr_ll sa;
427 char buf[MAX_ADDR_LEN];
428 } uaddr;
429 int uaddr_len = sizeof uaddr, r;
430 struct socket *sock = sockfd_lookup(fd, &r);
431 if (!sock)
432 return ERR_PTR(-ENOTSOCK);
433
434 /* Parameter checking */
435 if (sock->sk->sk_type != SOCK_RAW) {
436 r = -ESOCKTNOSUPPORT;
437 goto err;
438 }
439
440 r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
441 &uaddr_len, 0);
442 if (r)
443 goto err;
444
445 if (uaddr.sa.sll_family != AF_PACKET) {
446 r = -EPFNOSUPPORT;
447 goto err;
448 }
449 return sock;
450err:
451 fput(sock->file);
452 return ERR_PTR(r);
453}
454
455static struct socket *get_tun_socket(int fd)
456{
457 struct file *file = fget(fd);
458 struct socket *sock;
459 if (!file)
460 return ERR_PTR(-EBADF);
461 sock = tun_get_socket(file);
462 if (IS_ERR(sock))
463 fput(file);
464 return sock;
465}
466
467static struct socket *get_socket(int fd)
468{
469 struct socket *sock;
470 /* special case to disable backend */
471 if (fd == -1)
472 return NULL;
473 sock = get_raw_socket(fd);
474 if (!IS_ERR(sock))
475 return sock;
476 sock = get_tun_socket(fd);
477 if (!IS_ERR(sock))
478 return sock;
479 return ERR_PTR(-ENOTSOCK);
480}
481
482static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
483{
484 struct socket *sock, *oldsock;
485 struct vhost_virtqueue *vq;
486 int r;
487
488 mutex_lock(&n->dev.mutex);
489 r = vhost_dev_check_owner(&n->dev);
490 if (r)
491 goto err;
492
493 if (index >= VHOST_NET_VQ_MAX) {
494 r = -ENOBUFS;
495 goto err;
496 }
497 vq = n->vqs + index;
498 mutex_lock(&vq->mutex);
499
500 /* Verify that ring has been setup correctly. */
501 if (!vhost_vq_access_ok(vq)) {
502 r = -EFAULT;
503 goto err;
504 }
505 sock = get_socket(fd);
506 if (IS_ERR(sock)) {
507 r = PTR_ERR(sock);
508 goto err;
509 }
510
511 /* start polling new socket */
512 oldsock = vq->private_data;
513 if (sock == oldsock)
514 goto done;
515
516 vhost_net_disable_vq(n, vq);
517 rcu_assign_pointer(vq->private_data, sock);
518 vhost_net_enable_vq(n, vq);
519 mutex_unlock(&vq->mutex);
520done:
521 if (oldsock) {
522 vhost_net_flush_vq(n, index);
523 fput(oldsock->file);
524 }
525err:
526 mutex_unlock(&n->dev.mutex);
527 return r;
528}
529
530static long vhost_net_reset_owner(struct vhost_net *n)
531{
532 struct socket *tx_sock = NULL;
533 struct socket *rx_sock = NULL;
534 long err;
535 mutex_lock(&n->dev.mutex);
536 err = vhost_dev_check_owner(&n->dev);
537 if (err)
538 goto done;
539 vhost_net_stop(n, &tx_sock, &rx_sock);
540 vhost_net_flush(n);
541 err = vhost_dev_reset_owner(&n->dev);
542done:
543 mutex_unlock(&n->dev.mutex);
544 if (tx_sock)
545 fput(tx_sock->file);
546 if (rx_sock)
547 fput(rx_sock->file);
548 return err;
549}
550
551static int vhost_net_set_features(struct vhost_net *n, u64 features)
552{
553 size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
554 sizeof(struct virtio_net_hdr) : 0;
555 int i;
556 mutex_lock(&n->dev.mutex);
557 if ((features & (1 << VHOST_F_LOG_ALL)) &&
558 !vhost_log_access_ok(&n->dev)) {
559 mutex_unlock(&n->dev.mutex);
560 return -EFAULT;
561 }
562 n->dev.acked_features = features;
563 smp_wmb();
564 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
565 mutex_lock(&n->vqs[i].mutex);
566 n->vqs[i].hdr_size = hdr_size;
567 mutex_unlock(&n->vqs[i].mutex);
568 }
569 vhost_net_flush(n);
570 mutex_unlock(&n->dev.mutex);
571 return 0;
572}
573
574static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
575 unsigned long arg)
576{
577 struct vhost_net *n = f->private_data;
578 void __user *argp = (void __user *)arg;
579 u64 __user *featurep = argp;
580 struct vhost_vring_file backend;
581 u64 features;
582 int r;
583 switch (ioctl) {
584 case VHOST_NET_SET_BACKEND:
585 r = copy_from_user(&backend, argp, sizeof backend);
586 if (r < 0)
587 return r;
588 return vhost_net_set_backend(n, backend.index, backend.fd);
589 case VHOST_GET_FEATURES:
590 features = VHOST_FEATURES;
591 return copy_to_user(featurep, &features, sizeof features);
592 case VHOST_SET_FEATURES:
593 r = copy_from_user(&features, featurep, sizeof features);
594 if (r < 0)
595 return r;
596 if (features & ~VHOST_FEATURES)
597 return -EOPNOTSUPP;
598 return vhost_net_set_features(n, features);
599 case VHOST_RESET_OWNER:
600 return vhost_net_reset_owner(n);
601 default:
602 mutex_lock(&n->dev.mutex);
603 r = vhost_dev_ioctl(&n->dev, ioctl, arg);
604 vhost_net_flush(n);
605 mutex_unlock(&n->dev.mutex);
606 return r;
607 }
608}
609
610#ifdef CONFIG_COMPAT
611static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
612 unsigned long arg)
613{
614 return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
615}
616#endif
617
618const static struct file_operations vhost_net_fops = {
619 .owner = THIS_MODULE,
620 .release = vhost_net_release,
621 .unlocked_ioctl = vhost_net_ioctl,
622#ifdef CONFIG_COMPAT
623 .compat_ioctl = vhost_net_compat_ioctl,
624#endif
625 .open = vhost_net_open,
626};
627
628static struct miscdevice vhost_net_misc = {
629 VHOST_NET_MINOR,
630 "vhost-net",
631 &vhost_net_fops,
632};
633
634int vhost_net_init(void)
635{
636 int r = vhost_init();
637 if (r)
638 goto err_init;
639 r = misc_register(&vhost_net_misc);
640 if (r)
641 goto err_reg;
642 return 0;
643err_reg:
644 vhost_cleanup();
645err_init:
646 return r;
647
648}
649module_init(vhost_net_init);
650
651void vhost_net_exit(void)
652{
653 misc_deregister(&vhost_net_misc);
654 vhost_cleanup();
655}
656module_exit(vhost_net_exit);
657
658MODULE_VERSION("0.0.1");
659MODULE_LICENSE("GPL v2");
660MODULE_AUTHOR("Michael S. Tsirkin");
661MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
new file mode 100644
index 000000000000..c8c25dbc5857
--- /dev/null
+++ b/drivers/vhost/vhost.c
@@ -0,0 +1,1098 @@
1/* Copyright (C) 2009 Red Hat, Inc.
2 * Copyright (C) 2006 Rusty Russell IBM Corporation
3 *
4 * Author: Michael S. Tsirkin <mst@redhat.com>
5 *
6 * Inspiration, some code, and most witty comments come from
7 * Documentation/lguest/lguest.c, by Rusty Russell
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2.
10 *
11 * Generic code for virtio server in host kernel.
12 */
13
14#include <linux/eventfd.h>
15#include <linux/vhost.h>
16#include <linux/virtio_net.h>
17#include <linux/mm.h>
18#include <linux/miscdevice.h>
19#include <linux/mutex.h>
20#include <linux/workqueue.h>
21#include <linux/rcupdate.h>
22#include <linux/poll.h>
23#include <linux/file.h>
24#include <linux/highmem.h>
25
26#include <linux/net.h>
27#include <linux/if_packet.h>
28#include <linux/if_arp.h>
29
30#include <net/sock.h>
31
32#include "vhost.h"
33
34enum {
35 VHOST_MEMORY_MAX_NREGIONS = 64,
36 VHOST_MEMORY_F_LOG = 0x1,
37};
38
39static struct workqueue_struct *vhost_workqueue;
40
41static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
42 poll_table *pt)
43{
44 struct vhost_poll *poll;
45 poll = container_of(pt, struct vhost_poll, table);
46
47 poll->wqh = wqh;
48 add_wait_queue(wqh, &poll->wait);
49}
50
51static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
52 void *key)
53{
54 struct vhost_poll *poll;
55 poll = container_of(wait, struct vhost_poll, wait);
56 if (!((unsigned long)key & poll->mask))
57 return 0;
58
59 queue_work(vhost_workqueue, &poll->work);
60 return 0;
61}
62
63/* Init poll structure */
64void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
65 unsigned long mask)
66{
67 INIT_WORK(&poll->work, func);
68 init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
69 init_poll_funcptr(&poll->table, vhost_poll_func);
70 poll->mask = mask;
71}
72
73/* Start polling a file. We add ourselves to file's wait queue. The caller must
74 * keep a reference to a file until after vhost_poll_stop is called. */
75void vhost_poll_start(struct vhost_poll *poll, struct file *file)
76{
77 unsigned long mask;
78 mask = file->f_op->poll(file, &poll->table);
79 if (mask)
80 vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
81}
82
83/* Stop polling a file. After this function returns, it becomes safe to drop the
84 * file reference. You must also flush afterwards. */
85void vhost_poll_stop(struct vhost_poll *poll)
86{
87 remove_wait_queue(poll->wqh, &poll->wait);
88}
89
90/* Flush any work that has been scheduled. When calling this, don't hold any
91 * locks that are also used by the callback. */
92void vhost_poll_flush(struct vhost_poll *poll)
93{
94 flush_work(&poll->work);
95}
96
97void vhost_poll_queue(struct vhost_poll *poll)
98{
99 queue_work(vhost_workqueue, &poll->work);
100}
101
102static void vhost_vq_reset(struct vhost_dev *dev,
103 struct vhost_virtqueue *vq)
104{
105 vq->num = 1;
106 vq->desc = NULL;
107 vq->avail = NULL;
108 vq->used = NULL;
109 vq->last_avail_idx = 0;
110 vq->avail_idx = 0;
111 vq->last_used_idx = 0;
112 vq->used_flags = 0;
113 vq->used_flags = 0;
114 vq->log_used = false;
115 vq->log_addr = -1ull;
116 vq->hdr_size = 0;
117 vq->private_data = NULL;
118 vq->log_base = NULL;
119 vq->error_ctx = NULL;
120 vq->error = NULL;
121 vq->kick = NULL;
122 vq->call_ctx = NULL;
123 vq->call = NULL;
124}
125
126long vhost_dev_init(struct vhost_dev *dev,
127 struct vhost_virtqueue *vqs, int nvqs)
128{
129 int i;
130 dev->vqs = vqs;
131 dev->nvqs = nvqs;
132 mutex_init(&dev->mutex);
133 dev->log_ctx = NULL;
134 dev->log_file = NULL;
135 dev->memory = NULL;
136 dev->mm = NULL;
137
138 for (i = 0; i < dev->nvqs; ++i) {
139 dev->vqs[i].dev = dev;
140 mutex_init(&dev->vqs[i].mutex);
141 vhost_vq_reset(dev, dev->vqs + i);
142 if (dev->vqs[i].handle_kick)
143 vhost_poll_init(&dev->vqs[i].poll,
144 dev->vqs[i].handle_kick,
145 POLLIN);
146 }
147 return 0;
148}
149
150/* Caller should have device mutex */
151long vhost_dev_check_owner(struct vhost_dev *dev)
152{
153 /* Are you the owner? If not, I don't think you mean to do that */
154 return dev->mm == current->mm ? 0 : -EPERM;
155}
156
157/* Caller should have device mutex */
158static long vhost_dev_set_owner(struct vhost_dev *dev)
159{
160 /* Is there an owner already? */
161 if (dev->mm)
162 return -EBUSY;
163 /* No owner, become one */
164 dev->mm = get_task_mm(current);
165 return 0;
166}
167
168/* Caller should have device mutex */
169long vhost_dev_reset_owner(struct vhost_dev *dev)
170{
171 struct vhost_memory *memory;
172
173 /* Restore memory to default empty mapping. */
174 memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
175 if (!memory)
176 return -ENOMEM;
177
178 vhost_dev_cleanup(dev);
179
180 memory->nregions = 0;
181 dev->memory = memory;
182 return 0;
183}
184
185/* Caller should have device mutex */
186void vhost_dev_cleanup(struct vhost_dev *dev)
187{
188 int i;
189 for (i = 0; i < dev->nvqs; ++i) {
190 if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
191 vhost_poll_stop(&dev->vqs[i].poll);
192 vhost_poll_flush(&dev->vqs[i].poll);
193 }
194 if (dev->vqs[i].error_ctx)
195 eventfd_ctx_put(dev->vqs[i].error_ctx);
196 if (dev->vqs[i].error)
197 fput(dev->vqs[i].error);
198 if (dev->vqs[i].kick)
199 fput(dev->vqs[i].kick);
200 if (dev->vqs[i].call_ctx)
201 eventfd_ctx_put(dev->vqs[i].call_ctx);
202 if (dev->vqs[i].call)
203 fput(dev->vqs[i].call);
204 vhost_vq_reset(dev, dev->vqs + i);
205 }
206 if (dev->log_ctx)
207 eventfd_ctx_put(dev->log_ctx);
208 dev->log_ctx = NULL;
209 if (dev->log_file)
210 fput(dev->log_file);
211 dev->log_file = NULL;
212 /* No one will access memory at this point */
213 kfree(dev->memory);
214 dev->memory = NULL;
215 if (dev->mm)
216 mmput(dev->mm);
217 dev->mm = NULL;
218}
219
220static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
221{
222 u64 a = addr / VHOST_PAGE_SIZE / 8;
223 /* Make sure 64 bit math will not overflow. */
224 if (a > ULONG_MAX - (unsigned long)log_base ||
225 a + (unsigned long)log_base > ULONG_MAX)
226 return -EFAULT;
227
228 return access_ok(VERIFY_WRITE, log_base + a,
229 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
230}
231
232/* Caller should have vq mutex and device mutex. */
233static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
234 int log_all)
235{
236 int i;
237 for (i = 0; i < mem->nregions; ++i) {
238 struct vhost_memory_region *m = mem->regions + i;
239 unsigned long a = m->userspace_addr;
240 if (m->memory_size > ULONG_MAX)
241 return 0;
242 else if (!access_ok(VERIFY_WRITE, (void __user *)a,
243 m->memory_size))
244 return 0;
245 else if (log_all && !log_access_ok(log_base,
246 m->guest_phys_addr,
247 m->memory_size))
248 return 0;
249 }
250 return 1;
251}
252
253/* Can we switch to this memory table? */
254/* Caller should have device mutex but not vq mutex */
255static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
256 int log_all)
257{
258 int i;
259 for (i = 0; i < d->nvqs; ++i) {
260 int ok;
261 mutex_lock(&d->vqs[i].mutex);
262 /* If ring is inactive, will check when it's enabled. */
263 if (d->vqs[i].private_data)
264 ok = vq_memory_access_ok(d->vqs[i].log_base, mem,
265 log_all);
266 else
267 ok = 1;
268 mutex_unlock(&d->vqs[i].mutex);
269 if (!ok)
270 return 0;
271 }
272 return 1;
273}
274
275static int vq_access_ok(unsigned int num,
276 struct vring_desc __user *desc,
277 struct vring_avail __user *avail,
278 struct vring_used __user *used)
279{
280 return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
281 access_ok(VERIFY_READ, avail,
282 sizeof *avail + num * sizeof *avail->ring) &&
283 access_ok(VERIFY_WRITE, used,
284 sizeof *used + num * sizeof *used->ring);
285}
286
287/* Can we log writes? */
288/* Caller should have device mutex but not vq mutex */
289int vhost_log_access_ok(struct vhost_dev *dev)
290{
291 return memory_access_ok(dev, dev->memory, 1);
292}
293
294/* Verify access for write logging. */
295/* Caller should have vq mutex and device mutex */
296static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
297{
298 return vq_memory_access_ok(log_base, vq->dev->memory,
299 vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
300 (!vq->log_used || log_access_ok(log_base, vq->log_addr,
301 sizeof *vq->used +
302 vq->num * sizeof *vq->used->ring));
303}
304
305/* Can we start vq? */
306/* Caller should have vq mutex and device mutex */
307int vhost_vq_access_ok(struct vhost_virtqueue *vq)
308{
309 return vq_access_ok(vq->num, vq->desc, vq->avail, vq->used) &&
310 vq_log_access_ok(vq, vq->log_base);
311}
312
313static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
314{
315 struct vhost_memory mem, *newmem, *oldmem;
316 unsigned long size = offsetof(struct vhost_memory, regions);
317 long r;
318 r = copy_from_user(&mem, m, size);
319 if (r)
320 return r;
321 if (mem.padding)
322 return -EOPNOTSUPP;
323 if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS)
324 return -E2BIG;
325 newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL);
326 if (!newmem)
327 return -ENOMEM;
328
329 memcpy(newmem, &mem, size);
330 r = copy_from_user(newmem->regions, m->regions,
331 mem.nregions * sizeof *m->regions);
332 if (r) {
333 kfree(newmem);
334 return r;
335 }
336
337 if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL)))
338 return -EFAULT;
339 oldmem = d->memory;
340 rcu_assign_pointer(d->memory, newmem);
341 synchronize_rcu();
342 kfree(oldmem);
343 return 0;
344}
345
346static int init_used(struct vhost_virtqueue *vq,
347 struct vring_used __user *used)
348{
349 int r = put_user(vq->used_flags, &used->flags);
350 if (r)
351 return r;
352 return get_user(vq->last_used_idx, &used->idx);
353}
354
355static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
356{
357 struct file *eventfp, *filep = NULL,
358 *pollstart = NULL, *pollstop = NULL;
359 struct eventfd_ctx *ctx = NULL;
360 u32 __user *idxp = argp;
361 struct vhost_virtqueue *vq;
362 struct vhost_vring_state s;
363 struct vhost_vring_file f;
364 struct vhost_vring_addr a;
365 u32 idx;
366 long r;
367
368 r = get_user(idx, idxp);
369 if (r < 0)
370 return r;
371 if (idx > d->nvqs)
372 return -ENOBUFS;
373
374 vq = d->vqs + idx;
375
376 mutex_lock(&vq->mutex);
377
378 switch (ioctl) {
379 case VHOST_SET_VRING_NUM:
380 /* Resizing ring with an active backend?
381 * You don't want to do that. */
382 if (vq->private_data) {
383 r = -EBUSY;
384 break;
385 }
386 r = copy_from_user(&s, argp, sizeof s);
387 if (r < 0)
388 break;
389 if (!s.num || s.num > 0xffff || (s.num & (s.num - 1))) {
390 r = -EINVAL;
391 break;
392 }
393 vq->num = s.num;
394 break;
395 case VHOST_SET_VRING_BASE:
396 /* Moving base with an active backend?
397 * You don't want to do that. */
398 if (vq->private_data) {
399 r = -EBUSY;
400 break;
401 }
402 r = copy_from_user(&s, argp, sizeof s);
403 if (r < 0)
404 break;
405 if (s.num > 0xffff) {
406 r = -EINVAL;
407 break;
408 }
409 vq->last_avail_idx = s.num;
410 /* Forget the cached index value. */
411 vq->avail_idx = vq->last_avail_idx;
412 break;
413 case VHOST_GET_VRING_BASE:
414 s.index = idx;
415 s.num = vq->last_avail_idx;
416 r = copy_to_user(argp, &s, sizeof s);
417 break;
418 case VHOST_SET_VRING_ADDR:
419 r = copy_from_user(&a, argp, sizeof a);
420 if (r < 0)
421 break;
422 if (a.flags & ~(0x1 << VHOST_VRING_F_LOG)) {
423 r = -EOPNOTSUPP;
424 break;
425 }
426 /* For 32bit, verify that the top 32bits of the user
427 data are set to zero. */
428 if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
429 (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
430 (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr) {
431 r = -EFAULT;
432 break;
433 }
434 if ((a.avail_user_addr & (sizeof *vq->avail->ring - 1)) ||
435 (a.used_user_addr & (sizeof *vq->used->ring - 1)) ||
436 (a.log_guest_addr & (sizeof *vq->used->ring - 1))) {
437 r = -EINVAL;
438 break;
439 }
440
441 /* We only verify access here if backend is configured.
442 * If it is not, we don't as size might not have been setup.
443 * We will verify when backend is configured. */
444 if (vq->private_data) {
445 if (!vq_access_ok(vq->num,
446 (void __user *)(unsigned long)a.desc_user_addr,
447 (void __user *)(unsigned long)a.avail_user_addr,
448 (void __user *)(unsigned long)a.used_user_addr)) {
449 r = -EINVAL;
450 break;
451 }
452
453 /* Also validate log access for used ring if enabled. */
454 if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) &&
455 !log_access_ok(vq->log_base, a.log_guest_addr,
456 sizeof *vq->used +
457 vq->num * sizeof *vq->used->ring)) {
458 r = -EINVAL;
459 break;
460 }
461 }
462
463 r = init_used(vq, (struct vring_used __user *)(unsigned long)
464 a.used_user_addr);
465 if (r)
466 break;
467 vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
468 vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
469 vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
470 vq->log_addr = a.log_guest_addr;
471 vq->used = (void __user *)(unsigned long)a.used_user_addr;
472 break;
473 case VHOST_SET_VRING_KICK:
474 r = copy_from_user(&f, argp, sizeof f);
475 if (r < 0)
476 break;
477 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
478 if (IS_ERR(eventfp))
479 return PTR_ERR(eventfp);
480 if (eventfp != vq->kick) {
481 pollstop = filep = vq->kick;
482 pollstart = vq->kick = eventfp;
483 } else
484 filep = eventfp;
485 break;
486 case VHOST_SET_VRING_CALL:
487 r = copy_from_user(&f, argp, sizeof f);
488 if (r < 0)
489 break;
490 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
491 if (IS_ERR(eventfp))
492 return PTR_ERR(eventfp);
493 if (eventfp != vq->call) {
494 filep = vq->call;
495 ctx = vq->call_ctx;
496 vq->call = eventfp;
497 vq->call_ctx = eventfp ?
498 eventfd_ctx_fileget(eventfp) : NULL;
499 } else
500 filep = eventfp;
501 break;
502 case VHOST_SET_VRING_ERR:
503 r = copy_from_user(&f, argp, sizeof f);
504 if (r < 0)
505 break;
506 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
507 if (IS_ERR(eventfp))
508 return PTR_ERR(eventfp);
509 if (eventfp != vq->error) {
510 filep = vq->error;
511 vq->error = eventfp;
512 ctx = vq->error_ctx;
513 vq->error_ctx = eventfp ?
514 eventfd_ctx_fileget(eventfp) : NULL;
515 } else
516 filep = eventfp;
517 break;
518 default:
519 r = -ENOIOCTLCMD;
520 }
521
522 if (pollstop && vq->handle_kick)
523 vhost_poll_stop(&vq->poll);
524
525 if (ctx)
526 eventfd_ctx_put(ctx);
527 if (filep)
528 fput(filep);
529
530 if (pollstart && vq->handle_kick)
531 vhost_poll_start(&vq->poll, vq->kick);
532
533 mutex_unlock(&vq->mutex);
534
535 if (pollstop && vq->handle_kick)
536 vhost_poll_flush(&vq->poll);
537 return r;
538}
539
540/* Caller must have device mutex */
541long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
542{
543 void __user *argp = (void __user *)arg;
544 struct file *eventfp, *filep = NULL;
545 struct eventfd_ctx *ctx = NULL;
546 u64 p;
547 long r;
548 int i, fd;
549
550 /* If you are not the owner, you can become one */
551 if (ioctl == VHOST_SET_OWNER) {
552 r = vhost_dev_set_owner(d);
553 goto done;
554 }
555
556 /* You must be the owner to do anything else */
557 r = vhost_dev_check_owner(d);
558 if (r)
559 goto done;
560
561 switch (ioctl) {
562 case VHOST_SET_MEM_TABLE:
563 r = vhost_set_memory(d, argp);
564 break;
565 case VHOST_SET_LOG_BASE:
566 r = copy_from_user(&p, argp, sizeof p);
567 if (r < 0)
568 break;
569 if ((u64)(unsigned long)p != p) {
570 r = -EFAULT;
571 break;
572 }
573 for (i = 0; i < d->nvqs; ++i) {
574 struct vhost_virtqueue *vq;
575 void __user *base = (void __user *)(unsigned long)p;
576 vq = d->vqs + i;
577 mutex_lock(&vq->mutex);
578 /* If ring is inactive, will check when it's enabled. */
579 if (vq->private_data && !vq_log_access_ok(vq, base))
580 r = -EFAULT;
581 else
582 vq->log_base = base;
583 mutex_unlock(&vq->mutex);
584 }
585 break;
586 case VHOST_SET_LOG_FD:
587 r = get_user(fd, (int __user *)argp);
588 if (r < 0)
589 break;
590 eventfp = fd == -1 ? NULL : eventfd_fget(fd);
591 if (IS_ERR(eventfp)) {
592 r = PTR_ERR(eventfp);
593 break;
594 }
595 if (eventfp != d->log_file) {
596 filep = d->log_file;
597 ctx = d->log_ctx;
598 d->log_ctx = eventfp ?
599 eventfd_ctx_fileget(eventfp) : NULL;
600 } else
601 filep = eventfp;
602 for (i = 0; i < d->nvqs; ++i) {
603 mutex_lock(&d->vqs[i].mutex);
604 d->vqs[i].log_ctx = d->log_ctx;
605 mutex_unlock(&d->vqs[i].mutex);
606 }
607 if (ctx)
608 eventfd_ctx_put(ctx);
609 if (filep)
610 fput(filep);
611 break;
612 default:
613 r = vhost_set_vring(d, ioctl, argp);
614 break;
615 }
616done:
617 return r;
618}
619
620static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
621 __u64 addr, __u32 len)
622{
623 struct vhost_memory_region *reg;
624 int i;
625 /* linear search is not brilliant, but we really have on the order of 6
626 * regions in practice */
627 for (i = 0; i < mem->nregions; ++i) {
628 reg = mem->regions + i;
629 if (reg->guest_phys_addr <= addr &&
630 reg->guest_phys_addr + reg->memory_size - 1 >= addr)
631 return reg;
632 }
633 return NULL;
634}
635
636/* TODO: This is really inefficient. We need something like get_user()
637 * (instruction directly accesses the data, with an exception table entry
638 * returning -EFAULT). See Documentation/x86/exception-tables.txt.
639 */
640static int set_bit_to_user(int nr, void __user *addr)
641{
642 unsigned long log = (unsigned long)addr;
643 struct page *page;
644 void *base;
645 int bit = nr + (log % PAGE_SIZE) * 8;
646 int r;
647 r = get_user_pages_fast(log, 1, 1, &page);
648 if (r)
649 return r;
650 base = kmap_atomic(page, KM_USER0);
651 set_bit(bit, base);
652 kunmap_atomic(base, KM_USER0);
653 set_page_dirty_lock(page);
654 put_page(page);
655 return 0;
656}
657
658static int log_write(void __user *log_base,
659 u64 write_address, u64 write_length)
660{
661 int r;
662 if (!write_length)
663 return 0;
664 write_address /= VHOST_PAGE_SIZE;
665 for (;;) {
666 u64 base = (u64)(unsigned long)log_base;
667 u64 log = base + write_address / 8;
668 int bit = write_address % 8;
669 if ((u64)(unsigned long)log != log)
670 return -EFAULT;
671 r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
672 if (r < 0)
673 return r;
674 if (write_length <= VHOST_PAGE_SIZE)
675 break;
676 write_length -= VHOST_PAGE_SIZE;
677 write_address += VHOST_PAGE_SIZE;
678 }
679 return r;
680}
681
682int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
683 unsigned int log_num, u64 len)
684{
685 int i, r;
686
687 /* Make sure data written is seen before log. */
688 wmb();
689 for (i = 0; i < log_num; ++i) {
690 u64 l = min(log[i].len, len);
691 r = log_write(vq->log_base, log[i].addr, l);
692 if (r < 0)
693 return r;
694 len -= l;
695 if (!len)
696 return 0;
697 }
698 if (vq->log_ctx)
699 eventfd_signal(vq->log_ctx, 1);
700 /* Length written exceeds what we have stored. This is a bug. */
701 BUG();
702 return 0;
703}
704
705int translate_desc(struct vhost_dev *dev, u64 addr, u32 len,
706 struct iovec iov[], int iov_size)
707{
708 const struct vhost_memory_region *reg;
709 struct vhost_memory *mem;
710 struct iovec *_iov;
711 u64 s = 0;
712 int ret = 0;
713
714 rcu_read_lock();
715
716 mem = rcu_dereference(dev->memory);
717 while ((u64)len > s) {
718 u64 size;
719 if (ret >= iov_size) {
720 ret = -ENOBUFS;
721 break;
722 }
723 reg = find_region(mem, addr, len);
724 if (!reg) {
725 ret = -EFAULT;
726 break;
727 }
728 _iov = iov + ret;
729 size = reg->memory_size - addr + reg->guest_phys_addr;
730 _iov->iov_len = min((u64)len, size);
731 _iov->iov_base = (void *)(unsigned long)
732 (reg->userspace_addr + addr - reg->guest_phys_addr);
733 s += size;
734 addr += size;
735 ++ret;
736 }
737
738 rcu_read_unlock();
739 return ret;
740}
741
742/* Each buffer in the virtqueues is actually a chain of descriptors. This
743 * function returns the next descriptor in the chain,
744 * or -1U if we're at the end. */
745static unsigned next_desc(struct vring_desc *desc)
746{
747 unsigned int next;
748
749 /* If this descriptor says it doesn't chain, we're done. */
750 if (!(desc->flags & VRING_DESC_F_NEXT))
751 return -1U;
752
753 /* Check they're not leading us off end of descriptors. */
754 next = desc->next;
755 /* Make sure compiler knows to grab that: we don't want it changing! */
756 /* We will use the result as an index in an array, so most
757 * architectures only need a compiler barrier here. */
758 read_barrier_depends();
759
760 return next;
761}
762
763static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
764 struct iovec iov[], unsigned int iov_size,
765 unsigned int *out_num, unsigned int *in_num,
766 struct vhost_log *log, unsigned int *log_num,
767 struct vring_desc *indirect)
768{
769 struct vring_desc desc;
770 unsigned int i = 0, count, found = 0;
771 int ret;
772
773 /* Sanity check */
774 if (indirect->len % sizeof desc) {
775 vq_err(vq, "Invalid length in indirect descriptor: "
776 "len 0x%llx not multiple of 0x%zx\n",
777 (unsigned long long)indirect->len,
778 sizeof desc);
779 return -EINVAL;
780 }
781
782 ret = translate_desc(dev, indirect->addr, indirect->len, vq->indirect,
783 ARRAY_SIZE(vq->indirect));
784 if (ret < 0) {
785 vq_err(vq, "Translation failure %d in indirect.\n", ret);
786 return ret;
787 }
788
789 /* We will use the result as an address to read from, so most
790 * architectures only need a compiler barrier here. */
791 read_barrier_depends();
792
793 count = indirect->len / sizeof desc;
794 /* Buffers are chained via a 16 bit next field, so
795 * we can have at most 2^16 of these. */
796 if (count > USHORT_MAX + 1) {
797 vq_err(vq, "Indirect buffer length too big: %d\n",
798 indirect->len);
799 return -E2BIG;
800 }
801
802 do {
803 unsigned iov_count = *in_num + *out_num;
804 if (++found > count) {
805 vq_err(vq, "Loop detected: last one at %u "
806 "indirect size %u\n",
807 i, count);
808 return -EINVAL;
809 }
810 if (memcpy_fromiovec((unsigned char *)&desc, vq->indirect,
811 sizeof desc)) {
812 vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
813 i, (size_t)indirect->addr + i * sizeof desc);
814 return -EINVAL;
815 }
816 if (desc.flags & VRING_DESC_F_INDIRECT) {
817 vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
818 i, (size_t)indirect->addr + i * sizeof desc);
819 return -EINVAL;
820 }
821
822 ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
823 iov_size - iov_count);
824 if (ret < 0) {
825 vq_err(vq, "Translation failure %d indirect idx %d\n",
826 ret, i);
827 return ret;
828 }
829 /* If this is an input descriptor, increment that count. */
830 if (desc.flags & VRING_DESC_F_WRITE) {
831 *in_num += ret;
832 if (unlikely(log)) {
833 log[*log_num].addr = desc.addr;
834 log[*log_num].len = desc.len;
835 ++*log_num;
836 }
837 } else {
838 /* If it's an output descriptor, they're all supposed
839 * to come before any input descriptors. */
840 if (*in_num) {
841 vq_err(vq, "Indirect descriptor "
842 "has out after in: idx %d\n", i);
843 return -EINVAL;
844 }
845 *out_num += ret;
846 }
847 } while ((i = next_desc(&desc)) != -1);
848 return 0;
849}
850
851/* This looks in the virtqueue and for the first available buffer, and converts
852 * it to an iovec for convenient access. Since descriptors consist of some
853 * number of output then some number of input descriptors, it's actually two
854 * iovecs, but we pack them into one and note how many of each there were.
855 *
856 * This function returns the descriptor number found, or vq->num (which
857 * is never a valid descriptor number) if none was found. */
858unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
859 struct iovec iov[], unsigned int iov_size,
860 unsigned int *out_num, unsigned int *in_num,
861 struct vhost_log *log, unsigned int *log_num)
862{
863 struct vring_desc desc;
864 unsigned int i, head, found = 0;
865 u16 last_avail_idx;
866 int ret;
867
868 /* Check it isn't doing very strange things with descriptor numbers. */
869 last_avail_idx = vq->last_avail_idx;
870 if (get_user(vq->avail_idx, &vq->avail->idx)) {
871 vq_err(vq, "Failed to access avail idx at %p\n",
872 &vq->avail->idx);
873 return vq->num;
874 }
875
876 if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
877 vq_err(vq, "Guest moved used index from %u to %u",
878 last_avail_idx, vq->avail_idx);
879 return vq->num;
880 }
881
882 /* If there's nothing new since last we looked, return invalid. */
883 if (vq->avail_idx == last_avail_idx)
884 return vq->num;
885
886 /* Only get avail ring entries after they have been exposed by guest. */
887 rmb();
888
889 /* Grab the next descriptor number they're advertising, and increment
890 * the index we've seen. */
891 if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
892 vq_err(vq, "Failed to read head: idx %d address %p\n",
893 last_avail_idx,
894 &vq->avail->ring[last_avail_idx % vq->num]);
895 return vq->num;
896 }
897
898 /* If their number is silly, that's an error. */
899 if (head >= vq->num) {
900 vq_err(vq, "Guest says index %u > %u is available",
901 head, vq->num);
902 return vq->num;
903 }
904
905 /* When we start there are none of either input nor output. */
906 *out_num = *in_num = 0;
907 if (unlikely(log))
908 *log_num = 0;
909
910 i = head;
911 do {
912 unsigned iov_count = *in_num + *out_num;
913 if (i >= vq->num) {
914 vq_err(vq, "Desc index is %u > %u, head = %u",
915 i, vq->num, head);
916 return vq->num;
917 }
918 if (++found > vq->num) {
919 vq_err(vq, "Loop detected: last one at %u "
920 "vq size %u head %u\n",
921 i, vq->num, head);
922 return vq->num;
923 }
924 ret = copy_from_user(&desc, vq->desc + i, sizeof desc);
925 if (ret) {
926 vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
927 i, vq->desc + i);
928 return vq->num;
929 }
930 if (desc.flags & VRING_DESC_F_INDIRECT) {
931 ret = get_indirect(dev, vq, iov, iov_size,
932 out_num, in_num,
933 log, log_num, &desc);
934 if (ret < 0) {
935 vq_err(vq, "Failure detected "
936 "in indirect descriptor at idx %d\n", i);
937 return vq->num;
938 }
939 continue;
940 }
941
942 ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count,
943 iov_size - iov_count);
944 if (ret < 0) {
945 vq_err(vq, "Translation failure %d descriptor idx %d\n",
946 ret, i);
947 return vq->num;
948 }
949 if (desc.flags & VRING_DESC_F_WRITE) {
950 /* If this is an input descriptor,
951 * increment that count. */
952 *in_num += ret;
953 if (unlikely(log)) {
954 log[*log_num].addr = desc.addr;
955 log[*log_num].len = desc.len;
956 ++*log_num;
957 }
958 } else {
959 /* If it's an output descriptor, they're all supposed
960 * to come before any input descriptors. */
961 if (*in_num) {
962 vq_err(vq, "Descriptor has out after in: "
963 "idx %d\n", i);
964 return vq->num;
965 }
966 *out_num += ret;
967 }
968 } while ((i = next_desc(&desc)) != -1);
969
970 /* On success, increment avail index. */
971 vq->last_avail_idx++;
972 return head;
973}
974
975/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
976void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
977{
978 vq->last_avail_idx--;
979}
980
981/* After we've used one of their buffers, we tell them about it. We'll then
982 * want to notify the guest, using eventfd. */
983int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
984{
985 struct vring_used_elem *used;
986
987 /* The virtqueue contains a ring of used buffers. Get a pointer to the
988 * next entry in that used ring. */
989 used = &vq->used->ring[vq->last_used_idx % vq->num];
990 if (put_user(head, &used->id)) {
991 vq_err(vq, "Failed to write used id");
992 return -EFAULT;
993 }
994 if (put_user(len, &used->len)) {
995 vq_err(vq, "Failed to write used len");
996 return -EFAULT;
997 }
998 /* Make sure buffer is written before we update index. */
999 wmb();
1000 if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
1001 vq_err(vq, "Failed to increment used idx");
1002 return -EFAULT;
1003 }
1004 if (unlikely(vq->log_used)) {
1005 /* Make sure data is seen before log. */
1006 wmb();
1007 log_write(vq->log_base, vq->log_addr + sizeof *vq->used->ring *
1008 (vq->last_used_idx % vq->num),
1009 sizeof *vq->used->ring);
1010 log_write(vq->log_base, vq->log_addr, sizeof *vq->used->ring);
1011 if (vq->log_ctx)
1012 eventfd_signal(vq->log_ctx, 1);
1013 }
1014 vq->last_used_idx++;
1015 return 0;
1016}
1017
1018/* This actually signals the guest, using eventfd. */
1019void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
1020{
1021 __u16 flags = 0;
1022 if (get_user(flags, &vq->avail->flags)) {
1023 vq_err(vq, "Failed to get flags");
1024 return;
1025 }
1026
1027 /* If they don't want an interrupt, don't signal, unless empty. */
1028 if ((flags & VRING_AVAIL_F_NO_INTERRUPT) &&
1029 (vq->avail_idx != vq->last_avail_idx ||
1030 !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY)))
1031 return;
1032
1033 /* Signal the Guest tell them we used something up. */
1034 if (vq->call_ctx)
1035 eventfd_signal(vq->call_ctx, 1);
1036}
1037
1038/* And here's the combo meal deal. Supersize me! */
1039void vhost_add_used_and_signal(struct vhost_dev *dev,
1040 struct vhost_virtqueue *vq,
1041 unsigned int head, int len)
1042{
1043 vhost_add_used(vq, head, len);
1044 vhost_signal(dev, vq);
1045}
1046
1047/* OK, now we need to know about added descriptors. */
1048bool vhost_enable_notify(struct vhost_virtqueue *vq)
1049{
1050 u16 avail_idx;
1051 int r;
1052 if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
1053 return false;
1054 vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
1055 r = put_user(vq->used_flags, &vq->used->flags);
1056 if (r) {
1057 vq_err(vq, "Failed to enable notification at %p: %d\n",
1058 &vq->used->flags, r);
1059 return false;
1060 }
1061 /* They could have slipped one in as we were doing that: make
1062 * sure it's written, then check again. */
1063 mb();
1064 r = get_user(avail_idx, &vq->avail->idx);
1065 if (r) {
1066 vq_err(vq, "Failed to check avail idx at %p: %d\n",
1067 &vq->avail->idx, r);
1068 return false;
1069 }
1070
1071 return avail_idx != vq->last_avail_idx;
1072}
1073
1074/* We don't need to be notified again. */
1075void vhost_disable_notify(struct vhost_virtqueue *vq)
1076{
1077 int r;
1078 if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
1079 return;
1080 vq->used_flags |= VRING_USED_F_NO_NOTIFY;
1081 r = put_user(vq->used_flags, &vq->used->flags);
1082 if (r)
1083 vq_err(vq, "Failed to enable notification at %p: %d\n",
1084 &vq->used->flags, r);
1085}
1086
1087int vhost_init(void)
1088{
1089 vhost_workqueue = create_singlethread_workqueue("vhost");
1090 if (!vhost_workqueue)
1091 return -ENOMEM;
1092 return 0;
1093}
1094
1095void vhost_cleanup(void)
1096{
1097 destroy_workqueue(vhost_workqueue);
1098}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
new file mode 100644
index 000000000000..44591ba9b07a
--- /dev/null
+++ b/drivers/vhost/vhost.h
@@ -0,0 +1,161 @@
1#ifndef _VHOST_H
2#define _VHOST_H
3
4#include <linux/eventfd.h>
5#include <linux/vhost.h>
6#include <linux/mm.h>
7#include <linux/mutex.h>
8#include <linux/workqueue.h>
9#include <linux/poll.h>
10#include <linux/file.h>
11#include <linux/skbuff.h>
12#include <linux/uio.h>
13#include <linux/virtio_config.h>
14#include <linux/virtio_ring.h>
15
16struct vhost_device;
17
18enum {
19 /* Enough place for all fragments, head, and virtio net header. */
20 VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
21};
22
23/* Poll a file (eventfd or socket) */
24/* Note: there's nothing vhost specific about this structure. */
25struct vhost_poll {
26 poll_table table;
27 wait_queue_head_t *wqh;
28 wait_queue_t wait;
29 /* struct which will handle all actual work. */
30 struct work_struct work;
31 unsigned long mask;
32};
33
34void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
35 unsigned long mask);
36void vhost_poll_start(struct vhost_poll *poll, struct file *file);
37void vhost_poll_stop(struct vhost_poll *poll);
38void vhost_poll_flush(struct vhost_poll *poll);
39void vhost_poll_queue(struct vhost_poll *poll);
40
41struct vhost_log {
42 u64 addr;
43 u64 len;
44};
45
46/* The virtqueue structure describes a queue attached to a device. */
47struct vhost_virtqueue {
48 struct vhost_dev *dev;
49
50 /* The actual ring of buffers. */
51 struct mutex mutex;
52 unsigned int num;
53 struct vring_desc __user *desc;
54 struct vring_avail __user *avail;
55 struct vring_used __user *used;
56 struct file *kick;
57 struct file *call;
58 struct file *error;
59 struct eventfd_ctx *call_ctx;
60 struct eventfd_ctx *error_ctx;
61 struct eventfd_ctx *log_ctx;
62
63 struct vhost_poll poll;
64
65 /* The routine to call when the Guest pings us, or timeout. */
66 work_func_t handle_kick;
67
68 /* Last available index we saw. */
69 u16 last_avail_idx;
70
71 /* Caches available index value from user. */
72 u16 avail_idx;
73
74 /* Last index we used. */
75 u16 last_used_idx;
76
77 /* Used flags */
78 u16 used_flags;
79
80 /* Log writes to used structure. */
81 bool log_used;
82 u64 log_addr;
83
84 struct iovec indirect[VHOST_NET_MAX_SG];
85 struct iovec iov[VHOST_NET_MAX_SG];
86 struct iovec hdr[VHOST_NET_MAX_SG];
87 size_t hdr_size;
88 /* We use a kind of RCU to access private pointer.
89 * All readers access it from workqueue, which makes it possible to
90 * flush the workqueue instead of synchronize_rcu. Therefore readers do
91 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
92 * work item execution acts instead of rcu_read_lock() and the end of
93 * work item execution acts instead of rcu_read_lock().
94 * Writers use virtqueue mutex. */
95 void *private_data;
96 /* Log write descriptors */
97 void __user *log_base;
98 struct vhost_log log[VHOST_NET_MAX_SG];
99};
100
101struct vhost_dev {
102 /* Readers use RCU to access memory table pointer
103 * log base pointer and features.
104 * Writers use mutex below.*/
105 struct vhost_memory *memory;
106 struct mm_struct *mm;
107 struct mutex mutex;
108 unsigned acked_features;
109 struct vhost_virtqueue *vqs;
110 int nvqs;
111 struct file *log_file;
112 struct eventfd_ctx *log_ctx;
113};
114
115long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
116long vhost_dev_check_owner(struct vhost_dev *);
117long vhost_dev_reset_owner(struct vhost_dev *);
118void vhost_dev_cleanup(struct vhost_dev *);
119long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg);
120int vhost_vq_access_ok(struct vhost_virtqueue *vq);
121int vhost_log_access_ok(struct vhost_dev *);
122
123unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
124 struct iovec iov[], unsigned int iov_count,
125 unsigned int *out_num, unsigned int *in_num,
126 struct vhost_log *log, unsigned int *log_num);
127void vhost_discard_vq_desc(struct vhost_virtqueue *);
128
129int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
130void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
131void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
132 unsigned int head, int len);
133void vhost_disable_notify(struct vhost_virtqueue *);
134bool vhost_enable_notify(struct vhost_virtqueue *);
135
136int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
137 unsigned int log_num, u64 len);
138
139int vhost_init(void);
140void vhost_cleanup(void);
141
142#define vq_err(vq, fmt, ...) do { \
143 pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
144 if ((vq)->error_ctx) \
145 eventfd_signal((vq)->error_ctx, 1);\
146 } while (0)
147
148enum {
149 VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
150 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
151 (1 << VHOST_F_LOG_ALL) |
152 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
153};
154
155static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
156{
157 unsigned acked_features = rcu_dereference(dev->acked_features);
158 return acked_features & (1 << bit);
159}
160
161#endif