author	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-04 14:47:58 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-04 14:47:58 -0400
commit	6ba74014c1ab0e37af7de6f64b4eccbbae3cb9e7 (patch)
tree	8f3892fc44f1e403675a6d7e88fda5c70e56ee4c /drivers/vhost
parent	5abd9ccced7a726c817dd6b5b96bc933859138d1 (diff)
parent	3ff1c25927e3af61c6bf0e4ed959504058ae4565 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1443 commits)
  phy/marvell: add 88ec048 support
  igb: Program MDICNFG register prior to PHY init
  e1000e: correct MAC-PHY interconnect register offset for 82579
  hso: Add new product ID
  can: Add driver for esd CAN-USB/2 device
  l2tp: fix export of header file for userspace
  can-raw: Fix skb_orphan_try handling
  Revert "net: remove zap_completion_queue"
  net: cleanup inclusion
  phy/marvell: add 88e1121 interface mode support
  u32: negative offset fix
  net: Fix a typo from "dev" to "ndev"
  igb: Use irq_synchronize per vector when using MSI-X
  ixgbevf: fix null pointer dereference due to filter being set for VLAN 0
  e1000e: Fix irq_synchronize in MSI-X case
  e1000e: register pm_qos request on hardware activation
  ip_fragment: fix subtracting PPPOE_SES_HLEN from mtu twice
  net: Add getsockopt support for TCP thin-streams
  cxgb4: update driver version
  cxgb4: add new PCI IDs
  ...

Manually fix up conflicts in:
 - drivers/net/e1000e/netdev.c: due to pm_qos registration
   infrastructure changes
 - drivers/net/phy/marvell.c: conflict between adding 88ec048 support
   and cleaning up the IDs
 - drivers/net/wireless/ipw2x00/ipw2100.c: trivial ipw2100_pm_qos_req
   conflict (registration change vs marking it static)
Diffstat (limited to 'drivers/vhost')
-rw-r--r--	drivers/vhost/net.c	306
-rw-r--r--	drivers/vhost/vhost.c	232
-rw-r--r--	drivers/vhost/vhost.h	55
3 files changed, 493 insertions(+), 100 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d219070fed3d..29e850a7a2f9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
 	}
 	return seg;
 }
+/* Copy iovec entries for len bytes from iovec. */
+static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
+			   size_t len, int iovcount)
+{
+	int seg = 0;
+	size_t size;
+	while (len && seg < iovcount) {
+		size = min(from->iov_len, len);
+		to->iov_base = from->iov_base;
+		to->iov_len = size;
+		len -= size;
+		++from;
+		++to;
+		++seg;
+	}
+}
 
 /* Caller must have TX VQ lock */
 static void tx_poll_stop(struct vhost_net *net)
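Note on the new helper: unlike move_iovec_hdr() above, which consumes the header bytes from the source iovec as it goes, copy_iovec_hdr() only duplicates the iovec entries covering the first len bytes and leaves the source untouched. The mergeable RX path added below depends on this, since recvmsg() may modify msg_iov while the header entries must survive for the later num_buffers write. A minimal user-space sketch of the same logic (MIN(), copy_hdr() and the main() driver are illustrative stand-ins, not kernel code; this variant also returns the entry count for the demo):

#include <stdio.h>
#include <sys/uio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Duplicate the iovec entries covering the first len bytes of 'from'
 * into 'to' without advancing 'from' -- mirrors copy_iovec_hdr(). */
static int copy_hdr(const struct iovec *from, struct iovec *to,
		    size_t len, int iovcount)
{
	int seg = 0;

	while (len && seg < iovcount) {
		size_t size = MIN(from->iov_len, len);

		to->iov_base = from->iov_base;
		to->iov_len = size;
		len -= size;
		++from;
		++to;
		++seg;
	}
	return seg;
}

int main(void)
{
	char a[4], b[10];
	struct iovec src[2] = { { a, sizeof a }, { b, sizeof b } };
	struct iovec dst[2];
	int n = copy_hdr(src, dst, 12, 2);	/* a 12-byte header span */

	/* First entry covers all 4 bytes of a, second the first 8 of b. */
	printf("%d entries: %zu + %zu bytes\n", n, dst[0].iov_len, dst[1].iov_len);
	return 0;
}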
@@ -129,7 +145,7 @@ static void handle_tx(struct vhost_net *net)
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
-	hdr_size = vq->hdr_size;
+	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
@@ -172,7 +188,7 @@ static void handle_tx(struct vhost_net *net)
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
@@ -191,9 +207,82 @@ static void handle_tx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
+static int peek_head_len(struct sock *sk)
+{
+	struct sk_buff *head;
+	int len = 0;
+
+	lock_sock(sk);
+	head = skb_peek(&sk->sk_receive_queue);
+	if (head)
+		len = head->len;
+	release_sock(sk);
+	return len;
+}
+
+/* This is a multi-buffer version of vhost_get_vq_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ *	returns number of buffer heads allocated, negative on error
+ */
+static int get_rx_bufs(struct vhost_virtqueue *vq,
+		       struct vring_used_elem *heads,
+		       int datalen,
+		       unsigned *iovcount,
+		       struct vhost_log *log,
+		       unsigned *log_num)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+
+	while (datalen > 0) {
+		if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+			       "out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = d;
+		heads[headcount].len = iov_length(vq->iov + seg, in);
+		datalen -= heads[headcount].len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len += datalen;
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_rx(struct vhost_net *net)
+static void handle_rx_big(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 	unsigned out, in, log, s;
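To make get_rx_bufs()'s bookkeeping concrete: the loop keeps claiming in-descriptors until their capacities cover datalen, so datalen is zero or negative on exit, and the final head's recorded length is then trimmed by that remainder so the lengths sum exactly to the packet size. A stand-alone sketch of just this arithmetic (the 4200-byte packet and 1500-byte buffers are arbitrary assumptions for illustration):

#include <stdio.h>

int main(void)
{
	int datalen = 4200;	/* vhost_hlen + sock_hlen + payload (assumed) */
	int bufsz = 1500;	/* per-descriptor capacity (assumed) */
	int len[16], headcount = 0;

	while (datalen > 0) {
		len[headcount] = bufsz;	/* what iov_length() reports per head */
		datalen -= len[headcount];
		++headcount;
	}
	len[headcount - 1] += datalen;	/* datalen <= 0 here: trim the tail */

	printf("heads: %d, last len: %d\n", headcount, len[headcount - 1]);
	/* -> heads: 3, last len: 1200; 1500 + 1500 + 1200 == 4200 */
	return 0;
}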
@@ -223,7 +312,7 @@ static void handle_rx(struct vhost_net *net)
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	hdr_size = vq->vhost_hlen;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
@@ -270,14 +359,14 @@ static void handle_rx(struct vhost_net *net)
 			       len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			break;
 		}
 		/* TODO: Should check and handle checksum. */
 		if (err > len) {
 			pr_debug("Discarded truncated rx packet: "
 				 " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			continue;
 		}
 		len = err;
@@ -302,54 +391,175 @@ static void handle_rx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
-static void handle_tx_kick(struct work_struct *work)
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+static void handle_rx_mergeable(struct vhost_net *net)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	unsigned uninitialized_var(in), log;
+	struct vhost_log *vq_log;
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
+		.msg_controllen = 0,
+		.msg_iov = vq->iov,
+		.msg_flags = MSG_DONTWAIT,
+	};
+
+	struct virtio_net_hdr_mrg_rxbuf hdr = {
+		.hdr.flags = 0,
+		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+
+	size_t total_len = 0;
+	int err, headcount;
+	size_t vhost_hlen, sock_hlen;
+	size_t vhost_len, sock_len;
+	struct socket *sock = rcu_dereference(vq->private_data);
+	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+		return;
+
+	use_mm(net->dev.mm);
+	mutex_lock(&vq->mutex);
+	vhost_disable_notify(vq);
+	vhost_hlen = vq->vhost_hlen;
+	sock_hlen = vq->sock_hlen;
+
+	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+		vq->log : NULL;
+
+	while ((sock_len = peek_head_len(sock->sk))) {
+		sock_len += sock_hlen;
+		vhost_len = sock_len + vhost_hlen;
+		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+					&in, vq_log, &log);
+		/* On error, stop handling until the next kick. */
+		if (unlikely(headcount < 0))
+			break;
+		/* OK, now we need to know about added descriptors. */
+		if (!headcount) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				/* They have slipped one in as we were
+				 * doing that: check again. */
+				vhost_disable_notify(vq);
+				continue;
+			}
+			/* Nothing new?  Wait for eventfd to tell us
+			 * they refilled. */
+			break;
+		}
+		/* We don't need to be notified again. */
+		if (unlikely((vhost_hlen)))
+			/* Skip header. TODO: support TSO. */
+			move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+		else
+			/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
+			 * needed because recvmsg can modify msg_iov. */
+			copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
+		msg.msg_iovlen = in;
+		err = sock->ops->recvmsg(NULL, sock, &msg,
+					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
+		/* Userspace might have consumed the packet meanwhile:
+		 * it's not supposed to do this usually, but might be hard
+		 * to prevent. Discard data we got (if any) and keep going. */
+		if (unlikely(err != sock_len)) {
+			pr_debug("Discarded rx packet: "
+				 " len %d, expected %zd\n", err, sock_len);
+			vhost_discard_vq_desc(vq, headcount);
+			continue;
+		}
+		if (unlikely(vhost_hlen) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+				      vhost_hlen)) {
+			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
+			       vq->iov->iov_base);
+			break;
+		}
+		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+				      offsetof(typeof(hdr), num_buffers),
+				      sizeof hdr.num_buffers)) {
+			vq_err(vq, "Failed num_buffers write");
+			vhost_discard_vq_desc(vq, headcount);
+			break;
+		}
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    headcount);
+		if (unlikely(vq_log))
+			vhost_log_write(vq, vq_log, log, vhost_len);
+		total_len += vhost_len;
+		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+
+	mutex_unlock(&vq->mutex);
+	unuse_mm(net->dev.mm);
+}
+
+static void handle_rx(struct vhost_net *net)
+{
+	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
+		handle_rx_mergeable(net);
+	else
+		handle_rx_big(net);
+}
+
+static void handle_tx_kick(struct vhost_work *work)
+{
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_tx(net);
 }
 
-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_rx(net);
 }
 
-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_TX].work);
 	handle_tx(net);
 }
 
-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_RX].work);
 	handle_rx(net);
 }
 
 static int vhost_net_open(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+	struct vhost_dev *dev;
 	int r;
+
 	if (!n)
 		return -ENOMEM;
+
+	dev = &n->dev;
 	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
 		kfree(n);
 		return r;
 	}
 
-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
 
 	f->private_data = n;
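A gloss on the num_buffers write above: with VIRTIO_NET_F_MRG_RXBUF the per-packet header is a virtio_net_hdr_mrg_rxbuf, i.e. the basic header plus a trailing 16-bit num_buffers count, and handle_rx_mergeable() patches only that one field into the header iovecs saved earlier by copy_iovec_hdr()/move_iovec_hdr(). A user-space sketch of the offsetof() patching (the struct layouts are illustrative stand-ins, not the virtio ABI, and endianness is ignored):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Stand-in mirroring the shape of virtio_net_hdr_mrg_rxbuf. */
struct net_hdr {
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t hdr_len, gso_size, csum_start, csum_offset;
};

struct net_hdr_mrg {
	struct net_hdr hdr;
	uint16_t num_buffers;	/* how many heads carry this packet */
};

int main(void)
{
	unsigned char buf[sizeof(struct net_hdr_mrg)] = { 0 };
	uint16_t headcount = 3, got;

	/* Patch only num_buffers, as the memcpy_toiovecend() call does,
	 * locating the field inside the raw header bytes via offsetof(). */
	memcpy(buf + offsetof(struct net_hdr_mrg, num_buffers),
	       &headcount, sizeof headcount);

	memcpy(&got, buf + offsetof(struct net_hdr_mrg, num_buffers), sizeof got);
	printf("num_buffers at offset %zu = %u\n",
	       offsetof(struct net_hdr_mrg, num_buffers), (unsigned)got);
	return 0;
}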
@@ -527,13 +737,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 
 	/* start polling new socket */
 	oldsock = vq->private_data;
-	if (sock == oldsock)
-		goto done;
+	if (sock != oldsock) {
+		vhost_net_disable_vq(n, vq);
+		rcu_assign_pointer(vq->private_data, sock);
+		vhost_net_enable_vq(n, vq);
+	}
 
-	vhost_net_disable_vq(n, vq);
-	rcu_assign_pointer(vq->private_data, sock);
-	vhost_net_enable_vq(n, vq);
-done:
 	mutex_unlock(&vq->mutex);
 
 	if (oldsock) {
@@ -574,9 +783,21 @@ done:
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-		sizeof(struct virtio_net_hdr) : 0;
+	size_t vhost_hlen, sock_hlen, hdr_len;
 	int i;
+
+	hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
+			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+			sizeof(struct virtio_net_hdr);
+	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
+		/* vhost provides vnet_hdr */
+		vhost_hlen = hdr_len;
+		sock_hlen = 0;
+	} else {
+		/* socket provides vnet_hdr */
+		vhost_hlen = 0;
+		sock_hlen = hdr_len;
+	}
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
 	    !vhost_log_access_ok(&n->dev)) {
@@ -587,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
 	smp_wmb();
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 		mutex_lock(&n->vqs[i].mutex);
-		n->vqs[i].hdr_size = hdr_size;
+		n->vqs[i].vhost_hlen = vhost_hlen;
+		n->vqs[i].sock_hlen = sock_hlen;
 		mutex_unlock(&n->vqs[i].mutex);
 	}
 	vhost_net_flush(n);
@@ -639,7 +861,7 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
 }
 #endif
 
-const static struct file_operations vhost_net_fops = {
+static const struct file_operations vhost_net_fops = {
 	.owner          = THIS_MODULE,
 	.release        = vhost_net_release,
 	.unlocked_ioctl = vhost_net_ioctl,
@@ -657,25 +879,13 @@ static struct miscdevice vhost_net_misc = {
 
 static int vhost_net_init(void)
 {
-	int r = vhost_init();
-	if (r)
-		goto err_init;
-	r = misc_register(&vhost_net_misc);
-	if (r)
-		goto err_reg;
-	return 0;
-err_reg:
-	vhost_cleanup();
-err_init:
-	return r;
-
+	return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);
 
 static void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
-	vhost_cleanup();
 }
 module_exit(vhost_net_exit);
 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0b99783083f6..e05557d52999 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -17,12 +17,13 @@
 #include <linux/mm.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/rcupdate.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/cgroup.h>
 
 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -37,8 +38,6 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
-static struct workqueue_struct *vhost_workqueue;
-
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
@@ -52,23 +51,31 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
 			     void *key)
 {
-	struct vhost_poll *poll;
-	poll = container_of(wait, struct vhost_poll, wait);
+	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
 	if (!((unsigned long)key & poll->mask))
 		return 0;
 
-	queue_work(vhost_workqueue, &poll->work);
+	vhost_poll_queue(poll);
 	return 0;
 }
 
 /* Init poll structure */
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask)
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev)
 {
-	INIT_WORK(&poll->work, func);
+	struct vhost_work *work = &poll->work;
+
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
+	poll->dev = dev;
+
+	INIT_LIST_HEAD(&work->node);
+	work->fn = fn;
+	init_waitqueue_head(&work->done);
+	work->flushing = 0;
+	work->queue_seq = work->done_seq = 0;
 }
 
 /* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -92,12 +99,40 @@ void vhost_poll_stop(struct vhost_poll *poll)
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	flush_work(&poll->work);
+	struct vhost_work *work = &poll->work;
+	unsigned seq;
+	int left;
+	int flushing;
+
+	spin_lock_irq(&poll->dev->work_lock);
+	seq = work->queue_seq;
+	work->flushing++;
+	spin_unlock_irq(&poll->dev->work_lock);
+	wait_event(work->done, ({
+		   spin_lock_irq(&poll->dev->work_lock);
+		   left = seq - work->done_seq <= 0;
+		   spin_unlock_irq(&poll->dev->work_lock);
+		   left;
+	}));
+	spin_lock_irq(&poll->dev->work_lock);
+	flushing = --work->flushing;
+	spin_unlock_irq(&poll->dev->work_lock);
+	BUG_ON(flushing < 0);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	queue_work(vhost_workqueue, &poll->work);
+	struct vhost_dev *dev = poll->dev;
+	struct vhost_work *work = &poll->work;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->work_lock, flags);
+	if (list_empty(&work->node)) {
+		list_add_tail(&work->node, &dev->work_list);
+		work->queue_seq++;
+		wake_up_process(dev->worker);
+	}
+	spin_unlock_irqrestore(&dev->work_lock, flags);
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
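The flush_work() replacement above is worth spelling out: each vhost_work carries two free-running counters, queue_seq (bumped when the work is queued) and done_seq (published by the worker after running it), and a flusher waits until done_seq catches up with the queue_seq it snapshotted, so it only waits for work queued before the flush began. A user-space pthread analogue of the protocol (a sketch, not the kernel code: the spinlock becomes a mutex, the wait queue a condition variable):

#include <pthread.h>

struct work {
	pthread_mutex_t lock;
	pthread_cond_t  done;
	unsigned        queue_seq;	/* bumped when the work is queued */
	unsigned        done_seq;	/* bumped after the work has run */
};

static void work_flush(struct work *w)
{
	pthread_mutex_lock(&w->lock);
	unsigned seq = w->queue_seq;	/* wait only for work queued so far */

	/* Signed difference tolerates counter wraparound, like the
	 * "seq - work->done_seq <= 0" test in vhost_poll_flush(). */
	while ((int)(seq - w->done_seq) > 0)
		pthread_cond_wait(&w->done, &w->lock);
	pthread_mutex_unlock(&w->lock);
}

/* Worker side: publish completion of the item that ran with sequence
 * 'seq', waking any flusher blocked in work_flush(). */
static void work_complete(struct work *w, unsigned seq)
{
	pthread_mutex_lock(&w->lock);
	w->done_seq = seq;
	pthread_cond_broadcast(&w->done);
	pthread_mutex_unlock(&w->lock);
}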
@@ -114,7 +149,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->used_flags = 0;
 	vq->log_used = false;
 	vq->log_addr = -1ull;
-	vq->hdr_size = 0;
+	vq->vhost_hlen = 0;
+	vq->sock_hlen = 0;
 	vq->private_data = NULL;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
@@ -125,10 +161,51 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->log_ctx = NULL;
 }
 
+static int vhost_worker(void *data)
+{
+	struct vhost_dev *dev = data;
+	struct vhost_work *work = NULL;
+	unsigned uninitialized_var(seq);
+
+	for (;;) {
+		/* mb paired w/ kthread_stop */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_lock_irq(&dev->work_lock);
+		if (work) {
+			work->done_seq = seq;
+			if (work->flushing)
+				wake_up_all(&work->done);
+		}
+
+		if (kthread_should_stop()) {
+			spin_unlock_irq(&dev->work_lock);
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+		if (!list_empty(&dev->work_list)) {
+			work = list_first_entry(&dev->work_list,
+						struct vhost_work, node);
+			list_del_init(&work->node);
+			seq = work->queue_seq;
+		} else
+			work = NULL;
+		spin_unlock_irq(&dev->work_lock);
+
+		if (work) {
+			__set_current_state(TASK_RUNNING);
+			work->fn(work);
+		} else
+			schedule();
+
+	}
+}
+
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
 	int i;
+
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
 	mutex_init(&dev->mutex);
@@ -136,6 +213,9 @@ long vhost_dev_init(struct vhost_dev *dev,
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
+	spin_lock_init(&dev->work_lock);
+	INIT_LIST_HEAD(&dev->work_list);
+	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].dev = dev;
@@ -143,9 +223,9 @@ long vhost_dev_init(struct vhost_dev *dev,
 		vhost_vq_reset(dev, dev->vqs + i);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick,
-					POLLIN);
+					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
+
 	return 0;
 }
 
@@ -159,12 +239,36 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 /* Caller should have device mutex */
 static long vhost_dev_set_owner(struct vhost_dev *dev)
 {
+	struct task_struct *worker;
+	int err;
 	/* Is there an owner already? */
-	if (dev->mm)
-		return -EBUSY;
+	if (dev->mm) {
+		err = -EBUSY;
+		goto err_mm;
+	}
 	/* No owner, become one */
 	dev->mm = get_task_mm(current);
+	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
+	if (IS_ERR(worker)) {
+		err = PTR_ERR(worker);
+		goto err_worker;
+	}
+
+	dev->worker = worker;
+	err = cgroup_attach_task_current_cg(worker);
+	if (err)
+		goto err_cgroup;
+	wake_up_process(worker);	/* avoid contributing to loadavg */
+
 	return 0;
+err_cgroup:
+	kthread_stop(worker);
+err_worker:
+	if (dev->mm)
+		mmput(dev->mm);
+	dev->mm = NULL;
+err_mm:
+	return err;
 }
 
 /* Caller should have device mutex */
@@ -217,6 +321,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
+
+	WARN_ON(!list_empty(&dev->work_list));
+	kthread_stop(dev->worker);
 }
 
 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -237,8 +344,8 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
 {
 	int i;
 
-        if (!mem)
-                return 0;
+	if (!mem)
+		return 0;
 
 	for (i = 0; i < mem->nregions; ++i) {
 		struct vhost_memory_region *m = mem->regions + i;
@@ -995,9 +1102,9 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 }
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
-	vq->last_avail_idx--;
+	vq->last_avail_idx -= n;
 }
 
 /* After we've used one of their buffers, we tell them about it. We'll then
@@ -1042,6 +1149,67 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
 	return 0;
 }
 
+static int __vhost_add_used_n(struct vhost_virtqueue *vq,
+			      struct vring_used_elem *heads,
+			      unsigned count)
+{
+	struct vring_used_elem __user *used;
+	int start;
+
+	start = vq->last_used_idx % vq->num;
+	used = vq->used->ring + start;
+	if (copy_to_user(used, heads, count * sizeof *used)) {
+		vq_err(vq, "Failed to write used");
+		return -EFAULT;
+	}
+	if (unlikely(vq->log_used)) {
+		/* Make sure data is seen before log. */
+		smp_wmb();
+		/* Log used ring entry write. */
+		log_write(vq->log_base,
+			  vq->log_addr +
+			   ((void __user *)used - (void __user *)vq->used),
+			  count * sizeof *used);
+	}
+	vq->last_used_idx += count;
+	return 0;
+}
+
+/* After we've used one of their buffers, we tell them about it.  We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+		     unsigned count)
+{
+	int start, n, r;
+
+	start = vq->last_used_idx % vq->num;
+	n = vq->num - start;
+	if (n < count) {
+		r = __vhost_add_used_n(vq, heads, n);
+		if (r < 0)
+			return r;
+		heads += n;
+		count -= n;
+	}
+	r = __vhost_add_used_n(vq, heads, count);
+
+	/* Make sure buffer is written before we update index. */
+	smp_wmb();
+	if (put_user(vq->last_used_idx, &vq->used->idx)) {
+		vq_err(vq, "Failed to increment used idx");
+		return -EFAULT;
+	}
+	if (unlikely(vq->log_used)) {
+		/* Log used index update. */
+		log_write(vq->log_base,
+			  vq->log_addr + offsetof(struct vring_used, idx),
+			  sizeof vq->used->idx);
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	return r;
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
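One subtlety in vhost_add_used_n() above: last_used_idx is a free-running counter while the used ring has only vq->num slots, so a batch that would run past the end of the ring is copied in two chunks, and only afterwards is the new index published to the guest. The split arithmetic in isolation (ring size and indices are arbitrary assumptions):

#include <stdio.h>

int main(void)
{
	unsigned num = 8;		/* ring slots (assumed) */
	unsigned last_used_idx = 6;	/* free-running counter */
	unsigned count = 5;		/* used entries to publish */

	unsigned start = last_used_idx % num;
	unsigned n = num - start;	/* room left before the ring wraps */

	if (n < count) {		/* batch crosses the ring end */
		printf("copy %u entries at slot %u\n", n, start);
		count -= n;
		start = 0;
	}
	printf("copy %u entries at slot %u\n", count, start);
	/* -> copy 2 entries at slot 6, then copy 3 entries at slot 0 */
	return 0;
}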
@@ -1076,6 +1244,15 @@ void vhost_add_used_and_signal(struct vhost_dev *dev,
 	vhost_signal(dev, vq);
 }
 
+/* multi-buffer version of vhost_add_used_and_signal */
+void vhost_add_used_and_signal_n(struct vhost_dev *dev,
+				 struct vhost_virtqueue *vq,
+				 struct vring_used_elem *heads, unsigned count)
+{
+	vhost_add_used_n(vq, heads, count);
+	vhost_signal(dev, vq);
+}
+
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_virtqueue *vq)
 {
@@ -1100,7 +1277,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
 		return false;
 	}
 
-	return avail_idx != vq->last_avail_idx;
+	return avail_idx != vq->avail_idx;
 }
 
 /* We don't need to be notified again. */
@@ -1115,16 +1292,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
-
-int vhost_init(void)
-{
-	vhost_workqueue = create_singlethread_workqueue("vhost");
-	if (!vhost_workqueue)
-		return -ENOMEM;
-	return 0;
-}
-
-void vhost_cleanup(void)
-{
-	destroy_workqueue(vhost_workqueue);
-}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 11ee13dba0f7..afd77295971c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -5,13 +5,13 @@
 #include <linux/vhost.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/skbuff.h>
 #include <linux/uio.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
+#include <asm/atomic.h>
 
 struct vhost_device;
 
@@ -20,19 +20,31 @@ enum {
 	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
 };
 
+struct vhost_work;
+typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+
+struct vhost_work {
+	struct list_head node;
+	vhost_work_fn_t fn;
+	wait_queue_head_t done;
+	int flushing;
+	unsigned queue_seq;
+	unsigned done_seq;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
 	poll_table table;
 	wait_queue_head_t *wqh;
 	wait_queue_t wait;
-	/* struct which will handle all actual work. */
-	struct work_struct work;
+	struct vhost_work work;
 	unsigned long mask;
+	struct vhost_dev *dev;
 };
 
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask);
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -63,7 +75,7 @@ struct vhost_virtqueue {
 	struct vhost_poll poll;
 
 	/* The routine to call when the Guest pings us, or timeout. */
-	work_func_t handle_kick;
+	vhost_work_fn_t handle_kick;
 
 	/* Last available index we saw. */
 	u16 last_avail_idx;
@@ -84,13 +96,15 @@ struct vhost_virtqueue {
 	struct iovec indirect[VHOST_NET_MAX_SG];
 	struct iovec iov[VHOST_NET_MAX_SG];
 	struct iovec hdr[VHOST_NET_MAX_SG];
-	size_t hdr_size;
+	size_t vhost_hlen;
+	size_t sock_hlen;
+	struct vring_used_elem heads[VHOST_NET_MAX_SG];
 	/* We use a kind of RCU to access private pointer.
-	 * All readers access it from workqueue, which makes it possible to
-	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
+	 * All readers access it from worker, which makes it possible to
+	 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
 	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
-	 * work item execution acts instead of rcu_read_lock() and the end of
-	 * work item execution acts instead of rcu_read_lock().
+	 * vhost_work execution acts instead of rcu_read_lock() and the end of
+	 * vhost_work execution acts instead of rcu_read_unlock().
 	 * Writers use virtqueue mutex. */
 	void *private_data;
 	/* Log write descriptors */
@@ -110,6 +124,9 @@ struct vhost_dev {
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
+	spinlock_t work_lock;
+	struct list_head work_list;
+	struct task_struct *worker;
 };
 
 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -124,21 +141,22 @@ int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *);
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
+int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int head, int len);
+			       unsigned int id, int len);
+void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
+				 struct vring_used_elem *heads, unsigned count);
+void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_virtqueue *);
 bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
 
-int vhost_init(void);
-void vhost_cleanup(void);
-
 #define vq_err(vq, fmt, ...) do { \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
 		if ((vq)->error_ctx) \
@@ -149,7 +167,8 @@ enum {
 	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
 			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
 			 (1 << VHOST_F_LOG_ALL) |
-			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+			 (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)