author		Linus Torvalds <torvalds@linux-foundation.org>	2010-08-04 14:47:58 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-04 14:47:58 -0400
commit		6ba74014c1ab0e37af7de6f64b4eccbbae3cb9e7 (patch)
tree		8f3892fc44f1e403675a6d7e88fda5c70e56ee4c /drivers/vhost
parent		5abd9ccced7a726c817dd6b5b96bc933859138d1 (diff)
parent		3ff1c25927e3af61c6bf0e4ed959504058ae4565 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1443 commits)
phy/marvell: add 88ec048 support
igb: Program MDICNFG register prior to PHY init
e1000e: correct MAC-PHY interconnect register offset for 82579
hso: Add new product ID
can: Add driver for esd CAN-USB/2 device
l2tp: fix export of header file for userspace
can-raw: Fix skb_orphan_try handling
Revert "net: remove zap_completion_queue"
net: cleanup inclusion
phy/marvell: add 88e1121 interface mode support
u32: negative offset fix
net: Fix a typo from "dev" to "ndev"
igb: Use irq_synchronize per vector when using MSI-X
ixgbevf: fix null pointer dereference due to filter being set for VLAN 0
e1000e: Fix irq_synchronize in MSI-X case
e1000e: register pm_qos request on hardware activation
ip_fragment: fix subtracting PPPOE_SES_HLEN from mtu twice
net: Add getsockopt support for TCP thin-streams
cxgb4: update driver version
cxgb4: add new PCI IDs
...
Manually fix up conflicts in:
- drivers/net/e1000e/netdev.c: due to pm_qos registration
infrastructure changes
- drivers/net/phy/marvell.c: conflict between adding 88ec048 support
and cleaning up the IDs
- drivers/net/wireless/ipw2x00/ipw2100.c: trivial ipw2100_pm_qos_req
conflict (registration change vs marking it static)
Diffstat (limited to 'drivers/vhost')
-rw-r--r--	drivers/vhost/net.c	306
-rw-r--r--	drivers/vhost/vhost.c	232
-rw-r--r--	drivers/vhost/vhost.h	55
3 files changed, 493 insertions(+), 100 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d219070fed3d..29e850a7a2f9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
 	}
 	return seg;
 }
+/* Copy iovec entries for len bytes from iovec. */
+static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
+			   size_t len, int iovcount)
+{
+	int seg = 0;
+	size_t size;
+	while (len && seg < iovcount) {
+		size = min(from->iov_len, len);
+		to->iov_base = from->iov_base;
+		to->iov_len = size;
+		len -= size;
+		++from;
+		++to;
+		++seg;
+	}
+}
 
 /* Caller must have TX VQ lock */
 static void tx_poll_stop(struct vhost_net *net)
@@ -129,7 +145,7 @@ static void handle_tx(struct vhost_net *net)
 
 	if (wmem < sock->sk->sk_sndbuf / 2)
 		tx_poll_stop(net);
-	hdr_size = vq->hdr_size;
+	hdr_size = vq->vhost_hlen;
 
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
@@ -172,7 +188,7 @@ static void handle_tx(struct vhost_net *net)
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			tx_poll_start(net, sock);
 			break;
 		}
@@ -191,9 +207,82 @@ static void handle_tx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
+static int peek_head_len(struct sock *sk)
+{
+	struct sk_buff *head;
+	int len = 0;
+
+	lock_sock(sk);
+	head = skb_peek(&sk->sk_receive_queue);
+	if (head)
+		len = head->len;
+	release_sock(sk);
+	return len;
+}
+
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ * vq has read descriptors only.
+ * @vq - the relevant virtqueue
+ * @datalen - data length we'll be reading
+ * @iovcount - returned count of io vectors we fill
+ * @log - vhost log
+ * @log_num - log offset
+ * returns number of buffer heads allocated, negative on error
+ */
+static int get_rx_bufs(struct vhost_virtqueue *vq,
+		       struct vring_used_elem *heads,
+		       int datalen,
+		       unsigned *iovcount,
+		       struct vhost_log *log,
+		       unsigned *log_num)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r, nlogs = 0;
+
+	while (datalen > 0) {
+		if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
+				      ARRAY_SIZE(vq->iov) - seg, &out,
+				      &in, log, log_num);
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		if (unlikely(log)) {
+			nlogs += *log_num;
+			log += *log_num;
+		}
+		heads[headcount].id = d;
+		heads[headcount].len = iov_length(vq->iov + seg, in);
+		datalen -= heads[headcount].len;
+		++headcount;
+		seg += in;
+	}
+	heads[headcount - 1].len += datalen;
+	*iovcount = seg;
+	if (unlikely(log))
+		*log_num = nlogs;
+	return headcount;
+err:
+	vhost_discard_vq_desc(vq, headcount);
+	return r;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_rx(struct vhost_net *net)
+static void handle_rx_big(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 	unsigned out, in, log, s;
@@ -223,7 +312,7 @@ static void handle_rx(struct vhost_net *net)
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
+	hdr_size = vq->vhost_hlen;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
@@ -270,14 +359,14 @@ static void handle_rx(struct vhost_net *net)
 					 len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			break;
 		}
 		/* TODO: Should check and handle checksum. */
 		if (err > len) {
 			pr_debug("Discarded truncated rx packet: "
 				 " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq);
+			vhost_discard_vq_desc(vq, 1);
 			continue;
 		}
 		len = err;
@@ -302,54 +391,175 @@ static void handle_rx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
-static void handle_tx_kick(struct work_struct *work)
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
+static void handle_rx_mergeable(struct vhost_net *net)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+	unsigned uninitialized_var(in), log;
+	struct vhost_log *vq_log;
+	struct msghdr msg = {
+		.msg_name = NULL,
+		.msg_namelen = 0,
+		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
+		.msg_controllen = 0,
+		.msg_iov = vq->iov,
+		.msg_flags = MSG_DONTWAIT,
+	};
+
+	struct virtio_net_hdr_mrg_rxbuf hdr = {
+		.hdr.flags = 0,
+		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+
+	size_t total_len = 0;
+	int err, headcount;
+	size_t vhost_hlen, sock_hlen;
+	size_t vhost_len, sock_len;
+	struct socket *sock = rcu_dereference(vq->private_data);
+	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+		return;
+
+	use_mm(net->dev.mm);
+	mutex_lock(&vq->mutex);
+	vhost_disable_notify(vq);
+	vhost_hlen = vq->vhost_hlen;
+	sock_hlen = vq->sock_hlen;
+
+	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+		vq->log : NULL;
+
+	while ((sock_len = peek_head_len(sock->sk))) {
+		sock_len += sock_hlen;
+		vhost_len = sock_len + vhost_hlen;
+		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+					&in, vq_log, &log);
+		/* On error, stop handling until the next kick. */
+		if (unlikely(headcount < 0))
+			break;
+		/* OK, now we need to know about added descriptors. */
+		if (!headcount) {
+			if (unlikely(vhost_enable_notify(vq))) {
+				/* They have slipped one in as we were
+				 * doing that: check again. */
+				vhost_disable_notify(vq);
+				continue;
+			}
+			/* Nothing new?  Wait for eventfd to tell us
+			 * they refilled. */
+			break;
+		}
+		/* We don't need to be notified again. */
+		if (unlikely((vhost_hlen)))
+			/* Skip header. TODO: support TSO. */
+			move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+		else
+			/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
+			 * needed because sendmsg can modify msg_iov. */
+			copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
+		msg.msg_iovlen = in;
+		err = sock->ops->recvmsg(NULL, sock, &msg,
+					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
+		/* Userspace might have consumed the packet meanwhile:
+		 * it's not supposed to do this usually, but might be hard
+		 * to prevent. Discard data we got (if any) and keep going. */
+		if (unlikely(err != sock_len)) {
+			pr_debug("Discarded rx packet: "
+				 " len %d, expected %zd\n", err, sock_len);
+			vhost_discard_vq_desc(vq, headcount);
+			continue;
+		}
+		if (unlikely(vhost_hlen) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+				      vhost_hlen)) {
+			vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
+			       vq->iov->iov_base);
+			break;
+		}
+		/* TODO: Should check and handle checksum. */
+		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+				      offsetof(typeof(hdr), num_buffers),
+				      sizeof hdr.num_buffers)) {
+			vq_err(vq, "Failed num_buffers write");
+			vhost_discard_vq_desc(vq, headcount);
+			break;
+		}
+		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+					    headcount);
+		if (unlikely(vq_log))
+			vhost_log_write(vq, vq_log, log, vhost_len);
+		total_len += vhost_len;
+		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+
+	mutex_unlock(&vq->mutex);
+	unuse_mm(net->dev.mm);
+}
+
+static void handle_rx(struct vhost_net *net)
+{
+	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
+		handle_rx_mergeable(net);
+	else
+		handle_rx_big(net);
+}
+
+static void handle_tx_kick(struct vhost_work *work)
+{
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_tx(net);
 }
 
-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_rx(net);
 }
 
-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_TX].work);
 	handle_tx(net);
 }
 
-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_RX].work);
 	handle_rx(net);
 }
 
 static int vhost_net_open(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+	struct vhost_dev *dev;
 	int r;
+
 	if (!n)
 		return -ENOMEM;
+
+	dev = &n->dev;
 	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
 		kfree(n);
 		return r;
 	}
 
-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
 
 	f->private_data = n;
@@ -527,13 +737,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 
 	/* start polling new socket */
 	oldsock = vq->private_data;
-	if (sock == oldsock)
-		goto done;
+	if (sock != oldsock) {
+		vhost_net_disable_vq(n, vq);
+		rcu_assign_pointer(vq->private_data, sock);
+		vhost_net_enable_vq(n, vq);
+	}
 
-	vhost_net_disable_vq(n, vq);
-	rcu_assign_pointer(vq->private_data, sock);
-	vhost_net_enable_vq(n, vq);
-done:
 	mutex_unlock(&vq->mutex);
 
 	if (oldsock) {
@@ -574,9 +783,21 @@ done:
 
 static int vhost_net_set_features(struct vhost_net *n, u64 features)
 {
-	size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
-		sizeof(struct virtio_net_hdr) : 0;
+	size_t vhost_hlen, sock_hlen, hdr_len;
 	int i;
+
+	hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
+			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+			sizeof(struct virtio_net_hdr);
+	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
+		/* vhost provides vnet_hdr */
+		vhost_hlen = hdr_len;
+		sock_hlen = 0;
+	} else {
+		/* socket provides vnet_hdr */
+		vhost_hlen = 0;
+		sock_hlen = hdr_len;
+	}
 	mutex_lock(&n->dev.mutex);
 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
 	    !vhost_log_access_ok(&n->dev)) {
@@ -587,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
 	smp_wmb();
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 		mutex_lock(&n->vqs[i].mutex);
-		n->vqs[i].hdr_size = hdr_size;
+		n->vqs[i].vhost_hlen = vhost_hlen;
+		n->vqs[i].sock_hlen = sock_hlen;
 		mutex_unlock(&n->vqs[i].mutex);
 	}
 	vhost_net_flush(n);
@@ -639,7 +861,7 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
 }
 #endif
 
-const static struct file_operations vhost_net_fops = {
+static const struct file_operations vhost_net_fops = {
 	.owner = THIS_MODULE,
 	.release = vhost_net_release,
 	.unlocked_ioctl = vhost_net_ioctl,
@@ -657,25 +879,13 @@ static struct miscdevice vhost_net_misc = {
 
 static int vhost_net_init(void)
 {
-	int r = vhost_init();
-	if (r)
-		goto err_init;
-	r = misc_register(&vhost_net_misc);
-	if (r)
-		goto err_reg;
-	return 0;
-err_reg:
-	vhost_cleanup();
-err_init:
-	return r;
-
+	return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);
 
 static void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
-	vhost_cleanup();
 }
 module_exit(vhost_net_exit);
 
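Note on the mergeable RX path above: get_rx_bufs() keeps popping descriptor chains until the packet (plus headers) fits, trims the last head's len so the used entries sum to the exact byte count, and handle_rx_mergeable() then writes the chain count into the header's num_buffers field. A standalone sketch of just that accounting, in plain userspace C with made-up fill_bufs()/used_elem names (not the kernel API), illustrates the trimming step:

#include <stdio.h>

/* Simplified stand-in for struct vring_used_elem: head id + bytes used. */
struct used_elem {
	unsigned id;
	int len;
};

/* Model of the get_rx_bufs() accounting: keep taking fixed-size guest
 * buffers until 'datalen' bytes are covered, then trim the last head so
 * the used lengths sum to the exact packet size.  'buf_len' stands in
 * for iov_length() over one descriptor chain. */
static int fill_bufs(struct used_elem *heads, int max_heads,
		     int datalen, int buf_len)
{
	int headcount = 0;

	while (datalen > 0) {
		if (headcount >= max_heads)
			return -1;			/* like -ENOBUFS */
		heads[headcount].id = headcount;
		heads[headcount].len = buf_len;
		datalen -= buf_len;			/* may go negative */
		++headcount;
	}
	/* datalen is now <= 0: shrink the last head to the true size. */
	heads[headcount - 1].len += datalen;
	return headcount;
}

int main(void)
{
	struct used_elem heads[8];
	/* A 3800-byte packet into 1500-byte buffers: 3 heads, the last
	 * trimmed to 800 bytes. */
	int n = fill_bufs(heads, 8, 3800, 1500);

	for (int i = 0; i < n; i++)
		printf("head %u: %d bytes\n", heads[i].id, heads[i].len);
	/* 'n' is what handle_rx_mergeable() stores in hdr.num_buffers. */
	return 0;
}

The trim via 'heads[headcount - 1].len += datalen' is why MSG_TRUNC receives can be checked against sock_len exactly: the used lengths always account for the whole packet, never the whole buffer.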
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0b99783083f6..e05557d52999 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -17,12 +17,13 @@
 #include <linux/mm.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/rcupdate.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/cgroup.h>
 
 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -37,8 +38,6 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
-static struct workqueue_struct *vhost_workqueue;
-
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
@@ -52,23 +51,31 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
 			     void *key)
 {
-	struct vhost_poll *poll;
-	poll = container_of(wait, struct vhost_poll, wait);
+	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
 	if (!((unsigned long)key & poll->mask))
 		return 0;
 
-	queue_work(vhost_workqueue, &poll->work);
+	vhost_poll_queue(poll);
 	return 0;
 }
 
 /* Init poll structure */
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask)
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev)
 {
-	INIT_WORK(&poll->work, func);
+	struct vhost_work *work = &poll->work;
+
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
+	poll->dev = dev;
+
+	INIT_LIST_HEAD(&work->node);
+	work->fn = fn;
+	init_waitqueue_head(&work->done);
+	work->flushing = 0;
+	work->queue_seq = work->done_seq = 0;
 }
 
 /* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -92,12 +99,40 @@ void vhost_poll_stop(struct vhost_poll *poll)
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	flush_work(&poll->work);
+	struct vhost_work *work = &poll->work;
+	unsigned seq;
+	int left;
+	int flushing;
+
+	spin_lock_irq(&poll->dev->work_lock);
+	seq = work->queue_seq;
+	work->flushing++;
+	spin_unlock_irq(&poll->dev->work_lock);
+	wait_event(work->done, ({
+		   spin_lock_irq(&poll->dev->work_lock);
+		   left = seq - work->done_seq <= 0;
+		   spin_unlock_irq(&poll->dev->work_lock);
+		   left;
+	}));
+	spin_lock_irq(&poll->dev->work_lock);
+	flushing = --work->flushing;
+	spin_unlock_irq(&poll->dev->work_lock);
+	BUG_ON(flushing < 0);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	queue_work(vhost_workqueue, &poll->work);
+	struct vhost_dev *dev = poll->dev;
+	struct vhost_work *work = &poll->work;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->work_lock, flags);
+	if (list_empty(&work->node)) {
+		list_add_tail(&work->node, &dev->work_list);
+		work->queue_seq++;
+		wake_up_process(dev->worker);
+	}
+	spin_unlock_irqrestore(&dev->work_lock, flags);
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -114,7 +149,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->used_flags = 0;
 	vq->log_used = false;
 	vq->log_addr = -1ull;
-	vq->hdr_size = 0;
+	vq->vhost_hlen = 0;
+	vq->sock_hlen = 0;
 	vq->private_data = NULL;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
@@ -125,10 +161,51 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->log_ctx = NULL;
 }
 
+static int vhost_worker(void *data)
+{
+	struct vhost_dev *dev = data;
+	struct vhost_work *work = NULL;
+	unsigned uninitialized_var(seq);
+
+	for (;;) {
+		/* mb paired w/ kthread_stop */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_lock_irq(&dev->work_lock);
+		if (work) {
+			work->done_seq = seq;
+			if (work->flushing)
+				wake_up_all(&work->done);
+		}
+
+		if (kthread_should_stop()) {
+			spin_unlock_irq(&dev->work_lock);
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+		if (!list_empty(&dev->work_list)) {
+			work = list_first_entry(&dev->work_list,
+						struct vhost_work, node);
+			list_del_init(&work->node);
+			seq = work->queue_seq;
+		} else
+			work = NULL;
+		spin_unlock_irq(&dev->work_lock);
+
+		if (work) {
+			__set_current_state(TASK_RUNNING);
+			work->fn(work);
+		} else
+			schedule();
+
+	}
+}
+
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
 	int i;
+
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
 	mutex_init(&dev->mutex);
@@ -136,6 +213,9 @@ long vhost_dev_init(struct vhost_dev *dev,
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
+	spin_lock_init(&dev->work_lock);
+	INIT_LIST_HEAD(&dev->work_list);
+	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].dev = dev;
@@ -143,9 +223,9 @@ long vhost_dev_init(struct vhost_dev *dev,
 		vhost_vq_reset(dev, dev->vqs + i);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick,
-					POLLIN);
+					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
+
 	return 0;
 }
 
@@ -159,12 +239,36 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 /* Caller should have device mutex */
 static long vhost_dev_set_owner(struct vhost_dev *dev)
 {
+	struct task_struct *worker;
+	int err;
 	/* Is there an owner already? */
-	if (dev->mm)
-		return -EBUSY;
+	if (dev->mm) {
+		err = -EBUSY;
+		goto err_mm;
+	}
 	/* No owner, become one */
 	dev->mm = get_task_mm(current);
+	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
+	if (IS_ERR(worker)) {
+		err = PTR_ERR(worker);
+		goto err_worker;
+	}
+
+	dev->worker = worker;
+	err = cgroup_attach_task_current_cg(worker);
+	if (err)
+		goto err_cgroup;
+	wake_up_process(worker);	/* avoid contributing to loadavg */
+
 	return 0;
+err_cgroup:
+	kthread_stop(worker);
+err_worker:
+	if (dev->mm)
+		mmput(dev->mm);
+	dev->mm = NULL;
+err_mm:
+	return err;
 }
 
 /* Caller should have device mutex */
@@ -217,6 +321,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
+
+	WARN_ON(!list_empty(&dev->work_list));
+	kthread_stop(dev->worker);
 }
 
 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -237,8 +344,8 @@ static int vq_memory_access_ok(void __user *log_base, struct vhost_memory *mem,
 {
 	int i;
 
-        if (!mem)
-                return 0;
+	if (!mem)
+		return 0;
 
 	for (i = 0; i < mem->nregions; ++i) {
 		struct vhost_memory_region *m = mem->regions + i;
@@ -995,9 +1102,9 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 }
 
 /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
 {
-	vq->last_avail_idx--;
+	vq->last_avail_idx -= n;
 }
 
 /* After we've used one of their buffers, we tell them about it. We'll then
@@ -1042,6 +1149,67 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
 	return 0;
 }
 
+static int __vhost_add_used_n(struct vhost_virtqueue *vq,
+			      struct vring_used_elem *heads,
+			      unsigned count)
+{
+	struct vring_used_elem __user *used;
+	int start;
+
+	start = vq->last_used_idx % vq->num;
+	used = vq->used->ring + start;
+	if (copy_to_user(used, heads, count * sizeof *used)) {
+		vq_err(vq, "Failed to write used");
+		return -EFAULT;
+	}
+	if (unlikely(vq->log_used)) {
+		/* Make sure data is seen before log. */
+		smp_wmb();
+		/* Log used ring entry write. */
+		log_write(vq->log_base,
+			  vq->log_addr +
+			   ((void __user *)used - (void __user *)vq->used),
+			  count * sizeof *used);
+	}
+	vq->last_used_idx += count;
+	return 0;
+}
+
+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
+		     unsigned count)
+{
+	int start, n, r;
+
+	start = vq->last_used_idx % vq->num;
+	n = vq->num - start;
+	if (n < count) {
+		r = __vhost_add_used_n(vq, heads, n);
+		if (r < 0)
+			return r;
+		heads += n;
+		count -= n;
+	}
+	r = __vhost_add_used_n(vq, heads, count);
+
+	/* Make sure buffer is written before we update index. */
+	smp_wmb();
+	if (put_user(vq->last_used_idx, &vq->used->idx)) {
+		vq_err(vq, "Failed to increment used idx");
+		return -EFAULT;
+	}
+	if (unlikely(vq->log_used)) {
+		/* Log used index update. */
+		log_write(vq->log_base,
+			  vq->log_addr + offsetof(struct vring_used, idx),
+			  sizeof vq->used->idx);
+		if (vq->log_ctx)
+			eventfd_signal(vq->log_ctx, 1);
+	}
+	return r;
+}
+
 /* This actually signals the guest, using eventfd. */
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
@@ -1076,6 +1244,15 @@ void vhost_add_used_and_signal(struct vhost_dev *dev,
 	vhost_signal(dev, vq);
 }
 
+/* multi-buffer version of vhost_add_used_and_signal */
+void vhost_add_used_and_signal_n(struct vhost_dev *dev,
+				 struct vhost_virtqueue *vq,
+				 struct vring_used_elem *heads, unsigned count)
+{
+	vhost_add_used_n(vq, heads, count);
+	vhost_signal(dev, vq);
+}
+
 /* OK, now we need to know about added descriptors. */
 bool vhost_enable_notify(struct vhost_virtqueue *vq)
 {
@@ -1100,7 +1277,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
 		return false;
 	}
 
-	return avail_idx != vq->last_avail_idx;
+	return avail_idx != vq->avail_idx;
 }
 
 /* We don't need to be notified again. */
@@ -1115,16 +1292,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq)
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
-
-int vhost_init(void)
-{
-	vhost_workqueue = create_singlethread_workqueue("vhost");
-	if (!vhost_workqueue)
-		return -ENOMEM;
-	return 0;
-}
-
-void vhost_cleanup(void)
-{
-	destroy_workqueue(vhost_workqueue);
-}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 11ee13dba0f7..afd77295971c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -5,13 +5,13 @@
 #include <linux/vhost.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/skbuff.h>
 #include <linux/uio.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
+#include <asm/atomic.h>
 
 struct vhost_device;
 
@@ -20,19 +20,31 @@ enum {
 	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
 };
 
+struct vhost_work;
+typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+
+struct vhost_work {
+	struct list_head node;
+	vhost_work_fn_t fn;
+	wait_queue_head_t done;
+	int flushing;
+	unsigned queue_seq;
+	unsigned done_seq;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
 	poll_table table;
 	wait_queue_head_t *wqh;
 	wait_queue_t wait;
-	/* struct which will handle all actual work. */
-	struct work_struct work;
+	struct vhost_work work;
 	unsigned long mask;
+	struct vhost_dev *dev;
 };
 
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask);
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -63,7 +75,7 @@ struct vhost_virtqueue {
 	struct vhost_poll poll;
 
 	/* The routine to call when the Guest pings us, or timeout. */
-	work_func_t handle_kick;
+	vhost_work_fn_t handle_kick;
 
 	/* Last available index we saw. */
 	u16 last_avail_idx;
@@ -84,13 +96,15 @@ struct vhost_virtqueue {
 	struct iovec indirect[VHOST_NET_MAX_SG];
 	struct iovec iov[VHOST_NET_MAX_SG];
 	struct iovec hdr[VHOST_NET_MAX_SG];
-	size_t hdr_size;
+	size_t vhost_hlen;
+	size_t sock_hlen;
+	struct vring_used_elem heads[VHOST_NET_MAX_SG];
 	/* We use a kind of RCU to access private pointer.
-	 * All readers access it from workqueue, which makes it possible to
-	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
+	 * All readers access it from worker, which makes it possible to
+	 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
 	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
-	 * work item execution acts instead of rcu_read_lock() and the end of
-	 * work item execution acts instead of rcu_read_unlock().
+	 * vhost_work execution acts instead of rcu_read_lock() and the end of
+	 * vhost_work execution acts instead of rcu_read_unlock().
 	 * Writers use virtqueue mutex. */
 	void *private_data;
 	/* Log write descriptors */
@@ -110,6 +124,9 @@ struct vhost_dev {
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
+	spinlock_t work_lock;
+	struct list_head work_list;
+	struct task_struct *worker;
 };
 
 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -124,21 +141,22 @@ int vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
 		      struct iovec iov[], unsigned int iov_count,
 		      unsigned int *out_num, unsigned int *in_num,
 		      struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *);
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
+int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
+		     unsigned count);
 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
-			       unsigned int head, int len);
+			       unsigned int id, int len);
+void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
+			       struct vring_used_elem *heads, unsigned count);
+void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
 void vhost_disable_notify(struct vhost_virtqueue *);
 bool vhost_enable_notify(struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
 
-int vhost_init(void);
-void vhost_cleanup(void);
-
 #define vq_err(vq, fmt, ...) do {			\
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);	\
 		if ((vq)->error_ctx)			\
@@ -149,7 +167,8 @@ enum {
 	VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
 			 (1 << VIRTIO_RING_F_INDIRECT_DESC) |
 			 (1 << VHOST_F_LOG_ALL) |
-			 (1 << VHOST_NET_F_VIRTIO_NET_HDR),
+			 (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
+			 (1 << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
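Note on feature negotiation: with VIRTIO_NET_F_MRG_RXBUF now advertised in VHOST_FEATURES, vhost_net_set_features() picks the header length from the mergeable-buffer bit and decides whether vhost or the backend socket supplies the vnet header. A standalone userspace sketch of that derivation (bit values as in the virtio spec and vhost uapi; header sizes hard-coded for illustration) shows the two resulting configurations:

#include <stdint.h>
#include <stdio.h>

/* Feature bit numbers, as in the virtio spec / vhost uapi headers. */
#define VIRTIO_NET_F_MRG_RXBUF		15
#define VHOST_NET_F_VIRTIO_NET_HDR	27

/* Illustrative header sizes: struct virtio_net_hdr is 10 bytes; the
 * mergeable variant adds a 16-bit num_buffers field, giving 12. */
enum { HDR_LEN = 10, HDR_MRG_LEN = 12 };

/* Model of vhost_net_set_features(): who supplies the vnet header,
 * and how long is it? */
static void split_hlen(uint64_t features, size_t *vhost_hlen,
		       size_t *sock_hlen)
{
	size_t hdr_len = (features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ?
			 HDR_MRG_LEN : HDR_LEN;

	if (features & (1ULL << VHOST_NET_F_VIRTIO_NET_HDR)) {
		*vhost_hlen = hdr_len;	/* vhost prepends the header */
		*sock_hlen = 0;
	} else {
		*vhost_hlen = 0;	/* tap/packet socket includes it */
		*sock_hlen = hdr_len;
	}
}

int main(void)
{
	size_t vh, sh;

	split_hlen(1ULL << VIRTIO_NET_F_MRG_RXBUF, &vh, &sh);
	printf("mrg_rxbuf via socket: vhost_hlen=%zu sock_hlen=%zu\n", vh, sh);

	split_hlen((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
		   (1ULL << VHOST_NET_F_VIRTIO_NET_HDR), &vh, &sh);
	printf("mrg_rxbuf via vhost:  vhost_hlen=%zu sock_hlen=%zu\n", vh, sh);
	return 0;
}

These two lengths are exactly what vhost_vq_reset() zeroes and what handle_tx()/handle_rx_mergeable() read as vq->vhost_hlen and vq->sock_hlen in the diffs above.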