about summary refs log tree commit diff stats
path: root/drivers/vhost/net.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-08-04 14:47:58 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-08-04 14:47:58 -0400
commit 6ba74014c1ab0e37af7de6f64b4eccbbae3cb9e7 (patch)
tree 8f3892fc44f1e403675a6d7e88fda5c70e56ee4c /drivers/vhost/net.c
parent 5abd9ccced7a726c817dd6b5b96bc933859138d1 (diff)
parent 3ff1c25927e3af61c6bf0e4ed959504058ae4565 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1443 commits)
  phy/marvell: add 88ec048 support
  igb: Program MDICNFG register prior to PHY init
  e1000e: correct MAC-PHY interconnect register offset for 82579
  hso: Add new product ID
  can: Add driver for esd CAN-USB/2 device
  l2tp: fix export of header file for userspace
  can-raw: Fix skb_orphan_try handling
  Revert "net: remove zap_completion_queue"
  net: cleanup inclusion
  phy/marvell: add 88e1121 interface mode support
  u32: negative offset fix
  net: Fix a typo from "dev" to "ndev"
  igb: Use irq_synchronize per vector when using MSI-X
  ixgbevf: fix null pointer dereference due to filter being set for VLAN 0
  e1000e: Fix irq_synchronize in MSI-X case
  e1000e: register pm_qos request on hardware activation
  ip_fragment: fix subtracting PPPOE_SES_HLEN from mtu twice
  net: Add getsockopt support for TCP thin-streams
  cxgb4: update driver version
  cxgb4: add new PCI IDs
  ...

Manually fix up conflicts in:
 - drivers/net/e1000e/netdev.c: due to pm_qos registration infrastructure changes
 - drivers/net/phy/marvell.c: conflict between adding 88ec048 support and cleaning up the IDs
 - drivers/net/wireless/ipw2x00/ipw2100.c: trivial ipw2100_pm_qos_req conflict (registration change vs marking it static)
Diffstat (limited to 'drivers/vhost/net.c')
-rw-r--r-- drivers/vhost/net.c 306
1 files changed, 258 insertions, 48 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d219070fed3d..29e850a7a2f9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
74 } 74 }
75 return seg; 75 return seg;
76} 76}
77/* Copy iovec entries for len bytes from iovec. */
78static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
79 size_t len, int iovcount)
80{
81 int seg = 0;
82 size_t size;
83 while (len && seg < iovcount) {
84 size = min(from->iov_len, len);
85 to->iov_base = from->iov_base;
86 to->iov_len = size;
87 len -= size;
88 ++from;
89 ++to;
90 ++seg;
91 }
92}
77 93
78/* Caller must have TX VQ lock */ 94/* Caller must have TX VQ lock */
79static void tx_poll_stop(struct vhost_net *net) 95static void tx_poll_stop(struct vhost_net *net)
@@ -129,7 +145,7 @@ static void handle_tx(struct vhost_net *net)
129 145
130 if (wmem < sock->sk->sk_sndbuf / 2) 146 if (wmem < sock->sk->sk_sndbuf / 2)
131 tx_poll_stop(net); 147 tx_poll_stop(net);
132 hdr_size = vq->hdr_size; 148 hdr_size = vq->vhost_hlen;
133 149
134 for (;;) { 150 for (;;) {
135 head = vhost_get_vq_desc(&net->dev, vq, vq->iov, 151 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
@@ -172,7 +188,7 @@ static void handle_tx(struct vhost_net *net)
172 /* TODO: Check specific error and bomb out unless ENOBUFS? */ 188 /* TODO: Check specific error and bomb out unless ENOBUFS? */
173 err = sock->ops->sendmsg(NULL, sock, &msg, len); 189 err = sock->ops->sendmsg(NULL, sock, &msg, len);
174 if (unlikely(err < 0)) { 190 if (unlikely(err < 0)) {
175 vhost_discard_vq_desc(vq); 191 vhost_discard_vq_desc(vq, 1);
176 tx_poll_start(net, sock); 192 tx_poll_start(net, sock);
177 break; 193 break;
178 } 194 }
@@ -191,9 +207,82 @@ static void handle_tx(struct vhost_net *net)
191 unuse_mm(net->dev.mm); 207 unuse_mm(net->dev.mm);
192} 208}
193 209
210static int peek_head_len(struct sock *sk)
211{
212 struct sk_buff *head;
213 int len = 0;
214
215 lock_sock(sk);
216 head = skb_peek(&sk->sk_receive_queue);
217 if (head)
218 len = head->len;
219 release_sock(sk);
220 return len;
221}
222
223/* This is a multi-buffer version of vhost_get_vq_desc, that works if
224 * vq has read descriptors only.
225 * @vq - the relevant virtqueue
226 * @datalen - data length we'll be reading
227 * @iovcount - returned count of io vectors we fill
228 * @log - vhost log
229 * @log_num - log offset
230 * returns number of buffer heads allocated, negative on error
231 */
232static int get_rx_bufs(struct vhost_virtqueue *vq,
233 struct vring_used_elem *heads,
234 int datalen,
235 unsigned *iovcount,
236 struct vhost_log *log,
237 unsigned *log_num)
238{
239 unsigned int out, in;
240 int seg = 0;
241 int headcount = 0;
242 unsigned d;
243 int r, nlogs = 0;
244
245 while (datalen > 0) {
246 if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
247 r = -ENOBUFS;
248 goto err;
249 }
250 d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
251 ARRAY_SIZE(vq->iov) - seg, &out,
252 &in, log, log_num);
253 if (d == vq->num) {
254 r = 0;
255 goto err;
256 }
257 if (unlikely(out || in <= 0)) {
258 vq_err(vq, "unexpected descriptor format for RX: "
259 "out %d, in %d\n", out, in);
260 r = -EINVAL;
261 goto err;
262 }
263 if (unlikely(log)) {
264 nlogs += *log_num;
265 log += *log_num;
266 }
267 heads[headcount].id = d;
268 heads[headcount].len = iov_length(vq->iov + seg, in);
269 datalen -= heads[headcount].len;
270 ++headcount;
271 seg += in;
272 }
273 heads[headcount - 1].len += datalen;
274 *iovcount = seg;
275 if (unlikely(log))
276 *log_num = nlogs;
277 return headcount;
278err:
279 vhost_discard_vq_desc(vq, headcount);
280 return r;
281}
282
194/* Expects to be always run from workqueue - which acts as 283/* Expects to be always run from workqueue - which acts as
195 * read-side critical section for our kind of RCU. */ 284 * read-side critical section for our kind of RCU. */
196static void handle_rx(struct vhost_net *net) 285static void handle_rx_big(struct vhost_net *net)
197{ 286{
198 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; 287 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
199 unsigned out, in, log, s; 288 unsigned out, in, log, s;
@@ -223,7 +312,7 @@ static void handle_rx(struct vhost_net *net)
223 use_mm(net->dev.mm); 312 use_mm(net->dev.mm);
224 mutex_lock(&vq->mutex); 313 mutex_lock(&vq->mutex);
225 vhost_disable_notify(vq); 314 vhost_disable_notify(vq);
226 hdr_size = vq->hdr_size; 315 hdr_size = vq->vhost_hlen;
227 316
228 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? 317 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
229 vq->log : NULL; 318 vq->log : NULL;
@@ -270,14 +359,14 @@ static void handle_rx(struct vhost_net *net)
270 len, MSG_DONTWAIT | MSG_TRUNC); 359 len, MSG_DONTWAIT | MSG_TRUNC);
271 /* TODO: Check specific error and bomb out unless EAGAIN? */ 360 /* TODO: Check specific error and bomb out unless EAGAIN? */
272 if (err < 0) { 361 if (err < 0) {
273 vhost_discard_vq_desc(vq); 362 vhost_discard_vq_desc(vq, 1);
274 break; 363 break;
275 } 364 }
276 /* TODO: Should check and handle checksum. */ 365 /* TODO: Should check and handle checksum. */
277 if (err > len) { 366 if (err > len) {
278 pr_debug("Discarded truncated rx packet: " 367 pr_debug("Discarded truncated rx packet: "
279 " len %d > %zd\n", err, len); 368 " len %d > %zd\n", err, len);
280 vhost_discard_vq_desc(vq); 369 vhost_discard_vq_desc(vq, 1);
281 continue; 370 continue;
282 } 371 }
283 len = err; 372 len = err;
@@ -302,54 +391,175 @@ static void handle_rx(struct vhost_net *net)
302 unuse_mm(net->dev.mm); 391 unuse_mm(net->dev.mm);
303} 392}
304 393
305static void handle_tx_kick(struct work_struct *work) 394/* Expects to be always run from workqueue - which acts as
395 * read-side critical section for our kind of RCU. */
396static void handle_rx_mergeable(struct vhost_net *net)
306{ 397{
307 struct vhost_virtqueue *vq; 398 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
308 struct vhost_net *net; 399 unsigned uninitialized_var(in), log;
309 vq = container_of(work, struct vhost_virtqueue, poll.work); 400 struct vhost_log *vq_log;
310 net = container_of(vq->dev, struct vhost_net, dev); 401 struct msghdr msg = {
402 .msg_name = NULL,
403 .msg_namelen = 0,
404 .msg_control = NULL, /* FIXME: get and handle RX aux data. */
405 .msg_controllen = 0,
406 .msg_iov = vq->iov,
407 .msg_flags = MSG_DONTWAIT,
408 };
409
410 struct virtio_net_hdr_mrg_rxbuf hdr = {
411 .hdr.flags = 0,
412 .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
413 };
414
415 size_t total_len = 0;
416 int err, headcount;
417 size_t vhost_hlen, sock_hlen;
418 size_t vhost_len, sock_len;
419 struct socket *sock = rcu_dereference(vq->private_data);
420 if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
421 return;
422
423 use_mm(net->dev.mm);
424 mutex_lock(&vq->mutex);
425 vhost_disable_notify(vq);
426 vhost_hlen = vq->vhost_hlen;
427 sock_hlen = vq->sock_hlen;
428
429 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
430 vq->log : NULL;
431
432 while ((sock_len = peek_head_len(sock->sk))) {
433 sock_len += sock_hlen;
434 vhost_len = sock_len + vhost_hlen;
435 headcount = get_rx_bufs(vq, vq->heads, vhost_len,
436 &in, vq_log, &log);
437 /* On error, stop handling until the next kick. */
438 if (unlikely(headcount < 0))
439 break;
440 /* OK, now we need to know about added descriptors. */
441 if (!headcount) {
442 if (unlikely(vhost_enable_notify(vq))) {
443 /* They have slipped one in as we were
444 * doing that: check again. */
445 vhost_disable_notify(vq);
446 continue;
447 }
448 /* Nothing new? Wait for eventfd to tell us
449 * they refilled. */
450 break;
451 }
452 /* We don't need to be notified again. */
453 if (unlikely((vhost_hlen)))
454 /* Skip header. TODO: support TSO. */
455 move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
456 else
457 /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
458 * needed because recvmsg can modify msg_iov. */
459 copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
460 msg.msg_iovlen = in;
461 err = sock->ops->recvmsg(NULL, sock, &msg,
462 sock_len, MSG_DONTWAIT | MSG_TRUNC);
463 /* Userspace might have consumed the packet meanwhile:
464 * it's not supposed to do this usually, but might be hard
465 * to prevent. Discard data we got (if any) and keep going. */
466 if (unlikely(err != sock_len)) {
467 pr_debug("Discarded rx packet: "
468 " len %d, expected %zd\n", err, sock_len);
469 vhost_discard_vq_desc(vq, headcount);
470 continue;
471 }
472 if (unlikely(vhost_hlen) &&
473 memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
474 vhost_hlen)) {
475 vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
476 vq->iov->iov_base);
477 break;
478 }
479 /* TODO: Should check and handle checksum. */
480 if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
481 memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
482 offsetof(typeof(hdr), num_buffers),
483 sizeof hdr.num_buffers)) {
484 vq_err(vq, "Failed num_buffers write");
485 vhost_discard_vq_desc(vq, headcount);
486 break;
487 }
488 vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
489 headcount);
490 if (unlikely(vq_log))
491 vhost_log_write(vq, vq_log, log, vhost_len);
492 total_len += vhost_len;
493 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
494 vhost_poll_queue(&vq->poll);
495 break;
496 }
497 }
498
499 mutex_unlock(&vq->mutex);
500 unuse_mm(net->dev.mm);
501}
502
503static void handle_rx(struct vhost_net *net)
504{
505 if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
506 handle_rx_mergeable(net);
507 else
508 handle_rx_big(net);
509}
510
511static void handle_tx_kick(struct vhost_work *work)
512{
513 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
514 poll.work);
515 struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
516
311 handle_tx(net); 517 handle_tx(net);
312} 518}
313 519
314static void handle_rx_kick(struct work_struct *work) 520static void handle_rx_kick(struct vhost_work *work)
315{ 521{
316 struct vhost_virtqueue *vq; 522 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
317 struct vhost_net *net; 523 poll.work);
318 vq = container_of(work, struct vhost_virtqueue, poll.work); 524 struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
319 net = container_of(vq->dev, struct vhost_net, dev); 525
320 handle_rx(net); 526 handle_rx(net);
321} 527}
322 528
323static void handle_tx_net(struct work_struct *work) 529static void handle_tx_net(struct vhost_work *work)
324{ 530{
325 struct vhost_net *net; 531 struct vhost_net *net = container_of(work, struct vhost_net,
326 net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); 532 poll[VHOST_NET_VQ_TX].work);
327 handle_tx(net); 533 handle_tx(net);
328} 534}
329 535
330static void handle_rx_net(struct work_struct *work) 536static void handle_rx_net(struct vhost_work *work)
331{ 537{
332 struct vhost_net *net; 538 struct vhost_net *net = container_of(work, struct vhost_net,
333 net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); 539 poll[VHOST_NET_VQ_RX].work);
334 handle_rx(net); 540 handle_rx(net);
335} 541}
336 542
337static int vhost_net_open(struct inode *inode, struct file *f) 543static int vhost_net_open(struct inode *inode, struct file *f)
338{ 544{
339 struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); 545 struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
546 struct vhost_dev *dev;
340 int r; 547 int r;
548
341 if (!n) 549 if (!n)
342 return -ENOMEM; 550 return -ENOMEM;
551
552 dev = &n->dev;
343 n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; 553 n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
344 n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; 554 n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
345 r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); 555 r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
346 if (r < 0) { 556 if (r < 0) {
347 kfree(n); 557 kfree(n);
348 return r; 558 return r;
349 } 559 }
350 560
351 vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); 561 vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
352 vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); 562 vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
353 n->tx_poll_state = VHOST_NET_POLL_DISABLED; 563 n->tx_poll_state = VHOST_NET_POLL_DISABLED;
354 564
355 f->private_data = n; 565 f->private_data = n;
@@ -527,13 +737,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
527 737
528 /* start polling new socket */ 738 /* start polling new socket */
529 oldsock = vq->private_data; 739 oldsock = vq->private_data;
530 if (sock == oldsock) 740 if (sock != oldsock) {
531 goto done; 741 vhost_net_disable_vq(n, vq);
742 rcu_assign_pointer(vq->private_data, sock);
743 vhost_net_enable_vq(n, vq);
744 }
532 745
533 vhost_net_disable_vq(n, vq);
534 rcu_assign_pointer(vq->private_data, sock);
535 vhost_net_enable_vq(n, vq);
536done:
537 mutex_unlock(&vq->mutex); 746 mutex_unlock(&vq->mutex);
538 747
539 if (oldsock) { 748 if (oldsock) {
@@ -574,9 +783,21 @@ done:
574 783
575static int vhost_net_set_features(struct vhost_net *n, u64 features) 784static int vhost_net_set_features(struct vhost_net *n, u64 features)
576{ 785{
577 size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? 786 size_t vhost_hlen, sock_hlen, hdr_len;
578 sizeof(struct virtio_net_hdr) : 0;
579 int i; 787 int i;
788
789 hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
790 sizeof(struct virtio_net_hdr_mrg_rxbuf) :
791 sizeof(struct virtio_net_hdr);
792 if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
793 /* vhost provides vnet_hdr */
794 vhost_hlen = hdr_len;
795 sock_hlen = 0;
796 } else {
797 /* socket provides vnet_hdr */
798 vhost_hlen = 0;
799 sock_hlen = hdr_len;
800 }
580 mutex_lock(&n->dev.mutex); 801 mutex_lock(&n->dev.mutex);
581 if ((features & (1 << VHOST_F_LOG_ALL)) && 802 if ((features & (1 << VHOST_F_LOG_ALL)) &&
582 !vhost_log_access_ok(&n->dev)) { 803 !vhost_log_access_ok(&n->dev)) {
@@ -587,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
587 smp_wmb(); 808 smp_wmb();
588 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { 809 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
589 mutex_lock(&n->vqs[i].mutex); 810 mutex_lock(&n->vqs[i].mutex);
590 n->vqs[i].hdr_size = hdr_size; 811 n->vqs[i].vhost_hlen = vhost_hlen;
812 n->vqs[i].sock_hlen = sock_hlen;
591 mutex_unlock(&n->vqs[i].mutex); 813 mutex_unlock(&n->vqs[i].mutex);
592 } 814 }
593 vhost_net_flush(n); 815 vhost_net_flush(n);
@@ -639,7 +861,7 @@ static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl,
639} 861}
640#endif 862#endif
641 863
642const static struct file_operations vhost_net_fops = { 864static const struct file_operations vhost_net_fops = {
643 .owner = THIS_MODULE, 865 .owner = THIS_MODULE,
644 .release = vhost_net_release, 866 .release = vhost_net_release,
645 .unlocked_ioctl = vhost_net_ioctl, 867 .unlocked_ioctl = vhost_net_ioctl,
@@ -657,25 +879,13 @@ static struct miscdevice vhost_net_misc = {
657 879
658static int vhost_net_init(void) 880static int vhost_net_init(void)
659{ 881{
660 int r = vhost_init(); 882 return misc_register(&vhost_net_misc);
661 if (r)
662 goto err_init;
663 r = misc_register(&vhost_net_misc);
664 if (r)
665 goto err_reg;
666 return 0;
667err_reg:
668 vhost_cleanup();
669err_init:
670 return r;
671
672} 883}
673module_init(vhost_net_init); 884module_init(vhost_net_init);
674 885
675static void vhost_net_exit(void) 886static void vhost_net_exit(void)
676{ 887{
677 misc_deregister(&vhost_net_misc); 888 misc_deregister(&vhost_net_misc);
678 vhost_cleanup();
679} 889}
680module_exit(vhost_net_exit); 890module_exit(vhost_net_exit);
681 891