aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/vhost/net.c
diff options
context:
space:
mode:
authorDavid Stevens <dlstevens@us.ibm.com>2010-07-27 11:52:21 -0400
committerMichael S. Tsirkin <mst@redhat.com>2010-07-28 08:45:36 -0400
commit8dd014adfea6f173c1ef6378f7e5e7924866c923 (patch)
tree303df47f73c53cc4c919e3f8aca8245d50d76a08 /drivers/vhost/net.c
parent9e3d195720d1c8b1e09013185ab8c3b749180fc2 (diff)
vhost-net: mergeable buffers support
This adds support for mergeable buffers in vhost-net: this is needed for older guests without indirect buffer support, as well as for zero copy with some devices. Includes changes by Michael S. Tsirkin to make the patch as low risk as possible (i.e., close to no changes when feature is disabled). Signed-off-by: David Stevens <dlstevens@us.ibm.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Diffstat (limited to 'drivers/vhost/net.c')
-rw-r--r--drivers/vhost/net.c237
1 files changed, 228 insertions, 9 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d395b59289ae..f13e56babe4b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,6 +74,22 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
74 } 74 }
75 return seg; 75 return seg;
76} 76}
77/* Copy iovec entries for len bytes from iovec. */
78static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
79 size_t len, int iovcount)
80{
81 int seg = 0;
82 size_t size;
83 while (len && seg < iovcount) {
84 size = min(from->iov_len, len);
85 to->iov_base = from->iov_base;
86 to->iov_len = size;
87 len -= size;
88 ++from;
89 ++to;
90 ++seg;
91 }
92}
77 93
78/* Caller must have TX VQ lock */ 94/* Caller must have TX VQ lock */
79static void tx_poll_stop(struct vhost_net *net) 95static void tx_poll_stop(struct vhost_net *net)
@@ -129,7 +145,7 @@ static void handle_tx(struct vhost_net *net)
129 145
130 if (wmem < sock->sk->sk_sndbuf / 2) 146 if (wmem < sock->sk->sk_sndbuf / 2)
131 tx_poll_stop(net); 147 tx_poll_stop(net);
132 hdr_size = vq->hdr_size; 148 hdr_size = vq->vhost_hlen;
133 149
134 for (;;) { 150 for (;;) {
135 head = vhost_get_vq_desc(&net->dev, vq, vq->iov, 151 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
@@ -172,7 +188,7 @@ static void handle_tx(struct vhost_net *net)
172 /* TODO: Check specific error and bomb out unless ENOBUFS? */ 188 /* TODO: Check specific error and bomb out unless ENOBUFS? */
173 err = sock->ops->sendmsg(NULL, sock, &msg, len); 189 err = sock->ops->sendmsg(NULL, sock, &msg, len);
174 if (unlikely(err < 0)) { 190 if (unlikely(err < 0)) {
175 vhost_discard_vq_desc(vq); 191 vhost_discard_vq_desc(vq, 1);
176 tx_poll_start(net, sock); 192 tx_poll_start(net, sock);
177 break; 193 break;
178 } 194 }
@@ -191,9 +207,82 @@ static void handle_tx(struct vhost_net *net)
191 unuse_mm(net->dev.mm); 207 unuse_mm(net->dev.mm);
192} 208}
193 209
210static int peek_head_len(struct sock *sk)
211{
212 struct sk_buff *head;
213 int len = 0;
214
215 lock_sock(sk);
216 head = skb_peek(&sk->sk_receive_queue);
217 if (head)
218 len = head->len;
219 release_sock(sk);
220 return len;
221}
222
223/* This is a multi-buffer version of vhost_get_desc, that works if
224 * vq has read descriptors only.
225 * @vq - the relevant virtqueue
226 * @datalen - data length we'll be reading
227 * @iovcount - returned count of io vectors we fill
228 * @log - vhost log
229 * @log_num - log offset
230 * returns number of buffer heads allocated, negative on error
231 */
232static int get_rx_bufs(struct vhost_virtqueue *vq,
233 struct vring_used_elem *heads,
234 int datalen,
235 unsigned *iovcount,
236 struct vhost_log *log,
237 unsigned *log_num)
238{
239 unsigned int out, in;
240 int seg = 0;
241 int headcount = 0;
242 unsigned d;
243 int r, nlogs = 0;
244
245 while (datalen > 0) {
246 if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
247 r = -ENOBUFS;
248 goto err;
249 }
250 d = vhost_get_vq_desc(vq->dev, vq, vq->iov + seg,
251 ARRAY_SIZE(vq->iov) - seg, &out,
252 &in, log, log_num);
253 if (d == vq->num) {
254 r = 0;
255 goto err;
256 }
257 if (unlikely(out || in <= 0)) {
258 vq_err(vq, "unexpected descriptor format for RX: "
259 "out %d, in %d\n", out, in);
260 r = -EINVAL;
261 goto err;
262 }
263 if (unlikely(log)) {
264 nlogs += *log_num;
265 log += *log_num;
266 }
267 heads[headcount].id = d;
268 heads[headcount].len = iov_length(vq->iov + seg, in);
269 datalen -= heads[headcount].len;
270 ++headcount;
271 seg += in;
272 }
273 heads[headcount - 1].len += datalen;
274 *iovcount = seg;
275 if (unlikely(log))
276 *log_num = nlogs;
277 return headcount;
278err:
279 vhost_discard_vq_desc(vq, headcount);
280 return r;
281}
282
194/* Expects to be always run from workqueue - which acts as 283/* Expects to be always run from workqueue - which acts as
195 * read-size critical section for our kind of RCU. */ 284 * read-size critical section for our kind of RCU. */
196static void handle_rx(struct vhost_net *net) 285static void handle_rx_big(struct vhost_net *net)
197{ 286{
198 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; 287 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
199 unsigned out, in, log, s; 288 unsigned out, in, log, s;
@@ -223,7 +312,7 @@ static void handle_rx(struct vhost_net *net)
223 use_mm(net->dev.mm); 312 use_mm(net->dev.mm);
224 mutex_lock(&vq->mutex); 313 mutex_lock(&vq->mutex);
225 vhost_disable_notify(vq); 314 vhost_disable_notify(vq);
226 hdr_size = vq->hdr_size; 315 hdr_size = vq->vhost_hlen;
227 316
228 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ? 317 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
229 vq->log : NULL; 318 vq->log : NULL;
@@ -270,14 +359,14 @@ static void handle_rx(struct vhost_net *net)
270 len, MSG_DONTWAIT | MSG_TRUNC); 359 len, MSG_DONTWAIT | MSG_TRUNC);
271 /* TODO: Check specific error and bomb out unless EAGAIN? */ 360 /* TODO: Check specific error and bomb out unless EAGAIN? */
272 if (err < 0) { 361 if (err < 0) {
273 vhost_discard_vq_desc(vq); 362 vhost_discard_vq_desc(vq, 1);
274 break; 363 break;
275 } 364 }
276 /* TODO: Should check and handle checksum. */ 365 /* TODO: Should check and handle checksum. */
277 if (err > len) { 366 if (err > len) {
278 pr_debug("Discarded truncated rx packet: " 367 pr_debug("Discarded truncated rx packet: "
279 " len %d > %zd\n", err, len); 368 " len %d > %zd\n", err, len);
280 vhost_discard_vq_desc(vq); 369 vhost_discard_vq_desc(vq, 1);
281 continue; 370 continue;
282 } 371 }
283 len = err; 372 len = err;
@@ -302,6 +391,123 @@ static void handle_rx(struct vhost_net *net)
302 unuse_mm(net->dev.mm); 391 unuse_mm(net->dev.mm);
303} 392}
304 393
394/* Expects to be always run from workqueue - which acts as
395 * read-size critical section for our kind of RCU. */
396static void handle_rx_mergeable(struct vhost_net *net)
397{
398 struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
399 unsigned uninitialized_var(in), log;
400 struct vhost_log *vq_log;
401 struct msghdr msg = {
402 .msg_name = NULL,
403 .msg_namelen = 0,
404 .msg_control = NULL, /* FIXME: get and handle RX aux data. */
405 .msg_controllen = 0,
406 .msg_iov = vq->iov,
407 .msg_flags = MSG_DONTWAIT,
408 };
409
410 struct virtio_net_hdr_mrg_rxbuf hdr = {
411 .hdr.flags = 0,
412 .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
413 };
414
415 size_t total_len = 0;
416 int err, headcount;
417 size_t vhost_hlen, sock_hlen;
418 size_t vhost_len, sock_len;
419 struct socket *sock = rcu_dereference(vq->private_data);
420 if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
421 return;
422
423 use_mm(net->dev.mm);
424 mutex_lock(&vq->mutex);
425 vhost_disable_notify(vq);
426 vhost_hlen = vq->vhost_hlen;
427 sock_hlen = vq->sock_hlen;
428
429 vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
430 vq->log : NULL;
431
432 while ((sock_len = peek_head_len(sock->sk))) {
433 sock_len += sock_hlen;
434 vhost_len = sock_len + vhost_hlen;
435 headcount = get_rx_bufs(vq, vq->heads, vhost_len,
436 &in, vq_log, &log);
437 /* On error, stop handling until the next kick. */
438 if (unlikely(headcount < 0))
439 break;
440 /* OK, now we need to know about added descriptors. */
441 if (!headcount) {
442 if (unlikely(vhost_enable_notify(vq))) {
443 /* They have slipped one in as we were
444 * doing that: check again. */
445 vhost_disable_notify(vq);
446 continue;
447 }
448 /* Nothing new? Wait for eventfd to tell us
449 * they refilled. */
450 break;
451 }
452 /* We don't need to be notified again. */
453 if (unlikely((vhost_hlen)))
454 /* Skip header. TODO: support TSO. */
455 move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
456 else
457 /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
458 * needed because sendmsg can modify msg_iov. */
459 copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
460 msg.msg_iovlen = in;
461 err = sock->ops->recvmsg(NULL, sock, &msg,
462 sock_len, MSG_DONTWAIT | MSG_TRUNC);
463 /* Userspace might have consumed the packet meanwhile:
464 * it's not supposed to do this usually, but might be hard
465 * to prevent. Discard data we got (if any) and keep going. */
466 if (unlikely(err != sock_len)) {
467 pr_debug("Discarded rx packet: "
468 " len %d, expected %zd\n", err, sock_len);
469 vhost_discard_vq_desc(vq, headcount);
470 continue;
471 }
472 if (unlikely(vhost_hlen) &&
473 memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
474 vhost_hlen)) {
475 vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
476 vq->iov->iov_base);
477 break;
478 }
479 /* TODO: Should check and handle checksum. */
480 if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
481 memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
482 offsetof(typeof(hdr), num_buffers),
483 sizeof hdr.num_buffers)) {
484 vq_err(vq, "Failed num_buffers write");
485 vhost_discard_vq_desc(vq, headcount);
486 break;
487 }
488 vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
489 headcount);
490 if (unlikely(vq_log))
491 vhost_log_write(vq, vq_log, log, vhost_len);
492 total_len += vhost_len;
493 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
494 vhost_poll_queue(&vq->poll);
495 break;
496 }
497 }
498
499 mutex_unlock(&vq->mutex);
500 unuse_mm(net->dev.mm);
501}
502
503static void handle_rx(struct vhost_net *net)
504{
505 if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
506 handle_rx_mergeable(net);
507 else
508 handle_rx_big(net);
509}
510
305static void handle_tx_kick(struct vhost_work *work) 511static void handle_tx_kick(struct vhost_work *work)
306{ 512{
307 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, 513 struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
@@ -577,9 +783,21 @@ done:
577 783
578static int vhost_net_set_features(struct vhost_net *n, u64 features) 784static int vhost_net_set_features(struct vhost_net *n, u64 features)
579{ 785{
580 size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ? 786 size_t vhost_hlen, sock_hlen, hdr_len;
581 sizeof(struct virtio_net_hdr) : 0;
582 int i; 787 int i;
788
789 hdr_len = (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ?
790 sizeof(struct virtio_net_hdr_mrg_rxbuf) :
791 sizeof(struct virtio_net_hdr);
792 if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
793 /* vhost provides vnet_hdr */
794 vhost_hlen = hdr_len;
795 sock_hlen = 0;
796 } else {
797 /* socket provides vnet_hdr */
798 vhost_hlen = 0;
799 sock_hlen = hdr_len;
800 }
583 mutex_lock(&n->dev.mutex); 801 mutex_lock(&n->dev.mutex);
584 if ((features & (1 << VHOST_F_LOG_ALL)) && 802 if ((features & (1 << VHOST_F_LOG_ALL)) &&
585 !vhost_log_access_ok(&n->dev)) { 803 !vhost_log_access_ok(&n->dev)) {
@@ -590,7 +808,8 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
590 smp_wmb(); 808 smp_wmb();
591 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { 809 for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
592 mutex_lock(&n->vqs[i].mutex); 810 mutex_lock(&n->vqs[i].mutex);
593 n->vqs[i].hdr_size = hdr_size; 811 n->vqs[i].vhost_hlen = vhost_hlen;
812 n->vqs[i].sock_hlen = sock_hlen;
594 mutex_unlock(&n->vqs[i].mutex); 813 mutex_unlock(&n->vqs[i].mutex);
595 } 814 }
596 vhost_net_flush(n); 815 vhost_net_flush(n);