about summary refs log tree commit diff stats
path: root/drivers/vhost/net.c
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@redhat.com>2011-07-17 23:48:46 -0400
committerDavid S. Miller <davem@davemloft.net>2011-07-18 13:42:32 -0400
commitbab632d69ee48a106e779b60cc01adfe80a72807 (patch)
tree56b8bd3df85cfee8e425abe18963e5aad015e2fa /drivers/vhost/net.c
parent5c74501f76360ce6f410730b9b5e5976f38e8504 (diff)
vhost: vhost TX zero-copy support
From: Shirley Ma <mashirle@us.ibm.com> This adds experimental zero copy support in vhost-net, disabled by default. To enable, set the experimental_zcopytx module option to 1. This patch maintains the outstanding userspace buffers in the sequence they are delivered to vhost. The outstanding userspace buffers will be marked as done once the lower device's buffer DMA has finished. This is monitored through the last-reference kfree_skb callback. Two buffer indices are used for this purpose. The vhost-net device passes the userspace buffer info to the lower device skb through message control. DMA done status check and guest notification are handled by handle_tx: in the worst case, all buffers in the vq are in pending/done status, so we need to notify the guest to release DMA done buffers first before we get any new buffers from the vq. One known problem is that if the guest stops submitting buffers, buffers might never get used until some further action, e.g. device reset. This does not seem to affect linux guests. Signed-off-by: Shirley <xma@us.ibm.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/vhost/net.c')
-rw-r--r--drivers/vhost/net.c77
1 files changed, 76 insertions, 1 deletions
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e224a92baa1..f0fd52cdfad 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -12,6 +12,7 @@
12#include <linux/virtio_net.h> 12#include <linux/virtio_net.h>
13#include <linux/miscdevice.h> 13#include <linux/miscdevice.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/moduleparam.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
16#include <linux/workqueue.h> 17#include <linux/workqueue.h>
17#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
@@ -28,10 +29,18 @@
28 29
29#include "vhost.h" 30#include "vhost.h"
30 31
32static int experimental_zcopytx;
33module_param(experimental_zcopytx, int, 0444);
34MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
35
31/* Max number of bytes transferred before requeueing the job. 36/* Max number of bytes transferred before requeueing the job.
32 * Using this limit prevents one virtqueue from starving others. */ 37 * Using this limit prevents one virtqueue from starving others. */
33#define VHOST_NET_WEIGHT 0x80000 38#define VHOST_NET_WEIGHT 0x80000
34 39
40/* MAX number of TX used buffers for outstanding zerocopy */
41#define VHOST_MAX_PEND 128
42#define VHOST_GOODCOPY_LEN 256
43
35enum { 44enum {
36 VHOST_NET_VQ_RX = 0, 45 VHOST_NET_VQ_RX = 0,
37 VHOST_NET_VQ_TX = 1, 46 VHOST_NET_VQ_TX = 1,
@@ -54,6 +63,12 @@ struct vhost_net {
54 enum vhost_net_poll_state tx_poll_state; 63 enum vhost_net_poll_state tx_poll_state;
55}; 64};
56 65
66static bool vhost_sock_zcopy(struct socket *sock)
67{
68 return unlikely(experimental_zcopytx) &&
69 sock_flag(sock->sk, SOCK_ZEROCOPY);
70}
71
57/* Pop first len bytes from iovec. Return number of segments used. */ 72/* Pop first len bytes from iovec. Return number of segments used. */
58static int move_iovec_hdr(struct iovec *from, struct iovec *to, 73static int move_iovec_hdr(struct iovec *from, struct iovec *to,
59 size_t len, int iov_count) 74 size_t len, int iov_count)
@@ -129,6 +144,8 @@ static void handle_tx(struct vhost_net *net)
129 int err, wmem; 144 int err, wmem;
130 size_t hdr_size; 145 size_t hdr_size;
131 struct socket *sock; 146 struct socket *sock;
147 struct vhost_ubuf_ref *uninitialized_var(ubufs);
148 bool zcopy;
132 149
133 /* TODO: check that we are running from vhost_worker? */ 150 /* TODO: check that we are running from vhost_worker? */
134 sock = rcu_dereference_check(vq->private_data, 1); 151 sock = rcu_dereference_check(vq->private_data, 1);
@@ -149,8 +166,13 @@ static void handle_tx(struct vhost_net *net)
149 if (wmem < sock->sk->sk_sndbuf / 2) 166 if (wmem < sock->sk->sk_sndbuf / 2)
150 tx_poll_stop(net); 167 tx_poll_stop(net);
151 hdr_size = vq->vhost_hlen; 168 hdr_size = vq->vhost_hlen;
169 zcopy = vhost_sock_zcopy(sock);
152 170
153 for (;;) { 171 for (;;) {
172 /* Release DMAs done buffers first */
173 if (zcopy)
174 vhost_zerocopy_signal_used(vq);
175
154 head = vhost_get_vq_desc(&net->dev, vq, vq->iov, 176 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
155 ARRAY_SIZE(vq->iov), 177 ARRAY_SIZE(vq->iov),
156 &out, &in, 178 &out, &in,
@@ -166,6 +188,13 @@ static void handle_tx(struct vhost_net *net)
166 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); 188 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
167 break; 189 break;
168 } 190 }
191 /* If more outstanding DMAs, queue the work */
192 if (unlikely(vq->upend_idx - vq->done_idx >
193 VHOST_MAX_PEND)) {
194 tx_poll_start(net, sock);
195 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
196 break;
197 }
169 if (unlikely(vhost_enable_notify(&net->dev, vq))) { 198 if (unlikely(vhost_enable_notify(&net->dev, vq))) {
170 vhost_disable_notify(&net->dev, vq); 199 vhost_disable_notify(&net->dev, vq);
171 continue; 200 continue;
@@ -188,9 +217,39 @@ static void handle_tx(struct vhost_net *net)
188 iov_length(vq->hdr, s), hdr_size); 217 iov_length(vq->hdr, s), hdr_size);
189 break; 218 break;
190 } 219 }
220 /* use msg_control to pass vhost zerocopy ubuf info to skb */
221 if (zcopy) {
222 vq->heads[vq->upend_idx].id = head;
223 if (len < VHOST_GOODCOPY_LEN) {
224 /* copy don't need to wait for DMA done */
225 vq->heads[vq->upend_idx].len =
226 VHOST_DMA_DONE_LEN;
227 msg.msg_control = NULL;
228 msg.msg_controllen = 0;
229 ubufs = NULL;
230 } else {
231 struct ubuf_info *ubuf = &vq->ubuf_info[head];
232
233 vq->heads[vq->upend_idx].len = len;
234 ubuf->callback = vhost_zerocopy_callback;
235 ubuf->arg = vq->ubufs;
236 ubuf->desc = vq->upend_idx;
237 msg.msg_control = ubuf;
238 msg.msg_controllen = sizeof(ubuf);
239 ubufs = vq->ubufs;
240 kref_get(&ubufs->kref);
241 }
242 vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
243 }
191 /* TODO: Check specific error and bomb out unless ENOBUFS? */ 244 /* TODO: Check specific error and bomb out unless ENOBUFS? */
192 err = sock->ops->sendmsg(NULL, sock, &msg, len); 245 err = sock->ops->sendmsg(NULL, sock, &msg, len);
193 if (unlikely(err < 0)) { 246 if (unlikely(err < 0)) {
247 if (zcopy) {
248 if (ubufs)
249 vhost_ubuf_put(ubufs);
250 vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
251 UIO_MAXIOV;
252 }
194 vhost_discard_vq_desc(vq, 1); 253 vhost_discard_vq_desc(vq, 1);
195 tx_poll_start(net, sock); 254 tx_poll_start(net, sock);
196 break; 255 break;
@@ -198,7 +257,8 @@ static void handle_tx(struct vhost_net *net)
198 if (err != len) 257 if (err != len)
199 pr_debug("Truncated TX packet: " 258 pr_debug("Truncated TX packet: "
200 " len %d != %zd\n", err, len); 259 " len %d != %zd\n", err, len);
201 vhost_add_used_and_signal(&net->dev, vq, head, 0); 260 if (!zcopy)
261 vhost_add_used_and_signal(&net->dev, vq, head, 0);
202 total_len += len; 262 total_len += len;
203 if (unlikely(total_len >= VHOST_NET_WEIGHT)) { 263 if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
204 vhost_poll_queue(&vq->poll); 264 vhost_poll_queue(&vq->poll);
@@ -603,6 +663,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
603{ 663{
604 struct socket *sock, *oldsock; 664 struct socket *sock, *oldsock;
605 struct vhost_virtqueue *vq; 665 struct vhost_virtqueue *vq;
666 struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
606 int r; 667 int r;
607 668
608 mutex_lock(&n->dev.mutex); 669 mutex_lock(&n->dev.mutex);
@@ -632,6 +693,13 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
632 oldsock = rcu_dereference_protected(vq->private_data, 693 oldsock = rcu_dereference_protected(vq->private_data,
633 lockdep_is_held(&vq->mutex)); 694 lockdep_is_held(&vq->mutex));
634 if (sock != oldsock) { 695 if (sock != oldsock) {
696 ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock));
697 if (IS_ERR(ubufs)) {
698 r = PTR_ERR(ubufs);
699 goto err_ubufs;
700 }
701 oldubufs = vq->ubufs;
702 vq->ubufs = ubufs;
635 vhost_net_disable_vq(n, vq); 703 vhost_net_disable_vq(n, vq);
636 rcu_assign_pointer(vq->private_data, sock); 704 rcu_assign_pointer(vq->private_data, sock);
637 vhost_net_enable_vq(n, vq); 705 vhost_net_enable_vq(n, vq);
@@ -639,6 +707,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
639 707
640 mutex_unlock(&vq->mutex); 708 mutex_unlock(&vq->mutex);
641 709
710 if (oldubufs)
711 vhost_ubuf_put_and_wait(oldubufs);
712
642 if (oldsock) { 713 if (oldsock) {
643 vhost_net_flush_vq(n, index); 714 vhost_net_flush_vq(n, index);
644 fput(oldsock->file); 715 fput(oldsock->file);
@@ -647,6 +718,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
647 mutex_unlock(&n->dev.mutex); 718 mutex_unlock(&n->dev.mutex);
648 return 0; 719 return 0;
649 720
721err_ubufs:
722 fput(sock->file);
650err_vq: 723err_vq:
651 mutex_unlock(&vq->mutex); 724 mutex_unlock(&vq->mutex);
652err: 725err:
@@ -776,6 +849,8 @@ static struct miscdevice vhost_net_misc = {
776 849
777static int vhost_net_init(void) 850static int vhost_net_init(void)
778{ 851{
852 if (experimental_zcopytx)
853 vhost_enable_zcopy(VHOST_NET_VQ_TX);
779 return misc_register(&vhost_net_misc); 854 return misc_register(&vhost_net_misc);
780} 855}
781module_init(vhost_net_init); 856module_init(vhost_net_init);