-rw-r--r--  drivers/vhost/net.c    |  77
-rw-r--r--  drivers/vhost/vhost.c  | 128
-rw-r--r--  drivers/vhost/vhost.h  |  31
3 files changed, 220 insertions(+), 16 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e224a92baa16..f0fd52cdfadc 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -12,6 +12,7 @@
 #include <linux/virtio_net.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 #include <linux/rcupdate.h>
@@ -28,10 +29,18 @@
 
 #include "vhost.h"
 
+static int experimental_zcopytx;
+module_param(experimental_zcopytx, int, 0444);
+MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
+
 /* Max number of bytes transferred before requeueing the job.
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
 
+/* MAX number of TX used buffers for outstanding zerocopy */
+#define VHOST_MAX_PEND 128
+#define VHOST_GOODCOPY_LEN 256
+
 enum {
     VHOST_NET_VQ_RX = 0,
     VHOST_NET_VQ_TX = 1,
@@ -54,6 +63,12 @@ struct vhost_net {
     enum vhost_net_poll_state tx_poll_state;
 };
 
+static bool vhost_sock_zcopy(struct socket *sock)
+{
+    return unlikely(experimental_zcopytx) &&
+        sock_flag(sock->sk, SOCK_ZEROCOPY);
+}
+
 /* Pop first len bytes from iovec. Return number of segments used. */
 static int move_iovec_hdr(struct iovec *from, struct iovec *to,
               size_t len, int iov_count)
@@ -129,6 +144,8 @@ static void handle_tx(struct vhost_net *net)
     int err, wmem;
     size_t hdr_size;
     struct socket *sock;
+    struct vhost_ubuf_ref *uninitialized_var(ubufs);
+    bool zcopy;
 
     /* TODO: check that we are running from vhost_worker? */
     sock = rcu_dereference_check(vq->private_data, 1);
@@ -149,8 +166,13 @@ static void handle_tx(struct vhost_net *net)
     if (wmem < sock->sk->sk_sndbuf / 2)
         tx_poll_stop(net);
     hdr_size = vq->vhost_hlen;
+    zcopy = vhost_sock_zcopy(sock);
 
     for (;;) {
+        /* Release DMAs done buffers first */
+        if (zcopy)
+            vhost_zerocopy_signal_used(vq);
+
         head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
                      ARRAY_SIZE(vq->iov),
                      &out, &in,
@@ -166,6 +188,13 @@ static void handle_tx(struct vhost_net *net)
             set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
             break;
         }
+        /* If more outstanding DMAs, queue the work */
+        if (unlikely(vq->upend_idx - vq->done_idx >
+                 VHOST_MAX_PEND)) {
+            tx_poll_start(net, sock);
+            set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
+            break;
+        }
         if (unlikely(vhost_enable_notify(&net->dev, vq))) {
             vhost_disable_notify(&net->dev, vq);
             continue;
@@ -188,9 +217,39 @@ static void handle_tx(struct vhost_net *net)
                    iov_length(vq->hdr, s), hdr_size);
             break;
         }
+        /* use msg_control to pass vhost zerocopy ubuf info to skb */
+        if (zcopy) {
+            vq->heads[vq->upend_idx].id = head;
+            if (len < VHOST_GOODCOPY_LEN) {
+                /* copy don't need to wait for DMA done */
+                vq->heads[vq->upend_idx].len =
+                    VHOST_DMA_DONE_LEN;
+                msg.msg_control = NULL;
+                msg.msg_controllen = 0;
+                ubufs = NULL;
+            } else {
+                struct ubuf_info *ubuf = &vq->ubuf_info[head];
+
+                vq->heads[vq->upend_idx].len = len;
+                ubuf->callback = vhost_zerocopy_callback;
+                ubuf->arg = vq->ubufs;
+                ubuf->desc = vq->upend_idx;
+                msg.msg_control = ubuf;
+                msg.msg_controllen = sizeof(ubuf);
+                ubufs = vq->ubufs;
+                kref_get(&ubufs->kref);
+            }
+            vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+        }
         /* TODO: Check specific error and bomb out unless ENOBUFS? */
         err = sock->ops->sendmsg(NULL, sock, &msg, len);
         if (unlikely(err < 0)) {
+            if (zcopy) {
+                if (ubufs)
+                    vhost_ubuf_put(ubufs);
+                vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
+                    UIO_MAXIOV;
+            }
             vhost_discard_vq_desc(vq, 1);
             tx_poll_start(net, sock);
             break;
@@ -198,7 +257,8 @@ static void handle_tx(struct vhost_net *net)
         if (err != len)
             pr_debug("Truncated TX packet: "
                  " len %d != %zd\n", err, len);
-        vhost_add_used_and_signal(&net->dev, vq, head, 0);
+        if (!zcopy)
+            vhost_add_used_and_signal(&net->dev, vq, head, 0);
         total_len += len;
         if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
             vhost_poll_queue(&vq->poll);
@@ -603,6 +663,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
     struct socket *sock, *oldsock;
     struct vhost_virtqueue *vq;
+    struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
     int r;
 
     mutex_lock(&n->dev.mutex);
@@ -632,6 +693,13 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
     oldsock = rcu_dereference_protected(vq->private_data,
                         lockdep_is_held(&vq->mutex));
     if (sock != oldsock) {
+        ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock));
+        if (IS_ERR(ubufs)) {
+            r = PTR_ERR(ubufs);
+            goto err_ubufs;
+        }
+        oldubufs = vq->ubufs;
+        vq->ubufs = ubufs;
         vhost_net_disable_vq(n, vq);
         rcu_assign_pointer(vq->private_data, sock);
         vhost_net_enable_vq(n, vq);
@@ -639,6 +707,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 
     mutex_unlock(&vq->mutex);
 
+    if (oldubufs)
+        vhost_ubuf_put_and_wait(oldubufs);
+
     if (oldsock) {
         vhost_net_flush_vq(n, index);
         fput(oldsock->file);
@@ -647,6 +718,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
     mutex_unlock(&n->dev.mutex);
     return 0;
 
+err_ubufs:
+    fput(sock->file);
 err_vq:
     mutex_unlock(&vq->mutex);
 err:
@@ -776,6 +849,8 @@ static struct miscdevice vhost_net_misc = {
 
 static int vhost_net_init(void)
 {
+    if (experimental_zcopytx)
+        vhost_enable_zcopy(VHOST_NET_VQ_TX);
     return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);
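
The TX fast path above tracks zerocopy completions in vq->heads[], used as a ring of UIO_MAXIOV slots: upend_idx is the next free slot for an in-flight buffer, done_idx is the oldest slot not yet reported to the guest, and packets shorter than VHOST_GOODCOPY_LEN are sent as ordinary copies and marked VHOST_DMA_DONE_LEN immediately so they never wait on the device. The standalone user-space sketch below is illustrative only and not part of the patch; the pend_ring_* names are hypothetical. It models the same modulo-ring bookkeeping, including the rule that out-of-order completions are only signalled once the contiguous prefix starting at done_idx has finished, mirroring vhost_zerocopy_signal_used().

#include <stdio.h>

#define RING_SIZE    1024   /* stands in for UIO_MAXIOV */
#define LEN_DONE     1      /* stands in for VHOST_DMA_DONE_LEN */
#define LEN_CLEAR    0      /* stands in for VHOST_DMA_CLEAR_LEN */
#define LEN_INFLIGHT 2      /* any value other than DONE/CLEAR */

struct pend_ring {
    unsigned int len[RING_SIZE]; /* completion state per slot */
    int upend_idx;               /* next slot to hand to the device */
    int done_idx;                /* oldest slot not yet signalled */
};

/* Reserve a slot for an in-flight zerocopy buffer. */
static int pend_ring_add(struct pend_ring *r)
{
    int slot = r->upend_idx;

    r->len[slot] = LEN_INFLIGHT;
    r->upend_idx = (r->upend_idx + 1) % RING_SIZE;
    return slot;
}

/* DMA-done callback marks its slot; completions may arrive out of order. */
static void pend_ring_complete(struct pend_ring *r, int slot)
{
    r->len[slot] = LEN_DONE;
}

/* Signal only the contiguous prefix of finished slots, like
 * vhost_zerocopy_signal_used(). Returns how many were reaped. */
static int pend_ring_reap(struct pend_ring *r)
{
    int i, n = 0;

    for (i = r->done_idx; i != r->upend_idx; i = (i + 1) % RING_SIZE) {
        if (r->len[i] != LEN_DONE)
            break;
        r->len[i] = LEN_CLEAR;
        ++n;                 /* here vhost would add a used-ring entry */
    }
    if (n)
        r->done_idx = i;
    return n;
}

int main(void)
{
    struct pend_ring r = { .upend_idx = 0, .done_idx = 0 };
    int a = pend_ring_add(&r);
    int b = pend_ring_add(&r);

    pend_ring_complete(&r, b);                 /* out-of-order completion */
    printf("reaped %d\n", pend_ring_reap(&r)); /* 0: slot a still pending */
    pend_ring_complete(&r, a);
    printf("reaped %d\n", pend_ring_reap(&r)); /* 2: both signalled now */
    return 0;
}

The contiguous-prefix rule keeps used-ring updates in order even when the lower device completes DMAs out of order, at the cost of head-of-line blocking behind the oldest outstanding buffer.
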
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ea966b356352..5ef2f62becf4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -37,6 +37,8 @@ enum {
     VHOST_MEMORY_F_LOG = 0x1,
 };
 
+static unsigned vhost_zcopy_mask __read_mostly;
+
 #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
 
@@ -179,6 +181,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
     vq->call_ctx = NULL;
     vq->call = NULL;
     vq->log_ctx = NULL;
+    vq->upend_idx = 0;
+    vq->done_idx = 0;
+    vq->ubufs = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -225,10 +230,28 @@ static int vhost_worker(void *data)
     return 0;
 }
 
+static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
+{
+    kfree(vq->indirect);
+    vq->indirect = NULL;
+    kfree(vq->log);
+    vq->log = NULL;
+    kfree(vq->heads);
+    vq->heads = NULL;
+    kfree(vq->ubuf_info);
+    vq->ubuf_info = NULL;
+}
+
+void vhost_enable_zcopy(int vq)
+{
+    vhost_zcopy_mask |= 0x1 << vq;
+}
+
 /* Helper to allocate iovec buffers for all vqs. */
 static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 {
     int i;
+    bool zcopy;
 
     for (i = 0; i < dev->nvqs; ++i) {
         dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
@@ -237,19 +260,21 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
                       GFP_KERNEL);
         dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
                         UIO_MAXIOV, GFP_KERNEL);
-
+        zcopy = vhost_zcopy_mask & (0x1 << i);
+        if (zcopy)
+            dev->vqs[i].ubuf_info =
+                kmalloc(sizeof *dev->vqs[i].ubuf_info *
+                    UIO_MAXIOV, GFP_KERNEL);
         if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
-            !dev->vqs[i].heads)
+            !dev->vqs[i].heads ||
+            (zcopy && !dev->vqs[i].ubuf_info))
             goto err_nomem;
     }
     return 0;
 
 err_nomem:
-    for (; i >= 0; --i) {
-        kfree(dev->vqs[i].indirect);
-        kfree(dev->vqs[i].log);
-        kfree(dev->vqs[i].heads);
-    }
+    for (; i >= 0; --i)
+        vhost_vq_free_iovecs(&dev->vqs[i]);
     return -ENOMEM;
 }
 
@@ -257,14 +282,8 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 {
     int i;
 
-    for (i = 0; i < dev->nvqs; ++i) {
-        kfree(dev->vqs[i].indirect);
-        dev->vqs[i].indirect = NULL;
-        kfree(dev->vqs[i].log);
-        dev->vqs[i].log = NULL;
-        kfree(dev->vqs[i].heads);
-        dev->vqs[i].heads = NULL;
-    }
+    for (i = 0; i < dev->nvqs; ++i)
+        vhost_vq_free_iovecs(&dev->vqs[i]);
 }
 
 long vhost_dev_init(struct vhost_dev *dev,
@@ -287,6 +306,7 @@ long vhost_dev_init(struct vhost_dev *dev,
         dev->vqs[i].log = NULL;
         dev->vqs[i].indirect = NULL;
         dev->vqs[i].heads = NULL;
+        dev->vqs[i].ubuf_info = NULL;
         dev->vqs[i].dev = dev;
         mutex_init(&dev->vqs[i].mutex);
         vhost_vq_reset(dev, dev->vqs + i);
@@ -390,6 +410,30 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
     return 0;
 }
 
+/* In case of DMA done not in order in lower device driver for some reason.
+ * upend_idx is used to track end of used idx, done_idx is used to track head
+ * of used idx. Once lower device DMA done contiguously, we will signal KVM
+ * guest used idx.
+ */
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+{
+    int i;
+    int j = 0;
+
+    for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+        if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) {
+            vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+            vhost_add_used_and_signal(vq->dev, vq,
+                          vq->heads[i].id, 0);
+            ++j;
+        } else
+            break;
+    }
+    if (j)
+        vq->done_idx = i;
+    return j;
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
@@ -400,6 +444,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
             vhost_poll_stop(&dev->vqs[i].poll);
             vhost_poll_flush(&dev->vqs[i].poll);
         }
+        /* Wait for all lower device DMAs done. */
+        if (dev->vqs[i].ubufs)
+            vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+
+        /* Signal guest as appropriate. */
+        vhost_zerocopy_signal_used(&dev->vqs[i]);
+
         if (dev->vqs[i].error_ctx)
             eventfd_ctx_put(dev->vqs[i].error_ctx);
         if (dev->vqs[i].error)
@@ -1486,3 +1537,50 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
                    &vq->used->flags, r);
     }
 }
+
+static void vhost_zerocopy_done_signal(struct kref *kref)
+{
+    struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
+                            kref);
+    wake_up(&ubufs->wait);
+}
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
+                    bool zcopy)
+{
+    struct vhost_ubuf_ref *ubufs;
+    /* No zero copy backend? Nothing to count. */
+    if (!zcopy)
+        return NULL;
+    ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
+    if (!ubufs)
+        return ERR_PTR(-ENOMEM);
+    kref_init(&ubufs->kref);
+    kref_get(&ubufs->kref);
+    init_waitqueue_head(&ubufs->wait);
+    ubufs->vq = vq;
+    return ubufs;
+}
+
+void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
+{
+    kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
+
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+{
+    kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+    wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+    kfree(ubufs);
+}
+
+void vhost_zerocopy_callback(void *arg)
+{
+    struct ubuf_info *ubuf = arg;
+    struct vhost_ubuf_ref *ubufs = ubuf->arg;
+    struct vhost_virtqueue *vq = ubufs->vq;
+
+    /* set len = 1 to mark this desc buffers done DMA */
+    vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
+    kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
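
vhost_ubuf_put_and_wait() above is a reference-count drain: every in-flight zerocopy buffer holds a reference on the vhost_ubuf_ref, vhost_zerocopy_callback() drops one when the lower device finishes its DMA, and the teardown path drops its own reference and sleeps on the waitqueue until the count reaches zero. The user-space sketch below (pthreads; the drain_ref_* names are hypothetical) is only an analogy for that kref-plus-wait_event pattern and does not claim to reproduce the exact reference counts the patch maintains; build it with -lpthread.

#include <pthread.h>
#include <stdio.h>

/* A user-space analogue of kref + waitqueue: completions drop references,
 * teardown drops its own reference and blocks until the count drains. */
struct drain_ref {
    pthread_mutex_t lock;
    pthread_cond_t  drained;
    int             count;
};

static void drain_ref_init(struct drain_ref *d)
{
    pthread_mutex_init(&d->lock, NULL);
    pthread_cond_init(&d->drained, NULL);
    d->count = 1;                     /* the owner's reference */
}

static void drain_ref_get(struct drain_ref *d)
{
    pthread_mutex_lock(&d->lock);
    d->count++;
    pthread_mutex_unlock(&d->lock);
}

static void drain_ref_put(struct drain_ref *d)
{
    pthread_mutex_lock(&d->lock);
    if (--d->count == 0)
        pthread_cond_broadcast(&d->drained);  /* like wake_up() */
    pthread_mutex_unlock(&d->lock);
}

/* Drop the owner's reference and block until all others are gone. */
static void drain_ref_put_and_wait(struct drain_ref *d)
{
    pthread_mutex_lock(&d->lock);
    if (--d->count == 0)
        pthread_cond_broadcast(&d->drained);
    while (d->count != 0)
        pthread_cond_wait(&d->drained, &d->lock); /* like wait_event() */
    pthread_mutex_unlock(&d->lock);
}

static void *completion_thread(void *arg)
{
    drain_ref_put(arg);               /* models vhost_zerocopy_callback() */
    return NULL;
}

int main(void)
{
    struct drain_ref d;
    pthread_t t;

    drain_ref_init(&d);
    drain_ref_get(&d);                /* one "in-flight buffer" */
    pthread_create(&t, NULL, completion_thread, &d);
    drain_ref_put_and_wait(&d);       /* returns once the completion ran */
    pthread_join(&t, NULL);
    printf("all references drained\n");
    return 0;
}
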
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8e03379dd30f..1544b782529b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -13,6 +13,11 @@
 #include <linux/virtio_ring.h>
 #include <asm/atomic.h>
 
+/* This is for zerocopy, used buffer len is set to 1 when lower device DMA
+ * done */
+#define VHOST_DMA_DONE_LEN  1
+#define VHOST_DMA_CLEAR_LEN 0
+
 struct vhost_device;
 
 struct vhost_work;
@@ -50,6 +55,18 @@ struct vhost_log {
     u64 len;
 };
 
+struct vhost_virtqueue;
+
+struct vhost_ubuf_ref {
+    struct kref kref;
+    wait_queue_head_t wait;
+    struct vhost_virtqueue *vq;
+};
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool zcopy);
+void vhost_ubuf_put(struct vhost_ubuf_ref *);
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
     struct vhost_dev *dev;
@@ -114,6 +131,16 @@ struct vhost_virtqueue {
     /* Log write descriptors */
     void __user *log_base;
     struct vhost_log *log;
+    /* vhost zerocopy support fields below: */
+    /* last used idx for outstanding DMA zerocopy buffers */
+    int upend_idx;
+    /* first used idx for DMA done zerocopy buffers */
+    int done_idx;
+    /* an array of userspace buffers info */
+    struct ubuf_info *ubuf_info;
+    /* Reference counting for outstanding ubufs.
+     * Protected by vq mutex. Writers must also take device mutex. */
+    struct vhost_ubuf_ref *ubufs;
 };
 
 struct vhost_dev {
@@ -160,6 +187,8 @@ bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
             unsigned int log_num, u64 len);
+void vhost_zerocopy_callback(void *arg);
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
 
 #define vq_err(vq, fmt, ...) do { \
         pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
@@ -186,4 +215,6 @@ static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
     return acked_features & (1 << bit);
 }
 
+void vhost_enable_zcopy(int vq);
+
 #endif
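
Usage note: because experimental_zcopytx is registered with permissions 0444 it cannot be flipped at runtime, so zerocopy TX has to be requested when the module is loaded, e.g. with "modprobe vhost_net experimental_zcopytx=1". vhost_net_init() then calls vhost_enable_zcopy(VHOST_NET_VQ_TX), which sets the TX bit in vhost_zcopy_mask so that vhost_dev_alloc_iovecs() allocates the per-virtqueue ubuf_info array for that queue. The modprobe invocation is the standard module_param mechanism, not something shown in the patch itself.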