path: root/drivers/vhost/vhost.c
author     Michael S. Tsirkin <mst@redhat.com>    2011-07-17 23:48:46 -0400
committer  David S. Miller <davem@davemloft.net>  2011-07-18 13:42:32 -0400
commit     bab632d69ee48a106e779b60cc01adfe80a72807
tree       56b8bd3df85cfee8e425abe18963e5aad015e2fa /drivers/vhost/vhost.c
parent     5c74501f76360ce6f410730b9b5e5976f38e8504
vhost: vhost TX zero-copy support
From: Shirley Ma <mashirle@us.ibm.com>

This adds experimental zero-copy support in vhost-net, disabled by default. To enable it, set the experimental_zcopytx module option to 1.

This patch maintains the outstanding userspace buffers in the sequence they are delivered to vhost. The outstanding userspace buffers are marked as done once the lower device has finished DMA on them; this is monitored through the last-reference kfree_skb callback. Two buffer indices are used for this purpose.

The vhost-net device passes the userspace buffer info to the lower device's skb through message control. DMA-done status checks and guest notification are handled by handle_tx: in the worst case, all buffers in the vq are in pending/done status, so we need to notify the guest to release DMA-done buffers before we can get any new buffers from the vq.

One known problem is that if the guest stops submitting buffers, buffers might never get used until some further action, e.g. a device reset. This does not seem to affect Linux guests.

Signed-off-by: Shirley <xma@us.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
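The two-index scheme described above can be pictured with a short sketch. This is illustrative only, not code from this patch or from drivers/vhost/net.c: the function name example_tx_reap and the in_flight computation are made up for exposition; only vq->done_idx, vq->upend_idx, UIO_MAXIOV, VHOST_DMA_DONE_LEN and vhost_zerocopy_signal_used() come from the patch below.

/* Illustrative sketch (not part of the patch): how the TX path is meant
 * to consume the [done_idx, upend_idx) window of in-flight buffers.
 */
static void example_tx_reap(struct vhost_virtqueue *vq)
{
	/* Buffers handed to the lower device occupy [done_idx, upend_idx);
	 * the skb destructor marks an entry's heads[].len with
	 * VHOST_DMA_DONE_LEN once its DMA completes.
	 */
	int in_flight = (vq->upend_idx - vq->done_idx + UIO_MAXIOV) % UIO_MAXIOV;

	if (in_flight)
		/* Release the contiguous DMA-done prefix to the guest. */
		vhost_zerocopy_signal_used(vq);

	/* In the worst case the whole window is pending/done, so completed
	 * buffers must be signalled to the guest before handle_tx can pull
	 * any new descriptors from the vq.
	 */
}

In other words, completions are returned to the guest in submission order: a buffer that finishes DMA out of order stays parked at VHOST_DMA_DONE_LEN until everything queued before it has completed as well.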
Diffstat (limited to 'drivers/vhost/vhost.c')
-rw-r--r--  drivers/vhost/vhost.c | 128
1 file changed, 113 insertions(+), 15 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ea966b356352..5ef2f62becf4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -37,6 +37,8 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };
 
+static unsigned vhost_zcopy_mask __read_mostly;
+
 #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
 
@@ -179,6 +181,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	vq->upend_idx = 0;
+	vq->done_idx = 0;
+	vq->ubufs = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -225,10 +230,28 @@ static int vhost_worker(void *data)
 	return 0;
 }
 
+static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
+{
+	kfree(vq->indirect);
+	vq->indirect = NULL;
+	kfree(vq->log);
+	vq->log = NULL;
+	kfree(vq->heads);
+	vq->heads = NULL;
+	kfree(vq->ubuf_info);
+	vq->ubuf_info = NULL;
+}
+
+void vhost_enable_zcopy(int vq)
+{
+	vhost_zcopy_mask |= 0x1 << vq;
+}
+
 /* Helper to allocate iovec buffers for all vqs. */
 static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 {
 	int i;
+	bool zcopy;
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
@@ -237,19 +260,21 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 					    GFP_KERNEL);
 		dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
 					    UIO_MAXIOV, GFP_KERNEL);
-
+		zcopy = vhost_zcopy_mask & (0x1 << i);
+		if (zcopy)
+			dev->vqs[i].ubuf_info =
+				kmalloc(sizeof *dev->vqs[i].ubuf_info *
+					UIO_MAXIOV, GFP_KERNEL);
 		if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
-		    !dev->vqs[i].heads)
+		    !dev->vqs[i].heads ||
+		    (zcopy && !dev->vqs[i].ubuf_info))
 			goto err_nomem;
 	}
 	return 0;
 
 err_nomem:
-	for (; i >= 0; --i) {
-		kfree(dev->vqs[i].indirect);
-		kfree(dev->vqs[i].log);
-		kfree(dev->vqs[i].heads);
-	}
+	for (; i >= 0; --i)
+		vhost_vq_free_iovecs(&dev->vqs[i]);
 	return -ENOMEM;
 }
 
@@ -257,14 +282,8 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 {
 	int i;
 
-	for (i = 0; i < dev->nvqs; ++i) {
-		kfree(dev->vqs[i].indirect);
-		dev->vqs[i].indirect = NULL;
-		kfree(dev->vqs[i].log);
-		dev->vqs[i].log = NULL;
-		kfree(dev->vqs[i].heads);
-		dev->vqs[i].heads = NULL;
-	}
+	for (i = 0; i < dev->nvqs; ++i)
+		vhost_vq_free_iovecs(&dev->vqs[i]);
 }
 
 long vhost_dev_init(struct vhost_dev *dev,
@@ -287,6 +306,7 @@ long vhost_dev_init(struct vhost_dev *dev,
 		dev->vqs[i].log = NULL;
 		dev->vqs[i].indirect = NULL;
 		dev->vqs[i].heads = NULL;
+		dev->vqs[i].ubuf_info = NULL;
 		dev->vqs[i].dev = dev;
 		mutex_init(&dev->vqs[i].mutex);
 		vhost_vq_reset(dev, dev->vqs + i);
@@ -390,6 +410,30 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 	return 0;
 }
 
+/* In case of DMA done not in order in lower device driver for some reason.
+ * upend_idx is used to track end of used idx, done_idx is used to track head
+ * of used idx. Once lower device DMA done contiguously, we will signal KVM
+ * guest used idx.
+ */
+int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
+{
+	int i;
+	int j = 0;
+
+	for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+		if ((vq->heads[i].len == VHOST_DMA_DONE_LEN)) {
+			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+			vhost_add_used_and_signal(vq->dev, vq,
+						  vq->heads[i].id, 0);
+			++j;
+		} else
+			break;
+	}
+	if (j)
+		vq->done_idx = i;
+	return j;
+}
+
 /* Caller should have device mutex */
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
@@ -400,6 +444,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 			vhost_poll_stop(&dev->vqs[i].poll);
 			vhost_poll_flush(&dev->vqs[i].poll);
 		}
+		/* Wait for all lower device DMAs done. */
+		if (dev->vqs[i].ubufs)
+			vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
+
+		/* Signal guest as appropriate. */
+		vhost_zerocopy_signal_used(&dev->vqs[i]);
+
 		if (dev->vqs[i].error_ctx)
 			eventfd_ctx_put(dev->vqs[i].error_ctx);
 		if (dev->vqs[i].error)
@@ -1486,3 +1537,50 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 			       &vq->used->flags, r);
 	}
 }
+
+static void vhost_zerocopy_done_signal(struct kref *kref)
+{
+	struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
+						    kref);
+	wake_up(&ubufs->wait);
+}
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
+					bool zcopy)
+{
+	struct vhost_ubuf_ref *ubufs;
+	/* No zero copy backend? Nothing to count. */
+	if (!zcopy)
+		return NULL;
+	ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
+	if (!ubufs)
+		return ERR_PTR(-ENOMEM);
+	kref_init(&ubufs->kref);
+	kref_get(&ubufs->kref);
+	init_waitqueue_head(&ubufs->wait);
+	ubufs->vq = vq;
+	return ubufs;
+}
+
+void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
+{
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
+
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+{
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+	kfree(ubufs);
+}
+
+void vhost_zerocopy_callback(void *arg)
+{
+	struct ubuf_info *ubuf = arg;
+	struct vhost_ubuf_ref *ubufs = ubuf->arg;
+	struct vhost_virtqueue *vq = ubufs->vq;
+
+	/* set len = 1 to mark this desc buffers done DMA */
+	vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
+	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
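As a usage note, the lifetime rules implied by the helpers above can be sketched like this. The caller-side function example_start_tx and the exact way ubuf_info gets filled in are assumptions made for illustration (the real wiring lives in drivers/vhost/net.c, which is outside this file's diff); only the vhost_* helpers, the callback/arg/desc fields of struct ubuf_info, and the vq fields are taken from this patch series.

/* Illustrative sketch (not part of the patch): one reference per
 * in-flight zero-copy buffer, dropped by vhost_zerocopy_callback()
 * when the lower device frees the skb.
 */
static void example_start_tx(struct vhost_virtqueue *vq, unsigned int head)
{
	struct vhost_ubuf_ref *ubufs = vq->ubufs;  /* from vhost_ubuf_alloc() */
	struct ubuf_info *ubuf = &vq->ubuf_info[vq->upend_idx];

	/* Pin the ref structure for as long as this buffer is in flight. */
	kref_get(&ubufs->kref);

	ubuf->callback = vhost_zerocopy_callback;
	ubuf->arg = ubufs;
	ubuf->desc = vq->upend_idx;

	/* Record the descriptor head; len is left at a non-DONE value here,
	 * and vhost_zerocopy_callback() sets it to VHOST_DMA_DONE_LEN when
	 * the lower device is finished with the pages.
	 */
	vq->heads[vq->upend_idx].id = head;
	vq->heads[vq->upend_idx].len = 0;
	vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;

	/* ubuf is then attached to the outgoing skb via message control so
	 * the lower device can invoke the callback from its last kfree_skb().
	 */
}

The counter starts at two (kref_init() plus kref_get()). vhost_ubuf_put() and vhost_ubuf_put_and_wait() each drop one reference, and put_and_wait additionally sleeps until every outstanding per-buffer reference has been released (the count reaches zero) before freeing the structure, which is what lets vhost_dev_cleanup() wait for all lower-device DMAs to finish.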