diff options
author | Alex Williamson <alex.williamson@redhat.com> | 2018-03-21 14:46:21 -0400 |
---|---|---|
committer | Alex Williamson <alex.williamson@redhat.com> | 2018-03-26 15:22:58 -0400 |
commit | 30656177c4080460b936709ff6648f201d7d2c1a (patch) | |
tree | 736aca0117c845222f0fd44cdcd661f431bd810d | |
parent | 07fd7ef3a1c25a11015bb5821c9c5982f722d4a2 (diff) |
vfio/pci: Add ioeventfd support
The ioeventfd here is actually irqfd handling of an ioeventfd such as
supported in KVM. A user is able to pre-program a device write to
occur when the eventfd triggers. This is yet another instance of
eventfd-irqfd triggering between KVM and vfio. The impetus for this
is high frequency writes to pages which are virtualized in QEMU.
Enabling this near-direct write path for selected registers within
the virtualized page can improve performance and reduce overhead.
Specifically this is initially targeted at NVIDIA graphics cards where
the driver issues a write to an MMIO register within a virtualized
region in order to allow the MSI interrupt to re-trigger.
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 35 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_private.h | 19 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_rdwr.c | 111 | ||||
-rw-r--r-- | include/uapi/linux/vfio.h | 27 |
4 files changed, 192 insertions, 0 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index b0f759476900..c6822149b394 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c | |||
@@ -305,6 +305,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) | |||
305 | { | 305 | { |
306 | struct pci_dev *pdev = vdev->pdev; | 306 | struct pci_dev *pdev = vdev->pdev; |
307 | struct vfio_pci_dummy_resource *dummy_res, *tmp; | 307 | struct vfio_pci_dummy_resource *dummy_res, *tmp; |
308 | struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp; | ||
308 | int i, bar; | 309 | int i, bar; |
309 | 310 | ||
310 | /* Stop the device from further DMA */ | 311 | /* Stop the device from further DMA */ |
@@ -314,6 +315,15 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) | |||
314 | VFIO_IRQ_SET_ACTION_TRIGGER, | 315 | VFIO_IRQ_SET_ACTION_TRIGGER, |
315 | vdev->irq_type, 0, 0, NULL); | 316 | vdev->irq_type, 0, 0, NULL); |
316 | 317 | ||
318 | /* Device closed, don't need mutex here */ | ||
319 | list_for_each_entry_safe(ioeventfd, ioeventfd_tmp, | ||
320 | &vdev->ioeventfds_list, next) { | ||
321 | vfio_virqfd_disable(&ioeventfd->virqfd); | ||
322 | list_del(&ioeventfd->next); | ||
323 | kfree(ioeventfd); | ||
324 | } | ||
325 | vdev->ioeventfds_nr = 0; | ||
326 | |||
317 | vdev->virq_disabled = false; | 327 | vdev->virq_disabled = false; |
318 | 328 | ||
319 | for (i = 0; i < vdev->num_regions; i++) | 329 | for (i = 0; i < vdev->num_regions; i++) |
@@ -1012,6 +1022,28 @@ hot_reset_release: | |||
1012 | 1022 | ||
1013 | kfree(groups); | 1023 | kfree(groups); |
1014 | return ret; | 1024 | return ret; |
1025 | } else if (cmd == VFIO_DEVICE_IOEVENTFD) { | ||
1026 | struct vfio_device_ioeventfd ioeventfd; | ||
1027 | int count; | ||
1028 | |||
1029 | minsz = offsetofend(struct vfio_device_ioeventfd, fd); | ||
1030 | |||
1031 | if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) | ||
1032 | return -EFAULT; | ||
1033 | |||
1034 | if (ioeventfd.argsz < minsz) | ||
1035 | return -EINVAL; | ||
1036 | |||
1037 | if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) | ||
1038 | return -EINVAL; | ||
1039 | |||
1040 | count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; | ||
1041 | |||
1042 | if (hweight8(count) != 1 || ioeventfd.fd < -1) | ||
1043 | return -EINVAL; | ||
1044 | |||
1045 | return vfio_pci_ioeventfd(vdev, ioeventfd.offset, | ||
1046 | ioeventfd.data, count, ioeventfd.fd); | ||
1015 | } | 1047 | } |
1016 | 1048 | ||
1017 | return -ENOTTY; | 1049 | return -ENOTTY; |
@@ -1174,6 +1206,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) | |||
1174 | vdev->irq_type = VFIO_PCI_NUM_IRQS; | 1206 | vdev->irq_type = VFIO_PCI_NUM_IRQS; |
1175 | mutex_init(&vdev->igate); | 1207 | mutex_init(&vdev->igate); |
1176 | spin_lock_init(&vdev->irqlock); | 1208 | spin_lock_init(&vdev->irqlock); |
1209 | mutex_init(&vdev->ioeventfds_lock); | ||
1210 | INIT_LIST_HEAD(&vdev->ioeventfds_list); | ||
1177 | 1211 | ||
1178 | ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); | 1212 | ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); |
1179 | if (ret) { | 1213 | if (ret) { |
@@ -1215,6 +1249,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) | |||
1215 | 1249 | ||
1216 | vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); | 1250 | vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); |
1217 | kfree(vdev->region); | 1251 | kfree(vdev->region); |
1252 | mutex_destroy(&vdev->ioeventfds_lock); | ||
1218 | kfree(vdev); | 1253 | kfree(vdev); |
1219 | 1254 | ||
1220 | if (vfio_pci_is_vga(pdev)) { | 1255 | if (vfio_pci_is_vga(pdev)) { |
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index f561ac1c78a0..cde3b5d3441a 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h | |||
@@ -29,6 +29,19 @@ | |||
29 | #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ | 29 | #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ |
30 | #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ | 30 | #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ |
31 | 31 | ||
32 | /* Cap maximum number of ioeventfds per device (arbitrary) */ | ||
33 | #define VFIO_PCI_IOEVENTFD_MAX 1000 | ||
34 | |||
35 | struct vfio_pci_ioeventfd { | ||
36 | struct list_head next; | ||
37 | struct virqfd *virqfd; | ||
38 | void __iomem *addr; | ||
39 | uint64_t data; | ||
40 | loff_t pos; | ||
41 | int bar; | ||
42 | int count; | ||
43 | }; | ||
44 | |||
32 | struct vfio_pci_irq_ctx { | 45 | struct vfio_pci_irq_ctx { |
33 | struct eventfd_ctx *trigger; | 46 | struct eventfd_ctx *trigger; |
34 | struct virqfd *unmask; | 47 | struct virqfd *unmask; |
@@ -92,9 +105,12 @@ struct vfio_pci_device { | |||
92 | bool nointx; | 105 | bool nointx; |
93 | struct pci_saved_state *pci_saved_state; | 106 | struct pci_saved_state *pci_saved_state; |
94 | int refcnt; | 107 | int refcnt; |
108 | int ioeventfds_nr; | ||
95 | struct eventfd_ctx *err_trigger; | 109 | struct eventfd_ctx *err_trigger; |
96 | struct eventfd_ctx *req_trigger; | 110 | struct eventfd_ctx *req_trigger; |
97 | struct list_head dummy_resources_list; | 111 | struct list_head dummy_resources_list; |
112 | struct mutex ioeventfds_lock; | ||
113 | struct list_head ioeventfds_list; | ||
98 | }; | 114 | }; |
99 | 115 | ||
100 | #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) | 116 | #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) |
@@ -120,6 +136,9 @@ extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, | |||
120 | extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, | 136 | extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, |
121 | size_t count, loff_t *ppos, bool iswrite); | 137 | size_t count, loff_t *ppos, bool iswrite); |
122 | 138 | ||
139 | extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, | ||
140 | uint64_t data, int count, int fd); | ||
141 | |||
123 | extern int vfio_pci_init_perm_bits(void); | 142 | extern int vfio_pci_init_perm_bits(void); |
124 | extern void vfio_pci_uninit_perm_bits(void); | 143 | extern void vfio_pci_uninit_perm_bits(void); |
125 | 144 | ||
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 925419e0f459..a6029d0a5524 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/pci.h> | 17 | #include <linux/pci.h> |
18 | #include <linux/uaccess.h> | 18 | #include <linux/uaccess.h> |
19 | #include <linux/io.h> | 19 | #include <linux/io.h> |
20 | #include <linux/vfio.h> | ||
20 | #include <linux/vgaarb.h> | 21 | #include <linux/vgaarb.h> |
21 | 22 | ||
22 | #include "vfio_pci_private.h" | 23 | #include "vfio_pci_private.h" |
@@ -275,3 +276,113 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, | |||
275 | 276 | ||
276 | return done; | 277 | return done; |
277 | } | 278 | } |
279 | |||
280 | static int vfio_pci_ioeventfd_handler(void *opaque, void *unused) | ||
281 | { | ||
282 | struct vfio_pci_ioeventfd *ioeventfd = opaque; | ||
283 | |||
284 | switch (ioeventfd->count) { | ||
285 | case 1: | ||
286 | vfio_iowrite8(ioeventfd->data, ioeventfd->addr); | ||
287 | break; | ||
288 | case 2: | ||
289 | vfio_iowrite16(ioeventfd->data, ioeventfd->addr); | ||
290 | break; | ||
291 | case 4: | ||
292 | vfio_iowrite32(ioeventfd->data, ioeventfd->addr); | ||
293 | break; | ||
294 | #ifdef iowrite64 | ||
295 | case 8: | ||
296 | vfio_iowrite64(ioeventfd->data, ioeventfd->addr); | ||
297 | break; | ||
298 | #endif | ||
299 | } | ||
300 | |||
301 | return 0; | ||
302 | } | ||
303 | |||
304 | long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset, | ||
305 | uint64_t data, int count, int fd) | ||
306 | { | ||
307 | struct pci_dev *pdev = vdev->pdev; | ||
308 | loff_t pos = offset & VFIO_PCI_OFFSET_MASK; | ||
309 | int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset); | ||
310 | struct vfio_pci_ioeventfd *ioeventfd; | ||
311 | |||
312 | /* Only support ioeventfds into BARs */ | ||
313 | if (bar > VFIO_PCI_BAR5_REGION_INDEX) | ||
314 | return -EINVAL; | ||
315 | |||
316 | if (pos + count > pci_resource_len(pdev, bar)) | ||
317 | return -EINVAL; | ||
318 | |||
319 | /* Disallow ioeventfds working around MSI-X table writes */ | ||
320 | if (bar == vdev->msix_bar && | ||
321 | !(pos + count <= vdev->msix_offset || | ||
322 | pos >= vdev->msix_offset + vdev->msix_size)) | ||
323 | return -EINVAL; | ||
324 | |||
325 | #ifndef iowrite64 | ||
326 | if (count == 8) | ||
327 | return -EINVAL; | ||
328 | #endif | ||
329 | |||
330 | ret = vfio_pci_setup_barmap(vdev, bar); | ||
331 | if (ret) | ||
332 | return ret; | ||
333 | |||
334 | mutex_lock(&vdev->ioeventfds_lock); | ||
335 | |||
336 | list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) { | ||
337 | if (ioeventfd->pos == pos && ioeventfd->bar == bar && | ||
338 | ioeventfd->data == data && ioeventfd->count == count) { | ||
339 | if (fd == -1) { | ||
340 | vfio_virqfd_disable(&ioeventfd->virqfd); | ||
341 | list_del(&ioeventfd->next); | ||
342 | vdev->ioeventfds_nr--; | ||
343 | kfree(ioeventfd); | ||
344 | ret = 0; | ||
345 | } else | ||
346 | ret = -EEXIST; | ||
347 | |||
348 | goto out_unlock; | ||
349 | } | ||
350 | } | ||
351 | |||
352 | if (fd < 0) { | ||
353 | ret = -ENODEV; | ||
354 | goto out_unlock; | ||
355 | } | ||
356 | |||
357 | if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) { | ||
358 | ret = -ENOSPC; | ||
359 | goto out_unlock; | ||
360 | } | ||
361 | |||
362 | ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL); | ||
363 | if (!ioeventfd) { | ||
364 | ret = -ENOMEM; | ||
365 | goto out_unlock; | ||
366 | } | ||
367 | |||
368 | ioeventfd->addr = vdev->barmap[bar] + pos; | ||
369 | ioeventfd->data = data; | ||
370 | ioeventfd->pos = pos; | ||
371 | ioeventfd->bar = bar; | ||
372 | ioeventfd->count = count; | ||
373 | |||
374 | ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler, | ||
375 | NULL, NULL, &ioeventfd->virqfd, fd); | ||
376 | if (ret) { | ||
377 | kfree(ioeventfd); | ||
378 | goto out_unlock; | ||
379 | } | ||
380 | |||
381 | list_add(&ioeventfd->next, &vdev->ioeventfds_list); | ||
382 | vdev->ioeventfds_nr++; | ||
383 | |||
384 | out_unlock: | ||
385 | mutex_unlock(&vdev->ioeventfds_lock); | ||
386 | |||
387 | return ret; | ||
388 | } | ||
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c74372163ed2..1aa7b82e8169 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h | |||
@@ -575,6 +575,33 @@ struct vfio_device_gfx_plane_info { | |||
575 | 575 | ||
576 | #define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15) | 576 | #define VFIO_DEVICE_GET_GFX_DMABUF _IO(VFIO_TYPE, VFIO_BASE + 15) |
577 | 577 | ||
578 | /** | ||
579 | * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 16, | ||
580 | * struct vfio_device_ioeventfd) | ||
581 | * | ||
582 | * Perform a write to the device at the specified device fd offset, with | ||
583 | * the specified data and width when the provided eventfd is triggered. | ||
584 | * vfio bus drivers may not support this for all regions, for all widths, | ||
585 | * or at all. vfio-pci currently only enables support for BAR regions, | ||
586 | * excluding the MSI-X vector table. | ||
587 | * | ||
588 | * Return: 0 on success, -errno on failure. | ||
589 | */ | ||
590 | struct vfio_device_ioeventfd { | ||
591 | __u32 argsz; | ||
592 | __u32 flags; | ||
593 | #define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */ | ||
594 | #define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */ | ||
595 | #define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ | ||
596 | #define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ | ||
597 | #define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) | ||
598 | __u64 offset; /* device fd offset of write */ | ||
599 | __u64 data; /* data to be written */ | ||
600 | __s32 fd; /* -1 for de-assignment */ | ||
601 | }; | ||
602 | |||
603 | #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) | ||
604 | |||
578 | /* -------- API for Type1 VFIO IOMMU -------- */ | 605 | /* -------- API for Type1 VFIO IOMMU -------- */ |
579 | 606 | ||
580 | /** | 607 | /** |