about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Alexey Kardashevskiy <aik@ozlabs.ru> 2017-12-12 21:31:31 -0500
committer: Alex Williamson <alex.williamson@redhat.com> 2017-12-20 12:05:06 -0500
commit: a32295c612c57990d17fb0f41e7134394b2f35f6 (patch)
tree: b6e59851062efc6a57040c3a8e5312826d9c5a3e
parent: dda01f787df9f9e46f1c0bf8aa11f246e300750d (diff)
vfio-pci: Allow mapping MSIX BAR
By default VFIO disables mapping of the MSIX BAR to the userspace as the userspace may program it in a way allowing spurious interrupts; instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl. In order to eliminate guessing from the userspace about what is mmappable, VFIO also advertises a sparse list of regions allowed to mmap.

This works fine as long as the system page size equals the MSIX alignment requirement, which is 4KB. However, with a bigger page size the existing code prohibits mapping non-MSIX parts of a page with MSIX structures, so these parts have to be emulated via slow reads/writes on a VFIO device fd. If these emulated bits are accessed often, this has a serious impact on performance.

This allows mmap of the entire BAR containing the MSIX vector table.

This removes the sparse capability for PCI devices as it becomes useless.

As the userspace needs to know for sure whether mmapping of the BAR area containing the MSIX vector table can succeed, this adds a new capability - VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace that the entire BAR can be mmapped.

This does not touch the MSIX mangling in the BAR read/write handlers as we are doing this just to enable direct access to non-MSIX registers.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw - fixup whitespace, trim function name]
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
-rw-r--r--  drivers/vfio/pci/vfio_pci.c  64
-rw-r--r--  include/uapi/linux/vfio.h  10
2 files changed, 18 insertions, 56 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index de48acd29a84..b0f759476900 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -565,47 +565,15 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
565 return walk.ret; 565 return walk.ret;
566} 566}
567 567
568static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, 568static int msix_mmappable_cap(struct vfio_pci_device *vdev,
569 struct vfio_info_cap *caps) 569 struct vfio_info_cap *caps)
570{ 570{
571 struct vfio_region_info_cap_sparse_mmap *sparse; 571 struct vfio_info_cap_header header = {
572 size_t end, size; 572 .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
573 int nr_areas = 2, i = 0, ret; 573 .version = 1
574 574 };
575 end = pci_resource_len(vdev->pdev, vdev->msix_bar);
576
577 /* If MSI-X table is aligned to the start or end, only one area */
578 if (((vdev->msix_offset & PAGE_MASK) == 0) ||
579 (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
580 nr_areas = 1;
581
582 size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
583
584 sparse = kzalloc(size, GFP_KERNEL);
585 if (!sparse)
586 return -ENOMEM;
587
588 sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
589 sparse->header.version = 1;
590 sparse->nr_areas = nr_areas;
591
592 if (vdev->msix_offset & PAGE_MASK) {
593 sparse->areas[i].offset = 0;
594 sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
595 i++;
596 }
597
598 if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
599 sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
600 vdev->msix_size);
601 sparse->areas[i].size = end - sparse->areas[i].offset;
602 i++;
603 }
604
605 ret = vfio_info_add_capability(caps, &sparse->header, size);
606 kfree(sparse);
607 575
608 return ret; 576 return vfio_info_add_capability(caps, &header, sizeof(header));
609} 577}
610 578
611int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 579int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
@@ -696,7 +664,7 @@ static long vfio_pci_ioctl(void *device_data,
696 if (vdev->bar_mmap_supported[info.index]) { 664 if (vdev->bar_mmap_supported[info.index]) {
697 info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 665 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
698 if (info.index == vdev->msix_bar) { 666 if (info.index == vdev->msix_bar) {
699 ret = msix_sparse_mmap_cap(vdev, &caps); 667 ret = msix_mmappable_cap(vdev, &caps);
700 if (ret) 668 if (ret)
701 return ret; 669 return ret;
702 } 670 }
@@ -1127,22 +1095,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
1127 if (req_start + req_len > phys_len) 1095 if (req_start + req_len > phys_len)
1128 return -EINVAL; 1096 return -EINVAL;
1129 1097
1130 if (index == vdev->msix_bar) {
1131 /*
1132 * Disallow mmaps overlapping the MSI-X table; users don't
1133 * get to touch this directly. We could find somewhere
1134 * else to map the overlap, but page granularity is only
1135 * a recommendation, not a requirement, so the user needs
1136 * to know which bits are real. Requiring them to mmap
1137 * around the table makes that clear.
1138 */
1139
1140 /* If neither entirely above nor below, then it overlaps */
1141 if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
1142 req_start + req_len <= vdev->msix_offset))
1143 return -EINVAL;
1144 }
1145
1146 /* 1098 /*
1147 * Even though we don't make use of the barmap for the mmap, 1099 * Even though we don't make use of the barmap for the mmap,
1148 * we need to request the region and the barmap tracks that. 1100 * we need to request the region and the barmap tracks that.
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e3301dbd27d4..0d914350f7bf 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -301,6 +301,16 @@ struct vfio_region_info_cap_type {
301#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) 301#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
302#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) 302#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3)
303 303
304/*
305 * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
306 * which allows direct access to non-MSIX registers which happened to be within
307 * the same system page.
308 *
309 * Even though the userspace gets direct access to the MSIX data, the existing
310 * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
311 */
312#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3
313
304/** 314/**
305 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, 315 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
306 * struct vfio_irq_info) 316 * struct vfio_irq_info)