 drivers/vfio/pci/vfio_pci.c         |  21
 drivers/vfio/pci/vfio_pci_intrs.c   |  60
 drivers/vfio/pci/vfio_pci_private.h |   1
 drivers/vfio/vfio.c                 | 119
 drivers/vfio/vfio_iommu_type1.c     |  80
 include/linux/vfio.h                |   2
 include/uapi/linux/vfio.h           |   1
 7 files changed, 242 insertions, 42 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7cc0122a18ce..f8a186381ae8 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -239,9 +239,12 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
 
 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 		}
-	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
 		if (pci_is_pcie(vdev->pdev))
 			return 1;
+	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
+		return 1;
+	}
 
 	return 0;
 }
@@ -464,6 +467,7 @@ static long vfio_pci_ioctl(void *device_data,
 
 		switch (info.index) {
 		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+		case VFIO_PCI_REQ_IRQ_INDEX:
 			break;
 		case VFIO_PCI_ERR_IRQ_INDEX:
 			if (pci_is_pcie(vdev->pdev))
@@ -828,6 +832,20 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 			       req_len, vma->vm_page_prot);
 }
 
+static void vfio_pci_request(void *device_data, unsigned int count)
+{
+	struct vfio_pci_device *vdev = device_data;
+
+	mutex_lock(&vdev->igate);
+
+	if (vdev->req_trigger) {
+		dev_dbg(&vdev->pdev->dev, "Requesting device from user\n");
+		eventfd_signal(vdev->req_trigger, 1);
+	}
+
+	mutex_unlock(&vdev->igate);
+}
+
 static const struct vfio_device_ops vfio_pci_ops = {
 	.name		= "vfio-pci",
 	.open		= vfio_pci_open,
@@ -836,6 +854,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.read		= vfio_pci_read,
 	.write		= vfio_pci_write,
 	.mmap		= vfio_pci_mmap,
+	.request	= vfio_pci_request,
 };
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
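
The new VFIO_PCI_REQ_IRQ_INDEX rides the existing VFIO_DEVICE_GET_IRQ_INFO path shown above, so userspace can probe for the request IRQ before relying on it. A minimal probe sketch in userspace C, assuming an already-open VFIO device file descriptor (the helper name is illustrative, not part of this patch):

	#include <linux/vfio.h>
	#include <sys/ioctl.h>

	/* Illustrative: nonzero if the device exposes the request IRQ. */
	static int vfio_has_req_irq(int device_fd)
	{
		struct vfio_irq_info info = {
			.argsz = sizeof(info),
			.index = VFIO_PCI_REQ_IRQ_INDEX,
		};

		if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &info))
			return 0;

		/* vfio_pci_get_irq_count() reports exactly one interrupt */
		return info.count == 1;
	}
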
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index e8d695b3f54e..f88bfdf5b6a0 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -763,46 +763,70 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
 	return 0;
 }
 
-static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
-				    unsigned index, unsigned start,
-				    unsigned count, uint32_t flags, void *data)
+static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+					   uint32_t flags, void *data)
 {
 	int32_t fd = *(int32_t *)data;
 
-	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
-	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
+	if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
 		return -EINVAL;
 
 	/* DATA_NONE/DATA_BOOL enables loopback testing */
 	if (flags & VFIO_IRQ_SET_DATA_NONE) {
-		if (vdev->err_trigger)
-			eventfd_signal(vdev->err_trigger, 1);
+		if (*ctx)
+			eventfd_signal(*ctx, 1);
 		return 0;
 	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
 		uint8_t trigger = *(uint8_t *)data;
-		if (trigger && vdev->err_trigger)
-			eventfd_signal(vdev->err_trigger, 1);
+		if (trigger && *ctx)
+			eventfd_signal(*ctx, 1);
 		return 0;
 	}
 
 	/* Handle SET_DATA_EVENTFD */
 	if (fd == -1) {
-		if (vdev->err_trigger)
-			eventfd_ctx_put(vdev->err_trigger);
-		vdev->err_trigger = NULL;
+		if (*ctx)
+			eventfd_ctx_put(*ctx);
+		*ctx = NULL;
 		return 0;
 	} else if (fd >= 0) {
 		struct eventfd_ctx *efdctx;
 		efdctx = eventfd_ctx_fdget(fd);
 		if (IS_ERR(efdctx))
 			return PTR_ERR(efdctx);
-		if (vdev->err_trigger)
-			eventfd_ctx_put(vdev->err_trigger);
-		vdev->err_trigger = efdctx;
+		if (*ctx)
+			eventfd_ctx_put(*ctx);
+		*ctx = efdctx;
 		return 0;
 	} else
 		return -EINVAL;
 }
+
+static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_ERR_IRQ_INDEX)
+		return -EINVAL;
+
+	/*
+	 * We should sanitize start & count, but that wasn't caught
+	 * originally, so this IRQ index must forever ignore them :-(
+	 */
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
+}
+
+static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1)
+		return -EINVAL;
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data);
+}
+
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			    unsigned index, unsigned start, unsigned count,
 			    void *data)
@@ -844,6 +868,12 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			func = vfio_pci_set_err_trigger;
 			break;
 		}
+	case VFIO_PCI_REQ_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			func = vfio_pci_set_req_trigger;
+			break;
+		}
 	}
 
 	if (!func)
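
Userspace arms the request IRQ through the usual VFIO_DEVICE_SET_IRQS ioctl. A minimal sketch, again with illustrative naming; note that vfio_pci_set_req_trigger() above insists on start == 0 and count == 1:

	#include <linux/vfio.h>
	#include <stdint.h>
	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/ioctl.h>

	/* Illustrative: wire an eventfd to VFIO_PCI_REQ_IRQ_INDEX. */
	static int vfio_req_irq_enable(int device_fd)
	{
		char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
		struct vfio_irq_set *set = (struct vfio_irq_set *)buf;
		int32_t efd = eventfd(0, EFD_CLOEXEC);

		if (efd < 0)
			return -1;

		set->argsz = sizeof(buf);
		set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			     VFIO_IRQ_SET_ACTION_TRIGGER;
		set->index = VFIO_PCI_REQ_IRQ_INDEX;
		set->start = 0;
		set->count = 1;
		memcpy(set->data, &efd, sizeof(int32_t));

		if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set))
			return -1;

		return efd;	/* readable when the kernel asks for the device */
	}

Passing VFIO_IRQ_SET_DATA_NONE in place of the eventfd signals whatever eventfd is currently registered, which is the loopback-testing path called out in the comment above.
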
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 671c17a6e6d0..c9f9b323f152 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -58,6 +58,7 @@ struct vfio_pci_device {
 	struct pci_saved_state	*pci_saved_state;
 	int			refcnt;
 	struct eventfd_ctx	*err_trigger;
+	struct eventfd_ctx	*req_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index f018d8d0f975..4cde85501444 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -63,6 +63,11 @@ struct vfio_container {
 	void			*iommu_data;
 };
 
+struct vfio_unbound_dev {
+	struct device		*dev;
+	struct list_head	unbound_next;
+};
+
 struct vfio_group {
 	struct kref		kref;
 	int			minor;
@@ -75,6 +80,8 @@ struct vfio_group {
 	struct notifier_block	nb;
 	struct list_head	vfio_next;
 	struct list_head	container_next;
+	struct list_head	unbound_list;
+	struct mutex		unbound_lock;
 	atomic_t		opened;
 };
 
@@ -204,6 +211,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 	kref_init(&group->kref);
 	INIT_LIST_HEAD(&group->device_list);
 	mutex_init(&group->device_lock);
+	INIT_LIST_HEAD(&group->unbound_list);
+	mutex_init(&group->unbound_lock);
 	atomic_set(&group->container_users, 0);
 	atomic_set(&group->opened, 0);
 	group->iommu_group = iommu_group;
@@ -264,13 +273,22 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 static void vfio_group_release(struct kref *kref)
 {
 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
+	struct vfio_unbound_dev *unbound, *tmp;
+	struct iommu_group *iommu_group = group->iommu_group;
 
 	WARN_ON(!list_empty(&group->device_list));
 
+	list_for_each_entry_safe(unbound, tmp,
+				 &group->unbound_list, unbound_next) {
+		list_del(&unbound->unbound_next);
+		kfree(unbound);
+	}
+
 	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 	list_del(&group->vfio_next);
 	vfio_free_group_minor(group->minor);
 	vfio_group_unlock_and_free(group);
+	iommu_group_put(iommu_group);
 }
 
 static void vfio_group_put(struct vfio_group *group)
@@ -440,17 +458,36 @@ static bool vfio_whitelisted_driver(struct device_driver *drv)
 }
 
 /*
- * A vfio group is viable for use by userspace if all devices are either
- * driver-less or bound to a vfio or whitelisted driver.  We test the
- * latter by the existence of a struct vfio_device matching the dev.
+ * A vfio group is viable for use by userspace if all devices are in
+ * one of the following states:
+ *  - driver-less
+ *  - bound to a vfio driver
+ *  - bound to a whitelisted driver
+ *
+ * We use two methods to determine whether a device is bound to a vfio
+ * driver.  The first is to test whether the device exists in the vfio
+ * group.  The second is to test if the device exists on the group
+ * unbound_list, indicating it's in the middle of transitioning from
+ * a vfio driver to driver-less.
  */
 static int vfio_dev_viable(struct device *dev, void *data)
 {
 	struct vfio_group *group = data;
 	struct vfio_device *device;
 	struct device_driver *drv = ACCESS_ONCE(dev->driver);
+	struct vfio_unbound_dev *unbound;
+	int ret = -EINVAL;
 
-	if (!drv || vfio_whitelisted_driver(drv))
+	mutex_lock(&group->unbound_lock);
+	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
+		if (dev == unbound->dev) {
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&group->unbound_lock);
+
+	if (!ret || !drv || vfio_whitelisted_driver(drv))
 		return 0;
 
 	device = vfio_group_get_device(group, dev);
@@ -459,7 +496,7 @@ static int vfio_dev_viable(struct device *dev, void *data)
 		return 0;
 	}
 
-	return -EINVAL;
+	return ret;
 }
 
 /**
@@ -501,6 +538,7 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 {
 	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 	struct device *dev = data;
+	struct vfio_unbound_dev *unbound;
 
 	/*
 	 * Need to go through a group_lock lookup to get a reference or we
@@ -550,6 +588,17 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 		 * stop the system to maintain isolation.  At a minimum, we'd
 		 * want a toggle to disable driver auto probe for this device.
 		 */
+
+		mutex_lock(&group->unbound_lock);
+		list_for_each_entry(unbound,
+				    &group->unbound_list, unbound_next) {
+			if (dev == unbound->dev) {
+				list_del(&unbound->unbound_next);
+				kfree(unbound);
+				break;
+			}
+		}
+		mutex_unlock(&group->unbound_lock);
 		break;
 	}
 
@@ -578,6 +627,12 @@ int vfio_add_group_dev(struct device *dev,
 			iommu_group_put(iommu_group);
 			return PTR_ERR(group);
 		}
+	} else {
+		/*
+		 * A found vfio_group already holds a reference to the
+		 * iommu_group.  A created vfio_group keeps the reference.
+		 */
+		iommu_group_put(iommu_group);
 	}
 
 	device = vfio_group_get_device(group, dev);
@@ -586,21 +641,19 @@ int vfio_add_group_dev(struct device *dev,
 			dev_name(dev), iommu_group_id(iommu_group));
 		vfio_device_put(device);
 		vfio_group_put(group);
-		iommu_group_put(iommu_group);
 		return -EBUSY;
 	}
 
 	device = vfio_group_create_device(group, dev, ops, device_data);
 	if (IS_ERR(device)) {
 		vfio_group_put(group);
-		iommu_group_put(iommu_group);
 		return PTR_ERR(device);
 	}
 
 	/*
-	 * Added device holds reference to iommu_group and vfio_device
-	 * (which in turn holds reference to vfio_group).  Drop extra
-	 * group reference used while acquiring device.
+	 * Drop all but the vfio_device reference.  The vfio_device holds
+	 * a reference to the vfio_group, which holds a reference to the
+	 * iommu_group.
 	 */
 	vfio_group_put(group);
 
@@ -655,8 +708,9 @@ void *vfio_del_group_dev(struct device *dev)
 {
 	struct vfio_device *device = dev_get_drvdata(dev);
 	struct vfio_group *group = device->group;
-	struct iommu_group *iommu_group = group->iommu_group;
 	void *device_data = device->device_data;
+	struct vfio_unbound_dev *unbound;
+	unsigned int i = 0;
 
 	/*
 	 * The group exists so long as we have a device reference.  Get
@@ -664,14 +718,49 @@ void *vfio_del_group_dev(struct device *dev)
 	 */
 	vfio_group_get(group);
 
+	/*
+	 * When the device is removed from the group, the group suddenly
+	 * becomes non-viable; the device has a driver (until the unbind
+	 * completes), but it's not present in the group.  This is bad news
+	 * for any external users that need to re-acquire a group reference
+	 * in order to match and release their existing reference.  To
+	 * solve this, we track such devices on the unbound_list to bridge
+	 * the gap until they're fully unbound.
+	 */
+	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
+	if (unbound) {
+		unbound->dev = dev;
+		mutex_lock(&group->unbound_lock);
+		list_add(&unbound->unbound_next, &group->unbound_list);
+		mutex_unlock(&group->unbound_lock);
+	}
+	WARN_ON(!unbound);
+
 	vfio_device_put(device);
 
-	/* TODO send a signal to encourage this to be released */
-	wait_event(vfio.release_q, !vfio_dev_present(group, dev));
+	/*
+	 * If the device is still present in the group after the above
+	 * 'put', then it is in use and we need to request it from the
+	 * bus driver.  The driver may in turn need to request the
+	 * device from the user.  We send the request on an arbitrary
+	 * interval with counter to allow the driver to take escalating
+	 * measures to release the device if it has the ability to do so.
+	 */
+	do {
+		device = vfio_group_get_device(group, dev);
+		if (!device)
+			break;
 
-	vfio_group_put(group);
+		if (device->ops->request)
+			device->ops->request(device_data, i++);
 
-	iommu_group_put(iommu_group);
+		vfio_device_put(device);
+
+	} while (wait_event_interruptible_timeout(vfio.release_q,
+						  !vfio_dev_present(group, dev),
+						  HZ * 10) <= 0);
+
+	vfio_group_put(group);
 
 	return device_data;
 }
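
The user-side counterpart to the loop above is an eventfd watcher that quiesces and closes the device when signaled; the kernel re-sends the request every ten seconds (HZ * 10) with an escalating count until vfio_dev_present() finally fails. A minimal cooperative handler sketch (names illustrative):

	#include <poll.h>
	#include <stdint.h>
	#include <unistd.h>

	/* Illustrative: block until the kernel requests the device back. */
	static void vfio_req_irq_handle(int req_efd, int device_fd)
	{
		struct pollfd pfd = { .fd = req_efd, .events = POLLIN };
		uint64_t count;

		while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
			if (read(req_efd, &count, sizeof(count)) != sizeof(count))
				continue;
			/*
			 * Quiesce the device, then drop the descriptor so
			 * the last vfio_device reference goes away and the
			 * vfio_dev_present() wait in vfio_del_group_dev()
			 * can complete.
			 */
			close(device_fd);
			break;
		}
	}
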
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4a9d666f1e91..57d8c37a002b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -66,6 +66,7 @@ struct vfio_domain {
 	struct list_head	next;
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
+	bool			fgsp;		/* Fine-grained super pages */
 };
 
 struct vfio_dma {
@@ -264,6 +265,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, i;
+	bool rsvd;
 
 	if (!current->mm)
 		return -ENODEV;
@@ -272,10 +274,9 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	if (ret)
 		return ret;
 
-	if (is_invalid_reserved_pfn(*pfn_base))
-		return 1;
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
-	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
 		put_pfn(*pfn_base, prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
@@ -283,7 +284,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	}
 
 	if (unlikely(disable_hugepages)) {
-		vfio_lock_acct(1);
+		if (!rsvd)
+			vfio_lock_acct(1);
 		return 1;
 	}
 
@@ -295,12 +297,14 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 		if (ret)
 			break;
 
-		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+		if (pfn != *pfn_base + i ||
+		    rsvd != is_invalid_reserved_pfn(pfn)) {
 			put_pfn(pfn, prot);
 			break;
 		}
 
-		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
+		if (!rsvd && !lock_cap &&
+		    current->mm->locked_vm + i + 1 > limit) {
 			put_pfn(pfn, prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
@@ -308,7 +312,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 		}
 	}
 
-	vfio_lock_acct(i);
+	if (!rsvd)
+		vfio_lock_acct(i);
 
 	return i;
 }
@@ -346,12 +351,14 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	domain = d = list_first_entry(&iommu->domain_list,
 				      struct vfio_domain, next);
 
-	list_for_each_entry_continue(d, &iommu->domain_list, next)
+	list_for_each_entry_continue(d, &iommu->domain_list, next) {
 		iommu_unmap(d->domain, dma->iova, dma->size);
+		cond_resched();
+	}
 
 	while (iova < end) {
-		size_t unmapped;
-		phys_addr_t phys;
+		size_t unmapped, len;
+		phys_addr_t phys, next;
 
 		phys = iommu_iova_to_phys(domain->domain, iova);
 		if (WARN_ON(!phys)) {
@@ -359,7 +366,19 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 			continue;
 		}
 
-		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
+		/*
+		 * To optimize for fewer iommu_unmap() calls, each of which
+		 * may require hardware cache flushing, try to find the
+		 * largest contiguous physical memory chunk to unmap.
+		 */
+		for (len = PAGE_SIZE;
+		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+			next = iommu_iova_to_phys(domain->domain, iova + len);
+			if (next != phys + len)
+				break;
+		}
+
+		unmapped = iommu_unmap(domain->domain, iova, len);
 		if (WARN_ON(!unmapped))
 			break;
 
@@ -367,6 +386,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 					     unmapped >> PAGE_SHIFT,
 					     dma->prot, false);
 		iova += unmapped;
+
+		cond_resched();
 	}
 
 	vfio_lock_acct(-unlocked);
@@ -511,6 +532,8 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 			    map_try_harder(d, iova, pfn, npage, prot))
 				goto unwind;
 		}
+
+		cond_resched();
 	}
 
 	return 0;
@@ -665,6 +688,39 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	return 0;
 }
 
+/*
+ * We change our unmap behavior slightly depending on whether the IOMMU
+ * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
+ * for practically any contiguous power-of-two mapping we give it.  This means
+ * we don't need to look for contiguous chunks ourselves to make unmapping
+ * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
+ * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
+ * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
+ * hugetlbfs is in use.
+ */
+static void vfio_test_domain_fgsp(struct vfio_domain *domain)
+{
+	struct page *pages;
+	int ret, order = get_order(PAGE_SIZE * 2);
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!pages)
+		return;
+
+	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
+			IOMMU_READ | IOMMU_WRITE | domain->prot);
+	if (!ret) {
+		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
+
+		if (unmapped == PAGE_SIZE)
+			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
+		else
+			domain->fgsp = true;
+	}
+
+	__free_pages(pages, order);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -758,6 +814,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	vfio_test_domain_fgsp(domain);
+
 	/* replay mappings on new domains */
 	ret = vfio_iommu_replay(iommu, domain);
 	if (ret)
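
The fgsp probe reduces to a single comparison on the iommu_unmap() return value. A standalone restatement of that decision, assuming 4K pages (purely illustrative; the kernel keeps the result in domain->fgsp):

	#include <stdbool.h>
	#include <stddef.h>

	#define PAGE_SIZE 4096UL	/* assumption: 4K pages */

	/*
	 * After mapping two contiguous pages as a single region and asking
	 * the IOMMU to unmap only the first, the amount actually unmapped
	 * reveals the page-table granularity the hardware chose.
	 */
	static bool iommu_has_fgsp(size_t unmapped)
	{
		/*
		 * Exactly one page out: discrete 4K entries (VT-d style),
		 * so vfio_unmap_unpin() profits from batching contiguous
		 * chunks itself.  More than one page out: the IOMMU used
		 * an indivisible superpage (AMD-Vi style) and the manual
		 * chunk scan is redundant.
		 */
		return unmapped != PAGE_SIZE;
	}
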
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index d3204115f15d..2d67b8998fd8 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -26,6 +26,7 @@
  * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
  *         operations documented below
  * @mmap: Perform mmap(2) on a region of the device file descriptor
+ * @request: Request for the bus driver to release the device
  */
 struct vfio_device_ops {
 	char	*name;
@@ -38,6 +39,7 @@ struct vfio_device_ops {
 	long	(*ioctl)(void *device_data, unsigned int cmd,
 			 unsigned long arg);
 	int	(*mmap)(void *device_data, struct vm_area_struct *vma);
+	void	(*request)(void *device_data, unsigned int count);
 };
 
 extern int vfio_add_group_dev(struct device *dev,
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 29715d27548f..82889c30f4f5 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -333,6 +333,7 @@ enum {
 	VFIO_PCI_MSI_IRQ_INDEX,
 	VFIO_PCI_MSIX_IRQ_INDEX,
 	VFIO_PCI_ERR_IRQ_INDEX,
+	VFIO_PCI_REQ_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };
 
