author     Linus Torvalds <torvalds@linux-foundation.org>   2015-02-21 14:55:21 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-02-21 14:55:21 -0500
commit     c189cb8ef62832f33b6cf757350a0270532a1ad8 (patch)
tree       c8cd1f998f7ae98e0446a86a9efcc688841c289a /drivers
parent     a2a6937da0b95644008ede0eb309493d16cf2ac2 (diff)
parent     6140a8f5623820cec7f56c63444b9551d8d35775 (diff)
Merge tag 'vfio-v3.20-rc1' of git://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- IOMMU updates based on trace analysis
- VFIO device request interface
* tag 'vfio-v3.20-rc1' of git://github.com/awilliam/linux-vfio:
vfio-pci: Add device request interface
vfio-pci: Generalize setup of simple eventfds
vfio: Add and use device request op for vfio bus drivers
vfio: Tie IOMMU group reference to vfio group
vfio: Add device tracking during unbind
vfio/type1: Add conditional rescheduling
vfio/type1: Chunk contiguous reserved/invalid page mappings
vfio/type1: DMA unmap chunking
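
The centerpiece of this pull is the device request interface: when a device is unbound from vfio-pci while userspace still holds it open, the bus driver can now ask userspace to release it instead of blocking indefinitely in vfio_del_group_dev(). Userspace opts in by registering an eventfd for the new VFIO_PCI_REQ_IRQ_INDEX through VFIO_DEVICE_SET_IRQS and tearing the device down when that eventfd fires. A minimal sketch of the userspace side (the helper name is illustrative; a VFIO device fd is assumed to be open already, and error handling is omitted):

#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Register an eventfd as the device-request trigger for device_fd. */
static int arm_req_eventfd(int device_fd)
{
	char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
	struct vfio_irq_set *set = (struct vfio_irq_set *)buf;
	int32_t efd = eventfd(0, EFD_CLOEXEC);

	set->argsz = sizeof(buf);
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = VFIO_PCI_REQ_IRQ_INDEX;	/* added by this series */
	set->start = 0;
	set->count = 1;				/* vfio-pci accepts exactly one */
	memcpy(set->data, &efd, sizeof(efd));

	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
	return efd;	/* poll() this fd; on POLLIN, release and close the device */
}

Passing an fd of -1 with the same flags releases the trigger again, mirroring the existing err_trigger behavior shown in the vfio_pci_intrs.c diff below.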
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/vfio/pci/vfio_pci.c           21
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c     60
-rw-r--r--  drivers/vfio/pci/vfio_pci_private.h    1
-rw-r--r--  drivers/vfio/vfio.c                  119
-rw-r--r--  drivers/vfio/vfio_iommu_type1.c       80
5 files changed, 239 insertions(+), 42 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7cc0122a18ce..f8a186381ae8 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -239,9 +239,12 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
 
 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 		}
-	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
 		if (pci_is_pcie(vdev->pdev))
 			return 1;
+	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
+		return 1;
+	}
 
 	return 0;
 }
@@ -464,6 +467,7 @@ static long vfio_pci_ioctl(void *device_data,
 
 		switch (info.index) {
 		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+		case VFIO_PCI_REQ_IRQ_INDEX:
 			break;
 		case VFIO_PCI_ERR_IRQ_INDEX:
 			if (pci_is_pcie(vdev->pdev))
@@ -828,6 +832,20 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 			       req_len, vma->vm_page_prot);
 }
 
+static void vfio_pci_request(void *device_data, unsigned int count)
+{
+	struct vfio_pci_device *vdev = device_data;
+
+	mutex_lock(&vdev->igate);
+
+	if (vdev->req_trigger) {
+		dev_dbg(&vdev->pdev->dev, "Requesting device from user\n");
+		eventfd_signal(vdev->req_trigger, 1);
+	}
+
+	mutex_unlock(&vdev->igate);
+}
+
 static const struct vfio_device_ops vfio_pci_ops = {
 	.name		= "vfio-pci",
 	.open		= vfio_pci_open,
@@ -836,6 +854,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.read		= vfio_pci_read,
 	.write		= vfio_pci_write,
 	.mmap		= vfio_pci_mmap,
+	.request	= vfio_pci_request,
 };
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index e8d695b3f54e..f88bfdf5b6a0 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -763,46 +763,70 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
 	return 0;
 }
 
-static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
-				    unsigned index, unsigned start,
-				    unsigned count, uint32_t flags, void *data)
+static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+					   uint32_t flags, void *data)
 {
 	int32_t fd = *(int32_t *)data;
 
-	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
-	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
+	if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
 		return -EINVAL;
 
 	/* DATA_NONE/DATA_BOOL enables loopback testing */
 	if (flags & VFIO_IRQ_SET_DATA_NONE) {
-		if (vdev->err_trigger)
-			eventfd_signal(vdev->err_trigger, 1);
+		if (*ctx)
+			eventfd_signal(*ctx, 1);
 		return 0;
 	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
 		uint8_t trigger = *(uint8_t *)data;
-		if (trigger && vdev->err_trigger)
-			eventfd_signal(vdev->err_trigger, 1);
+		if (trigger && *ctx)
+			eventfd_signal(*ctx, 1);
 		return 0;
 	}
 
 	/* Handle SET_DATA_EVENTFD */
 	if (fd == -1) {
-		if (vdev->err_trigger)
-			eventfd_ctx_put(vdev->err_trigger);
-		vdev->err_trigger = NULL;
+		if (*ctx)
+			eventfd_ctx_put(*ctx);
+		*ctx = NULL;
 		return 0;
 	} else if (fd >= 0) {
 		struct eventfd_ctx *efdctx;
 		efdctx = eventfd_ctx_fdget(fd);
 		if (IS_ERR(efdctx))
 			return PTR_ERR(efdctx);
-		if (vdev->err_trigger)
-			eventfd_ctx_put(vdev->err_trigger);
-		vdev->err_trigger = efdctx;
+		if (*ctx)
+			eventfd_ctx_put(*ctx);
+		*ctx = efdctx;
 		return 0;
 	} else
 		return -EINVAL;
 }
+
+static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_ERR_IRQ_INDEX)
+		return -EINVAL;
+
+	/*
+	 * We should sanitize start & count, but that wasn't caught
+	 * originally, so this IRQ index must forever ignore them :-(
+	 */
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
+}
+
+static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1)
+		return -EINVAL;
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data);
+}
+
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			    unsigned index, unsigned start, unsigned count,
 			    void *data)
@@ -844,6 +868,12 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			func = vfio_pci_set_err_trigger;
 			break;
 		}
+	case VFIO_PCI_REQ_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			func = vfio_pci_set_req_trigger;
+			break;
+		}
 	}
 
 	if (!func)
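
The DATA_NONE and DATA_BOOL paths preserved in vfio_pci_set_ctx_trigger_single() above exist for loopback testing: userspace can fire a registered trigger without any device event. A short sketch, under the same assumptions as the earlier example, that test-fires the request eventfd registered above:

/* Loopback-fire the previously registered request trigger. */
static void test_fire_req_trigger(int device_fd)
{
	struct vfio_irq_set set = {
		.argsz = sizeof(set),
		.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
		.index = VFIO_PCI_REQ_IRQ_INDEX,
		.start = 0,
		.count = 1,
	};

	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &set);
	/* A subsequent read() of the eventfd should return a count of 1. */
}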
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 671c17a6e6d0..c9f9b323f152 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -58,6 +58,7 @@ struct vfio_pci_device {
 	struct pci_saved_state	*pci_saved_state;
 	int			refcnt;
 	struct eventfd_ctx	*err_trigger;
+	struct eventfd_ctx	*req_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index f018d8d0f975..4cde85501444 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -63,6 +63,11 @@ struct vfio_container {
 	void			*iommu_data;
 };
 
+struct vfio_unbound_dev {
+	struct device		*dev;
+	struct list_head	unbound_next;
+};
+
 struct vfio_group {
 	struct kref		kref;
 	int			minor;
@@ -75,6 +80,8 @@ struct vfio_group {
 	struct notifier_block	nb;
 	struct list_head	vfio_next;
 	struct list_head	container_next;
+	struct list_head	unbound_list;
+	struct mutex		unbound_lock;
 	atomic_t		opened;
 };
 
@@ -204,6 +211,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 	kref_init(&group->kref);
 	INIT_LIST_HEAD(&group->device_list);
 	mutex_init(&group->device_lock);
+	INIT_LIST_HEAD(&group->unbound_list);
+	mutex_init(&group->unbound_lock);
 	atomic_set(&group->container_users, 0);
 	atomic_set(&group->opened, 0);
 	group->iommu_group = iommu_group;
@@ -264,13 +273,22 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 static void vfio_group_release(struct kref *kref)
 {
 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
+	struct vfio_unbound_dev *unbound, *tmp;
+	struct iommu_group *iommu_group = group->iommu_group;
 
 	WARN_ON(!list_empty(&group->device_list));
 
+	list_for_each_entry_safe(unbound, tmp,
+				 &group->unbound_list, unbound_next) {
+		list_del(&unbound->unbound_next);
+		kfree(unbound);
+	}
+
 	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 	list_del(&group->vfio_next);
 	vfio_free_group_minor(group->minor);
 	vfio_group_unlock_and_free(group);
+	iommu_group_put(iommu_group);
 }
 
 static void vfio_group_put(struct vfio_group *group)
@@ -440,17 +458,36 @@ static bool vfio_whitelisted_driver(struct device_driver *drv)
 }
 
 /*
- * A vfio group is viable for use by userspace if all devices are either
- * driver-less or bound to a vfio or whitelisted driver.  We test the
- * latter by the existence of a struct vfio_device matching the dev.
+ * A vfio group is viable for use by userspace if all devices are in
+ * one of the following states:
+ *  - driver-less
+ *  - bound to a vfio driver
+ *  - bound to a whitelisted driver
+ *
+ * We use two methods to determine whether a device is bound to a vfio
+ * driver.  The first is to test whether the device exists in the vfio
+ * group.  The second is to test if the device exists on the group
+ * unbound_list, indicating it's in the middle of transitioning from
+ * a vfio driver to driver-less.
  */
 static int vfio_dev_viable(struct device *dev, void *data)
 {
 	struct vfio_group *group = data;
 	struct vfio_device *device;
 	struct device_driver *drv = ACCESS_ONCE(dev->driver);
+	struct vfio_unbound_dev *unbound;
+	int ret = -EINVAL;
 
-	if (!drv || vfio_whitelisted_driver(drv))
+	mutex_lock(&group->unbound_lock);
+	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
+		if (dev == unbound->dev) {
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&group->unbound_lock);
+
+	if (!ret || !drv || vfio_whitelisted_driver(drv))
 		return 0;
 
 	device = vfio_group_get_device(group, dev);
@@ -459,7 +496,7 @@ static int vfio_dev_viable(struct device *dev, void *data)
 		return 0;
 	}
 
-	return -EINVAL;
+	return ret;
 }
 
 /**
@@ -501,6 +538,7 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 {
 	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 	struct device *dev = data;
+	struct vfio_unbound_dev *unbound;
 
 	/*
 	 * Need to go through a group_lock lookup to get a reference or we
@@ -550,6 +588,17 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 		 * stop the system to maintain isolation.  At a minimum, we'd
 		 * want a toggle to disable driver auto probe for this device.
 		 */
+
+		mutex_lock(&group->unbound_lock);
+		list_for_each_entry(unbound,
+				    &group->unbound_list, unbound_next) {
+			if (dev == unbound->dev) {
+				list_del(&unbound->unbound_next);
+				kfree(unbound);
+				break;
+			}
+		}
+		mutex_unlock(&group->unbound_lock);
 		break;
 	}
 
@@ -578,6 +627,12 @@ int vfio_add_group_dev(struct device *dev,
 			iommu_group_put(iommu_group);
 			return PTR_ERR(group);
 		}
+	} else {
+		/*
+		 * A found vfio_group already holds a reference to the
+		 * iommu_group.  A created vfio_group keeps the reference.
+		 */
+		iommu_group_put(iommu_group);
 	}
 
 	device = vfio_group_get_device(group, dev);
@@ -586,21 +641,19 @@ int vfio_add_group_dev(struct device *dev,
 		     dev_name(dev), iommu_group_id(iommu_group));
 		vfio_device_put(device);
 		vfio_group_put(group);
-		iommu_group_put(iommu_group);
 		return -EBUSY;
 	}
 
 	device = vfio_group_create_device(group, dev, ops, device_data);
 	if (IS_ERR(device)) {
 		vfio_group_put(group);
-		iommu_group_put(iommu_group);
 		return PTR_ERR(device);
 	}
 
 	/*
-	 * Added device holds reference to iommu_group and vfio_device
-	 * (which in turn holds reference to vfio_group).  Drop extra
-	 * group reference used while acquiring device.
+	 * Drop all but the vfio_device reference.  The vfio_device holds
+	 * a reference to the vfio_group, which holds a reference to the
+	 * iommu_group.
 	 */
 	vfio_group_put(group);
 
@@ -655,8 +708,9 @@ void *vfio_del_group_dev(struct device *dev)
 {
 	struct vfio_device *device = dev_get_drvdata(dev);
 	struct vfio_group *group = device->group;
-	struct iommu_group *iommu_group = group->iommu_group;
 	void *device_data = device->device_data;
+	struct vfio_unbound_dev *unbound;
+	unsigned int i = 0;
 
 	/*
 	 * The group exists so long as we have a device reference.  Get
@@ -664,14 +718,49 @@ void *vfio_del_group_dev(struct device *dev)
 	 */
 	vfio_group_get(group);
 
+	/*
+	 * When the device is removed from the group, the group suddenly
+	 * becomes non-viable; the device has a driver (until the unbind
+	 * completes), but it's not present in the group.  This is bad news
+	 * for any external users that need to re-acquire a group reference
+	 * in order to match and release their existing reference.  To
+	 * solve this, we track such devices on the unbound_list to bridge
+	 * the gap until they're fully unbound.
+	 */
+	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
+	if (unbound) {
+		unbound->dev = dev;
+		mutex_lock(&group->unbound_lock);
+		list_add(&unbound->unbound_next, &group->unbound_list);
+		mutex_unlock(&group->unbound_lock);
+	}
+	WARN_ON(!unbound);
+
 	vfio_device_put(device);
 
-	/* TODO send a signal to encourage this to be released */
-	wait_event(vfio.release_q, !vfio_dev_present(group, dev));
+	/*
+	 * If the device is still present in the group after the above
+	 * 'put', then it is in use and we need to request it from the
+	 * bus driver.  The driver may in turn need to request the
+	 * device from the user.  We send the request on an arbitrary
+	 * interval with counter to allow the driver to take escalating
+	 * measures to release the device if it has the ability to do so.
+	 */
+	do {
+		device = vfio_group_get_device(group, dev);
+		if (!device)
+			break;
 
-	vfio_group_put(group);
+		if (device->ops->request)
+			device->ops->request(device_data, i++);
 
-	iommu_group_put(iommu_group);
+		vfio_device_put(device);
+
+	} while (wait_event_interruptible_timeout(vfio.release_q,
+						  !vfio_dev_present(group, dev),
+						  HZ * 10) <= 0);
+
+	vfio_group_put(group);
 
 	return device_data;
 }
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4a9d666f1e91..57d8c37a002b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -66,6 +66,7 @@ struct vfio_domain {
 	struct list_head	next;
 	struct list_head	group_list;
 	int			prot;	/* IOMMU_CACHE */
+	bool			fgsp;	/* Fine-grained super pages */
 };
 
 struct vfio_dma {
@@ -264,6 +265,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, i;
+	bool rsvd;
 
 	if (!current->mm)
 		return -ENODEV;
@@ -272,10 +274,9 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	if (ret)
 		return ret;
 
-	if (is_invalid_reserved_pfn(*pfn_base))
-		return 1;
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
-	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
 		put_pfn(*pfn_base, prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
@@ -283,7 +284,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	}
 
 	if (unlikely(disable_hugepages)) {
-		vfio_lock_acct(1);
+		if (!rsvd)
+			vfio_lock_acct(1);
 		return 1;
 	}
 
@@ -295,12 +297,14 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 		if (ret)
 			break;
 
-		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+		if (pfn != *pfn_base + i ||
+		    rsvd != is_invalid_reserved_pfn(pfn)) {
 			put_pfn(pfn, prot);
 			break;
 		}
 
-		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
+		if (!rsvd && !lock_cap &&
+		    current->mm->locked_vm + i + 1 > limit) {
 			put_pfn(pfn, prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
@@ -308,7 +312,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 		}
 	}
 
-	vfio_lock_acct(i);
+	if (!rsvd)
+		vfio_lock_acct(i);
 
 	return i;
 }
@@ -346,12 +351,14 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	domain = d = list_first_entry(&iommu->domain_list,
 				      struct vfio_domain, next);
 
-	list_for_each_entry_continue(d, &iommu->domain_list, next)
+	list_for_each_entry_continue(d, &iommu->domain_list, next) {
 		iommu_unmap(d->domain, dma->iova, dma->size);
+		cond_resched();
+	}
 
 	while (iova < end) {
-		size_t unmapped;
-		phys_addr_t phys;
+		size_t unmapped, len;
+		phys_addr_t phys, next;
 
 		phys = iommu_iova_to_phys(domain->domain, iova);
 		if (WARN_ON(!phys)) {
@@ -359,7 +366,19 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 			continue;
 		}
 
-		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
+		/*
+		 * To optimize for fewer iommu_unmap() calls, each of which
+		 * may require hardware cache flushing, try to find the
+		 * largest contiguous physical memory chunk to unmap.
+		 */
+		for (len = PAGE_SIZE;
+		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+			next = iommu_iova_to_phys(domain->domain, iova + len);
+			if (next != phys + len)
+				break;
+		}
+
+		unmapped = iommu_unmap(domain->domain, iova, len);
 		if (WARN_ON(!unmapped))
 			break;
 
@@ -367,6 +386,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 					     unmapped >> PAGE_SHIFT,
 					     dma->prot, false);
 		iova += unmapped;
+
+		cond_resched();
 	}
 
 	vfio_lock_acct(-unlocked);
@@ -511,6 +532,8 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 			    map_try_harder(d, iova, pfn, npage, prot))
 				goto unwind;
 		}
+
+		cond_resched();
 	}
 
 	return 0;
@@ -665,6 +688,39 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	return 0;
 }
 
+/*
+ * We change our unmap behavior slightly depending on whether the IOMMU
+ * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
+ * for practically any contiguous power-of-two mapping we give it.  This means
+ * we don't need to look for contiguous chunks ourselves to make unmapping
+ * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
+ * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
+ * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
+ * hugetlbfs is in use.
+ */
+static void vfio_test_domain_fgsp(struct vfio_domain *domain)
+{
+	struct page *pages;
+	int ret, order = get_order(PAGE_SIZE * 2);
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!pages)
+		return;
+
+	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
+			IOMMU_READ | IOMMU_WRITE | domain->prot);
+	if (!ret) {
+		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
+
+		if (unmapped == PAGE_SIZE)
+			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
+		else
+			domain->fgsp = true;
+	}
+
+	__free_pages(pages, order);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -758,6 +814,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	vfio_test_domain_fgsp(domain);
+
 	/* replay mappings on new domains */
 	ret = vfio_iommu_replay(iommu, domain);
 	if (ret)