author     Linus Torvalds <torvalds@linux-foundation.org>  2015-02-21 14:55:21 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-02-21 14:55:21 -0500
commit     c189cb8ef62832f33b6cf757350a0270532a1ad8 (patch)
tree       c8cd1f998f7ae98e0446a86a9efcc688841c289a /drivers
parent     a2a6937da0b95644008ede0eb309493d16cf2ac2 (diff)
parent     6140a8f5623820cec7f56c63444b9551d8d35775 (diff)
Merge tag 'vfio-v3.20-rc1' of git://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
 - IOMMU updates based on trace analysis
 - VFIO device request interface

* tag 'vfio-v3.20-rc1' of git://github.com/awilliam/linux-vfio:
  vfio-pci: Add device request interface
  vfio-pci: Generalize setup of simple eventfds
  vfio: Add and use device request op for vfio bus drivers
  vfio: Tie IOMMU group reference to vfio group
  vfio: Add device tracking during unbind
  vfio/type1: Add conditional rescheduling
  vfio/type1: Chunk contiguous reserved/invalid page mappings
  vfio/type1: DMA unmap chunking
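For context on the device request interface added below: userspace consumes it through the existing VFIO_DEVICE_SET_IRQS ioctl against the new VFIO_PCI_REQ_IRQ_INDEX. The following is only a minimal sketch, not part of this merge; it assumes an already-open VFIO device file descriptor, and the helper name is illustrative.

/*
 * Sketch only (not part of this merge): register an eventfd on the new
 * VFIO_PCI_REQ_IRQ_INDEX so vfio-pci can ask userspace to release the
 * device.  'device' is an already-open VFIO device file descriptor.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int register_req_eventfd(int device)
{
	struct vfio_irq_set *irq_set;
	size_t argsz = sizeof(*irq_set) + sizeof(int32_t);
	int32_t efd = eventfd(0, EFD_CLOEXEC);
	int ret;

	if (efd < 0)
		return -1;

	irq_set = calloc(1, argsz);
	if (!irq_set)
		return -1;

	irq_set->argsz = argsz;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
	irq_set->start = 0;		/* kernel requires start == 0 */
	irq_set->count = 1;		/* and count == 1 for this index */
	memcpy(&irq_set->data, &efd, sizeof(efd));

	ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
	free(irq_set);

	/* Poll/read efd; each signal is a request to release the device. */
	return ret ? -1 : efd;
}

When the eventfd fires, the user is expected to stop using the device and close it, which lets vfio_del_group_dev() complete instead of waiting indefinitely.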
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/vfio/pci/vfio_pci.c           21
-rw-r--r--  drivers/vfio/pci/vfio_pci_intrs.c     60
-rw-r--r--  drivers/vfio/pci/vfio_pci_private.h    1
-rw-r--r--  drivers/vfio/vfio.c                  119
-rw-r--r--  drivers/vfio/vfio_iommu_type1.c       80
5 files changed, 239 insertions, 42 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7cc0122a18ce..f8a186381ae8 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -239,9 +239,12 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
 
 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 		}
-	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
 		if (pci_is_pcie(vdev->pdev))
 			return 1;
+	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
+		return 1;
+	}
 
 	return 0;
 }
@@ -464,6 +467,7 @@ static long vfio_pci_ioctl(void *device_data,
 
 		switch (info.index) {
 		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+		case VFIO_PCI_REQ_IRQ_INDEX:
 			break;
 		case VFIO_PCI_ERR_IRQ_INDEX:
 			if (pci_is_pcie(vdev->pdev))
@@ -828,6 +832,20 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 			       req_len, vma->vm_page_prot);
 }
 
+static void vfio_pci_request(void *device_data, unsigned int count)
+{
+	struct vfio_pci_device *vdev = device_data;
+
+	mutex_lock(&vdev->igate);
+
+	if (vdev->req_trigger) {
+		dev_dbg(&vdev->pdev->dev, "Requesting device from user\n");
+		eventfd_signal(vdev->req_trigger, 1);
+	}
+
+	mutex_unlock(&vdev->igate);
+}
+
 static const struct vfio_device_ops vfio_pci_ops = {
 	.name		= "vfio-pci",
 	.open		= vfio_pci_open,
@@ -836,6 +854,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.read		= vfio_pci_read,
 	.write		= vfio_pci_write,
 	.mmap		= vfio_pci_mmap,
+	.request	= vfio_pci_request,
 };
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index e8d695b3f54e..f88bfdf5b6a0 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -763,46 +763,70 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
 	return 0;
 }
 
-static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
-				    unsigned index, unsigned start,
-				    unsigned count, uint32_t flags, void *data)
+static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
+					   uint32_t flags, void *data)
 {
 	int32_t fd = *(int32_t *)data;
 
-	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
-	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
+	if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
 		return -EINVAL;
 
 	/* DATA_NONE/DATA_BOOL enables loopback testing */
 	if (flags & VFIO_IRQ_SET_DATA_NONE) {
-		if (vdev->err_trigger)
-			eventfd_signal(vdev->err_trigger, 1);
+		if (*ctx)
+			eventfd_signal(*ctx, 1);
 		return 0;
 	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
 		uint8_t trigger = *(uint8_t *)data;
-		if (trigger && vdev->err_trigger)
-			eventfd_signal(vdev->err_trigger, 1);
+		if (trigger && *ctx)
+			eventfd_signal(*ctx, 1);
 		return 0;
 	}
 
 	/* Handle SET_DATA_EVENTFD */
 	if (fd == -1) {
-		if (vdev->err_trigger)
-			eventfd_ctx_put(vdev->err_trigger);
-		vdev->err_trigger = NULL;
+		if (*ctx)
+			eventfd_ctx_put(*ctx);
+		*ctx = NULL;
 		return 0;
 	} else if (fd >= 0) {
 		struct eventfd_ctx *efdctx;
 		efdctx = eventfd_ctx_fdget(fd);
 		if (IS_ERR(efdctx))
 			return PTR_ERR(efdctx);
-		if (vdev->err_trigger)
-			eventfd_ctx_put(vdev->err_trigger);
-		vdev->err_trigger = efdctx;
+		if (*ctx)
+			eventfd_ctx_put(*ctx);
+		*ctx = efdctx;
 		return 0;
 	} else
 		return -EINVAL;
 }
+
+static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_ERR_IRQ_INDEX)
+		return -EINVAL;
+
+	/*
+	 * We should sanitize start & count, but that wasn't caught
+	 * originally, so this IRQ index must forever ignore them :-(
+	 */
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
+}
+
+static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1)
+		return -EINVAL;
+
+	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data);
+}
+
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			    unsigned index, unsigned start, unsigned count,
 			    void *data)
@@ -844,6 +868,12 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			func = vfio_pci_set_err_trigger;
 			break;
 		}
+	case VFIO_PCI_REQ_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			func = vfio_pci_set_req_trigger;
+			break;
+		}
 	}
 
 	if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 671c17a6e6d0..c9f9b323f152 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -58,6 +58,7 @@ struct vfio_pci_device {
 	struct pci_saved_state	*pci_saved_state;
 	int			refcnt;
 	struct eventfd_ctx	*err_trigger;
+	struct eventfd_ctx	*req_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index f018d8d0f975..4cde85501444 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -63,6 +63,11 @@ struct vfio_container {
 	void			*iommu_data;
 };
 
+struct vfio_unbound_dev {
+	struct device		*dev;
+	struct list_head	unbound_next;
+};
+
 struct vfio_group {
 	struct kref		kref;
 	int			minor;
@@ -75,6 +80,8 @@ struct vfio_group {
 	struct notifier_block	nb;
 	struct list_head	vfio_next;
 	struct list_head	container_next;
+	struct list_head	unbound_list;
+	struct mutex		unbound_lock;
 	atomic_t		opened;
 };
 
@@ -204,6 +211,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 	kref_init(&group->kref);
 	INIT_LIST_HEAD(&group->device_list);
 	mutex_init(&group->device_lock);
+	INIT_LIST_HEAD(&group->unbound_list);
+	mutex_init(&group->unbound_lock);
 	atomic_set(&group->container_users, 0);
 	atomic_set(&group->opened, 0);
 	group->iommu_group = iommu_group;
@@ -264,13 +273,22 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 static void vfio_group_release(struct kref *kref)
 {
 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
+	struct vfio_unbound_dev *unbound, *tmp;
+	struct iommu_group *iommu_group = group->iommu_group;
 
 	WARN_ON(!list_empty(&group->device_list));
 
+	list_for_each_entry_safe(unbound, tmp,
+				 &group->unbound_list, unbound_next) {
+		list_del(&unbound->unbound_next);
+		kfree(unbound);
+	}
+
 	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 	list_del(&group->vfio_next);
 	vfio_free_group_minor(group->minor);
 	vfio_group_unlock_and_free(group);
+	iommu_group_put(iommu_group);
 }
 
 static void vfio_group_put(struct vfio_group *group)
@@ -440,17 +458,36 @@ static bool vfio_whitelisted_driver(struct device_driver *drv)
 }
 
 /*
- * A vfio group is viable for use by userspace if all devices are either
- * driver-less or bound to a vfio or whitelisted driver.  We test the
- * latter by the existence of a struct vfio_device matching the dev.
+ * A vfio group is viable for use by userspace if all devices are in
+ * one of the following states:
+ *  - driver-less
+ *  - bound to a vfio driver
+ *  - bound to a whitelisted driver
+ *
+ * We use two methods to determine whether a device is bound to a vfio
+ * driver.  The first is to test whether the device exists in the vfio
+ * group.  The second is to test if the device exists on the group
+ * unbound_list, indicating it's in the middle of transitioning from
+ * a vfio driver to driver-less.
  */
 static int vfio_dev_viable(struct device *dev, void *data)
 {
 	struct vfio_group *group = data;
 	struct vfio_device *device;
 	struct device_driver *drv = ACCESS_ONCE(dev->driver);
+	struct vfio_unbound_dev *unbound;
+	int ret = -EINVAL;
 
-	if (!drv || vfio_whitelisted_driver(drv))
+	mutex_lock(&group->unbound_lock);
+	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
+		if (dev == unbound->dev) {
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&group->unbound_lock);
+
+	if (!ret || !drv || vfio_whitelisted_driver(drv))
 		return 0;
 
 	device = vfio_group_get_device(group, dev);
@@ -459,7 +496,7 @@ static int vfio_dev_viable(struct device *dev, void *data)
 		return 0;
 	}
 
-	return -EINVAL;
+	return ret;
 }
 
 /**
@@ -501,6 +538,7 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 {
 	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 	struct device *dev = data;
+	struct vfio_unbound_dev *unbound;
 
 	/*
 	 * Need to go through a group_lock lookup to get a reference or we
@@ -550,6 +588,17 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
 		 * stop the system to maintain isolation.  At a minimum, we'd
 		 * want a toggle to disable driver auto probe for this device.
 		 */
+
+		mutex_lock(&group->unbound_lock);
+		list_for_each_entry(unbound,
+				    &group->unbound_list, unbound_next) {
+			if (dev == unbound->dev) {
+				list_del(&unbound->unbound_next);
+				kfree(unbound);
+				break;
+			}
+		}
+		mutex_unlock(&group->unbound_lock);
 		break;
 	}
 
@@ -578,6 +627,12 @@ int vfio_add_group_dev(struct device *dev,
 			iommu_group_put(iommu_group);
 			return PTR_ERR(group);
 		}
+	} else {
+		/*
+		 * A found vfio_group already holds a reference to the
+		 * iommu_group.  A created vfio_group keeps the reference.
+		 */
+		iommu_group_put(iommu_group);
 	}
 
 	device = vfio_group_get_device(group, dev);
@@ -586,21 +641,19 @@ int vfio_add_group_dev(struct device *dev,
 			dev_name(dev), iommu_group_id(iommu_group));
 		vfio_device_put(device);
 		vfio_group_put(group);
-		iommu_group_put(iommu_group);
 		return -EBUSY;
 	}
 
 	device = vfio_group_create_device(group, dev, ops, device_data);
 	if (IS_ERR(device)) {
 		vfio_group_put(group);
-		iommu_group_put(iommu_group);
 		return PTR_ERR(device);
 	}
 
 	/*
-	 * Added device holds reference to iommu_group and vfio_device
-	 * (which in turn holds reference to vfio_group).  Drop extra
-	 * group reference used while acquiring device.
+	 * Drop all but the vfio_device reference.  The vfio_device holds
+	 * a reference to the vfio_group, which holds a reference to the
+	 * iommu_group.
 	 */
 	vfio_group_put(group);
 
@@ -655,8 +708,9 @@ void *vfio_del_group_dev(struct device *dev)
 {
 	struct vfio_device *device = dev_get_drvdata(dev);
 	struct vfio_group *group = device->group;
-	struct iommu_group *iommu_group = group->iommu_group;
 	void *device_data = device->device_data;
+	struct vfio_unbound_dev *unbound;
+	unsigned int i = 0;
 
 	/*
 	 * The group exists so long as we have a device reference.  Get
@@ -664,14 +718,49 @@ void *vfio_del_group_dev(struct device *dev)
 	 */
 	vfio_group_get(group);
 
+	/*
+	 * When the device is removed from the group, the group suddenly
+	 * becomes non-viable; the device has a driver (until the unbind
+	 * completes), but it's not present in the group.  This is bad news
+	 * for any external users that need to re-acquire a group reference
+	 * in order to match and release their existing reference.  To
+	 * solve this, we track such devices on the unbound_list to bridge
+	 * the gap until they're fully unbound.
+	 */
+	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
+	if (unbound) {
+		unbound->dev = dev;
+		mutex_lock(&group->unbound_lock);
+		list_add(&unbound->unbound_next, &group->unbound_list);
+		mutex_unlock(&group->unbound_lock);
+	}
+	WARN_ON(!unbound);
+
 	vfio_device_put(device);
 
-	/* TODO send a signal to encourage this to be released */
-	wait_event(vfio.release_q, !vfio_dev_present(group, dev));
+	/*
+	 * If the device is still present in the group after the above
+	 * 'put', then it is in use and we need to request it from the
+	 * bus driver.  The driver may in turn need to request the
+	 * device from the user.  We send the request on an arbitrary
+	 * interval with counter to allow the driver to take escalating
+	 * measures to release the device if it has the ability to do so.
+	 */
+	do {
+		device = vfio_group_get_device(group, dev);
+		if (!device)
+			break;
 
-	vfio_group_put(group);
+		if (device->ops->request)
+			device->ops->request(device_data, i++);
 
-	iommu_group_put(iommu_group);
+		vfio_device_put(device);
+
+	} while (wait_event_interruptible_timeout(vfio.release_q,
+						  !vfio_dev_present(group, dev),
+						  HZ * 10) <= 0);
+
+	vfio_group_put(group);
 
 	return device_data;
 }
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4a9d666f1e91..57d8c37a002b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -66,6 +66,7 @@ struct vfio_domain {
 	struct list_head	next;
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
+	bool			fgsp;		/* Fine-grained super pages */
 };
 
 struct vfio_dma {
@@ -264,6 +265,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	bool lock_cap = capable(CAP_IPC_LOCK);
 	long ret, i;
+	bool rsvd;
 
 	if (!current->mm)
 		return -ENODEV;
@@ -272,10 +274,9 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	if (ret)
 		return ret;
 
-	if (is_invalid_reserved_pfn(*pfn_base))
-		return 1;
+	rsvd = is_invalid_reserved_pfn(*pfn_base);
 
-	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
 		put_pfn(*pfn_base, prot);
 		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 			limit << PAGE_SHIFT);
@@ -283,7 +284,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 	}
 
 	if (unlikely(disable_hugepages)) {
-		vfio_lock_acct(1);
+		if (!rsvd)
+			vfio_lock_acct(1);
 		return 1;
 	}
 
@@ -295,12 +297,14 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 		if (ret)
 			break;
 
-		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+		if (pfn != *pfn_base + i ||
+		    rsvd != is_invalid_reserved_pfn(pfn)) {
 			put_pfn(pfn, prot);
 			break;
 		}
 
-		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
+		if (!rsvd && !lock_cap &&
+		    current->mm->locked_vm + i + 1 > limit) {
 			put_pfn(pfn, prot);
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 				__func__, limit << PAGE_SHIFT);
@@ -308,7 +312,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
 		}
 	}
 
-	vfio_lock_acct(i);
+	if (!rsvd)
+		vfio_lock_acct(i);
 
 	return i;
 }
@@ -346,12 +351,14 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 	domain = d = list_first_entry(&iommu->domain_list,
 				      struct vfio_domain, next);
 
-	list_for_each_entry_continue(d, &iommu->domain_list, next)
+	list_for_each_entry_continue(d, &iommu->domain_list, next) {
 		iommu_unmap(d->domain, dma->iova, dma->size);
+		cond_resched();
+	}
 
 	while (iova < end) {
-		size_t unmapped;
-		phys_addr_t phys;
+		size_t unmapped, len;
+		phys_addr_t phys, next;
 
 		phys = iommu_iova_to_phys(domain->domain, iova);
 		if (WARN_ON(!phys)) {
@@ -359,7 +366,19 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 			continue;
 		}
 
-		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
+		/*
+		 * To optimize for fewer iommu_unmap() calls, each of which
+		 * may require hardware cache flushing, try to find the
+		 * largest contiguous physical memory chunk to unmap.
+		 */
+		for (len = PAGE_SIZE;
+		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+			next = iommu_iova_to_phys(domain->domain, iova + len);
+			if (next != phys + len)
+				break;
+		}
+
+		unmapped = iommu_unmap(domain->domain, iova, len);
 		if (WARN_ON(!unmapped))
 			break;
 
@@ -367,6 +386,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 					     unmapped >> PAGE_SHIFT,
 					     dma->prot, false);
 		iova += unmapped;
+
+		cond_resched();
 	}
 
 	vfio_lock_acct(-unlocked);
@@ -511,6 +532,8 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
 			    map_try_harder(d, iova, pfn, npage, prot))
 				goto unwind;
 		}
+
+		cond_resched();
 	}
 
 	return 0;
@@ -665,6 +688,39 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	return 0;
 }
 
+/*
+ * We change our unmap behavior slightly depending on whether the IOMMU
+ * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
+ * for practically any contiguous power-of-two mapping we give it.  This means
+ * we don't need to look for contiguous chunks ourselves to make unmapping
+ * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
+ * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
+ * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
+ * hugetlbfs is in use.
+ */
+static void vfio_test_domain_fgsp(struct vfio_domain *domain)
+{
+	struct page *pages;
+	int ret, order = get_order(PAGE_SIZE * 2);
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!pages)
+		return;
+
+	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
+			IOMMU_READ | IOMMU_WRITE | domain->prot);
+	if (!ret) {
+		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
+
+		if (unmapped == PAGE_SIZE)
+			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
+		else
+			domain->fgsp = true;
+	}
+
+	__free_pages(pages, order);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -758,6 +814,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	vfio_test_domain_fgsp(domain);
+
 	/* replay mappings on new domains */
 	ret = vfio_iommu_replay(iommu, domain);
 	if (ret)