path: root/drivers/vfio
author		Alex Williamson <alex.williamson@redhat.com>	2015-02-06 12:58:56 -0500
committer	Alex Williamson <alex.williamson@redhat.com>	2015-02-06 12:58:56 -0500
commit		6fe1010d6d9c02cf3556ab076585104551a6ee7e (patch)
tree		a4067ec65d2adef950cd233db2998c725b0a6905 /drivers/vfio
parent		e36f014edff70fc02b3d3d79cead1d58f289332e (diff)
vfio/type1: DMA unmap chunking
When unmapping DMA entries we try to rely on the IOMMU API behavior that
allows the IOMMU to unmap a larger area than requested, up to the size of
the original mapping.  This works great when the IOMMU supports superpages
*and* they're in use.  Otherwise, each PAGE_SIZE increment is unmapped
separately, resulting in poor performance.  Instead we can use the
IOVA-to-physical-address translation provided by the IOMMU API and unmap
using the largest contiguous physical memory chunk available, which is also
how vfio/type1 would have mapped the region.

For a synthetic 1TB guest VM mapping and shutdown test on Intel VT-d (2M
IOMMU pagesize support), this achieves about a 30% overall improvement
mapping standard 4K pages, regardless of IOMMU superpage enabling, and
about a 40% improvement mapping 2M hugetlbfs pages when IOMMU superpages
are not available.  Hugetlbfs with IOMMU superpages enabled is effectively
unchanged.

Unfortunately the same algorithm does not work well on IOMMUs with
fine-grained superpages, like AMD-Vi, costing about 25% extra since the
IOMMU will automatically unmap any power-of-two contiguous mapping we've
provided it.  We add a routine and a domain flag to detect this feature,
leaving AMD-Vi unaffected by this unmap optimization.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
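A rough restatement of the chunking idea described above, pulled out of its
surrounding bookkeeping (the helper name unmap_contiguous_chunk() is purely
illustrative; the real loop lives inline in vfio_unmap_unpin() in the diff
below and additionally skips the walk when domain->fgsp is set):

/*
 * Illustrative sketch only: walk forward from iova in PAGE_SIZE steps while
 * the IOMMU translation stays physically contiguous, then unmap the whole
 * run with a single iommu_unmap() call.  The patch below does the same
 * inline, and also bypasses the walk on fine-grained superpage IOMMUs.
 */
static size_t unmap_contiguous_chunk(struct iommu_domain *d,
				     dma_addr_t iova, dma_addr_t end)
{
	phys_addr_t phys = iommu_iova_to_phys(d, iova);
	size_t len;

	for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
		if (iommu_iova_to_phys(d, iova + len) != phys + len)
			break;		/* physical contiguity ends here */
	}

	return iommu_unmap(d, iova, len);
}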
Diffstat (limited to 'drivers/vfio')
-rw-r--r--	drivers/vfio/vfio_iommu_type1.c	54
1 file changed, 51 insertions(+), 3 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4a9d666f1e91..e6e7f155bdd9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -66,6 +66,7 @@ struct vfio_domain {
 	struct list_head	next;
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
+	bool			fgsp;		/* Fine-grained super pages */
 };
 
 struct vfio_dma {
@@ -350,8 +351,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		iommu_unmap(d->domain, dma->iova, dma->size);
 
 	while (iova < end) {
-		size_t unmapped;
-		phys_addr_t phys;
+		size_t unmapped, len;
+		phys_addr_t phys, next;
 
 		phys = iommu_iova_to_phys(domain->domain, iova);
 		if (WARN_ON(!phys)) {
@@ -359,7 +360,19 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 			continue;
 		}
 
-		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
+		/*
+		 * To optimize for fewer iommu_unmap() calls, each of which
+		 * may require hardware cache flushing, try to find the
+		 * largest contiguous physical memory chunk to unmap.
+		 */
+		for (len = PAGE_SIZE;
+		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+			next = iommu_iova_to_phys(domain->domain, iova + len);
+			if (next != phys + len)
+				break;
+		}
+
+		unmapped = iommu_unmap(domain->domain, iova, len);
 		if (WARN_ON(!unmapped))
 			break;
 
@@ -665,6 +678,39 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	return 0;
 }
 
+/*
+ * We change our unmap behavior slightly depending on whether the IOMMU
+ * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
+ * for practically any contiguous power-of-two mapping we give it.  This means
+ * we don't need to look for contiguous chunks ourselves to make unmapping
+ * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
+ * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
+ * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
+ * hugetlbfs is in use.
+ */
+static void vfio_test_domain_fgsp(struct vfio_domain *domain)
+{
+	struct page *pages;
+	int ret, order = get_order(PAGE_SIZE * 2);
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!pages)
+		return;
+
+	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
+			IOMMU_READ | IOMMU_WRITE | domain->prot);
+	if (!ret) {
+		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
+
+		if (unmapped == PAGE_SIZE)
+			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
+		else
+			domain->fgsp = true;
+	}
+
+	__free_pages(pages, order);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -758,6 +804,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	vfio_test_domain_fgsp(domain);
+
 	/* replay mappings on new domains */
 	ret = vfio_iommu_replay(iommu, domain);
 	if (ret)