path: root/drivers/vfio
author		Alex Williamson <alex.williamson@redhat.com>	2015-02-06 12:58:56 -0500
committer	Alex Williamson <alex.williamson@redhat.com>	2015-02-06 12:58:56 -0500
commit		6fe1010d6d9c02cf3556ab076585104551a6ee7e (patch)
tree		a4067ec65d2adef950cd233db2998c725b0a6905 /drivers/vfio
parent		e36f014edff70fc02b3d3d79cead1d58f289332e (diff)
vfio/type1: DMA unmap chunking
When unmapping DMA entries we try to rely on the IOMMU API behavior that
allows the IOMMU to unmap a larger area than requested, up to the size of
the original mapping.  This works great when the IOMMU supports superpages
*and* they're in use.  Otherwise, each PAGE_SIZE increment is unmapped
separately, resulting in poor performance.  Instead we can use the
IOVA-to-physical-address translation provided by the IOMMU API and unmap
using the largest contiguous physical memory chunk available, which is also
how vfio/type1 would have mapped the region.

For a synthetic 1TB guest VM mapping and shutdown test on Intel VT-d (2M
IOMMU pagesize support), this achieves about a 30% overall improvement
mapping standard 4K pages, regardless of IOMMU superpage enabling, and
about a 40% improvement mapping 2M hugetlbfs pages when IOMMU superpages
are not available.  Hugetlbfs with IOMMU superpages enabled is effectively
unchanged.

Unfortunately the same algorithm does not work well on IOMMUs with
fine-grained superpages, like AMD-Vi, costing about 25% extra since the
IOMMU will automatically unmap any power-of-two contiguous mapping we've
provided it.  We add a routine and a domain flag to detect this feature,
leaving AMD-Vi unaffected by this unmap optimization.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
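A rough restatement of the chunking idea described above, pulled out of its
surrounding bookkeeping (the helper name unmap_contiguous_chunk() is purely
illustrative; the real loop lives inline in vfio_unmap_unpin() in the diff
below and additionally skips the walk when domain->fgsp is set):

/*
 * Illustrative sketch only: walk forward from iova in PAGE_SIZE steps while
 * the IOMMU translation stays physically contiguous, then unmap the whole
 * run with a single iommu_unmap() call.  The patch below does the same
 * inline, and also bypasses the walk on fine-grained superpage IOMMUs.
 */
static size_t unmap_contiguous_chunk(struct iommu_domain *d,
				     dma_addr_t iova, dma_addr_t end)
{
	phys_addr_t phys = iommu_iova_to_phys(d, iova);
	size_t len;

	for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
		if (iommu_iova_to_phys(d, iova + len) != phys + len)
			break;		/* physical contiguity ends here */
	}

	return iommu_unmap(d, iova, len);
}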
Diffstat (limited to 'drivers/vfio')
-rw-r--r--	drivers/vfio/vfio_iommu_type1.c	54
1 file changed, 51 insertions(+), 3 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4a9d666f1e91..e6e7f155bdd9 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -66,6 +66,7 @@ struct vfio_domain {
 	struct list_head	next;
 	struct list_head	group_list;
 	int			prot;		/* IOMMU_CACHE */
+	bool			fgsp;		/* Fine-grained super pages */
 };
 
 struct vfio_dma {
@@ -350,8 +351,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 		iommu_unmap(d->domain, dma->iova, dma->size);
 
 	while (iova < end) {
-		size_t unmapped;
-		phys_addr_t phys;
+		size_t unmapped, len;
+		phys_addr_t phys, next;
 
 		phys = iommu_iova_to_phys(domain->domain, iova);
 		if (WARN_ON(!phys)) {
@@ -359,7 +360,19 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
 			continue;
 		}
 
-		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
+		/*
+		 * To optimize for fewer iommu_unmap() calls, each of which
+		 * may require hardware cache flushing, try to find the
+		 * largest contiguous physical memory chunk to unmap.
+		 */
+		for (len = PAGE_SIZE;
+		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+			next = iommu_iova_to_phys(domain->domain, iova + len);
+			if (next != phys + len)
+				break;
+		}
+
+		unmapped = iommu_unmap(domain->domain, iova, len);
 		if (WARN_ON(!unmapped))
 			break;
 
@@ -665,6 +678,39 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 	return 0;
 }
 
+/*
+ * We change our unmap behavior slightly depending on whether the IOMMU
+ * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
+ * for practically any contiguous power-of-two mapping we give it.  This means
+ * we don't need to look for contiguous chunks ourselves to make unmapping
+ * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
+ * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
+ * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
+ * hugetlbfs is in use.
+ */
+static void vfio_test_domain_fgsp(struct vfio_domain *domain)
+{
+	struct page *pages;
+	int ret, order = get_order(PAGE_SIZE * 2);
+
+	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!pages)
+		return;
+
+	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
+			IOMMU_READ | IOMMU_WRITE | domain->prot);
+	if (!ret) {
+		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
+
+		if (unmapped == PAGE_SIZE)
+			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
+		else
+			domain->fgsp = true;
+	}
+
+	__free_pages(pages, order);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -758,6 +804,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		}
 	}
 
+	vfio_test_domain_fgsp(domain);
+
 	/* replay mappings on new domains */
 	ret = vfio_iommu_replay(iommu, domain);
 	if (ret)