author		Youquan Song <youquan.song@intel.com>	2011-05-25 14:13:49 -0400
committer	David Woodhouse <David.Woodhouse@intel.com>	2011-06-01 07:26:35 -0400
commit		6dd9a7c73761a8a5f5475d5cfdc15368a0f4c06d
tree		cb685e370cc1cb2dec39b29500bdd22fd1814596 /drivers/pci/intel-iommu.c
parent		7b668357810ecb5fdda4418689d50f5d95aea6a8
intel-iommu: Enable super page (2MiB, 1GiB, etc.) support
There are no externally-visible changes with this. In the loop in the
internal __domain_mapping() function, we simply detect if we are mapping:
  - size >= 2MiB, and
  - virtual address aligned to 2MiB, and
  - physical address aligned to 2MiB, and
  - on hardware that supports superpages.

(and likewise for larger superpages).

We automatically use a superpage for such mappings. We never have to worry
about *breaking* superpages, since we trust that we will always *unmap* the
same range that was mapped. So all we need to do is ensure that
dma_pte_clear_range() will also cope with superpages.

Adjust pfn_to_dma_pte() to take a superpage 'level' as an argument, so it
can return a PTE at the appropriate level rather than always extending the
page tables all the way down to level 1. Again, this is simplified by the
fact that we should never encounter existing small pages when we're creating
a mapping; any old mapping that used the same virtual range will have been
entirely removed and its obsolete page tables freed.

Provide an 'intel_iommu=sp_off' argument on the command line as a chicken
bit. Not that it should ever be required.

==

The original commit seen in the iommu-2.6.git tree was Youquan's
implementation (and completion) of my own half-baked code which I'd typed
into an email. Followed by half a dozen subsequent 'fixes'.

I've taken the unusual step of rewriting history and collapsing the original
commits in order to keep the main history simpler, and make life easier for
the people who are going to have to backport this to older kernels. And also
so I can give it a more coherent commit comment which (hopefully) gives a
better explanation of what's going on.

The original sequence of commits leading to identical code was:

Youquan Song (3):
      intel-iommu: super page support
      intel-iommu: Fix superpage alignment calculation error
      intel-iommu: Fix superpage level calculation error in dma_pfn_level_pte()

David Woodhouse (4):
      intel-iommu: Precalculate superpage support for dmar_domain
      intel-iommu: Fix hardware_largepage_caps()
      intel-iommu: Fix inappropriate use of superpages in __domain_mapping()
      intel-iommu: Fix phys_pfn in __domain_mapping for sglist pages

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
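The check described above boils down to merging the IOVA and physical pfns and testing whether the low "stride" bits are clear. Below is a minimal, self-contained user-space sketch of that level-selection logic, mirroring the hardware_largepage_caps() helper added by this patch; the 9-bit stride constant and the example pfns are assumptions for illustration only.

/* Standalone sketch of the superpage level selection described above.
 * It mirrors hardware_largepage_caps() from the patch below; the 9-bit
 * stride (512 entries per page-table level, as on VT-d) is an assumption
 * for illustration. */
#include <stdio.h>

#define STRIDE_SHIFT	9
#define STRIDE_MASK	(~0UL << STRIDE_SHIFT)

/* Largest usable level for 'pages' 4KiB pages at IOVA pfn 'iov_pfn' backed
 * by physical pfn 'phys_pfn', given hardware support for 'support' superpage
 * levels (1 == 2MiB, 2 == 1GiB, ...).  Returns 1 for ordinary 4KiB pages. */
static int largest_level(unsigned long iov_pfn, unsigned long phys_pfn,
			 unsigned long pages, int support)
{
	unsigned long pfnmerge = iov_pfn | phys_pfn;	/* both must be aligned */
	int level = 1;

	while (support && !(pfnmerge & ~STRIDE_MASK)) {
		pages >>= STRIDE_SHIFT;
		if (!pages)		/* not enough pages left for a superpage */
			break;
		pfnmerge >>= STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}

int main(void)
{
	/* 2MiB-aligned IOVA and physical address, 4MiB mapping, HW caps 2MiB+1GiB */
	printf("%d\n", largest_level(0x200, 0x400, 1024, 2));	/* 2 -> 2MiB pages */
	/* an unaligned physical address forces ordinary 4KiB pages */
	printf("%d\n", largest_level(0x200, 0x401, 1024, 2));	/* 1 -> 4KiB pages */
	return 0;
}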
Diffstat (limited to 'drivers/pci/intel-iommu.c')
-rw-r--r--	drivers/pci/intel-iommu.c	157
1 file changed, 139 insertions(+), 18 deletions(-)
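The diff below also adds a small lvl_to_nr_pages() helper so that dma_pte_clear_range() can step over a superpage PTE in one go. Here is a short sketch of that stride arithmetic, assuming the 9-bit LEVEL_STRIDE used by this file.

/* Sketch only: with a 9-bit stride (an assumption matching LEVEL_STRIDE in
 * intel-iommu.c), a PTE at level 'lvl' covers 512^(lvl-1) 4KiB pages. */
#include <stdio.h>

#define LEVEL_STRIDE	9

static unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << ((lvl - 1) * LEVEL_STRIDE);
}

int main(void)
{
	printf("level 1: %lu page  (4KiB)\n", lvl_to_nr_pages(1));	/* 1 */
	printf("level 2: %lu pages (2MiB)\n", lvl_to_nr_pages(2));	/* 512 */
	printf("level 3: %lu pages (1GiB)\n", lvl_to_nr_pages(3));	/* 262144 */
	return 0;
}

This is why the clear-range loop in the patch advances start_pfn by lvl_to_nr_pages(large_page) rather than incrementing it by one.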
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 395f253c0494..e6fe1994f9d3 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -115,6 +115,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level)
 	return (pfn + level_size(level) - 1) & level_mask(level);
 }
 
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+	return 1 << ((lvl - 1) * LEVEL_STRIDE);
+}
+
 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
    are never going to work. */
 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
@@ -343,6 +348,9 @@ struct dmar_domain {
 	int		iommu_coherency;/* indicate coherency of iommu access */
 	int		iommu_snooping; /* indicate snooping control feature*/
 	int		iommu_count;	/* reference count of iommu */
+	int		iommu_superpage;/* Level of superpages supported:
+					   0 == 4KiB (no superpages), 1 == 2MiB,
+					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 	spinlock_t	iommu_lock;	/* protect iommu set in domain */
 	u64		max_addr;	/* maximum mapped address */
 };
@@ -392,6 +400,7 @@ int dmar_disabled = 1;
 static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -422,6 +431,10 @@ static int __init intel_iommu_setup(char *str)
 			printk(KERN_INFO
 				"Intel-IOMMU: disable batched IOTLB flush\n");
 			intel_iommu_strict = 1;
+		} else if (!strncmp(str, "sp_off", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable supported super page\n");
+			intel_iommu_superpage = 0;
 		}
 
 		str += strcspn(str, ",");
@@ -560,11 +573,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
 	}
 }
 
+static void domain_update_iommu_superpage(struct dmar_domain *domain)
+{
+	int i, mask = 0xf;
+
+	if (!intel_iommu_superpage) {
+		domain->iommu_superpage = 0;
+		return;
+	}
+
+	domain->iommu_superpage = 4; /* 1TiB */
+
+	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+		mask |= cap_super_page_val(g_iommus[i]->cap);
+		if (!mask) {
+			break;
+		}
+	}
+	domain->iommu_superpage = fls(mask);
+}
+
 /* Some capabilities may be different across iommus */
 static void domain_update_iommu_cap(struct dmar_domain *domain)
 {
 	domain_update_iommu_coherency(domain);
 	domain_update_iommu_snooping(domain);
+	domain_update_iommu_superpage(domain);
 }
 
 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
@@ -694,23 +728,31 @@ out:
 }
 
 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-				      unsigned long pfn)
+				      unsigned long pfn, int large_level)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	struct dma_pte *parent, *pte = NULL;
 	int level = agaw_to_level(domain->agaw);
-	int offset;
+	int offset, target_level;
 
 	BUG_ON(!domain->pgd);
 	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 	parent = domain->pgd;
 
+	/* Search pte */
+	if (!large_level)
+		target_level = 1;
+	else
+		target_level = large_level;
+
 	while (level > 0) {
 		void *tmp_page;
 
 		offset = pfn_level_offset(pfn, level);
 		pte = &parent[offset];
-		if (level == 1)
+		if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+			break;
+		if (level == target_level)
 			break;
 
 		if (!dma_pte_present(pte)) {
@@ -738,10 +780,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 	return pte;
 }
 
+
 /* return address's pte at specific level */
 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 					 unsigned long pfn,
-					 int level)
+					 int level, int *large_page)
 {
 	struct dma_pte *parent, *pte = NULL;
 	int total = agaw_to_level(domain->agaw);
@@ -754,8 +797,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 		if (level == total)
 			return pte;
 
-		if (!dma_pte_present(pte))
+		if (!dma_pte_present(pte)) {
+			*large_page = total;
 			break;
+		}
+
+		if (pte->val & DMA_PTE_LARGE_PAGE) {
+			*large_page = total;
+			return pte;
+		}
+
 		parent = phys_to_virt(dma_pte_addr(pte));
 		total--;
 	}
@@ -768,6 +819,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 				unsigned long last_pfn)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	unsigned int large_page = 1;
 	struct dma_pte *first_pte, *pte;
 
 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
@@ -776,14 +828,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 
 	/* we don't need lock here; nobody else touches the iova range */
 	do {
-		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
+		large_page = 1;
+		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 		if (!pte) {
-			start_pfn = align_to_level(start_pfn + 1, 2);
+			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 			continue;
 		}
 		do {
 			dma_clear_pte(pte);
-			start_pfn++;
+			start_pfn += lvl_to_nr_pages(large_page);
 			pte++;
 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 
@@ -803,6 +856,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
 	int total = agaw_to_level(domain->agaw);
 	int level;
 	unsigned long tmp;
+	int large_page = 2;
 
 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
@@ -818,7 +872,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
 		return;
 
 	do {
-		first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
+		large_page = level;
+		first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+		if (large_page > level)
+			level = large_page + 1;
 		if (!pte) {
 			tmp = align_to_level(tmp + 1, level + 1);
 			continue;
@@ -1402,6 +1459,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 	else
 		domain->iommu_snooping = 0;
 
+	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
 	domain->iommu_count = 1;
 	domain->nid = iommu->node;
 
@@ -1657,6 +1715,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr,
 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
 }
 
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+					  unsigned long iov_pfn,
+					  unsigned long phy_pfn,
+					  unsigned long pages)
+{
+	int support, level = 1;
+	unsigned long pfnmerge;
+
+	support = domain->iommu_superpage;
+
+	/* To use a large page, the virtual *and* physical addresses
+	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+	   of them will mean we have to use smaller pages. So just
+	   merge them and check both at once. */
+	pfnmerge = iov_pfn | phy_pfn;
+
+	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+		pages >>= VTD_STRIDE_SHIFT;
+		if (!pages)
+			break;
+		pfnmerge >>= VTD_STRIDE_SHIFT;
+		level++;
+		support--;
+	}
+	return level;
+}
+
 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			    struct scatterlist *sg, unsigned long phys_pfn,
 			    unsigned long nr_pages, int prot)
@@ -1665,6 +1751,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 	phys_addr_t uninitialized_var(pteval);
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	unsigned long sg_res;
+	unsigned int largepage_lvl = 0;
+	unsigned long lvl_pages = 0;
 
 	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
 
@@ -1680,7 +1768,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
 	}
 
-	while (nr_pages--) {
+	while (nr_pages > 0) {
 		uint64_t tmp;
 
 		if (!sg_res) {
@@ -1688,11 +1776,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
 			sg->dma_length = sg->length;
 			pteval = page_to_phys(sg_page(sg)) | prot;
+			phys_pfn = pteval >> VTD_PAGE_SHIFT;
 		}
+
 		if (!pte) {
-			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
+			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
 			if (!pte)
 				return -ENOMEM;
+			/* It is large page*/
+			if (largepage_lvl > 1)
+				pteval |= DMA_PTE_LARGE_PAGE;
+			else
+				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+
 		}
 		/* We don't need lock here, nobody else
 		 * touches the iova range
@@ -1708,16 +1806,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			}
 			WARN_ON(1);
 		}
+
+		lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+		BUG_ON(nr_pages < lvl_pages);
+		BUG_ON(sg_res < lvl_pages);
+
+		nr_pages -= lvl_pages;
+		iov_pfn += lvl_pages;
+		phys_pfn += lvl_pages;
+		pteval += lvl_pages * VTD_PAGE_SIZE;
+		sg_res -= lvl_pages;
+
+		/* If the next PTE would be the first in a new page, then we
+		   need to flush the cache on the entries we've just written.
+		   And then we'll need to recalculate 'pte', so clear it and
+		   let it get set again in the if (!pte) block above.
+
+		   If we're done (!nr_pages) we need to flush the cache too.
+
+		   Also if we've been setting superpages, we may need to
+		   recalculate 'pte' and switch back to smaller pages for the
+		   end of the mapping, if the trailing size is not enough to
+		   use another superpage (i.e. sg_res < lvl_pages). */
 		pte++;
-		if (!nr_pages || first_pte_in_page(pte)) {
+		if (!nr_pages || first_pte_in_page(pte) ||
+		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
 			domain_flush_cache(domain, first_pte,
 					   (void *)pte - (void *)first_pte);
 			pte = NULL;
 		}
-		iov_pfn++;
-		pteval += VTD_PAGE_SIZE;
-		sg_res--;
-		if (!sg_res)
+
+		if (!sg_res && nr_pages)
 			sg = sg_next(sg);
 	}
 	return 0;
@@ -3527,6 +3647,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 	domain->iommu_count = 0;
 	domain->iommu_coherency = 0;
 	domain->iommu_snooping = 0;
+	domain->iommu_superpage = 0;
 	domain->max_addr = 0;
 	domain->nid = -1;
 
@@ -3742,7 +3863,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 	struct dma_pte *pte;
 	u64 phys = 0;
 
-	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
+	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
 	if (pte)
 		phys = dma_pte_addr(pte);
 
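As a closing note, the 'sp_off' chicken bit is picked up by the same comma-separated option loop in intel_iommu_setup() that already handles the other intel_iommu= tokens. A hypothetical standalone sketch of that loop follows; the names parse_intel_iommu and superpage_enabled are illustrative, not the kernel's.

/* Hypothetical user-space sketch of the comma-separated option loop in
 * intel_iommu_setup(); only the new "sp_off" token is handled here. */
#include <stdio.h>
#include <string.h>

static int superpage_enabled = 1;	/* mirrors intel_iommu_superpage */

static void parse_intel_iommu(const char *str)
{
	while (*str) {
		if (!strncmp(str, "sp_off", 6)) {
			printf("Intel-IOMMU: disable supported super page\n");
			superpage_enabled = 0;
		}
		str += strcspn(str, ",");	/* skip to the next option */
		while (*str == ',')
			str++;
	}
}

int main(void)
{
	/* e.g. booting with intel_iommu=strict,sp_off */
	parse_intel_iommu("strict,sp_off");
	printf("superpage_enabled = %d\n", superpage_enabled);	/* 0 */
	return 0;
}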