author    Youquan Song <youquan.song@intel.com>          2011-05-25 14:13:49 -0400
committer David Woodhouse <David.Woodhouse@intel.com>    2011-06-01 07:26:35 -0400
commit    6dd9a7c73761a8a5f5475d5cfdc15368a0f4c06d
tree      cb685e370cc1cb2dec39b29500bdd22fd1814596
parent    7b668357810ecb5fdda4418689d50f5d95aea6a8
intel-iommu: Enable super page (2MiB, 1GiB, etc.) support
There are no externally-visible changes with this. In the loop in the
internal __domain_mapping() function, we simply detect if we are mapping:
 - size >= 2MiB, and
 - virtual address aligned to 2MiB, and
 - physical address aligned to 2MiB, and
 - on hardware that supports superpages.

(and likewise for larger superpages).

We automatically use a superpage for such mappings. We never have to worry
about *breaking* superpages, since we trust that we will always *unmap* the
same range that was mapped. So all we need to do is ensure that
dma_pte_clear_range() will also cope with superpages.

Adjust pfn_to_dma_pte() to take a superpage 'level' as an argument, so it
can return a PTE at the appropriate level rather than always extending the
page tables all the way down to level 1. Again, this is simplified by the
fact that we should never encounter existing small pages when we're
creating a mapping; any old mapping that used the same virtual range will
have been entirely removed and its obsolete page tables freed.

Provide an 'intel_iommu=sp_off' argument on the command line as a chicken
bit. Not that it should ever be required.

==

The original commit seen in the iommu-2.6.git was Youquan's implementation
(and completion) of my own half-baked code which I'd typed into an email.
Followed by half a dozen subsequent 'fixes'.

I've taken the unusual step of rewriting history and collapsing the
original commits in order to keep the main history simpler, and make life
easier for the people who are going to have to backport this to older
kernels. And also so I can give it a more coherent commit comment which
(hopefully) gives a better explanation of what's going on.

The original sequence of commits leading to identical code was:

Youquan Song (3):
      intel-iommu: super page support
      intel-iommu: Fix superpage alignment calculation error
      intel-iommu: Fix superpage level calculation error in dma_pfn_level_pte()

David Woodhouse (4):
      intel-iommu: Precalculate superpage support for dmar_domain
      intel-iommu: Fix hardware_largepage_caps()
      intel-iommu: Fix inappropriate use of superpages in __domain_mapping()
      intel-iommu: Fix phys_pfn in __domain_mapping for sglist pages

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
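The core of the change is the eligibility check: a superpage of a given
level can be used only if both the IOVA and the physical address are
aligned to that level and enough contiguous pages remain. Below is a
minimal standalone sketch of that calculation, mirroring the
hardware_largepage_caps() helper added further down in the patch; the
function name largest_level, the #include and the sample pfn values in
main() are illustrative assumptions, while VTD_STRIDE_SHIFT and
VTD_STRIDE_MASK match the constants this patch adds to dma_remapping.h.

/* Standalone sketch of the superpage eligibility check (cf.
 * hardware_largepage_caps() in the patch below).  largest_level() and
 * main() are illustrative only. */
#include <stdio.h>

#define VTD_STRIDE_SHIFT 9				/* 512 entries per level */
#define VTD_STRIDE_MASK  (~0UL << VTD_STRIDE_SHIFT)

/* Returns the largest usable level: 1 == 4KiB, 2 == 2MiB, 3 == 1GiB, ...
 * 'support' is the hardware capability (0 == no superpages, 1 == 2MiB, ...). */
static int largest_level(int support, unsigned long iov_pfn,
			 unsigned long phys_pfn, unsigned long pages)
{
	/* Both the virtual and the physical pfn must be aligned; OR-ing
	 * them lets a single mask test check both at once. */
	unsigned long pfnmerge = iov_pfn | phys_pfn;
	int level = 1;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;	/* mapping must still span >= 512 entries */
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}

int main(void)
{
	/* 2MiB-aligned IOVA and phys, 1024 pages, 2MiB hardware support -> 2 */
	printf("%d\n", largest_level(1, 0x200, 0x400, 1024));
	/* Misaligned physical address forces 4KiB pages -> 1 */
	printf("%d\n", largest_level(1, 0x200, 0x401, 1024));
	return 0;
}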
-rw-r--r--  Documentation/kernel-parameters.txt |   5
-rw-r--r--  drivers/pci/intel-iommu.c           | 157
-rw-r--r--  include/linux/dma_remapping.h       |   4
3 files changed, 147 insertions(+), 19 deletions(-)
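For orientation before reading the diff: the page-table 'level' numbering
used by lvl_to_nr_pages(), pfn_to_dma_pte() and dma_pte_clear_range() below
starts at 1 for ordinary 4KiB PTEs, and each further level covers 512 times
as many pages. A small illustration of that arithmetic, assuming
LEVEL_STRIDE is 9 (512 entries per page-table level) as elsewhere in the
driver; this demo program is not part of the patch.

/* Illustration only: the level-to-size arithmetic behind lvl_to_nr_pages().
 * Assumes LEVEL_STRIDE is 9 and 4KiB base pages, as in the driver. */
#include <stdio.h>

#define LEVEL_STRIDE 9

/* 4KiB pages covered by one PTE at a given level:
 * level 1 = 1 page (4KiB), level 2 = 512 (2MiB), level 3 = 1GiB, level 4 = 512GiB */
static unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << ((lvl - 1) * LEVEL_STRIDE);
}

int main(void)
{
	unsigned int lvl;

	for (lvl = 1; lvl <= 4; lvl++)
		printf("level %u: %lu pages (%llu KiB)\n", lvl,
		       lvl_to_nr_pages(lvl),
		       (unsigned long long)lvl_to_nr_pages(lvl) * 4);
	return 0;
}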
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cc85a9278190..d005487c1a22 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -999,7 +999,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			With this option on every unmap_single operation will
 			result in a hardware IOTLB flush operation as opposed
 			to batching them for performance.
-
+		sp_off	[Default Off]
+			By default, super page will be supported if Intel IOMMU
+			has the capability. With this option, super page will
+			not be supported.
 	intremap=	[X86-64, Intel-IOMMU]
 			Format: { on (default) | off | nosid }
 			on	enable Interrupt Remapping (default)
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 395f253c0494..e6fe1994f9d3 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -115,6 +115,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level)
 	return (pfn + level_size(level) - 1) & level_mask(level);
 }
 
+static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+{
+	return 1 << ((lvl - 1) * LEVEL_STRIDE);
+}
+
 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
    are never going to work. */
 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
@@ -343,6 +348,9 @@ struct dmar_domain {
 	int		iommu_coherency;/* indicate coherency of iommu access */
 	int		iommu_snooping; /* indicate snooping control feature*/
 	int		iommu_count;	/* reference count of iommu */
+	int		iommu_superpage;/* Level of superpages supported:
+					   0 == 4KiB (no superpages), 1 == 2MiB,
+					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 	spinlock_t	iommu_lock;	/* protect iommu set in domain */
 	u64		max_addr;	/* maximum mapped address */
 };
@@ -392,6 +400,7 @@ int dmar_disabled = 1;
 static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
+static int intel_iommu_superpage = 1;
 
 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 static DEFINE_SPINLOCK(device_domain_lock);
@@ -422,6 +431,10 @@ static int __init intel_iommu_setup(char *str)
 			printk(KERN_INFO
 				"Intel-IOMMU: disable batched IOTLB flush\n");
 			intel_iommu_strict = 1;
+		} else if (!strncmp(str, "sp_off", 6)) {
+			printk(KERN_INFO
+				"Intel-IOMMU: disable supported super page\n");
+			intel_iommu_superpage = 0;
 		}
 
 		str += strcspn(str, ",");
@@ -560,11 +573,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain)
 	}
 }
 
+static void domain_update_iommu_superpage(struct dmar_domain *domain)
+{
+	int i, mask = 0xf;
+
+	if (!intel_iommu_superpage) {
+		domain->iommu_superpage = 0;
+		return;
+	}
+
+	domain->iommu_superpage = 4; /* 1TiB */
+
+	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+		mask |= cap_super_page_val(g_iommus[i]->cap);
+		if (!mask) {
+			break;
+		}
+	}
+	domain->iommu_superpage = fls(mask);
+}
+
 /* Some capabilities may be different across iommus */
 static void domain_update_iommu_cap(struct dmar_domain *domain)
 {
 	domain_update_iommu_coherency(domain);
 	domain_update_iommu_snooping(domain);
+	domain_update_iommu_superpage(domain);
 }
 
 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
@@ -694,23 +728,31 @@ out:
 }
 
 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-				      unsigned long pfn)
+				      unsigned long pfn, int large_level)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	struct dma_pte *parent, *pte = NULL;
 	int level = agaw_to_level(domain->agaw);
-	int offset;
+	int offset, target_level;
 
 	BUG_ON(!domain->pgd);
 	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 	parent = domain->pgd;
 
+	/* Search pte */
+	if (!large_level)
+		target_level = 1;
+	else
+		target_level = large_level;
+
 	while (level > 0) {
 		void *tmp_page;
 
 		offset = pfn_level_offset(pfn, level);
 		pte = &parent[offset];
-		if (level == 1)
+		if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+			break;
+		if (level == target_level)
 			break;
 
 		if (!dma_pte_present(pte)) {
@@ -738,10 +780,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 	return pte;
 }
 
+
 /* return address's pte at specific level */
 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 					 unsigned long pfn,
-					 int level)
+					 int level, int *large_page)
 {
 	struct dma_pte *parent, *pte = NULL;
 	int total = agaw_to_level(domain->agaw);
@@ -754,8 +797,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 		if (level == total)
 			return pte;
 
-		if (!dma_pte_present(pte))
+		if (!dma_pte_present(pte)) {
+			*large_page = total;
 			break;
+		}
+
+		if (pte->val & DMA_PTE_LARGE_PAGE) {
+			*large_page = total;
+			return pte;
+		}
+
 		parent = phys_to_virt(dma_pte_addr(pte));
 		total--;
 	}
@@ -768,6 +819,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 				unsigned long last_pfn)
 {
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+	unsigned int large_page = 1;
 	struct dma_pte *first_pte, *pte;
 
 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
@@ -776,14 +828,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 
 	/* we don't need lock here; nobody else touches the iova range */
 	do {
-		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
+		large_page = 1;
+		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 		if (!pte) {
-			start_pfn = align_to_level(start_pfn + 1, 2);
+			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 			continue;
 		}
 		do {
 			dma_clear_pte(pte);
-			start_pfn++;
+			start_pfn += lvl_to_nr_pages(large_page);
 			pte++;
 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 
@@ -803,6 +856,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
 	int total = agaw_to_level(domain->agaw);
 	int level;
 	unsigned long tmp;
+	int large_page = 2;
 
 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
@@ -818,7 +872,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
 		return;
 
 	do {
-		first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
+		large_page = level;
+		first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+		if (large_page > level)
+			level = large_page + 1;
 		if (!pte) {
 			tmp = align_to_level(tmp + 1, level + 1);
 			continue;
@@ -1402,6 +1459,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 	else
 		domain->iommu_snooping = 0;
 
+	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
 	domain->iommu_count = 1;
 	domain->nid = iommu->node;
 
@@ -1657,6 +1715,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr,
 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
 }
 
+/* Return largest possible superpage level for a given mapping */
+static inline int hardware_largepage_caps(struct dmar_domain *domain,
+					  unsigned long iov_pfn,
+					  unsigned long phy_pfn,
+					  unsigned long pages)
+{
+	int support, level = 1;
+	unsigned long pfnmerge;
+
+	support = domain->iommu_superpage;
+
+	/* To use a large page, the virtual *and* physical addresses
+	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+	   of them will mean we have to use smaller pages. So just
+	   merge them and check both at once. */
+	pfnmerge = iov_pfn | phy_pfn;
+
+	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+		pages >>= VTD_STRIDE_SHIFT;
+		if (!pages)
+			break;
+		pfnmerge >>= VTD_STRIDE_SHIFT;
+		level++;
+		support--;
+	}
+	return level;
+}
+
 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			    struct scatterlist *sg, unsigned long phys_pfn,
 			    unsigned long nr_pages, int prot)
@@ -1665,6 +1751,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 	phys_addr_t uninitialized_var(pteval);
 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 	unsigned long sg_res;
+	unsigned int largepage_lvl = 0;
+	unsigned long lvl_pages = 0;
 
 	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
 
@@ -1680,7 +1768,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
 	}
 
-	while (nr_pages--) {
+	while (nr_pages > 0) {
 		uint64_t tmp;
 
 		if (!sg_res) {
@@ -1688,11 +1776,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
 			sg->dma_length = sg->length;
 			pteval = page_to_phys(sg_page(sg)) | prot;
+			phys_pfn = pteval >> VTD_PAGE_SHIFT;
 		}
+
 		if (!pte) {
-			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
+			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+
+			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
 			if (!pte)
 				return -ENOMEM;
+			/* It is large page*/
+			if (largepage_lvl > 1)
+				pteval |= DMA_PTE_LARGE_PAGE;
+			else
+				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
+
 		}
 		/* We don't need lock here, nobody else
 		 * touches the iova range
@@ -1708,16 +1806,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			}
 			WARN_ON(1);
 		}
+
+		lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+		BUG_ON(nr_pages < lvl_pages);
+		BUG_ON(sg_res < lvl_pages);
+
+		nr_pages -= lvl_pages;
+		iov_pfn += lvl_pages;
+		phys_pfn += lvl_pages;
+		pteval += lvl_pages * VTD_PAGE_SIZE;
+		sg_res -= lvl_pages;
+
+		/* If the next PTE would be the first in a new page, then we
+		   need to flush the cache on the entries we've just written.
+		   And then we'll need to recalculate 'pte', so clear it and
+		   let it get set again in the if (!pte) block above.
+
+		   If we're done (!nr_pages) we need to flush the cache too.
+
+		   Also if we've been setting superpages, we may need to
+		   recalculate 'pte' and switch back to smaller pages for the
+		   end of the mapping, if the trailing size is not enough to
+		   use another superpage (i.e. sg_res < lvl_pages). */
 		pte++;
-		if (!nr_pages || first_pte_in_page(pte)) {
+		if (!nr_pages || first_pte_in_page(pte) ||
+		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
 			domain_flush_cache(domain, first_pte,
 					   (void *)pte - (void *)first_pte);
 			pte = NULL;
 		}
-		iov_pfn++;
-		pteval += VTD_PAGE_SIZE;
-		sg_res--;
-		if (!sg_res)
+
+		if (!sg_res && nr_pages)
 			sg = sg_next(sg);
 	}
 	return 0;
@@ -3527,6 +3647,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
 	domain->iommu_count = 0;
 	domain->iommu_coherency = 0;
 	domain->iommu_snooping = 0;
+	domain->iommu_superpage = 0;
 	domain->max_addr = 0;
 	domain->nid = -1;
 
@@ -3742,7 +3863,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 	struct dma_pte *pte;
 	u64 phys = 0;
 
-	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
+	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
 	if (pte)
 		phys = dma_pte_addr(pte);
 
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 5619f8522738..bbd8661b3473 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -9,8 +9,12 @@
 #define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
 #define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
 
+#define VTD_STRIDE_SHIFT        (9)
+#define VTD_STRIDE_MASK         (((u64)-1) << VTD_STRIDE_SHIFT)
+
 #define DMA_PTE_READ (1)
 #define DMA_PTE_WRITE (2)
+#define DMA_PTE_LARGE_PAGE (1 << 7)
 #define DMA_PTE_SNP (1 << 11)
 
 #define CONTEXT_TT_MULTI_LEVEL 0