author     Alex Williamson <alex.williamson@redhat.com>   2013-06-15 12:27:19 -0400
committer  Joerg Roedel <joro@8bytes.org>                 2013-08-14 16:21:04 -0400
commit     3269ee0bd6686baf86630300d528500ac5b516d7
tree       de651f5e9631b3258ff01b3fc97d91b260d29a17  /drivers/iommu
parent     d4e4ab86bcba5a72779c43dc1459f71fea3d89c8
intel-iommu: Fix leaks in pagetable freeing
At best, the current code only seems to free the leaf pagetables and
the root. If you're unlucky enough to have a large gap (like any
QEMU guest with more than 3G of memory), only the first chunk of leaf
pagetables is freed (plus the root). This is a massive memory leak.

This patch rewrites the pagetable freeing function to use a
recursive algorithm; it not only frees all of the pagetables, but
does so without any apparent performance loss versus the current
broken version.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Cc: stable@vger.kernel.org
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Joerg Roedel <joro@8bytes.org>
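To make the recursive approach easier to follow outside the kernel, here is a minimal, self-contained sketch in plain C. It models a simplified three-level software page table with hypothetical stand-ins (struct table, free_level, entry_span), not the VT-d dma_pte structures or helpers from the patch; it only demonstrates the pattern used by dma_pte_free_level() below: recurse into each present child first, then free any table whose entire span is covered by the requested range.

/*
 * Minimal sketch of recursive pagetable freeing on a simplified
 * three-level software table.  The types and helpers here are
 * hypothetical stand-ins, not the kernel's dma_pte/dmar_domain APIs.
 */
#include <stdio.h>
#include <stdlib.h>

#define SLOTS 512UL                     /* entries per table page */

struct table {
        struct table *child[SLOTS];     /* NULL when the entry is not present */
};

/* Number of leaf slots covered by one entry of a table at this level. */
static unsigned long entry_span(int level)
{
        unsigned long s = 1;

        while (--level > 0)
                s *= SLOTS;
        return s;
}

/*
 * Visit children first, then free any child table whose whole range
 * [entry_base, entry_base + span - 1] lies inside [start, last] --
 * the same shape as dma_pte_free_level() in the patch below.
 */
static void free_level(struct table *t, int level, unsigned long base,
                       unsigned long start, unsigned long last)
{
        unsigned long span = entry_span(level);

        for (unsigned long i = 0; i < SLOTS; i++) {
                unsigned long entry_base = base + i * span;
                struct table *child = t->child[i];

                if (!child)
                        continue;

                if (level > 2)          /* descend through intermediate levels */
                        free_level(child, level - 1, entry_base, start, last);

                if (start <= entry_base && entry_base + span - 1 <= last) {
                        free(child);
                        t->child[i] = NULL;
                }
        }
}

int main(void)
{
        /* One populated branch: root (level 3) -> mid (level 2) -> leaf table. */
        struct table *root = calloc(1, sizeof(*root));
        root->child[0] = calloc(1, sizeof(*root));
        root->child[0]->child[0] = calloc(1, sizeof(*root));

        /* Free everything below the root for leaves [0, SLOTS*SLOTS - 1]. */
        free_level(root, 3, 0, 0, SLOTS * SLOTS - 1);
        printf("mid-level table freed: %s\n", root->child[0] ? "no" : "yes");

        free(root);
        return 0;
}

Because a table is only freed when its full span falls inside the range, a gap in the middle of the address space leaves the tables that straddle the gap in place, while everything fully covered is released on the way back up the recursion.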
Diffstat (limited to 'drivers/iommu')
 -rw-r--r--  drivers/iommu/intel-iommu.c | 72
 1 file changed, 35 insertions(+), 37 deletions(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index eec0d3e04bf5..15e9b57e9cf0 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -890,56 +890,54 @@ static int dma_pte_clear_range(struct dmar_domain *domain,
         return order;
 }
 
+static void dma_pte_free_level(struct dmar_domain *domain, int level,
+                               struct dma_pte *pte, unsigned long pfn,
+                               unsigned long start_pfn, unsigned long last_pfn)
+{
+        pfn = max(start_pfn, pfn);
+        pte = &pte[pfn_level_offset(pfn, level)];
+
+        do {
+                unsigned long level_pfn;
+                struct dma_pte *level_pte;
+
+                if (!dma_pte_present(pte) || dma_pte_superpage(pte))
+                        goto next;
+
+                level_pfn = pfn & level_mask(level - 1);
+                level_pte = phys_to_virt(dma_pte_addr(pte));
+
+                if (level > 2)
+                        dma_pte_free_level(domain, level - 1, level_pte,
+                                           level_pfn, start_pfn, last_pfn);
+
+                /* If range covers entire pagetable, free it */
+                if (!(start_pfn > level_pfn ||
+                      last_pfn < level_pfn + level_size(level))) {
+                        dma_clear_pte(pte);
+                        domain_flush_cache(domain, pte, sizeof(*pte));
+                        free_pgtable_page(level_pte);
+                }
+next:
+                pfn += level_size(level);
+        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+}
+
 /* free page table pages. last level pte should already be cleared */
 static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                    unsigned long start_pfn,
                                    unsigned long last_pfn)
 {
         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
-        struct dma_pte *first_pte, *pte;
-        int total = agaw_to_level(domain->agaw);
-        int level;
-        unsigned long tmp;
-        int large_page = 2;
 
         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
         BUG_ON(start_pfn > last_pfn);
 
         /* We don't need lock here; nobody else touches the iova range */
-        level = 2;
-        while (level <= total) {
-                tmp = align_to_level(start_pfn, level);
-
-                /* If we can't even clear one PTE at this level, we're done */
-                if (tmp + level_size(level) - 1 > last_pfn)
-                        return;
-
-                do {
-                        large_page = level;
-                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
-                        if (large_page > level)
-                                level = large_page + 1;
-                        if (!pte) {
-                                tmp = align_to_level(tmp + 1, level + 1);
-                                continue;
-                        }
-                        do {
-                                if (dma_pte_present(pte)) {
-                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
-                                        dma_clear_pte(pte);
-                                }
-                                pte++;
-                                tmp += level_size(level);
-                        } while (!first_pte_in_page(pte) &&
-                                 tmp + level_size(level) - 1 <= last_pfn);
+        dma_pte_free_level(domain, agaw_to_level(domain->agaw),
+                           domain->pgd, 0, start_pfn, last_pfn);
 
-                        domain_flush_cache(domain, first_pte,
-                                           (void *)pte - (void *)first_pte);
-
-                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
-                level++;
-        }
         /* free pgd */
         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                 free_pgtable_page(domain->pgd);