author    Gerald Schaefer <gerald.schaefer@de.ibm.com>  2016-04-28 19:18:35 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-04-28 22:34:04 -0400
commit    28093f9f34cedeaea0f481c58446d9dac6dd620f (patch)
tree      da930385a854ae9cb95239b84fa9bac9b677c812
parent    3486b85a29c1741db99d0c522211c82d2b7a56d0 (diff)
numa: fix /proc/<pid>/numa_maps for THP
In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the
layouts may differ depending on the architecture.  On s390 this will lead to
inaccurate numa_maps accounting in /proc because of misguided pte_present()
and pte_dirty() checks on the fake pte.

On other architectures pte_present() and pte_dirty() may work by chance, but
there may be an issue with direct-access (dax) mappings w/o underlying struct
pages when HAVE_PTE_SPECIAL is set and THP is available.  In vm_normal_page()
the fake pte will be checked with pte_special() and because there is no
"special" bit in a pmd, this will always return false and the VM_PFNMAP |
VM_MIXEDMAP checking will be skipped.  On dax mappings w/o struct pages, an
invalid struct page pointer would then be returned that can crash the kernel.

This patch fixes the numa_maps THP handling by introducing new "_pmd" variants
of the can_gather_numa_stats() and vm_normal_page() functions.

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org>	[4.3+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
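[ For illustration only, not part of the commit: a minimal userspace sketch
  that maps a THP-eligible anonymous region, dirties it, and dumps
  /proc/self/numa_maps.  On an affected architecture such as s390 the per-node
  and dirty page counts reported for that VMA could be wrong before this fix.
  The 4 MB size and the MADV_HUGEPAGE hint are assumptions chosen for the
  example, not taken from the patch. ]

  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
  	size_t len = 4UL << 20;			/* 4 MB, enough for a THP */
  	char buf[4096];
  	FILE *f;
  	char *p;

  	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
  		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  	if (p == MAP_FAILED)
  		return 1;
  	madvise(p, len, MADV_HUGEPAGE);		/* hint: back with THP */
  	memset(p, 0xff, len);			/* fault in and dirty the range */

  	f = fopen("/proc/self/numa_maps", "r");	/* one line per VMA */
  	if (!f)
  		return 1;
  	while (fgets(buf, sizeof(buf), f))
  		fputs(buf, stdout);
  	fclose(f);
  	return 0;
  }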
-rw-r--r--	fs/proc/task_mmu.c	33
-rw-r--r--	include/linux/mm.h	2
-rw-r--r--	mm/memory.c	40
3 files changed, 72 insertions, 3 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 229cb546bee0..541583510cfb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1518,6 +1518,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 	return page;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+		struct vm_area_struct *vma,
+		unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pmd_present(pmd))
+		return NULL;
+
+	page = vm_normal_page_pmd(vma, addr, pmd);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+#endif
+
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
@@ -1527,14 +1553,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte;
 	pte_t *pte;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
-		page = can_gather_numa_stats(huge_pte, vma, addr);
+		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pte_dirty(huge_pte),
+			gather_stats(page, md, pmd_dirty(*pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
@@ -1542,6 +1568,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
+#endif
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, vma, addr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 79b6c18d0a38..864d7221de84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1140,6 +1140,8 @@ struct zap_details {
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd);
 
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
diff --git a/mm/memory.c b/mm/memory.c
index 93897f23cc11..305537fc8640 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -789,6 +789,46 @@ out:
 	return pfn_to_page(pfn);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	/*
+	 * There is no pmd_special() but there may be special pmds, e.g.
+	 * in a direct-access (dax) mapping, so let's just replicate the
+	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 */
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
+	}
+
+	if (is_zero_pfn(pfn))
+		return NULL;
+	if (unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page tables.
+	 * eg. VDSO mappings can cause them to exist.
+	 */
+out:
+	return pfn_to_page(pfn);
+}
+#endif
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
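[ Usage note, sketch only: the new helper is meant to be called on a stable
  huge pmd, i.e. under the ptl returned by pmd_trans_huge_lock(), exactly as
  the gather_pte_stats() hunk above does.  A condensed version of that calling
  pattern follows; record_stats() is a hypothetical stand-in for whatever
  per-page accounting the caller performs, everything else is taken from the
  patch. ]

  /* Sketch of the expected calling pattern (not part of the patch). */
  static void walk_huge_pmd(pmd_t *pmd, unsigned long addr,
  			  struct vm_area_struct *vma)
  {
  	spinlock_t *ptl;
  	struct page *page;

  	ptl = pmd_trans_huge_lock(pmd, vma);	/* NULL unless a huge pmd is mapped */
  	if (!ptl)
  		return;

  	/* Filters special/PFN mappings instead of faking a pte. */
  	page = vm_normal_page_pmd(vma, addr, *pmd);
  	if (page)
  		record_stats(page, pmd_dirty(*pmd));	/* hypothetical helper */

  	spin_unlock(ptl);
  }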