author		Gerald Schaefer <gerald.schaefer@de.ibm.com>	2016-04-28 19:18:35 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-04-28 22:34:04 -0400
commit		28093f9f34cedeaea0f481c58446d9dac6dd620f
tree		da930385a854ae9cb95239b84fa9bac9b677c812
parent		3486b85a29c1741db99d0c522211c82d2b7a56d0
numa: fix /proc/<pid>/numa_maps for THP
In gather_pte_stats() a THP pmd is cast into a pte, which is wrong
because the layouts may differ depending on the architecture. On s390
this will lead to inaccurate numa_maps accounting in /proc because of
misguided pte_present() and pte_dirty() checks on the fake pte.
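
For reference, the code being fixed read as follows (reassembled from the
removed lines in the fs/proc/task_mmu.c hunk below):

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		pte_t huge_pte = *(pte_t *)pmd;	/* pmd reinterpreted as pte */
		struct page *page;

		page = can_gather_numa_stats(huge_pte, vma, addr);
		if (page)
			gather_stats(page, md, pte_dirty(huge_pte),
				     HPAGE_PMD_SIZE/PAGE_SIZE);
		spin_unlock(ptl);
		return 0;
	}

pte_present() and pte_dirty() test bit positions that are only defined for
ptes; in a pmd those positions can hold unrelated state, which is why the
checks go wrong on s390.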
On other architectures pte_present() and pte_dirty() may work by chance,
but there may be an issue with direct-access (dax) mappings w/o
underlying struct pages when HAVE_PTE_SPECIAL is set and THP is
available. In vm_normal_page() the fake pte will be checked with
pte_special() and because there is no "special" bit in a pmd, this will
always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be
skipped. On dax mappings w/o struct pages, an invalid struct page
pointer would then be returned that can crash the kernel.
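
To see why, the HAVE_PTE_SPECIAL path of vm_normal_page() is shaped roughly
like this (a condensed sketch, not the verbatim function):

	if (likely(!pte_special(pte)))
		goto check_pfn;
	/*
	 * Special mapping: VM_PFNMAP/VM_MIXEDMAP handling, returns NULL
	 * for pfns that have no struct page behind them.
	 */
	return NULL;
check_pfn:
	return pfn_to_page(pfn);

A pmd carries no bit at the position pte_special() inspects, so the fake pte
never looks special: the special-mapping handling is skipped and
pfn_to_page() is reached even for a dax pfn without a struct page.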
This patch fixes the numa_maps THP handling by introducing new "_pmd"
variants of the can_gather_numa_stats() and vm_normal_page() functions.
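
The THP branch of gather_pte_stats() then stays in pmd space throughout
(condensed from the hunks below):

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		struct page *page;

		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
		if (page)
			gather_stats(page, md, pmd_dirty(*pmd),
				     HPAGE_PMD_SIZE/PAGE_SIZE);
		spin_unlock(ptl);
		return 0;
	}

can_gather_numa_stats_pmd() in turn calls the new vm_normal_page_pmd(), which
replicates the !HAVE_PTE_SPECIAL logic of vm_normal_page() so that special
pfns in VM_PFNMAP/VM_MIXEDMAP mappings are filtered out explicitly.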
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org> [4.3+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	fs/proc/task_mmu.c | 33 ++++++++++++++++++++++++++++++++++---
-rw-r--r--	include/linux/mm.h |  2 ++
-rw-r--r--	mm/memory.c        | 40 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 72 insertions(+), 3 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 229cb546bee0..541583510cfb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1518,6 +1518,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 	return page;
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+		struct vm_area_struct *vma,
+		unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pmd_present(pmd))
+		return NULL;
+
+	page = vm_normal_page_pmd(vma, addr, pmd);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+#endif
+
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
@@ -1527,14 +1553,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte;
 	pte_t *pte;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
-		page = can_gather_numa_stats(huge_pte, vma, addr);
+		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pte_dirty(huge_pte),
+			gather_stats(page, md, pmd_dirty(*pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
@@ -1542,6 +1568,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	if (pmd_trans_unstable(pmd))
 		return 0;
+#endif
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, vma, addr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 79b6c18d0a38..864d7221de84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1140,6 +1140,8 @@ struct zap_details {
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		pte_t pte);
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t pmd);
 
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
diff --git a/mm/memory.c b/mm/memory.c
index 93897f23cc11..305537fc8640 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -789,6 +789,46 @@ out:
 	return pfn_to_page(pfn);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	/*
+	 * There is no pmd_special() but there may be special pmds, e.g.
+	 * in a direct-access (dax) mapping, so let's just replicate the
+	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 */
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
+	}
+
+	if (is_zero_pfn(pfn))
+		return NULL;
+	if (unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page tables.
+	 * eg. VDSO mappings can cause them to exist.
+	 */
+out:
+	return pfn_to_page(pfn);
+}
+#endif
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range