author		Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>	2015-10-08 23:02:21 -0400
committer	Michael Ellerman <mpe@ellerman.id.au>	2015-10-12 00:30:09 -0400
commit		891121e6c02c6242487aa4ea1d5c75b7ecdc45ee (patch)
tree		118c5b4df68fe004438e4a1a629b04fbeed9b417
parent		ec2640b114d535ba7d895b6ee353791d542f2407 (diff)
powerpc/mm: Differentiate between hugetlb and THP during page walk
We need to properly identify whether a hugepage is an explicit or a
transparent hugepage in follow_huge_addr(). We used to depend on the
hugepage shift argument to do that, but in some cases that can give the
wrong result. For example: on finding a transparent hugepage we set the
hugepage shift to PMD_SHIFT, but we can end up clearing the THP pte via
pmdp_huge_get_and_clear(). We do prevent reusing the pfn page via
kick_all_cpus_sync(), but that happens after we have updated the pte to
0. Hence in follow_huge_addr() we can find the hugepage shift set while
the transparent hugepage check fails for a THP pte.

NOTE: We fixed a variant of this race against THP split in commit
691e95fd7396905a38d98919e9c150dbc3ea21a3 ("powerpc/mm/thp: Make page
table walk safe against thp split/collapse").

Without this patch, we may occasionally hit the BUG_ON(flags & FOLL_GET)
in follow_page_mask().

In the long term, we may want to switch the ppc64 64K page size config
to enable CONFIG_ARCH_WANT_GENERAL_HUGETLB.

Reported-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
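To see the calling convention this patch introduces, here is a minimal,
stand-alone C sketch of the new contract: the walker captures is_thp from
the same PMD snapshot it returns, so a concurrent pmdp_huge_get_and_clear()
zeroing the PMD can no longer make the caller misclassify the page. All
types, flag values, the function name walk(), and the shift constant below
are illustrative stand-ins, not the kernel's real definitions.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the page-table walk; every name here is a stand-in. */
typedef unsigned long pmd_val_t;

#define PMD_THP_FLAG	0x1UL	/* stands in for the THP bit in a PMD */
#define PMD_HUGE_FLAG	0x2UL	/* stands in for a hugetlb leaf bit */

static bool pmd_trans_huge(pmd_val_t v) { return v & PMD_THP_FLAG; }
static bool pmd_huge(pmd_val_t v)       { return v & PMD_HUGE_FLAG; }

/*
 * Models __find_linux_pte_or_hugepte() after this patch: is_thp is
 * derived from the same snapshot of the PMD as the returned entry,
 * rather than re-read by the caller after the walk.
 */
static pmd_val_t *walk(pmd_val_t *pmdp, bool *is_thp, unsigned *shift)
{
	pmd_val_t pmd = *pmdp;		/* one snapshot, like READ_ONCE() */

	if (is_thp)
		*is_thp = false;
	if (pmd_trans_huge(pmd)) {
		if (is_thp)
			*is_thp = true;
		*shift = 24;		/* illustrative PMD_SHIFT value */
		return pmdp;
	}
	if (pmd_huge(pmd)) {
		*shift = 24;
		return pmdp;
	}
	return NULL;
}

int main(void)
{
	pmd_val_t thp = PMD_THP_FLAG, huge = PMD_HUGE_FLAG;
	bool is_thp;
	unsigned shift;

	if (walk(&thp, &is_thp, &shift))
		printf("thp:  shift=%u is_thp=%d\n", shift, is_thp);
	if (walk(&huge, &is_thp, &shift))
		printf("huge: shift=%u is_thp=%d\n", shift, is_thp);
	return 0;
}

Before the patch, a caller such as follow_huge_addr() had to re-test
pmd_trans_huge() on a PMD re-read after the walk; a racing clear could
zero that PMD, leaving the shift set but the THP test false, which is
exactly the misclassification the out-parameter avoids.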
-rw-r--r--	arch/powerpc/include/asm/mmu-hash64.h	1
-rw-r--r--	arch/powerpc/include/asm/pgtable-ppc64.h	10
-rw-r--r--	arch/powerpc/include/asm/pgtable.h	6
-rw-r--r--	arch/powerpc/kernel/eeh.c	3
-rw-r--r--	arch/powerpc/kernel/io-workarounds.c	2
-rw-r--r--	arch/powerpc/kvm/book3s_64_mmu_hv.c	2
-rw-r--r--	arch/powerpc/kvm/book3s_hv_rm_mmu.c	8
-rw-r--r--	arch/powerpc/kvm/e500_mmu_host.c	2
-rw-r--r--	arch/powerpc/mm/hash_utils_64.c	7
-rw-r--r--	arch/powerpc/mm/hugetlbpage.c	21
-rw-r--r--	arch/powerpc/mm/tlb_hash64.c	9
-rw-r--r--	arch/powerpc/perf/callchain.c	2
12 files changed, 49 insertions(+), 24 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index a82f5347540a..ba3342bbdbda 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -14,6 +14,7 @@
 
 #include <asm/asm-compat.h>
 #include <asm/page.h>
+#include <asm/bug.h>
 
 /*
  * This is necessary to get the definition of PGTABLE_RANGE which we
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index fa1dfb7f7b48..3245f2d96d4f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -437,9 +437,9 @@ static inline char *get_hpte_slot_array(pmd_t *pmdp)
 
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
				   pmd_t *pmdp, unsigned long old_pmd);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
 extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
 extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
@@ -479,6 +479,14 @@ static inline int pmd_trans_splitting(pmd_t pmd)
 }
 
 extern int has_transparent_hugepage(void);
+#else
+static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
+					  unsigned long addr, pmd_t *pmdp,
+					  unsigned long old_pmd)
+{
+
+	WARN(1, "%s called with THP disabled\n", __func__);
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline int pmd_large(pmd_t pmd)
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 0717693c8428..b64b4212b71f 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -259,15 +259,15 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #define has_transparent_hugepage() 0
 #endif
 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				   unsigned *shift);
+				   bool *is_thp, unsigned *shift);
 static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-					       unsigned *shift)
+					       bool *is_thp, unsigned *shift)
 {
 	if (!arch_irqs_disabled()) {
 		pr_info("%s called with irq enabled\n", __func__);
 		dump_stack();
 	}
-	return __find_linux_pte_or_hugepte(pgdir, ea, shift);
+	return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
 }
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index e968533e3e05..00ba5de12256 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -351,7 +351,8 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 	 * worried about _PAGE_SPLITTING/collapse. Also we will not hit
 	 * page table free, because of init_mm.
 	 */
-	ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+	ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token,
+					   NULL, &hugepage_shift);
 	if (!ptep)
 		return token;
 	WARN_ON(hugepage_shift);
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 63d9cc4d7366..5f8613ceb97f 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -76,7 +76,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
 	 * a page table free due to init_mm
 	 */
 	ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
-					   &hugepage_shift);
+					   NULL, &hugepage_shift);
 	if (ptep == NULL)
 		paddr = 0;
 	else {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1f9c0a17f445..3fc2ba784a71 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -543,7 +543,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		 */
 		local_irq_save(flags);
 		ptep = find_linux_pte_or_hugepte(current->mm->pgd,
-						 hva, NULL);
+						 hva, NULL, NULL);
 		if (ptep) {
 			pte = kvmppc_read_update_linux_pte(ptep, 1);
 			if (pte_write(pte))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index c1df9bb1e413..0bce4fffcb2e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -32,7 +32,7 @@ static void *real_vmalloc_addr(void *x)
 	 * So don't worry about THP collapse/split. Called
 	 * Only in realmode, hence won't need irq_save/restore.
 	 */
-	p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+	p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL, NULL);
 	if (!p || !pte_present(*p))
 		return NULL;
 	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
@@ -221,10 +221,12 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	 * retry via mmu_notifier_retry.
 	 */
 	if (realmode)
-		ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+		ptep = __find_linux_pte_or_hugepte(pgdir, hva, NULL,
+						   &hpage_shift);
 	else {
 		local_irq_save(irq_flags);
-		ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+		ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL,
+						 &hpage_shift);
 	}
 	if (ptep) {
 		pte_t pte;
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 4d33e199edcc..805fee9beefa 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -476,7 +476,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	 * can't run hence pfn won't change.
 	 */
 	local_irq_save(flags);
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, NULL);
 	if (ptep) {
 		pte_t pte = READ_ONCE(*ptep);
 
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index aee70171355b..7f9616f7c479 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -994,6 +994,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 		 unsigned long access, unsigned long trap,
 		 unsigned long flags)
 {
+	bool is_thp;
 	enum ctx_state prev_state = exception_enter();
 	pgd_t *pgdir;
 	unsigned long vsid;
@@ -1068,7 +1069,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+	ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		rc = 1;
@@ -1088,7 +1089,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 	}
 
 	if (hugeshift) {
-		if (pmd_trans_huge(*(pmd_t *)ptep))
+		if (is_thp)
 			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
 					     trap, flags, ssize, psize);
 #ifdef CONFIG_HUGETLB_PAGE
@@ -1243,7 +1244,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * THP pages use update_mmu_cache_pmd. We don't do
 	 * hash preload there. Hence can ignore THP here
 	 */
-	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift);
 	if (!ptep)
 		goto out_exit;
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f093828e8997..9833fee493ec 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -128,7 +128,7 @@ int pgd_huge(pgd_t pgd)
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	/* Only called for hugetlbfs pages, hence can ignore THP */
-	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -703,13 +703,14 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
+	bool is_thp;
 	pte_t *ptep, pte;
 	unsigned shift;
 	unsigned long mask, flags;
 	struct page *page = ERR_PTR(-EINVAL);
 
 	local_irq_save(flags);
-	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
 	if (!ptep)
 		goto no_page;
 	pte = READ_ONCE(*ptep);
@@ -718,7 +719,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 	 * Transparent hugepages are handled by generic code. We can skip them
 	 * here.
 	 */
-	if (!shift || pmd_trans_huge(__pmd(pte_val(pte))))
+	if (!shift || is_thp)
 		goto no_page;
 
 	if (!pte_present(pte)) {
@@ -975,7 +976,7 @@ void flush_dcache_icache_hugepage(struct page *page)
  */
 
 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				   unsigned *shift)
+				   bool *is_thp, unsigned *shift)
 {
 	pgd_t pgd, *pgdp;
 	pud_t pud, *pudp;
@@ -987,6 +988,9 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	if (shift)
 		*shift = 0;
 
+	if (is_thp)
+		*is_thp = false;
+
 	pgdp = pgdir + pgd_index(ea);
 	pgd = READ_ONCE(*pgdp);
 	/*
@@ -1034,7 +1038,14 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	if (pmd_none(pmd))
 		return NULL;
 
-	if (pmd_huge(pmd) || pmd_large(pmd)) {
+	if (pmd_trans_huge(pmd)) {
+		if (is_thp)
+			*is_thp = true;
+		ret_pte = (pte_t *) pmdp;
+		goto out;
+	}
+
+	if (pmd_huge(pmd)) {
 		ret_pte = (pte_t *) pmdp;
 		goto out;
 	} else if (is_hugepd(__hugepd(pmd_val(pmd))))
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index c522969f012d..f7b80391bee7 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -190,6 +190,7 @@ void tlb_flush(struct mmu_gather *tlb)
 void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 			      unsigned long end)
 {
+	bool is_thp;
 	int hugepage_shift;
 	unsigned long flags;
 
@@ -208,21 +209,21 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	local_irq_save(flags);
 	arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp,
 							&hugepage_shift);
 		unsigned long pte;
 
 		if (ptep == NULL)
 			continue;
 		pte = pte_val(*ptep);
-		if (hugepage_shift)
+		if (is_thp)
 			trace_hugepage_invalidate(start, pte);
 		if (!(pte & _PAGE_HASHPTE))
 			continue;
-		if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+		if (unlikely(is_thp))
 			hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
 		else
-			hpte_need_flush(mm, start, ptep, pte, 0);
+			hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
 	}
 	arch_leave_lazy_mmu_mode();
 	local_irq_restore(flags);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index ff09cde20cd2..e04a6752b399 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -127,7 +127,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
 		return -EFAULT;
 
 	local_irq_save(flags);
-	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+	ptep = find_linux_pte_or_hugepte(pgdir, addr, NULL, &shift);
 	if (!ptep)
 		goto err_out;
 	if (!shift)