 arch/x86/kernel/vm86_32.c     |  2 ++
 fs/proc/task_mmu.c            |  9 +++++++++
 include/asm-generic/pgtable.h | 61 ++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c               |  4 ++++
 mm/memory.c                   | 16 ++++++++++++----
 mm/mempolicy.c                |  2 +-
 mm/mincore.c                  |  2 +-
 mm/pagewalk.c                 |  2 +-
 mm/swapfile.c                 |  4 +---
 9 files changed, 92 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab5ba15..328cb37bb827 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	spinlock_t *ptl;
 	int i;
 
+	down_write(&mm->mmap_sem);
 	pgd = pgd_offset(mm, 0xA0000);
 	if (pgd_none_or_clear_bad(pgd))
 		goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	}
 	pte_unmap_unlock(pte, ptl);
 out:
+	up_write(&mm->mmap_sem);
 	flush_tlb();
 }
 
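Note: mark_screen_rdonly() is fixed differently from the walkers below. Instead of checking pmd_trans_unstable(), it takes the mmap_sem for write: MADV_DONTNEED and transparent hugepage faults both run with mmap_sem held in read mode, so write mode excludes them and every pmd stays stable for the whole walk. A minimal sketch of that locking pattern, with a hypothetical helper name and the walk body elided:

	static void walk_ptes_stably(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgd;

		/*
		 * Holding mmap_sem for write excludes MADV_DONTNEED and
		 * THP faults, which take it for read, so the standard
		 * pmd_none_or_clear_bad() checks are safe below.
		 */
		down_write(&mm->mmap_sem);
		pgd = pgd_offset(mm, addr);
		if (pgd_none_or_clear_bad(pgd))
			goto out;
		/* ... descend pud/pmd/pte as mark_screen_rdonly() does ... */
	out:
		up_write(&mm->mmap_sem);
	}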
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7dcd2a250495..3efa7253523e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	} else {
 		spin_unlock(&walk->mm->page_table_lock);
 	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
 	/*
 	 * The mmap_sem held all the way back in m_start() is what
 	 * keeps khugepaged out of here and from collapsing things
@@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	struct page *page;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	int err = 0;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
@@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		spin_unlock(&walk->mm->page_table_lock);
 	}
 
+	if (pmd_trans_unstable(pmd))
+		return 0;
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 76bff2bff15e..a03c098b0cce 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
 				unsigned long size);
 #endif
 
+#ifdef CONFIG_MMU
+
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 static inline int pmd_trans_huge(pmd_t pmd)
 {
@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd)
 	return 0;
 }
 #endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * This function is meant to be used by sites walking pagetables with
+ * the mmap_sem held in read mode to protect against MADV_DONTNEED and
+ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
+ * into a null pmd and the transhuge page fault can convert a null pmd
+ * into a huge pmd or into a regular pmd (if the hugepage allocation
+ * fails). While holding the mmap_sem in read mode the pmd becomes
+ * stable and stops changing under us only if it's not null and not a
+ * transhuge pmd. When those races occur and this function makes a
+ * difference vs the standard pmd_none_or_clear_bad, the result is
+ * undefined, so behaving as if the pmd were none is safe (because it
+ * can return none anyway). The compiler-level barrier() is critically
+ * important to compute the two checks atomically on the same pmdval.
+ */
+static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
+{
+	/* depend on the compiler for an atomic pmd read */
+	pmd_t pmdval = *pmd;
+	/*
+	 * The barrier will stabilize the pmdval in a register or on
+	 * the stack so that it will stop changing under the code.
+	 */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	barrier();
+#endif
+	if (pmd_none(pmdval))
+		return 1;
+	if (unlikely(pmd_bad(pmdval))) {
+		if (!pmd_trans_huge(pmdval))
+			pmd_clear_bad(pmd);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * This is a noop if Transparent Hugepage Support is not built into
+ * the kernel. Otherwise it is equivalent to
+ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
+ * places that already verified the pmd is not none and that want to
+ * walk ptes while holding the mmap_sem in read mode (write mode
+ * doesn't need this). If THP is not enabled, the pmd can't go away
+ * under the code even if MADV_DONTNEED runs, but if THP is enabled we
+ * need to run pmd_trans_unstable() before walking the ptes after
+ * split_huge_page_pmd returns (because it may have run while the pmd
+ * was null, and a page fault can then map in a THP rather than a
+ * regular page).
+ */
+static inline int pmd_trans_unstable(pmd_t *pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return pmd_none_or_trans_huge_or_clear_bad(pmd);
+#else
+	return 0;
 #endif
+}
+
+#endif /* CONFIG_MMU */
 
 #endif /* !__ASSEMBLY__ */
 
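Note: the intended calling convention for the two helpers added above is the one followed by the fs/proc and memcontrol hunks in this commit. A minimal sketch of a hypothetical pagewalk callback (example_pte_range is not part of the patch) running with mmap_sem held in read mode:

	static int example_pte_range(pmd_t *pmd, unsigned long addr,
				     unsigned long end, struct mm_walk *walk)
	{
		pte_t *pte;
		spinlock_t *ptl;

		split_huge_page_pmd(walk->mm, pmd);
		/*
		 * Since the split, MADV_DONTNEED may have made the pmd
		 * none and a THP fault may have made it transhuge again:
		 * treat it as empty rather than handing an unstable pmd
		 * to pte_offset_map_lock().
		 */
		if (pmd_trans_unstable(pmd))
			return 0;

		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		for (; addr != end; pte++, addr += PAGE_SIZE) {
			/* examine *pte; it is stable here under ptl */
		}
		pte_unmap_unlock(pte - 1, ptl);
		return 0;
	}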
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..37281816ff67 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5230,6 +5230,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -5390,6 +5392,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 retry:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 347e5fad1cfa..e01abb908b6b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1247,16 +1247,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
-			if (next-addr != HPAGE_PMD_SIZE) {
+			if (next - addr != HPAGE_PMD_SIZE) {
 				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
 				split_huge_page_pmd(vma->vm_mm, pmd);
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
-				continue;
+				goto next;
 			/* fall through */
 		}
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
+		/*
+		 * Here there can be other concurrent MADV_DONTNEED or
+		 * trans huge page faults running, and if the pmd is
+		 * none or trans huge it can change under us. This is
+		 * because MADV_DONTNEED holds the mmap_sem in read
+		 * mode.
+		 */
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+			goto next;
 		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
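Note: for contrast with the new check above, this is the check being replaced, simplified from the asm-generic header of this era. pmd_bad() is true for a transhuge pmd, so losing the race against a THP fault would make this log a bogus "bad pmd" message and zap a valid huge pmd:

	static inline int pmd_none_or_clear_bad(pmd_t *pmd)
	{
		if (pmd_none(*pmd))
			return 1;
		if (unlikely(pmd_bad(*pmd))) {
			/* reports and clears the entry: wrong for a huge pmd */
			pmd_clear_bad(pmd);
			return 1;
		}
		return 0;
	}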
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 47296fee23db..0a3757067631 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	do {
 		next = pmd_addr_end(addr, end);
 		split_huge_page_pmd(vma->vm_mm, pmd);
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			continue;
 		if (check_pte_range(vma, pmd, addr, next, nodes,
 				    flags, private))
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff2..936b4cee8cb1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 			}
 			/* fall through */
 		}
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			mincore_unmapped_range(vma, addr, next, vec);
 		else
 			mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff660..aa9701e12714 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
 			continue;
 
 		split_huge_page_pmd(walk->mm, pmd);
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			goto again;
 		err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 00a962caab1a..44595a373e42 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		if (unlikely(pmd_trans_huge(*pmd)))
-			continue;
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			continue;
 		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
 		if (ret)