 arch/x86/kernel/vm86_32.c     |  2 +
 fs/proc/task_mmu.c            |  9 +++
 include/asm-generic/pgtable.h | 61 ++++++++++++++++++++
 mm/memcontrol.c               |  4 ++
 mm/memory.c                   | 16 ++++-
 mm/mempolicy.c                |  2 +-
 mm/mincore.c                  |  2 +-
 mm/pagewalk.c                 |  2 +-
 mm/swapfile.c                 |  4 +---
 9 files changed, 92 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index b466cab5ba15..328cb37bb827 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	spinlock_t *ptl;
 	int i;
 
+	down_write(&mm->mmap_sem);
 	pgd = pgd_offset(mm, 0xA0000);
 	if (pgd_none_or_clear_bad(pgd))
 		goto out;
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	}
 	pte_unmap_unlock(pte, ptl);
 out:
+	up_write(&mm->mmap_sem);
 	flush_tlb();
 }
 
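The vm86 fix takes a different route from the rest of the patch: rather than re-checking the pmd, it brackets mark_screen_rdonly()'s pagetable walk with down_write()/up_write() on mmap_sem, which excludes concurrent MADV_DONTNEED and transhuge page faults outright, so the plain *_none_or_clear_bad() checks inside the walk stay race-free. A minimal sketch of that discipline, using the same 3.x-era APIs as the patch (walk_one_pmd_stable and the elided pte scan are illustrative, not from the patch):

#include <linux/mm.h>
#include <linux/huge_mm.h>

/*
 * Sketch: walking pagetables with mmap_sem held for write. With the
 * write lock held, no MADV_DONTNEED or transhuge page fault can run
 * concurrently, so the ordinary *_none_or_clear_bad() checks are
 * sufficient and pmd_trans_unstable() is unnecessary.
 */
static void walk_one_pmd_stable(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	down_write(&mm->mmap_sem);
	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		goto out;
	pud = pud_offset(pgd, addr);
	if (pud_none_or_clear_bad(pud))
		goto out;
	pmd = pmd_offset(pud, addr);
	split_huge_page_pmd(mm, pmd);	/* cannot race: write lock held */
	if (pmd_none_or_clear_bad(pmd))
		goto out;
	/* ... map and scan ptes under pte_offset_map_lock() ... */
out:
	up_write(&mm->mmap_sem);
}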
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7dcd2a250495..3efa7253523e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	} else {
 		spin_unlock(&walk->mm->page_table_lock);
 	}
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
 	/*
 	 * The mmap_sem held all the way back in m_start() is what
 	 * keeps khugepaged out of here and from collapsing things
@@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	struct page *page;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	int err = 0;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
@@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		spin_unlock(&walk->mm->page_table_lock);
 	}
 
+	if (pmd_trans_unstable(pmd))
+		return 0;
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
 		struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
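All four fs/proc walkers above gain the same two-line guard, and for the same reason: they hold mmap_sem only in read mode, so after split_huge_page_pmd() returns the pmd may have been zapped by a concurrent MADV_DONTNEED and then refilled by a transhuge page fault, and calling pte_offset_map_lock() on such a pmd would wrongly treat a huge pmd as a pte page. A condensed sketch of the resulting callback shape (example_pte_range and the loop body are illustrative, not from the patch):

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* Hypothetical pagewalk callback showing the guard added above. */
static int example_pte_range(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	pte_t *pte;
	spinlock_t *ptl;

	split_huge_page_pmd(walk->mm, pmd);
	/*
	 * mmap_sem is only held in read mode: since the split, the pmd
	 * can have become none (MADV_DONTNEED) and then trans huge
	 * again (a THP fault). Bail out instead of mapping a huge pmd
	 * as if it were a pte page.
	 */
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		/* ... inspect *pte here ... */
	}
	pte_unmap_unlock(pte - 1, ptl);
	return 0;
}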
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 76bff2bff15e..a03c098b0cce 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -425,6 +425,8 @@ extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
 				unsigned long size);
 #endif
 
+#ifdef CONFIG_MMU
+
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 static inline int pmd_trans_huge(pmd_t pmd)
 {
@@ -441,7 +443,66 @@ static inline int pmd_write(pmd_t pmd)
 	return 0;
 }
 #endif /* __HAVE_ARCH_PMD_WRITE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * This function is meant to be used by sites walking pagetables with
+ * the mmap_sem held in read mode to protect against MADV_DONTNEED and
+ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
+ * into a null pmd and the transhuge page fault can convert a null pmd
+ * into a hugepmd or into a regular pmd (if the hugepage allocation
+ * fails). While holding the mmap_sem in read mode the pmd becomes
+ * stable and stops changing under us only if it's not null and not a
+ * transhuge pmd. When those races occur and this function makes a
+ * difference vs the standard pmd_none_or_clear_bad, the result is
+ * undefined, so behaving as if the pmd was none is safe (the pmd can
+ * become none at any time anyway). The compiler-level barrier() is
+ * critically important to compute the two checks atomically on the
+ * same pmdval.
+ */
+static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
+{
+	/* depend on compiler for an atomic pmd read */
+	pmd_t pmdval = *pmd;
+	/*
+	 * The barrier will stabilize the pmdval in a register or on
+	 * the stack so that it will stop changing under the code.
+	 */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	barrier();
+#endif
+	if (pmd_none(pmdval))
+		return 1;
+	if (unlikely(pmd_bad(pmdval))) {
+		if (!pmd_trans_huge(pmdval))
+			pmd_clear_bad(pmd);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * This is a noop if Transparent Hugepage Support is not built into
+ * the kernel. Otherwise it is equivalent to
+ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
+ * places that already verified the pmd is not none and want to walk
+ * ptes while holding the mmap_sem in read mode (write mode doesn't
+ * need this). If THP is not enabled, the pmd can't go away under the
+ * code even if MADV_DONTNEED runs, but if THP is enabled we need to
+ * run pmd_trans_unstable before walking the ptes after
+ * split_huge_page_pmd returns (because it may have run while the pmd
+ * was null, after which a page fault can map in a THP rather than a
+ * regular page).
+ */
+static inline int pmd_trans_unstable(pmd_t *pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	return pmd_none_or_trans_huge_or_clear_bad(pmd);
+#else
+	return 0;
 #endif
+}
+
+#endif /* CONFIG_MMU */
 
 #endif /* !__ASSEMBLY__ */
 
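Callers that iterate a whole pmd range inline (mempolicy, mincore, pagewalk, and swapfile below) use pmd_none_or_trans_huge_or_clear_bad() directly where pmd_none_or_clear_bad() used to sit. A hypothetical walker showing that shape (example_pmd_range and the elided pte walk are illustrative, not from the patch):

#include <linux/mm.h>
#include <linux/huge_mm.h>

/* Hypothetical pmd-range loop mirroring the callers patched below. */
static void example_pmd_range(struct vm_area_struct *vma, pud_t *pud,
			      unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		split_huge_page_pmd(vma->vm_mm, pmd);
		/*
		 * One snapshot of *pmd decides all three skip cases
		 * (none, trans huge, bad); only a genuinely bad,
		 * non-huge pmd is cleared.
		 */
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		/* ... walk the pte page under pte_offset_map_lock() ... */
	} while (pmd++, addr = next, addr != end);
}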
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..37281816ff67 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5230,6 +5230,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -5390,6 +5392,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 
 	split_huge_page_pmd(walk->mm, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
 retry:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 347e5fad1cfa..e01abb908b6b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1247,16 +1247,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_trans_huge(*pmd)) {
-			if (next-addr != HPAGE_PMD_SIZE) {
+			if (next - addr != HPAGE_PMD_SIZE) {
 				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
 				split_huge_page_pmd(vma->vm_mm, pmd);
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
-				continue;
+				goto next;
 			/* fall through */
 		}
-		if (pmd_none_or_clear_bad(pmd))
-			continue;
+		/*
+		 * Here there can be other concurrent MADV_DONTNEED or
+		 * trans huge page faults running, and if the pmd is
+		 * none or trans huge it can change under us. This is
+		 * because MADV_DONTNEED holds the mmap_sem in read
+		 * mode.
+		 */
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+			goto next;
 		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+next:
 		cond_resched();
 	} while (pmd++, addr = next, addr != end);
 
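A detail of the zap_pmd_range() hunk that is easy to miss: the two "continue" statements become "goto next", with the new next: label placed just before cond_resched(). In C, continue inside a do-while jumps straight to the controlling expression, so the old code skipped the reschedule point whenever it skipped a pmd; after the patch, cond_resched() runs on every iteration. A standalone plain-C demonstration of that semantics (not kernel code):

#include <stdio.h>

int main(void)
{
	int i = 0, resched_calls = 0;

	do {
		if (i % 2 == 0)
			continue;	/* jumps to the ++i < 8 test, skipping the line below */
		resched_calls++;	/* stands in for cond_resched() */
	} while (++i < 8);

	/* Prints 4 of 8: "continue" bypassed the trailing statement. */
	printf("reschedule point reached on %d of 8 iterations\n",
	       resched_calls);
	return 0;
}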
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 47296fee23db..0a3757067631 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	do {
 		next = pmd_addr_end(addr, end);
 		split_huge_page_pmd(vma->vm_mm, pmd);
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			continue;
 		if (check_pte_range(vma, pmd, addr, next, nodes,
 				    flags, private))
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff2..936b4cee8cb1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 			}
 			/* fall through */
 		}
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			mincore_unmapped_range(vma, addr, next, vec);
 		else
 			mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff660..aa9701e12714 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
 			continue;
 
 		split_huge_page_pmd(walk->mm, pmd);
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			goto again;
 		err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 00a962caab1a..44595a373e42 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		if (unlikely(pmd_trans_huge(*pmd)))
-			continue;
-		if (pmd_none_or_clear_bad(pmd))
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 			continue;
 		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
 		if (ret)
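The swapfile hunk also shows why folding the tests into one helper matters beyond brevity: the old sequence dereferenced *pmd once in pmd_trans_huge() and again inside pmd_none_or_clear_bad(), and with mmap_sem held only in read mode the pmd can change between those two reads. An illustrative reconstruction of the racy shape being removed (racy_skip_pmd is not from the patch):

#include <linux/mm.h>
#include <linux/huge_mm.h>

/*
 * Illustrative only: the pre-patch checks read *pmd more than once.
 * Read #1 can observe a none or regular pmd; a concurrent THP fault
 * can then install a trans huge pmd before read #2, where pmd_bad()
 * may report the huge pmd as bad and pmd_clear_bad() would wrongly
 * zap it. pmd_none_or_trans_huge_or_clear_bad() closes the window by
 * running every check on one snapshot of *pmd, kept stable with
 * barrier().
 */
static inline int racy_skip_pmd(pmd_t *pmd)
{
	if (unlikely(pmd_trans_huge(*pmd)))	/* read #1 */
		return 1;
	return pmd_none_or_clear_bad(pmd);	/* reads *pmd again */
}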