author		Lorenzo Stoakes <lstoakes@gmail.com>	2016-09-11 18:54:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-09-25 18:43:42 -0400
commit		38e088546522e1e86d2b8f401a1354ad3a9b3303
tree		3e7144eb3ecd99edd02d0c2bb44e962d1982fd42
parent		831e45d84a971495c882bc186d98bbb825b2ee59
mm: check VMA flags to avoid invalid PROT_NONE NUMA balancing
The NUMA balancing logic uses an arch-specific PROT_NONE page table flag
defined by pte_protnone() or pmd_protnone() to mark PTEs or huge page
PMDs respectively as requiring balancing upon a subsequent page fault.
User-defined PROT_NONE memory regions which also have this flag set do
not normally invoke the NUMA balancing code, since do_page_fault() sends
a segfault to the process before handle_mm_fault() is even called.
However, if access_remote_vm() is invoked to access a PROT_NONE region of
memory, handle_mm_fault() is called via faultin_page() and
__get_user_pages() without any access checks being performed, meaning
the NUMA balancing logic is incorrectly invoked on a non-NUMA memory
region.
A simple means of triggering this problem is to access PROT_NONE mmap'd
memory using /proc/self/mem, which reliably results in the NUMA handling
functions being invoked when CONFIG_NUMA_BALANCING is set.
This issue was reported in bugzilla (issue 99101), which includes some
simple repro code; a hypothetical sketch along the same lines follows.
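As an illustration only (a minimal sketch assuming a standard glibc
userspace, not the bugzilla repro itself), the following program maps an
anonymous PROT_NONE page and reads it back through /proc/self/mem, which
reaches handle_mm_fault() via access_remote_vm() and faultin_page():

/*
 * Hypothetical reproducer sketch: read a PROT_NONE mapping through
 * /proc/self/mem so the fault is taken without an access check.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	/* Any direct load/store of p would segfault in do_page_fault(). */
	char *p = mmap(NULL, page, PROT_NONE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	int fd = open("/proc/self/mem", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/*
	 * Reading our own PROT_NONE page via /proc/self/mem goes through
	 * access_remote_vm() and __get_user_pages()/faultin_page(),
	 * bypassing the segfault path entirely.
	 */
	char buf;
	ssize_t n = pread(fd, &buf, 1, (off_t)(uintptr_t)p);
	printf("pread returned %zd\n", n);

	close(fd);
	return EXIT_SUCCESS;
}

On an affected kernel with CONFIG_NUMA_BALANCING enabled, the pread() is
enough to reach the NUMA fault handlers and trip the BUG_ON()s discussed
below.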
There are BUG_ON() checks in do_numa_page() and do_huge_pmd_numa_page(),
added in commit c0e7cad, which avoid accidentally provoking strange
behaviour by attempting to apply NUMA balancing to pages that are in
fact PROT_NONE. These BUG_ON()s are consistently triggered by the repro.
This patch moves the PROT_NONE check into mm/memory.c rather than
invoking BUG_ON(), as faulting in these pages via faultin_page() is a
valid reason for reaching the NUMA check with the PROT_NONE page table
flag set, and is therefore not always a bug.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=99101
Reported-by: Trevor Saunders <tbsaunde@tbsaunde.org>
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 mm/huge_memory.c |  3 ---
 mm/memory.c      | 12 +++++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a6abd76baa72..53ae6d00656a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1138,9 +1138,6 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
 	bool was_writable;
 	int flags = 0;
 
-	/* A PROT_NONE fault should not end up here */
-	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
-
 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
 	if (unlikely(!pmd_same(pmd, *fe->pmd)))
 		goto out_unlock;
diff --git a/mm/memory.c b/mm/memory.c
index 83be99d9d8a1..793fe0f9841c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3351,9 +3351,6 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
 	bool was_writable = pte_write(pte);
 	int flags = 0;
 
-	/* A PROT_NONE fault should not end up here */
-	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
-
 	/*
 	 * The "pte" at this point cannot be used safely without
 	 * validation through pte_unmap_same(). It's of NUMA type but
@@ -3458,6 +3455,11 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
 	return VM_FAULT_FALLBACK;
 }
 
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3524,7 +3526,7 @@ static int handle_pte_fault(struct fault_env *fe)
 	if (!pte_present(entry))
 		return do_swap_page(fe, entry);
 
-	if (pte_protnone(entry))
+	if (pte_protnone(entry) && vma_is_accessible(fe->vma))
 		return do_numa_page(fe, entry);
 
 	fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
@@ -3590,7 +3592,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
 	barrier();
 	if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-		if (pmd_protnone(orig_pmd))
+		if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 			return do_huge_pmd_numa_page(&fe, orig_pmd);
 
 		if ((fe.flags & FAULT_FLAG_WRITE) &&