aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHugh Dickins <hughd@google.com>2014-06-23 16:22:05 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-23 19:47:44 -0400
commitf72e7dcdd25229446b102e587ef2f826f76bff28 (patch)
treeaf103d9dcbd735ea250ffc396b88ad74b7a41a2e
parent5338a9372234f8b782c7d78f0355e1cb21d02468 (diff)
mm: let mm_find_pmd fix buggy race with THP fault
Trinity has reported: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: __lock_acquire (kernel/locking/lockdep.c:3070 (discriminator 1)) CPU: 6 PID: 16173 Comm: trinity-c364 Tainted: G W 3.15.0-rc1-next-20140415-sasha-00020-gaa90d09 #398 lock_acquire (arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602) _raw_spin_lock (include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151) remove_migration_pte (mm/migrate.c:137) rmap_walk (mm/rmap.c:1628 mm/rmap.c:1699) remove_migration_ptes (mm/migrate.c:224) migrate_pages (mm/migrate.c:922 mm/migrate.c:960 mm/migrate.c:1126) migrate_misplaced_page (mm/migrate.c:1733) __handle_mm_fault (mm/memory.c:3762 mm/memory.c:3812 mm/memory.c:3925) handle_mm_fault (mm/memory.c:3948) __get_user_pages (mm/memory.c:1851) __mlock_vma_pages_range (mm/mlock.c:255) __mm_populate (mm/mlock.c:711) SyS_mlockall (include/linux/mm.h:1799 mm/mlock.c:817 mm/mlock.c:791) I believe this comes about because, whereas collapsing and splitting THP functions take anon_vma lock in write mode (which excludes concurrent rmap walks), faulting THP functions (write protection and misplaced NUMA) do not - and mostly they do not need to. But they do use a pmdp_clear_flush(), set_pmd_at() sequence which, for an instant (indeed, for a long instant, given the inter-CPU TLB flush in there), leaves *pmd neither present nor trans_huge. Which can confuse a concurrent rmap walk, as when removing migration ptes, seen in the dumped trace. Although that rmap walk has a 4k page to insert, anon_vmas containing THPs are in no way segregated from 4k-page anon_vmas, so the 4k-intent mm_find_pmd() does need to cope with that instant when a trans_huge pmd is temporarily absent. I don't think we need to strengthen the locking at the THP end: it's easily handled with an ACCESS_ONCE() before testing both conditions. 
And since mm_find_pmd() had only one caller who wanted a THP rather than a pmd, let's slightly repurpose it to fail when it hits a THP or non-present pmd, and open code split_huge_page_address() again. Signed-off-by: Hugh Dickins <hughd@google.com> Reported-by: Sasha Levin <sasha.levin@oracle.com> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Konstantin Khlebnikov <koct9i@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Lameter <cl@gentwo.org> Cc: Dave Jones <davej@redhat.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/huge_memory.c18
-rw-r--r--mm/ksm.c1
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/rmap.c12
4 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bade35ef563b..33514d88fef9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2423,8 +2423,6 @@ static void collapse_huge_page(struct mm_struct *mm,
2423 pmd = mm_find_pmd(mm, address); 2423 pmd = mm_find_pmd(mm, address);
2424 if (!pmd) 2424 if (!pmd)
2425 goto out; 2425 goto out;
2426 if (pmd_trans_huge(*pmd))
2427 goto out;
2428 2426
2429 anon_vma_lock_write(vma->anon_vma); 2427 anon_vma_lock_write(vma->anon_vma);
2430 2428
@@ -2523,8 +2521,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2523 pmd = mm_find_pmd(mm, address); 2521 pmd = mm_find_pmd(mm, address);
2524 if (!pmd) 2522 if (!pmd)
2525 goto out; 2523 goto out;
2526 if (pmd_trans_huge(*pmd))
2527 goto out;
2528 2524
2529 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); 2525 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
2530 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2526 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2877,12 +2873,22 @@ void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
2877static void split_huge_page_address(struct mm_struct *mm, 2873static void split_huge_page_address(struct mm_struct *mm,
2878 unsigned long address) 2874 unsigned long address)
2879{ 2875{
2876 pgd_t *pgd;
2877 pud_t *pud;
2880 pmd_t *pmd; 2878 pmd_t *pmd;
2881 2879
2882 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2880 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2883 2881
2884 pmd = mm_find_pmd(mm, address); 2882 pgd = pgd_offset(mm, address);
2885 if (!pmd) 2883 if (!pgd_present(*pgd))
2884 return;
2885
2886 pud = pud_offset(pgd, address);
2887 if (!pud_present(*pud))
2888 return;
2889
2890 pmd = pmd_offset(pud, address);
2891 if (!pmd_present(*pmd))
2886 return; 2892 return;
2887 /* 2893 /*
2888 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2894 * Caller holds the mmap_sem write mode, so a huge pmd cannot
diff --git a/mm/ksm.c b/mm/ksm.c
index 68710e80994a..346ddc9e4c0d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
945 pmd = mm_find_pmd(mm, addr); 945 pmd = mm_find_pmd(mm, addr);
946 if (!pmd) 946 if (!pmd)
947 goto out; 947 goto out;
948 BUG_ON(pmd_trans_huge(*pmd));
949 948
950 mmun_start = addr; 949 mmun_start = addr;
951 mmun_end = addr + PAGE_SIZE; 950 mmun_end = addr + PAGE_SIZE;
diff --git a/mm/migrate.c b/mm/migrate.c
index 63f0cd559999..9e0beaa91845 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 pmd = mm_find_pmd(mm, addr); 120 pmd = mm_find_pmd(mm, addr);
121 if (!pmd) 121 if (!pmd)
122 goto out; 122 goto out;
123 if (pmd_trans_huge(*pmd))
124 goto out;
125 123
126 ptep = pte_offset_map(pmd, addr); 124 ptep = pte_offset_map(pmd, addr);
127 125
diff --git a/mm/rmap.c b/mm/rmap.c
index bf05fc872ae8..b7e94ebbd09e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -569,6 +569,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
569 pgd_t *pgd; 569 pgd_t *pgd;
570 pud_t *pud; 570 pud_t *pud;
571 pmd_t *pmd = NULL; 571 pmd_t *pmd = NULL;
572 pmd_t pmde;
572 573
573 pgd = pgd_offset(mm, address); 574 pgd = pgd_offset(mm, address);
574 if (!pgd_present(*pgd)) 575 if (!pgd_present(*pgd))
@@ -579,7 +580,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
579 goto out; 580 goto out;
580 581
581 pmd = pmd_offset(pud, address); 582 pmd = pmd_offset(pud, address);
582 if (!pmd_present(*pmd)) 583 /*
584 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
585 * without holding anon_vma lock for write. So when looking for a
586 * genuine pmde (in which to find pte), test present and !THP together.
587 */
588 pmde = ACCESS_ONCE(*pmd);
589 if (!pmd_present(pmde) || pmd_trans_huge(pmde))
583 pmd = NULL; 590 pmd = NULL;
584out: 591out:
585 return pmd; 592 return pmd;
@@ -615,9 +622,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
615 if (!pmd) 622 if (!pmd)
616 return NULL; 623 return NULL;
617 624
618 if (pmd_trans_huge(*pmd))
619 return NULL;
620
621 pte = pte_offset_map(pmd, address); 625 pte = pte_offset_map(pmd, address);
622 /* Make a quick check before getting the lock */ 626 /* Make a quick check before getting the lock */
623 if (!sync && !pte_present(*pte)) { 627 if (!sync && !pte_present(*pte)) {