author     Mel Gorman <mgorman@suse.de>                    2013-12-18 20:08:32 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-12-18 22:04:50 -0500
commit     2b4847e73004c10ae6666c2e27b5c5430aed8698 (patch)
tree       f5062cda19087b1d9a6830feffc4089aeb5b7fc8
parent     c97102ba96324da330078ad8619ba4dfe840dbe3 (diff)
mm: numa: serialise parallel get_user_page against THP migration
Base pages are unmapped and flushed from cache and TLB during normal page
migration and replaced with a migration entry that causes any parallel NUMA
hinting fault or gup to block until migration completes.

THP does not unmap pages due to a lack of support for migration entries at a
PMD level. This allows races with get_user_pages and get_user_pages_fast
which commit 3f926ab945b6 ("mm: Close races between THP migration and PMD
numa clearing") made worse by introducing a pmd_clear_flush().

This patch forces get_user_page (fast and normal) on a pmd_numa page to go
through the slow get_user_page path where it will serialise against THP
migration and properly account for the NUMA hinting fault. On the migration
side the page table lock is taken for each PTE update.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
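The fallback the changelog describes can be pictured with a minimal, self-contained sketch. This is not kernel code: fast_walk(), slow_gup() and struct fake_pte are hypothetical stand-ins. It only illustrates the pattern the patch relies on: the lockless fast walk refuses NUMA-hinting entries by pinning fewer pages than requested, and the caller completes the request through the slow path, which takes the locks that THP migration also takes and therefore serialises against it.

/* Minimal user-space sketch of the fast-GUP fallback pattern (hypothetical
 * helpers, not the kernel implementation). */
#include <stdio.h>

#define NR_REQ 4

/* Pretend page-table entry: 'numa' models a pmd_numa/pte_numa marker. */
struct fake_pte { int present; int numa; };

/* Lockless fast walk: stop early on NUMA-marked entries, mirroring the
 * pte_numa()/pmd_numa() checks added by this patch. */
static int fast_walk(struct fake_pte *ptes, int nr, int *pinned)
{
        int i;

        for (i = 0; i < nr; i++) {
                if (!ptes[i].present || ptes[i].numa)
                        break;          /* force the slow path for this entry */
                pinned[i] = 1;
        }
        return i;                       /* number of pages pinned so far */
}

/* Slow path stand-in: in the kernel this is get_user_pages(), which takes
 * mmap_sem and the page table lock and so waits for THP migration. */
static int slow_gup(struct fake_pte *ptes, int start, int nr, int *pinned)
{
        int i;

        for (i = start; i < nr; i++) {
                ptes[i].numa = 0;       /* hinting fault handled under locks */
                pinned[i] = 1;
        }
        return nr - start;
}

int main(void)
{
        struct fake_pte ptes[NR_REQ] = { {1, 0}, {1, 1}, {1, 0}, {1, 0} };
        int pinned[NR_REQ] = { 0 };
        int nr;

        nr = fast_walk(ptes, NR_REQ, pinned);
        if (nr < NR_REQ)                /* fast path gave up: fall back */
                nr += slow_gup(ptes, nr, NR_REQ, pinned);

        printf("pinned %d of %d pages\n", nr, NR_REQ);
        return 0;
}

In the kernel the equivalent fallback happens in get_user_pages_fast(), which hands any pages the fast walk could not pin to the regular get_user_pages() path.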
-rw-r--r--  arch/x86/mm/gup.c   13
-rw-r--r--  mm/huge_memory.c    24
-rw-r--r--  mm/migrate.c        38
3 files changed, 60 insertions, 15 deletions
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dd74e46828c0..0596e8e0cc19 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                 pte_t pte = gup_get_pte(ptep);
                 struct page *page;
 
+                /* Similar to the PMD case, NUMA hinting must take slow path */
+                if (pte_numa(pte)) {
+                        pte_unmap(ptep);
+                        return 0;
+                }
+
                 if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
                         pte_unmap(ptep);
                         return 0;
@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                         return 0;
                 if (unlikely(pmd_large(pmd))) {
+                        /*
+                         * NUMA hinting faults need to be handled in the GUP
+                         * slowpath for accounting purposes and so that they
+                         * can be serialised against THP migration.
+                         */
+                        if (pmd_numa(pmd))
+                                return 0;
                         if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
                                 return 0;
                 } else {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33a5dc492810..51f069303ab9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1243,6 +1243,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
         if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
                 return ERR_PTR(-EFAULT);
 
+        /* Full NUMA hinting faults to serialise migration in fault paths */
+        if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+                goto out;
+
         page = pmd_page(*pmd);
         VM_BUG_ON(!PageHead(page));
         if (flags & FOLL_TOUCH) {
@@ -1323,23 +1327,27 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 /* If the page was locked, there are no parallel migrations */
                 if (page_locked)
                         goto clear_pmdnuma;
+        }
 
-                /*
-                 * Otherwise wait for potential migrations and retry. We do
-                 * relock and check_same as the page may no longer be mapped.
-                 * As the fault is being retried, do not account for it.
-                 */
+        /*
+         * If there are potential migrations, wait for completion and retry. We
+         * do not relock and check_same as the page may no longer be mapped.
+         * Furtermore, even if the page is currently misplaced, there is no
+         * guarantee it is still misplaced after the migration completes.
+         */
+        if (!page_locked) {
                 spin_unlock(ptl);
                 wait_on_page_locked(page);
                 page_nid = -1;
                 goto out;
         }
 
-        /* Page is misplaced, serialise migrations and parallel THP splits */
+        /*
+         * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+         * to serialises splits
+         */
         get_page(page);
         spin_unlock(ptl);
-        if (!page_locked)
-                lock_page(page);
         anon_vma = page_lock_anon_vma_read(page);
 
         /* Confirm the PMD did not change while page_table_lock was released */
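The reworked logic above boils down to: take the page lock with trylock_page(), and if a migration already holds it, drop the page table lock, wait, and retry the fault rather than acting on stale state. Below is a minimal user-space sketch of that trylock-or-wait pattern, using a pthread mutex as a stand-in for the page lock; migrate_thread() and hinting_fault() are hypothetical illustrations, not the kernel functions.

/* User-space sketch of the trylock-or-wait pattern in do_huge_pmd_numa_page
 * (hypothetical stand-ins; a pthread mutex instead of the kernel page lock). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;  /* "page lock" */
static int page_misplaced = 1;

/* Migration side: holds the "page lock" for the whole copy + remap. */
static void *migrate_thread(void *arg)
{
        pthread_mutex_lock(&page_lock);
        page_misplaced = 0;             /* copy page, update PMD, ... */
        pthread_mutex_unlock(&page_lock);
        return NULL;
}

/* Fault side: mirrors the patched logic - trylock, else wait and retry. */
static void hinting_fault(void)
{
        if (pthread_mutex_trylock(&page_lock) != 0) {
                /* Parallel migration in progress: wait for it to finish, then
                 * retry the fault instead of using stale information. */
                pthread_mutex_lock(&page_lock);
                pthread_mutex_unlock(&page_lock);
                printf("fault retried after migration\n");
                return;
        }
        /* Lock held: no parallel migration, safe to act on the page. */
        printf("page %s\n", page_misplaced ? "misplaced" : "already moved");
        pthread_mutex_unlock(&page_lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, migrate_thread, NULL);
        hinting_fault();
        pthread_join(&t, NULL);
        return 0;
}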
diff --git a/mm/migrate.c b/mm/migrate.c
index bb940045fe85..2cabbd5fa5bf 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1722,6 +1722,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
         struct page *new_page = NULL;
         struct mem_cgroup *memcg = NULL;
         int page_lru = page_is_file_cache(page);
+        pmd_t orig_entry;
 
         /*
          * Rate-limit the amount of data that is being migrated to a node.
@@ -1756,7 +1757,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
         /* Recheck the target PMD */
         ptl = pmd_lock(mm, pmd);
-        if (unlikely(!pmd_same(*pmd, entry))) {
+        if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
                 spin_unlock(ptl);
 
                 /* Reverse changes made by migrate_page_copy() */
@@ -1786,16 +1788,34 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
          */
         mem_cgroup_prepare_migration(page, new_page, &memcg);
 
+        orig_entry = *pmd;
         entry = mk_pmd(new_page, vma->vm_page_prot);
-        entry = pmd_mknonnuma(entry);
-        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
         entry = pmd_mkhuge(entry);
+        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
+        /*
+         * Clear the old entry under pagetable lock and establish the new PTE.
+         * Any parallel GUP will either observe the old page blocking on the
+         * page lock, block on the page table lock or observe the new page.
+         * The SetPageUptodate on the new page and page_add_new_anon_rmap
+         * guarantee the copy is visible before the pagetable update.
+         */
+        flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+        page_add_new_anon_rmap(new_page, vma, haddr);
         pmdp_clear_flush(vma, haddr, pmd);
         set_pmd_at(mm, haddr, pmd, entry);
-        page_add_new_anon_rmap(new_page, vma, haddr);
         update_mmu_cache_pmd(vma, address, &entry);
+
+        if (page_count(page) != 2) {
+                set_pmd_at(mm, haddr, pmd, orig_entry);
+                flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+                update_mmu_cache_pmd(vma, address, &entry);
+                page_remove_rmap(new_page);
+                goto fail_putback;
+        }
+
         page_remove_rmap(page);
+
         /*
          * Finish the charge transaction under the page table lock to
          * prevent split_huge_page() from dividing up the charge
@@ -1820,9 +1840,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 out_fail:
         count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
-        entry = pmd_mknonnuma(entry);
-        set_pmd_at(mm, haddr, pmd, entry);
-        update_mmu_cache_pmd(vma, address, &entry);
+        ptl = pmd_lock(mm, pmd);
+        if (pmd_same(*pmd, entry)) {
+                entry = pmd_mknonnuma(entry);
+                set_pmd_at(mm, haddr, pmd, entry);
+                update_mmu_cache_pmd(vma, address, &entry);
+        }
+        spin_unlock(ptl);
 
         unlock_page(page);
         put_page(page);
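The migrate.c side follows an install, recheck, roll back pattern: the new PMD is written, page_count() of the old page is re-read while the page table lock is still held, and if a parallel GUP took a reference in the window the original entry is restored and the code jumps to fail_putback. A minimal sketch of that pattern follows, with a C11 atomic standing in for the page refcount; replace_mapping() and struct fake_page are hypothetical, not kernel code.

/* Sketch of the install/recheck/rollback pattern used around
 * pmdp_clear_flush()/set_pmd_at() above (hypothetical stand-ins). */
#include <stdatomic.h>
#include <stdio.h>

struct fake_page { atomic_int refcount; };

/* Install the new mapping, then verify nobody took an extra reference on the
 * old page in the meantime; if they did, undo the update. */
static int replace_mapping(struct fake_page *old, int *pmd, int new_entry,
                           int orig_entry)
{
        *pmd = new_entry;                       /* set_pmd_at(new)           */

        if (atomic_load(&old->refcount) != 2) { /* page_count(page) != 2     */
                *pmd = orig_entry;              /* restore, then fail_putback */
                return -1;
        }
        return 0;
}

int main(void)
{
        struct fake_page old = { .refcount = 2 };  /* expected refs, no race */
        int pmd = 1, orig = 1;

        if (replace_mapping(&old, &pmd, 2, orig))
                printf("raced with GUP, rolled back (pmd=%d)\n", pmd);
        else
                printf("migration committed (pmd=%d)\n", pmd);
        return 0;
}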