aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKirill A. Shutemov <kirill.shutemov@linux.intel.com>2017-04-13 17:56:20 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-04-13 21:24:20 -0400
commitced108037c2aa542b3ed8b7afd1576064ad1362a (patch)
tree9bbfcea14b7f7098f667e213a6cc5851e7adc105
parent0a85e51d37645e9ce57e5e1a30859e07810ed07c (diff)
thp: fix MADV_DONTNEED vs. numa balancing race
In the prot_numa case, we are under down_read(mmap_sem). It's critical
not to clear the pmd intermittently, to avoid a race with MADV_DONTNEED,
which also runs under down_read(mmap_sem):

    CPU0:                               CPU1:
                                        change_huge_pmd(prot_numa=1)
                                         pmdp_huge_get_and_clear_notify()
    madvise_dontneed()
     zap_pmd_range()
      pmd_trans_huge(*pmd) == 0 (without ptl)
      // skip the pmd
                                         set_pmd_at();
                                         // pmd is re-established

The race makes MADV_DONTNEED miss the huge pmd and fail to clear it,
which may break userspace.

Found by code analysis; never seen triggered.

Link: http://lkml.kernel.org/r/20170302151034.27829-3-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- mm/huge_memory.c 34
1 files changed, 33 insertions, 1 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a513861a9037..26769465af63 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1746,7 +1746,39 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1746 if (prot_numa && pmd_protnone(*pmd)) 1746 if (prot_numa && pmd_protnone(*pmd))
1747 goto unlock; 1747 goto unlock;
1748 1748
1749 entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); 1749 /*
1750 * In case prot_numa, we are under down_read(mmap_sem). It's critical
1751 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
1752 * which is also under down_read(mmap_sem):
1753 *
1754 * CPU0: CPU1:
1755 * change_huge_pmd(prot_numa=1)
1756 * pmdp_huge_get_and_clear_notify()
1757 * madvise_dontneed()
1758 * zap_pmd_range()
1759 * pmd_trans_huge(*pmd) == 0 (without ptl)
1760 * // skip the pmd
1761 * set_pmd_at();
1762 * // pmd is re-established
1763 *
1764 * The race makes MADV_DONTNEED miss the huge pmd and fail to clear it,
1765 * which may break userspace.
1766 *
1767 * pmdp_invalidate() is required to make sure we don't miss
1768 * dirty/young flags set by hardware.
1769 */
1770 entry = *pmd;
1771 pmdp_invalidate(vma, addr, pmd);
1772
1773 /*
1774 * Recover dirty/young flags. It relies on pmdp_invalidate to not
1775 * corrupt them.
1776 */
1777 if (pmd_dirty(*pmd))
1778 entry = pmd_mkdirty(entry);
1779 if (pmd_young(*pmd))
1780 entry = pmd_mkyoung(entry);
1781
1750 entry = pmd_modify(entry, newprot); 1782 entry = pmd_modify(entry, newprot);
1751 if (preserve_write) 1783 if (preserve_write)
1752 entry = pmd_mk_savedwrite(entry); 1784 entry = pmd_mk_savedwrite(entry);