diff options
author | Kirill A. Shutemov <kirill.shutemov@linux.intel.com> | 2017-04-13 17:56:20 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-04-13 21:24:20 -0400 |
commit | ced108037c2aa542b3ed8b7afd1576064ad1362a (patch) | |
tree | 9bbfcea14b7f7098f667e213a6cc5851e7adc105 | |
parent | 0a85e51d37645e9ce57e5e1a30859e07810ed07c (diff) |
thp: fix MADV_DONTNEED vs. numa balancing race
In the prot_numa case, we are under down_read(mmap_sem). It's critical to
not clear the pmd, even temporarily, to avoid a race with MADV_DONTNEED, which
is also under down_read(mmap_sem):
    CPU0:                                    CPU1:
                                             change_huge_pmd(prot_numa=1)
                                              pmdp_huge_get_and_clear_notify()
    madvise_dontneed()
     zap_pmd_range()
      pmd_trans_huge(*pmd) == 0 (without ptl)
      // skip the pmd
                                              set_pmd_at();
                                              // pmd is re-established
The race makes MADV_DONTNEED miss the huge pmd and fail to clear it,
which may break userspace.
Found by code analysis; never seen triggered in practice.
Link: http://lkml.kernel.org/r/20170302151034.27829-3-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/huge_memory.c | 34 |
1 files changed, 33 insertions, 1 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a513861a9037..26769465af63 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1746,7 +1746,39 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1746 | if (prot_numa && pmd_protnone(*pmd)) | 1746 | if (prot_numa && pmd_protnone(*pmd)) |
1747 | goto unlock; | 1747 | goto unlock; |
1748 | 1748 | ||
1749 | entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); | 1749 | /* |
1750 | * In case prot_numa, we are under down_read(mmap_sem). It's critical | ||
1751 | * to not clear pmd intermittently to avoid race with MADV_DONTNEED | ||
1752 | * which is also under down_read(mmap_sem): | ||
1753 | * | ||
1754 | * CPU0: CPU1: | ||
1755 | * change_huge_pmd(prot_numa=1) | ||
1756 | * pmdp_huge_get_and_clear_notify() | ||
1757 | * madvise_dontneed() | ||
1758 | * zap_pmd_range() | ||
1759 | * pmd_trans_huge(*pmd) == 0 (without ptl) | ||
1760 | * // skip the pmd | ||
1761 | * set_pmd_at(); | ||
1762 | * // pmd is re-established | ||
1763 | * | ||
1764 | * The race makes MADV_DONTNEED miss the huge pmd and don't clear it | ||
1765 | * which may break userspace. | ||
1766 | * | ||
1767 | * pmdp_invalidate() is required to make sure we don't miss | ||
1768 | * dirty/young flags set by hardware. | ||
1769 | */ | ||
1770 | entry = *pmd; | ||
1771 | pmdp_invalidate(vma, addr, pmd); | ||
1772 | |||
1773 | /* | ||
1774 | * Recover dirty/young flags. It relies on pmdp_invalidate to not | ||
1775 | * corrupt them. | ||
1776 | */ | ||
1777 | if (pmd_dirty(*pmd)) | ||
1778 | entry = pmd_mkdirty(entry); | ||
1779 | if (pmd_young(*pmd)) | ||
1780 | entry = pmd_mkyoung(entry); | ||
1781 | |||
1750 | entry = pmd_modify(entry, newprot); | 1782 | entry = pmd_modify(entry, newprot); |
1751 | if (preserve_write) | 1783 | if (preserve_write) |
1752 | entry = pmd_mk_savedwrite(entry); | 1784 | entry = pmd_mk_savedwrite(entry); |