aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
author: Andy Lutomirski <luto@kernel.org> 2018-01-25 16:12:14 -0500
committer: Thomas Gleixner <tglx@linutronix.de> 2018-01-26 09:56:23 -0500
commit5beda7d54eafece4c974cfa9fbb9f60fb18fd20a (patch)
treee7d14f0b4a300cff583ace3791c638ed9f3c993c /arch
parent1d080f096fe33f031d26e19b3ef0146f66b8b0f1 (diff)
x86/mm/64: Fix vmapped stack syncing on very-large-memory 4-level systems
Neil Berrington reported a double-fault on a VM with 768GB of RAM that uses large amounts of vmalloc space with PTI enabled. The cause is that load_new_mm_cr3() was never fixed to take the 5-level pgd folding code into account, so, on a 4-level kernel, the pgd synchronization logic compiles away to exactly nothing. Interestingly, the problem doesn't trigger with nopti. I assume this is because the kernel is mapped with global pages if we boot with nopti. The sequence of operations when we create a new task is that we first load its mm while still running on the old stack (which crashes if the old stack is unmapped in the new mm unless the TLB saves us), then we call prepare_switch_to(), and then we switch to the new stack. prepare_switch_to() pokes the new stack directly, which will populate the mapping through vmalloc_fault(). I assume that we're getting lucky on non-PTI systems -- the old stack's TLB entry stays alive long enough to make it all the way through prepare_switch_to() and switch_to() so that we make it to a valid stack. Fixes: b50858ce3e2a ("x86/mm/vmalloc: Add 5-level paging support") Reported-and-tested-by: Neil Berrington <neil.berrington@datacore.com> Signed-off-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> Cc: stable@vger.kernel.org Cc: Dave Hansen <dave.hansen@intel.com> Cc: Borislav Petkov <bp@alien8.de> Link: https://lkml.kernel.org/r/346541c56caed61abbe693d7d2742b4a380c5001.1516914529.git.luto@kernel.org
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/mm/tlb.c | 34
1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a1561957dccb..5bfe61a5e8e3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -151,6 +151,34 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
151 local_irq_restore(flags); 151 local_irq_restore(flags);
152} 152}
153 153
/*
 * Make sure the top-level page-table entry covering the current stack
 * is present in @mm before we switch to that mm.  With CONFIG_VMAP_STACK
 * the stack lives in the vmalloc area, and (per the commit message above)
 * loading an mm that has not yet synced that entry double-faults, because
 * the stack becomes unreachable while we are still running on it.
 *
 * The entry is copied from the kernel reference tables (pgd_offset_k()).
 */
154static void sync_current_stack_to_mm(struct mm_struct *mm)
155{
156	unsigned long sp = current_stack_pointer;
157	pgd_t *pgd = pgd_offset(mm, sp);
158
	/*
	 * True 5-level paging: the pgd entries are real top-level entries,
	 * so sync at the pgd level.
	 */
159	if (CONFIG_PGTABLE_LEVELS > 4) {
160		if (unlikely(pgd_none(*pgd))) {
161			pgd_t *pgd_ref = pgd_offset_k(sp);
162
163			set_pgd(pgd, *pgd_ref);
164		}
165	} else {
166		/*
167		 * "pgd" is faked. The top level entries are "p4d"s, so sync
168		 * the p4d. This compiles to approximately the same code as
169		 * the 5-level case.
170		 */
		/*
		 * NOTE(review): with 4-level paging the pgd level is folded,
		 * so pgd_none() above would compile away to nothing — the
		 * original bug this commit fixes.  The real top-level check
		 * therefore happens at the p4d level here.
		 */
171		p4d_t *p4d = p4d_offset(pgd, sp);
172
173		if (unlikely(p4d_none(*p4d))) {
174			pgd_t *pgd_ref = pgd_offset_k(sp);
175			p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
176
177			set_p4d(p4d, *p4d_ref);
178		}
179	}
180}
181
154void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, 182void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
155 struct task_struct *tsk) 183 struct task_struct *tsk)
156{ 184{
@@ -226,11 +254,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
226 * mapped in the new pgd, we'll double-fault. Forcibly 254 * mapped in the new pgd, we'll double-fault. Forcibly
227 * map it. 255 * map it.
228 */ 256 */
229 unsigned int index = pgd_index(current_stack_pointer); 257 sync_current_stack_to_mm(next);
230 pgd_t *pgd = next->pgd + index;
231
232 if (unlikely(pgd_none(*pgd)))
233 set_pgd(pgd, init_mm.pgd[index]);
234 } 258 }
235 259
236 /* Stop remote flushes for the previous mm */ 260 /* Stop remote flushes for the previous mm */