author	Rik van Riel <riel@surriel.com>	2018-07-16 15:03:32 -0400
committer	Ingo Molnar <mingo@kernel.org>	2018-07-17 03:35:31 -0400
commit	2ff6ddf19c0ec40633bd14d8fe28a289816bd98d (patch)
tree	e608a4aa5331e3fcd5a1b00a6c65de41b6563eb5
parent	c1a2f7f0c06454387c2cd7b93ff1491c715a8c69 (diff)
x86/mm/tlb: Leave lazy TLB mode at page table free time
Andy discovered that speculative memory accesses while in lazy TLB mode can crash a system, when a CPU tries to dereference a speculative access using memory contents that used to be valid page table memory, but have since been reused for something else and point into la-la land.

This problem can be prevented in two ways. The first is to always send a TLB shootdown IPI to CPUs in lazy TLB mode, while the second is to only send the TLB shootdown at page table freeing time.

The second should result in fewer IPIs, since operations like mprotect and madvise are very common with some workloads, but do not involve page table freeing. Also, on munmap, batching of page table freeing covers much larger ranges of virtual memory than the batching of unmapped user pages.

Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: efault@gmx.de
Cc: kernel-team@fb.com
Cc: luto@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-3-riel@surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
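As a rough sketch of what the patch below sets up on x86 (illustration derived from the hunks that follow, not text or code from the original commit):

	/*
	 * Sketch of the call flow introduced by this patch (illustrative only):
	 *
	 *   tlb_table_flush(tlb)                                   mm/memory.c
	 *     tlb_flush_remove_tables(tlb->mm)                     arch/x86/mm/tlb.c
	 *       smp_call_function_many(mm_cpumask(mm),
	 *                              tlb_flush_remove_tables_local, mm, 1)
	 *         tlb_flush_remove_tables_local(mm)                on each targeted CPU
	 *           switch_mm_irqs_off(NULL, &init_mm, NULL)       only if that CPU is
	 *                                                          lazily using mm
	 */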
-rw-r--r--	arch/x86/include/asm/tlbflush.h	5
-rw-r--r--	arch/x86/mm/tlb.c	27
-rw-r--r--	include/asm-generic/tlb.h	10
-rw-r--r--	mm/memory.c	22
4 files changed, 56 insertions(+), 8 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6690cd3fc8b1..3aa3204b5dc0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -554,4 +554,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 	native_flush_tlb_others(mask, info)
 #endif
 
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
 #endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34c3c85..9a893673c56b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -646,6 +646,33 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	put_cpu();
 }
 
+void tlb_flush_remove_tables_local(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
+			this_cpu_read(cpu_tlbstate.is_lazy)) {
+		/*
+		 * We're in lazy mode. We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB. Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
+		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
+	}
+}
+
+void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+	int cpu = get_cpu();
+	/*
+	 * XXX: this really only needs to be called for CPUs in lazy TLB mode.
+	 */
+	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+		smp_call_function_many(mm_cpumask(mm), tlb_flush_remove_tables_local, (void *)mm, 1);
+
+	put_cpu();
+}
 
 static void do_flush_tlb_all(void *info)
 {
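The generic code is expected to call the new hook once per mmu_gather, before the batched page tables are actually freed; a minimal, hypothetical caller (the real one is tlb_table_flush() in the mm/memory.c hunk further down) would look roughly like:

	/* Hypothetical caller sketch; see the mm/memory.c hunk below for the real one. */
	static void example_flush_before_freeing_tables(struct mmu_gather *tlb)
	{
		/* IPI the mm's CPUs so lazy-TLB users switch away from this mm... */
		tlb_flush_remove_tables(tlb->mm);

		/* ...after which the batched page tables can safely be freed. */
	}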
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3063125197ad..e811ef7b8350 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,4 +303,14 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
+/*
+ * Used to flush the TLB when page tables are removed, when lazy
+ * TLB mode may cause a CPU to retain intermediate translations
+ * pointing to about-to-be-freed page table memory.
+ */
+#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
+#define tlb_flush_remove_tables(mm) do {} while (0)
+#define tlb_flush_remove_tables_local(mm) do {} while (0)
+#endif
+
 #endif /* _ASM_GENERIC__TLB_H */
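Architectures that want the stronger behaviour opt in from their own asm/tlbflush.h, as the x86 hunk above does; everything else silently gets the no-op macros. A minimal opt-in would look like this (sketch only, mirroring the x86 declarations above):

	/* In <asm/tlbflush.h> of an architecture that opts in (sketch): */
	#define HAVE_TLB_FLUSH_REMOVE_TABLES

	extern void tlb_flush_remove_tables(struct mm_struct *mm);
	extern void tlb_flush_remove_tables_local(void *arg);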
diff --git a/mm/memory.c b/mm/memory.c
index 7206a634270b..18355e0b971a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 
-/*
- * See the comment near struct mmu_table_batch.
- */
-
 static void tlb_remove_table_smp_sync(void *arg)
 {
-	/* Simply deliver the interrupt */
+	struct mm_struct __maybe_unused *mm = arg;
+	/*
+	 * On most architectures this does nothing. Simply delivering the
+	 * interrupt is enough to prevent races with software page table
+	 * walking like that done in get_user_pages_fast.
+	 *
+	 * See the comment near struct mmu_table_batch.
+	 */
+	tlb_flush_remove_tables_local(mm);
 }
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling. See the comment near struct mmu_table_batch.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
 	__tlb_remove_table(table);
 }
 
@@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
 {
 	struct mmu_table_batch **batch = &tlb->batch;
 
+	tlb_flush_remove_tables(tlb->mm);
+
 	if (*batch) {
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
@@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
-			tlb_remove_table_one(table, tlb);
+			tlb_remove_table_one(table, tlb);
 			return;
 		}
 		(*batch)->nr = 0;
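Taken together, the mm/memory.c changes cover both the batched path and the allocation-failure fallback; a summary sketch (not part of the diff) of the resulting behaviour:

	/*
	 * Batched path:
	 *   tlb_table_flush(tlb)
	 *     tlb_flush_remove_tables(tlb->mm)             IPI lazy-TLB CPUs first
	 *     call_rcu_sched(..., tlb_remove_table_rcu)    then free the tables
	 *
	 * Fallback path (batch page allocation failed):
	 *   tlb_remove_table_one(table, tlb)
	 *     smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1)
	 *       tlb_flush_remove_tables_local(mm)          on each other CPU
	 *     __tlb_remove_table(table)
	 */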