author	Rik van Riel <riel@surriel.com>	2018-07-16 15:03:34 -0400
committer	Ingo Molnar <mingo@kernel.org>	2018-07-17 03:35:33 -0400
commit	ac0315896970d8589291e9d8a1569fc65967b7f1 (patch)
tree	bae8564b79d095101b5b12a894c811657d6645ee
parent	61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b (diff)
x86/mm/tlb: Make lazy TLB mode lazier
Lazy TLB mode can result in an idle CPU being woken up by a TLB flush,
when all it really needs to do is reload %CR3 at the next context switch,
assuming no page table pages got freed.

Memory ordering is used to prevent race conditions between switch_mm_irqs_off,
which checks whether .tlb_gen changed, and the TLB invalidation code, which
increments .tlb_gen whenever page table entries get invalidated.

The atomic increment in inc_mm_tlb_gen is its own barrier; the context
switch code adds an explicit barrier between reading tlbstate.is_lazy and
next->context.tlb_gen.

Unlike the 2016 version of this patch, CPUs with cpu_tlbstate.is_lazy set
are not removed from the mm_cpumask(mm), since that would prevent the TLB
flush IPIs at page table free time from being sent to all the CPUs
that need them.

This patch reduces total CPU use in the system by about 1-2% for a
memcache workload on two socket systems, and by about 1% for a heavily
multi-process netperf between two systems.

Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: efault@gmx.de
Cc: kernel-team@fb.com
Cc: luto@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-5-riel@surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
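[Editor's note] The barrier pairing described in the commit message can be modelled in ordinary userspace C11, which may make the ordering easier to see. The sketch below is only an illustration under invented names: mm_tlb_gen, cpu_is_lazy and local_tlb_gen stand in for mm->context.tlb_gen, cpu_tlbstate.is_lazy and the per-CPU generation in cpu_tlbstate.ctxs[]. It is not the kernel code, and the real lazy-TLB paths track far more state.

/*
 * Userspace model of the ordering above, using C11 atomics and pthreads.
 * Illustration only, not kernel code: all names below are stand-ins.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ullong mm_tlb_gen = 1;         /* models mm->context.tlb_gen    */
static atomic_bool   cpu_is_lazy = true;     /* models cpu_tlbstate.is_lazy   */
static unsigned long long local_tlb_gen = 1; /* generation this CPU flushed to */

/* Flush side: bump the generation first, then decide whether to IPI. */
static void *flusher(void *arg)
{
	(void)arg;

	/* Like inc_mm_tlb_gen(): a seq_cst RMW, which acts as a full barrier. */
	atomic_fetch_add(&mm_tlb_gen, 1);

	if (atomic_load(&cpu_is_lazy))
		puts("flusher: target is lazy, skip the IPI; it catches up at switch_mm");
	else
		puts("flusher: target not lazy, send the flush IPI");
	return NULL;
}

/* Context-switch side: leave lazy mode, then re-check the generation. */
static void *switcher(void *arg)
{
	(void)arg;

	bool was_lazy = atomic_load(&cpu_is_lazy);
	atomic_store(&cpu_is_lazy, false);

	/* Models the smp_mb() between reading is_lazy and reading tlb_gen. */
	atomic_thread_fence(memory_order_seq_cst);

	unsigned long long next_tlb_gen = atomic_load(&mm_tlb_gen);
	if (was_lazy && next_tlb_gen != local_tlb_gen) {
		puts("switcher: generation moved while lazy, flush/reload CR3 now");
		local_tlb_gen = next_tlb_gen;
	} else {
		puts("switcher: TLB still current, no flush needed");
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, flusher, NULL);
	pthread_create(&b, NULL, switcher, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

With both sides ordered this way, at least one of them always notices the other: either the flushing CPU sees the target as no longer lazy and sends the IPI, or the formerly lazy CPU sees the bumped generation at its next switch_mm and flushes then, so a stale TLB never survives both checks.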
-rw-r--r--	arch/x86/mm/tlb.c	68
1 file changed, 59 insertions, 9 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4b73fe835c95..26542cc17043 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -185,6 +186,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
 	bool need_flush;
@@ -242,17 +244,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask().  We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
@@ -454,6 +479,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -560,6 +588,9 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
+	cpumask_var_t lazymask;
+	unsigned int cpu;
+
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +614,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
-		unsigned int cpu;
-
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -592,8 +621,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+	/*
+	 * A temporary cpumask is used in order to skip sending IPIs
+	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
+	 */
+	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
+				       (void *)info, 1);
+		return;
+	}
+
+	cpumask_copy(lazymask, cpumask);
+
+	for_each_cpu(cpu, lazymask) {
+		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_clear_cpu(cpu, lazymask);
+	}
+
+	smp_call_function_many(lazymask, flush_tlb_func_remote,
 			       (void *)info, 1);
+
+	free_cpumask_var(lazymask);
 }
 
 /*
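[Editor's note] To make the new IPI-targeting rule in native_flush_tlb_others concrete, here is a small standalone sketch; it is not kernel code, and the CPU count and boolean arrays are invented example state. Every CPU that has run the mm stays in mm_cpumask, but only the non-lazy ones receive the flush IPI; lazy CPUs defer to the tlb_gen check in switch_mm_irqs_off.

/*
 * Standalone illustration of the IPI-targeting rule, not kernel code.
 * mm_cpumask[] and is_lazy[] are invented example state.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

int main(void)
{
	/* CPUs that have ever run this mm, and which of them are currently lazy. */
	bool mm_cpumask[NR_CPUS] = { [1] = true, [3] = true, [5] = true, [6] = true };
	bool is_lazy[NR_CPUS]    = { [3] = true, [6] = true };

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!mm_cpumask[cpu])
			continue;	/* never ran this mm: nothing to flush */
		if (is_lazy[cpu])
			printf("cpu%d: lazy, skip IPI (revalidates tlb_gen at switch_mm)\n", cpu);
		else
			printf("cpu%d: send flush IPI\n", cpu);
	}
	return 0;
}

If alloc_cpumask_var fails, the patch simply falls back to IPIing every CPU in the mask, trading a few unnecessary wakeups for correctness.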