author	Rik van Riel <riel@surriel.com>	2018-07-16 15:03:34 -0400
committer	Ingo Molnar <mingo@kernel.org>	2018-07-17 03:35:33 -0400
commit	ac0315896970d8589291e9d8a1569fc65967b7f1 (patch)
tree	bae8564b79d095101b5b12a894c811657d6645ee
parent	61d0beb5796ab11f7f3bf38cb2eccc6579aaa70b (diff)
x86/mm/tlb: Make lazy TLB mode lazier
Lazy TLB mode can result in an idle CPU being woken up by a TLB flush,
when all it really needs to do is reload %CR3 at the next context switch,
assuming no page table pages got freed.

Memory ordering is used to prevent race conditions between switch_mm_irqs_off,
which checks whether .tlb_gen changed, and the TLB invalidation code, which
increments .tlb_gen whenever page table entries get invalidated.

The atomic increment in inc_mm_tlb_gen is its own barrier; the context
switch code adds an explicit barrier between reading tlbstate.is_lazy and
next->context.tlb_gen.

Unlike the 2016 version of this patch, CPUs with cpu_tlbstate.is_lazy set
are not removed from the mm_cpumask(mm), since that would prevent the TLB
flush IPIs at page table free time from being sent to all the CPUs
that need them.

This patch reduces total CPU use in the system by about 1-2% for a
memcache workload on two socket systems, and by about 1% for a heavily
multi-process netperf between two systems.

Tested-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Rik van Riel <riel@surriel.com>
Acked-by: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: efault@gmx.de
Cc: kernel-team@fb.com
Cc: luto@kernel.org
Link: http://lkml.kernel.org/r/20180716190337.26133-5-riel@surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
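[Editor's note] The barrier pairing described in the commit message can be modelled in ordinary userspace C11, which may make the ordering easier to see. The sketch below is only an illustration under invented names: mm_tlb_gen, cpu_is_lazy and local_tlb_gen stand in for mm->context.tlb_gen, cpu_tlbstate.is_lazy and the per-CPU generation in cpu_tlbstate.ctxs[]. It is not the kernel code, and the real lazy-TLB paths track far more state.

/*
 * Userspace model of the ordering above, using C11 atomics and pthreads.
 * Illustration only, not kernel code: all names below are stand-ins.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ullong mm_tlb_gen = 1;         /* models mm->context.tlb_gen    */
static atomic_bool   cpu_is_lazy = true;     /* models cpu_tlbstate.is_lazy   */
static unsigned long long local_tlb_gen = 1; /* generation this CPU flushed to */

/* Flush side: bump the generation first, then decide whether to IPI. */
static void *flusher(void *arg)
{
	(void)arg;

	/* Like inc_mm_tlb_gen(): a seq_cst RMW, which acts as a full barrier. */
	atomic_fetch_add(&mm_tlb_gen, 1);

	if (atomic_load(&cpu_is_lazy))
		puts("flusher: target is lazy, skip the IPI; it catches up at switch_mm");
	else
		puts("flusher: target not lazy, send the flush IPI");
	return NULL;
}

/* Context-switch side: leave lazy mode, then re-check the generation. */
static void *switcher(void *arg)
{
	(void)arg;

	bool was_lazy = atomic_load(&cpu_is_lazy);
	atomic_store(&cpu_is_lazy, false);

	/* Models the smp_mb() between reading is_lazy and reading tlb_gen. */
	atomic_thread_fence(memory_order_seq_cst);

	unsigned long long next_tlb_gen = atomic_load(&mm_tlb_gen);
	if (was_lazy && next_tlb_gen != local_tlb_gen) {
		puts("switcher: generation moved while lazy, flush/reload CR3 now");
		local_tlb_gen = next_tlb_gen;
	} else {
		puts("switcher: TLB still current, no flush needed");
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, flusher, NULL);
	pthread_create(&b, NULL, switcher, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

With both sides ordered this way, at least one of them always notices the other: either the flushing CPU sees the target as no longer lazy and sends the IPI, or the formerly lazy CPU sees the bumped generation at its next switch_mm and flushes then, so a stale TLB never survives both checks.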
-rw-r--r--	arch/x86/mm/tlb.c	68
1 file changed, 59 insertions, 9 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4b73fe835c95..26542cc17043 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -185,6 +186,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
 	bool need_flush;
@@ -242,17 +244,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask().  We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
@@ -454,6 +479,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -560,6 +588,9 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
+	cpumask_var_t lazymask;
+	unsigned int cpu;
+
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -583,8 +614,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
-		unsigned int cpu;
-
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -592,8 +621,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+	/*
+	 * A temporary cpumask is used in order to skip sending IPIs
+	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
+	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
+	 */
+	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
+				       (void *)info, 1);
+		return;
+	}
+
+	cpumask_copy(lazymask, cpumask);
+
+	for_each_cpu(cpu, lazymask) {
+		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
+			cpumask_clear_cpu(cpu, lazymask);
+	}
+
+	smp_call_function_many(lazymask, flush_tlb_func_remote,
 			       (void *)info, 1);
+
+	free_cpumask_var(lazymask);
 }
 
 /*
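[Editor's note] To make the new IPI-targeting rule in native_flush_tlb_others concrete, here is a small standalone sketch; it is not kernel code, and the CPU count and boolean arrays are invented example state. Every CPU that has run the mm stays in mm_cpumask, but only the non-lazy ones receive the flush IPI; lazy CPUs defer to the tlb_gen check in switch_mm_irqs_off.

/*
 * Standalone illustration of the IPI-targeting rule, not kernel code.
 * mm_cpumask[] and is_lazy[] are invented example state.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

int main(void)
{
	/* CPUs that have ever run this mm, and which of them are currently lazy. */
	bool mm_cpumask[NR_CPUS] = { [1] = true, [3] = true, [5] = true, [6] = true };
	bool is_lazy[NR_CPUS]    = { [3] = true, [6] = true };

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!mm_cpumask[cpu])
			continue;	/* never ran this mm: nothing to flush */
		if (is_lazy[cpu])
			printf("cpu%d: lazy, skip IPI (revalidates tlb_gen at switch_mm)\n", cpu);
		else
			printf("cpu%d: send flush IPI\n", cpu);
	}
	return 0;
}

If alloc_cpumask_var fails, the patch simply falls back to IPIing every CPU in the mask, trading a few unnecessary wakeups for correctness.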