author	Andy Lutomirski <luto@kernel.org>	2017-11-04 07:16:12 -0400
committer	Ingo Molnar <mingo@kernel.org>	2017-11-04 10:01:50 -0400
commit	675357362aeba19688440eb1aaa7991067f73b12 (patch)
tree	1c0de6b9876ab1378e9f98771c65ce38b59d05f0
parent	5f479447d983111c039f1d6d958553c1ad1b2ff1 (diff)
Revert "x86/mm: Stop calling leave_mm() in idle code"
This reverts commit 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5.

The reason I removed the leave_mm() calls in question is that the heuristic wasn't needed after that patch. With the original version of my PCID series, we never flushed a "lazy CPU" (i.e. a CPU running a kernel thread) due to a flush on the loaded mm.

Unfortunately, that caused architectural issues, so now I've reinstated these flushes on non-PCID systems in:

  commit b956575bed91 ("x86/mm: Flush more aggressively in lazy TLB mode")

That, in turn, gives us a power management and occasionally a performance regression compared to old kernels: a process that goes into a deep idle state on a given CPU and gets its mm flushed due to activity on a different CPU will wake the idle CPU.

Reinstate the old ugly heuristic: if a CPU goes into ACPI C3 or an intel_idle state that is likely to cause a TLB flush, switch its mm to init_mm before going idle.

FWIW, this heuristic is lousy. Whether we should change CR3 before idle isn't a good hint except insofar as the performance hit is a bit lower if the TLB is getting flushed by the idle code anyway. What we really want to know is whether we anticipate being idle long enough that the mm is likely to be flushed before we wake up. This is more a matter of the expected latency than of the idle state that gets chosen. This heuristic also completely fails on systems that don't know whether the TLB will be flushed (e.g. AMD systems?). OTOH it may be a bit obsolete anyway -- PCID systems don't presently benefit from this heuristic at all.

We also shouldn't do this callback from the innermost bit of the idle code, due to the RCU nastiness it causes. All the information needed is available before rcu_idle_enter() needs to happen.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 43858b4f25cf ("x86/mm: Stop calling leave_mm() in idle code")
Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
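To make the shape of the reinstated heuristic concrete, here is a minimal illustrative sketch of an idle-entry path that applies it. This is not the exact kernel code: the function name idle_enter_example() is hypothetical, while leave_mm(), smp_processor_id() and CPUIDLE_FLAG_TLB_FLUSHED are the interfaces used in the intel_idle.c hunk below.

	/* Illustrative sketch only -- see the intel_idle.c hunk below for the real change. */
	static void idle_enter_example(struct cpuidle_state *state)
	{
		int cpu = smp_processor_id();

		/*
		 * If this idle state is expected to flush the TLB anyway,
		 * switch this CPU over to init_mm before going idle.  A later
		 * TLB flush of the old mm, triggered by activity on another
		 * CPU, then has nothing to do here and does not need to wake
		 * this CPU out of its deep idle state.
		 */
		if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
			leave_mm(cpu);

		/* ... architectural idle entry (MWAIT, ACPI C3, ...) follows ... */
	}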
-rw-r--r--	arch/ia64/include/asm/acpi.h	2
-rw-r--r--	arch/x86/include/asm/acpi.h	2
-rw-r--r--	arch/x86/mm/tlb.c	17
-rw-r--r--	drivers/acpi/processor_idle.c	2
-rw-r--r--	drivers/idle/intel_idle.c	9
5 files changed, 25 insertions(+), 7 deletions(-)
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index c86a947f5368..a3d0211970e9 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -112,6 +112,8 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
 	buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
 }
 
+#define acpi_unlazy_tlb(x)
+
 #ifdef CONFIG_ACPI_NUMA
 extern cpumask_t early_cpu_possible_map;
 #define for_each_possible_early_cpu(cpu)	\
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 72d867f6b518..8d0ec9df1cbe 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -150,6 +150,8 @@ static inline void disable_acpi(void) { }
 extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */
 
+#define acpi_unlazy_tlb(x)	leave_mm(x)
+
 #ifdef CONFIG_ACPI_APEI
 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0f3d0cea4d00..3118392cdf75 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -85,6 +85,7 @@ void leave_mm(int cpu)
 
 	switch_mm(NULL, &init_mm, NULL);
 }
+EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	       struct task_struct *tsk)
@@ -195,12 +196,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
 		write_cr3(build_cr3(next, new_asid));
-		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-				TLB_FLUSH_ALL);
+
+		/*
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently. Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this. The idle code should
+		 * be rearranged to call this before rcu_idle_enter().)
+		 */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 	} else {
 		/* The new ASID is already up to date. */
 		write_cr3(build_cr3_noflush(next, new_asid));
-		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 2736e25e9dc6..d50a7b6ccddd 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -710,6 +710,8 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
 static void acpi_idle_enter_bm(struct acpi_processor *pr,
 			       struct acpi_processor_cx *cx, bool timer_bc)
 {
+	acpi_unlazy_tlb(smp_processor_id());
+
 	/*
 	 * Must be done before busmaster disable as we might need to
 	 * access HPET !
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 5dc7ea4b6bc4..f0b06b14e782 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -913,15 +913,16 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
 	struct cpuidle_state *state = &drv->states[index];
 	unsigned long eax = flg2MWAIT(state->flags);
 	unsigned int cstate;
+	int cpu = smp_processor_id();
 
 	cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
 
 	/*
-	 * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition
-	 * will probably flush the TLB.  It's not guaranteed to flush
-	 * the TLB, though, so it's not clear that we can do anything
-	 * useful with this knowledge.
+	 * leave_mm() to avoid costly and often unnecessary wakeups
+	 * for flushing the user TLB's associated with the active mm.
 	 */
+	if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
+		leave_mm(cpu);
 
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		tick_broadcast_enter();