1 files changed, 61 insertions, 44 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..0f3d0cea4d00 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,7 @@
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
                            u16 *new_asid, bool *need_flush)
 {
@@ -80,7 +81,7 @@ void leave_mm(int cpu)
                return;
        /* Warn if we're not lazy. */
-        WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+        WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
        switch_mm(NULL, &init_mm, NULL);
 }
@@ -142,45 +143,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                __flush_tlb_all();
        }
 #endif
+        this_cpu_write(cpu_tlbstate.is_lazy, false);
        if (real_prev == next) {
-                VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+                VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
-                          next->context.ctx_id);
+                           next->context.ctx_id);
-                if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
-                        /*
-                         * There's nothing to do: we weren't lazy, and we
-                         * aren't changing our mm.  We don't need to flush
-                         * anything, nor do we need to update CR3, CR4, or
-                         * LDTR.
-                         */
-                        return;
-                }
-                /* Resume remote flushes and then read tlb_gen. */
-                cpumask_set_cpu(cpu, mm_cpumask(next));
-                next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-                if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
-                    next_tlb_gen) {
-                        /*
-                         * Ideally, we'd have a flush_tlb() variant that
-                         * takes the known CR3 value as input.  This would
-                         * be faster on Xen PV and on hypothetical CPUs
-                         * on which INVPCID is fast.
-                         */
-                        this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
-                                       next_tlb_gen);
-                        write_cr3(build_cr3(next, prev_asid));
-                        trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-                                        TLB_FLUSH_ALL);
-                }
                /*
-                 * We just exited lazy mode, which means that CR4 and/or LDTR
+                 * We don't currently support having a real mm loaded without
-                 * may be stale.  (Changes to the required CR4 and LDTR states
+                 * our cpu set in mm_cpumask().  We have all the bookkeeping
-                 * are not reflected in tlb_gen.)
+                 * in place to figure out whether we would need to flush
+                 * if our cpu were cleared in mm_cpumask(), but we don't
+                 * currently use it.
                 */
+                if (WARN_ON_ONCE(real_prev != &init_mm &&
+                                 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+                        cpumask_set_cpu(cpu, mm_cpumask(next));
+                return;
        } else {
                u16 new_asid;
                bool need_flush;
@@ -199,10 +179,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                }
                /* Stop remote flushes for the previous mm */
-                if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
+                VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-                        cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+                                real_prev != &init_mm);
+                cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-                VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
                /*
                 * Start remote flushes and then read tlb_gen.
@@ -233,6 +212,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 }
 /*
+ * Please ignore the name of this function.  It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm.  Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row.  It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+        /*
+         * Neither mm nor tsk is referenced below; the checks here read
+         * this CPU's cpu_tlbstate and tlb_defer_switch_to_init_mm().
+         *
+         * If init_mm is already the loaded mm, there is nothing to do.
+         */
+        if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+                return;
+        if (tlb_defer_switch_to_init_mm()) {
+                /*
+                 * Deferred path: no mm switch happens here; we only set
+                 * this CPU's is_lazy flag.
+                 *
+                 * There's a significant optimization that may be possible
+                 * here.  We have accurate enough TLB flush tracking that we
+                 * don't need to maintain coherence of TLB per se when we're
+                 * lazy.  We do, however, need to maintain coherence of
+                 * paging-structure caches.  We could, in principle, leave our
+                 * old mm loaded and only switch to init_mm when
+                 * tlb_remove_page() happens.
+                 */
+                this_cpu_write(cpu_tlbstate.is_lazy, true);
+        } else {
+                /* Not deferring: switch to init_mm right away. */
+                switch_mm(NULL, &init_mm, NULL);
+        }
+}
+/*
 * Call this when reinitializing a CPU.  It fixes the following potential
 * problems:
 *
@@ -303,16 +316,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
        /* This code cannot presently handle being reentered. */
        VM_WARN_ON(!irqs_disabled());
+        if (unlikely(loaded_mm == &init_mm))
+                return;
        VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
                   loaded_mm->context.ctx_id);
-        if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+        if (this_cpu_read(cpu_tlbstate.is_lazy)) {
                /*
-                 * We're in lazy mode -- don't flush.  We can get here on
+                 * We're in lazy mode.  We need to at least flush our
-                 * remote flushes due to races and on local flushes if a
+                 * paging-structure cache to avoid speculatively reading
-                 * kernel thread coincidentally flushes the mm it's lazily
+                 * garbage into our TLB.  Since switching to init_mm is barely
-                 * still using.
+                 * slower than a minimal flush, just switch to init_mm.
                 */
+                switch_mm_irqs_off(NULL, &init_mm, NULL);
                return;
        }

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..0f3d0cea4d00 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,7 @@
30		30
31	atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);	31	atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
32		32
		33
33	static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,	34	static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
34	u16 *new_asid, bool *need_flush)	35	u16 *new_asid, bool *need_flush)
35	{	36	{
@@ -80,7 +81,7 @@ void leave_mm(int cpu)
80	return;	81	return;
81		82
82	/* Warn if we're not lazy. */	83	/* Warn if we're not lazy. */
83	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));	84	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
84		85
85	switch_mm(NULL, &init_mm, NULL);	86	switch_mm(NULL, &init_mm, NULL);
86	}	87	}
@@ -142,45 +143,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
142	__flush_tlb_all();	143	__flush_tlb_all();
143	}	144	}
144	#endif	145	#endif
		146	this_cpu_write(cpu_tlbstate.is_lazy, false);
145		147
146	if (real_prev == next) {	148	if (real_prev == next) {
147	VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=	149	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
148	next->context.ctx_id);	150	next->context.ctx_id);
149
150	if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
151	/*
152	* There's nothing to do: we weren't lazy, and we
153	* aren't changing our mm. We don't need to flush
154	* anything, nor do we need to update CR3, CR4, or
155	* LDTR.
156	*/
157	return;
158	}
159
160	/* Resume remote flushes and then read tlb_gen. */
161	cpumask_set_cpu(cpu, mm_cpumask(next));
162	next_tlb_gen = atomic64_read(&next->context.tlb_gen);
163
164	if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
165	next_tlb_gen) {
166	/*
167	* Ideally, we'd have a flush_tlb() variant that
168	* takes the known CR3 value as input. This would
169	* be faster on Xen PV and on hypothetical CPUs
170	* on which INVPCID is fast.
171	*/
172	this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
173	next_tlb_gen);
174	write_cr3(build_cr3(next, prev_asid));
175	trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
176	TLB_FLUSH_ALL);
177	}
178		151
179	/*	152	/*
180	* We just exited lazy mode, which means that CR4 and/or LDTR	153	* We don't currently support having a real mm loaded without
181	* may be stale. (Changes to the required CR4 and LDTR states	154	* our cpu set in mm_cpumask(). We have all the bookkeeping
182	* are not reflected in tlb_gen.)	155	* in place to figure out whether we would need to flush
		156	* if our cpu were cleared in mm_cpumask(), but we don't
		157	* currently use it.
183	*/	158	*/
		159	if (WARN_ON_ONCE(real_prev != &init_mm &&
		160	!cpumask_test_cpu(cpu, mm_cpumask(next))))
		161	cpumask_set_cpu(cpu, mm_cpumask(next));
		162
		163	return;
184	} else {	164	} else {
185	u16 new_asid;	165	u16 new_asid;
186	bool need_flush;	166	bool need_flush;
@@ -199,10 +179,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
199	}	179	}
200		180
201	/* Stop remote flushes for the previous mm */	181	/* Stop remote flushes for the previous mm */
202	if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))	182	VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
203	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));	183	real_prev != &init_mm);
204		184	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
205	VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
206		185
207	/*	186	/*
208	* Start remote flushes and then read tlb_gen.	187	* Start remote flushes and then read tlb_gen.
@@ -233,6 +212,40 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
233	}	212	}
234		213
235	/*	214	/*
		215	* Please ignore the name of this function. It should be called
		216	* switch_to_kernel_thread().
		217	*
		218	* enter_lazy_tlb() is a hint from the scheduler that we are entering a
		219	* kernel thread or other context without an mm. Acceptable implementations
		220	* include doing nothing whatsoever, switching to init_mm, or various clever
		221	* lazy tricks to try to minimize TLB flushes.
		222	*
		223	* The scheduler reserves the right to call enter_lazy_tlb() several times
		224	* in a row. It will notify us that we're going back to a real mm by
		225	* calling switch_mm_irqs_off().
		226	*/
		227	void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
		228	{
		229	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
		230	return;
		231
		232	if (tlb_defer_switch_to_init_mm()) {
		233	/*
		234	* There's a significant optimization that may be possible
		235	* here. We have accurate enough TLB flush tracking that we
		236	* don't need to maintain coherence of TLB per se when we're
		237	* lazy. We do, however, need to maintain coherence of
		238	* paging-structure caches. We could, in principle, leave our
		239	* old mm loaded and only switch to init_mm when
		240	* tlb_remove_page() happens.
		241	*/
		242	this_cpu_write(cpu_tlbstate.is_lazy, true);
		243	} else {
		244	switch_mm(NULL, &init_mm, NULL);
		245	}
		246	}
		247
		248	/*
236	* Call this when reinitializing a CPU. It fixes the following potential	249	* Call this when reinitializing a CPU. It fixes the following potential
237	* problems:	250	* problems:
238	*	251	*
@@ -303,16 +316,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
303	/* This code cannot presently handle being reentered. */	316	/* This code cannot presently handle being reentered. */
304	VM_WARN_ON(!irqs_disabled());	317	VM_WARN_ON(!irqs_disabled());
305		318
		319	if (unlikely(loaded_mm == &init_mm))
		320	return;
		321
306	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=	322	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
307	loaded_mm->context.ctx_id);	323	loaded_mm->context.ctx_id);
308		324
309	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {	325	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
310	/*	326	/*
311	* We're in lazy mode -- don't flush. We can get here on	327	* We're in lazy mode. We need to at least flush our
312	* remote flushes due to races and on local flushes if a	328	* paging-structure cache to avoid speculatively reading
313	* kernel thread coincidentally flushes the mm it's lazily	329	* garbage into our TLB. Since switching to init_mm is barely
314	* still using.	330	* slower than a minimal flush, just switch to init_mm.
315	*/	331	*/
		332	switch_mm_irqs_off(NULL, &init_mm, NULL);
316	return;	333	return;
317	}	334	}
318		335