Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r--   arch/x86/mm/tlb.c   153
1 file changed, 111 insertions, 42 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49d9778376d7..658bf0090565 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,8 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
+
 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
			    u16 *new_asid, bool *need_flush)
 {
@@ -80,7 +82,7 @@ void leave_mm(int cpu)
		return;
 
	/* Warn if we're not lazy. */
-	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
+	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
 
	switch_mm(NULL, &init_mm, NULL);
 }
@@ -142,45 +144,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
		__flush_tlb_all();
	}
 #endif
+	this_cpu_write(cpu_tlbstate.is_lazy, false);
 
	if (real_prev == next) {
		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
			  next->context.ctx_id);
 
-		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
-			/*
-			 * There's nothing to do: we weren't lazy, and we
-			 * aren't changing our mm. We don't need to flush
-			 * anything, nor do we need to update CR3, CR4, or
-			 * LDTR.
-			 */
-			return;
-		}
-
-		/* Resume remote flushes and then read tlb_gen. */
-		cpumask_set_cpu(cpu, mm_cpumask(next));
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
-		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
-		    next_tlb_gen) {
-			/*
-			 * Ideally, we'd have a flush_tlb() variant that
-			 * takes the known CR3 value as input. This would
-			 * be faster on Xen PV and on hypothetical CPUs
-			 * on which INVPCID is fast.
-			 */
-			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
-				       next_tlb_gen);
-			write_cr3(build_cr3(next, prev_asid));
-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
-					TLB_FLUSH_ALL);
-		}
-
		/*
-		 * We just exited lazy mode, which means that CR4 and/or LDTR
-		 * may be stale. (Changes to the required CR4 and LDTR states
-		 * are not reflected in tlb_gen.)
+		 * We don't currently support having a real mm loaded without
+		 * our cpu set in mm_cpumask(). We have all the bookkeeping
+		 * in place to figure out whether we would need to flush
+		 * if our cpu were cleared in mm_cpumask(), but we don't
+		 * currently use it.
		 */
+		if (WARN_ON_ONCE(real_prev != &init_mm &&
+				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+			cpumask_set_cpu(cpu, mm_cpumask(next));
+
+		return;
	} else {
		u16 new_asid;
		bool need_flush;
@@ -199,10 +180,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
	}
 
	/* Stop remote flushes for the previous mm */
-	if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
-		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-	VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+	VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
+			real_prev != &init_mm);
+	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
 
	/*
	 * Start remote flushes and then read tlb_gen.
@@ -233,6 +213,37 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 }
 
 /*
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+		return;
+
+	if (static_branch_unlikely(&tlb_use_lazy_mode)) {
+		/*
+		 * There's a significant optimization that may be possible
+		 * here. We have accurate enough TLB flush tracking that we
+		 * don't need to maintain coherence of TLB per se when we're
+		 * lazy. We do, however, need to maintain coherence of
+		 * paging-structure caches. We could, in principle, leave our
+		 * old mm loaded and only switch to init_mm when
+		 * tlb_remove_page() happens.
+		 */
+		this_cpu_write(cpu_tlbstate.is_lazy, true);
+	} else {
+		switch_mm(NULL, &init_mm, NULL);
+	}
+}
+
+/*
  * Call this when reinitializing a CPU. It fixes the following potential
  * problems:
  *
@@ -303,16 +314,20 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
	/* This code cannot presently handle being reentered. */
	VM_WARN_ON(!irqs_disabled());
 
+	if (unlikely(loaded_mm == &init_mm))
+		return;
+
	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
		   loaded_mm->context.ctx_id);
 
-	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
+	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
		/*
-		 * We're in lazy mode -- don't flush. We can get here on
-		 * remote flushes due to races and on local flushes if a
-		 * kernel thread coincidentally flushes the mm it's lazily
-		 * still using.
+		 * We're in lazy mode. We need to at least flush our
+		 * paging-structure cache to avoid speculatively reading
+		 * garbage into our TLB. Since switching to init_mm is barely
+		 * slower than a minimal flush, just switch to init_mm.
		 */
+		switch_mm_irqs_off(NULL, &init_mm, NULL);
		return;
	}
 
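Taken together, the hunks above form a small per-CPU state machine: switch_mm_irqs_off() clears cpu_tlbstate.is_lazy when a real mm is loaded, enter_lazy_tlb() either sets it or falls back to switching to init_mm when the static key is off, and flush_tlb_func_common() answers a flush request on a lazy CPU by switching to init_mm rather than flushing. The sketch below is an illustrative userspace model of that decision logic only, not kernel code; the struct, strings, and model_* functions are simplified stand-ins for the per-CPU state this patch touches.

/* Userspace model of the lazy-TLB state machine in this patch (illustrative only). */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct cpu_state {
	const char *loaded_mm;	/* stand-in for cpu_tlbstate.loaded_mm */
	bool is_lazy;		/* stand-in for cpu_tlbstate.is_lazy   */
};

static bool tlb_use_lazy_mode = true;	/* models the static key */

/* Models switch_mm_irqs_off(): leave lazy mode and load a real mm. */
static void model_switch_mm(struct cpu_state *cpu, const char *next_mm)
{
	cpu->is_lazy = false;
	cpu->loaded_mm = next_mm;
	printf("switch_mm      -> loaded_mm=%s, is_lazy=%d\n", next_mm, cpu->is_lazy);
}

/* Models enter_lazy_tlb(): mark the CPU lazy, or fall back to init_mm. */
static void model_enter_lazy_tlb(struct cpu_state *cpu)
{
	if (strcmp(cpu->loaded_mm, "init_mm") == 0)
		return;
	if (tlb_use_lazy_mode)
		cpu->is_lazy = true;
	else
		model_switch_mm(cpu, "init_mm");
	printf("enter_lazy_tlb -> loaded_mm=%s, is_lazy=%d\n", cpu->loaded_mm, cpu->is_lazy);
}

/* Models flush_tlb_func_common(): a lazy CPU switches to init_mm instead of flushing. */
static void model_flush_request(struct cpu_state *cpu)
{
	if (strcmp(cpu->loaded_mm, "init_mm") == 0) {
		printf("flush request  -> ignored, init_mm is loaded\n");
		return;
	}
	if (cpu->is_lazy) {
		printf("flush request  -> lazy: switch to init_mm instead of flushing\n");
		model_switch_mm(cpu, "init_mm");
		return;
	}
	printf("flush request  -> flushing TLB for %s\n", cpu->loaded_mm);
}

int main(void)
{
	struct cpu_state cpu = { .loaded_mm = "user_mm", .is_lazy = false };

	model_enter_lazy_tlb(&cpu);	/* CPU starts running a kernel thread */
	model_flush_request(&cpu);	/* remote flush arrives while lazy    */
	model_flush_request(&cpu);	/* further flushes are now skipped    */
	return 0;
}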
@@ -611,3 +626,57 @@ static int __init create_tlb_single_page_flush_ceiling(void)
	return 0;
 }
 late_initcall(create_tlb_single_page_flush_ceiling);
+
+static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
+				 size_t count, loff_t *ppos)
+{
+	char buf[2];
+
+	buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
+	buf[1] = '\n';
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t tlblazy_write_file(struct file *file,
+		 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	bool val;
+
+	if (kstrtobool_from_user(user_buf, count, &val))
+		return -EINVAL;
+
+	if (val)
+		static_branch_enable(&tlb_use_lazy_mode);
+	else
+		static_branch_disable(&tlb_use_lazy_mode);
+
+	return count;
+}
+
+static const struct file_operations fops_tlblazy = {
+	.read = tlblazy_read_file,
+	.write = tlblazy_write_file,
+	.llseek = default_llseek,
+};
+
+static int __init init_tlb_use_lazy_mode(void)
+{
+	if (boot_cpu_has(X86_FEATURE_PCID)) {
+		/*
+		 * Heuristic: with PCID on, switching to and from
+		 * init_mm is reasonably fast, but remote flush IPIs
+		 * as expensive as ever, so turn off lazy TLB mode.
+		 *
+		 * We can't do this in setup_pcid() because static keys
+		 * haven't been initialized yet, and it would blow up
+		 * badly.
+		 */
+		static_branch_disable(&tlb_use_lazy_mode);
+	}
+
+	debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
+			    arch_debugfs_dir, NULL, &fops_tlblazy);
+	return 0;
+}
+late_initcall(init_tlb_use_lazy_mode);
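The final hunk exposes the static key through debugfs. Assuming debugfs is mounted at /sys/kernel/debug and arch_debugfs_dir resolves to the x86 subdirectory there (adjust the path otherwise), the knob should appear as /sys/kernel/debug/x86/tlb_use_lazy_mode. A minimal userspace sketch for reading and toggling it, run as root since the file mode is S_IRUSR | S_IWUSR, might look like this:

/* Sketch: read and toggle the tlb_use_lazy_mode debugfs knob.
 * The path below is an assumption based on arch_debugfs_dir.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TLB_LAZY_PATH "/sys/kernel/debug/x86/tlb_use_lazy_mode"

static int read_knob(void)
{
	char buf[2] = "";
	int fd = open(TLB_LAZY_PATH, O_RDONLY);

	if (fd < 0 || read(fd, buf, 1) != 1) {
		perror(TLB_LAZY_PATH);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return buf[0] == '1';
}

static int write_knob(int enable)
{
	/* kstrtobool_from_user() accepts "0"/"1" (and y/n style strings). */
	const char *val = enable ? "1" : "0";
	int fd = open(TLB_LAZY_PATH, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(TLB_LAZY_PATH);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	int cur = read_knob();

	if (cur < 0)
		return 1;
	printf("tlb_use_lazy_mode is currently %d\n", cur);

	/* Example: flip the knob, then restore the original value. */
	if (write_knob(!cur) == 0 && write_knob(cur) == 0)
		printf("toggled and restored tlb_use_lazy_mode\n");
	return 0;
}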