author		Thomas Gleixner <tglx@linutronix.de>	2018-11-25 13:33:49 -0500
committer	Thomas Gleixner <tglx@linutronix.de>	2018-11-28 05:57:11 -0500
commit		4c71a2b6fd7e42814aa68a6dec88abf3b42ea573 (patch)
tree		321ae1f298901ae668d8107c04b59100ba5d4a6f
parent		5635d99953f04b550738f6f4c1c532667c3fd872 (diff)
x86/speculation: Prepare for conditional IBPB in switch_mm()
The IBPB speculation barrier is issued from switch_mm() when the kernel
switches to a user space task with a different mm than the user space task
which ran last on the same CPU.

An additional optimization is to avoid IBPB when the incoming task can be
ptraced by the outgoing task. This optimization only works when switching
directly between two user space tasks. When switching from a kernel task to
a user space task the optimization fails because the previous task cannot
be accessed anymore. So in quite a few scenarios the optimization just adds
overhead.

The upcoming conditional IBPB support will issue IBPB only for user space
tasks which have the TIF_SPEC_IB bit set. This requires handling the
following cases:

  1) Switch from a user space task (potential attacker) which has
     TIF_SPEC_IB set to a user space task (potential victim) which has
     TIF_SPEC_IB not set.

  2) Switch from a user space task (potential attacker) which has
     TIF_SPEC_IB not set to a user space task (potential victim) which has
     TIF_SPEC_IB set.

This needs to be optimized for the case where IBPB can be avoided because
only kernel threads ran in between user space tasks which belong to the
same process.

The current check whether two tasks belong to the same context uses the
tasks' context id. While correct, it is simpler to use the mm pointer
because it allows mangling the TIF_SPEC_IB bit into it. The context id
based mechanism requires extra storage, which creates worse code.

When a task is scheduled out, its TIF_SPEC_IB bit is mangled as bit 0 into
the per CPU storage which is used to track the last user space mm which was
running on a CPU. This bit can be used together with the TIF_SPEC_IB bit of
the incoming task to decide whether IBPB needs to be issued or not to cover
the two cases above.

As conditional IBPB is going to be the default, remove the dubious ptrace
check for the IBPB always case and simply issue IBPB always when the
process changes.

Move the storage to a different place in the struct as the original one
created a hole.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Casey Schaufler <casey.schaufler@intel.com>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Jon Masters <jcm@redhat.com>
Cc: Waiman Long <longman9394@gmail.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Dave Stewart <david.c.stewart@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20181125185005.466447057@linutronix.de
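[Editor's note: a minimal user-space sketch of the mangling and decision
logic described above. The names mm_mangle(), needs_ibpb(), the struct mm
stand-in and the bool flag standing in for TIF_SPEC_IB are illustrative
assumptions, not the kernel code; the real implementation is cond_ibpb()
in the arch/x86/mm/tlb.c hunk below.]

#include <stdbool.h>
#include <stdio.h>

#define LAST_USER_MM_IBPB	0x1UL	/* bit 0 of the stored mm word */

struct mm { int dummy; };		/* stand-in for struct mm_struct */

/* Mangle the task's "IBPB wanted" hint into bit 0 of its mm pointer,
 * mirroring what the patch does with TIF_SPEC_IB. */
static unsigned long mm_mangle(struct mm *mm, bool tif_spec_ib)
{
	return (unsigned long)mm | (tif_spec_ib ? LAST_USER_MM_IBPB : 0);
}

/* IBPB is needed when the mangled words differ and at least one side
 * has the IBPB bit set -- exactly the two attacker/victim cases. */
static bool needs_ibpb(unsigned long prev_mm, unsigned long next_mm)
{
	return next_mm != prev_mm &&
	       ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
}

int main(void)
{
	struct mm a, b;	/* two distinct processes */

	/* Case 1: attacker has TIF_SPEC_IB set, victim does not -> 1 */
	printf("%d\n", needs_ibpb(mm_mangle(&a, true), mm_mangle(&b, false)));
	/* Case 2: attacker clear, victim set -> 1 */
	printf("%d\n", needs_ibpb(mm_mangle(&a, false), mm_mangle(&b, true)));
	/* Same process resumed after a kernel thread ran: same word -> 0 */
	printf("%d\n", needs_ibpb(mm_mangle(&a, true), mm_mangle(&a, true)));
	/* Neither task opted in: bit clear on both sides -> 0 */
	printf("%d\n", needs_ibpb(mm_mangle(&a, false), mm_mangle(&b, false)));
	return 0;
}

The kernel variant derives the bit as (next_tif >> TIF_SPEC_IB) &
LAST_USER_MM_IBPB rather than from a bool, and stores the mangled word in
the per CPU cpu_tlbstate.last_user_mm_ibpb slot.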
 arch/x86/include/asm/nospec-branch.h |   2
 arch/x86/include/asm/tlbflush.h      |   8
 arch/x86/kernel/cpu/bugs.c           |  29
 arch/x86/mm/tlb.c                    | 115
 4 files changed, 118 insertions(+), 36 deletions(-)
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index be0b0aa780e2..d4d35baf0430 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -312,6 +312,8 @@ do { \
 } while (0)
 
 DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d760611cfc35..f4204bf377fc 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -169,10 +169,14 @@ struct tlb_state {
 
 #define LOADED_MM_SWITCHING ((struct mm_struct *)1)
 
+	/* Last user mm for optimizing IBPB */
+	union {
+		struct mm_struct	*last_user_mm;
+		unsigned long		last_user_mm_ibpb;
+	};
+
 	u16 loaded_mm_asid;
 	u16 next_asid;
-	/* last user mm's ctx id */
-	u64 last_ctx_id;
 
 	/*
 	 * We can be in one of several states:
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1e13dbfc0919..7c946a9af947 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -56,6 +56,10 @@ u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
 
 /* Control conditional STIBP in switch_to() */
 DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 void __init check_bugs(void)
 {
@@ -331,7 +335,17 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
 	/* Initialize Indirect Branch Prediction Barrier */
 	if (boot_cpu_has(X86_FEATURE_IBPB)) {
 		setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
-		pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
+
+		switch (mode) {
+		case SPECTRE_V2_USER_STRICT:
+			static_branch_enable(&switch_mm_always_ibpb);
+			break;
+		default:
+			break;
+		}
+
+		pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+			mode == SPECTRE_V2_USER_STRICT ? "always-on" : "conditional");
 	}
 
 	/* If enhanced IBRS is enabled no STIBP required */
@@ -955,10 +969,15 @@ static char *stibp_state(void)
 
 static char *ibpb_state(void)
 {
-	if (boot_cpu_has(X86_FEATURE_USE_IBPB))
-		return ", IBPB";
-	else
-		return "";
+	if (boot_cpu_has(X86_FEATURE_IBPB)) {
+		switch (spectre_v2_user) {
+		case SPECTRE_V2_USER_NONE:
+			return ", IBPB: disabled";
+		case SPECTRE_V2_USER_STRICT:
+			return ", IBPB: always-on";
+		}
+	}
+	return "";
 }
 
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index bddd6b3cee1d..03b6b4c2238d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/ptrace.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -31,6 +30,12 @@
  */
 
 /*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlbstate.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB	0x1UL
+
+/*
  * We get here when we do something requiring a TLB invalidation
  * but could not go invalidate all of the contexts.  We do the
  * necessary invalidation by clearing out the 'ctx_id' which
@@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	}
 }
 
-static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+	unsigned long next_tif = task_thread_info(next)->flags;
+	unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+	return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
 {
+	if (!next || !next->mm)
+		return;
+
 	/*
-	 * Check if the current (previous) task has access to the memory
-	 * of the @tsk (next) task. If access is denied, make sure to
-	 * issue a IBPB to stop user->user Spectre-v2 attacks.
-	 *
-	 * Note: __ptrace_may_access() returns 0 or -ERRNO.
+	 * Both the conditional and the always IBPB mode use the mm
+	 * pointer to avoid the IBPB when switching between tasks of the
+	 * same process. Using the mm pointer instead of mm->context.ctx_id
+	 * opens a hypothetical hole vs. mm_struct reuse, which is more or
+	 * less impossible for an attacker to control. Aside from that it
+	 * would only affect the first schedule, so the theoretically
+	 * exposed data is not really interesting.
 	 */
-	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
-		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+	if (static_branch_likely(&switch_mm_cond_ibpb)) {
+		unsigned long prev_mm, next_mm;
+
+		/*
+		 * This is a bit more complex than the always mode because
+		 * it has to handle two cases:
+		 *
+		 * 1) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB not set.
+		 *
+		 * 2) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB not set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB set.
+		 *
+		 * This could be done by unconditionally issuing IBPB when
+		 * a task which has TIF_SPEC_IB set is either scheduled in
+		 * or out. Though that results in two flushes when:
+		 *
+		 * - the same user space task is scheduled out and later
+		 *   scheduled in again and only a kernel thread ran in
+		 *   between.
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in after a kernel thread ran in between.
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in immediately.
+		 *
+		 * Optimize this with reasonably small overhead for the
+		 * above cases. Mangle the TIF_SPEC_IB bit into the mm
+		 * pointer of the incoming task which is stored in
+		 * cpu_tlbstate.last_user_mm_ibpb for comparison.
+		 */
+		next_mm = mm_mangle_tif_spec_ib(next);
+		prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+		/*
+		 * Issue IBPB only if the mm's are different and one or
+		 * both have the IBPB bit set.
+		 */
+		if (next_mm != prev_mm &&
+		    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+			indirect_branch_prediction_barrier();
+
+		this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+	}
+
+	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+		/*
+		 * Only flush when switching to a user space task with a
+		 * different context than the user space task which ran
+		 * last on this CPU.
+		 */
+		if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+			indirect_branch_prediction_barrier();
+			this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+		}
+	}
 }
 
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		new_asid = prev_asid;
 		need_flush = true;
 	} else {
-		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
 		/*
 		 * Avoid user/user BTB poisoning by flushing the branch
 		 * predictor when switching between processes. This stops
 		 * one process from doing Spectre-v2 attacks on another.
-		 *
-		 * As an optimization, flush indirect branches only when
-		 * switching into a processes that can't be ptrace by the
-		 * current one (as in such case, attacker has much more
-		 * convenient way how to tamper with the next process than
-		 * branch buffer poisoning).
 		 */
-		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
-		    ibpb_needed(tsk, last_ctx_id))
-			indirect_branch_prediction_barrier();
+		cond_ibpb(tsk);
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
-	/*
-	 * Record last user mm's context id, so we can avoid
-	 * flushing branch buffer with IBPB if we switch back
-	 * to the same user.
-	 */
-	if (next != &init_mm)
-		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
 	/* Make sure we write CR3 before loaded_mm. */
 	barrier();
 
@@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void)
 	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
-	this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
+	this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
 	this_cpu_write(cpu_tlbstate.next_asid, 1);
 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
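
[Editor's note: as a footnote on the storage change in the tlbflush.h hunk,
here is a stand-alone sketch of why a single union slot serves both modes,
assuming simplified stand-in types (struct tlb_sketch and struct mm are
hypothetical, not kernel types). The always mode views the per CPU word as
a bare pointer, while the conditional mode views the same storage as an
unsigned long with the IBPB bit mangled into bit 0.]

#include <stdio.h>

struct mm { int dummy; };	/* stand-in for struct mm_struct */

struct tlb_sketch {
	/* One storage slot, two views -- mirrors the union added to
	 * struct tlb_state in the tlbflush.h hunk above. */
	union {
		struct mm *last_user_mm;		/* always mode: bare pointer */
		unsigned long last_user_mm_ibpb;	/* cond mode: pointer | bit 0 */
	};
};

int main(void)
{
	struct tlb_sketch s;
	struct mm m;

	s.last_user_mm_ibpb = (unsigned long)&m | 0x1UL;	/* conditional mode write */
	printf("mangled word:  %#lx\n", s.last_user_mm_ibpb);

	s.last_user_mm = &m;					/* always mode write */
	printf("plain pointer: %p\n", (void *)s.last_user_mm);
	return 0;
}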