author     Andy Lutomirski <luto@kernel.org>               2015-04-26 19:47:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-04-26 20:57:38 -0400
commit     61f01dd941ba9e06d2bf05994450ecc3d61b6b8b
tree       3a4a4ef2d5e4d44bb4cb1708f9fc4749e9c9824a
parent     1190944f4b12203330ac5ed8784f6c181bf26f2d
x86_64, asm: Work around AMD SYSRET SS descriptor attribute issue
AMD CPUs don't reinitialize the SS descriptor on SYSRET, so SYSRET with
SS == 0 results in an invalid usermode state in which SS is apparently
equal to __USER_DS but causes #SS if used.
Work around the issue by setting SS to __KERNEL_DS in __switch_to, thus
ensuring that SYSRET never happens with SS set to NULL.
This was exposed by a recent vDSO cleanup.
Fixes: e7d6eefaaa44 ("x86/vdso32/syscall.S: Do not load __USER32_DS to %ss")
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
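
In code terms, the whole change reduces to the hunk added to __switch_to() in
arch/x86/kernel/process_64.c below; here is a condensed sketch of that hunk
(kernel context assumed, see the full diff for the complete comment):

```c
/* Condensed sketch of the __switch_to() hunk below (kernel context). */
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
	unsigned short ss_sel;

	savesegment(ss, ss_sel);		/* SS reads are cheap */
	if (ss_sel != __KERNEL_DS)		/* covers NULL and any other stale value */
		loadsegment(ss, __KERNEL_DS);
}
```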
 arch/x86/ia32/ia32entry.S         |  7 +++++++
 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/kernel/cpu/amd.c         |  3 +++
 arch/x86/kernel/entry_64.S        |  9 +++++++++
 arch/x86/kernel/process_64.c      | 28 ++++++++++++++++++++++++++++
 5 files changed, 48 insertions(+), 0 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a821b1cd4fa7..72bf2680f819 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -427,6 +427,13 @@ sysretl_from_sys_call:
 	 * cs and ss are loaded from MSRs.
 	 * (Note: 32bit->32bit SYSRET is different: since r11
 	 * does not exist, it merely sets eflags.IF=1).
+	 *
+	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+	 * descriptor is not reinitialized. This means that we must
+	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
+	 * exit the kernel, and re-enter using an interrupt vector. (All
+	 * interrupt entries on x86_64 set SS to NULL.) We prevent that
+	 * from happening by reloading SS in __switch_to.
 	 */
 	USERGS_SYSRET32
 
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7ee9b94d9921..3d6606fb97d0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -265,6 +265,7 @@
 #define X86_BUG_11AP		X86_BUG(5) /* Bad local APIC aka 11AP */
 #define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS	X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fd470ebf924e..e4cf63301ff4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
 		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
 			set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+	/* AMD CPUs don't reset SS attributes on SYSRET */
+	set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
 }
 
 #ifdef CONFIG_X86_32
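
Note that init_amd() sets the bug flag unconditionally, i.e. purely off the CPU
vendor rather than any family/model check. A minimal userspace sketch of the
analogous vendor test (illustrative only, not part of the patch, and it says
nothing about whether a given kernel carries the workaround):

```c
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char vendor[13];

	/* CPUID leaf 0: the vendor string is spelled across EBX, EDX, ECX. */
	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
		return 1;

	memcpy(vendor, &ebx, 4);	/* "Auth" */
	memcpy(vendor + 4, &edx, 4);	/* "enti" */
	memcpy(vendor + 8, &ecx, 4);	/* "cAMD" */
	vendor[12] = '\0';

	printf("CPU vendor: %s\n", vendor);
	if (!strcmp(vendor, "AuthenticAMD"))
		printf("init_amd() would set X86_BUG_SYSRET_SS_ATTRS on this CPU.\n");
	return 0;
}
```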
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c7b238494b31..02c2eff7478d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,6 +295,15 @@ system_call_fastpath:
 	 * rflags from r11 (but RF and VM bits are forced to 0),
 	 * cs and ss are loaded from MSRs.
 	 * Restoration of rflags re-enables interrupts.
+	 *
+	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+	 * descriptor is not reinitialized. This means that we should
+	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
+	 * exit the kernel, and re-enter using an interrupt vector. (All
+	 * interrupt entries on x86_64 set SS to NULL.) We prevent that
+	 * from happening by reloading SS in __switch_to. (Actually
+	 * detecting the failure in 64-bit userspace is tricky but can be
+	 * done.)
 	 */
 	USERGS_SYSRET64
 
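
On the parenthetical about detection: the reason it is tricky from 64-bit
userspace is that only the CPU's hidden descriptor attributes go stale; the
visible SS selector still reads back as __USER_DS, and 64-bit mode ignores most
of the stale attributes. A small sketch of what a 64-bit process can observe
directly (illustrative only; 0x2b being __USER_DS reflects the stock x86_64
Linux GDT layout):

```c
#include <stdio.h>

int main(void)
{
	unsigned short ss_sel;

	/* Read the visible SS selector; the cached attributes are not readable. */
	__asm__ volatile("mov %%ss, %0" : "=r"(ss_sel));
	printf("SS selector: %#x\n", (unsigned int)ss_sel);	/* typically 0x2b */
	return 0;
}
```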
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4baaa972f52a..ddfdbf74f174 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
 		__switch_to_xtra(prev_p, next_p, tss);
 
+	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+		/*
+		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+		 * does not update the cached descriptor. As a result, if we
+		 * do SYSRET while SS is NULL, we'll end up in user mode with
+		 * SS apparently equal to __USER_DS but actually unusable.
+		 *
+		 * The straightforward workaround would be to fix it up just
+		 * before SYSRET, but that would slow down the system call
+		 * fast paths. Instead, we ensure that SS is never NULL in
+		 * system call context. We do this by replacing NULL SS
+		 * selectors at every context switch. SYSCALL sets up a valid
+		 * SS, so the only way to get NULL is to re-enter the kernel
+		 * from CPL 3 through an interrupt. Since that can't happen
+		 * in the same task as a running syscall, we are guaranteed to
+		 * context switch between every interrupt vector entry and a
+		 * subsequent SYSRET.
+		 *
+		 * We read SS first because SS reads are much faster than
+		 * writes. Out of caution, we force SS to __KERNEL_DS even if
+		 * it previously had a different non-NULL value.
+		 */
+		unsigned short ss_sel;
+		savesegment(ss, ss_sel);
+		if (ss_sel != __KERNEL_DS)
+			loadsegment(ss, __KERNEL_DS);
+	}
+
 	return prev_p;
 }
 
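
As a practical follow-up, the new bug bit becomes visible to userspace the same
way other X86_BUG flags do: it should appear in the "bugs" line of
/proc/cpuinfo on affected machines. The flag name sysret_ss_attrs below is an
assumption derived from the X86_BUG_SYSRET_SS_ATTRS define; this sketch just
checks for it:

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/cpuinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* The "bugs" line lists X86_BUG_* flags; name assumed to be sysret_ss_attrs. */
		if (!strncmp(line, "bugs", 4) && strstr(line, "sysret_ss_attrs")) {
			puts("kernel reports sysret_ss_attrs for this CPU");
			break;
		}
	}
	fclose(f);
	return 0;
}
```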