author	Andy Lutomirski <luto@amacapital.net>	2014-11-11 15:49:41 -0500
committer	Andy Lutomirski <luto@amacapital.net>	2015-01-02 13:22:45 -0500
commit	48e08d0fb265b007ebbb29a72297ff7e40938969 (patch)
tree	424a8207cc53c2b0dfbd9fb12bee15952ce822ae
parent	734d16801349fbe951d2f780191d32c5b8a892d1 (diff)
x86, entry: Switch stacks on a paranoid entry from userspace
This causes all non-NMI, non-double-fault kernel entries from
userspace to run on the normal kernel stack.  Double-fault is
exempt to minimize confusion if we double-fault directly from
userspace due to a bad kernel stack.

This is, surprisingly, simpler and shorter than the current code.  It
removes the IMO rather frightening paranoid_userspace path, and it
makes sync_regs much simpler.

There is no risk of stack overflow due to this change -- the kernel
stack that we switch to is empty.

This will also enable us to create non-atomic sections within
machine checks from userspace, which will simplify memory failure
handling.  It will also allow the upcoming fsgsbase code to be
simplified, because it doesn't need to worry about usergs when
scheduling in paranoid_exit, as that code no longer exists.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Acked-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
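In rough C pseudocode, the flow this commit establishes for a paranoid=1
vector looks like the sketch below.  This is an illustrative rendering of
the idtentry changes in the diff, not code from the tree;
paranoid_entry_stub and do_sym are invented stand-in names.

	/*
	 * Sketch only: the real logic is the idtentry macro in
	 * entry_64.S below.  paranoid_entry_stub and do_sym are
	 * invented names for the macro-generated stub and handler.
	 */
	void paranoid_entry_stub(struct pt_regs *eregs)
	{
		if (user_mode(eregs)) {
			/* From userspace: copy the IST frame to the empty
			 * per-thread kernel stack and run as a normal entry. */
			struct pt_regs *regs = sync_regs(eregs);
			do_sym(regs);		/* returns via error_exit */
		} else {
			/* From the kernel: stay on the IST stack and take
			 * the paranoid save/exit path as before. */
			do_sym(eregs);		/* returns via paranoid_exit */
		}
	}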
-rw-r--r--	Documentation/x86/entry_64.txt	18
-rw-r--r--	Documentation/x86/x86_64/kernel-stacks	8
-rw-r--r--	arch/x86/kernel/entry_64.S	86
-rw-r--r--	arch/x86/kernel/traps.c	23
4 files changed, 67 insertions, 68 deletions
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt
index 4a1c5c2dc5a9..9132b86176a3 100644
--- a/Documentation/x86/entry_64.txt
+++ b/Documentation/x86/entry_64.txt
@@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value
 	xorl %ebx,%ebx
 1:	ret
 
-and the whole paranoid non-paranoid macro complexity is about whether
-to suffer that RDMSR cost.
-
 If we are at an interrupt or user-trap/gate-alike boundary then we can
 use the faster check: the stack will be a reliable indicator of
 whether SWAPGS was already done: if we see that we are a secondary
@@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the
 stack but before we executed SWAPGS, then the only safe way to check
 for GS is the slower method: the RDMSR.
 
-So we try only to mark those entry methods 'paranoid' that absolutely
-need the more expensive check for the GS base - and we generate all
-'normal' entry points with the regular (faster) entry macros.
+Therefore, super-atomic entries (except NMI, which is handled separately)
+must use idtentry with paranoid=1 to handle gsbase correctly.  This
+triggers three main behavior changes:
+
+ - Interrupt entry will use the slower gsbase check.
+ - Interrupt entry from user mode will switch off the IST stack.
+ - Interrupt exit to kernel mode will not attempt to reschedule.
+
+We try to only use IST entries and the paranoid entry code for vectors
+that absolutely need the more expensive check for the GS base - and we
+generate all 'normal' entry points with the regular (faster) paranoid=0
+variant.
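
The fast and slow checks contrasted above look roughly like this in C
(a hedged sketch; the real checks are a few instructions of assembly,
and the gs_needs_swap_* names are invented):

	#include <linux/ptrace.h>	/* struct pt_regs, user_mode() */
	#include <asm/msr.h>		/* rdmsrl(), MSR_GS_BASE */

	/* Fast check: at an interrupt/trap boundary, the CS saved on
	 * the stack reliably says whether we came from user mode,
	 * i.e. whether SWAPGS has not happened yet. */
	static bool gs_needs_swap_fast(struct pt_regs *regs)
	{
		return user_mode(regs);		/* tests CS & 3 */
	}

	/* Slow (paranoid) check: read MSR_GS_BASE and test its sign;
	 * a kernel gsbase is a negative (high-half) address. */
	static bool gs_needs_swap_slow(void)
	{
		unsigned long gsbase;

		rdmsrl(MSR_GS_BASE, gsbase);
		return (long)gsbase >= 0;
	}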
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks
index a01eec5d1d0b..e3c8a49d1a2f 100644
--- a/Documentation/x86/x86_64/kernel-stacks
+++ b/Documentation/x86/x86_64/kernel-stacks
@@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an
 interrupt-gate descriptor.  When an interrupt occurs and the hardware
 loads such a descriptor, the hardware automatically sets the new stack
 pointer based on the IST value, then invokes the interrupt handler.  If
-software wants to allow nested IST interrupts then the handler must
-adjust the IST values on entry to and exit from the interrupt handler.
-(This is occasionally done, e.g. for debug exceptions.)
+the interrupt came from user mode, then the interrupt handler prologue
+will switch back to the per-thread stack.  If software wants to allow
+nested IST interrupts then the handler must adjust the IST values on
+entry to and exit from the interrupt handler.  (This is occasionally
+done, e.g. for debug exceptions.)
 
 Events with different IST codes (i.e. with different stacks) can be
 nested.  For example, a debug interrupt can safely be interrupted by an
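
"Adjusting the IST values" amounts to re-pointing the TSS IST slot so
that a nested instance of the same vector lands on fresh stack.  A
purely hypothetical sketch (the ist[] field is real x86_hw_tss layout,
but the helper names and IST_NEST_OFFSET are invented):

	/* Hypothetical sketch of IST nesting support. */
	void ist_nest_enter(struct x86_hw_tss *tss, int ist_slot)
	{
		tss->ist[ist_slot] -= IST_NEST_OFFSET;	/* next entry lands lower */
	}

	void ist_nest_exit(struct x86_hw_tss *tss, int ist_slot)
	{
		tss->ist[ist_slot] += IST_NEST_OFFSET;
	}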
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9ebaf63ba182..931f32f4578b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1048,6 +1048,11 @@ ENTRY(\sym)
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
 
 	.if \paranoid
+	.if \paranoid == 1
+	CFI_REMEMBER_STATE
+	testl $3, CS(%rsp)		/* If coming from userspace, switch */
+	jnz 1f				/* stacks. */
+	.endif
 	call save_paranoid
 	.else
 	call error_entry
@@ -1088,6 +1093,36 @@ ENTRY(\sym)
 	jmp error_exit			/* %ebx: no swapgs flag */
 	.endif
 
+	.if \paranoid == 1
+	CFI_RESTORE_STATE
+	/*
+	 * Paranoid entry from userspace.  Switch stacks and treat it
+	 * as a normal entry.  This means that paranoid handlers
+	 * run in real process context if user_mode(regs).
+	 */
+1:
+	call error_entry
+
+	DEFAULT_FRAME 0
+
+	movq %rsp,%rdi			/* pt_regs pointer */
+	call sync_regs
+	movq %rax,%rsp			/* switch stack */
+
+	movq %rsp,%rdi			/* pt_regs pointer */
+
+	.if \has_error_code
+	movq ORIG_RAX(%rsp),%rsi	/* get error code */
+	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	.else
+	xorl %esi,%esi			/* no error code */
+	.endif
+
+	call \do_sym
+
+	jmp error_exit			/* %ebx: no swapgs flag */
+	.endif
+
 	CFI_ENDPROC
 END(\sym)
 .endm
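
In C terms, the error-code handling in the new from-user path above does
roughly this (illustrative sketch; orig_ax is the pt_regs field behind
ORIG_RAX, and has_error_code/do_sym mirror the macro parameters):

	struct pt_regs *regs = sync_regs(eregs);	/* copy frame, switch stack */

	if (has_error_code) {
		long error_code = regs->orig_ax;	/* hardware error code */
		regs->orig_ax = -1;			/* no syscall to restart */
		do_sym(regs, error_code);
	} else {
		do_sym(regs, 0);			/* no error code */
	}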
@@ -1108,7 +1143,7 @@ idtentry overflow do_overflow has_error_code=0
 idtentry bounds do_bounds has_error_code=0
 idtentry invalid_op do_invalid_op has_error_code=0
 idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
 idtentry invalid_TSS do_invalid_TSS has_error_code=1
 idtentry segment_not_present do_segment_not_present has_error_code=1
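
The paranoid=2 above is how double_fault opts out of the new user-mode
stack switch: the macro only tests ".if \paranoid == 1" for the switch,
so any other non-zero value keeps the always-paranoid behavior.  An
illustrative summary (invented enum, not real code):

	enum idtentry_paranoid {
		PARANOID_NONE	= 0,	/* normal entry: error_entry/error_exit */
		PARANOID_SWITCH	= 1,	/* paranoid, but switch stacks from user */
		PARANOID_ALWAYS	= 2,	/* never switch: double_fault only */
	};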
@@ -1289,16 +1324,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
 #endif
 
 	/*
-	 * "Paranoid" exit path from exception stack.
-	 * Paranoid because this is used by NMIs and cannot take
-	 * any kernel state for granted.
-	 * We don't do kernel preemption checks here, because only
-	 * NMI should be common and it does not enable IRQs and
-	 * cannot get reschedule ticks.
+	 * "Paranoid" exit path from exception stack.  This is invoked
+	 * only on return from non-NMI IST interrupts that came
+	 * from kernel space.
 	 *
-	 * "trace" is 0 for the NMI handler only, because irq-tracing
-	 * is fundamentally NMI-unsafe. (we cannot change the soft and
-	 * hard flags at once, atomically)
+	 * We may be returning to very strange contexts (e.g. very early
+	 * in syscall entry), so checking for preemption here would
+	 * be complicated.  Fortunately, there's no good reason
+	 * to try to handle preemption here.
 	 */
 
 	/* ebx: no swapgs flag */
@@ -1308,43 +1341,14 @@ ENTRY(paranoid_exit)
 	TRACE_IRQS_OFF_DEBUG
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz paranoid_restore
-	testl $3,CS(%rsp)
-	jnz paranoid_userspace
-paranoid_swapgs:
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
 	RESTORE_ALL 8
-	jmp irq_return
+	INTERRUPT_RETURN
 paranoid_restore:
 	TRACE_IRQS_IRETQ_DEBUG 0
 	RESTORE_ALL 8
-	jmp irq_return
+	INTERRUPT_RETURN
-paranoid_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz paranoid_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz paranoid_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl %esi,%esi			/* arg2: oldset */
-	movq %rsp,%rdi			/* arg1: &pt_regs */
-	call do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp paranoid_userspace
-paranoid_schedule:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	SCHEDULE_USER
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	jmp paranoid_userspace
 	CFI_ENDPROC
 END(paranoid_exit)
 
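For reference, the deleted paranoid_userspace loop implemented roughly
the following C logic (an after-the-fact sketch of the removed assembly;
it never existed in C form):

	for (;;) {
		unsigned int flags = current_thread_info()->flags & _TIF_WORK_MASK;

		if (!flags)
			break;			/* done: swapgs and iret */

		regs = sync_regs(regs);		/* switch stack for scheduling */
		if (flags & _TIF_NEED_RESCHED) {
			local_irq_enable();
			schedule();		/* SCHEDULE_USER */
			local_irq_disable();
		} else {
			local_irq_enable();
			do_notify_resume(regs, NULL, flags);	/* oldset = NULL */
			local_irq_disable();
		}
	}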
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 88900e288021..28f3e5ffc55d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -466,27 +466,14 @@ NOKPROBE_SYMBOL(do_int3);
 
 #ifdef CONFIG_X86_64
 /*
- * Help handler running on IST stack to switch back to user stack
- * for scheduling or signal handling. The actual stack switch is done in
- * entry.S
+ * Help handler running on IST stack to switch off the IST stack if the
+ * interrupted code was in user mode. The actual stack switch is done in
+ * entry_64.S
  */
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
-	struct pt_regs *regs = eregs;
-	/* Did already sync */
-	if (eregs == (struct pt_regs *)eregs->sp)
-		;
-	/* Exception from user space */
-	else if (user_mode(eregs))
-		regs = task_pt_regs(current);
-	/*
-	 * Exception from kernel and interrupts are enabled. Move to
-	 * kernel process stack.
-	 */
-	else if (eregs->flags & X86_EFLAGS_IF)
-		regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
-	if (eregs != regs)
-		*regs = *eregs;
+	struct pt_regs *regs = task_pt_regs(current);
+	*regs = *eregs;
 	return regs;
 }
 NOKPROBE_SYMBOL(sync_regs);
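
The caller contract for the simplified sync_regs() is visible in the
entry_64.S hunk above: the assembly passes the IST-frame pt_regs in
%rdi and installs the returned pointer as the new stack.  In C terms
(illustrative only; ist_regs is an invented name):

	/* The frame is copied to the per-thread stack; the assembly
	 * caller then does "movq %rax,%rsp" to continue there. */
	struct pt_regs *regs = sync_regs(ist_regs);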