author     Ingo Molnar <mingo@kernel.org>   2015-02-04 03:01:12 -0500
committer  Ingo Molnar <mingo@kernel.org>   2015-02-04 03:01:12 -0500
commit     0967160ad615985c7c35443156ea9aecc60c37b8 (patch)
tree       658f728aff1be23540180091b718452a6848a6b0
parent     2fde4f94e0a9531251e706fa57131b51b0df042e (diff)
parent     b57c0b5175ddbe9b477801f9994a5b330702c1ba (diff)
Merge branch 'x86/asm' into perf/x86, to avoid conflicts with upcoming patches
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  Documentation/x86/entry_64.txt          |  18
-rw-r--r--  Documentation/x86/x86_64/kernel-stacks  |   8
-rw-r--r--  arch/x86/ia32/ia32entry.S               |   4
-rw-r--r--  arch/x86/include/asm/calling.h          |   1
-rw-r--r--  arch/x86/include/asm/mce.h              |   1
-rw-r--r--  arch/x86/include/asm/thread_info.h      |  15
-rw-r--r--  arch/x86/include/asm/traps.h            |   6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c        | 114
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c         |   6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c    |   5
-rw-r--r--  arch/x86/kernel/entry_64.S              | 317
-rw-r--r--  arch/x86/kernel/irq_32.c                |  13
-rw-r--r--  arch/x86/kernel/signal.c                |   6
-rw-r--r--  arch/x86/kernel/traps.c                 | 119
-rw-r--r--  arch/x86/vdso/Makefile                  |   2
-rw-r--r--  kernel/rcu/tree.c                       |  66
16 files changed, 374 insertions(+), 327 deletions(-)
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt
index 4a1c5c2dc5a9..9132b86176a3 100644
--- a/Documentation/x86/entry_64.txt
+++ b/Documentation/x86/entry_64.txt
@@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value
78 xorl %ebx,%ebx 78 xorl %ebx,%ebx
791: ret 791: ret
80 80
81and the whole paranoid non-paranoid macro complexity is about whether
82to suffer that RDMSR cost.
83
84If we are at an interrupt or user-trap/gate-alike boundary then we can 81If we are at an interrupt or user-trap/gate-alike boundary then we can
85use the faster check: the stack will be a reliable indicator of 82use the faster check: the stack will be a reliable indicator of
86whether SWAPGS was already done: if we see that we are a secondary 83whether SWAPGS was already done: if we see that we are a secondary
@@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the
93stack but before we executed SWAPGS, then the only safe way to check 90stack but before we executed SWAPGS, then the only safe way to check
94for GS is the slower method: the RDMSR. 91for GS is the slower method: the RDMSR.
95 92
96So we try only to mark those entry methods 'paranoid' that absolutely 93Therefore, super-atomic entries (except NMI, which is handled separately)
97need the more expensive check for the GS base - and we generate all 94must use idtentry with paranoid=1 to handle gsbase correctly. This
98'normal' entry points with the regular (faster) entry macros. 95triggers three main behavior changes:
96
97 - Interrupt entry will use the slower gsbase check.
98 - Interrupt entry from user mode will switch off the IST stack.
99 - Interrupt exit to kernel mode will not attempt to reschedule.
100
101We try to only use IST entries and the paranoid entry code for vectors
102that absolutely need the more expensive check for the GS base - and we
103generate all 'normal' entry points with the regular (faster) paranoid=0
104variant.
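
The fast check described above amounts to inspecting the CPL bits of the CS value that the CPU pushed on the stack, which is what the testl $3, CS(%rsp) style tests later in this diff do. A minimal user-space sketch of that test; the helper name is invented and the selector values are typical x86_64 values, not taken from this commit:

#include <stdio.h>

/* Hypothetical helper, not kernel code: the "fast" check only looks at the
 * low two bits (the CPL) of the saved CS.  CPL 3 means the event interrupted
 * user mode, so SWAPGS is still needed; CPL 0 means the kernel GS base is
 * already loaded. */
static int needs_swapgs_fast_check(unsigned long saved_cs)
{
	return (saved_cs & 3) != 0;
}

int main(void)
{
	printf("CS 0x33 (typical __USER_CS)   -> swapgs needed: %d\n",
	       needs_swapgs_fast_check(0x33));
	printf("CS 0x10 (typical __KERNEL_CS) -> swapgs needed: %d\n",
	       needs_swapgs_fast_check(0x10));
	return 0;
}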
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks
index a01eec5d1d0b..e3c8a49d1a2f 100644
--- a/Documentation/x86/x86_64/kernel-stacks
+++ b/Documentation/x86/x86_64/kernel-stacks
@@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an
40interrupt-gate descriptor. When an interrupt occurs and the hardware 40interrupt-gate descriptor. When an interrupt occurs and the hardware
41loads such a descriptor, the hardware automatically sets the new stack 41loads such a descriptor, the hardware automatically sets the new stack
42pointer based on the IST value, then invokes the interrupt handler. If 42pointer based on the IST value, then invokes the interrupt handler. If
43software wants to allow nested IST interrupts then the handler must 43the interrupt came from user mode, then the interrupt handler prologue
44adjust the IST values on entry to and exit from the interrupt handler. 44will switch back to the per-thread stack. If software wants to allow
45(This is occasionally done, e.g. for debug exceptions.) 45nested IST interrupts then the handler must adjust the IST values on
46entry to and exit from the interrupt handler. (This is occasionally
47done, e.g. for debug exceptions.)
46 48
47Events with different IST codes (i.e. with different stacks) can be 49Events with different IST codes (i.e. with different stacks) can be
48nested. For example, a debug interrupt can safely be interrupted by an 50nested. For example, a debug interrupt can safely be interrupted by an
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 82e8a1d44658..156ebcab4ada 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -179,8 +179,8 @@ sysenter_dispatch:
179sysexit_from_sys_call: 179sysexit_from_sys_call:
180 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 180 andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
181 /* clear IF, that popfq doesn't enable interrupts early */ 181 /* clear IF, that popfq doesn't enable interrupts early */
182 andl $~0x200,EFLAGS-R11(%rsp) 182 andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
183 movl RIP-R11(%rsp),%edx /* User %eip */ 183 movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
184 CFI_REGISTER rip,rdx 184 CFI_REGISTER rip,rdx
185 RESTORE_ARGS 0,24,0,0,0,0 185 RESTORE_ARGS 0,24,0,0,0,0
186 xorq %r8,%r8 186 xorq %r8,%r8
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 76659b67fd11..1f1297b46f83 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
83#define SS 160 83#define SS 160
84 84
85#define ARGOFFSET R11 85#define ARGOFFSET R11
86#define SWFRAME ORIG_RAX
87 86
88 .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 87 .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
89 subq $9*8+\addskip, %rsp 88 subq $9*8+\addskip, %rsp
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 51b26e895933..9b3de99dc004 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -190,7 +190,6 @@ enum mcp_flags {
190void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); 190void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
191 191
192int mce_notify_irq(void); 192int mce_notify_irq(void);
193void mce_notify_process(void);
194 193
195DECLARE_PER_CPU(struct mce, injectm); 194DECLARE_PER_CPU(struct mce, injectm);
196 195
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 547e344a6dc6..e82e95abc92b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -75,7 +75,6 @@ struct thread_info {
75#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ 75#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
76#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 76#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
77#define TIF_SECCOMP 8 /* secure computing */ 77#define TIF_SECCOMP 8 /* secure computing */
78#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
79#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ 78#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
80#define TIF_UPROBE 12 /* breakpointed or singlestepping */ 79#define TIF_UPROBE 12 /* breakpointed or singlestepping */
81#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 80#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@@ -100,7 +99,6 @@ struct thread_info {
100#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) 99#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
101#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 100#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
102#define _TIF_SECCOMP (1 << TIF_SECCOMP) 101#define _TIF_SECCOMP (1 << TIF_SECCOMP)
103#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
104#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) 102#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
105#define _TIF_UPROBE (1 << TIF_UPROBE) 103#define _TIF_UPROBE (1 << TIF_UPROBE)
106#define _TIF_NOTSC (1 << TIF_NOTSC) 104#define _TIF_NOTSC (1 << TIF_NOTSC)
@@ -140,7 +138,7 @@ struct thread_info {
140 138
141/* Only used for 64 bit */ 139/* Only used for 64 bit */
142#define _TIF_DO_NOTIFY_MASK \ 140#define _TIF_DO_NOTIFY_MASK \
143 (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ 141 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
144 _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) 142 _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
145 143
146/* flags to check in __switch_to() */ 144/* flags to check in __switch_to() */
@@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void)
170 return ti; 168 return ti;
171} 169}
172 170
171static inline unsigned long current_stack_pointer(void)
172{
173 unsigned long sp;
174#ifdef CONFIG_X86_64
175 asm("mov %%rsp,%0" : "=g" (sp));
176#else
177 asm("mov %%esp,%0" : "=g" (sp));
178#endif
179 return sp;
180}
181
173#else /* !__ASSEMBLY__ */ 182#else /* !__ASSEMBLY__ */
174 183
175/* how to get the thread information struct from ASM */ 184/* how to get the thread information struct from ASM */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 707adc6549d8..4e49d7dff78e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_TRAPS_H 1#ifndef _ASM_X86_TRAPS_H
2#define _ASM_X86_TRAPS_H 2#define _ASM_X86_TRAPS_H
3 3
4#include <linux/context_tracking_state.h>
4#include <linux/kprobes.h> 5#include <linux/kprobes.h>
5 6
6#include <asm/debugreg.h> 7#include <asm/debugreg.h>
@@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void);
110asmlinkage void mce_threshold_interrupt(void); 111asmlinkage void mce_threshold_interrupt(void);
111#endif 112#endif
112 113
114extern enum ctx_state ist_enter(struct pt_regs *regs);
115extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
116extern void ist_begin_non_atomic(struct pt_regs *regs);
117extern void ist_end_non_atomic(void);
118
113/* Interrupts/Exceptions */ 119/* Interrupts/Exceptions */
114enum { 120enum {
115 X86_TRAP_DE = 0, /* 0, Divide-by-zero */ 121 X86_TRAP_DE = 0, /* 0, Divide-by-zero */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..d23179900755 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
43#include <linux/export.h> 43#include <linux/export.h>
44 44
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/traps.h>
46#include <asm/mce.h> 47#include <asm/mce.h>
47#include <asm/msr.h> 48#include <asm/msr.h>
48 49
@@ -1003,51 +1004,6 @@ static void mce_clear_state(unsigned long *toclear)
1003} 1004}
1004 1005
1005/* 1006/*
1006 * Need to save faulting physical address associated with a process
1007 * in the machine check handler some place where we can grab it back
1008 * later in mce_notify_process()
1009 */
1010#define MCE_INFO_MAX 16
1011
1012struct mce_info {
1013 atomic_t inuse;
1014 struct task_struct *t;
1015 __u64 paddr;
1016 int restartable;
1017} mce_info[MCE_INFO_MAX];
1018
1019static void mce_save_info(__u64 addr, int c)
1020{
1021 struct mce_info *mi;
1022
1023 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
1024 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
1025 mi->t = current;
1026 mi->paddr = addr;
1027 mi->restartable = c;
1028 return;
1029 }
1030 }
1031
1032 mce_panic("Too many concurrent recoverable errors", NULL, NULL);
1033}
1034
1035static struct mce_info *mce_find_info(void)
1036{
1037 struct mce_info *mi;
1038
1039 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
1040 if (atomic_read(&mi->inuse) && mi->t == current)
1041 return mi;
1042 return NULL;
1043}
1044
1045static void mce_clear_info(struct mce_info *mi)
1046{
1047 atomic_set(&mi->inuse, 0);
1048}
1049
1050/*
1051 * The actual machine check handler. This only handles real 1007 * The actual machine check handler. This only handles real
1052 * exceptions when something got corrupted coming in through int 18. 1008 * exceptions when something got corrupted coming in through int 18.
1053 * 1009 *
@@ -1063,6 +1019,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1063{ 1019{
1064 struct mca_config *cfg = &mca_cfg; 1020 struct mca_config *cfg = &mca_cfg;
1065 struct mce m, *final; 1021 struct mce m, *final;
1022 enum ctx_state prev_state;
1066 int i; 1023 int i;
1067 int worst = 0; 1024 int worst = 0;
1068 int severity; 1025 int severity;
@@ -1084,6 +1041,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1084 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 1041 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1085 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); 1042 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1086 char *msg = "Unknown"; 1043 char *msg = "Unknown";
1044 u64 recover_paddr = ~0ull;
1045 int flags = MF_ACTION_REQUIRED;
1046
1047 prev_state = ist_enter(regs);
1087 1048
1088 this_cpu_inc(mce_exception_count); 1049 this_cpu_inc(mce_exception_count);
1089 1050
@@ -1203,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1203 if (no_way_out) 1164 if (no_way_out)
1204 mce_panic("Fatal machine check on current CPU", &m, msg); 1165 mce_panic("Fatal machine check on current CPU", &m, msg);
1205 if (worst == MCE_AR_SEVERITY) { 1166 if (worst == MCE_AR_SEVERITY) {
1206 /* schedule action before return to userland */ 1167 recover_paddr = m.addr;
1207 mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); 1168 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1208 set_thread_flag(TIF_MCE_NOTIFY); 1169 flags |= MF_MUST_KILL;
1209 } else if (kill_it) { 1170 } else if (kill_it) {
1210 force_sig(SIGBUS, current); 1171 force_sig(SIGBUS, current);
1211 } 1172 }
@@ -1216,6 +1177,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1216 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 1177 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1217out: 1178out:
1218 sync_core(); 1179 sync_core();
1180
1181 if (recover_paddr == ~0ull)
1182 goto done;
1183
1184 pr_err("Uncorrected hardware memory error in user-access at %llx",
1185 recover_paddr);
1186 /*
1187 * We must call memory_failure() here even if the current process is
1188 * doomed. We still need to mark the page as poisoned and alert any
1189 * other users of the page.
1190 */
1191 ist_begin_non_atomic(regs);
1192 local_irq_enable();
1193 if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1194 pr_err("Memory error not recovered");
1195 force_sig(SIGBUS, current);
1196 }
1197 local_irq_disable();
1198 ist_end_non_atomic();
1199done:
1200 ist_exit(regs, prev_state);
1219} 1201}
1220EXPORT_SYMBOL_GPL(do_machine_check); 1202EXPORT_SYMBOL_GPL(do_machine_check);
1221 1203
@@ -1233,42 +1215,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
1233#endif 1215#endif
1234 1216
1235/* 1217/*
1236 * Called in process context that interrupted by MCE and marked with
1237 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
1238 * This code is allowed to sleep.
1239 * Attempt possible recovery such as calling the high level VM handler to
1240 * process any corrupted pages, and kill/signal current process if required.
1241 * Action required errors are handled here.
1242 */
1243void mce_notify_process(void)
1244{
1245 unsigned long pfn;
1246 struct mce_info *mi = mce_find_info();
1247 int flags = MF_ACTION_REQUIRED;
1248
1249 if (!mi)
1250 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1251 pfn = mi->paddr >> PAGE_SHIFT;
1252
1253 clear_thread_flag(TIF_MCE_NOTIFY);
1254
1255 pr_err("Uncorrected hardware memory error in user-access at %llx",
1256 mi->paddr);
1257 /*
1258 * We must call memory_failure() here even if the current process is
1259 * doomed. We still need to mark the page as poisoned and alert any
1260 * other users of the page.
1261 */
1262 if (!mi->restartable)
1263 flags |= MF_MUST_KILL;
1264 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1265 pr_err("Memory error not recovered");
1266 force_sig(SIGBUS, current);
1267 }
1268 mce_clear_info(mi);
1269}
1270
1271/*
1272 * Action optional processing happens here (picking up 1218 * Action optional processing happens here (picking up
1273 * from the list of faulting pages that do_machine_check() 1219 * from the list of faulting pages that do_machine_check()
1274 * placed into the "ring"). 1220 * placed into the "ring").
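
The reworked do_machine_check() above defers the sleepable memory_failure() call out of the atomic IST section: the faulting address is only recorded while atomic, and recovery runs between ist_begin_non_atomic() and ist_end_non_atomic(). A stand-alone sketch of that control flow, with stub functions and a hard-coded 4 KiB page shift standing in for the kernel pieces:

#include <stdint.h>
#include <stdio.h>

#define NO_RECOVERY	(~0ull)
#define PAGE_SHIFT_SIM	12	/* assumes 4 KiB pages, illustration only */

/* Stub standing in for memory_failure(); only prints what it would do. */
static int memory_failure_stub(uint64_t pfn)
{
	printf("mark pfn 0x%llx poisoned\n", (unsigned long long)pfn);
	return 0;
}

static void machine_check_sketch(uint64_t fault_paddr, int recoverable)
{
	uint64_t recover_paddr = NO_RECOVERY;

	/* --- atomic part (bracketed by ist_enter()/ist_exit() in the patch) --- */
	if (recoverable)
		recover_paddr = fault_paddr;	/* just remember the address */

	if (recover_paddr == NO_RECOVERY)
		return;				/* nothing to recover */

	/* --- non-atomic tail (ist_begin_non_atomic() + local_irq_enable()) --- */
	if (memory_failure_stub(recover_paddr >> PAGE_SHIFT_SIM) < 0)
		printf("Memory error not recovered\n");
	/* --- local_irq_disable() + ist_end_non_atomic() would follow here --- */
}

int main(void)
{
	machine_check_sketch(0x123456000ull, 1);
	return 0;
}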
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..ec2663a708e4 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,7 @@
8#include <linux/smp.h> 8#include <linux/smp.h>
9 9
10#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/traps.h>
11#include <asm/mce.h> 12#include <asm/mce.h>
12#include <asm/msr.h> 13#include <asm/msr.h>
13 14
@@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly;
17/* Machine check handler for Pentium class Intel CPUs: */ 18/* Machine check handler for Pentium class Intel CPUs: */
18static void pentium_machine_check(struct pt_regs *regs, long error_code) 19static void pentium_machine_check(struct pt_regs *regs, long error_code)
19{ 20{
21 enum ctx_state prev_state;
20 u32 loaddr, hi, lotype; 22 u32 loaddr, hi, lotype;
21 23
24 prev_state = ist_enter(regs);
25
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 26 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 27 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 28
@@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
33 } 37 }
34 38
35 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 39 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
40
41 ist_exit(regs, prev_state);
36} 42}
37 43
38/* Set up machine check reporting for processors with Intel style MCE: */ 44/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..bd5d46a32210 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,14 +7,19 @@
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include <asm/processor.h> 9#include <asm/processor.h>
10#include <asm/traps.h>
10#include <asm/mce.h> 11#include <asm/mce.h>
11#include <asm/msr.h> 12#include <asm/msr.h>
12 13
13/* Machine check handler for WinChip C6: */ 14/* Machine check handler for WinChip C6: */
14static void winchip_machine_check(struct pt_regs *regs, long error_code) 15static void winchip_machine_check(struct pt_regs *regs, long error_code)
15{ 16{
17 enum ctx_state prev_state = ist_enter(regs);
18
16 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 19 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
17 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 20 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
21
22 ist_exit(regs, prev_state);
18} 23}
19 24
20/* Set up machine check reporting on the Winchip C6 series */ 25/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9ebaf63ba182..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64)
143 movq \tmp,RSP+\offset(%rsp) 143 movq \tmp,RSP+\offset(%rsp)
144 movq $__USER_DS,SS+\offset(%rsp) 144 movq $__USER_DS,SS+\offset(%rsp)
145 movq $__USER_CS,CS+\offset(%rsp) 145 movq $__USER_CS,CS+\offset(%rsp)
146 movq $-1,RCX+\offset(%rsp) 146 movq RIP+\offset(%rsp),\tmp /* get rip */
147 movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
147 movq R11+\offset(%rsp),\tmp /* get eflags */ 148 movq R11+\offset(%rsp),\tmp /* get eflags */
148 movq \tmp,EFLAGS+\offset(%rsp) 149 movq \tmp,EFLAGS+\offset(%rsp)
149 .endm 150 .endm
@@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64)
155 movq \tmp,R11+\offset(%rsp) 156 movq \tmp,R11+\offset(%rsp)
156 .endm 157 .endm
157 158
158 .macro FAKE_STACK_FRAME child_rip
159 /* push in order ss, rsp, eflags, cs, rip */
160 xorl %eax, %eax
161 pushq_cfi $__KERNEL_DS /* ss */
162 /*CFI_REL_OFFSET ss,0*/
163 pushq_cfi %rax /* rsp */
164 CFI_REL_OFFSET rsp,0
165 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
166 /*CFI_REL_OFFSET rflags,0*/
167 pushq_cfi $__KERNEL_CS /* cs */
168 /*CFI_REL_OFFSET cs,0*/
169 pushq_cfi \child_rip /* rip */
170 CFI_REL_OFFSET rip,0
171 pushq_cfi %rax /* orig rax */
172 .endm
173
174 .macro UNFAKE_STACK_FRAME
175 addq $8*6, %rsp
176 CFI_ADJUST_CFA_OFFSET -(6*8)
177 .endm
178
179/* 159/*
180 * initial frame state for interrupts (and exceptions without error code) 160 * initial frame state for interrupts (and exceptions without error code)
181 */ 161 */
@@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
238 CFI_REL_OFFSET r15, R15+\offset 218 CFI_REL_OFFSET r15, R15+\offset
239 .endm 219 .endm
240 220
241/* save partial stack frame */
242 .macro SAVE_ARGS_IRQ
243 cld
244 /* start from rbp in pt_regs and jump over */
245 movq_cfi rdi, (RDI-RBP)
246 movq_cfi rsi, (RSI-RBP)
247 movq_cfi rdx, (RDX-RBP)
248 movq_cfi rcx, (RCX-RBP)
249 movq_cfi rax, (RAX-RBP)
250 movq_cfi r8, (R8-RBP)
251 movq_cfi r9, (R9-RBP)
252 movq_cfi r10, (R10-RBP)
253 movq_cfi r11, (R11-RBP)
254
255 /* Save rbp so that we can unwind from get_irq_regs() */
256 movq_cfi rbp, 0
257
258 /* Save previous stack value */
259 movq %rsp, %rsi
260
261 leaq -RBP(%rsp),%rdi /* arg1 for handler */
262 testl $3, CS-RBP(%rsi)
263 je 1f
264 SWAPGS
265 /*
266 * irq_count is used to check if a CPU is already on an interrupt stack
267 * or not. While this is essentially redundant with preempt_count it is
268 * a little cheaper to use a separate counter in the PDA (short of
269 * moving irq_enter into assembly, which would be too much work)
270 */
2711: incl PER_CPU_VAR(irq_count)
272 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
273 CFI_DEF_CFA_REGISTER rsi
274
275 /* Store previous stack value */
276 pushq %rsi
277 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
278 0x77 /* DW_OP_breg7 */, 0, \
279 0x06 /* DW_OP_deref */, \
280 0x08 /* DW_OP_const1u */, SS+8-RBP, \
281 0x22 /* DW_OP_plus */
282 /* We entered an interrupt context - irqs are off: */
283 TRACE_IRQS_OFF
284 .endm
285
286ENTRY(save_paranoid) 221ENTRY(save_paranoid)
287 XCPT_FRAME 1 RDI+8 222 XCPT_FRAME 1 RDI+8
288 cld 223 cld
@@ -426,15 +361,12 @@ system_call_fastpath:
426 * Has incomplete stack frame and undefined top of stack. 361 * Has incomplete stack frame and undefined top of stack.
427 */ 362 */
428ret_from_sys_call: 363ret_from_sys_call:
429 movl $_TIF_ALLWORK_MASK,%edi 364 testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
430 /* edi: flagmask */ 365 jnz int_ret_from_sys_call_fixup /* Go to the slow path */
431sysret_check: 366
432 LOCKDEP_SYS_EXIT 367 LOCKDEP_SYS_EXIT
433 DISABLE_INTERRUPTS(CLBR_NONE) 368 DISABLE_INTERRUPTS(CLBR_NONE)
434 TRACE_IRQS_OFF 369 TRACE_IRQS_OFF
435 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
436 andl %edi,%edx
437 jnz sysret_careful
438 CFI_REMEMBER_STATE 370 CFI_REMEMBER_STATE
439 /* 371 /*
440 * sysretq will re-enable interrupts: 372 * sysretq will re-enable interrupts:
@@ -448,49 +380,10 @@ sysret_check:
448 USERGS_SYSRET64 380 USERGS_SYSRET64
449 381
450 CFI_RESTORE_STATE 382 CFI_RESTORE_STATE
451 /* Handle reschedules */
452 /* edx: work, edi: workmask */
453sysret_careful:
454 bt $TIF_NEED_RESCHED,%edx
455 jnc sysret_signal
456 TRACE_IRQS_ON
457 ENABLE_INTERRUPTS(CLBR_NONE)
458 pushq_cfi %rdi
459 SCHEDULE_USER
460 popq_cfi %rdi
461 jmp sysret_check
462 383
463 /* Handle a signal */ 384int_ret_from_sys_call_fixup:
464sysret_signal:
465 TRACE_IRQS_ON
466 ENABLE_INTERRUPTS(CLBR_NONE)
467#ifdef CONFIG_AUDITSYSCALL
468 bt $TIF_SYSCALL_AUDIT,%edx
469 jc sysret_audit
470#endif
471 /*
472 * We have a signal, or exit tracing or single-step.
473 * These all wind up with the iret return path anyway,
474 * so just join that path right now.
475 */
476 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET 385 FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
477 jmp int_check_syscall_exit_work 386 jmp int_ret_from_sys_call
478
479#ifdef CONFIG_AUDITSYSCALL
480 /*
481 * Return fast path for syscall audit. Call __audit_syscall_exit()
482 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
483 * masked off.
484 */
485sysret_audit:
486 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
487 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
488 setbe %al /* 1 if so, 0 if not */
489 movzbl %al,%edi /* zero-extend that into %edi */
490 call __audit_syscall_exit
491 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
492 jmp sysret_check
493#endif /* CONFIG_AUDITSYSCALL */
494 387
495 /* Do syscall tracing */ 388 /* Do syscall tracing */
496tracesys: 389tracesys:
@@ -626,19 +519,6 @@ END(\label)
626 FORK_LIKE vfork 519 FORK_LIKE vfork
627 FIXED_FRAME stub_iopl, sys_iopl 520 FIXED_FRAME stub_iopl, sys_iopl
628 521
629ENTRY(ptregscall_common)
630 DEFAULT_FRAME 1 8 /* offset 8: return address */
631 RESTORE_TOP_OF_STACK %r11, 8
632 movq_cfi_restore R15+8, r15
633 movq_cfi_restore R14+8, r14
634 movq_cfi_restore R13+8, r13
635 movq_cfi_restore R12+8, r12
636 movq_cfi_restore RBP+8, rbp
637 movq_cfi_restore RBX+8, rbx
638 ret $REST_SKIP /* pop extended registers */
639 CFI_ENDPROC
640END(ptregscall_common)
641
642ENTRY(stub_execve) 522ENTRY(stub_execve)
643 CFI_STARTPROC 523 CFI_STARTPROC
644 addq $8, %rsp 524 addq $8, %rsp
@@ -779,7 +659,48 @@ END(interrupt)
779 /* reserve pt_regs for scratch regs and rbp */ 659 /* reserve pt_regs for scratch regs and rbp */
780 subq $ORIG_RAX-RBP, %rsp 660 subq $ORIG_RAX-RBP, %rsp
781 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP 661 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
782 SAVE_ARGS_IRQ 662 cld
663 /* start from rbp in pt_regs and jump over */
664 movq_cfi rdi, (RDI-RBP)
665 movq_cfi rsi, (RSI-RBP)
666 movq_cfi rdx, (RDX-RBP)
667 movq_cfi rcx, (RCX-RBP)
668 movq_cfi rax, (RAX-RBP)
669 movq_cfi r8, (R8-RBP)
670 movq_cfi r9, (R9-RBP)
671 movq_cfi r10, (R10-RBP)
672 movq_cfi r11, (R11-RBP)
673
674 /* Save rbp so that we can unwind from get_irq_regs() */
675 movq_cfi rbp, 0
676
677 /* Save previous stack value */
678 movq %rsp, %rsi
679
680 leaq -RBP(%rsp),%rdi /* arg1 for handler */
681 testl $3, CS-RBP(%rsi)
682 je 1f
683 SWAPGS
684 /*
685 * irq_count is used to check if a CPU is already on an interrupt stack
686 * or not. While this is essentially redundant with preempt_count it is
687 * a little cheaper to use a separate counter in the PDA (short of
688 * moving irq_enter into assembly, which would be too much work)
689 */
6901: incl PER_CPU_VAR(irq_count)
691 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
692 CFI_DEF_CFA_REGISTER rsi
693
694 /* Store previous stack value */
695 pushq %rsi
696 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
697 0x77 /* DW_OP_breg7 */, 0, \
698 0x06 /* DW_OP_deref */, \
699 0x08 /* DW_OP_const1u */, SS+8-RBP, \
700 0x22 /* DW_OP_plus */
701 /* We entered an interrupt context - irqs are off: */
702 TRACE_IRQS_OFF
703
783 call \func 704 call \func
784 .endm 705 .endm
785 706
@@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */
831 */ 752 */
832 DISABLE_INTERRUPTS(CLBR_ANY) 753 DISABLE_INTERRUPTS(CLBR_ANY)
833 TRACE_IRQS_IRETQ 754 TRACE_IRQS_IRETQ
755
756 /*
757 * Try to use SYSRET instead of IRET if we're returning to
758 * a completely clean 64-bit userspace context.
759 */
760 movq (RCX-R11)(%rsp), %rcx
761 cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
762 jne opportunistic_sysret_failed
763
764 /*
765 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
766 * in kernel space. This essentially lets the user take over
767 * the kernel, since userspace controls RSP. It's not worth
768 * testing for canonicalness exactly -- this check detects any
769 * of the 17 high bits set, which is true for non-canonical
770 * or kernel addresses. (This will pessimize vsyscall=native.
771 * Big deal.)
772 *
773 * If virtual addresses ever become wider, this will need
774 * to be updated to remain correct on both old and new CPUs.
775 */
776 .ifne __VIRTUAL_MASK_SHIFT - 47
777 .error "virtual address width changed -- sysret checks need update"
778 .endif
779 shr $__VIRTUAL_MASK_SHIFT, %rcx
780 jnz opportunistic_sysret_failed
781
782 cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
783 jne opportunistic_sysret_failed
784
785 movq (R11-ARGOFFSET)(%rsp), %r11
786 cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
787 jne opportunistic_sysret_failed
788
789 testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
790 jnz opportunistic_sysret_failed
791
792 /* nothing to check for RSP */
793
794 cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
795 jne opportunistic_sysret_failed
796
797 /*
798 * We win! This label is here just for ease of understanding
799 * perf profiles. Nothing jumps here.
800 */
801irq_return_via_sysret:
802 CFI_REMEMBER_STATE
803 RESTORE_ARGS 1,8,1
804 movq (RSP-RIP)(%rsp),%rsp
805 USERGS_SYSRET64
806 CFI_RESTORE_STATE
807
808opportunistic_sysret_failed:
834 SWAPGS 809 SWAPGS
835 jmp restore_args 810 jmp restore_args
836 811
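
The non-canonical check above relies on the fact that shifting a 64-bit address right by __VIRTUAL_MASK_SHIFT (47 here) leaves a non-zero result exactly when any of the 17 high bits are set. A small stand-alone sketch of the same test; the helper name and example addresses are illustrative:

#include <stdio.h>

#define VIRTUAL_MASK_SHIFT 47	/* stands in for __VIRTUAL_MASK_SHIFT on 47-bit VA CPUs */

/* Hypothetical helper: returns 1 if SYSRET may restore this RCX/RIP value. */
static int sysret_rip_ok(unsigned long long rcx)
{
	return (rcx >> VIRTUAL_MASK_SHIFT) == 0;
}

int main(void)
{
	printf("0x00007fffffffe000 -> %d\n", sysret_rip_ok(0x00007fffffffe000ull)); /* user addr: ok */
	printf("0xffffffff81000000 -> %d\n", sysret_rip_ok(0xffffffff81000000ull)); /* kernel addr: not ok */
	printf("0x0000900000000000 -> %d\n", sysret_rip_ok(0x0000900000000000ull)); /* non-canonical: not ok */
	return 0;
}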
@@ -1048,6 +1023,11 @@ ENTRY(\sym)
1048 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1023 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1049 1024
1050 .if \paranoid 1025 .if \paranoid
1026 .if \paranoid == 1
1027 CFI_REMEMBER_STATE
1028 testl $3, CS(%rsp) /* If coming from userspace, switch */
1029 jnz 1f /* stacks. */
1030 .endif
1051 call save_paranoid 1031 call save_paranoid
1052 .else 1032 .else
1053 call error_entry 1033 call error_entry
@@ -1088,6 +1068,36 @@ ENTRY(\sym)
1088 jmp error_exit /* %ebx: no swapgs flag */ 1068 jmp error_exit /* %ebx: no swapgs flag */
1089 .endif 1069 .endif
1090 1070
1071 .if \paranoid == 1
1072 CFI_RESTORE_STATE
1073 /*
1074 * Paranoid entry from userspace. Switch stacks and treat it
1075 * as a normal entry. This means that paranoid handlers
1076 * run in real process context if user_mode(regs).
1077 */
10781:
1079 call error_entry
1080
1081 DEFAULT_FRAME 0
1082
1083 movq %rsp,%rdi /* pt_regs pointer */
1084 call sync_regs
1085 movq %rax,%rsp /* switch stack */
1086
1087 movq %rsp,%rdi /* pt_regs pointer */
1088
1089 .if \has_error_code
1090 movq ORIG_RAX(%rsp),%rsi /* get error code */
1091 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1092 .else
1093 xorl %esi,%esi /* no error code */
1094 .endif
1095
1096 call \do_sym
1097
1098 jmp error_exit /* %ebx: no swapgs flag */
1099 .endif
1100
1091 CFI_ENDPROC 1101 CFI_ENDPROC
1092END(\sym) 1102END(\sym)
1093.endm 1103.endm
@@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0
1108idtentry bounds do_bounds has_error_code=0 1118idtentry bounds do_bounds has_error_code=0
1109idtentry invalid_op do_invalid_op has_error_code=0 1119idtentry invalid_op do_invalid_op has_error_code=0
1110idtentry device_not_available do_device_not_available has_error_code=0 1120idtentry device_not_available do_device_not_available has_error_code=0
1111idtentry double_fault do_double_fault has_error_code=1 paranoid=1 1121idtentry double_fault do_double_fault has_error_code=1 paranoid=2
1112idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 1122idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
1113idtentry invalid_TSS do_invalid_TSS has_error_code=1 1123idtentry invalid_TSS do_invalid_TSS has_error_code=1
1114idtentry segment_not_present do_segment_not_present has_error_code=1 1124idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
1289#endif 1299#endif
1290 1300
1291 /* 1301 /*
1292 * "Paranoid" exit path from exception stack. 1302 * "Paranoid" exit path from exception stack. This is invoked
1293 * Paranoid because this is used by NMIs and cannot take 1303 * only on return from non-NMI IST interrupts that came
1294 * any kernel state for granted. 1304 * from kernel space.
1295 * We don't do kernel preemption checks here, because only
1296 * NMI should be common and it does not enable IRQs and
1297 * cannot get reschedule ticks.
1298 * 1305 *
1299 * "trace" is 0 for the NMI handler only, because irq-tracing 1306 * We may be returning to very strange contexts (e.g. very early
1300 * is fundamentally NMI-unsafe. (we cannot change the soft and 1307 * in syscall entry), so checking for preemption here would
1301 * hard flags at once, atomically) 1308 * be complicated. Fortunately, there's no good reason
1309 * to try to handle preemption here.
1302 */ 1310 */
1303 1311
1304 /* ebx: no swapgs flag */ 1312 /* ebx: no swapgs flag */
@@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit)
1308 TRACE_IRQS_OFF_DEBUG 1316 TRACE_IRQS_OFF_DEBUG
1309 testl %ebx,%ebx /* swapgs needed? */ 1317 testl %ebx,%ebx /* swapgs needed? */
1310 jnz paranoid_restore 1318 jnz paranoid_restore
1311 testl $3,CS(%rsp)
1312 jnz paranoid_userspace
1313paranoid_swapgs:
1314 TRACE_IRQS_IRETQ 0 1319 TRACE_IRQS_IRETQ 0
1315 SWAPGS_UNSAFE_STACK 1320 SWAPGS_UNSAFE_STACK
1316 RESTORE_ALL 8 1321 RESTORE_ALL 8
1317 jmp irq_return 1322 INTERRUPT_RETURN
1318paranoid_restore: 1323paranoid_restore:
1319 TRACE_IRQS_IRETQ_DEBUG 0 1324 TRACE_IRQS_IRETQ_DEBUG 0
1320 RESTORE_ALL 8 1325 RESTORE_ALL 8
1321 jmp irq_return 1326 INTERRUPT_RETURN
1322paranoid_userspace:
1323 GET_THREAD_INFO(%rcx)
1324 movl TI_flags(%rcx),%ebx
1325 andl $_TIF_WORK_MASK,%ebx
1326 jz paranoid_swapgs
1327 movq %rsp,%rdi /* &pt_regs */
1328 call sync_regs
1329 movq %rax,%rsp /* switch stack for scheduling */
1330 testl $_TIF_NEED_RESCHED,%ebx
1331 jnz paranoid_schedule
1332 movl %ebx,%edx /* arg3: thread flags */
1333 TRACE_IRQS_ON
1334 ENABLE_INTERRUPTS(CLBR_NONE)
1335 xorl %esi,%esi /* arg2: oldset */
1336 movq %rsp,%rdi /* arg1: &pt_regs */
1337 call do_notify_resume
1338 DISABLE_INTERRUPTS(CLBR_NONE)
1339 TRACE_IRQS_OFF
1340 jmp paranoid_userspace
1341paranoid_schedule:
1342 TRACE_IRQS_ON
1343 ENABLE_INTERRUPTS(CLBR_ANY)
1344 SCHEDULE_USER
1345 DISABLE_INTERRUPTS(CLBR_ANY)
1346 TRACE_IRQS_OFF
1347 jmp paranoid_userspace
1348 CFI_ENDPROC 1327 CFI_ENDPROC
1349END(paranoid_exit) 1328END(paranoid_exit)
1350 1329
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 63ce838e5a54..28d28f5eb8f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
69 : "memory", "cc", "edx", "ecx", "eax"); 69 : "memory", "cc", "edx", "ecx", "eax");
70} 70}
71 71
72/* how to get the current stack pointer from C */
73#define current_stack_pointer ({ \
74 unsigned long sp; \
75 asm("mov %%esp,%0" : "=g" (sp)); \
76 sp; \
77})
78
79static inline void *current_stack(void) 72static inline void *current_stack(void)
80{ 73{
81 return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); 74 return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
82} 75}
83 76
84static inline int 77static inline int
@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
103 96
104 /* Save the next esp at the bottom of the stack */ 97 /* Save the next esp at the bottom of the stack */
105 prev_esp = (u32 *)irqstk; 98 prev_esp = (u32 *)irqstk;
106 *prev_esp = current_stack_pointer; 99 *prev_esp = current_stack_pointer();
107 100
108 if (unlikely(overflow)) 101 if (unlikely(overflow))
109 call_on_stack(print_stack_overflow, isp); 102 call_on_stack(print_stack_overflow, isp);
@@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
156 149
157 /* Push the previous esp onto the stack */ 150 /* Push the previous esp onto the stack */
158 prev_esp = (u32 *)irqstk; 151 prev_esp = (u32 *)irqstk;
159 *prev_esp = current_stack_pointer; 152 *prev_esp = current_stack_pointer();
160 153
161 call_on_stack(__do_softirq, isp); 154 call_on_stack(__do_softirq, isp);
162} 155}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..2a33c8f68319 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
740{ 740{
741 user_exit(); 741 user_exit();
742 742
743#ifdef CONFIG_X86_MCE
744 /* notify userspace of pending MCEs */
745 if (thread_info_flags & _TIF_MCE_NOTIFY)
746 mce_notify_process();
747#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
748
749 if (thread_info_flags & _TIF_UPROBE) 743 if (thread_info_flags & _TIF_UPROBE)
750 uprobe_notify_resume(regs); 744 uprobe_notify_resume(regs);
751 745
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 88900e288021..c74f2f5652da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
108 preempt_count_dec(); 108 preempt_count_dec();
109} 109}
110 110
111enum ctx_state ist_enter(struct pt_regs *regs)
112{
113 enum ctx_state prev_state;
114
115 if (user_mode_vm(regs)) {
116 /* Other than that, we're just an exception. */
117 prev_state = exception_enter();
118 } else {
119 /*
120 * We might have interrupted pretty much anything. In
121 * fact, if we're a machine check, we can even interrupt
122 * NMI processing. We don't want in_nmi() to return true,
123 * but we need to notify RCU.
124 */
125 rcu_nmi_enter();
126 prev_state = IN_KERNEL; /* the value is irrelevant. */
127 }
128
129 /*
130 * We are atomic because we're on the IST stack (or we're on x86_32,
131 * in which case we still shouldn't schedule).
132 *
133 * This must be after exception_enter(), because exception_enter()
134 * won't do anything if in_interrupt() returns true.
135 */
136 preempt_count_add(HARDIRQ_OFFSET);
137
138 /* This code is a bit fragile. Test it. */
139 rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
140
141 return prev_state;
142}
143
144void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
145{
146 /* Must be before exception_exit. */
147 preempt_count_sub(HARDIRQ_OFFSET);
148
149 if (user_mode_vm(regs))
150 return exception_exit(prev_state);
151 else
152 rcu_nmi_exit();
153}
154
155/**
156 * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
157 * @regs: regs passed to the IST exception handler
158 *
159 * IST exception handlers normally cannot schedule. As a special
160 * exception, if the exception interrupted userspace code (i.e.
161 * user_mode_vm(regs) would return true) and the exception was not
162 * a double fault, it can be safe to schedule. ist_begin_non_atomic()
163 * begins a non-atomic section within an ist_enter()/ist_exit() region.
164 * Callers are responsible for enabling interrupts themselves inside
 165 * the non-atomic section, and callers must call ist_end_non_atomic()
166 * before ist_exit().
167 */
168void ist_begin_non_atomic(struct pt_regs *regs)
169{
170 BUG_ON(!user_mode_vm(regs));
171
172 /*
173 * Sanity check: we need to be on the normal thread stack. This
174 * will catch asm bugs and any attempt to use ist_preempt_enable
175 * from double_fault.
176 */
177 BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
178 & ~(THREAD_SIZE - 1)) != 0);
179
180 preempt_count_sub(HARDIRQ_OFFSET);
181}
182
183/**
 184 * ist_end_non_atomic() - end a non-atomic section in an IST exception
185 *
186 * Ends a non-atomic section started with ist_begin_non_atomic().
187 */
188void ist_end_non_atomic(void)
189{
190 preempt_count_add(HARDIRQ_OFFSET);
191}
192
111static nokprobe_inline int 193static nokprobe_inline int
112do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 194do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
113 struct pt_regs *regs, long error_code) 195 struct pt_regs *regs, long error_code)
@@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
251 * end up promoting it to a doublefault. In that case, modify 333 * end up promoting it to a doublefault. In that case, modify
252 * the stack to make it look like we just entered the #GP 334 * the stack to make it look like we just entered the #GP
253 * handler from user space, similar to bad_iret. 335 * handler from user space, similar to bad_iret.
336 *
337 * No need for ist_enter here because we don't use RCU.
254 */ 338 */
255 if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && 339 if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
256 regs->cs == __KERNEL_CS && 340 regs->cs == __KERNEL_CS &&
@@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
263 normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ 347 normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
264 regs->ip = (unsigned long)general_protection; 348 regs->ip = (unsigned long)general_protection;
265 regs->sp = (unsigned long)&normal_regs->orig_ax; 349 regs->sp = (unsigned long)&normal_regs->orig_ax;
350
266 return; 351 return;
267 } 352 }
268#endif 353#endif
269 354
270 exception_enter(); 355 ist_enter(regs); /* Discard prev_state because we won't return. */
271 /* Return not checked because double check cannot be ignored */
272 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 356 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
273 357
274 tsk->thread.error_code = error_code; 358 tsk->thread.error_code = error_code;
@@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
434 if (poke_int3_handler(regs)) 518 if (poke_int3_handler(regs))
435 return; 519 return;
436 520
437 prev_state = exception_enter(); 521 prev_state = ist_enter(regs);
438#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 522#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
439 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 523 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
440 SIGTRAP) == NOTIFY_STOP) 524 SIGTRAP) == NOTIFY_STOP)
@@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
460 preempt_conditional_cli(regs); 544 preempt_conditional_cli(regs);
461 debug_stack_usage_dec(); 545 debug_stack_usage_dec();
462exit: 546exit:
463 exception_exit(prev_state); 547 ist_exit(regs, prev_state);
464} 548}
465NOKPROBE_SYMBOL(do_int3); 549NOKPROBE_SYMBOL(do_int3);
466 550
467#ifdef CONFIG_X86_64 551#ifdef CONFIG_X86_64
468/* 552/*
469 * Help handler running on IST stack to switch back to user stack 553 * Help handler running on IST stack to switch off the IST stack if the
470 * for scheduling or signal handling. The actual stack switch is done in 554 * interrupted code was in user mode. The actual stack switch is done in
471 * entry.S 555 * entry_64.S
472 */ 556 */
473asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) 557asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
474{ 558{
475 struct pt_regs *regs = eregs; 559 struct pt_regs *regs = task_pt_regs(current);
476 /* Did already sync */ 560 *regs = *eregs;
477 if (eregs == (struct pt_regs *)eregs->sp)
478 ;
479 /* Exception from user space */
480 else if (user_mode(eregs))
481 regs = task_pt_regs(current);
482 /*
483 * Exception from kernel and interrupts are enabled. Move to
484 * kernel process stack.
485 */
486 else if (eregs->flags & X86_EFLAGS_IF)
487 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
488 if (eregs != regs)
489 *regs = *eregs;
490 return regs; 561 return regs;
491} 562}
492NOKPROBE_SYMBOL(sync_regs); 563NOKPROBE_SYMBOL(sync_regs);
@@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
554 unsigned long dr6; 625 unsigned long dr6;
555 int si_code; 626 int si_code;
556 627
557 prev_state = exception_enter(); 628 prev_state = ist_enter(regs);
558 629
559 get_debugreg(dr6, 6); 630 get_debugreg(dr6, 6);
560 631
@@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
629 debug_stack_usage_dec(); 700 debug_stack_usage_dec();
630 701
631exit: 702exit:
632 exception_exit(prev_state); 703 ist_exit(regs, prev_state);
633} 704}
634NOKPROBE_SYMBOL(do_debug); 705NOKPROBE_SYMBOL(do_debug);
635 706
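
The BUG_ON() in ist_begin_non_atomic() above uses an XOR-and-mask trick to verify that the current stack pointer and the per-CPU kernel_stack value lie inside the same THREAD_SIZE-aligned stack. A user-space sketch of that test, with an illustrative 16 KiB THREAD_SIZE and made-up addresses:

#include <stdio.h>

#define THREAD_SIZE_SIM (16UL * 1024)	/* illustrative; the real THREAD_SIZE is config-dependent */

/* Zero iff both addresses fall inside the same THREAD_SIZE-aligned stack. */
static int on_same_stack(unsigned long sp, unsigned long kstack)
{
	return ((sp ^ kstack) & ~(THREAD_SIZE_SIM - 1)) == 0;
}

int main(void)
{
	unsigned long base = 0xffff880012344000UL;	/* made-up, THREAD_SIZE-aligned */

	printf("%d\n", on_same_stack(base + 0x100, base + 0x3f00));			/* 1: same stack */
	printf("%d\n", on_same_stack(base + 0x100, base + THREAD_SIZE_SIM + 0x100));	/* 0: different stack */
	return 0;
}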
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 5a4affe025e8..09297c8e1fcd 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -205,4 +205,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
205PHONY += vdso_install $(vdso_img_insttargets) 205PHONY += vdso_install $(vdso_img_insttargets)
206vdso_install: $(vdso_img_insttargets) FORCE 206vdso_install: $(vdso_img_insttargets) FORCE
207 207
208clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* 208clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..4c106fcc0d54 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -759,39 +759,71 @@ void rcu_irq_enter(void)
759/** 759/**
760 * rcu_nmi_enter - inform RCU of entry to NMI context 760 * rcu_nmi_enter - inform RCU of entry to NMI context
761 * 761 *
762 * If the CPU was idle with dynamic ticks active, and there is no 762 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
763 * irq handler running, this updates rdtp->dynticks_nmi to let the 763 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
764 * RCU grace-period handling know that the CPU is active. 764 * that the CPU is active. This implementation permits nested NMIs, as
765 * long as the nesting level does not overflow an int. (You will probably
766 * run out of stack space first.)
765 */ 767 */
766void rcu_nmi_enter(void) 768void rcu_nmi_enter(void)
767{ 769{
768 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 770 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
771 int incby = 2;
769 772
770 if (rdtp->dynticks_nmi_nesting == 0 && 773 /* Complain about underflow. */
771 (atomic_read(&rdtp->dynticks) & 0x1)) 774 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
772 return; 775
773 rdtp->dynticks_nmi_nesting++; 776 /*
774 smp_mb__before_atomic(); /* Force delay from prior write. */ 777 * If idle from RCU viewpoint, atomically increment ->dynticks
775 atomic_inc(&rdtp->dynticks); 778 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
776 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 779 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
777 smp_mb__after_atomic(); /* See above. */ 780 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
778 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 781 * to be in the outermost NMI handler that interrupted an RCU-idle
782 * period (observation due to Andy Lutomirski).
783 */
784 if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
785 smp_mb__before_atomic(); /* Force delay from prior write. */
786 atomic_inc(&rdtp->dynticks);
787 /* atomic_inc() before later RCU read-side crit sects */
788 smp_mb__after_atomic(); /* See above. */
789 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
790 incby = 1;
791 }
792 rdtp->dynticks_nmi_nesting += incby;
793 barrier();
779} 794}
780 795
781/** 796/**
782 * rcu_nmi_exit - inform RCU of exit from NMI context 797 * rcu_nmi_exit - inform RCU of exit from NMI context
783 * 798 *
784 * If the CPU was idle with dynamic ticks active, and there is no 799 * If we are returning from the outermost NMI handler that interrupted an
785 * irq handler running, this updates rdtp->dynticks_nmi to let the 800 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
786 * RCU grace-period handling know that the CPU is no longer active. 801 * to let the RCU grace-period handling know that the CPU is back to
802 * being RCU-idle.
787 */ 803 */
788void rcu_nmi_exit(void) 804void rcu_nmi_exit(void)
789{ 805{
790 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 806 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
791 807
792 if (rdtp->dynticks_nmi_nesting == 0 || 808 /*
793 --rdtp->dynticks_nmi_nesting != 0) 809 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
810 * (We are exiting an NMI handler, so RCU better be paying attention
811 * to us!)
812 */
813 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
814 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
815
816 /*
817 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
818 * leave it in non-RCU-idle state.
819 */
820 if (rdtp->dynticks_nmi_nesting != 1) {
821 rdtp->dynticks_nmi_nesting -= 2;
794 return; 822 return;
823 }
824
825 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
826 rdtp->dynticks_nmi_nesting = 0;
795 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 827 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
796 smp_mb__before_atomic(); /* See above. */ 828 smp_mb__before_atomic(); /* See above. */
797 atomic_inc(&rdtp->dynticks); 829 atomic_inc(&rdtp->dynticks);
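
The new rcu_nmi_enter()/rcu_nmi_exit() bookkeeping above adds 1 to ->dynticks_nmi_nesting for the outermost NMI that finds the CPU RCU-idle and 2 for every nested NMI, so a value of exactly 1 at exit time identifies the outermost handler. A toy simulation of that counter scheme; no RCU involved and all names are invented:

#include <stdio.h>

static int dynticks_odd;	/* stands in for (dynticks & 0x1): non-zero means non-idle */
static int nmi_nesting;		/* stands in for ->dynticks_nmi_nesting */

static void nmi_enter_sim(void)
{
	int incby = 2;

	if (!dynticks_odd) {		/* CPU looked RCU-idle: mark non-idle, add only 1 */
		dynticks_odd = 1;
		incby = 1;
	}
	nmi_nesting += incby;
}

static void nmi_exit_sim(void)
{
	if (nmi_nesting != 1) {		/* nested NMI: stay non-idle */
		nmi_nesting -= 2;
		return;
	}
	nmi_nesting = 0;		/* outermost NMI: return to RCU-idle */
	dynticks_odd = 0;
}

int main(void)
{
	nmi_enter_sim();		/* outermost NMI -> nesting == 1 */
	nmi_enter_sim();		/* nested NMI    -> nesting == 3 */
	nmi_exit_sim();			/* back to nesting == 1 */
	nmi_exit_sim();			/* outermost exit -> idle again */
	printf("nesting=%d, rcu-idle=%d\n", nmi_nesting, !dynticks_odd);
	return 0;
}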