author     Ingo Molnar <mingo@kernel.org>    2015-02-04 03:01:12 -0500
committer  Ingo Molnar <mingo@kernel.org>    2015-02-04 03:01:12 -0500
commit     0967160ad615985c7c35443156ea9aecc60c37b8
tree       658f728aff1be23540180091b718452a6848a6b0
parent     2fde4f94e0a9531251e706fa57131b51b0df042e
parent     b57c0b5175ddbe9b477801f9994a5b330702c1ba
Merge branch 'x86/asm' into perf/x86, to avoid conflicts with upcoming patches
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  Documentation/x86/entry_64.txt           |  18
-rw-r--r--  Documentation/x86/x86_64/kernel-stacks   |   8
-rw-r--r--  arch/x86/ia32/ia32entry.S                |   4
-rw-r--r--  arch/x86/include/asm/calling.h           |   1
-rw-r--r--  arch/x86/include/asm/mce.h               |   1
-rw-r--r--  arch/x86/include/asm/thread_info.h       |  15
-rw-r--r--  arch/x86/include/asm/traps.h             |   6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c         | 114
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c          |   6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c     |   5
-rw-r--r--  arch/x86/kernel/entry_64.S               | 317
-rw-r--r--  arch/x86/kernel/irq_32.c                 |  13
-rw-r--r--  arch/x86/kernel/signal.c                 |   6
-rw-r--r--  arch/x86/kernel/traps.c                  | 119
-rw-r--r--  arch/x86/vdso/Makefile                   |   2
-rw-r--r--  kernel/rcu/tree.c                        |  66

16 files changed, 374 insertions, 327 deletions
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt
index 4a1c5c2dc5a9..9132b86176a3 100644
--- a/Documentation/x86/entry_64.txt
+++ b/Documentation/x86/entry_64.txt
@@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value | |||
78 | xorl %ebx,%ebx | 78 | xorl %ebx,%ebx |
79 | 1: ret | 79 | 1: ret |
80 | 80 | ||
81 | and the whole paranoid non-paranoid macro complexity is about whether | ||
82 | to suffer that RDMSR cost. | ||
83 | |||
84 | If we are at an interrupt or user-trap/gate-alike boundary then we can | 81 | If we are at an interrupt or user-trap/gate-alike boundary then we can |
85 | use the faster check: the stack will be a reliable indicator of | 82 | use the faster check: the stack will be a reliable indicator of |
86 | whether SWAPGS was already done: if we see that we are a secondary | 83 | whether SWAPGS was already done: if we see that we are a secondary |
@@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the | |||
93 | stack but before we executed SWAPGS, then the only safe way to check | 90 | stack but before we executed SWAPGS, then the only safe way to check |
94 | for GS is the slower method: the RDMSR. | 91 | for GS is the slower method: the RDMSR. |
95 | 92 | ||
96 | So we try only to mark those entry methods 'paranoid' that absolutely | 93 | Therefore, super-atomic entries (except NMI, which is handled separately) |
97 | need the more expensive check for the GS base - and we generate all | 94 | must use idtentry with paranoid=1 to handle gsbase correctly. This |
98 | 'normal' entry points with the regular (faster) entry macros. | 95 | triggers three main behavior changes: |
96 | |||
97 | - Interrupt entry will use the slower gsbase check. | ||
98 | - Interrupt entry from user mode will switch off the IST stack. | ||
99 | - Interrupt exit to kernel mode will not attempt to reschedule. | ||
100 | |||
101 | We try to only use IST entries and the paranoid entry code for vectors | ||
102 | that absolutely need the more expensive check for the GS base - and we | ||
103 | generate all 'normal' entry points with the regular (faster) paranoid=0 | ||
104 | variant. | ||
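A note for readers skimming the patch: the "slower gsbase check" that this documentation refers to boils down to reading MSR_GS_BASE and testing its sign bit. The real check lives in the save_paranoid assembly; the C sketch below is illustrative only (it assumes the kernel's rdmsrl()/MSR_GS_BASE definitions and is not code from this series):

/*
 * Rough sketch, not part of this patch: paranoid entry reads MSR_GS_BASE
 * and uses the sign bit to decide whether SWAPGS has already run.  Kernel
 * GS bases live in the upper half of the canonical address space, so a
 * negative value means the kernel gsbase is already loaded.
 */
static bool kernel_gsbase_already_loaded(void)
{
	u64 gsbase;

	rdmsrl(MSR_GS_BASE, gsbase);	/* the RDMSR cost discussed above */
	return (s64)gsbase < 0;
}
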
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks
index a01eec5d1d0b..e3c8a49d1a2f 100644
--- a/Documentation/x86/x86_64/kernel-stacks
+++ b/Documentation/x86/x86_64/kernel-stacks
@@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an | |||
40 | interrupt-gate descriptor. When an interrupt occurs and the hardware | 40 | interrupt-gate descriptor. When an interrupt occurs and the hardware |
41 | loads such a descriptor, the hardware automatically sets the new stack | 41 | loads such a descriptor, the hardware automatically sets the new stack |
42 | pointer based on the IST value, then invokes the interrupt handler. If | 42 | pointer based on the IST value, then invokes the interrupt handler. If |
43 | software wants to allow nested IST interrupts then the handler must | 43 | the interrupt came from user mode, then the interrupt handler prologue |
44 | adjust the IST values on entry to and exit from the interrupt handler. | 44 | will switch back to the per-thread stack. If software wants to allow |
45 | (This is occasionally done, e.g. for debug exceptions.) | 45 | nested IST interrupts then the handler must adjust the IST values on |
46 | entry to and exit from the interrupt handler. (This is occasionally | ||
47 | done, e.g. for debug exceptions.) | ||
46 | 48 | ||
47 | Events with different IST codes (i.e. with different stacks) can be | 49 | Events with different IST codes (i.e. with different stacks) can be |
48 | nested. For example, a debug interrupt can safely be interrupted by an | 50 | nested. For example, a debug interrupt can safely be interrupted by an |
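For reference, an exception is tied to an IST slot when its gate descriptor is installed. The fragment below is paraphrased from the x86 trap-setup code (names recalled from memory, not taken from this patch) and only illustrates where the IST field comes from:

/*
 * Sketch: the third argument becomes the IST field of the gate
 * descriptor, so the CPU switches to that per-CPU stack when the vector
 * is delivered.  Gates installed without an IST index keep using the
 * interrupted context's stack.
 */
set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);	/* #DB runs on an IST */
set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);	/* NMI runs on an IST */
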
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 82e8a1d44658..156ebcab4ada 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -179,8 +179,8 @@ sysenter_dispatch: | |||
179 | sysexit_from_sys_call: | 179 | sysexit_from_sys_call: |
180 | andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 180 | andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) |
181 | /* clear IF, that popfq doesn't enable interrupts early */ | 181 | /* clear IF, that popfq doesn't enable interrupts early */ |
182 | andl $~0x200,EFLAGS-R11(%rsp) | 182 | andl $~0x200,EFLAGS-ARGOFFSET(%rsp) |
183 | movl RIP-R11(%rsp),%edx /* User %eip */ | 183 | movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ |
184 | CFI_REGISTER rip,rdx | 184 | CFI_REGISTER rip,rdx |
185 | RESTORE_ARGS 0,24,0,0,0,0 | 185 | RESTORE_ARGS 0,24,0,0,0,0 |
186 | xorq %r8,%r8 | 186 | xorq %r8,%r8 |
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 76659b67fd11..1f1297b46f83 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with | |||
83 | #define SS 160 | 83 | #define SS 160 |
84 | 84 | ||
85 | #define ARGOFFSET R11 | 85 | #define ARGOFFSET R11 |
86 | #define SWFRAME ORIG_RAX | ||
87 | 86 | ||
88 | .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 | 87 | .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 |
89 | subq $9*8+\addskip, %rsp | 88 | subq $9*8+\addskip, %rsp |
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 51b26e895933..9b3de99dc004 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -190,7 +190,6 @@ enum mcp_flags { | |||
190 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); | 190 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); |
191 | 191 | ||
192 | int mce_notify_irq(void); | 192 | int mce_notify_irq(void); |
193 | void mce_notify_process(void); | ||
194 | 193 | ||
195 | DECLARE_PER_CPU(struct mce, injectm); | 194 | DECLARE_PER_CPU(struct mce, injectm); |
196 | 195 | ||
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 547e344a6dc6..e82e95abc92b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -75,7 +75,6 @@ struct thread_info { | |||
75 | #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ | 75 | #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ |
76 | #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ | 76 | #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ |
77 | #define TIF_SECCOMP 8 /* secure computing */ | 77 | #define TIF_SECCOMP 8 /* secure computing */ |
78 | #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ | ||
79 | #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ | 78 | #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ |
80 | #define TIF_UPROBE 12 /* breakpointed or singlestepping */ | 79 | #define TIF_UPROBE 12 /* breakpointed or singlestepping */ |
81 | #define TIF_NOTSC 16 /* TSC is not accessible in userland */ | 80 | #define TIF_NOTSC 16 /* TSC is not accessible in userland */ |
@@ -100,7 +99,6 @@ struct thread_info { | |||
100 | #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) | 99 | #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) |
101 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | 100 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) |
102 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) | 101 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) |
103 | #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) | ||
104 | #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) | 102 | #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) |
105 | #define _TIF_UPROBE (1 << TIF_UPROBE) | 103 | #define _TIF_UPROBE (1 << TIF_UPROBE) |
106 | #define _TIF_NOTSC (1 << TIF_NOTSC) | 104 | #define _TIF_NOTSC (1 << TIF_NOTSC) |
@@ -140,7 +138,7 @@ struct thread_info { | |||
140 | 138 | ||
141 | /* Only used for 64 bit */ | 139 | /* Only used for 64 bit */ |
142 | #define _TIF_DO_NOTIFY_MASK \ | 140 | #define _TIF_DO_NOTIFY_MASK \ |
143 | (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ | 141 | (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ |
144 | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) | 142 | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) |
145 | 143 | ||
146 | /* flags to check in __switch_to() */ | 144 | /* flags to check in __switch_to() */ |
@@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void) | |||
170 | return ti; | 168 | return ti; |
171 | } | 169 | } |
172 | 170 | ||
171 | static inline unsigned long current_stack_pointer(void) | ||
172 | { | ||
173 | unsigned long sp; | ||
174 | #ifdef CONFIG_X86_64 | ||
175 | asm("mov %%rsp,%0" : "=g" (sp)); | ||
176 | #else | ||
177 | asm("mov %%esp,%0" : "=g" (sp)); | ||
178 | #endif | ||
179 | return sp; | ||
180 | } | ||
181 | |||
173 | #else /* !__ASSEMBLY__ */ | 182 | #else /* !__ASSEMBLY__ */ |
174 | 183 | ||
175 | /* how to get the thread information struct from ASM */ | 184 | /* how to get the thread information struct from ASM */ |
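The current_stack_pointer() helper added above replaces the 32-bit-only macro removed from irq_32.c further down. A typical use is masking the value to find the base of the current kernel stack; the helper name below is hypothetical and shown only to illustrate the pattern:

/*
 * Illustration only: masking the live stack pointer with
 * ~(THREAD_SIZE - 1) yields the base of the current kernel stack, which
 * is exactly how irq_32.c's current_stack() uses the new helper.
 */
static inline void *stack_base(void)
{
	return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
}
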
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 707adc6549d8..4e49d7dff78e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,7 @@ | |||
1 | #ifndef _ASM_X86_TRAPS_H | 1 | #ifndef _ASM_X86_TRAPS_H |
2 | #define _ASM_X86_TRAPS_H | 2 | #define _ASM_X86_TRAPS_H |
3 | 3 | ||
4 | #include <linux/context_tracking_state.h> | ||
4 | #include <linux/kprobes.h> | 5 | #include <linux/kprobes.h> |
5 | 6 | ||
6 | #include <asm/debugreg.h> | 7 | #include <asm/debugreg.h> |
@@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void); | |||
110 | asmlinkage void mce_threshold_interrupt(void); | 111 | asmlinkage void mce_threshold_interrupt(void); |
111 | #endif | 112 | #endif |
112 | 113 | ||
114 | extern enum ctx_state ist_enter(struct pt_regs *regs); | ||
115 | extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); | ||
116 | extern void ist_begin_non_atomic(struct pt_regs *regs); | ||
117 | extern void ist_end_non_atomic(void); | ||
118 | |||
113 | /* Interrupts/Exceptions */ | 119 | /* Interrupts/Exceptions */ |
114 | enum { | 120 | enum { |
115 | X86_TRAP_DE = 0, /* 0, Divide-by-zero */ | 121 | X86_TRAP_DE = 0, /* 0, Divide-by-zero */ |
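The four declarations above define the calling convention used by the IST handlers touched later in this series. Sketched as a hypothetical handler (the real users are do_machine_check() and the #DB/#BP paths below), the expected shape is:

/* Hedged sketch of the intended usage, not code from this patch. */
static void example_ist_handler(struct pt_regs *regs, long error_code)
{
	enum ctx_state prev_state = ist_enter(regs);

	/* ... work that must stay atomic on the IST stack ... */

	if (user_mode_vm(regs)) {
		/* Sleeping is only legal between these two calls. */
		ist_begin_non_atomic(regs);
		local_irq_enable();
		/* ... e.g. memory_failure() in do_machine_check() ... */
		local_irq_disable();
		ist_end_non_atomic();
	}

	ist_exit(regs, prev_state);
}
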
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..d23179900755 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/export.h> | 43 | #include <linux/export.h> |
44 | 44 | ||
45 | #include <asm/processor.h> | 45 | #include <asm/processor.h> |
46 | #include <asm/traps.h> | ||
46 | #include <asm/mce.h> | 47 | #include <asm/mce.h> |
47 | #include <asm/msr.h> | 48 | #include <asm/msr.h> |
48 | 49 | ||
@@ -1003,51 +1004,6 @@ static void mce_clear_state(unsigned long *toclear) | |||
1003 | } | 1004 | } |
1004 | 1005 | ||
1005 | /* | 1006 | /* |
1006 | * Need to save faulting physical address associated with a process | ||
1007 | * in the machine check handler some place where we can grab it back | ||
1008 | * later in mce_notify_process() | ||
1009 | */ | ||
1010 | #define MCE_INFO_MAX 16 | ||
1011 | |||
1012 | struct mce_info { | ||
1013 | atomic_t inuse; | ||
1014 | struct task_struct *t; | ||
1015 | __u64 paddr; | ||
1016 | int restartable; | ||
1017 | } mce_info[MCE_INFO_MAX]; | ||
1018 | |||
1019 | static void mce_save_info(__u64 addr, int c) | ||
1020 | { | ||
1021 | struct mce_info *mi; | ||
1022 | |||
1023 | for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { | ||
1024 | if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { | ||
1025 | mi->t = current; | ||
1026 | mi->paddr = addr; | ||
1027 | mi->restartable = c; | ||
1028 | return; | ||
1029 | } | ||
1030 | } | ||
1031 | |||
1032 | mce_panic("Too many concurrent recoverable errors", NULL, NULL); | ||
1033 | } | ||
1034 | |||
1035 | static struct mce_info *mce_find_info(void) | ||
1036 | { | ||
1037 | struct mce_info *mi; | ||
1038 | |||
1039 | for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) | ||
1040 | if (atomic_read(&mi->inuse) && mi->t == current) | ||
1041 | return mi; | ||
1042 | return NULL; | ||
1043 | } | ||
1044 | |||
1045 | static void mce_clear_info(struct mce_info *mi) | ||
1046 | { | ||
1047 | atomic_set(&mi->inuse, 0); | ||
1048 | } | ||
1049 | |||
1050 | /* | ||
1051 | * The actual machine check handler. This only handles real | 1007 | * The actual machine check handler. This only handles real |
1052 | * exceptions when something got corrupted coming in through int 18. | 1008 | * exceptions when something got corrupted coming in through int 18. |
1053 | * | 1009 | * |
@@ -1063,6 +1019,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1063 | { | 1019 | { |
1064 | struct mca_config *cfg = &mca_cfg; | 1020 | struct mca_config *cfg = &mca_cfg; |
1065 | struct mce m, *final; | 1021 | struct mce m, *final; |
1022 | enum ctx_state prev_state; | ||
1066 | int i; | 1023 | int i; |
1067 | int worst = 0; | 1024 | int worst = 0; |
1068 | int severity; | 1025 | int severity; |
@@ -1084,6 +1041,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1084 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | 1041 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); |
1085 | DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); | 1042 | DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); |
1086 | char *msg = "Unknown"; | 1043 | char *msg = "Unknown"; |
1044 | u64 recover_paddr = ~0ull; | ||
1045 | int flags = MF_ACTION_REQUIRED; | ||
1046 | |||
1047 | prev_state = ist_enter(regs); | ||
1087 | 1048 | ||
1088 | this_cpu_inc(mce_exception_count); | 1049 | this_cpu_inc(mce_exception_count); |
1089 | 1050 | ||
@@ -1203,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1203 | if (no_way_out) | 1164 | if (no_way_out) |
1204 | mce_panic("Fatal machine check on current CPU", &m, msg); | 1165 | mce_panic("Fatal machine check on current CPU", &m, msg); |
1205 | if (worst == MCE_AR_SEVERITY) { | 1166 | if (worst == MCE_AR_SEVERITY) { |
1206 | /* schedule action before return to userland */ | 1167 | recover_paddr = m.addr; |
1207 | mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); | 1168 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) |
1208 | set_thread_flag(TIF_MCE_NOTIFY); | 1169 | flags |= MF_MUST_KILL; |
1209 | } else if (kill_it) { | 1170 | } else if (kill_it) { |
1210 | force_sig(SIGBUS, current); | 1171 | force_sig(SIGBUS, current); |
1211 | } | 1172 | } |
@@ -1216,6 +1177,27 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1216 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); | 1177 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); |
1217 | out: | 1178 | out: |
1218 | sync_core(); | 1179 | sync_core(); |
1180 | |||
1181 | if (recover_paddr == ~0ull) | ||
1182 | goto done; | ||
1183 | |||
1184 | pr_err("Uncorrected hardware memory error in user-access at %llx", | ||
1185 | recover_paddr); | ||
1186 | /* | ||
1187 | * We must call memory_failure() here even if the current process is | ||
1188 | * doomed. We still need to mark the page as poisoned and alert any | ||
1189 | * other users of the page. | ||
1190 | */ | ||
1191 | ist_begin_non_atomic(regs); | ||
1192 | local_irq_enable(); | ||
1193 | if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { | ||
1194 | pr_err("Memory error not recovered"); | ||
1195 | force_sig(SIGBUS, current); | ||
1196 | } | ||
1197 | local_irq_disable(); | ||
1198 | ist_end_non_atomic(); | ||
1199 | done: | ||
1200 | ist_exit(regs, prev_state); | ||
1219 | } | 1201 | } |
1220 | EXPORT_SYMBOL_GPL(do_machine_check); | 1202 | EXPORT_SYMBOL_GPL(do_machine_check); |
1221 | 1203 | ||
@@ -1233,42 +1215,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) | |||
1233 | #endif | 1215 | #endif |
1234 | 1216 | ||
1235 | /* | 1217 | /* |
1236 | * Called in process context that interrupted by MCE and marked with | ||
1237 | * TIF_MCE_NOTIFY, just before returning to erroneous userland. | ||
1238 | * This code is allowed to sleep. | ||
1239 | * Attempt possible recovery such as calling the high level VM handler to | ||
1240 | * process any corrupted pages, and kill/signal current process if required. | ||
1241 | * Action required errors are handled here. | ||
1242 | */ | ||
1243 | void mce_notify_process(void) | ||
1244 | { | ||
1245 | unsigned long pfn; | ||
1246 | struct mce_info *mi = mce_find_info(); | ||
1247 | int flags = MF_ACTION_REQUIRED; | ||
1248 | |||
1249 | if (!mi) | ||
1250 | mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); | ||
1251 | pfn = mi->paddr >> PAGE_SHIFT; | ||
1252 | |||
1253 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
1254 | |||
1255 | pr_err("Uncorrected hardware memory error in user-access at %llx", | ||
1256 | mi->paddr); | ||
1257 | /* | ||
1258 | * We must call memory_failure() here even if the current process is | ||
1259 | * doomed. We still need to mark the page as poisoned and alert any | ||
1260 | * other users of the page. | ||
1261 | */ | ||
1262 | if (!mi->restartable) | ||
1263 | flags |= MF_MUST_KILL; | ||
1264 | if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { | ||
1265 | pr_err("Memory error not recovered"); | ||
1266 | force_sig(SIGBUS, current); | ||
1267 | } | ||
1268 | mce_clear_info(mi); | ||
1269 | } | ||
1270 | |||
1271 | /* | ||
1272 | * Action optional processing happens here (picking up | 1218 | * Action optional processing happens here (picking up |
1273 | * from the list of faulting pages that do_machine_check() | 1219 | * from the list of faulting pages that do_machine_check() |
1274 | * placed into the "ring"). | 1220 | * placed into the "ring"). |
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..ec2663a708e4 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
9 | 9 | ||
10 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
11 | #include <asm/traps.h> | ||
11 | #include <asm/mce.h> | 12 | #include <asm/mce.h> |
12 | #include <asm/msr.h> | 13 | #include <asm/msr.h> |
13 | 14 | ||
@@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly; | |||
17 | /* Machine check handler for Pentium class Intel CPUs: */ | 18 | /* Machine check handler for Pentium class Intel CPUs: */ |
18 | static void pentium_machine_check(struct pt_regs *regs, long error_code) | 19 | static void pentium_machine_check(struct pt_regs *regs, long error_code) |
19 | { | 20 | { |
21 | enum ctx_state prev_state; | ||
20 | u32 loaddr, hi, lotype; | 22 | u32 loaddr, hi, lotype; |
21 | 23 | ||
24 | prev_state = ist_enter(regs); | ||
25 | |||
22 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); | 26 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); |
23 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); | 27 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); |
24 | 28 | ||
@@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) | |||
33 | } | 37 | } |
34 | 38 | ||
35 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); | 39 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); |
40 | |||
41 | ist_exit(regs, prev_state); | ||
36 | } | 42 | } |
37 | 43 | ||
38 | /* Set up machine check reporting for processors with Intel style MCE: */ | 44 | /* Set up machine check reporting for processors with Intel style MCE: */ |
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..bd5d46a32210 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,14 +7,19 @@ | |||
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | 8 | ||
9 | #include <asm/processor.h> | 9 | #include <asm/processor.h> |
10 | #include <asm/traps.h> | ||
10 | #include <asm/mce.h> | 11 | #include <asm/mce.h> |
11 | #include <asm/msr.h> | 12 | #include <asm/msr.h> |
12 | 13 | ||
13 | /* Machine check handler for WinChip C6: */ | 14 | /* Machine check handler for WinChip C6: */ |
14 | static void winchip_machine_check(struct pt_regs *regs, long error_code) | 15 | static void winchip_machine_check(struct pt_regs *regs, long error_code) |
15 | { | 16 | { |
17 | enum ctx_state prev_state = ist_enter(regs); | ||
18 | |||
16 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); | 19 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); |
17 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); | 20 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); |
21 | |||
22 | ist_exit(regs, prev_state); | ||
18 | } | 23 | } |
19 | 24 | ||
20 | /* Set up machine check reporting on the Winchip C6 series */ | 25 | /* Set up machine check reporting on the Winchip C6 series */ |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9ebaf63ba182..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64) | |||
143 | movq \tmp,RSP+\offset(%rsp) | 143 | movq \tmp,RSP+\offset(%rsp) |
144 | movq $__USER_DS,SS+\offset(%rsp) | 144 | movq $__USER_DS,SS+\offset(%rsp) |
145 | movq $__USER_CS,CS+\offset(%rsp) | 145 | movq $__USER_CS,CS+\offset(%rsp) |
146 | movq $-1,RCX+\offset(%rsp) | 146 | movq RIP+\offset(%rsp),\tmp /* get rip */ |
147 | movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ | ||
147 | movq R11+\offset(%rsp),\tmp /* get eflags */ | 148 | movq R11+\offset(%rsp),\tmp /* get eflags */ |
148 | movq \tmp,EFLAGS+\offset(%rsp) | 149 | movq \tmp,EFLAGS+\offset(%rsp) |
149 | .endm | 150 | .endm |
@@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64) | |||
155 | movq \tmp,R11+\offset(%rsp) | 156 | movq \tmp,R11+\offset(%rsp) |
156 | .endm | 157 | .endm |
157 | 158 | ||
158 | .macro FAKE_STACK_FRAME child_rip | ||
159 | /* push in order ss, rsp, eflags, cs, rip */ | ||
160 | xorl %eax, %eax | ||
161 | pushq_cfi $__KERNEL_DS /* ss */ | ||
162 | /*CFI_REL_OFFSET ss,0*/ | ||
163 | pushq_cfi %rax /* rsp */ | ||
164 | CFI_REL_OFFSET rsp,0 | ||
165 | pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */ | ||
166 | /*CFI_REL_OFFSET rflags,0*/ | ||
167 | pushq_cfi $__KERNEL_CS /* cs */ | ||
168 | /*CFI_REL_OFFSET cs,0*/ | ||
169 | pushq_cfi \child_rip /* rip */ | ||
170 | CFI_REL_OFFSET rip,0 | ||
171 | pushq_cfi %rax /* orig rax */ | ||
172 | .endm | ||
173 | |||
174 | .macro UNFAKE_STACK_FRAME | ||
175 | addq $8*6, %rsp | ||
176 | CFI_ADJUST_CFA_OFFSET -(6*8) | ||
177 | .endm | ||
178 | |||
179 | /* | 159 | /* |
180 | * initial frame state for interrupts (and exceptions without error code) | 160 | * initial frame state for interrupts (and exceptions without error code) |
181 | */ | 161 | */ |
@@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64) | |||
238 | CFI_REL_OFFSET r15, R15+\offset | 218 | CFI_REL_OFFSET r15, R15+\offset |
239 | .endm | 219 | .endm |
240 | 220 | ||
241 | /* save partial stack frame */ | ||
242 | .macro SAVE_ARGS_IRQ | ||
243 | cld | ||
244 | /* start from rbp in pt_regs and jump over */ | ||
245 | movq_cfi rdi, (RDI-RBP) | ||
246 | movq_cfi rsi, (RSI-RBP) | ||
247 | movq_cfi rdx, (RDX-RBP) | ||
248 | movq_cfi rcx, (RCX-RBP) | ||
249 | movq_cfi rax, (RAX-RBP) | ||
250 | movq_cfi r8, (R8-RBP) | ||
251 | movq_cfi r9, (R9-RBP) | ||
252 | movq_cfi r10, (R10-RBP) | ||
253 | movq_cfi r11, (R11-RBP) | ||
254 | |||
255 | /* Save rbp so that we can unwind from get_irq_regs() */ | ||
256 | movq_cfi rbp, 0 | ||
257 | |||
258 | /* Save previous stack value */ | ||
259 | movq %rsp, %rsi | ||
260 | |||
261 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | ||
262 | testl $3, CS-RBP(%rsi) | ||
263 | je 1f | ||
264 | SWAPGS | ||
265 | /* | ||
266 | * irq_count is used to check if a CPU is already on an interrupt stack | ||
267 | * or not. While this is essentially redundant with preempt_count it is | ||
268 | * a little cheaper to use a separate counter in the PDA (short of | ||
269 | * moving irq_enter into assembly, which would be too much work) | ||
270 | */ | ||
271 | 1: incl PER_CPU_VAR(irq_count) | ||
272 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp | ||
273 | CFI_DEF_CFA_REGISTER rsi | ||
274 | |||
275 | /* Store previous stack value */ | ||
276 | pushq %rsi | ||
277 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | ||
278 | 0x77 /* DW_OP_breg7 */, 0, \ | ||
279 | 0x06 /* DW_OP_deref */, \ | ||
280 | 0x08 /* DW_OP_const1u */, SS+8-RBP, \ | ||
281 | 0x22 /* DW_OP_plus */ | ||
282 | /* We entered an interrupt context - irqs are off: */ | ||
283 | TRACE_IRQS_OFF | ||
284 | .endm | ||
285 | |||
286 | ENTRY(save_paranoid) | 221 | ENTRY(save_paranoid) |
287 | XCPT_FRAME 1 RDI+8 | 222 | XCPT_FRAME 1 RDI+8 |
288 | cld | 223 | cld |
@@ -426,15 +361,12 @@ system_call_fastpath: | |||
426 | * Has incomplete stack frame and undefined top of stack. | 361 | * Has incomplete stack frame and undefined top of stack. |
427 | */ | 362 | */ |
428 | ret_from_sys_call: | 363 | ret_from_sys_call: |
429 | movl $_TIF_ALLWORK_MASK,%edi | 364 | testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) |
430 | /* edi: flagmask */ | 365 | jnz int_ret_from_sys_call_fixup /* Go to the slow path */
431 | sysret_check: | 366 | |
432 | LOCKDEP_SYS_EXIT | 367 | LOCKDEP_SYS_EXIT |
433 | DISABLE_INTERRUPTS(CLBR_NONE) | 368 | DISABLE_INTERRUPTS(CLBR_NONE) |
434 | TRACE_IRQS_OFF | 369 | TRACE_IRQS_OFF |
435 | movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx | ||
436 | andl %edi,%edx | ||
437 | jnz sysret_careful | ||
438 | CFI_REMEMBER_STATE | 370 | CFI_REMEMBER_STATE |
439 | /* | 371 | /* |
440 | * sysretq will re-enable interrupts: | 372 | * sysretq will re-enable interrupts: |
@@ -448,49 +380,10 @@ sysret_check: | |||
448 | USERGS_SYSRET64 | 380 | USERGS_SYSRET64 |
449 | 381 | ||
450 | CFI_RESTORE_STATE | 382 | CFI_RESTORE_STATE |
451 | /* Handle reschedules */ | ||
452 | /* edx: work, edi: workmask */ | ||
453 | sysret_careful: | ||
454 | bt $TIF_NEED_RESCHED,%edx | ||
455 | jnc sysret_signal | ||
456 | TRACE_IRQS_ON | ||
457 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
458 | pushq_cfi %rdi | ||
459 | SCHEDULE_USER | ||
460 | popq_cfi %rdi | ||
461 | jmp sysret_check | ||
462 | 383 | ||
463 | /* Handle a signal */ | 384 | int_ret_from_sys_call_fixup: |
464 | sysret_signal: | ||
465 | TRACE_IRQS_ON | ||
466 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
467 | #ifdef CONFIG_AUDITSYSCALL | ||
468 | bt $TIF_SYSCALL_AUDIT,%edx | ||
469 | jc sysret_audit | ||
470 | #endif | ||
471 | /* | ||
472 | * We have a signal, or exit tracing or single-step. | ||
473 | * These all wind up with the iret return path anyway, | ||
474 | * so just join that path right now. | ||
475 | */ | ||
476 | FIXUP_TOP_OF_STACK %r11, -ARGOFFSET | 385 | FIXUP_TOP_OF_STACK %r11, -ARGOFFSET |
477 | jmp int_check_syscall_exit_work | 386 | jmp int_ret_from_sys_call |
478 | |||
479 | #ifdef CONFIG_AUDITSYSCALL | ||
480 | /* | ||
481 | * Return fast path for syscall audit. Call __audit_syscall_exit() | ||
482 | * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT | ||
483 | * masked off. | ||
484 | */ | ||
485 | sysret_audit: | ||
486 | movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ | ||
487 | cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ | ||
488 | setbe %al /* 1 if so, 0 if not */ | ||
489 | movzbl %al,%edi /* zero-extend that into %edi */ | ||
490 | call __audit_syscall_exit | ||
491 | movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi | ||
492 | jmp sysret_check | ||
493 | #endif /* CONFIG_AUDITSYSCALL */ | ||
494 | 387 | ||
495 | /* Do syscall tracing */ | 388 | /* Do syscall tracing */ |
496 | tracesys: | 389 | tracesys: |
@@ -626,19 +519,6 @@ END(\label) | |||
626 | FORK_LIKE vfork | 519 | FORK_LIKE vfork |
627 | FIXED_FRAME stub_iopl, sys_iopl | 520 | FIXED_FRAME stub_iopl, sys_iopl |
628 | 521 | ||
629 | ENTRY(ptregscall_common) | ||
630 | DEFAULT_FRAME 1 8 /* offset 8: return address */ | ||
631 | RESTORE_TOP_OF_STACK %r11, 8 | ||
632 | movq_cfi_restore R15+8, r15 | ||
633 | movq_cfi_restore R14+8, r14 | ||
634 | movq_cfi_restore R13+8, r13 | ||
635 | movq_cfi_restore R12+8, r12 | ||
636 | movq_cfi_restore RBP+8, rbp | ||
637 | movq_cfi_restore RBX+8, rbx | ||
638 | ret $REST_SKIP /* pop extended registers */ | ||
639 | CFI_ENDPROC | ||
640 | END(ptregscall_common) | ||
641 | |||
642 | ENTRY(stub_execve) | 522 | ENTRY(stub_execve) |
643 | CFI_STARTPROC | 523 | CFI_STARTPROC |
644 | addq $8, %rsp | 524 | addq $8, %rsp |
@@ -779,7 +659,48 @@ END(interrupt) | |||
779 | /* reserve pt_regs for scratch regs and rbp */ | 659 | /* reserve pt_regs for scratch regs and rbp */ |
780 | subq $ORIG_RAX-RBP, %rsp | 660 | subq $ORIG_RAX-RBP, %rsp |
781 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP | 661 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP |
782 | SAVE_ARGS_IRQ | 662 | cld |
663 | /* start from rbp in pt_regs and jump over */ | ||
664 | movq_cfi rdi, (RDI-RBP) | ||
665 | movq_cfi rsi, (RSI-RBP) | ||
666 | movq_cfi rdx, (RDX-RBP) | ||
667 | movq_cfi rcx, (RCX-RBP) | ||
668 | movq_cfi rax, (RAX-RBP) | ||
669 | movq_cfi r8, (R8-RBP) | ||
670 | movq_cfi r9, (R9-RBP) | ||
671 | movq_cfi r10, (R10-RBP) | ||
672 | movq_cfi r11, (R11-RBP) | ||
673 | |||
674 | /* Save rbp so that we can unwind from get_irq_regs() */ | ||
675 | movq_cfi rbp, 0 | ||
676 | |||
677 | /* Save previous stack value */ | ||
678 | movq %rsp, %rsi | ||
679 | |||
680 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | ||
681 | testl $3, CS-RBP(%rsi) | ||
682 | je 1f | ||
683 | SWAPGS | ||
684 | /* | ||
685 | * irq_count is used to check if a CPU is already on an interrupt stack | ||
686 | * or not. While this is essentially redundant with preempt_count it is | ||
687 | * a little cheaper to use a separate counter in the PDA (short of | ||
688 | * moving irq_enter into assembly, which would be too much work) | ||
689 | */ | ||
690 | 1: incl PER_CPU_VAR(irq_count) | ||
691 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp | ||
692 | CFI_DEF_CFA_REGISTER rsi | ||
693 | |||
694 | /* Store previous stack value */ | ||
695 | pushq %rsi | ||
696 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | ||
697 | 0x77 /* DW_OP_breg7 */, 0, \ | ||
698 | 0x06 /* DW_OP_deref */, \ | ||
699 | 0x08 /* DW_OP_const1u */, SS+8-RBP, \ | ||
700 | 0x22 /* DW_OP_plus */ | ||
701 | /* We entered an interrupt context - irqs are off: */ | ||
702 | TRACE_IRQS_OFF | ||
703 | |||
783 | call \func | 704 | call \func |
784 | .endm | 705 | .endm |
785 | 706 | ||
@@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */ | |||
831 | */ | 752 | */ |
832 | DISABLE_INTERRUPTS(CLBR_ANY) | 753 | DISABLE_INTERRUPTS(CLBR_ANY) |
833 | TRACE_IRQS_IRETQ | 754 | TRACE_IRQS_IRETQ |
755 | |||
756 | /* | ||
757 | * Try to use SYSRET instead of IRET if we're returning to | ||
758 | * a completely clean 64-bit userspace context. | ||
759 | */ | ||
760 | movq (RCX-R11)(%rsp), %rcx | ||
761 | cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ | ||
762 | jne opportunistic_sysret_failed | ||
763 | |||
764 | /* | ||
765 | * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP | ||
766 | * in kernel space. This essentially lets the user take over | ||
767 | * the kernel, since userspace controls RSP. It's not worth | ||
768 | * testing for canonicalness exactly -- this check detects any | ||
769 | * of the 17 high bits set, which is true for non-canonical | ||
770 | * or kernel addresses. (This will pessimize vsyscall=native. | ||
771 | * Big deal.) | ||
772 | * | ||
773 | * If virtual addresses ever become wider, this will need | ||
774 | * to be updated to remain correct on both old and new CPUs. | ||
775 | */ | ||
776 | .ifne __VIRTUAL_MASK_SHIFT - 47 | ||
777 | .error "virtual address width changed -- sysret checks need update" | ||
778 | .endif | ||
779 | shr $__VIRTUAL_MASK_SHIFT, %rcx | ||
780 | jnz opportunistic_sysret_failed | ||
781 | |||
782 | cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ | ||
783 | jne opportunistic_sysret_failed | ||
784 | |||
785 | movq (R11-ARGOFFSET)(%rsp), %r11 | ||
786 | cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ | ||
787 | jne opportunistic_sysret_failed | ||
788 | |||
789 | testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ | ||
790 | jnz opportunistic_sysret_failed | ||
791 | |||
792 | /* nothing to check for RSP */ | ||
793 | |||
794 | cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ | ||
795 | jne opportunistic_sysret_failed | ||
796 | |||
797 | /* | ||
798 | * We win! This label is here just for ease of understanding | ||
799 | * perf profiles. Nothing jumps here. | ||
800 | */ | ||
801 | irq_return_via_sysret: | ||
802 | CFI_REMEMBER_STATE | ||
803 | RESTORE_ARGS 1,8,1 | ||
804 | movq (RSP-RIP)(%rsp),%rsp | ||
805 | USERGS_SYSRET64 | ||
806 | CFI_RESTORE_STATE | ||
807 | |||
808 | opportunistic_sysret_failed: | ||
834 | SWAPGS | 809 | SWAPGS |
835 | jmp restore_args | 810 | jmp restore_args |
836 | 811 | ||
@@ -1048,6 +1023,11 @@ ENTRY(\sym) | |||
1048 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1023 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1049 | 1024 | ||
1050 | .if \paranoid | 1025 | .if \paranoid |
1026 | .if \paranoid == 1 | ||
1027 | CFI_REMEMBER_STATE | ||
1028 | testl $3, CS(%rsp) /* If coming from userspace, switch */ | ||
1029 | jnz 1f /* stacks. */ | ||
1030 | .endif | ||
1051 | call save_paranoid | 1031 | call save_paranoid |
1052 | .else | 1032 | .else |
1053 | call error_entry | 1033 | call error_entry |
@@ -1088,6 +1068,36 @@ ENTRY(\sym) | |||
1088 | jmp error_exit /* %ebx: no swapgs flag */ | 1068 | jmp error_exit /* %ebx: no swapgs flag */ |
1089 | .endif | 1069 | .endif |
1090 | 1070 | ||
1071 | .if \paranoid == 1 | ||
1072 | CFI_RESTORE_STATE | ||
1073 | /* | ||
1074 | * Paranoid entry from userspace. Switch stacks and treat it | ||
1075 | * as a normal entry. This means that paranoid handlers | ||
1076 | * run in real process context if user_mode(regs). | ||
1077 | */ | ||
1078 | 1: | ||
1079 | call error_entry | ||
1080 | |||
1081 | DEFAULT_FRAME 0 | ||
1082 | |||
1083 | movq %rsp,%rdi /* pt_regs pointer */ | ||
1084 | call sync_regs | ||
1085 | movq %rax,%rsp /* switch stack */ | ||
1086 | |||
1087 | movq %rsp,%rdi /* pt_regs pointer */ | ||
1088 | |||
1089 | .if \has_error_code | ||
1090 | movq ORIG_RAX(%rsp),%rsi /* get error code */ | ||
1091 | movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ | ||
1092 | .else | ||
1093 | xorl %esi,%esi /* no error code */ | ||
1094 | .endif | ||
1095 | |||
1096 | call \do_sym | ||
1097 | |||
1098 | jmp error_exit /* %ebx: no swapgs flag */ | ||
1099 | .endif | ||
1100 | |||
1091 | CFI_ENDPROC | 1101 | CFI_ENDPROC |
1092 | END(\sym) | 1102 | END(\sym) |
1093 | .endm | 1103 | .endm |
@@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0 | |||
1108 | idtentry bounds do_bounds has_error_code=0 | 1118 | idtentry bounds do_bounds has_error_code=0 |
1109 | idtentry invalid_op do_invalid_op has_error_code=0 | 1119 | idtentry invalid_op do_invalid_op has_error_code=0 |
1110 | idtentry device_not_available do_device_not_available has_error_code=0 | 1120 | idtentry device_not_available do_device_not_available has_error_code=0 |
1111 | idtentry double_fault do_double_fault has_error_code=1 paranoid=1 | 1121 | idtentry double_fault do_double_fault has_error_code=1 paranoid=2 |
1112 | idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 | 1122 | idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 |
1113 | idtentry invalid_TSS do_invalid_TSS has_error_code=1 | 1123 | idtentry invalid_TSS do_invalid_TSS has_error_code=1 |
1114 | idtentry segment_not_present do_segment_not_present has_error_code=1 | 1124 | idtentry segment_not_present do_segment_not_present has_error_code=1 |
@@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( | |||
1289 | #endif | 1299 | #endif |
1290 | 1300 | ||
1291 | /* | 1301 | /* |
1292 | * "Paranoid" exit path from exception stack. | 1302 | * "Paranoid" exit path from exception stack. This is invoked |
1293 | * Paranoid because this is used by NMIs and cannot take | 1303 | * only on return from non-NMI IST interrupts that came |
1294 | * any kernel state for granted. | 1304 | * from kernel space. |
1295 | * We don't do kernel preemption checks here, because only | ||
1296 | * NMI should be common and it does not enable IRQs and | ||
1297 | * cannot get reschedule ticks. | ||
1298 | * | 1305 | * |
1299 | * "trace" is 0 for the NMI handler only, because irq-tracing | 1306 | * We may be returning to very strange contexts (e.g. very early |
1300 | * is fundamentally NMI-unsafe. (we cannot change the soft and | 1307 | * in syscall entry), so checking for preemption here would |
1301 | * hard flags at once, atomically) | 1308 | * be complicated. Fortunately, we there's no good reason |
1309 | * to try to handle preemption here. | ||
1302 | */ | 1310 | */ |
1303 | 1311 | ||
1304 | /* ebx: no swapgs flag */ | 1312 | /* ebx: no swapgs flag */ |
@@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit) | |||
1308 | TRACE_IRQS_OFF_DEBUG | 1316 | TRACE_IRQS_OFF_DEBUG |
1309 | testl %ebx,%ebx /* swapgs needed? */ | 1317 | testl %ebx,%ebx /* swapgs needed? */ |
1310 | jnz paranoid_restore | 1318 | jnz paranoid_restore |
1311 | testl $3,CS(%rsp) | ||
1312 | jnz paranoid_userspace | ||
1313 | paranoid_swapgs: | ||
1314 | TRACE_IRQS_IRETQ 0 | 1319 | TRACE_IRQS_IRETQ 0 |
1315 | SWAPGS_UNSAFE_STACK | 1320 | SWAPGS_UNSAFE_STACK |
1316 | RESTORE_ALL 8 | 1321 | RESTORE_ALL 8 |
1317 | jmp irq_return | 1322 | INTERRUPT_RETURN |
1318 | paranoid_restore: | 1323 | paranoid_restore: |
1319 | TRACE_IRQS_IRETQ_DEBUG 0 | 1324 | TRACE_IRQS_IRETQ_DEBUG 0 |
1320 | RESTORE_ALL 8 | 1325 | RESTORE_ALL 8 |
1321 | jmp irq_return | 1326 | INTERRUPT_RETURN |
1322 | paranoid_userspace: | ||
1323 | GET_THREAD_INFO(%rcx) | ||
1324 | movl TI_flags(%rcx),%ebx | ||
1325 | andl $_TIF_WORK_MASK,%ebx | ||
1326 | jz paranoid_swapgs | ||
1327 | movq %rsp,%rdi /* &pt_regs */ | ||
1328 | call sync_regs | ||
1329 | movq %rax,%rsp /* switch stack for scheduling */ | ||
1330 | testl $_TIF_NEED_RESCHED,%ebx | ||
1331 | jnz paranoid_schedule | ||
1332 | movl %ebx,%edx /* arg3: thread flags */ | ||
1333 | TRACE_IRQS_ON | ||
1334 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
1335 | xorl %esi,%esi /* arg2: oldset */ | ||
1336 | movq %rsp,%rdi /* arg1: &pt_regs */ | ||
1337 | call do_notify_resume | ||
1338 | DISABLE_INTERRUPTS(CLBR_NONE) | ||
1339 | TRACE_IRQS_OFF | ||
1340 | jmp paranoid_userspace | ||
1341 | paranoid_schedule: | ||
1342 | TRACE_IRQS_ON | ||
1343 | ENABLE_INTERRUPTS(CLBR_ANY) | ||
1344 | SCHEDULE_USER | ||
1345 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
1346 | TRACE_IRQS_OFF | ||
1347 | jmp paranoid_userspace | ||
1348 | CFI_ENDPROC | 1327 | CFI_ENDPROC |
1349 | END(paranoid_exit) | 1328 | END(paranoid_exit) |
1350 | 1329 | ||
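The opportunistic-SYSRET block added above is easier to audit when its eligibility test is restated in C. The predicate below is pseudocode for review purposes only (the authoritative version is the assembly in this hunk):

/*
 * SYSRET clobbers RCX and R11 with RIP and RFLAGS, so the fast return is
 * only safe when the saved frame already has that shape and describes a
 * clean 64-bit user context.
 */
static bool can_return_via_sysret(const struct pt_regs *regs)
{
	if (regs->cx != regs->ip)			/* SYSRET loads RIP from RCX */
		return false;
	if (regs->ip >> __VIRTUAL_MASK_SHIFT)		/* kernel or non-canonical RIP */
		return false;
	if (regs->cs != __USER_CS || regs->ss != __USER_DS)
		return false;
	if (regs->r11 != regs->flags)			/* SYSRET loads RFLAGS from R11 */
		return false;
	if (regs->r11 & X86_EFLAGS_RF)			/* SYSRET cannot restore RF */
		return false;
	return true;					/* RSP itself needs no check */
}
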
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 63ce838e5a54..28d28f5eb8f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack) | |||
69 | : "memory", "cc", "edx", "ecx", "eax"); | 69 | : "memory", "cc", "edx", "ecx", "eax"); |
70 | } | 70 | } |
71 | 71 | ||
72 | /* how to get the current stack pointer from C */ | ||
73 | #define current_stack_pointer ({ \ | ||
74 | unsigned long sp; \ | ||
75 | asm("mov %%esp,%0" : "=g" (sp)); \ | ||
76 | sp; \ | ||
77 | }) | ||
78 | |||
79 | static inline void *current_stack(void) | 72 | static inline void *current_stack(void) |
80 | { | 73 | { |
81 | return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); | 74 | return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); |
82 | } | 75 | } |
83 | 76 | ||
84 | static inline int | 77 | static inline int |
@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | |||
103 | 96 | ||
104 | /* Save the next esp at the bottom of the stack */ | 97 | /* Save the next esp at the bottom of the stack */ |
105 | prev_esp = (u32 *)irqstk; | 98 | prev_esp = (u32 *)irqstk; |
106 | *prev_esp = current_stack_pointer; | 99 | *prev_esp = current_stack_pointer(); |
107 | 100 | ||
108 | if (unlikely(overflow)) | 101 | if (unlikely(overflow)) |
109 | call_on_stack(print_stack_overflow, isp); | 102 | call_on_stack(print_stack_overflow, isp); |
@@ -156,7 +149,7 @@ void do_softirq_own_stack(void) | |||
156 | 149 | ||
157 | /* Push the previous esp onto the stack */ | 150 | /* Push the previous esp onto the stack */ |
158 | prev_esp = (u32 *)irqstk; | 151 | prev_esp = (u32 *)irqstk; |
159 | *prev_esp = current_stack_pointer; | 152 | *prev_esp = current_stack_pointer(); |
160 | 153 | ||
161 | call_on_stack(__do_softirq, isp); | 154 | call_on_stack(__do_softirq, isp); |
162 | } | 155 | } |
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..2a33c8f68319 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | |||
740 | { | 740 | { |
741 | user_exit(); | 741 | user_exit(); |
742 | 742 | ||
743 | #ifdef CONFIG_X86_MCE | ||
744 | /* notify userspace of pending MCEs */ | ||
745 | if (thread_info_flags & _TIF_MCE_NOTIFY) | ||
746 | mce_notify_process(); | ||
747 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ | ||
748 | |||
749 | if (thread_info_flags & _TIF_UPROBE) | 743 | if (thread_info_flags & _TIF_UPROBE) |
750 | uprobe_notify_resume(regs); | 744 | uprobe_notify_resume(regs); |
751 | 745 | ||
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 88900e288021..c74f2f5652da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) | |||
108 | preempt_count_dec(); | 108 | preempt_count_dec(); |
109 | } | 109 | } |
110 | 110 | ||
111 | enum ctx_state ist_enter(struct pt_regs *regs) | ||
112 | { | ||
113 | enum ctx_state prev_state; | ||
114 | |||
115 | if (user_mode_vm(regs)) { | ||
116 | /* Other than that, we're just an exception. */ | ||
117 | prev_state = exception_enter(); | ||
118 | } else { | ||
119 | /* | ||
120 | * We might have interrupted pretty much anything. In | ||
121 | * fact, if we're a machine check, we can even interrupt | ||
122 | * NMI processing. We don't want in_nmi() to return true, | ||
123 | * but we need to notify RCU. | ||
124 | */ | ||
125 | rcu_nmi_enter(); | ||
126 | prev_state = IN_KERNEL; /* the value is irrelevant. */ | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * We are atomic because we're on the IST stack (or we're on x86_32, | ||
131 | * in which case we still shouldn't schedule). | ||
132 | * | ||
133 | * This must be after exception_enter(), because exception_enter() | ||
134 | * won't do anything if in_interrupt() returns true. | ||
135 | */ | ||
136 | preempt_count_add(HARDIRQ_OFFSET); | ||
137 | |||
138 | /* This code is a bit fragile. Test it. */ | ||
139 | rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); | ||
140 | |||
141 | return prev_state; | ||
142 | } | ||
143 | |||
144 | void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | ||
145 | { | ||
146 | /* Must be before exception_exit. */ | ||
147 | preempt_count_sub(HARDIRQ_OFFSET); | ||
148 | |||
149 | if (user_mode_vm(regs)) | ||
150 | return exception_exit(prev_state); | ||
151 | else | ||
152 | rcu_nmi_exit(); | ||
153 | } | ||
154 | |||
155 | /** | ||
156 | * ist_begin_non_atomic() - begin a non-atomic section in an IST exception | ||
157 | * @regs: regs passed to the IST exception handler | ||
158 | * | ||
159 | * IST exception handlers normally cannot schedule. As a special | ||
160 | * exception, if the exception interrupted userspace code (i.e. | ||
161 | * user_mode_vm(regs) would return true) and the exception was not | ||
162 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() | ||
163 | * begins a non-atomic section within an ist_enter()/ist_exit() region. | ||
164 | * Callers are responsible for enabling interrupts themselves inside | ||
165 | * the non-atomic section, and callers must call ist_end_non_atomic() | ||
166 | * before ist_exit(). | ||
167 | */ | ||
168 | void ist_begin_non_atomic(struct pt_regs *regs) | ||
169 | { | ||
170 | BUG_ON(!user_mode_vm(regs)); | ||
171 | |||
172 | /* | ||
173 | * Sanity check: we need to be on the normal thread stack. This | ||
174 | * will catch asm bugs and any attempt to use ist_preempt_enable | ||
175 | * from double_fault. | ||
176 | */ | ||
177 | BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) | ||
178 | & ~(THREAD_SIZE - 1)) != 0); | ||
179 | |||
180 | preempt_count_sub(HARDIRQ_OFFSET); | ||
181 | } | ||
182 | |||
183 | /** | ||
184 | * ist_end_non_atomic() - end a non-atomic section in an IST exception | ||
185 | * | ||
186 | * Ends a non-atomic section started with ist_begin_non_atomic(). | ||
187 | */ | ||
188 | void ist_end_non_atomic(void) | ||
189 | { | ||
190 | preempt_count_add(HARDIRQ_OFFSET); | ||
191 | } | ||
192 | |||
111 | static nokprobe_inline int | 193 | static nokprobe_inline int |
112 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | 194 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, |
113 | struct pt_regs *regs, long error_code) | 195 | struct pt_regs *regs, long error_code) |
@@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
251 | * end up promoting it to a doublefault. In that case, modify | 333 | * end up promoting it to a doublefault. In that case, modify |
252 | * the stack to make it look like we just entered the #GP | 334 | * the stack to make it look like we just entered the #GP |
253 | * handler from user space, similar to bad_iret. | 335 | * handler from user space, similar to bad_iret. |
336 | * | ||
337 | * No need for ist_enter here because we don't use RCU. | ||
254 | */ | 338 | */ |
255 | if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && | 339 | if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && |
256 | regs->cs == __KERNEL_CS && | 340 | regs->cs == __KERNEL_CS && |
@@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
263 | normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ | 347 | normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ |
264 | regs->ip = (unsigned long)general_protection; | 348 | regs->ip = (unsigned long)general_protection; |
265 | regs->sp = (unsigned long)&normal_regs->orig_ax; | 349 | regs->sp = (unsigned long)&normal_regs->orig_ax; |
350 | |||
266 | return; | 351 | return; |
267 | } | 352 | } |
268 | #endif | 353 | #endif |
269 | 354 | ||
270 | exception_enter(); | 355 | ist_enter(regs); /* Discard prev_state because we won't return. */ |
271 | /* Return not checked because double check cannot be ignored */ | ||
272 | notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); | 356 | notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); |
273 | 357 | ||
274 | tsk->thread.error_code = error_code; | 358 | tsk->thread.error_code = error_code; |
@@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) | |||
434 | if (poke_int3_handler(regs)) | 518 | if (poke_int3_handler(regs)) |
435 | return; | 519 | return; |
436 | 520 | ||
437 | prev_state = exception_enter(); | 521 | prev_state = ist_enter(regs); |
438 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP | 522 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP |
439 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, | 523 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, |
440 | SIGTRAP) == NOTIFY_STOP) | 524 | SIGTRAP) == NOTIFY_STOP) |
@@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) | |||
460 | preempt_conditional_cli(regs); | 544 | preempt_conditional_cli(regs); |
461 | debug_stack_usage_dec(); | 545 | debug_stack_usage_dec(); |
462 | exit: | 546 | exit: |
463 | exception_exit(prev_state); | 547 | ist_exit(regs, prev_state); |
464 | } | 548 | } |
465 | NOKPROBE_SYMBOL(do_int3); | 549 | NOKPROBE_SYMBOL(do_int3); |
466 | 550 | ||
467 | #ifdef CONFIG_X86_64 | 551 | #ifdef CONFIG_X86_64 |
468 | /* | 552 | /* |
469 | * Help handler running on IST stack to switch back to user stack | 553 | * Help handler running on IST stack to switch off the IST stack if the |
470 | * for scheduling or signal handling. The actual stack switch is done in | 554 | * interrupted code was in user mode. The actual stack switch is done in |
471 | * entry.S | 555 | * entry_64.S |
472 | */ | 556 | */ |
473 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) | 557 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) |
474 | { | 558 | { |
475 | struct pt_regs *regs = eregs; | 559 | struct pt_regs *regs = task_pt_regs(current); |
476 | /* Did already sync */ | 560 | *regs = *eregs; |
477 | if (eregs == (struct pt_regs *)eregs->sp) | ||
478 | ; | ||
479 | /* Exception from user space */ | ||
480 | else if (user_mode(eregs)) | ||
481 | regs = task_pt_regs(current); | ||
482 | /* | ||
483 | * Exception from kernel and interrupts are enabled. Move to | ||
484 | * kernel process stack. | ||
485 | */ | ||
486 | else if (eregs->flags & X86_EFLAGS_IF) | ||
487 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | ||
488 | if (eregs != regs) | ||
489 | *regs = *eregs; | ||
490 | return regs; | 561 | return regs; |
491 | } | 562 | } |
492 | NOKPROBE_SYMBOL(sync_regs); | 563 | NOKPROBE_SYMBOL(sync_regs); |
@@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
554 | unsigned long dr6; | 625 | unsigned long dr6; |
555 | int si_code; | 626 | int si_code; |
556 | 627 | ||
557 | prev_state = exception_enter(); | 628 | prev_state = ist_enter(regs); |
558 | 629 | ||
559 | get_debugreg(dr6, 6); | 630 | get_debugreg(dr6, 6); |
560 | 631 | ||
@@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
629 | debug_stack_usage_dec(); | 700 | debug_stack_usage_dec(); |
630 | 701 | ||
631 | exit: | 702 | exit: |
632 | exception_exit(prev_state); | 703 | ist_exit(regs, prev_state); |
633 | } | 704 | } |
634 | NOKPROBE_SYMBOL(do_debug); | 705 | NOKPROBE_SYMBOL(do_debug); |
635 | 706 | ||
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 5a4affe025e8..09297c8e1fcd 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -205,4 +205,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE | |||
205 | PHONY += vdso_install $(vdso_img_insttargets) | 205 | PHONY += vdso_install $(vdso_img_insttargets) |
206 | vdso_install: $(vdso_img_insttargets) FORCE | 206 | vdso_install: $(vdso_img_insttargets) FORCE |
207 | 207 | ||
208 | clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* | 208 | clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7680fc275036..4c106fcc0d54 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -759,39 +759,71 @@ void rcu_irq_enter(void) | |||
759 | /** | 759 | /** |
760 | * rcu_nmi_enter - inform RCU of entry to NMI context | 760 | * rcu_nmi_enter - inform RCU of entry to NMI context |
761 | * | 761 | * |
762 | * If the CPU was idle with dynamic ticks active, and there is no | 762 | * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and |
763 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 763 | * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know |
764 | * RCU grace-period handling know that the CPU is active. | 764 | * that the CPU is active. This implementation permits nested NMIs, as |
765 | * long as the nesting level does not overflow an int. (You will probably | ||
766 | * run out of stack space first.) | ||
765 | */ | 767 | */ |
766 | void rcu_nmi_enter(void) | 768 | void rcu_nmi_enter(void) |
767 | { | 769 | { |
768 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 770 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
771 | int incby = 2; | ||
769 | 772 | ||
770 | if (rdtp->dynticks_nmi_nesting == 0 && | 773 | /* Complain about underflow. */ |
771 | (atomic_read(&rdtp->dynticks) & 0x1)) | 774 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); |
772 | return; | 775 | |
773 | rdtp->dynticks_nmi_nesting++; | 776 | /* |
774 | smp_mb__before_atomic(); /* Force delay from prior write. */ | 777 | * If idle from RCU viewpoint, atomically increment ->dynticks |
775 | atomic_inc(&rdtp->dynticks); | 778 | * to mark non-idle and increment ->dynticks_nmi_nesting by one. |
776 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 779 | * Otherwise, increment ->dynticks_nmi_nesting by two. This means |
777 | smp_mb__after_atomic(); /* See above. */ | 780 | * if ->dynticks_nmi_nesting is equal to one, we are guaranteed |
778 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 781 | * to be in the outermost NMI handler that interrupted an RCU-idle |
782 | * period (observation due to Andy Lutomirski). | ||
783 | */ | ||
784 | if (!(atomic_read(&rdtp->dynticks) & 0x1)) { | ||
785 | smp_mb__before_atomic(); /* Force delay from prior write. */ | ||
786 | atomic_inc(&rdtp->dynticks); | ||
787 | /* atomic_inc() before later RCU read-side crit sects */ | ||
788 | smp_mb__after_atomic(); /* See above. */ | ||
789 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
790 | incby = 1; | ||
791 | } | ||
792 | rdtp->dynticks_nmi_nesting += incby; | ||
793 | barrier(); | ||
779 | } | 794 | } |
780 | 795 | ||
781 | /** | 796 | /** |
782 | * rcu_nmi_exit - inform RCU of exit from NMI context | 797 | * rcu_nmi_exit - inform RCU of exit from NMI context |
783 | * | 798 | * |
784 | * If the CPU was idle with dynamic ticks active, and there is no | 799 | * If we are returning from the outermost NMI handler that interrupted an |
785 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 800 | * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting |
786 | * RCU grace-period handling know that the CPU is no longer active. | 801 | * to let the RCU grace-period handling know that the CPU is back to |
802 | * being RCU-idle. | ||
787 | */ | 803 | */ |
788 | void rcu_nmi_exit(void) | 804 | void rcu_nmi_exit(void) |
789 | { | 805 | { |
790 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 806 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
791 | 807 | ||
792 | if (rdtp->dynticks_nmi_nesting == 0 || | 808 | /* |
793 | --rdtp->dynticks_nmi_nesting != 0) | 809 | * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. |
810 | * (We are exiting an NMI handler, so RCU better be paying attention | ||
811 | * to us!) | ||
812 | */ | ||
813 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); | ||
814 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
815 | |||
816 | /* | ||
817 | * If the nesting level is not 1, the CPU wasn't RCU-idle, so | ||
818 | * leave it in non-RCU-idle state. | ||
819 | */ | ||
820 | if (rdtp->dynticks_nmi_nesting != 1) { | ||
821 | rdtp->dynticks_nmi_nesting -= 2; | ||
794 | return; | 822 | return; |
823 | } | ||
824 | |||
825 | /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ | ||
826 | rdtp->dynticks_nmi_nesting = 0; | ||
795 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 827 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
796 | smp_mb__before_atomic(); /* See above. */ | 828 | smp_mb__before_atomic(); /* See above. */ |
797 | atomic_inc(&rdtp->dynticks); | 829 | atomic_inc(&rdtp->dynticks); |