diff options
| -rw-r--r-- | Documentation/x86/entry_64.txt | 18 | ||||
| -rw-r--r-- | Documentation/x86/x86_64/kernel-stacks | 8 | ||||
| -rw-r--r-- | arch/x86/ia32/ia32entry.S | 4 | ||||
| -rw-r--r-- | arch/x86/include/asm/calling.h | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/mce.h | 1 | ||||
| -rw-r--r-- | arch/x86/include/asm/thread_info.h | 15 | ||||
| -rw-r--r-- | arch/x86/include/asm/traps.h | 6 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 114 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/p5.c | 6 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/winchip.c | 5 | ||||
| -rw-r--r-- | arch/x86/kernel/entry_64.S | 208 | ||||
| -rw-r--r-- | arch/x86/kernel/irq_32.c | 13 | ||||
| -rw-r--r-- | arch/x86/kernel/signal.c | 6 | ||||
| -rw-r--r-- | arch/x86/kernel/traps.c | 108 | ||||
| -rw-r--r-- | kernel/rcu/tree.c | 66 |
15 files changed, 301 insertions, 278 deletions
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt index 4a1c5c2dc5a9..9132b86176a3 100644 --- a/Documentation/x86/entry_64.txt +++ b/Documentation/x86/entry_64.txt | |||
| @@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value | |||
| 78 | xorl %ebx,%ebx | 78 | xorl %ebx,%ebx |
| 79 | 1: ret | 79 | 1: ret |
| 80 | 80 | ||
| 81 | and the whole paranoid non-paranoid macro complexity is about whether | ||
| 82 | to suffer that RDMSR cost. | ||
| 83 | |||
| 84 | If we are at an interrupt or user-trap/gate-alike boundary then we can | 81 | If we are at an interrupt or user-trap/gate-alike boundary then we can |
| 85 | use the faster check: the stack will be a reliable indicator of | 82 | use the faster check: the stack will be a reliable indicator of |
| 86 | whether SWAPGS was already done: if we see that we are a secondary | 83 | whether SWAPGS was already done: if we see that we are a secondary |
| @@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the | |||
| 93 | stack but before we executed SWAPGS, then the only safe way to check | 90 | stack but before we executed SWAPGS, then the only safe way to check |
| 94 | for GS is the slower method: the RDMSR. | 91 | for GS is the slower method: the RDMSR. |
| 95 | 92 | ||
| 96 | So we try only to mark those entry methods 'paranoid' that absolutely | 93 | Therefore, super-atomic entries (except NMI, which is handled separately) |
| 97 | need the more expensive check for the GS base - and we generate all | 94 | must use idtentry with paranoid=1 to handle gsbase correctly. This |
| 98 | 'normal' entry points with the regular (faster) entry macros. | 95 | triggers three main behavior changes: |
| 96 | |||
| 97 | - Interrupt entry will use the slower gsbase check. | ||
| 98 | - Interrupt entry from user mode will switch off the IST stack. | ||
| 99 | - Interrupt exit to kernel mode will not attempt to reschedule. | ||
| 100 | |||
| 101 | We try to only use IST entries and the paranoid entry code for vectors | ||
| 102 | that absolutely need the more expensive check for the GS base - and we | ||
| 103 | generate all 'normal' entry points with the regular (faster) paranoid=0 | ||
| 104 | variant. | ||
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks index a01eec5d1d0b..e3c8a49d1a2f 100644 --- a/Documentation/x86/x86_64/kernel-stacks +++ b/Documentation/x86/x86_64/kernel-stacks | |||
| @@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an | |||
| 40 | interrupt-gate descriptor. When an interrupt occurs and the hardware | 40 | interrupt-gate descriptor. When an interrupt occurs and the hardware |
| 41 | loads such a descriptor, the hardware automatically sets the new stack | 41 | loads such a descriptor, the hardware automatically sets the new stack |
| 42 | pointer based on the IST value, then invokes the interrupt handler. If | 42 | pointer based on the IST value, then invokes the interrupt handler. If |
| 43 | software wants to allow nested IST interrupts then the handler must | 43 | the interrupt came from user mode, then the interrupt handler prologue |
| 44 | adjust the IST values on entry to and exit from the interrupt handler. | 44 | will switch back to the per-thread stack. If software wants to allow |
| 45 | (This is occasionally done, e.g. for debug exceptions.) | 45 | nested IST interrupts then the handler must adjust the IST values on |
| 46 | entry to and exit from the interrupt handler. (This is occasionally | ||
| 47 | done, e.g. for debug exceptions.) | ||
| 46 | 48 | ||
| 47 | Events with different IST codes (i.e. with different stacks) can be | 49 | Events with different IST codes (i.e. with different stacks) can be |
| 48 | nested. For example, a debug interrupt can safely be interrupted by an | 50 | nested. For example, a debug interrupt can safely be interrupted by an |
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 82e8a1d44658..156ebcab4ada 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S | |||
| @@ -179,8 +179,8 @@ sysenter_dispatch: | |||
| 179 | sysexit_from_sys_call: | 179 | sysexit_from_sys_call: |
| 180 | andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) | 180 | andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) |
| 181 | /* clear IF, that popfq doesn't enable interrupts early */ | 181 | /* clear IF, that popfq doesn't enable interrupts early */ |
| 182 | andl $~0x200,EFLAGS-R11(%rsp) | 182 | andl $~0x200,EFLAGS-ARGOFFSET(%rsp) |
| 183 | movl RIP-R11(%rsp),%edx /* User %eip */ | 183 | movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ |
| 184 | CFI_REGISTER rip,rdx | 184 | CFI_REGISTER rip,rdx |
| 185 | RESTORE_ARGS 0,24,0,0,0,0 | 185 | RESTORE_ARGS 0,24,0,0,0,0 |
| 186 | xorq %r8,%r8 | 186 | xorq %r8,%r8 |
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 76659b67fd11..1f1297b46f83 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h | |||
| @@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with | |||
| 83 | #define SS 160 | 83 | #define SS 160 |
| 84 | 84 | ||
| 85 | #define ARGOFFSET R11 | 85 | #define ARGOFFSET R11 |
| 86 | #define SWFRAME ORIG_RAX | ||
| 87 | 86 | ||
| 88 | .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 | 87 | .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 |
| 89 | subq $9*8+\addskip, %rsp | 88 | subq $9*8+\addskip, %rsp |
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 51b26e895933..9b3de99dc004 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
| @@ -190,7 +190,6 @@ enum mcp_flags { | |||
| 190 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); | 190 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); |
| 191 | 191 | ||
| 192 | int mce_notify_irq(void); | 192 | int mce_notify_irq(void); |
| 193 | void mce_notify_process(void); | ||
| 194 | 193 | ||
| 195 | DECLARE_PER_CPU(struct mce, injectm); | 194 | DECLARE_PER_CPU(struct mce, injectm); |
| 196 | 195 | ||
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 547e344a6dc6..e82e95abc92b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
| @@ -75,7 +75,6 @@ struct thread_info { | |||
| 75 | #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ | 75 | #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ |
| 76 | #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ | 76 | #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ |
| 77 | #define TIF_SECCOMP 8 /* secure computing */ | 77 | #define TIF_SECCOMP 8 /* secure computing */ |
| 78 | #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ | ||
| 79 | #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ | 78 | #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ |
| 80 | #define TIF_UPROBE 12 /* breakpointed or singlestepping */ | 79 | #define TIF_UPROBE 12 /* breakpointed or singlestepping */ |
| 81 | #define TIF_NOTSC 16 /* TSC is not accessible in userland */ | 80 | #define TIF_NOTSC 16 /* TSC is not accessible in userland */ |
| @@ -100,7 +99,6 @@ struct thread_info { | |||
| 100 | #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) | 99 | #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) |
| 101 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | 100 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) |
| 102 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) | 101 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) |
| 103 | #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) | ||
| 104 | #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) | 102 | #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) |
| 105 | #define _TIF_UPROBE (1 << TIF_UPROBE) | 103 | #define _TIF_UPROBE (1 << TIF_UPROBE) |
| 106 | #define _TIF_NOTSC (1 << TIF_NOTSC) | 104 | #define _TIF_NOTSC (1 << TIF_NOTSC) |
| @@ -140,7 +138,7 @@ struct thread_info { | |||
| 140 | 138 | ||
| 141 | /* Only used for 64 bit */ | 139 | /* Only used for 64 bit */ |
| 142 | #define _TIF_DO_NOTIFY_MASK \ | 140 | #define _TIF_DO_NOTIFY_MASK \ |
| 143 | (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ | 141 | (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ |
| 144 | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) | 142 | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) |
| 145 | 143 | ||
| 146 | /* flags to check in __switch_to() */ | 144 | /* flags to check in __switch_to() */ |
| @@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void) | |||
| 170 | return ti; | 168 | return ti; |
| 171 | } | 169 | } |
| 172 | 170 | ||
| 171 | static inline unsigned long current_stack_pointer(void) | ||
| 172 | { | ||
| 173 | unsigned long sp; | ||
| 174 | #ifdef CONFIG_X86_64 | ||
| 175 | asm("mov %%rsp,%0" : "=g" (sp)); | ||
| 176 | #else | ||
| 177 | asm("mov %%esp,%0" : "=g" (sp)); | ||
| 178 | #endif | ||
| 179 | return sp; | ||
| 180 | } | ||
| 181 | |||
| 173 | #else /* !__ASSEMBLY__ */ | 182 | #else /* !__ASSEMBLY__ */ |
| 174 | 183 | ||
| 175 | /* how to get the thread information struct from ASM */ | 184 | /* how to get the thread information struct from ASM */ |
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 707adc6549d8..4e49d7dff78e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | #ifndef _ASM_X86_TRAPS_H | 1 | #ifndef _ASM_X86_TRAPS_H |
| 2 | #define _ASM_X86_TRAPS_H | 2 | #define _ASM_X86_TRAPS_H |
| 3 | 3 | ||
| 4 | #include <linux/context_tracking_state.h> | ||
| 4 | #include <linux/kprobes.h> | 5 | #include <linux/kprobes.h> |
| 5 | 6 | ||
| 6 | #include <asm/debugreg.h> | 7 | #include <asm/debugreg.h> |
| @@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void); | |||
| 110 | asmlinkage void mce_threshold_interrupt(void); | 111 | asmlinkage void mce_threshold_interrupt(void); |
| 111 | #endif | 112 | #endif |
| 112 | 113 | ||
| 114 | extern enum ctx_state ist_enter(struct pt_regs *regs); | ||
| 115 | extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); | ||
| 116 | extern void ist_begin_non_atomic(struct pt_regs *regs); | ||
| 117 | extern void ist_end_non_atomic(void); | ||
| 118 | |||
| 113 | /* Interrupts/Exceptions */ | 119 | /* Interrupts/Exceptions */ |
| 114 | enum { | 120 | enum { |
| 115 | X86_TRAP_DE = 0, /* 0, Divide-by-zero */ | 121 | X86_TRAP_DE = 0, /* 0, Divide-by-zero */ |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d2c611699cd9..d23179900755 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
| @@ -43,6 +43,7 @@ | |||
| 43 | #include <linux/export.h> | 43 | #include <linux/export.h> |
| 44 | 44 | ||
| 45 | #include <asm/processor.h> | 45 | #include <asm/processor.h> |
| 46 | #include <asm/traps.h> | ||
| 46 | #include <asm/mce.h> | 47 | #include <asm/mce.h> |
| 47 | #include <asm/msr.h> | 48 | #include <asm/msr.h> |
| 48 | 49 | ||
| @@ -1003,51 +1004,6 @@ static void mce_clear_state(unsigned long *toclear) | |||
| 1003 | } | 1004 | } |
| 1004 | 1005 | ||
| 1005 | /* | 1006 | /* |
| 1006 | * Need to save faulting physical address associated with a process | ||
| 1007 | * in the machine check handler some place where we can grab it back | ||
| 1008 | * later in mce_notify_process() | ||
| 1009 | */ | ||
| 1010 | #define MCE_INFO_MAX 16 | ||
| 1011 | |||
| 1012 | struct mce_info { | ||
| 1013 | atomic_t inuse; | ||
| 1014 | struct task_struct *t; | ||
| 1015 | __u64 paddr; | ||
| 1016 | int restartable; | ||
| 1017 | } mce_info[MCE_INFO_MAX]; | ||
| 1018 | |||
| 1019 | static void mce_save_info(__u64 addr, int c) | ||
| 1020 | { | ||
| 1021 | struct mce_info *mi; | ||
| 1022 | |||
| 1023 | for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { | ||
| 1024 | if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { | ||
| 1025 | mi->t = current; | ||
| 1026 | mi->paddr = addr; | ||
| 1027 | mi->restartable = c; | ||
| 1028 | return; | ||
| 1029 | } | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | mce_panic("Too many concurrent recoverable errors", NULL, NULL); | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | static struct mce_info *mce_find_info(void) | ||
| 1036 | { | ||
| 1037 | struct mce_info *mi; | ||
| 1038 | |||
| 1039 | for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) | ||
| 1040 | if (atomic_read(&mi->inuse) && mi->t == current) | ||
| 1041 | return mi; | ||
| 1042 | return NULL; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | static void mce_clear_info(struct mce_info *mi) | ||
| 1046 | { | ||
| 1047 | atomic_set(&mi->inuse, 0); | ||
| 1048 | } | ||
| 1049 | |||
| 1050 | /* | ||
| 1051 | * The actual machine check handler. This only handles real | 1007 | * The actual machine check handler. This only handles real |
| 1052 | * exceptions when something got corrupted coming in through int 18. | 1008 | * exceptions when something got corrupted coming in through int 18. |
| 1053 | * | 1009 | * |
| @@ -1063,6 +1019,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1063 | { | 1019 | { |
| 1064 | struct mca_config *cfg = &mca_cfg; | 1020 | struct mca_config *cfg = &mca_cfg; |
| 1065 | struct mce m, *final; | 1021 | struct mce m, *final; |
| 1022 | enum ctx_state prev_state; | ||
| 1066 | int i; | 1023 | int i; |
| 1067 | int worst = 0; | 1024 | int worst = 0; |
| 1068 | int severity; | 1025 | int severity; |
| @@ -1084,6 +1041,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1084 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); | 1041 | DECLARE_BITMAP(toclear, MAX_NR_BANKS); |
| 1085 | DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); | 1042 | DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); |
| 1086 | char *msg = "Unknown"; | 1043 | char *msg = "Unknown"; |
| 1044 | u64 recover_paddr = ~0ull; | ||
| 1045 | int flags = MF_ACTION_REQUIRED; | ||
| 1046 | |||
| 1047 | prev_state = ist_enter(regs); | ||
| 1087 | 1048 | ||
| 1088 | this_cpu_inc(mce_exception_count); | 1049 | this_cpu_inc(mce_exception_count); |
| 1089 | 1050 | ||
| @@ -1203,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1203 | if (no_way_out) | 1164 | if (no_way_out) |
| 1204 | mce_panic("Fatal machine check on current CPU", &m, msg); | 1165 | mce_panic("Fatal machine check on current CPU", &m, msg); |
| 1205 | if (worst == MCE_AR_SEVERITY) { | 1166 | if (worst == MCE_AR_SEVERITY) { |
| 1206 | /* schedule action before return to userland */ | 1167 | recover_paddr = m.addr; |
| 1207 | mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); | 1168 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) |
| 1208 | set_thread_flag(TIF_MCE_NOTIFY); | 1169 | flags |= MF_MUST_KILL; |
| 1209 | } else if (kill_it) { | 1170 | } else if (kill_it) { |
| 1210 | force_sig(SIGBUS, current); | 1171 | force_sig(SIGBUS, current); |
| 1211 | } | 1172 | } |
| @@ -1216,6 +1177,27 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1216 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); | 1177 | mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); |
| 1217 | out: | 1178 | out: |
| 1218 | sync_core(); | 1179 | sync_core(); |
| 1180 | |||
| 1181 | if (recover_paddr == ~0ull) | ||
| 1182 | goto done; | ||
| 1183 | |||
| 1184 | pr_err("Uncorrected hardware memory error in user-access at %llx", | ||
| 1185 | recover_paddr); | ||
| 1186 | /* | ||
| 1187 | * We must call memory_failure() here even if the current process is | ||
| 1188 | * doomed. We still need to mark the page as poisoned and alert any | ||
| 1189 | * other users of the page. | ||
| 1190 | */ | ||
| 1191 | ist_begin_non_atomic(regs); | ||
| 1192 | local_irq_enable(); | ||
| 1193 | if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { | ||
| 1194 | pr_err("Memory error not recovered"); | ||
| 1195 | force_sig(SIGBUS, current); | ||
| 1196 | } | ||
| 1197 | local_irq_disable(); | ||
| 1198 | ist_end_non_atomic(); | ||
| 1199 | done: | ||
| 1200 | ist_exit(regs, prev_state); | ||
| 1219 | } | 1201 | } |
| 1220 | EXPORT_SYMBOL_GPL(do_machine_check); | 1202 | EXPORT_SYMBOL_GPL(do_machine_check); |
| 1221 | 1203 | ||
| @@ -1233,42 +1215,6 @@ int memory_failure(unsigned long pfn, int vector, int flags) | |||
| 1233 | #endif | 1215 | #endif |
| 1234 | 1216 | ||
| 1235 | /* | 1217 | /* |
| 1236 | * Called in process context that interrupted by MCE and marked with | ||
| 1237 | * TIF_MCE_NOTIFY, just before returning to erroneous userland. | ||
| 1238 | * This code is allowed to sleep. | ||
| 1239 | * Attempt possible recovery such as calling the high level VM handler to | ||
| 1240 | * process any corrupted pages, and kill/signal current process if required. | ||
| 1241 | * Action required errors are handled here. | ||
| 1242 | */ | ||
| 1243 | void mce_notify_process(void) | ||
| 1244 | { | ||
| 1245 | unsigned long pfn; | ||
| 1246 | struct mce_info *mi = mce_find_info(); | ||
| 1247 | int flags = MF_ACTION_REQUIRED; | ||
| 1248 | |||
| 1249 | if (!mi) | ||
| 1250 | mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); | ||
| 1251 | pfn = mi->paddr >> PAGE_SHIFT; | ||
| 1252 | |||
| 1253 | clear_thread_flag(TIF_MCE_NOTIFY); | ||
| 1254 | |||
| 1255 | pr_err("Uncorrected hardware memory error in user-access at %llx", | ||
| 1256 | mi->paddr); | ||
| 1257 | /* | ||
| 1258 | * We must call memory_failure() here even if the current process is | ||
| 1259 | * doomed. We still need to mark the page as poisoned and alert any | ||
| 1260 | * other users of the page. | ||
| 1261 | */ | ||
| 1262 | if (!mi->restartable) | ||
| 1263 | flags |= MF_MUST_KILL; | ||
| 1264 | if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { | ||
| 1265 | pr_err("Memory error not recovered"); | ||
| 1266 | force_sig(SIGBUS, current); | ||
| 1267 | } | ||
| 1268 | mce_clear_info(mi); | ||
| 1269 | } | ||
| 1270 | |||
| 1271 | /* | ||
| 1272 | * Action optional processing happens here (picking up | 1218 | * Action optional processing happens here (picking up |
| 1273 | * from the list of faulting pages that do_machine_check() | 1219 | * from the list of faulting pages that do_machine_check() |
| 1274 | * placed into the "ring"). | 1220 | * placed into the "ring"). |
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index a3042989398c..ec2663a708e4 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
| 9 | 9 | ||
| 10 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
| 11 | #include <asm/traps.h> | ||
| 11 | #include <asm/mce.h> | 12 | #include <asm/mce.h> |
| 12 | #include <asm/msr.h> | 13 | #include <asm/msr.h> |
| 13 | 14 | ||
| @@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly; | |||
| 17 | /* Machine check handler for Pentium class Intel CPUs: */ | 18 | /* Machine check handler for Pentium class Intel CPUs: */ |
| 18 | static void pentium_machine_check(struct pt_regs *regs, long error_code) | 19 | static void pentium_machine_check(struct pt_regs *regs, long error_code) |
| 19 | { | 20 | { |
| 21 | enum ctx_state prev_state; | ||
| 20 | u32 loaddr, hi, lotype; | 22 | u32 loaddr, hi, lotype; |
| 21 | 23 | ||
| 24 | prev_state = ist_enter(regs); | ||
| 25 | |||
| 22 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); | 26 | rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); |
| 23 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); | 27 | rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); |
| 24 | 28 | ||
| @@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) | |||
| 33 | } | 37 | } |
| 34 | 38 | ||
| 35 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); | 39 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); |
| 40 | |||
| 41 | ist_exit(regs, prev_state); | ||
| 36 | } | 42 | } |
| 37 | 43 | ||
| 38 | /* Set up machine check reporting for processors with Intel style MCE: */ | 44 | /* Set up machine check reporting for processors with Intel style MCE: */ |
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 7dc5564d0cdf..bd5d46a32210 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c | |||
| @@ -7,14 +7,19 @@ | |||
| 7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
| 8 | 8 | ||
| 9 | #include <asm/processor.h> | 9 | #include <asm/processor.h> |
| 10 | #include <asm/traps.h> | ||
| 10 | #include <asm/mce.h> | 11 | #include <asm/mce.h> |
| 11 | #include <asm/msr.h> | 12 | #include <asm/msr.h> |
| 12 | 13 | ||
| 13 | /* Machine check handler for WinChip C6: */ | 14 | /* Machine check handler for WinChip C6: */ |
| 14 | static void winchip_machine_check(struct pt_regs *regs, long error_code) | 15 | static void winchip_machine_check(struct pt_regs *regs, long error_code) |
| 15 | { | 16 | { |
| 17 | enum ctx_state prev_state = ist_enter(regs); | ||
| 18 | |||
| 16 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); | 19 | printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); |
| 17 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); | 20 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); |
| 21 | |||
| 22 | ist_exit(regs, prev_state); | ||
| 18 | } | 23 | } |
| 19 | 24 | ||
| 20 | /* Set up machine check reporting on the Winchip C6 series */ | 25 | /* Set up machine check reporting on the Winchip C6 series */ |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c653dc437e6b..501212f14c87 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
| @@ -156,27 +156,6 @@ ENDPROC(native_usergs_sysret64) | |||
| 156 | movq \tmp,R11+\offset(%rsp) | 156 | movq \tmp,R11+\offset(%rsp) |
| 157 | .endm | 157 | .endm |
| 158 | 158 | ||
| 159 | .macro FAKE_STACK_FRAME child_rip | ||
| 160 | /* push in order ss, rsp, eflags, cs, rip */ | ||
| 161 | xorl %eax, %eax | ||
| 162 | pushq_cfi $__KERNEL_DS /* ss */ | ||
| 163 | /*CFI_REL_OFFSET ss,0*/ | ||
| 164 | pushq_cfi %rax /* rsp */ | ||
| 165 | CFI_REL_OFFSET rsp,0 | ||
| 166 | pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */ | ||
| 167 | /*CFI_REL_OFFSET rflags,0*/ | ||
| 168 | pushq_cfi $__KERNEL_CS /* cs */ | ||
| 169 | /*CFI_REL_OFFSET cs,0*/ | ||
| 170 | pushq_cfi \child_rip /* rip */ | ||
| 171 | CFI_REL_OFFSET rip,0 | ||
| 172 | pushq_cfi %rax /* orig rax */ | ||
| 173 | .endm | ||
| 174 | |||
| 175 | .macro UNFAKE_STACK_FRAME | ||
| 176 | addq $8*6, %rsp | ||
| 177 | CFI_ADJUST_CFA_OFFSET -(6*8) | ||
| 178 | .endm | ||
| 179 | |||
| 180 | /* | 159 | /* |
| 181 | * initial frame state for interrupts (and exceptions without error code) | 160 | * initial frame state for interrupts (and exceptions without error code) |
| 182 | */ | 161 | */ |
| @@ -239,51 +218,6 @@ ENDPROC(native_usergs_sysret64) | |||
| 239 | CFI_REL_OFFSET r15, R15+\offset | 218 | CFI_REL_OFFSET r15, R15+\offset |
| 240 | .endm | 219 | .endm |
| 241 | 220 | ||
| 242 | /* save partial stack frame */ | ||
| 243 | .macro SAVE_ARGS_IRQ | ||
| 244 | cld | ||
| 245 | /* start from rbp in pt_regs and jump over */ | ||
| 246 | movq_cfi rdi, (RDI-RBP) | ||
| 247 | movq_cfi rsi, (RSI-RBP) | ||
| 248 | movq_cfi rdx, (RDX-RBP) | ||
| 249 | movq_cfi rcx, (RCX-RBP) | ||
| 250 | movq_cfi rax, (RAX-RBP) | ||
| 251 | movq_cfi r8, (R8-RBP) | ||
| 252 | movq_cfi r9, (R9-RBP) | ||
| 253 | movq_cfi r10, (R10-RBP) | ||
| 254 | movq_cfi r11, (R11-RBP) | ||
| 255 | |||
| 256 | /* Save rbp so that we can unwind from get_irq_regs() */ | ||
| 257 | movq_cfi rbp, 0 | ||
| 258 | |||
| 259 | /* Save previous stack value */ | ||
| 260 | movq %rsp, %rsi | ||
| 261 | |||
| 262 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | ||
| 263 | testl $3, CS-RBP(%rsi) | ||
| 264 | je 1f | ||
| 265 | SWAPGS | ||
| 266 | /* | ||
| 267 | * irq_count is used to check if a CPU is already on an interrupt stack | ||
| 268 | * or not. While this is essentially redundant with preempt_count it is | ||
| 269 | * a little cheaper to use a separate counter in the PDA (short of | ||
| 270 | * moving irq_enter into assembly, which would be too much work) | ||
| 271 | */ | ||
| 272 | 1: incl PER_CPU_VAR(irq_count) | ||
| 273 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp | ||
| 274 | CFI_DEF_CFA_REGISTER rsi | ||
| 275 | |||
| 276 | /* Store previous stack value */ | ||
| 277 | pushq %rsi | ||
| 278 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | ||
| 279 | 0x77 /* DW_OP_breg7 */, 0, \ | ||
| 280 | 0x06 /* DW_OP_deref */, \ | ||
| 281 | 0x08 /* DW_OP_const1u */, SS+8-RBP, \ | ||
| 282 | 0x22 /* DW_OP_plus */ | ||
| 283 | /* We entered an interrupt context - irqs are off: */ | ||
| 284 | TRACE_IRQS_OFF | ||
| 285 | .endm | ||
| 286 | |||
| 287 | ENTRY(save_paranoid) | 221 | ENTRY(save_paranoid) |
| 288 | XCPT_FRAME 1 RDI+8 | 222 | XCPT_FRAME 1 RDI+8 |
| 289 | cld | 223 | cld |
| @@ -627,19 +561,6 @@ END(\label) | |||
| 627 | FORK_LIKE vfork | 561 | FORK_LIKE vfork |
| 628 | FIXED_FRAME stub_iopl, sys_iopl | 562 | FIXED_FRAME stub_iopl, sys_iopl |
| 629 | 563 | ||
| 630 | ENTRY(ptregscall_common) | ||
| 631 | DEFAULT_FRAME 1 8 /* offset 8: return address */ | ||
| 632 | RESTORE_TOP_OF_STACK %r11, 8 | ||
| 633 | movq_cfi_restore R15+8, r15 | ||
| 634 | movq_cfi_restore R14+8, r14 | ||
| 635 | movq_cfi_restore R13+8, r13 | ||
| 636 | movq_cfi_restore R12+8, r12 | ||
| 637 | movq_cfi_restore RBP+8, rbp | ||
| 638 | movq_cfi_restore RBX+8, rbx | ||
| 639 | ret $REST_SKIP /* pop extended registers */ | ||
| 640 | CFI_ENDPROC | ||
| 641 | END(ptregscall_common) | ||
| 642 | |||
| 643 | ENTRY(stub_execve) | 564 | ENTRY(stub_execve) |
| 644 | CFI_STARTPROC | 565 | CFI_STARTPROC |
| 645 | addq $8, %rsp | 566 | addq $8, %rsp |
| @@ -780,7 +701,48 @@ END(interrupt) | |||
| 780 | /* reserve pt_regs for scratch regs and rbp */ | 701 | /* reserve pt_regs for scratch regs and rbp */ |
| 781 | subq $ORIG_RAX-RBP, %rsp | 702 | subq $ORIG_RAX-RBP, %rsp |
| 782 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP | 703 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP |
| 783 | SAVE_ARGS_IRQ | 704 | cld |
| 705 | /* start from rbp in pt_regs and jump over */ | ||
| 706 | movq_cfi rdi, (RDI-RBP) | ||
| 707 | movq_cfi rsi, (RSI-RBP) | ||
| 708 | movq_cfi rdx, (RDX-RBP) | ||
| 709 | movq_cfi rcx, (RCX-RBP) | ||
| 710 | movq_cfi rax, (RAX-RBP) | ||
| 711 | movq_cfi r8, (R8-RBP) | ||
| 712 | movq_cfi r9, (R9-RBP) | ||
| 713 | movq_cfi r10, (R10-RBP) | ||
| 714 | movq_cfi r11, (R11-RBP) | ||
| 715 | |||
| 716 | /* Save rbp so that we can unwind from get_irq_regs() */ | ||
| 717 | movq_cfi rbp, 0 | ||
| 718 | |||
| 719 | /* Save previous stack value */ | ||
| 720 | movq %rsp, %rsi | ||
| 721 | |||
| 722 | leaq -RBP(%rsp),%rdi /* arg1 for handler */ | ||
| 723 | testl $3, CS-RBP(%rsi) | ||
| 724 | je 1f | ||
| 725 | SWAPGS | ||
| 726 | /* | ||
| 727 | * irq_count is used to check if a CPU is already on an interrupt stack | ||
| 728 | * or not. While this is essentially redundant with preempt_count it is | ||
| 729 | * a little cheaper to use a separate counter in the PDA (short of | ||
| 730 | * moving irq_enter into assembly, which would be too much work) | ||
| 731 | */ | ||
| 732 | 1: incl PER_CPU_VAR(irq_count) | ||
| 733 | cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp | ||
| 734 | CFI_DEF_CFA_REGISTER rsi | ||
| 735 | |||
| 736 | /* Store previous stack value */ | ||
| 737 | pushq %rsi | ||
| 738 | CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ | ||
| 739 | 0x77 /* DW_OP_breg7 */, 0, \ | ||
| 740 | 0x06 /* DW_OP_deref */, \ | ||
| 741 | 0x08 /* DW_OP_const1u */, SS+8-RBP, \ | ||
| 742 | 0x22 /* DW_OP_plus */ | ||
| 743 | /* We entered an interrupt context - irqs are off: */ | ||
| 744 | TRACE_IRQS_OFF | ||
| 745 | |||
| 784 | call \func | 746 | call \func |
| 785 | .endm | 747 | .endm |
| 786 | 748 | ||
| @@ -1049,6 +1011,11 @@ ENTRY(\sym) | |||
| 1049 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1011 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
| 1050 | 1012 | ||
| 1051 | .if \paranoid | 1013 | .if \paranoid |
| 1014 | .if \paranoid == 1 | ||
| 1015 | CFI_REMEMBER_STATE | ||
| 1016 | testl $3, CS(%rsp) /* If coming from userspace, switch */ | ||
| 1017 | jnz 1f /* stacks. */ | ||
| 1018 | .endif | ||
| 1052 | call save_paranoid | 1019 | call save_paranoid |
| 1053 | .else | 1020 | .else |
| 1054 | call error_entry | 1021 | call error_entry |
| @@ -1089,6 +1056,36 @@ ENTRY(\sym) | |||
| 1089 | jmp error_exit /* %ebx: no swapgs flag */ | 1056 | jmp error_exit /* %ebx: no swapgs flag */ |
| 1090 | .endif | 1057 | .endif |
| 1091 | 1058 | ||
| 1059 | .if \paranoid == 1 | ||
| 1060 | CFI_RESTORE_STATE | ||
| 1061 | /* | ||
| 1062 | * Paranoid entry from userspace. Switch stacks and treat it | ||
| 1063 | * as a normal entry. This means that paranoid handlers | ||
| 1064 | * run in real process context if user_mode(regs). | ||
| 1065 | */ | ||
| 1066 | 1: | ||
| 1067 | call error_entry | ||
| 1068 | |||
| 1069 | DEFAULT_FRAME 0 | ||
| 1070 | |||
| 1071 | movq %rsp,%rdi /* pt_regs pointer */ | ||
| 1072 | call sync_regs | ||
| 1073 | movq %rax,%rsp /* switch stack */ | ||
| 1074 | |||
| 1075 | movq %rsp,%rdi /* pt_regs pointer */ | ||
| 1076 | |||
| 1077 | .if \has_error_code | ||
| 1078 | movq ORIG_RAX(%rsp),%rsi /* get error code */ | ||
| 1079 | movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ | ||
| 1080 | .else | ||
| 1081 | xorl %esi,%esi /* no error code */ | ||
| 1082 | .endif | ||
| 1083 | |||
| 1084 | call \do_sym | ||
| 1085 | |||
| 1086 | jmp error_exit /* %ebx: no swapgs flag */ | ||
| 1087 | .endif | ||
| 1088 | |||
| 1092 | CFI_ENDPROC | 1089 | CFI_ENDPROC |
| 1093 | END(\sym) | 1090 | END(\sym) |
| 1094 | .endm | 1091 | .endm |
| @@ -1109,7 +1106,7 @@ idtentry overflow do_overflow has_error_code=0 | |||
| 1109 | idtentry bounds do_bounds has_error_code=0 | 1106 | idtentry bounds do_bounds has_error_code=0 |
| 1110 | idtentry invalid_op do_invalid_op has_error_code=0 | 1107 | idtentry invalid_op do_invalid_op has_error_code=0 |
| 1111 | idtentry device_not_available do_device_not_available has_error_code=0 | 1108 | idtentry device_not_available do_device_not_available has_error_code=0 |
| 1112 | idtentry double_fault do_double_fault has_error_code=1 paranoid=1 | 1109 | idtentry double_fault do_double_fault has_error_code=1 paranoid=2 |
| 1113 | idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 | 1110 | idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 |
| 1114 | idtentry invalid_TSS do_invalid_TSS has_error_code=1 | 1111 | idtentry invalid_TSS do_invalid_TSS has_error_code=1 |
| 1115 | idtentry segment_not_present do_segment_not_present has_error_code=1 | 1112 | idtentry segment_not_present do_segment_not_present has_error_code=1 |
| @@ -1290,16 +1287,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( | |||
| 1290 | #endif | 1287 | #endif |
| 1291 | 1288 | ||
| 1292 | /* | 1289 | /* |
| 1293 | * "Paranoid" exit path from exception stack. | 1290 | * "Paranoid" exit path from exception stack. This is invoked |
| 1294 | * Paranoid because this is used by NMIs and cannot take | 1291 | * only on return from non-NMI IST interrupts that came |
| 1295 | * any kernel state for granted. | 1292 | * from kernel space. |
| 1296 | * We don't do kernel preemption checks here, because only | ||
| 1297 | * NMI should be common and it does not enable IRQs and | ||
| 1298 | * cannot get reschedule ticks. | ||
| 1299 | * | 1293 | * |
| 1300 | * "trace" is 0 for the NMI handler only, because irq-tracing | 1294 | * We may be returning to very strange contexts (e.g. very early |
| 1301 | * is fundamentally NMI-unsafe. (we cannot change the soft and | 1295 | * in syscall entry), so checking for preemption here would |
| 1302 | * hard flags at once, atomically) | 1296 | * be complicated. Fortunately, we there's no good reason |
| 1297 | * to try to handle preemption here. | ||
| 1303 | */ | 1298 | */ |
| 1304 | 1299 | ||
| 1305 | /* ebx: no swapgs flag */ | 1300 | /* ebx: no swapgs flag */ |
| @@ -1309,43 +1304,14 @@ ENTRY(paranoid_exit) | |||
| 1309 | TRACE_IRQS_OFF_DEBUG | 1304 | TRACE_IRQS_OFF_DEBUG |
| 1310 | testl %ebx,%ebx /* swapgs needed? */ | 1305 | testl %ebx,%ebx /* swapgs needed? */ |
| 1311 | jnz paranoid_restore | 1306 | jnz paranoid_restore |
| 1312 | testl $3,CS(%rsp) | ||
| 1313 | jnz paranoid_userspace | ||
| 1314 | paranoid_swapgs: | ||
| 1315 | TRACE_IRQS_IRETQ 0 | 1307 | TRACE_IRQS_IRETQ 0 |
| 1316 | SWAPGS_UNSAFE_STACK | 1308 | SWAPGS_UNSAFE_STACK |
| 1317 | RESTORE_ALL 8 | 1309 | RESTORE_ALL 8 |
| 1318 | jmp irq_return | 1310 | INTERRUPT_RETURN |
| 1319 | paranoid_restore: | 1311 | paranoid_restore: |
| 1320 | TRACE_IRQS_IRETQ_DEBUG 0 | 1312 | TRACE_IRQS_IRETQ_DEBUG 0 |
| 1321 | RESTORE_ALL 8 | 1313 | RESTORE_ALL 8 |
| 1322 | jmp irq_return | 1314 | INTERRUPT_RETURN |
| 1323 | paranoid_userspace: | ||
| 1324 | GET_THREAD_INFO(%rcx) | ||
| 1325 | movl TI_flags(%rcx),%ebx | ||
| 1326 | andl $_TIF_WORK_MASK,%ebx | ||
| 1327 | jz paranoid_swapgs | ||
| 1328 | movq %rsp,%rdi /* &pt_regs */ | ||
| 1329 | call sync_regs | ||
| 1330 | movq %rax,%rsp /* switch stack for scheduling */ | ||
| 1331 | testl $_TIF_NEED_RESCHED,%ebx | ||
| 1332 | jnz paranoid_schedule | ||
| 1333 | movl %ebx,%edx /* arg3: thread flags */ | ||
| 1334 | TRACE_IRQS_ON | ||
| 1335 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
| 1336 | xorl %esi,%esi /* arg2: oldset */ | ||
| 1337 | movq %rsp,%rdi /* arg1: &pt_regs */ | ||
| 1338 | call do_notify_resume | ||
| 1339 | DISABLE_INTERRUPTS(CLBR_NONE) | ||
| 1340 | TRACE_IRQS_OFF | ||
| 1341 | jmp paranoid_userspace | ||
| 1342 | paranoid_schedule: | ||
| 1343 | TRACE_IRQS_ON | ||
| 1344 | ENABLE_INTERRUPTS(CLBR_ANY) | ||
| 1345 | SCHEDULE_USER | ||
| 1346 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
| 1347 | TRACE_IRQS_OFF | ||
| 1348 | jmp paranoid_userspace | ||
| 1349 | CFI_ENDPROC | 1315 | CFI_ENDPROC |
| 1350 | END(paranoid_exit) | 1316 | END(paranoid_exit) |
| 1351 | 1317 | ||
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 63ce838e5a54..28d28f5eb8f4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c | |||
| @@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack) | |||
| 69 | : "memory", "cc", "edx", "ecx", "eax"); | 69 | : "memory", "cc", "edx", "ecx", "eax"); |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | /* how to get the current stack pointer from C */ | ||
| 73 | #define current_stack_pointer ({ \ | ||
| 74 | unsigned long sp; \ | ||
| 75 | asm("mov %%esp,%0" : "=g" (sp)); \ | ||
| 76 | sp; \ | ||
| 77 | }) | ||
| 78 | |||
| 79 | static inline void *current_stack(void) | 72 | static inline void *current_stack(void) |
| 80 | { | 73 | { |
| 81 | return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); | 74 | return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); |
| 82 | } | 75 | } |
| 83 | 76 | ||
| 84 | static inline int | 77 | static inline int |
| @@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) | |||
| 103 | 96 | ||
| 104 | /* Save the next esp at the bottom of the stack */ | 97 | /* Save the next esp at the bottom of the stack */ |
| 105 | prev_esp = (u32 *)irqstk; | 98 | prev_esp = (u32 *)irqstk; |
| 106 | *prev_esp = current_stack_pointer; | 99 | *prev_esp = current_stack_pointer(); |
| 107 | 100 | ||
| 108 | if (unlikely(overflow)) | 101 | if (unlikely(overflow)) |
| 109 | call_on_stack(print_stack_overflow, isp); | 102 | call_on_stack(print_stack_overflow, isp); |
| @@ -156,7 +149,7 @@ void do_softirq_own_stack(void) | |||
| 156 | 149 | ||
| 157 | /* Push the previous esp onto the stack */ | 150 | /* Push the previous esp onto the stack */ |
| 158 | prev_esp = (u32 *)irqstk; | 151 | prev_esp = (u32 *)irqstk; |
| 159 | *prev_esp = current_stack_pointer; | 152 | *prev_esp = current_stack_pointer(); |
| 160 | 153 | ||
| 161 | call_on_stack(__do_softirq, isp); | 154 | call_on_stack(__do_softirq, isp); |
| 162 | } | 155 | } |
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ed37a768d0fc..2a33c8f68319 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c | |||
| @@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) | |||
| 740 | { | 740 | { |
| 741 | user_exit(); | 741 | user_exit(); |
| 742 | 742 | ||
| 743 | #ifdef CONFIG_X86_MCE | ||
| 744 | /* notify userspace of pending MCEs */ | ||
| 745 | if (thread_info_flags & _TIF_MCE_NOTIFY) | ||
| 746 | mce_notify_process(); | ||
| 747 | #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ | ||
| 748 | |||
| 749 | if (thread_info_flags & _TIF_UPROBE) | 743 | if (thread_info_flags & _TIF_UPROBE) |
| 750 | uprobe_notify_resume(regs); | 744 | uprobe_notify_resume(regs); |
| 751 | 745 | ||
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 88900e288021..7176f84f95a4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
| @@ -108,6 +108,77 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) | |||
| 108 | preempt_count_dec(); | 108 | preempt_count_dec(); |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | enum ctx_state ist_enter(struct pt_regs *regs) | ||
| 112 | { | ||
| 113 | /* | ||
| 114 | * We are atomic because we're on the IST stack (or we're on x86_32, | ||
| 115 | * in which case we still shouldn't schedule. | ||
| 116 | */ | ||
| 117 | preempt_count_add(HARDIRQ_OFFSET); | ||
| 118 | |||
| 119 | if (user_mode_vm(regs)) { | ||
| 120 | /* Other than that, we're just an exception. */ | ||
| 121 | return exception_enter(); | ||
| 122 | } else { | ||
| 123 | /* | ||
| 124 | * We might have interrupted pretty much anything. In | ||
| 125 | * fact, if we're a machine check, we can even interrupt | ||
| 126 | * NMI processing. We don't want in_nmi() to return true, | ||
| 127 | * but we need to notify RCU. | ||
| 128 | */ | ||
| 129 | rcu_nmi_enter(); | ||
| 130 | return IN_KERNEL; /* the value is irrelevant. */ | ||
| 131 | } | ||
| 132 | } | ||
| 133 | |||
| 134 | void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) | ||
| 135 | { | ||
| 136 | preempt_count_sub(HARDIRQ_OFFSET); | ||
| 137 | |||
| 138 | if (user_mode_vm(regs)) | ||
| 139 | return exception_exit(prev_state); | ||
| 140 | else | ||
| 141 | rcu_nmi_exit(); | ||
| 142 | } | ||
| 143 | |||
| 144 | /** | ||
| 145 | * ist_begin_non_atomic() - begin a non-atomic section in an IST exception | ||
| 146 | * @regs: regs passed to the IST exception handler | ||
| 147 | * | ||
| 148 | * IST exception handlers normally cannot schedule. As a special | ||
| 149 | * exception, if the exception interrupted userspace code (i.e. | ||
| 150 | * user_mode_vm(regs) would return true) and the exception was not | ||
| 151 | * a double fault, it can be safe to schedule. ist_begin_non_atomic() | ||
| 152 | * begins a non-atomic section within an ist_enter()/ist_exit() region. | ||
| 153 | * Callers are responsible for enabling interrupts themselves inside | ||
| 154 | * the non-atomic section, and callers must call ist_end_non_atomic() | ||
| 155 | * before ist_exit(). | ||
| 156 | */ | ||
| 157 | void ist_begin_non_atomic(struct pt_regs *regs) | ||
| 158 | { | ||
| 159 | BUG_ON(!user_mode_vm(regs)); | ||
| 160 | |||
| 161 | /* | ||
| 162 | * Sanity check: we need to be on the normal thread stack. This | ||
| 163 | * will catch asm bugs and any attempt to use ist_preempt_enable | ||
| 164 | * from double_fault. | ||
| 165 | */ | ||
| 166 | BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) | ||
| 167 | & ~(THREAD_SIZE - 1)) != 0); | ||
| 168 | |||
| 169 | preempt_count_sub(HARDIRQ_OFFSET); | ||
| 170 | } | ||
| 171 | |||
| 172 | /** | ||
| 173 | * ist_end_non_atomic() - end a non-atomic section in an IST exception | ||
| 174 | * | ||
| 175 | * Ends a non-atomic section started with ist_begin_non_atomic(). | ||
| 176 | */ | ||
| 177 | void ist_end_non_atomic(void) | ||
| 178 | { | ||
| 179 | preempt_count_add(HARDIRQ_OFFSET); | ||
| 180 | } | ||
| 181 | |||
| 111 | static nokprobe_inline int | 182 | static nokprobe_inline int |
| 112 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, | 183 | do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, |
| 113 | struct pt_regs *regs, long error_code) | 184 | struct pt_regs *regs, long error_code) |
| @@ -251,6 +322,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
| 251 | * end up promoting it to a doublefault. In that case, modify | 322 | * end up promoting it to a doublefault. In that case, modify |
| 252 | * the stack to make it look like we just entered the #GP | 323 | * the stack to make it look like we just entered the #GP |
| 253 | * handler from user space, similar to bad_iret. | 324 | * handler from user space, similar to bad_iret. |
| 325 | * | ||
| 326 | * No need for ist_enter here because we don't use RCU. | ||
| 254 | */ | 327 | */ |
| 255 | if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && | 328 | if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && |
| 256 | regs->cs == __KERNEL_CS && | 329 | regs->cs == __KERNEL_CS && |
| @@ -263,12 +336,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | |||
| 263 | normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ | 336 | normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ |
| 264 | regs->ip = (unsigned long)general_protection; | 337 | regs->ip = (unsigned long)general_protection; |
| 265 | regs->sp = (unsigned long)&normal_regs->orig_ax; | 338 | regs->sp = (unsigned long)&normal_regs->orig_ax; |
| 339 | |||
| 266 | return; | 340 | return; |
| 267 | } | 341 | } |
| 268 | #endif | 342 | #endif |
| 269 | 343 | ||
| 270 | exception_enter(); | 344 | ist_enter(regs); /* Discard prev_state because we won't return. */ |
| 271 | /* Return not checked because double check cannot be ignored */ | ||
| 272 | notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); | 345 | notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); |
| 273 | 346 | ||
| 274 | tsk->thread.error_code = error_code; | 347 | tsk->thread.error_code = error_code; |
| @@ -434,7 +507,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) | |||
| 434 | if (poke_int3_handler(regs)) | 507 | if (poke_int3_handler(regs)) |
| 435 | return; | 508 | return; |
| 436 | 509 | ||
| 437 | prev_state = exception_enter(); | 510 | prev_state = ist_enter(regs); |
| 438 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP | 511 | #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP |
| 439 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, | 512 | if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, |
| 440 | SIGTRAP) == NOTIFY_STOP) | 513 | SIGTRAP) == NOTIFY_STOP) |
| @@ -460,33 +533,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) | |||
| 460 | preempt_conditional_cli(regs); | 533 | preempt_conditional_cli(regs); |
| 461 | debug_stack_usage_dec(); | 534 | debug_stack_usage_dec(); |
| 462 | exit: | 535 | exit: |
| 463 | exception_exit(prev_state); | 536 | ist_exit(regs, prev_state); |
| 464 | } | 537 | } |
| 465 | NOKPROBE_SYMBOL(do_int3); | 538 | NOKPROBE_SYMBOL(do_int3); |
| 466 | 539 | ||
| 467 | #ifdef CONFIG_X86_64 | 540 | #ifdef CONFIG_X86_64 |
| 468 | /* | 541 | /* |
| 469 | * Help handler running on IST stack to switch back to user stack | 542 | * Help handler running on IST stack to switch off the IST stack if the |
| 470 | * for scheduling or signal handling. The actual stack switch is done in | 543 | * interrupted code was in user mode. The actual stack switch is done in |
| 471 | * entry.S | 544 | * entry_64.S |
| 472 | */ | 545 | */ |
| 473 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) | 546 | asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) |
| 474 | { | 547 | { |
| 475 | struct pt_regs *regs = eregs; | 548 | struct pt_regs *regs = task_pt_regs(current); |
| 476 | /* Did already sync */ | 549 | *regs = *eregs; |
| 477 | if (eregs == (struct pt_regs *)eregs->sp) | ||
| 478 | ; | ||
| 479 | /* Exception from user space */ | ||
| 480 | else if (user_mode(eregs)) | ||
| 481 | regs = task_pt_regs(current); | ||
| 482 | /* | ||
| 483 | * Exception from kernel and interrupts are enabled. Move to | ||
| 484 | * kernel process stack. | ||
| 485 | */ | ||
| 486 | else if (eregs->flags & X86_EFLAGS_IF) | ||
| 487 | regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); | ||
| 488 | if (eregs != regs) | ||
| 489 | *regs = *eregs; | ||
| 490 | return regs; | 550 | return regs; |
| 491 | } | 551 | } |
| 492 | NOKPROBE_SYMBOL(sync_regs); | 552 | NOKPROBE_SYMBOL(sync_regs); |
| @@ -554,7 +614,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
| 554 | unsigned long dr6; | 614 | unsigned long dr6; |
| 555 | int si_code; | 615 | int si_code; |
| 556 | 616 | ||
| 557 | prev_state = exception_enter(); | 617 | prev_state = ist_enter(regs); |
| 558 | 618 | ||
| 559 | get_debugreg(dr6, 6); | 619 | get_debugreg(dr6, 6); |
| 560 | 620 | ||
| @@ -629,7 +689,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) | |||
| 629 | debug_stack_usage_dec(); | 689 | debug_stack_usage_dec(); |
| 630 | 690 | ||
| 631 | exit: | 691 | exit: |
| 632 | exception_exit(prev_state); | 692 | ist_exit(regs, prev_state); |
| 633 | } | 693 | } |
| 634 | NOKPROBE_SYMBOL(do_debug); | 694 | NOKPROBE_SYMBOL(do_debug); |
| 635 | 695 | ||
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7680fc275036..4c106fcc0d54 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -759,39 +759,71 @@ void rcu_irq_enter(void) | |||
| 759 | /** | 759 | /** |
| 760 | * rcu_nmi_enter - inform RCU of entry to NMI context | 760 | * rcu_nmi_enter - inform RCU of entry to NMI context |
| 761 | * | 761 | * |
| 762 | * If the CPU was idle with dynamic ticks active, and there is no | 762 | * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and |
| 763 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 763 | * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know |
| 764 | * RCU grace-period handling know that the CPU is active. | 764 | * that the CPU is active. This implementation permits nested NMIs, as |
| 765 | * long as the nesting level does not overflow an int. (You will probably | ||
| 766 | * run out of stack space first.) | ||
| 765 | */ | 767 | */ |
| 766 | void rcu_nmi_enter(void) | 768 | void rcu_nmi_enter(void) |
| 767 | { | 769 | { |
| 768 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 770 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 771 | int incby = 2; | ||
| 769 | 772 | ||
| 770 | if (rdtp->dynticks_nmi_nesting == 0 && | 773 | /* Complain about underflow. */ |
| 771 | (atomic_read(&rdtp->dynticks) & 0x1)) | 774 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); |
| 772 | return; | 775 | |
| 773 | rdtp->dynticks_nmi_nesting++; | 776 | /* |
| 774 | smp_mb__before_atomic(); /* Force delay from prior write. */ | 777 | * If idle from RCU viewpoint, atomically increment ->dynticks |
| 775 | atomic_inc(&rdtp->dynticks); | 778 | * to mark non-idle and increment ->dynticks_nmi_nesting by one. |
| 776 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 779 | * Otherwise, increment ->dynticks_nmi_nesting by two. This means |
| 777 | smp_mb__after_atomic(); /* See above. */ | 780 | * if ->dynticks_nmi_nesting is equal to one, we are guaranteed |
| 778 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 781 | * to be in the outermost NMI handler that interrupted an RCU-idle |
| 782 | * period (observation due to Andy Lutomirski). | ||
| 783 | */ | ||
| 784 | if (!(atomic_read(&rdtp->dynticks) & 0x1)) { | ||
| 785 | smp_mb__before_atomic(); /* Force delay from prior write. */ | ||
| 786 | atomic_inc(&rdtp->dynticks); | ||
| 787 | /* atomic_inc() before later RCU read-side crit sects */ | ||
| 788 | smp_mb__after_atomic(); /* See above. */ | ||
| 789 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 790 | incby = 1; | ||
| 791 | } | ||
| 792 | rdtp->dynticks_nmi_nesting += incby; | ||
| 793 | barrier(); | ||
| 779 | } | 794 | } |
| 780 | 795 | ||
| 781 | /** | 796 | /** |
| 782 | * rcu_nmi_exit - inform RCU of exit from NMI context | 797 | * rcu_nmi_exit - inform RCU of exit from NMI context |
| 783 | * | 798 | * |
| 784 | * If the CPU was idle with dynamic ticks active, and there is no | 799 | * If we are returning from the outermost NMI handler that interrupted an |
| 785 | * irq handler running, this updates rdtp->dynticks_nmi to let the | 800 | * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting |
| 786 | * RCU grace-period handling know that the CPU is no longer active. | 801 | * to let the RCU grace-period handling know that the CPU is back to |
| 802 | * being RCU-idle. | ||
| 787 | */ | 803 | */ |
| 788 | void rcu_nmi_exit(void) | 804 | void rcu_nmi_exit(void) |
| 789 | { | 805 | { |
| 790 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 806 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 791 | 807 | ||
| 792 | if (rdtp->dynticks_nmi_nesting == 0 || | 808 | /* |
| 793 | --rdtp->dynticks_nmi_nesting != 0) | 809 | * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. |
| 810 | * (We are exiting an NMI handler, so RCU better be paying attention | ||
| 811 | * to us!) | ||
| 812 | */ | ||
| 813 | WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); | ||
| 814 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 815 | |||
| 816 | /* | ||
| 817 | * If the nesting level is not 1, the CPU wasn't RCU-idle, so | ||
| 818 | * leave it in non-RCU-idle state. | ||
| 819 | */ | ||
| 820 | if (rdtp->dynticks_nmi_nesting != 1) { | ||
| 821 | rdtp->dynticks_nmi_nesting -= 2; | ||
| 794 | return; | 822 | return; |
| 823 | } | ||
| 824 | |||
| 825 | /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ | ||
| 826 | rdtp->dynticks_nmi_nesting = 0; | ||
| 795 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 827 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 796 | smp_mb__before_atomic(); /* See above. */ | 828 | smp_mb__before_atomic(); /* See above. */ |
| 797 | atomic_inc(&rdtp->dynticks); | 829 | atomic_inc(&rdtp->dynticks); |
