aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c129
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c2
2 files changed, 109 insertions, 22 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fed875742b1..268b05edade 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s). 4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it. 5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
6 */ 8 */
7 9
8#include <linux/init.h> 10#include <linux/init.h>
@@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
189} 191}
190 192
191/* 193/*
192 * The actual machine check handler 194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* read once; shared by every record logged below */
207 for (i = 0; i < banks; i++) {
208 if (!bank[i])
209 continue;
210
211 m.misc = 0; /* reset the per-bank fields before reading this bank */
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0; /* NOTE(review): zeroed for every bank, so the MCP_TIMESTAMP test below has nothing left to keep - confirm intent */
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything (the caller asks for this by passing MCP_UC).
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV) /* MISC/ADDR are read only when status flags them */
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245
246 /*
247 * Clear state for this bank. Only banks that were logged
248 * reach this point; skipped banks keep their status.
249 */
249 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
250 }
251
252 /*
253 * Don't clear MCG_STATUS here because it's only defined for
254 * exceptions.
255 */
256}
257
258/*
259 * The actual machine check handler. This only handles real
260 * exceptions when something got corrupted coming in through int 18.
261 *
262 * This is executed in NMI context not subject to normal locking rules. This
263 * implies that most kernel services cannot be safely used. Don't even
264 * think about putting a printk in there!
193 */ 265 */
194void do_machine_check(struct pt_regs * regs, long error_code) 266void do_machine_check(struct pt_regs * regs, long error_code)
195{ 267{
@@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code)
207 * error. 279 * error.
208 */ 280 */
209 int kill_it = 0; 281 int kill_it = 0;
282 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
210 283
211 atomic_inc(&mce_entry); 284 atomic_inc(&mce_entry);
212 285
213 if ((regs 286 if (notify_die(DIE_NMI, "machine check", regs, error_code,
214 && notify_die(DIE_NMI, "machine check", regs, error_code,
215 18, SIGKILL) == NOTIFY_STOP) 287 18, SIGKILL) == NOTIFY_STOP)
216 || !banks) 288 goto out2;
289 if (!banks)
217 goto out2; 290 goto out2;
218 291
219 mce_setup(&m); 292 mce_setup(&m);
@@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
227 barrier(); 300 barrier();
228 301
229 for (i = 0; i < banks; i++) { 302 for (i = 0; i < banks; i++) {
303 __clear_bit(i, toclear);
230 if (!bank[i]) 304 if (!bank[i])
231 continue; 305 continue;
232 306
@@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code)
238 if ((m.status & MCI_STATUS_VAL) == 0) 312 if ((m.status & MCI_STATUS_VAL) == 0)
239 continue; 313 continue;
240 314
315 /*
316 * Errors without MCI_STATUS_UC set are handled by
317 * machine_check_poll(). Leave them alone.
318 */
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
321
322 /*
323 * Set taint even when machine check was not enabled.
324 */
325 add_taint(TAINT_MACHINE_CHECK);
326
327 __set_bit(i, toclear);
328
241 if (m.status & MCI_STATUS_EN) { 329 if (m.status & MCI_STATUS_EN) {
242 /* if PCC was set, there's no way out */ 330 /* if PCC was set, there's no way out */
243 no_way_out |= !!(m.status & MCI_STATUS_PCC); 331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
251 no_way_out = 1; 339 no_way_out = 1;
252 kill_it = 1; 340 kill_it = 1;
253 } 341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
254 } 348 }
255 349
256 if (m.status & MCI_STATUS_MISCV) 350 if (m.status & MCI_STATUS_MISCV)
@@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
259 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); 353 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
260 354
261 mce_get_rip(&m, regs); 355 mce_get_rip(&m, regs);
262 if (error_code < 0) 356 mce_log(&m);
263 m.tsc = 0;
264 if (error_code != -2)
265 mce_log(&m);
266 357
267 /* Did this bank cause the exception? */ 358 /* Did this bank cause the exception? */
268 /* Assume that the bank with uncorrectable errors did it, 359 /* Assume that the bank with uncorrectable errors did it,
@@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
271 panicm = m; 362 panicm = m;
272 panicm_found = 1; 363 panicm_found = 1;
273 } 364 }
274
275 add_taint(TAINT_MACHINE_CHECK);
276 } 365 }
277 366
278 /* Never do anything final in the polling timer */
279 if (!regs)
280 goto out;
281
282 /* If we didn't find an uncorrectable error, pick 367 /* If we didn't find an uncorrectable error, pick
283 the last one (shouldn't happen, just being safe). */ 368 the last one (shouldn't happen, just being safe). */
284 if (!panicm_found) 369 if (!panicm_found)
@@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
325 /* notify userspace ASAP */ 410 /* notify userspace ASAP */
326 set_thread_flag(TIF_MCE_NOTIFY); 411 set_thread_flag(TIF_MCE_NOTIFY);
327 412
328 out:
329 /* the last thing we do is clear state */ 413 /* the last thing we do is clear state */
330 for (i = 0; i < banks; i++) 414 for (i = 0; i < banks; i++) {
331 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 415 if (test_bit(i, toclear))
416 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
417 }
332 wrmsrl(MSR_IA32_MCG_STATUS, 0); 418 wrmsrl(MSR_IA32_MCG_STATUS, 0);
333 out2: 419 out2:
334 atomic_dec(&mce_entry); 420 atomic_dec(&mce_entry);
@@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data)
377 WARN_ON(smp_processor_id() != data); 463 WARN_ON(smp_processor_id() != data);
378 464
379 if (mce_available(&current_cpu_data)) 465 if (mce_available(&current_cpu_data))
380 do_machine_check(NULL, 0); 466 machine_check_poll(MCP_TIMESTAMP);
381 467
382 /* 468 /*
383 * Alert userspace if needed. If we logged an MCE, reduce the 469 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -494,9 +580,10 @@ static void mce_init(void *dummy)
494 u64 cap; 580 u64 cap;
495 int i; 581 int i;
496 582
497 /* Log the machine checks left over from the previous reset. 583 /*
498 This also clears all registers */ 584 * Log the machine checks left over from the previous reset.
499 do_machine_check(NULL, mce_bootlog ? -1 : -2); 585 */
586 machine_check_poll(MCP_UC);
500 587
501 set_in_cr4(X86_CR4_MCE); 588 set_in_cr4(X86_CR4_MCE);
502 589
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 75d9dd25e3d..0069c653f4e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void)
231 231
232 /* Log the machine check that caused the threshold 232 /* Log the machine check that caused the threshold
233 event. */ 233 event. */
234 do_machine_check(NULL, 0); 234 machine_check_poll(MCP_TIMESTAMP);
235 235
236 if (high & MASK_OVERFLOW_HI) { 236 if (high & MASK_OVERFLOW_HI) {
237 rdmsrl(address, m.misc); 237 rdmsrl(address, m.misc);