Diffstat (limited to 'arch/x86/kernel')
 arch/x86/kernel/alternative.c             |  17 +-
 arch/x86/kernel/cpu/mcheck/mce_32.c       |  14 -
 arch/x86/kernel/cpu/mcheck/mce_64.c       | 505 ++++++++++++++++++++---------
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c   |   6 +-
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c |   2 +-
 5 files changed, 391 insertions(+), 153 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a84ac7b570e6..5b8394a3a6b2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
	   that might execute the to be patched code.
	   Other CPUs are not running. */
	stop_nmi();
-#ifdef CONFIG_X86_MCE
-	stop_mce();
-#endif
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than an unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during code
+	 * patching.
+	 */
 
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
 					    (unsigned long)__smp_locks_end);
 
 	restart_nmi();
-#ifdef CONFIG_X86_MCE
-	restart_mce();
-#endif
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633e..3552119b091d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 static int __init mcheck_disable(char *str)
 {
 	mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f2..a4a7c686ce90 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -24,6 +26,8 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +36,12 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR	227
-#define NR_SYSFS_BANKS		6
+
+/*
+ * To support more than 128 banks we would need to escape the
+ * predefined Linux-defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
 
 atomic_t mce_entry;
 
@@ -47,7 +56,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -58,6 +67,14 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+	memset(m, 0, sizeof(struct mce));
+	m->cpu = smp_processor_id();
+	rdtscll(m->tsc);
+}
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
 		print_symbol("{%s}", m->ip);
 		printk("\n");
 	}
-	printk(KERN_EMERG "TSC %Lx ", m->tsc);
+	printk(KERN_EMERG "TSC %llx ", m->tsc);
 	if (m->addr)
-		printk("ADDR %Lx ", m->addr);
+		printk("ADDR %llx ", m->addr);
 	if (m->misc)
-		printk("MISC %Lx ", m->misc);
+		printk("MISC %llx ", m->misc);
 	printk("\n");
 	printk(KERN_EMERG "This is not a software problem!\n");
 	printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -151,6 +168,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 
 static int mce_available(struct cpuinfo_x86 *c)
 {
+	if (mce_dont_init)
+		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags)
+{
+	struct mce m;
+	int i;
+
+	mce_setup(&m);
+
+	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	for (i = 0; i < banks; i++) {
+		if (!bank[i])
+			continue;
+
+		m.misc = 0;
+		m.addr = 0;
+		m.bank = i;
+		m.tsc = 0;
+
+		barrier();
+		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		if (!(m.status & MCI_STATUS_VAL))
+			continue;
+
+		/*
+		 * Uncorrected events are handled by the exception handler
+		 * when it is enabled. But when the exception is disabled,
+		 * log everything.
+		 *
+		 * TBD: do the same check for MCI_STATUS_EN here?
+		 */
+		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+			continue;
+
+		if (m.status & MCI_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+		if (m.status & MCI_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+		if (!(flags & MCP_TIMESTAMP))
+			m.tsc = 0;
+		/*
+		 * Don't get the IP here because it's unlikely to
+		 * have anything to do with the actual error location.
+		 */
+
+		mce_log(&m);
+		add_taint(TAINT_MACHINE_CHECK);
+
+		/*
+		 * Clear state for this bank.
+		 */
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+
+	/*
+	 * Don't clear MCG_STATUS here because it's only defined for
+	 * exceptions.
+	 */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context, not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't
+ * even think about putting a printk in there!
  */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
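The MSR arithmetic above is worth a note: each machine check bank owns a
group of four architectural MSRs, so bank i's registers sit at a fixed
stride of 4 from the bank-0 addresses. A minimal sketch of that layout;
the MCx_* helper names are illustrative and not part of this patch:

	/* Per-bank MSR layout: CTL, STATUS, ADDR, MISC repeat every 4 MSRs */
	#define MCx_CTL(i)	(MSR_IA32_MC0_CTL + (i) * 4)	/* enable mask   */
	#define MCx_STATUS(i)	(MSR_IA32_MC0_STATUS + (i) * 4)	/* error status  */
	#define MCx_ADDR(i)	(MSR_IA32_MC0_ADDR + (i) * 4)	/* error address */
	#define MCx_MISC(i)	(MSR_IA32_MC0_MISC + (i) * 4)	/* extra info    */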
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	 * error.
 	 */
 	int kill_it = 0;
+	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 
 	atomic_inc(&mce_entry);
 
-	if ((regs
-	     && notify_die(DIE_NMI, "machine check", regs, error_code,
+	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-	    || !banks)
 		goto out2;
+	if (!banks)
+		goto out2;
+
+	mce_setup(&m);
 
-	memset(&m, 0, sizeof(struct mce));
-	m.cpu = smp_processor_id();
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS && !bank[i])
+		__clear_bit(i, toclear);
+		if (!bank[i])
 			continue;
 
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 		if ((m.status & MCI_STATUS_VAL) == 0)
 			continue;
 
+		/*
+		 * Corrected errors are handled by machine_check_poll().
+		 * Leave them alone.
+		 */
+		if ((m.status & MCI_STATUS_UC) == 0)
+			continue;
+
+		/*
+		 * Set taint even when the machine check was not enabled.
+		 */
+		add_taint(TAINT_MACHINE_CHECK);
+
+		__set_bit(i, toclear);
+
 		if (m.status & MCI_STATUS_EN) {
 			/* if PCC was set, there's no way out */
 			no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 				no_way_out = 1;
 				kill_it = 1;
 			}
+		} else {
+			/*
+			 * Machine check event was not enabled. Clear, but
+			 * ignore.
+			 */
+			continue;
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code >= 0)
-			rdtscll(m.tsc);
-		if (error_code != -2)
-			mce_log(&m);
+		mce_log(&m);
 
 		/* Did this bank cause the exception? */
 		/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			panicm = m;
 			panicm_found = 1;
 		}
-
-		add_taint(TAINT_MACHINE_CHECK);
 	}
 
-	/* Never do anything final in the polling timer */
-	if (!regs)
-		goto out;
-
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
 	if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
- out:
 	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
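A note on the bitmap helpers used in this handler: DECLARE_BITMAP(toclear,
MAX_NR_BANKS) expands to an unsigned long array with one bit per bank, and
the non-atomic __set_bit()/__clear_bit() variants are safe here because the
bitmap lives on the handler's own stack. A hedged sketch of the equivalent
open-coded form:

	/* roughly what DECLARE_BITMAP(toclear, MAX_NR_BANKS) declares: */
	unsigned long toclear[(MAX_NR_BANKS + BITS_PER_LONG - 1) / BITS_PER_LONG];

	/* and __set_bit(i, toclear) is, in effect: */
	toclear[i / BITS_PER_LONG] |= 1UL << (i % BITS_PER_LONG);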
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  * and historically has been the register value of the
  * MSR_IA32_THERMAL_STATUS (Intel) msr.
  */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
 	struct mce m;
 
-	memset(&m, 0, sizeof(m));
-	m.cpu = cpu;
+	mce_setup(&m);
 	m.bank = MCE_THERMAL_BANK;
 	m.status = status;
-	rdtscll(m.tsc);
 	mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
-	if (mce_available(&current_cpu_data))
-		do_machine_check(NULL, 0);
-}
+	struct timer_list *t = &per_cpu(mce_timer, data);
 
-static void mcheck_timer(struct work_struct *work)
-{
-	on_each_cpu(mcheck_check_cpu, NULL, 1);
+	WARN_ON(smp_processor_id() != data);
+
+	if (mce_available(&current_cpu_data))
+		machine_check_poll(MCP_TIMESTAMP);
 
 	/*
 	 * Alert userspace if needed. If we logged an MCE, reduce the
@@ -377,14 +476,21 @@ static void mcheck_timer(struct work_struct *work)
 		      (int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	schedule_delayed_work(&mcheck_work, next_interval);
+	t->expires = jiffies + next_interval;
+	add_timer(t);
 }
 
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
 /*
- * This is only called from process context. This is where we do
- * anything we need to alert userspace about new MCEs. This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
  */
 int mce_notify_user(void)
 {
@@ -394,9 +500,14 @@ int mce_notify_user(void)
 	unsigned long now = jiffies;
 
 	wake_up_interruptible(&mce_wait);
-	if (trigger[0])
-		call_usermodehelper(trigger, trigger_argv, NULL,
-				   UMH_NO_WAIT);
+
+	/*
+	 * There is no risk of missing notifications because
+	 * work_pending is always cleared before the function is
+	 * executed.
+	 */
+	if (trigger[0] && !work_pending(&mce_trigger_work))
+		schedule_work(&mce_trigger_work);
 
 	if (time_after_eq(now, last_print + (check_interval*HZ))) {
 		last_print = now;
@@ -425,63 +536,76 @@ static struct notifier_block mce_idle_notifier = {
 
 static __init int periodic_mcheck_init(void)
 {
-	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
-	idle_notifier_register(&mce_idle_notifier);
-	return 0;
+	idle_notifier_register(&mce_idle_notifier);
+	return 0;
 }
 __initcall(periodic_mcheck_init);
 
-
 /*
  * Initialize Machine Checks for a CPU.
  */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
 	u64 cap;
-	int i;
+	unsigned b;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	banks = cap & 0xff;
-	if (banks > MCE_EXTENDED_BANK) {
-		banks = MCE_EXTENDED_BANK;
-		printk(KERN_INFO "MCE: warning: using only %d banks\n",
-		       MCE_EXTENDED_BANK);
+	b = cap & 0xff;
+	if (b > MAX_NR_BANKS) {
+		printk(KERN_WARNING
+		       "MCE: Using only %u machine check banks out of %u\n",
+			MAX_NR_BANKS, b);
+		b = MAX_NR_BANKS;
 	}
+
+	/* Don't support asymmetric configurations today */
+	WARN_ON(banks != 0 && b != banks);
+	banks = b;
+	if (!bank) {
+		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+		if (!bank)
+			return -ENOMEM;
+		memset(bank, 0xff, banks * sizeof(u64));
+	}
+
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
-	/* Log the machine checks left over from the previous reset.
-	   This also clears all registers */
-	do_machine_check(NULL, mce_bootlog ? -1 : -2);
+	return 0;
+}
+
+static void mce_init(void *dummy)
+{
+	u64 cap;
+	int i;
+
+	/*
+	 * Log the machine checks left over from the previous reset.
+	 */
+	machine_check_poll(MCP_UC);
 
 	set_in_cr4(X86_CR4_MCE);
 
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS)
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-		else
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
 
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if(c->x86 == 15)
+		if (c->x86 == 15 && banks > 4)
 			/* disable GART TBL walk error reporting, which trips off
 			   incorrectly with the IOMMU & 3ware & Cerberus. */
-			clear_bit(10, &bank[4]);
+			clear_bit(10, (unsigned long *)&bank[4]);
 		if(c->x86 <= 17 && mce_bootlog < 0)
 			/* Lots of broken BIOS around that don't clear them
 			   by default and leave crap in there. Don't log. */
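The memset(bank, 0xff, ...) in mce_cap_init() leans on the meaning of the
per-bank control MSR: each element of bank[] is later written to that
bank's MCi_CTL register, where every set bit enables reporting of one
error source, so an all-ones pattern enables everything the bank
implements. A small sketch under that assumption (bank_nr is illustrative):

	int bank_nr = 4;			/* example bank           */
	u64 ctl_mask = ~0ULL;			/* enable all error types */

	ctl_mask &= ~(1ULL << 10);		/* mask off error source 10,
						   as the GART quirk above does */
	wrmsrl(MSR_IA32_MC0_CTL + 4*bank_nr, ctl_mask);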
@@ -504,20 +628,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
+static void mce_init_timer(void)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+
+	/* data race harmless because everyone sets to the same value */
+	if (!next_interval)
+		next_interval = check_interval * HZ;
+	if (!next_interval)
+		return;
+	setup_timer(t, mcheck_timer, smp_processor_id());
+	t->expires = round_jiffies_relative(jiffies + next_interval);
+	add_timer(t);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-	mce_cpu_quirks(c);
+	if (!mce_available(c))
+		return;
 
-	if (mce_dont_init ||
-	    !mce_available(c))
-		return;
+	if (mce_cap_init() < 0) {
+		mce_dont_init = 1;
+		return;
+	}
+	mce_cpu_quirks(c);
 
 	mce_init(NULL);
 	mce_cpu_features(c);
+	mce_init_timer();
 }
 
 /*
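Design note on mce_init_timer(): a CPU's bank MSRs can only be read from
that CPU, so the old single work item that broadcast with on_each_cpu() is
replaced by one self-rearming timer per CPU, each polling only its own
banks. A reduced sketch of the pattern, with illustrative names rather than
the patch's exact code:

	static DEFINE_PER_CPU(struct timer_list, poll_timer);
	static unsigned long poll_jiffies = 5 * 60 * HZ;

	static void poll_this_cpu(unsigned long cpu)	/* runs on 'cpu' */
	{
		struct timer_list *t = &per_cpu(poll_timer, cpu);

		/* ... read this CPU's own bank MSRs here ... */
		t->expires = jiffies + poll_jiffies;
		add_timer(t);			/* re-queues on the local CPU */
	}

	static void start_poll_timer(void)	/* called once on each CPU */
	{
		struct timer_list *t = &__get_cpu_var(poll_timer);

		setup_timer(t, poll_this_cpu, smp_processor_id());
		t->expires = round_jiffies_relative(jiffies + poll_jiffies);
		add_timer(t);
	}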
@@ -573,7 +715,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
 	unsigned long *cpu_tsc;
 	static DEFINE_MUTEX(mce_read_mutex);
-	unsigned next;
+	unsigned prev, next;
 	char __user *buf = ubuf;
 	int i, err;
 
@@ -592,25 +734,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	}
 
 	err = 0;
-	for (i = 0; i < next; i++) {
-		unsigned long start = jiffies;
-
-		while (!mcelog.entry[i].finished) {
-			if (time_after_eq(jiffies, start + 2)) {
-				memset(mcelog.entry + i,0, sizeof(struct mce));
-				goto timeout;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+
+			while (!mcelog.entry[i].finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(mcelog.entry + i, 0,
+					       sizeof(struct mce));
+					goto timeout;
+				}
+				cpu_relax();
 			}
-			cpu_relax();
+			smp_rmb();
+			err |= copy_to_user(buf, mcelog.entry + i,
+					    sizeof(struct mce));
+			buf += sizeof(struct mce);
+timeout:
+			;
 		}
-		smp_rmb();
-		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-		buf += sizeof(struct mce);
-	timeout:
-		;
-	}
 
-	memset(mcelog.entry, 0, next * sizeof(struct mce));
-	mcelog.next = 0;
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
 
 	synchronize_sched();
 
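The do/while added here closes a race with concurrent writers to the log
buffer: after copying entries [prev, next) out, the reader tries to swing
mcelog.next back to 0 with cmpxchg; if another MCE was logged in the
meantime, the cmpxchg fails, returns the advanced index, and the loop
drains the remainder. The core of the pattern reduced to a sketch, where
consume() stands in for the copy_to_user() sequence:

	unsigned prev, next;

	next = mcelog.next;		/* snapshot of the producer index */
	prev = 0;
	do {
		consume(prev, next);	/* drain entries [prev, next)     */
		prev = next;
		/* returns prev (success) only if no new entry raced in   */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);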
@@ -680,20 +829,6 @@ static struct miscdevice mce_log_device = {
 	&mce_chrdev_ops,
 };
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 /*
  * Old style boot options parsing. Only for compatibility.
  */
@@ -703,8 +838,7 @@ static int __init mcheck_disable(char *str)
 	return 1;
 }
 
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
    mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
    mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +862,29 @@ __setup("mce=", mcheck_enable);
  * Sysfs support
  */
 
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+	int i;
+
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+	return mce_disable();
+}
+
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
    Only one CPU is active at this time, the others get readded later using
    CPU hotplug. */
@@ -738,20 +895,24 @@ static int mce_resume(struct sys_device *dev)
 	return 0;
 }
 
+static void mce_cpu_restart(void *data)
+{
+	del_timer_sync(&__get_cpu_var(mce_timer));
+	if (mce_available(&current_cpu_data))
+		mce_init(NULL);
+	mce_init_timer();
+}
+
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	if (next_interval)
-		cancel_delayed_work(&mcheck_work);
-	/* Timer race is harmless here */
-	on_each_cpu(mce_init, NULL, 1);
 	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
+	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 static struct sysdev_class mce_sysclass = {
+	.suspend = mce_suspend,
+	.shutdown = mce_shutdown,
 	.resume = mce_resume,
 	.name = "machinecheck",
 };
@@ -778,16 +939,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			 char *buf)
+{
+	u64 b = bank[attr - bank_attrs];
+	return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			const char *buf, size_t siz)
+{
+	char *end;
+	u64 new = simple_strtoull(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	bank[attr - bank_attrs] = new;
+	mce_restart();
+	return end-buf;
+}
 
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 			char *buf)
@@ -814,8 +985,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
@@ -845,11 +1014,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 		if (err)
 			goto error;
 	}
+	for (i = 0; i < banks; i++) {
+		err = sysdev_create_file(&per_cpu(device_mce, cpu),
+					 &bank_attrs[i]);
+		if (err)
+			goto error2;
+	}
 	cpu_set(cpu, mce_device_initialized);
 
 	return 0;
+error2:
+	while (--i >= 0) {
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+				   &bank_attrs[i]);
+	}
 error:
-	while (i--) {
+	while (--i >= 0) {
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 				   mce_attributes[i]);
 	}
@@ -868,15 +1048,40 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	for (i = 0; mce_attributes[i]; i++)
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 			mce_attributes[i]);
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+			&bank_attrs[i]);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 	cpu_clear(cpu, mce_device_initialized);
 }
 
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+	int i;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
+	struct timer_list *t = &per_cpu(mce_timer, cpu);
 
 	switch (action) {
 	case CPU_ONLINE:
@@ -891,6 +1096,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 		threshold_cpu_callback(action, cpu);
 		mce_remove_device(cpu);
 		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		del_timer_sync(t);
+		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		t->expires = round_jiffies_relative(jiffies + next_interval);
+		add_timer_on(t, cpu);
+		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
+		break;
 	}
 	return NOTIFY_OK;
 }
@@ -899,6 +1115,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 	.notifier_call = mce_cpu_callback,
 };
 
+static __init int mce_init_banks(void)
+{
+	int i;
+
+	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+			     GFP_KERNEL);
+	if (!bank_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < banks; i++) {
+		struct sysdev_attribute *a = &bank_attrs[i];
+		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+		if (!a->attr.name)
+			goto nomem;
+		a->attr.mode = 0644;
+		a->show = show_bank;
+		a->store = set_bank;
+	}
+	return 0;
+
+nomem:
+	while (--i >= 0)
+		kfree(bank_attrs[i].attr.name);
+	kfree(bank_attrs);
+	bank_attrs = NULL;
+	return -ENOMEM;
+}
+
 static __init int mce_init_device(void)
 {
 	int err;
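Note how mce_init_banks() ties back to show_bank()/set_bank() earlier:
because every per-bank attribute lives in the single bank_attrs array, the
callbacks recover the bank number by pointer subtraction, so one pair of
handlers serves all of the bank%d files created here. A sketch of that
indexing idea (the helper name is illustrative; the sysfs path assumes
this era's sysdev layout):

	/* e.g. /sys/devices/system/machinecheck/machinecheck0/bank3 */
	static unsigned bank_index(struct sysdev_attribute *attr)
	{
		return attr - bank_attrs;   /* attr points into bank_attrs[] */
	}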
@@ -906,6 +1150,11 @@ static __init int mce_init_device(void)
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
+
+	err = mce_init_banks();
+	if (err)
+		return err;
+
 	err = sysdev_class_register(&mce_sysclass);
 	if (err)
 		return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index f2ee0ae29bd6..e82c8208b81e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void)
 	exit_idle();
 	irq_enter();
 
-	memset(&m, 0, sizeof(m));
-	rdtscll(m.tsc);
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			do_machine_check(NULL, 0);
+			machine_check_poll(MCP_TIMESTAMP);
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index f44c36624360..1b1491a76b55 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 	if (therm_throt_process(msr_val & 1))
-		mce_log_therm_throt_event(smp_processor_id(), msr_val);
+		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
 	irq_exit();