aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c360
1 files changed, 331 insertions, 29 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 421020f1d7db..ba431893e31d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/sysdev.h> 23#include <linux/sysdev.h>
24#include <linux/delay.h>
24#include <linux/ctype.h> 25#include <linux/ctype.h>
25#include <linux/sched.h> 26#include <linux/sched.h>
26#include <linux/sysfs.h> 27#include <linux/sysfs.h>
@@ -28,6 +29,7 @@
28#include <linux/init.h> 29#include <linux/init.h>
29#include <linux/kmod.h> 30#include <linux/kmod.h>
30#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/nmi.h>
31#include <linux/cpu.h> 33#include <linux/cpu.h>
32#include <linux/smp.h> 34#include <linux/smp.h>
33#include <linux/fs.h> 35#include <linux/fs.h>
@@ -60,6 +62,8 @@ int mce_disabled;
60 62
61#define MISC_MCELOG_MINOR 227 63#define MISC_MCELOG_MINOR 227
62 64
65#define SPINUNIT 100 /* 100ns */
66
63atomic_t mce_entry; 67atomic_t mce_entry;
64 68
65DEFINE_PER_CPU(unsigned, mce_exception_count); 69DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -77,6 +81,7 @@ static u64 *bank;
77static unsigned long notify_user; 81static unsigned long notify_user;
78static int rip_msr; 82static int rip_msr;
79static int mce_bootlog = -1; 83static int mce_bootlog = -1;
84static int monarch_timeout = -1;
80 85
81static char trigger[128]; 86static char trigger[128];
82static char *trigger_argv[2] = { trigger, NULL }; 87static char *trigger_argv[2] = { trigger, NULL };
@@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL };
84static unsigned long dont_init_banks; 89static unsigned long dont_init_banks;
85 90
86static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 91static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
92static DEFINE_PER_CPU(struct mce, mces_seen);
93static int cpu_missing;
94
87 95
88/* MCA banks polled by the period polling timer for corrected events */ 96/* MCA banks polled by the period polling timer for corrected events */
89DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 97DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
241 } 249 }
242 if (final) 250 if (final)
243 print_mce(final); 251 print_mce(final);
252 if (cpu_missing)
253 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
244 if (exp) 254 if (exp)
245 printk(KERN_EMERG "Machine check: %s\n", exp); 255 printk(KERN_EMERG "Machine check: %s\n", exp);
246 panic(msg); 256 panic(msg);
@@ -451,18 +461,287 @@ static int mce_no_way_out(struct mce *m, char **msg)
451} 461}
452 462
453/* 463/*
464 * Variable to establish order between CPUs while scanning.
465 * Each CPU spins initially until mce_executing reaches its own entry-order number.
466 */
467static atomic_t mce_executing;
468
469/*
470 * Defines order of CPUs on entry. First CPU becomes Monarch.
471 */
472static atomic_t mce_callin;
473
474/*
475 * Check if a timeout waiting for other CPUs happened.
476 */
477static int mce_timed_out(u64 *t)
478{
479 /*
480 * The others already did panic for some reason.
481 * Bail out like in a timeout.
482 * rmb() to tell the compiler that system_state
483 * might have been modified by someone else.
484 */
485 rmb();
486 if (atomic_read(&mce_paniced))
487 wait_for_panic();
488 if (!monarch_timeout)
489 goto out;
490 if ((s64)*t < SPINUNIT) {
491 /* CHECKME: Make panic default for 1 too? */
492 if (tolerant < 1)
493 mce_panic("Timeout synchronizing machine check over CPUs",
494 NULL, NULL);
495 cpu_missing = 1;
496 return 1;
497 }
498 *t -= SPINUNIT;
499out:
500 touch_nmi_watchdog();
501 return 0;
502}
503
504/*
505 * The Monarch's reign. The Monarch is the CPU who entered
506 * the machine check handler first. It waits for the others to
507 * raise the exception too and then grades them. When any
508 * error is fatal panic. Only then let the others continue.
509 *
510 * The other CPUs entering the MCE handler will be controlled by the
511 * Monarch. They are called Subjects.
512 *
513 * This way we prevent any potential data corruption in a unrecoverable case
514 * and also makes sure always all CPU's errors are examined.
515 *
516 * Also this detects the case of an machine check event coming from outer
517 * space (not detected by any CPUs) In this case some external agent wants
518 * us to shut down, so panic too.
519 *
520 * The other CPUs might still decide to panic if the handler happens
521 * in a unrecoverable place, but in this case the system is in a semi-stable
522 * state and won't corrupt anything by itself. It's ok to let the others
523 * continue for a bit first.
524 *
525 * All the spin loops have timeouts; when a timeout happens a CPU
526 * typically elects itself to be Monarch.
527 */
528static void mce_reign(void)
529{
530 int cpu;
531 struct mce *m = NULL;
532 int global_worst = 0;
533 char *msg = NULL;
534 char *nmsg = NULL;
535
536 /*
537 * This CPU is the Monarch and the other CPUs have run
538 * through their handlers.
539 * Grade the severity of the errors of all the CPUs.
540 */
541 for_each_possible_cpu(cpu) {
542 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
543 &nmsg);
544 if (severity > global_worst) {
545 msg = nmsg;
546 global_worst = severity;
547 m = &per_cpu(mces_seen, cpu);
548 }
549 }
550
551 /*
552 * Cannot recover? Panic here then.
553 * This dumps all the mces in the log buffer and stops the
554 * other CPUs.
555 */
556 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
557 mce_panic("Fatal machine check", m, msg);
558
559 /*
560 * For UC somewhere we let the CPU who detects it handle it.
561 * Also must let continue the others, otherwise the handling
562 * CPU could deadlock on a lock.
563 */
564
565 /*
566 * No machine check event found. Must be some external
567 * source or one CPU is hung. Panic.
568 */
569 if (!m && tolerant < 3)
570 mce_panic("Machine check from unknown source", NULL, NULL);
571
572 /*
573 * Now clear all the mces_seen so that they don't reappear on
574 * the next mce.
575 */
576 for_each_possible_cpu(cpu)
577 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
578}
579
580static atomic_t global_nwo;
581
582/*
583 * Start of Monarch synchronization. This waits until all CPUs have
584 * entered the exception handler and then determines if any of them
585 * saw a fatal event that requires panic. Then it executes them
586 * in the entry order.
587 * TBD double check parallel CPU hotunplug
588 */
589static int mce_start(int no_way_out, int *order)
590{
591 int nwo;
592 int cpus = num_online_cpus();
593 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
594
595 if (!timeout) {
596 *order = -1;
597 return no_way_out;
598 }
599
600 atomic_add(no_way_out, &global_nwo);
601
602 /*
603 * Wait for everyone.
604 */
605 while (atomic_read(&mce_callin) != cpus) {
606 if (mce_timed_out(&timeout)) {
607 atomic_set(&global_nwo, 0);
608 *order = -1;
609 return no_way_out;
610 }
611 ndelay(SPINUNIT);
612 }
613
614 /*
615 * Cache the global no_way_out state.
616 */
617 nwo = atomic_read(&global_nwo);
618
619 /*
620 * Monarch starts executing now, the others wait.
621 */
622 if (*order == 1) {
623 atomic_set(&mce_executing, 1);
624 return nwo;
625 }
626
627 /*
628 * Now start the scanning loop one by one
629 * in the original callin order.
630 * This way when there are any shared banks it will
631 * be only seen by one CPU before cleared, avoiding duplicates.
632 */
633 while (atomic_read(&mce_executing) < *order) {
634 if (mce_timed_out(&timeout)) {
635 atomic_set(&global_nwo, 0);
636 *order = -1;
637 return no_way_out;
638 }
639 ndelay(SPINUNIT);
640 }
641 return nwo;
642}
643
644/*
645 * Synchronize between CPUs after main scanning loop.
646 * This invokes the bulk of the Monarch processing.
647 */
648static int mce_end(int order)
649{
650 int ret = -1;
651 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
652
653 if (!timeout)
654 goto reset;
655 if (order < 0)
656 goto reset;
657
658 /*
659 * Allow others to run.
660 */
661 atomic_inc(&mce_executing);
662
663 if (order == 1) {
664 /* CHECKME: Can this race with a parallel hotplug? */
665 int cpus = num_online_cpus();
666
667 /*
668 * Monarch: Wait for everyone to go through their scanning
669 * loops.
670 */
671 while (atomic_read(&mce_executing) <= cpus) {
672 if (mce_timed_out(&timeout))
673 goto reset;
674 ndelay(SPINUNIT);
675 }
676
677 mce_reign();
678 barrier();
679 ret = 0;
680 } else {
681 /*
682 * Subject: Wait for Monarch to finish.
683 */
684 while (atomic_read(&mce_executing) != 0) {
685 if (mce_timed_out(&timeout))
686 goto reset;
687 ndelay(SPINUNIT);
688 }
689
690 /*
691 * Don't reset anything. That's done by the Monarch.
692 */
693 return 0;
694 }
695
696 /*
697 * Reset all global state.
698 */
699reset:
700 atomic_set(&global_nwo, 0);
701 atomic_set(&mce_callin, 0);
702 barrier();
703
704 /*
705 * Let others run again.
706 */
707 atomic_set(&mce_executing, 0);
708 return ret;
709}
710
711static void mce_clear_state(unsigned long *toclear)
712{
713 int i;
714
715 for (i = 0; i < banks; i++) {
716 if (test_bit(i, toclear))
717 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
718 }
719}
720
721/*
454 * The actual machine check handler. This only handles real 722 * The actual machine check handler. This only handles real
455 * exceptions when something got corrupted coming in through int 18. 723 * exceptions when something got corrupted coming in through int 18.
456 * 724 *
457 * This is executed in NMI context not subject to normal locking rules. This 725 * This is executed in NMI context not subject to normal locking rules. This
458 * implies that most kernel services cannot be safely used. Don't even 726 * implies that most kernel services cannot be safely used. Don't even
459 * think about putting a printk in there! 727 * think about putting a printk in there!
728 *
729 * On Intel systems this is entered on all CPUs in parallel through
730 * MCE broadcast. However some CPUs might be broken beyond repair,
731 * so always be careful when synchronizing with others.
460 */ 732 */
461void do_machine_check(struct pt_regs *regs, long error_code) 733void do_machine_check(struct pt_regs *regs, long error_code)
462{ 734{
463 struct mce m, panicm; 735 struct mce m, *final;
464 int panicm_found = 0;
465 int i; 736 int i;
737 int worst = 0;
738 int severity;
739 /*
740 * Establish sequential order between the CPUs entering the machine
741 * check handler.
742 */
743 int order;
744
466 /* 745 /*
467 * If no_way_out gets set, there is no safe way to recover from this 746 * If no_way_out gets set, there is no safe way to recover from this
468 * MCE. If tolerant is cranked up, we'll try anyway. 747 * MCE. If tolerant is cranked up, we'll try anyway.
@@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
486 if (!banks) 765 if (!banks)
487 goto out; 766 goto out;
488 767
768 order = atomic_add_return(1, &mce_callin);
489 mce_setup(&m); 769 mce_setup(&m);
490 770
491 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 771 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
492 no_way_out = mce_no_way_out(&m, &msg); 772 no_way_out = mce_no_way_out(&m, &msg);
493 773
774 final = &__get_cpu_var(mces_seen);
775 *final = m;
776
494 barrier(); 777 barrier();
495 778
779 /*
780 * Go through all the banks in exclusion of the other CPUs.
781 * This way we don't report duplicated events on shared banks
782 * because the first one to see it will clear it.
783 */
784 no_way_out = mce_start(no_way_out, &order);
496 for (i = 0; i < banks; i++) { 785 for (i = 0; i < banks; i++) {
497 __clear_bit(i, toclear); 786 __clear_bit(i, toclear);
498 if (!bank[i]) 787 if (!bank[i])
@@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code)
544 mce_get_rip(&m, regs); 833 mce_get_rip(&m, regs);
545 mce_log(&m); 834 mce_log(&m);
546 835
547 /* 836 severity = mce_severity(&m, tolerant, NULL);
548 * Did this bank cause the exception? 837 if (severity > worst) {
549 * 838 *final = m;
550 * Assume that the bank with uncorrectable errors did it, 839 worst = severity;
551 * and that there is only a single one:
552 */
553 if ((m.status & MCI_STATUS_UC) &&
554 (m.status & MCI_STATUS_EN)) {
555 panicm = m;
556 panicm_found = 1;
557 } 840 }
558 } 841 }
559 842
843 if (!no_way_out)
844 mce_clear_state(toclear);
845
560 /* 846 /*
561 * If we didn't find an uncorrectable error, pick 847 * Do most of the synchronization with other CPUs.
562 * the last one (shouldn't happen, just being safe). 848 * When there's any problem use only local no_way_out state.
563 */ 849 */
564 if (!panicm_found) 850 if (mce_end(order) < 0)
565 panicm = m; 851 no_way_out = worst >= MCE_PANIC_SEVERITY;
566 852
567 /* 853 /*
568 * If we have decided that we just CAN'T continue, and the user 854 * If we have decided that we just CAN'T continue, and the user
569 * has not set tolerant to an insane level, give up and die. 855 * has not set tolerant to an insane level, give up and die.
856 *
857 * This is mainly used in the case when the system doesn't
858 * support MCE broadcasting or it has been disabled.
570 */ 859 */
571 if (no_way_out && tolerant < 3) 860 if (no_way_out && tolerant < 3)
572 mce_panic("Machine check", &panicm, msg); 861 mce_panic("Machine check", final, msg);
573 862
574 /* 863 /*
575 * If the error seems to be unrecoverable, something should be 864 * If the error seems to be unrecoverable, something should be
@@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
585 * instruction which caused the MCE. 874 * instruction which caused the MCE.
586 */ 875 */
587 if (m.mcgstatus & MCG_STATUS_EIPV) 876 if (m.mcgstatus & MCG_STATUS_EIPV)
588 user_space = panicm.ip && (panicm.cs & 3); 877 user_space = final->ip && (final->cs & 3);
589 878
590 /* 879 /*
591 * If we know that the error was in user space, send a 880 * If we know that the error was in user space, send a
@@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
597 if (user_space) { 886 if (user_space) {
598 force_sig(SIGBUS, current); 887 force_sig(SIGBUS, current);
599 } else if (panic_on_oops || tolerant < 2) { 888 } else if (panic_on_oops || tolerant < 2) {
600 mce_panic("Uncorrected machine check", &panicm, msg); 889 mce_panic("Uncorrected machine check", final, msg);
601 } 890 }
602 } 891 }
603 892
604 /* notify userspace ASAP */ 893 /* notify userspace ASAP */
605 set_thread_flag(TIF_MCE_NOTIFY); 894 set_thread_flag(TIF_MCE_NOTIFY);
606 895
607 mce_report_event(regs); 896 if (worst > 0)
608 897 mce_report_event(regs);
609 /* the last thing we do is clear state */
610 for (i = 0; i < banks; i++) {
611 if (test_bit(i, toclear))
612 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
613 }
614 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); 898 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
615out: 899out:
616 atomic_dec(&mce_entry); 900 atomic_dec(&mce_entry);
@@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
821 1105
822 if (c->x86 == 6 && c->x86_model < 0x1A) 1106 if (c->x86 == 6 && c->x86_model < 0x1A)
823 __set_bit(0, &dont_init_banks); 1107 __set_bit(0, &dont_init_banks);
1108
1109 /*
1110 * All newer Intel systems support MCE broadcasting. Enable
1111 * synchronization with a one second timeout.
1112 */
1113 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1114 monarch_timeout < 0)
1115 monarch_timeout = USEC_PER_SEC;
824 } 1116 }
1117 if (monarch_timeout < 0)
1118 monarch_timeout = 0;
825} 1119}
826 1120
827static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1121static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = {
1068 1362
1069/* 1363/*
1070 * mce=off disables machine check 1364 * mce=off disables machine check
1071 * mce=TOLERANCELEVEL (number, see above) 1365 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1366 * monarchtimeout is how long to wait for other CPUs on machine
1367 * check, or 0 to not wait
1072 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1368 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1073 * mce=nobootlog Don't log MCEs from before booting. 1369 * mce=nobootlog Don't log MCEs from before booting.
1074 */ 1370 */
@@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str)
1082 mce_disabled = 1; 1378 mce_disabled = 1;
1083 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1379 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1084 mce_bootlog = (str[0] == 'b'); 1380 mce_bootlog = (str[0] == 'b');
1085 else if (isdigit(str[0])) 1381 else if (isdigit(str[0])) {
1086 get_option(&str, &tolerant); 1382 get_option(&str, &tolerant);
1087 else { 1383 if (*str == ',') {
1384 ++str;
1385 get_option(&str, &monarch_timeout);
1386 }
1387 } else {
1088 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", 1388 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1089 str); 1389 str);
1090 return 0; 1390 return 0;
@@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
1221 1521
1222static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1522static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1223static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1523static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1524static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1224 1525
1225static struct sysdev_ext_attribute attr_check_interval = { 1526static struct sysdev_ext_attribute attr_check_interval = {
1226 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1527 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = {
1230 1531
1231static struct sysdev_attribute *mce_attrs[] = { 1532static struct sysdev_attribute *mce_attrs[] = {
1232 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger, 1533 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1534 &attr_monarch_timeout.attr,
1233 NULL 1535 NULL
1234}; 1536};
1235 1537