author		Andi Kleen <andi@firstfloor.org>	2009-05-27 15:56:55 -0400
committer	H. Peter Anvin <hpa@zytor.com>	2009-06-03 17:45:12 -0400
commit		3c0797925f4ef9d55a32059d2af61a9c262e639d
tree		7037a444ec7042352b33f6a7e24b255f9e4d9332
parent		f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db
x86, mce: switch x86 machine check handler to Monarch election.
On Intel platforms machine check exceptions are always broadcast to all CPUs. This patch makes the machine check handler synchronize all these machine checks, elect a Monarch to handle the event, collect the worst event from all CPUs and then process it first.

This has some advantages:

- When there is a truly data-corrupting error the system panics as quickly as possible. This improves containment of corrupted data and makes sure the corrupted data never hits stable storage.

- The panics are synchronized and do not reenter the panic code on multiple CPUs (which currently does not handle this well).

- All the errors are reported. Currently it often happens that another CPU does the panic first, but reports useless information (an empty machine check) because the real error happened on another CPU which came in later. This is a big advantage on Nehalem, where the 8 threads per CPU often lead to the wrong CPU winning the race and dumping useless information on a machine check. The problem also occurs in a less severe form on older CPUs.

- The system can detect when no CPU detected a machine check and shut down the system. This can happen when one CPU is so badly hung that it cannot process a machine check anymore, or when some external agent wants to stop the system by asserting the machine check pin. This follows Intel hardware recommendations.

- This matches the error model recommended by the CPU designers.

- The events can be output in true severity order.

- When a panic happens on another CPU it makes sure it is actually able to process the stop IPI by enabling interrupts.

The code is extremely careful to handle timeouts while waiting for other CPUs. It can't rely on the normal timing mechanisms (jiffies, ktime_get) because of its asynchronous/lockless nature, so it uses its own timeouts based on ndelay() and a "SPINUNIT". The timeout is configurable. By default it waits up to one second for the other CPUs. This can also be disabled.

From some informal testing, AMD systems do not seem to broadcast machine checks, so right now the synchronization is always disabled by default on non-Intel CPUs and also on very old Intel systems.

Includes fixes from Ying Huang
Fixed an "ecception" in a comment (H.Seto)
Moved the global_nwo reset later based on a suggestion from H.Seto
v2: Avoid duplicate messages

[ Impact: feature, fixes long standing problems. ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce.c	360
1 file changed, 331 insertions(+), 29 deletions(-)
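To make the election scheme easier to follow before reading the diff below, here is a minimal, hypothetical userspace sketch of the same rendezvous pattern using C11 atomics and pthreads. It is not part of the patch; the names NCPUS, scan_banks and machine_check are illustrative only, and it models just the callin/executing counters. The real handler runs in NMI context and additionally propagates no_way_out, grades severities in mce_reign() and bounds every spin loop with ndelay()/SPINUNIT timeouts.

/* Simplified userspace analogue of the Monarch election (illustrative only). */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4				/* illustrative "CPU" count */

static atomic_int mce_callin;		/* entry order; first caller is the Monarch */
static atomic_int mce_executing;	/* whose turn it is to scan its banks */

static void scan_banks(int order)
{
	printf("cpu with callin order %d scans its banks\n", order);
}

static void *machine_check(void *arg)
{
	int order = atomic_fetch_add(&mce_callin, 1) + 1;
	(void)arg;

	/* Rendezvous: wait until every CPU has entered the handler. */
	while (atomic_load(&mce_callin) != NCPUS)
		sched_yield();		/* stands in for ndelay(SPINUNIT) */

	/* The Monarch (order 1) starts; Subjects wait for their turn. */
	if (order == 1)
		atomic_store(&mce_executing, 1);
	while (atomic_load(&mce_executing) < order)
		sched_yield();

	scan_banks(order);

	/* Pass the baton to the next CPU in callin order. */
	atomic_fetch_add(&mce_executing, 1);

	if (order == 1) {
		/* Monarch: wait for everyone, then grade and reset. */
		while (atomic_load(&mce_executing) <= NCPUS)
			sched_yield();
		printf("monarch grades the collected events\n");
		atomic_store(&mce_callin, 0);
		atomic_store(&mce_executing, 0);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];
	int i;

	for (i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, machine_check, NULL);
	for (i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	return 0;
}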
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 421020f1d7db..ba431893e31d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/delay.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -28,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
+#include <linux/nmi.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
@@ -60,6 +62,8 @@ int mce_disabled;
 
 #define MISC_MCELOG_MINOR	227
 
+#define SPINUNIT 100	/* 100ns */
+
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -77,6 +81,7 @@ static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
+static int monarch_timeout = -1;
 
 static char trigger[128];
 static char *trigger_argv[2] = { trigger, NULL };
@@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL };
 static unsigned long dont_init_banks;
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int cpu_missing;
+
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 	}
 	if (final)
 		print_mce(final);
+	if (cpu_missing)
+		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	if (exp)
 		printk(KERN_EMERG "Machine check: %s\n", exp);
 	panic(msg);
@@ -451,18 +461,287 @@ static int mce_no_way_out(struct mce *m, char **msg)
 }
 
 /*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+	/*
+	 * The others already did panic for some reason.
+	 * Bail out like in a timeout.
+	 * rmb() to tell the compiler that system_state
+	 * might have been modified by someone else.
+	 */
+	rmb();
+	if (atomic_read(&mce_paniced))
+		wait_for_panic();
+	if (!monarch_timeout)
+		goto out;
+	if ((s64)*t < SPINUNIT) {
+		/* CHECKME: Make panic default for 1 too? */
+		if (tolerant < 1)
+			mce_panic("Timeout synchronizing machine check over CPUs",
+				  NULL, NULL);
+		cpu_missing = 1;
+		return 1;
+	}
+	*t -= SPINUNIT;
+out:
+	touch_nmi_watchdog();
+	return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of an machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+	int cpu;
+	struct mce *m = NULL;
+	int global_worst = 0;
+	char *msg = NULL;
+	char *nmsg = NULL;
+
+	/*
+	 * This CPU is the Monarch and the other CPUs have run
+	 * through their handlers.
+	 * Grade the severity of the errors of all the CPUs.
+	 */
+	for_each_possible_cpu(cpu) {
+		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+					    &nmsg);
+		if (severity > global_worst) {
+			msg = nmsg;
+			global_worst = severity;
+			m = &per_cpu(mces_seen, cpu);
+		}
+	}
+
+	/*
+	 * Cannot recover? Panic here then.
+	 * This dumps all the mces in the log buffer and stops the
+	 * other CPUs.
+	 */
+	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+		mce_panic("Fatal machine check", m, msg);
+
+	/*
+	 * For UC somewhere we let the CPU who detects it handle it.
+	 * Also must let continue the others, otherwise the handling
+	 * CPU could deadlock on a lock.
+	 */
+
+	/*
+	 * No machine check event found. Must be some external
+	 * source or one CPU is hung. Panic.
+	 */
+	if (!m && tolerant < 3)
+		mce_panic("Machine check from unknown source", NULL, NULL);
+
+	/*
+	 * Now clear all the mces_seen so that they don't reappear on
+	 * the next mce.
+	 */
+	for_each_possible_cpu(cpu)
+		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+	int nwo;
+	int cpus = num_online_cpus();
+	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+	if (!timeout) {
+		*order = -1;
+		return no_way_out;
+	}
+
+	atomic_add(no_way_out, &global_nwo);
+
+	/*
+	 * Wait for everyone.
+	 */
+	while (atomic_read(&mce_callin) != cpus) {
+		if (mce_timed_out(&timeout)) {
+			atomic_set(&global_nwo, 0);
+			*order = -1;
+			return no_way_out;
+		}
+		ndelay(SPINUNIT);
+	}
+
+	/*
+	 * Cache the global no_way_out state.
+	 */
+	nwo = atomic_read(&global_nwo);
+
+	/*
+	 * Monarch starts executing now, the others wait.
+	 */
+	if (*order == 1) {
+		atomic_set(&mce_executing, 1);
+		return nwo;
+	}
+
+	/*
+	 * Now start the scanning loop one by one
+	 * in the original callin order.
+	 * This way when there are any shared banks it will
+	 * be only seen by one CPU before cleared, avoiding duplicates.
+	 */
+	while (atomic_read(&mce_executing) < *order) {
+		if (mce_timed_out(&timeout)) {
+			atomic_set(&global_nwo, 0);
+			*order = -1;
+			return no_way_out;
+		}
+		ndelay(SPINUNIT);
+	}
+	return nwo;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+	int ret = -1;
+	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+	if (!timeout)
+		goto reset;
+	if (order < 0)
+		goto reset;
+
+	/*
+	 * Allow others to run.
+	 */
+	atomic_inc(&mce_executing);
+
+	if (order == 1) {
+		/* CHECKME: Can this race with a parallel hotplug? */
+		int cpus = num_online_cpus();
+
+		/*
+		 * Monarch: Wait for everyone to go through their scanning
+		 * loops.
+		 */
+		while (atomic_read(&mce_executing) <= cpus) {
+			if (mce_timed_out(&timeout))
+				goto reset;
+			ndelay(SPINUNIT);
+		}
+
+		mce_reign();
+		barrier();
+		ret = 0;
+	} else {
+		/*
+		 * Subject: Wait for Monarch to finish.
+		 */
+		while (atomic_read(&mce_executing) != 0) {
+			if (mce_timed_out(&timeout))
+				goto reset;
+			ndelay(SPINUNIT);
+		}
+
+		/*
+		 * Don't reset anything. That's done by the Monarch.
+		 */
+		return 0;
+	}
+
+	/*
+	 * Reset all global state.
+	 */
+reset:
+	atomic_set(&global_nwo, 0);
+	atomic_set(&mce_callin, 0);
+	barrier();
+
+	/*
+	 * Let others run again.
+	 */
+	atomic_set(&mce_executing, 0);
+	return ret;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+	int i;
+
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
  * This is executed in NMI context not subject to normal locking rules. This
  * implies that most kernel services cannot be safely used. Don't even
  * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
  */
 void do_machine_check(struct pt_regs *regs, long error_code)
 {
-	struct mce m, panicm;
-	int panicm_found = 0;
+	struct mce m, *final;
 	int i;
+	int worst = 0;
+	int severity;
+	/*
+	 * Establish sequential order between the CPUs entering the machine
+	 * check handler.
+	 */
+	int order;
+
 	/*
 	 * If no_way_out gets set, there is no safe way to recover from this
 	 * MCE. If tolerant is cranked up, we'll try anyway.
@@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (!banks)
 		goto out;
 
+	order = atomic_add_return(1, &mce_callin);
 	mce_setup(&m);
 
 	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	no_way_out = mce_no_way_out(&m, &msg);
 
+	final = &__get_cpu_var(mces_seen);
+	*final = m;
+
 	barrier();
 
+	/*
+	 * Go through all the banks in exclusion of the other CPUs.
+	 * This way we don't report duplicated events on shared banks
+	 * because the first one to see it will clear it.
+	 */
+	no_way_out = mce_start(no_way_out, &order);
 	for (i = 0; i < banks; i++) {
 		__clear_bit(i, toclear);
 		if (!bank[i])
@@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
-		/*
-		 * Did this bank cause the exception?
-		 *
-		 * Assume that the bank with uncorrectable errors did it,
-		 * and that there is only a single one:
-		 */
-		if ((m.status & MCI_STATUS_UC) &&
-		    (m.status & MCI_STATUS_EN)) {
-			panicm = m;
-			panicm_found = 1;
+		severity = mce_severity(&m, tolerant, NULL);
+		if (severity > worst) {
+			*final = m;
+			worst = severity;
 		}
 	}
 
+	if (!no_way_out)
+		mce_clear_state(toclear);
+
 	/*
-	 * If we didn't find an uncorrectable error, pick
-	 * the last one (shouldn't happen, just being safe).
+	 * Do most of the synchronization with other CPUs.
+	 * When there's any problem use only local no_way_out state.
 	 */
-	if (!panicm_found)
-		panicm = m;
+	if (mce_end(order) < 0)
+		no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
 	 * If we have decided that we just CAN'T continue, and the user
 	 * has not set tolerant to an insane level, give up and die.
+	 *
+	 * This is mainly used in the case when the system doesn't
+	 * support MCE broadcasting or it has been disabled.
 	 */
 	if (no_way_out && tolerant < 3)
-		mce_panic("Machine check", &panicm, msg);
+		mce_panic("Machine check", final, msg);
 
 	/*
 	 * If the error seems to be unrecoverable, something should be
@@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 * instruction which caused the MCE.
 		 */
 		if (m.mcgstatus & MCG_STATUS_EIPV)
-			user_space = panicm.ip && (panicm.cs & 3);
+			user_space = final->ip && (final->cs & 3);
 
 		/*
 		 * If we know that the error was in user space, send a
@@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (user_space) {
 			force_sig(SIGBUS, current);
 		} else if (panic_on_oops || tolerant < 2) {
-			mce_panic("Uncorrected machine check", &panicm, msg);
+			mce_panic("Uncorrected machine check", final, msg);
 		}
 	}
 
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
-	mce_report_event(regs);
-
-	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++) {
-		if (test_bit(i, toclear))
-			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-	}
+	if (worst > 0)
+		mce_report_event(regs);
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
 	atomic_dec(&mce_entry);
@@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 
 		if (c->x86 == 6 && c->x86_model < 0x1A)
 			__set_bit(0, &dont_init_banks);
+
+		/*
+		 * All newer Intel systems support MCE broadcasting. Enable
+		 * synchronization with a one second timeout.
+		 */
+		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+			monarch_timeout < 0)
+			monarch_timeout = USEC_PER_SEC;
 	}
+	if (monarch_timeout < 0)
+		monarch_timeout = 0;
 }
 
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = {
 
 /*
  * mce=off disables machine check
- * mce=TOLERANCELEVEL (number, see above)
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *	monarchtimeout is how long to wait for other CPUs on machine
+ *	check, or 0 to not wait
  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
  * mce=nobootlog Don't log MCEs from before booting.
  */
@@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str)
 		mce_disabled = 1;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 		mce_bootlog = (str[0] == 'b');
-	else if (isdigit(str[0]))
+	else if (isdigit(str[0])) {
 		get_option(&str, &tolerant);
-	else {
+		if (*str == ',') {
+			++str;
+			get_option(&str, &monarch_timeout);
+		}
+	} else {
 		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
 		       str);
 		return 0;
@@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
 
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
 
 static struct sysdev_ext_attribute attr_check_interval = {
 	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = {
 
 static struct sysdev_attribute *mce_attrs[] = {
 	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+	&attr_monarch_timeout.attr,
 	NULL
 };
 
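As a usage illustration (the values here are examples, not from the patch): since the code above multiplies monarch_timeout by NSEC_PER_USEC and defaults it to USEC_PER_SEC, the value appears to be in microseconds, so booting with something like mce=1,500000 would set tolerant=1 and wait roughly 500 ms for the other CPUs, while mce=1,0 would disable the synchronization wait entirely. The same value can presumably also be adjusted at runtime through the new monarch_timeout attribute exported next to tolerant in the machinecheck sysdev directory.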