author		Andi Kleen <andi@firstfloor.org>	2009-05-27 15:56:55 -0400
committer	H. Peter Anvin <hpa@zytor.com>	2009-06-03 17:45:12 -0400
commit		3c0797925f4ef9d55a32059d2af61a9c262e639d (patch)
tree		7037a444ec7042352b33f6a7e24b255f9e4d9332	/arch/x86/kernel/cpu/mcheck/mce.c
parent		f94b61c2c9fdcc90773c49df9ccf9ede3ad0d7db (diff)
x86, mce: switch x86 machine check handler to Monarch election.
On Intel platforms machine check exceptions are always broadcast to all CPUs. This patch makes the machine check handler synchronize all these machine checks, elect a Monarch to handle the event, collect the worst event from all CPUs and then process it first.

This has some advantages:

- When there is a truly data corrupting error the system panics as quickly as possible. This improves containment of corrupted data and makes sure the corrupted data never hits stable storage.

- The panics are synchronized and do not reenter the panic code on multiple CPUs (which currently does not handle this well).

- All the errors are reported. Currently it often happens that another CPU happens to do the panic first, but reports useless information (empty machine check) because the real error happened on another CPU which came in later. This is a big advantage on Nehalem, where the 8 threads per CPU often lead to the wrong CPU winning the race and dumping useless information on a machine check. The problem also occurs in a less severe form on older CPUs.

- The system can detect when no CPU detected a machine check and shut down the system. This can happen when one CPU is so badly hung that it cannot process a machine check anymore, or when some external agent wants to stop the system by asserting the machine check pin. This follows Intel hardware recommendations.

- This matches the recommended error model by the CPU designers.

- The events can be output in true severity order.

- When a panic happens on another CPU it makes sure to actually be able to process the stop IPI by enabling interrupts.

The code is extremely careful to handle timeouts while waiting for other CPUs. It can't rely on the normal timing mechanisms (jiffies, ktime_get) because of its asynchronous/lockless nature, so it uses its own timeouts based on ndelay() and a "SPINUNIT".

The timeout is configurable. By default it waits for up to one second for the other CPUs. This can also be disabled.

From some informal testing AMD systems do not seem to broadcast machine checks, so right now it's always disabled by default on non-Intel CPUs and also on very old Intel systems.

Includes fixes from Ying Huang
Fixed an "ecception" in a comment (H.Seto)
Moved global_nwo reset later based on suggestion from H.Seto
v2: Avoid duplicate messages

[ Impact: feature, fixes long standing problems. ]

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
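To make the flow described above easier to follow, here is a minimal C sketch of the election and ordered scanning that the patch implements. The names handle_mce_on_this_cpu, scan_machine_check_banks and grade_and_maybe_panic are illustrative placeholders, not functions from the patch; the real logic, including timeouts, no_way_out propagation and state reset, is in mce_start(), mce_end() and mce_reign() in the diff below.

	/* Simplified sketch of the Monarch rendezvous (see mce_start()/mce_end() below). */
	static atomic_t callin;	/* order of arrival; first caller becomes the Monarch */
	static atomic_t executing;	/* how many CPUs have finished scanning their banks */

	static void handle_mce_on_this_cpu(int ncpus)
	{
		int order = atomic_add_return(1, &callin);

		/* Wait until every CPU has entered the handler. */
		while (atomic_read(&callin) != ncpus)
			cpu_relax();		/* real code: ndelay(SPINUNIT) with a timeout */

		/* Scan banks one CPU at a time, in callin order, so shared banks
		 * are seen and cleared by exactly one CPU. */
		while (atomic_read(&executing) < order - 1)
			cpu_relax();
		scan_machine_check_banks();	/* placeholder for the per-bank loop */
		atomic_inc(&executing);

		if (order == 1) {
			/* Monarch: wait for all Subjects, then grade the collected errors. */
			while (atomic_read(&executing) < ncpus)
				cpu_relax();
			grade_and_maybe_panic();	/* mce_reign() in the patch */
			atomic_set(&executing, 0);	/* release the Subjects */
			atomic_set(&callin, 0);
		} else {
			/* Subject: wait for the Monarch to finish before returning. */
			while (atomic_read(&executing) != 0)
				cpu_relax();
		}
	}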
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce.c	360
1 files changed, 331 insertions, 29 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 421020f1d7d..ba431893e31 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/string.h>
 #include <linux/sysdev.h>
+#include <linux/delay.h>
 #include <linux/ctype.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -28,6 +29,7 @@
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/poll.h>
+#include <linux/nmi.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
@@ -60,6 +62,8 @@ int mce_disabled;
 
 #define MISC_MCELOG_MINOR	227
 
+#define SPINUNIT 100	/* 100ns */
+
 atomic_t mce_entry;
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
@@ -77,6 +81,7 @@ static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
+static int monarch_timeout = -1;
 
 static char trigger[128];
 static char *trigger_argv[2] = { trigger, NULL };
@@ -84,6 +89,9 @@ static char *trigger_argv[2] = { trigger, NULL };
 static unsigned long dont_init_banks;
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static int cpu_missing;
+
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -241,6 +249,8 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 	}
 	if (final)
 		print_mce(final);
+	if (cpu_missing)
+		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	if (exp)
 		printk(KERN_EMERG "Machine check: %s\n", exp);
 	panic(msg);
@@ -451,18 +461,287 @@ static int mce_no_way_out(struct mce *m, char **msg)
 }
 
 /*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+	/*
+	 * The others already did panic for some reason.
+	 * Bail out like in a timeout.
+	 * rmb() to tell the compiler that system_state
+	 * might have been modified by someone else.
+	 */
+	rmb();
+	if (atomic_read(&mce_paniced))
+		wait_for_panic();
+	if (!monarch_timeout)
+		goto out;
+	if ((s64)*t < SPINUNIT) {
+		/* CHECKME: Make panic default for 1 too? */
+		if (tolerant < 1)
+			mce_panic("Timeout synchronizing machine check over CPUs",
+				  NULL, NULL);
+		cpu_missing = 1;
+		return 1;
+	}
+	*t -= SPINUNIT;
+out:
+	touch_nmi_watchdog();
+	return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of an machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+	int cpu;
+	struct mce *m = NULL;
+	int global_worst = 0;
+	char *msg = NULL;
+	char *nmsg = NULL;
+
+	/*
+	 * This CPU is the Monarch and the other CPUs have run
+	 * through their handlers.
+	 * Grade the severity of the errors of all the CPUs.
+	 */
+	for_each_possible_cpu(cpu) {
+		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+					    &nmsg);
+		if (severity > global_worst) {
+			msg = nmsg;
+			global_worst = severity;
+			m = &per_cpu(mces_seen, cpu);
+		}
+	}
+
+	/*
+	 * Cannot recover? Panic here then.
+	 * This dumps all the mces in the log buffer and stops the
+	 * other CPUs.
+	 */
+	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+		mce_panic("Fatal machine check", m, msg);
+
+	/*
+	 * For UC somewhere we let the CPU who detects it handle it.
+	 * Also must let continue the others, otherwise the handling
+	 * CPU could deadlock on a lock.
+	 */
+
+	/*
+	 * No machine check event found. Must be some external
+	 * source or one CPU is hung. Panic.
+	 */
+	if (!m && tolerant < 3)
+		mce_panic("Machine check from unknown source", NULL, NULL);
+
+	/*
+	 * Now clear all the mces_seen so that they don't reappear on
+	 * the next mce.
+	 */
+	for_each_possible_cpu(cpu)
+		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int no_way_out, int *order)
+{
+	int nwo;
+	int cpus = num_online_cpus();
+	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+	if (!timeout) {
+		*order = -1;
+		return no_way_out;
+	}
+
+	atomic_add(no_way_out, &global_nwo);
+
+	/*
+	 * Wait for everyone.
+	 */
+	while (atomic_read(&mce_callin) != cpus) {
+		if (mce_timed_out(&timeout)) {
+			atomic_set(&global_nwo, 0);
+			*order = -1;
+			return no_way_out;
+		}
+		ndelay(SPINUNIT);
+	}
+
+	/*
+	 * Cache the global no_way_out state.
+	 */
+	nwo = atomic_read(&global_nwo);
+
+	/*
+	 * Monarch starts executing now, the others wait.
+	 */
+	if (*order == 1) {
+		atomic_set(&mce_executing, 1);
+		return nwo;
+	}
+
+	/*
+	 * Now start the scanning loop one by one
+	 * in the original callin order.
+	 * This way when there are any shared banks it will
+	 * be only seen by one CPU before cleared, avoiding duplicates.
+	 */
+	while (atomic_read(&mce_executing) < *order) {
+		if (mce_timed_out(&timeout)) {
+			atomic_set(&global_nwo, 0);
+			*order = -1;
+			return no_way_out;
+		}
+		ndelay(SPINUNIT);
+	}
+	return nwo;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static int mce_end(int order)
+{
+	int ret = -1;
+	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+
+	if (!timeout)
+		goto reset;
+	if (order < 0)
+		goto reset;
+
+	/*
+	 * Allow others to run.
+	 */
+	atomic_inc(&mce_executing);
+
+	if (order == 1) {
+		/* CHECKME: Can this race with a parallel hotplug? */
+		int cpus = num_online_cpus();
+
+		/*
+		 * Monarch: Wait for everyone to go through their scanning
+		 * loops.
+		 */
+		while (atomic_read(&mce_executing) <= cpus) {
+			if (mce_timed_out(&timeout))
+				goto reset;
+			ndelay(SPINUNIT);
+		}
+
+		mce_reign();
+		barrier();
+		ret = 0;
+	} else {
+		/*
+		 * Subject: Wait for Monarch to finish.
+		 */
+		while (atomic_read(&mce_executing) != 0) {
+			if (mce_timed_out(&timeout))
+				goto reset;
+			ndelay(SPINUNIT);
+		}
+
+		/*
+		 * Don't reset anything. That's done by the Monarch.
+		 */
+		return 0;
+	}
+
+	/*
+	 * Reset all global state.
+	 */
+reset:
+	atomic_set(&global_nwo, 0);
+	atomic_set(&mce_callin, 0);
+	barrier();
+
+	/*
+	 * Let others run again.
+	 */
+	atomic_set(&mce_executing, 0);
+	return ret;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+	int i;
+
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
  * This is executed in NMI context not subject to normal locking rules. This
  * implies that most kernel services cannot be safely used. Don't even
  * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
  */
 void do_machine_check(struct pt_regs *regs, long error_code)
 {
-	struct mce m, panicm;
-	int panicm_found = 0;
+	struct mce m, *final;
 	int i;
+	int worst = 0;
+	int severity;
+	/*
+	 * Establish sequential order between the CPUs entering the machine
+	 * check handler.
+	 */
+	int order;
+
 	/*
 	 * If no_way_out gets set, there is no safe way to recover from this
 	 * MCE. If tolerant is cranked up, we'll try anyway.
@@ -486,13 +765,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (!banks)
 		goto out;
 
+	order = atomic_add_return(1, &mce_callin);
 	mce_setup(&m);
 
 	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	no_way_out = mce_no_way_out(&m, &msg);
 
+	final = &__get_cpu_var(mces_seen);
+	*final = m;
+
 	barrier();
 
+	/*
+	 * Go through all the banks in exclusion of the other CPUs.
+	 * This way we don't report duplicated events on shared banks
+	 * because the first one to see it will clear it.
+	 */
+	no_way_out = mce_start(no_way_out, &order);
 	for (i = 0; i < banks; i++) {
 		__clear_bit(i, toclear);
 		if (!bank[i])
@@ -544,32 +833,32 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		mce_get_rip(&m, regs);
 		mce_log(&m);
 
-		/*
-		 * Did this bank cause the exception?
-		 *
-		 * Assume that the bank with uncorrectable errors did it,
-		 * and that there is only a single one:
-		 */
-		if ((m.status & MCI_STATUS_UC) &&
-		    (m.status & MCI_STATUS_EN)) {
-			panicm = m;
-			panicm_found = 1;
+		severity = mce_severity(&m, tolerant, NULL);
+		if (severity > worst) {
+			*final = m;
+			worst = severity;
 		}
 	}
 
+	if (!no_way_out)
+		mce_clear_state(toclear);
+
 	/*
-	 * If we didn't find an uncorrectable error, pick
-	 * the last one (shouldn't happen, just being safe).
+	 * Do most of the synchronization with other CPUs.
+	 * When there's any problem use only local no_way_out state.
 	 */
-	if (!panicm_found)
-		panicm = m;
+	if (mce_end(order) < 0)
+		no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
 	 * If we have decided that we just CAN'T continue, and the user
 	 * has not set tolerant to an insane level, give up and die.
+	 *
+	 * This is mainly used in the case when the system doesn't
+	 * support MCE broadcasting or it has been disabled.
 	 */
 	if (no_way_out && tolerant < 3)
-		mce_panic("Machine check", &panicm, msg);
+		mce_panic("Machine check", final, msg);
 
 	/*
 	 * If the error seems to be unrecoverable, something should be
@@ -585,7 +874,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 * instruction which caused the MCE.
 		 */
 		if (m.mcgstatus & MCG_STATUS_EIPV)
-			user_space = panicm.ip && (panicm.cs & 3);
+			user_space = final->ip && (final->cs & 3);
 
 		/*
 		 * If we know that the error was in user space, send a
@@ -597,20 +886,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (user_space) {
 			force_sig(SIGBUS, current);
 		} else if (panic_on_oops || tolerant < 2) {
-			mce_panic("Uncorrected machine check", &panicm, msg);
+			mce_panic("Uncorrected machine check", final, msg);
 		}
 	}
 
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
-	mce_report_event(regs);
-
-	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++) {
-		if (test_bit(i, toclear))
-			mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
-	}
+	if (worst > 0)
+		mce_report_event(regs);
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
 	atomic_dec(&mce_entry);
@@ -821,7 +1105,17 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 
 	if (c->x86 == 6 && c->x86_model < 0x1A)
 		__set_bit(0, &dont_init_banks);
+
+	/*
+	 * All newer Intel systems support MCE broadcasting. Enable
+	 * synchronization with a one second timeout.
+	 */
+	if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+			monarch_timeout < 0)
+		monarch_timeout = USEC_PER_SEC;
 	}
+	if (monarch_timeout < 0)
+		monarch_timeout = 0;
 }
 
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1068,7 +1362,9 @@ static struct miscdevice mce_log_device = {
 
 /*
  * mce=off disables machine check
- * mce=TOLERANCELEVEL (number, see above)
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *	monarchtimeout is how long to wait for other CPUs on machine
+ *	check, or 0 to not wait
  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
  * mce=nobootlog Don't log MCEs from before booting.
  */
@@ -1082,9 +1378,13 @@ static int __init mcheck_enable(char *str)
 		mce_disabled = 1;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 		mce_bootlog = (str[0] == 'b');
-	else if (isdigit(str[0]))
+	else if (isdigit(str[0])) {
 		get_option(&str, &tolerant);
-	else {
+		if (*str == ',') {
+			++str;
+			get_option(&str, &monarch_timeout);
+		}
+	} else {
 		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
 		       str);
 		return 0;
@@ -1221,6 +1521,7 @@ static ssize_t store_int_with_restart(struct sys_device *s,
 
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
+static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
 
 static struct sysdev_ext_attribute attr_check_interval = {
 	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
@@ -1230,6 +1531,7 @@ static struct sysdev_ext_attribute attr_check_interval = {
 
 static struct sysdev_attribute *mce_attrs[] = {
 	&attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
+	&attr_monarch_timeout.attr,
 	NULL
 };
 
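Usage note: with this patch applied the timeout can be set at boot via the extended mce= option, e.g. "mce=2,500000" selects tolerance level 2 and waits up to 500000 microseconds (0.5 s) for the other CPUs, while "mce=1,0" disables the synchronization wait entirely. The same value (in microseconds; newer Intel CPUs default to 1000000, i.e. one second) is also exposed at run time through the new monarch_timeout sysdev attribute, typically found under /sys/devices/system/machinecheck/machinecheck*/monarch_timeout; the exact path is given for illustration and depends on the running kernel's sysfs layout.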