Merge branch 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull MCE changes from Ingo Molnar.

* 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Fix return value of mce_chrdev_read() when erst is disabled
  x86/mce: Convert static array of pointers to per-cpu variables
  x86/mce: Replace hard coded hex constants with symbolic defines
  x86/mce: Recognise machine check bank signature for data path error
  x86/mce: Handle "action required" errors
  x86/mce: Add mechanism to safely save information in MCE handler
  x86/mce: Create helper function to save addr/misc when needed
  HWPOISON: Add code to handle "action required" errors.
  HWPOISON: Clean up memory_failure() vs. __memory_failure()
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-03-22 12:42:04 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-03-22 12:42:04 -0400
commit: 754b9800779402924fffe456b49d557e15260cbf (patch)
tree: 0e0441eca766616fccd8fc37a3885397efc6063a /arch/x86/kernel/cpu
parent: 35cb8d9e18c0bb33b90d7e574abadbe23b65427d (diff)
parent: ea281a9ebaba3287130dbe15bb0aad6f798bb06b (diff)
3 files changed, 164 insertions, 64 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 7395d5f4272d..0c82091b1652 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,14 @@ static struct severity {
 #define  MASK(x, y)     .mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
 #define MCACOD 0xffff
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB    0x00C0  /* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK 0xfff0
+#define MCACOD_L3WB     0x017A  /* L3 Explicit Writeback */
+#define MCACOD_DATA     0x0134  /* Data Load */
+#define MCACOD_INSTR    0x0150  /* Instruction Fetch */
        MCESEV(
                NO, "Invalid",
@@ -102,11 +109,24 @@ static struct severity {
                SER, BITCLR(MCI_STATUS_S)
                ),
-        /* AR add known MCACODs here */
        MCESEV(
                PANIC, "Action required with lost events",
                SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
                ),
+        /* known AR MCACODs: */
+#ifdef  CONFIG_MEMORY_FAILURE
+        MCESEV(
+                KEEP, "HT thread notices Action required: data load error",
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+                MCGMASK(MCG_STATUS_EIPV, 0)
+                ),
+        MCESEV(
+                AR, "Action required: data load error",
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+                USER
+                ),
+#endif
        MCESEV(
                PANIC, "Action required: unknown MCACOD",
                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -115,11 +135,11 @@ static struct severity {
        /* known AO MCACODs: */
        MCESEV(
                AO, "Action optional: memory scrubbing error",
-                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
                ),
        MCESEV(
                AO, "Action optional: last level cache writeback error",
-                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+                SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
                ),
        MCESEV(
                SOME, "Action optional: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e91..c614bd4de0f3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
        irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+        if (m->status & MCI_STATUS_MISCV)
+                m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+        if (m->status & MCI_STATUS_ADDRV) {
+                m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+                /*
+                 * Mask the reported address by the reported granularity.
+                 */
+                if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
+                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+                        m->addr >>= shift;
+                        m->addr <<= shift;
+                }
+        }
+}
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 /*
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
                    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
                        continue;
-                if (m.status & MCI_STATUS_MISCV)
+                mce_read_aux(&m, i);
-                        m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-                if (m.status & MCI_STATUS_ADDRV)
-                        m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
                if (!(flags & MCP_TIMESTAMP))
                        m.tsc = 0;
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)
 }
 /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX    16
+struct mce_info {
+        atomic_t                inuse;
+        struct task_struct      *t;
+        __u64                   paddr;
+} mce_info[MCE_INFO_MAX];
+static void mce_save_info(__u64 addr)
+{
+        struct mce_info *mi;
+        for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+                if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+                        mi->t = current;
+                        mi->paddr = addr;
+                        return;
+                }
+        }
+        mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+static struct mce_info *mce_find_info(void)
+{
+        struct mce_info *mi;
+        for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+                if (atomic_read(&mi->inuse) && mi->t == current)
+                        return mi;
+        return NULL;
+}
+static void mce_clear_info(struct mce_info *mi)
+{
+        atomic_set(&mi->inuse, 0);
+}
+/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
        barrier();
        /*
-         * When no restart IP must always kill or panic.
+         * When no restart IP might need to kill or panic.
+         * Assume the worst for now, but if we find the
+         * severity is MCE_AR_SEVERITY we have other options.
         */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                kill_it = 1;
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                        continue;
                }
-                /*
+                mce_read_aux(&m, i);
-                 * Kill on action required.
-                 */
-                if (severity == MCE_AR_SEVERITY)
-                        kill_it = 1;
-                if (m.status & MCI_STATUS_MISCV)
-                        m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-                if (m.status & MCI_STATUS_ADDRV)
-                        m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
                /*
                 * Action optional error. Queue address for later processing.
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                }
        }
+        /* mce_clear_state will clear *final, save locally for use later */
+        m = *final;
        if (!no_way_out)
                mce_clear_state(toclear);
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                no_way_out = worst >= MCE_PANIC_SEVERITY;
        /*
-         * If we have decided that we just CAN'T continue, and the user
+         * At insane "tolerant" levels we take no action. Otherwise
-         * has not set tolerant to an insane level, give up and die.
+         * we only die if we have no other choice. For less serious
-         *
+         * issues we try to recover, or limit damage to the current
-         * This is mainly used in the case when the system doesn't
+         * process.
-         * support MCE broadcasting or it has been disabled.
-         */
-        if (no_way_out && tolerant < 3)
-                mce_panic("Fatal machine check on current CPU", final, msg);
-        /*
-         * If the error seems to be unrecoverable, something should be
-         * done.  Try to kill as little as possible.  If we can kill just
-         * one task, do that.  If the user has set the tolerance very
-         * high, don't try to do anything at all.
         */
+        if (tolerant < 3) {
-        if (kill_it && tolerant < 3)
+                if (no_way_out)
-                force_sig(SIGBUS, current);
+                        mce_panic("Fatal machine check on current CPU", &m, msg);
+                if (worst == MCE_AR_SEVERITY) {
-        /* notify userspace ASAP */
+                        /* schedule action before return to userland */
-        set_thread_flag(TIF_MCE_NOTIFY);
+                        mce_save_info(m.addr);
+                        set_thread_flag(TIF_MCE_NOTIFY);
+                } else if (kill_it) {
+                        force_sig(SIGBUS, current);
+                }
+        }
        if (worst > 0)
                mce_report_event(regs);
@@ -1094,34 +1146,57 @@ out:
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
+#ifndef CONFIG_MEMORY_FAILURE
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-        printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+        /* mce_severity() should not hand us an ACTION_REQUIRED error */
+        BUG_ON(flags & MF_ACTION_REQUIRED);
+        printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
+                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+        return 0;
 }
+#endif
 /*
- * Called after mce notification in process context. This code
+ * Called in process context that interrupted by MCE and marked with
- * is allowed to sleep. Call the high level VM handler to process
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * any corrupted pages.
+ * This code is allowed to sleep.
- * Assume that the work queue code only calls this one at a time
+ * Attempt possible recovery such as calling the high level VM handler to
- * per CPU.
+ * process any corrupted pages, and kill/signal current process if required.
- * Note we don't disable preemption, so this code might run on the wrong
+ * Action required errors are handled here.
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
 */
 void mce_notify_process(void)
 {
        unsigned long pfn;
-        mce_notify_irq();
+        struct mce_info *mi = mce_find_info();
-        while (mce_ring_get(&pfn))
-                memory_failure(pfn, MCE_VECTOR);
+        if (!mi)
+                mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+        pfn = mi->paddr >> PAGE_SHIFT;
+        clear_thread_flag(TIF_MCE_NOTIFY);
+        pr_err("Uncorrected hardware memory error in user-access at %llx",
+                 mi->paddr);
+        if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
+                pr_err("Memory error not recovered");
+                force_sig(SIGBUS, current);
+        }
+        mce_clear_info(mi);
 }
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-        mce_notify_process();
+        unsigned long pfn;
+        while (mce_ring_get(&pfn))
+                memory_failure(pfn, MCE_VECTOR, 0);
 }
 #ifdef CONFIG_X86_MCE_INTEL
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void)
        /* Not more than two messages every minute */
        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-        clear_thread_flag(TIF_MCE_NOTIFY);
        if (test_and_clear_bit(0, &mce_need_notify)) {
                /* wake processes polling /dev/mcelog */
                wake_up_interruptible(&mce_chrdev_wait);
@@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
        /* Error or no more MCE record */
        if (rc <= 0) {
                mce_apei_read_done = 1;
+                /*
+                 * When ERST is disabled, mce_chrdev_read() should return
+                 * "no record" instead of "no device."
+                 */
+                if (rc == -ENODEV)
+                        return 0;
                return rc;
        }
        rc = -EFAULT;
@@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = {
        .dev_name       = "machinecheck",
 };
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);
-        mce_device[cpu] = dev;
+        per_cpu(mce_device, cpu) = dev;
        return 0;
 error2:
@@ -2055,7 +2134,7 @@ error:
 static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-        struct device *dev = mce_device[cpu];
+        struct device *dev = per_cpu(mce_device, cpu);
        int i;
        if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
        device_unregister(dev);
        cpumask_clear_cpu(cpu, mce_device_initialized);
-        mce_device[cpu] = NULL;
+        per_cpu(mce_device, cpu) = NULL;
 }
 /* Make sure there are no machine checks on offlined CPUs. */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index e4eeaaf58a47..99b57179f912 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
        int i, err = 0;
        struct threshold_bank *b = NULL;
-        struct device *dev = mce_device[cpu];
+        struct device *dev = per_cpu(mce_device, cpu);
        char name[32];
        sprintf(name, "threshold_bank%i", bank);
@@ -587,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
                if (i == cpu)
                        continue;
-                dev = mce_device[i];
+                dev = per_cpu(mce_device, i);
                if (dev)
                        err = sysfs_create_link(&dev->kobj,b->kobj, name);
                if (err)
@@ -667,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
        /* sibling symlink */
        if (shared_bank[bank] && b->blocks->cpu != cpu) {
-                sysfs_remove_link(&mce_device[cpu]->kobj, name);
+                dev = per_cpu(mce_device, cpu);
+                sysfs_remove_link(&dev->kobj, name);
                per_cpu(threshold_banks, cpu)[bank] = NULL;
                return;
@@ -679,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
                if (i == cpu)
                        continue;
-                dev = mce_device[i];
+                dev = per_cpu(mce_device, i);
                if (dev)
                        sysfs_remove_link(&dev->kobj, name);
                per_cpu(threshold_banks, i)[bank] = NULL;
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-22 12:42:04 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-22 12:42:04 -0400
commit	754b9800779402924fffe456b49d557e15260cbf (patch)
tree	0e0441eca766616fccd8fc37a3885397efc6063a /arch/x86/kernel/cpu
parent	35cb8d9e18c0bb33b90d7e574abadbe23b65427d (diff)
parent	ea281a9ebaba3287130dbe15bb0aad6f798bb06b (diff)

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 7395d5f4272d..0c82091b1652 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,14 @@ static struct severity {
54	#define MASK(x, y) .mask = x, .result = y	54	#define MASK(x, y) .mask = x, .result = y
55	#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)	55	#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
56	#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)	56	#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
		57	#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
57	#define MCACOD 0xffff	58	#define MCACOD 0xffff
		59	/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
		60	#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
		61	#define MCACOD_SCRUBMSK 0xfff0
		62	#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
		63	#define MCACOD_DATA 0x0134 /* Data Load */
		64	#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
58		65
59	MCESEV(	66	MCESEV(
60	NO, "Invalid",	67	NO, "Invalid",
@@ -102,11 +109,24 @@ static struct severity {
102	SER, BITCLR(MCI_STATUS_S)	109	SER, BITCLR(MCI_STATUS_S)
103	),	110	),
104		111
105	/* AR add known MCACODs here */
106	MCESEV(	112	MCESEV(
107	PANIC, "Action required with lost events",	113	PANIC, "Action required with lost events",
108	SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)	114	SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
109	),	115	),
		116
		117	/* known AR MCACODs: */
		118	#ifdef CONFIG_MEMORY_FAILURE
		119	MCESEV(
		120	KEEP, "HT thread notices Action required: data load error",
		121	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
		122	MCGMASK(MCG_STATUS_EIPV, 0)
		123	),
		124	MCESEV(
		125	AR, "Action required: data load error",
		126	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
		127	USER
		128	),
		129	#endif
110	MCESEV(	130	MCESEV(
111	PANIC, "Action required: unknown MCACOD",	131	PANIC, "Action required: unknown MCACOD",
112	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)	132	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -115,11 +135,11 @@ static struct severity {
115	/* known AO MCACODs: */	135	/* known AO MCACODs: */
116	MCESEV(	136	MCESEV(
117	AO, "Action optional: memory scrubbing error",	137	AO, "Action optional: memory scrubbing error",
118	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)	138	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
119	),	139	),
120	MCESEV(	140	MCESEV(
121	AO, "Action optional: last level cache writeback error",	141	AO, "Action optional: last level cache writeback error",
122	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)	142	SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
123	),	143	),
124	MCESEV(	144	MCESEV(
125	SOME, "Action optional: unknown MCACOD",	145	SOME, "Action optional: unknown MCACOD",


diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2e9e91..c614bd4de0f3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -540,6 +540,27 @@ static void mce_report_event(struct pt_regs *regs)
540	irq_work_queue(&__get_cpu_var(mce_irq_work));	540	irq_work_queue(&__get_cpu_var(mce_irq_work));
541	}	541	}
542		542
		543	/*
		544	* Read ADDR and MISC registers.
		545	*/
		546	static void mce_read_aux(struct mce *m, int i)
		547	{
		548	if (m->status & MCI_STATUS_MISCV)
		549	m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		550	if (m->status & MCI_STATUS_ADDRV) {
		551	m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
		552
		553	/*
		554	* Mask the reported address by the reported granularity.
		555	*/
		556	if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
		557	u8 shift = MCI_MISC_ADDR_LSB(m->misc);
		558	m->addr >>= shift;
		559	m->addr <<= shift;
		560	}
		561	}
		562	}
		563
543	DEFINE_PER_CPU(unsigned, mce_poll_count);	564	DEFINE_PER_CPU(unsigned, mce_poll_count);
544		565
545	/*	566	/*
@@ -590,10 +611,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
590	(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))	611	(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
591	continue;	612	continue;
592		613
593	if (m.status & MCI_STATUS_MISCV)	614	mce_read_aux(&m, i);
594	m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
595	if (m.status & MCI_STATUS_ADDRV)
596	m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
597		615
598	if (!(flags & MCP_TIMESTAMP))	616	if (!(flags & MCP_TIMESTAMP))
599	m.tsc = 0;	617	m.tsc = 0;
@@ -917,6 +935,49 @@ static void mce_clear_state(unsigned long *toclear)
917	}	935	}
918		936
919	/*	937	/*
		938	* Need to save faulting physical address associated with a process
		939	* in the machine check handler some place where we can grab it back
		940	* later in mce_notify_process()
		941	*/
		942	#define MCE_INFO_MAX 16
		943
		944	struct mce_info {
		945	atomic_t inuse;
		946	struct task_struct *t;
		947	__u64 paddr;
		948	} mce_info[MCE_INFO_MAX];
		949
		950	static void mce_save_info(__u64 addr)
		951	{
		952	struct mce_info *mi;
		953
		954	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
		955	if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
		956	mi->t = current;
		957	mi->paddr = addr;
		958	return;
		959	}
		960	}
		961
		962	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
		963	}
		964
		965	static struct mce_info *mce_find_info(void)
		966	{
		967	struct mce_info *mi;
		968
		969	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
		970	if (atomic_read(&mi->inuse) && mi->t == current)
		971	return mi;
		972	return NULL;
		973	}
		974
		975	static void mce_clear_info(struct mce_info *mi)
		976	{
		977	atomic_set(&mi->inuse, 0);
		978	}
		979
		980	/*
920	* The actual machine check handler. This only handles real	981	* The actual machine check handler. This only handles real
921	* exceptions when something got corrupted coming in through int 18.	982	* exceptions when something got corrupted coming in through int 18.
922	*	983	*
@@ -969,7 +1030,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
969	barrier();	1030	barrier();
970		1031
971	/*	1032	/*
972	* When no restart IP must always kill or panic.	1033	* When no restart IP might need to kill or panic.
		1034	* Assume the worst for now, but if we find the
		1035	* severity is MCE_AR_SEVERITY we have other options.
973	*/	1036	*/
974	if (!(m.mcgstatus & MCG_STATUS_RIPV))	1037	if (!(m.mcgstatus & MCG_STATUS_RIPV))
975	kill_it = 1;	1038	kill_it = 1;
@@ -1023,16 +1086,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1023	continue;	1086	continue;
1024	}	1087	}
1025		1088
1026	/*	1089	mce_read_aux(&m, i);
1027	* Kill on action required.
1028	*/
1029	if (severity == MCE_AR_SEVERITY)
1030	kill_it = 1;
1031
1032	if (m.status & MCI_STATUS_MISCV)
1033	m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
1034	if (m.status & MCI_STATUS_ADDRV)
1035	m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
1036		1090
1037	/*	1091	/*
1038	* Action optional error. Queue address for later processing.	1092	* Action optional error. Queue address for later processing.
@@ -1052,6 +1106,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1052	}	1106	}
1053	}	1107	}
1054		1108
		1109	/* mce_clear_state will clear *final, save locally for use later */
		1110	m = *final;
		1111
1055	if (!no_way_out)	1112	if (!no_way_out)
1056	mce_clear_state(toclear);	1113	mce_clear_state(toclear);
1057		1114
@@ -1063,27 +1120,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1063	no_way_out = worst >= MCE_PANIC_SEVERITY;	1120	no_way_out = worst >= MCE_PANIC_SEVERITY;
1064		1121
1065	/*	1122	/*
1066	* If we have decided that we just CAN'T continue, and the user	1123	* At insane "tolerant" levels we take no action. Otherwise
1067	* has not set tolerant to an insane level, give up and die.	1124	* we only die if we have no other choice. For less serious
1068	*	1125	* issues we try to recover, or limit damage to the current
1069	* This is mainly used in the case when the system doesn't	1126	* process.
1070	* support MCE broadcasting or it has been disabled.
1071	*/
1072	if (no_way_out && tolerant < 3)
1073	mce_panic("Fatal machine check on current CPU", final, msg);
1074
1075	/*
1076	* If the error seems to be unrecoverable, something should be
1077	* done. Try to kill as little as possible. If we can kill just
1078	* one task, do that. If the user has set the tolerance very
1079	* high, don't try to do anything at all.
1080	*/	1127	*/
1081		1128	if (tolerant < 3) {
1082	if (kill_it && tolerant < 3)	1129	if (no_way_out)
1083	force_sig(SIGBUS, current);	1130	mce_panic("Fatal machine check on current CPU", &m, msg);
1084		1131	if (worst == MCE_AR_SEVERITY) {
1085	/* notify userspace ASAP */	1132	/* schedule action before return to userland */
1086	set_thread_flag(TIF_MCE_NOTIFY);	1133	mce_save_info(m.addr);
		1134	set_thread_flag(TIF_MCE_NOTIFY);
		1135	} else if (kill_it) {
		1136	force_sig(SIGBUS, current);
		1137	}
		1138	}
1087		1139
1088	if (worst > 0)	1140	if (worst > 0)
1089	mce_report_event(regs);	1141	mce_report_event(regs);
@@ -1094,34 +1146,57 @@ out:
1094	}	1146	}
1095	EXPORT_SYMBOL_GPL(do_machine_check);	1147	EXPORT_SYMBOL_GPL(do_machine_check);
1096		1148
1097	/* dummy to break dependency. actual code is in mm/memory-failure.c */	1149	#ifndef CONFIG_MEMORY_FAILURE
1098	void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)	1150	int memory_failure(unsigned long pfn, int vector, int flags)
1099	{	1151	{
1100	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);	1152	/* mce_severity() should not hand us an ACTION_REQUIRED error */
		1153	BUG_ON(flags & MF_ACTION_REQUIRED);
		1154	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
		1155	"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
		1156
		1157	return 0;
1101	}	1158	}
		1159	#endif
1102		1160
1103	/*	1161	/*
1104	* Called after mce notification in process context. This code	1162	* Called in process context that interrupted by MCE and marked with
1105	* is allowed to sleep. Call the high level VM handler to process	1163	* TIF_MCE_NOTIFY, just before returning to erroneous userland.
1106	* any corrupted pages.	1164	* This code is allowed to sleep.
1107	* Assume that the work queue code only calls this one at a time	1165	* Attempt possible recovery such as calling the high level VM handler to
1108	* per CPU.	1166	* process any corrupted pages, and kill/signal current process if required.
1109	* Note we don't disable preemption, so this code might run on the wrong	1167	* Action required errors are handled here.
1110	* CPU. In this case the event is picked up by the scheduled work queue.
1111	* This is merely a fast path to expedite processing in some common
1112	* cases.
1113	*/	1168	*/
1114	void mce_notify_process(void)	1169	void mce_notify_process(void)
1115	{	1170	{
1116	unsigned long pfn;	1171	unsigned long pfn;
1117	mce_notify_irq();	1172	struct mce_info *mi = mce_find_info();
1118	while (mce_ring_get(&pfn))	1173
1119	memory_failure(pfn, MCE_VECTOR);	1174	if (!mi)
		1175	mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
		1176	pfn = mi->paddr >> PAGE_SHIFT;
		1177
		1178	clear_thread_flag(TIF_MCE_NOTIFY);
		1179
		1180	pr_err("Uncorrected hardware memory error in user-access at %llx",
		1181	mi->paddr);
		1182	if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0) {
		1183	pr_err("Memory error not recovered");
		1184	force_sig(SIGBUS, current);
		1185	}
		1186	mce_clear_info(mi);
1120	}	1187	}
1121		1188
		1189	/*
		1190	* Action optional processing happens here (picking up
		1191	* from the list of faulting pages that do_machine_check()
		1192	* placed into the "ring").
		1193	*/
1122	static void mce_process_work(struct work_struct *dummy)	1194	static void mce_process_work(struct work_struct *dummy)
1123	{	1195	{
1124	mce_notify_process();	1196	unsigned long pfn;
		1197
		1198	while (mce_ring_get(&pfn))
		1199	memory_failure(pfn, MCE_VECTOR, 0);
1125	}	1200	}
1126		1201
1127	#ifdef CONFIG_X86_MCE_INTEL	1202	#ifdef CONFIG_X86_MCE_INTEL
@@ -1211,8 +1286,6 @@ int mce_notify_irq(void)
1211	/* Not more than two messages every minute */	1286	/* Not more than two messages every minute */
1212	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);	1287	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1213		1288
1214	clear_thread_flag(TIF_MCE_NOTIFY);
1215
1216	if (test_and_clear_bit(0, &mce_need_notify)) {	1289	if (test_and_clear_bit(0, &mce_need_notify)) {
1217	/* wake processes polling /dev/mcelog */	1290	/* wake processes polling /dev/mcelog */
1218	wake_up_interruptible(&mce_chrdev_wait);	1291	wake_up_interruptible(&mce_chrdev_wait);
@@ -1541,6 +1614,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1541	/* Error or no more MCE record */	1614	/* Error or no more MCE record */
1542	if (rc <= 0) {	1615	if (rc <= 0) {
1543	mce_apei_read_done = 1;	1616	mce_apei_read_done = 1;
		1617	/*
		1618	* When ERST is disabled, mce_chrdev_read() should return
		1619	* "no record" instead of "no device."
		1620	*/
		1621	if (rc == -ENODEV)
		1622	return 0;
1544	return rc;	1623	return rc;
1545	}	1624	}
1546	rc = -EFAULT;	1625	rc = -EFAULT;
@@ -1859,7 +1938,7 @@ static struct bus_type mce_subsys = {
1859	.dev_name = "machinecheck",	1938	.dev_name = "machinecheck",
1860	};	1939	};
1861		1940
1862	struct device *mce_device[CONFIG_NR_CPUS];	1941	DEFINE_PER_CPU(struct device *, mce_device);
1863		1942
1864	__cpuinitdata	1943	__cpuinitdata
1865	void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);	1944	void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2038,7 +2117,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
2038	goto error2;	2117	goto error2;
2039	}	2118	}
2040	cpumask_set_cpu(cpu, mce_device_initialized);	2119	cpumask_set_cpu(cpu, mce_device_initialized);
2041	mce_device[cpu] = dev;	2120	per_cpu(mce_device, cpu) = dev;
2042		2121
2043	return 0;	2122	return 0;
2044	error2:	2123	error2:
@@ -2055,7 +2134,7 @@ error:
2055		2134
2056	static __cpuinit void mce_device_remove(unsigned int cpu)	2135	static __cpuinit void mce_device_remove(unsigned int cpu)
2057	{	2136	{
2058	struct device *dev = mce_device[cpu];	2137	struct device *dev = per_cpu(mce_device, cpu);
2059	int i;	2138	int i;
2060		2139
2061	if (!cpumask_test_cpu(cpu, mce_device_initialized))	2140	if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2069,7 +2148,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
2069		2148
2070	device_unregister(dev);	2149	device_unregister(dev);
2071	cpumask_clear_cpu(cpu, mce_device_initialized);	2150	cpumask_clear_cpu(cpu, mce_device_initialized);
2072	mce_device[cpu] = NULL;	2151	per_cpu(mce_device, cpu) = NULL;
2073	}	2152	}
2074		2153
2075	/* Make sure there are no machine checks on offlined CPUs. */	2154	/* Make sure there are no machine checks on offlined CPUs. */


diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e4eeaaf58a47..99b57179f912 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,7 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
523	{	523	{
524	int i, err = 0;	524	int i, err = 0;
525	struct threshold_bank *b = NULL;	525	struct threshold_bank *b = NULL;
526	struct device *dev = mce_device[cpu];	526	struct device *dev = per_cpu(mce_device, cpu);
527	char name[32];	527	char name[32];
528		528
529	sprintf(name, "threshold_bank%i", bank);	529	sprintf(name, "threshold_bank%i", bank);
@@ -587,7 +587,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
587	if (i == cpu)	587	if (i == cpu)
588	continue;	588	continue;
589		589
590	dev = mce_device[i];	590	dev = per_cpu(mce_device, i);
591	if (dev)	591	if (dev)
592	err = sysfs_create_link(&dev->kobj,b->kobj, name);	592	err = sysfs_create_link(&dev->kobj,b->kobj, name);
593	if (err)	593	if (err)
@@ -667,7 +667,8 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
667	#ifdef CONFIG_SMP	667	#ifdef CONFIG_SMP
668	/* sibling symlink */	668	/* sibling symlink */
669	if (shared_bank[bank] && b->blocks->cpu != cpu) {	669	if (shared_bank[bank] && b->blocks->cpu != cpu) {
670	sysfs_remove_link(&mce_device[cpu]->kobj, name);	670	dev = per_cpu(mce_device, cpu);
		671	sysfs_remove_link(&dev->kobj, name);
671	per_cpu(threshold_banks, cpu)[bank] = NULL;	672	per_cpu(threshold_banks, cpu)[bank] = NULL;
672		673
673	return;	674	return;
@@ -679,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
679	if (i == cpu)	680	if (i == cpu)
680	continue;	681	continue;
681		682
682	dev = mce_device[i];	683	dev = per_cpu(mce_device, i);
683	if (dev)	684	if (dev)
684	sysfs_remove_link(&dev->kobj, name);	685	sysfs_remove_link(&dev->kobj, name);
685	per_cpu(threshold_banks, i)[bank] = NULL;	686	per_cpu(threshold_banks, i)[bank] = NULL;