Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/apic/apic.c                |    4
-rw-r--r--  arch/x86/kernel/apic/nmi.c                 |    2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile        |   10
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c            |   42
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c    |  127
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h  |   15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c  |  218
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c           | 1964
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h           |   26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c        |   76
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c        | 1188
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c    |  203
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c     |   74
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c  |   65
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c     |   57
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c            |   86
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c            |   48
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c            |   26
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c   |   73
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c     |    2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c       |   17
-rw-r--r--  arch/x86/kernel/entry_64.S                 |   11
-rw-r--r--  arch/x86/kernel/irq.c                      |   19
-rw-r--r--  arch/x86/kernel/irqinit.c                  |    3
-rw-r--r--  arch/x86/kernel/signal.c                   |    6
-rw-r--r--  arch/x86/kernel/smp.c                      |   28
-rw-r--r--  arch/x86/kernel/traps.c                    |    6
27 files changed, 2782 insertions, 1614 deletions
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 076d3881f3da..8c7c042ecad1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -899,7 +899,7 @@ void clear_local_APIC(void)
 	}
 
 	/* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -2017,7 +2017,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+#ifdef CONFIG_X86_THERMAL_VECTOR
 	if (maxlvt >= 5)
 		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 #endif
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index a691302dc3ff..b3025b43b63a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
 
 static inline int mce_in_progress(void)
 {
-#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#if defined(CONFIG_X86_NEW_MCE)
 	return atomic_read(&mce_entry) > 0;
 #endif
 	return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe8..45004faf67ea 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,11 @@
-obj-y                           = mce_$(BITS).o therm_throt.o
+obj-y                           = mce.o therm_throt.o
 
-obj-$(CONFIG_X86_32)            += k7.o p4.o p5.o p6.o winchip.o
-obj-$(CONFIG_X86_MCE_INTEL)     += mce_intel_64.o
+obj-$(CONFIG_X86_NEW_MCE)       += mce-severity.o
+obj-$(CONFIG_X86_OLD_MCE)       += k7.o p4.o p6.o
+obj-$(CONFIG_X86_ANCIENT_MCE)   += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_P4THERMAL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_INTEL)     += mce_intel_64.o mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)       += mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)  += non-fatal.o
 obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT)    += mce-inject.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39a..89e510424152 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,11 +2,10 @@
  * Athlon specific Machine Check Exception Reporting
  * (C) Copyright 2002 Dave Jones <davej@redhat.com>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
@@ -15,12 +14,12 @@
 
 #include "mce.h"
 
-/* Machine Check Handler For AMD Athlon/Duron */
+/* Machine Check Handler For AMD Athlon/Duron: */
 static void k7_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +31,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 
 	for (i = 1; i < nr_mce_banks; i++) {
 		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
-		if (high&(1<<31)) {
+		if (high & (1<<31)) {
 			char misc[20];
 			char addr[24];
-			misc[0] = addr[0] = '\0';
+
+			misc[0] = '\0';
+			addr[0] = '\0';
+
 			if (high & (1<<29))
 				recover |= 1;
 			if (high & (1<<25))
 				recover |= 2;
 			high &= ~(1<<31);
+
 			if (high & (1<<27)) {
 				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
 				snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +52,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
 				snprintf(addr, 24, " at %08x%08x", ahigh, alow);
 			}
+
 			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
 				smp_processor_id(), i, high, low, misc, addr);
-			/* Clear it */
+
+			/* Clear it: */
 			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-			/* Serialize */
+			/* Serialize: */
 			wmb();
 			add_taint(TAINT_MACHINE_CHECK);
 		}
 	}
 
-	if (recover&2)
+	if (recover & 2)
 		panic("CPU context corrupt");
-	if (recover&1)
+	if (recover & 1)
 		panic("Unable to continue");
+
 	printk(KERN_EMERG "Attempting to continue.\n");
+
 	mcgstl &= ~(1<<2);
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
-/* AMD K7 machine check is Intel like */
+/* AMD K7 machine check is Intel like: */
 void amd_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -79,21 +86,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 		return;
 
 	machine_check_vector = k7_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
 	printk(KERN_INFO "Intel machine check architecture supported.\n");
+
 	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 	nr_mce_banks = l & 0xff;
 
-	/* Clear status for MC index 0 separately, we don't touch CTL,
-	 * as some K7 Athlons cause spurious MCEs when its enabled. */
+	/*
+	 * Clear status for MC index 0 separately, we don't touch CTL,
+	 * as some K7 Athlons cause spurious MCEs when its enabled:
+	 */
 	if (boot_cpu_data.x86 == 6) {
 		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
 		i = 1;
 	} else
 		i = 0;
+
 	for (; i < nr_mce_banks; i++) {
 		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
 		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000000..a3a235a53f09
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
1/*
2 * Machine check injection support.
3 * Copyright 2008 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Authors:
11 * Andi Kleen
12 * Ying Huang
13 */
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/timer.h>
17#include <linux/kernel.h>
18#include <linux/string.h>
19#include <linux/fs.h>
20#include <linux/smp.h>
21#include <asm/mce.h>
22
23/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m)
25{
26 struct mce *i = &per_cpu(injectm, m->extcpu);
27
28 /* Make sure noone reads partially written injectm */
29 i->finished = 0;
30 mb();
31 m->finished = 0;
32 /* First set the fields after finished */
33 i->extcpu = m->extcpu;
34 mb();
35 /* Now write record in order, finished last (except above) */
36 memcpy(i, m, sizeof(struct mce));
37 /* Finally activate it */
38 mb();
39 i->finished = 1;
40}
41
42struct delayed_mce {
43 struct timer_list timer;
44 struct mce m;
45};
46
47/* Inject mce on current CPU */
48static void raise_mce(unsigned long data)
49{
50 struct delayed_mce *dm = (struct delayed_mce *)data;
51 struct mce *m = &dm->m;
52 int cpu = m->extcpu;
53
54 inject_mce(m);
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip;
59 regs.cs = m->cs;
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0);
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b);
68 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70 cpu);
71 }
72 kfree(dm);
73}
74
75/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off)
78{
79 struct delayed_mce *dm;
80 struct mce m;
81
82 if (!capable(CAP_SYS_ADMIN))
83 return -EPERM;
84 /*
85 * There are some cases where real MSR reads could slip
86 * through.
87 */
88 if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89 return -EIO;
90
91 if ((unsigned long)usize > sizeof(struct mce))
92 usize = sizeof(struct mce);
93 if (copy_from_user(&m, ubuf, usize))
94 return -EFAULT;
95
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL;
98
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /*
104 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */
108 memcpy(&dm->m, &m, sizeof(struct mce));
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize;
113}
114
115static int inject_init(void)
116{
117 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write;
119 return 0;
120}
121
122module_init(inject_init);
123/*
124 * Cannot tolerate unloading currently because we cannot
125 * guarantee all openers of mce_chrdev will get a reference to us.
126 */
127MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000000..54dcb8ff12e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
1#include <asm/mce.h>
2
3enum severity_level {
4 MCE_NO_SEVERITY,
5 MCE_KEEP_SEVERITY,
6 MCE_SOME_SEVERITY,
7 MCE_AO_SEVERITY,
8 MCE_UC_SEVERITY,
9 MCE_AR_SEVERITY,
10 MCE_PANIC_SEVERITY,
11};
12
13int mce_severity(struct mce *a, int tolerant, char **msg);
14
15extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000000..ff0807f97056
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
1/*
2 * MCE grading rules.
3 * Copyright 2008, 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Author: Andi Kleen
11 */
12#include <linux/kernel.h>
13#include <linux/seq_file.h>
14#include <linux/init.h>
15#include <linux/debugfs.h>
16#include <asm/mce.h>
17
18#include "mce-internal.h"
19
20/*
21 * Grade an mce by severity. In general the most severe ones are processed
22 * first. Since there are quite a lot of combinations test the bits in a
23 * table-driven way. The rules are simply processed in order, first
24 * match wins.
25 *
26 * Note this is only used for machine check exceptions, the corrected
27 * errors use much simpler rules. The exceptions still check for the corrected
28 * errors, but only to leave them alone for the CMCI handler (except for
29 * panic situations)
30 */
31
32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34
35static struct severity {
36 u64 mask;
37 u64 result;
38 unsigned char sev;
39 unsigned char mcgmask;
40 unsigned char mcgres;
41 unsigned char ser;
42 unsigned char context;
43 unsigned char covered;
44 char *msg;
45} severities[] = {
46#define KERNEL .context = IN_KERNEL
47#define USER .context = IN_USER
48#define SER .ser = SER_REQUIRED
49#define NOSER .ser = NO_SER
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
53#define MCGMASK(x, res, s, m, r...) \
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff
60
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
64 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
66 /* Neither return not error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
68 "Neither restart nor error IP"),
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
70 KERNEL),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
73 "Spurious not enabled", SER),
74
75 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
77 "Uncorrected no action required", SER),
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
79 "Illegal combination (UCNA with AR=1)", SER),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
81
82 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
84 "Action required with lost events", SER),
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
86 "Action required; unknown MCACOD", SER),
87
88 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
90 "Action optional: memory scrubbing error", SER),
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
92 "Action optional: last level cache writeback error", SER),
93
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
95 "Action optional unknown MCACOD", SER),
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
97 "Action optional with lost events", SER),
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
100 BITSET(0, SOME, "No match") /* always matches. keep at end */
101};
102
103/*
104 * If the EIPV bit is set, it means the saved IP is the
105 * instruction which caused the MCE.
106 */
107static int error_context(struct mce *m)
108{
109 if (m->mcgstatus & MCG_STATUS_EIPV)
110 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
111 /* Unknown, assume kernel */
112 return IN_KERNEL;
113}
114
115int mce_severity(struct mce *a, int tolerant, char **msg)
116{
117 enum context ctx = error_context(a);
118 struct severity *s;
119
120 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result)
122 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres)
124 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue;
127 if (s->ser == NO_SER && mce_ser)
128 continue;
129 if (s->context && ctx != s->context)
130 continue;
131 if (msg)
132 *msg = s->msg;
133 s->covered = 1;
134 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
135 if (panic_on_oops || tolerant < 1)
136 return MCE_PANIC_SEVERITY;
137 }
138 return s->sev;
139 }
140}
141
142static void *s_start(struct seq_file *f, loff_t *pos)
143{
144 if (*pos >= ARRAY_SIZE(severities))
145 return NULL;
146 return &severities[*pos];
147}
148
149static void *s_next(struct seq_file *f, void *data, loff_t *pos)
150{
151 if (++(*pos) >= ARRAY_SIZE(severities))
152 return NULL;
153 return &severities[*pos];
154}
155
156static void s_stop(struct seq_file *f, void *data)
157{
158}
159
160static int s_show(struct seq_file *f, void *data)
161{
162 struct severity *ser = data;
163 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
164 return 0;
165}
166
167static const struct seq_operations severities_seq_ops = {
168 .start = s_start,
169 .next = s_next,
170 .stop = s_stop,
171 .show = s_show,
172};
173
174static int severities_coverage_open(struct inode *inode, struct file *file)
175{
176 return seq_open(file, &severities_seq_ops);
177}
178
179static ssize_t severities_coverage_write(struct file *file,
180 const char __user *ubuf,
181 size_t count, loff_t *ppos)
182{
183 int i;
184 for (i = 0; i < ARRAY_SIZE(severities); i++)
185 severities[i].covered = 0;
186 return count;
187}
188
189static const struct file_operations severities_coverage_fops = {
190 .open = severities_coverage_open,
191 .release = seq_release,
192 .read = seq_read,
193 .write = severities_coverage_write,
194};
195
196static int __init severities_debugfs_init(void)
197{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199
200 dmce = debugfs_create_dir("mce", NULL);
201 if (dmce == NULL)
202 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage",
204 0444, dmce, NULL,
205 &severities_coverage_fops);
206 if (fseverities_coverage == NULL)
207 goto err_out;
208
209 return 0;
210
211err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM;
217}
218late_initcall(severities_debugfs_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..fabba15e4558
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,1964 @@
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47#include "mce.h"
48
49/* Handle unconfigured int18 (should never happen) */
50static void unexpected_machine_check(struct pt_regs *regs, long error_code)
51{
52 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
53 smp_processor_id());
54}
55
56/* Call the installed machine check handler for this CPU setup. */
57void (*machine_check_vector)(struct pt_regs *, long error_code) =
58 unexpected_machine_check;
59
60int mce_disabled;
61
62#ifdef CONFIG_X86_NEW_MCE
63
64#define MISC_MCELOG_MINOR 227
65
66#define SPINUNIT 100 /* 100ns */
67
68atomic_t mce_entry;
69
70DEFINE_PER_CPU(unsigned, mce_exception_count);
71
72/*
73 * Tolerant levels:
74 * 0: always panic on uncorrected errors, log corrected errors
75 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
77 * 3: never panic or SIGBUS, log all errors (for testing only)
78 */
79static int tolerant = 1;
80static int banks;
81static u64 *bank;
82static unsigned long notify_user;
83static int rip_msr;
84static int mce_bootlog = -1;
85static int monarch_timeout = -1;
86static int mce_panic_timeout;
87static int mce_dont_log_ce;
88int mce_cmci_disabled;
89int mce_ignore_ce;
90int mce_ser;
91
92static char trigger[128];
93static char *trigger_argv[2] = { trigger, NULL };
94
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing;
100
101
102/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105};
106
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work);
113
114/* Do initial initialization of a struct mce */
115void mce_setup(struct mce *m)
116{
117 memset(m, 0, sizeof(struct mce));
118 m->cpu = m->extcpu = smp_processor_id();
119 rdtscll(m->tsc);
120 /* We hope get_seconds stays lockless */
121 m->time = get_seconds();
122 m->cpuvendor = boot_cpu_data.x86_vendor;
123 m->cpuid = cpuid_eax(1);
124#ifdef CONFIG_SMP
125 m->socketid = cpu_data(m->extcpu).phys_proc_id;
126#endif
127 m->apicid = cpu_data(m->extcpu).initial_apicid;
128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
129}
130
131DEFINE_PER_CPU(struct mce, injectm);
132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
133
134/*
135 * Lockless MCE logging infrastructure.
136 * This avoids deadlocks on printk locks without having to break locks. Also
137 * separate MCEs from kernel messages to avoid bogus bug reports.
138 */
139
140static struct mce_log mcelog = {
141 .signature = MCE_LOG_SIGNATURE,
142 .len = MCE_LOG_LEN,
143 .recordlen = sizeof(struct mce),
144};
145
146void mce_log(struct mce *mce)
147{
148 unsigned next, entry;
149
150 mce->finished = 0;
151 wmb();
152 for (;;) {
153 entry = rcu_dereference(mcelog.next);
154 for (;;) {
155 /*
156 * When the buffer fills up discard new entries.
157 * Assume that the earlier errors are the more
158 * interesting ones:
159 */
160 if (entry >= MCE_LOG_LEN) {
161 set_bit(MCE_OVERFLOW,
162 (unsigned long *)&mcelog.flags);
163 return;
164 }
165 /* Old left over entry. Skip: */
166 if (mcelog.entry[entry].finished) {
167 entry++;
168 continue;
169 }
170 break;
171 }
172 smp_rmb();
173 next = entry + 1;
174 if (cmpxchg(&mcelog.next, entry, next) == entry)
175 break;
176 }
177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
178 wmb();
179 mcelog.entry[entry].finished = 1;
180 wmb();
181
182 mce->finished = 1;
183 set_bit(0, &notify_user);
184}
185
186static void print_mce(struct mce *m)
187{
188 printk(KERN_EMERG
189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
190 m->extcpu, m->mcgstatus, m->bank, m->status);
191 if (m->ip) {
192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip);
197 printk("\n");
198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr)
201 printk("ADDR %llx ", m->addr);
202 if (m->misc)
203 printk("MISC %llx ", m->misc);
204 printk("\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid);
208}
209
210static void print_mce_head(void)
211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
213}
214
215static void print_mce_tail(void)
216{
217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219}
220
221#define PANIC_TIMEOUT 5 /* 5 seconds */
222
223static atomic_t mce_paniced;
224
225/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void)
227{
228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
229 preempt_disable();
230 local_irq_enable();
231 while (timeout-- > 0)
232 udelay(1);
233 if (panic_timeout == 0)
234 panic_timeout = mce_panic_timeout;
235 panic("Panicing machine check CPU died");
236}
237
238static void mce_panic(char *msg, struct mce *final, char *exp)
239{
240 int i;
241
242 /*
243 * Make sure only one CPU runs in machine check panic
244 */
245 if (atomic_add_return(1, &mce_paniced) > 1)
246 wait_for_panic();
247 barrier();
248
249 bust_spinlocks(1);
250 console_verbose();
251 print_mce_head();
252 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) {
254 struct mce *m = &mcelog.entry[i];
255 if (!(m->status & MCI_STATUS_VAL))
256 continue;
257 if (!(m->status & MCI_STATUS_UC))
258 print_mce(m);
259 }
260 /* Now print uncorrected but with the final one last */
261 for (i = 0; i < MCE_LOG_LEN; i++) {
262 struct mce *m = &mcelog.entry[i];
263 if (!(m->status & MCI_STATUS_VAL))
264 continue;
265 if (!(m->status & MCI_STATUS_UC))
266 continue;
267 if (!final || memcmp(m, final, sizeof(struct mce)))
268 print_mce(m);
269 }
270 if (final)
271 print_mce(final);
272 if (cpu_missing)
273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
274 print_mce_tail();
275 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0)
278 panic_timeout = mce_panic_timeout;
279 panic(msg);
280}
281
282/* Support code for software error injection */
283
284static int msr_to_offset(u32 msr)
285{
286 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr)
288 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4)
290 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4)
292 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4)
294 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus);
297 return -1;
298}
299
300/* MSR access wrappers used for error injection */
301static u64 mce_rdmsrl(u32 msr)
302{
303 u64 v;
304 if (__get_cpu_var(injectm).finished) {
305 int offset = msr_to_offset(msr);
306 if (offset < 0)
307 return 0;
308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
309 }
310 rdmsrl(msr, v);
311 return v;
312}
313
314static void mce_wrmsrl(u32 msr, u64 v)
315{
316 if (__get_cpu_var(injectm).finished) {
317 int offset = msr_to_offset(msr);
318 if (offset >= 0)
319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
320 return;
321 }
322 wrmsrl(msr, v);
323}
324
325/*
326 * Simple lockless ring to communicate PFNs from the exception handler with the
327 * process context work function. This is vastly simplified because there's
328 * only a single reader and a single writer.
329 */
330#define MCE_RING_SIZE 16 /* we use one entry less */
331
332struct mce_ring {
333 unsigned short start;
334 unsigned short end;
335 unsigned long ring[MCE_RING_SIZE];
336};
337static DEFINE_PER_CPU(struct mce_ring, mce_ring);
338
339/* Runs with CPU affinity in workqueue */
340static int mce_ring_empty(void)
341{
342 struct mce_ring *r = &__get_cpu_var(mce_ring);
343
344 return r->start == r->end;
345}
346
347static int mce_ring_get(unsigned long *pfn)
348{
349 struct mce_ring *r;
350 int ret = 0;
351
352 *pfn = 0;
353 get_cpu();
354 r = &__get_cpu_var(mce_ring);
355 if (r->start == r->end)
356 goto out;
357 *pfn = r->ring[r->start];
358 r->start = (r->start + 1) % MCE_RING_SIZE;
359 ret = 1;
360out:
361 put_cpu();
362 return ret;
363}
364
365/* Always runs in MCE context with preempt off */
366static int mce_ring_add(unsigned long pfn)
367{
368 struct mce_ring *r = &__get_cpu_var(mce_ring);
369 unsigned next;
370
371 next = (r->end + 1) % MCE_RING_SIZE;
372 if (next == r->start)
373 return -1;
374 r->ring[r->end] = pfn;
375 wmb();
376 r->end = next;
377 return 0;
378}
379
380int mce_available(struct cpuinfo_x86 *c)
381{
382 if (mce_disabled)
383 return 0;
384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
385}
386
387static void mce_schedule_work(void)
388{
389 if (!mce_ring_empty()) {
390 struct work_struct *work = &__get_cpu_var(mce_work);
391 if (!work_pending(work))
392 schedule_work(work);
393 }
394}
395
396/*
397 * Get the address of the instruction at the time of the machine check
398 * error.
399 */
400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
401{
402
403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
404 m->ip = regs->ip;
405 m->cs = regs->cs;
406 } else {
407 m->ip = 0;
408 m->cs = 0;
409 }
410 if (rip_msr)
411 m->ip = mce_rdmsrl(rip_msr);
412}
413
414#ifdef CONFIG_X86_LOCAL_APIC
415/*
416 * Called after interrupts have been reenabled again
417 * when a MCE happened during an interrupts off region
418 * in the kernel.
419 */
420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
421{
422 ack_APIC_irq();
423 exit_idle();
424 irq_enter();
425 mce_notify_irq();
426 mce_schedule_work();
427 irq_exit();
428}
429#endif
430
431static void mce_report_event(struct pt_regs *regs)
432{
433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
434 mce_notify_irq();
435 /*
436 * Triggering the work queue here is just an insurance
437 * policy in case the syscall exit notify handler
438 * doesn't run soon enough or ends up running on the
439 * wrong CPU (can happen when audit sleeps)
440 */
441 mce_schedule_work();
442 return;
443 }
444
445#ifdef CONFIG_X86_LOCAL_APIC
446 /*
447 * Without APIC do not notify. The event will be picked
448 * up eventually.
449 */
450 if (!cpu_has_apic)
451 return;
452
453 /*
454 * When interrupts are disabled we cannot use
455 * kernel services safely. Trigger an self interrupt
456 * through the APIC to instead do the notification
457 * after interrupts are reenabled again.
458 */
459 apic->send_IPI_self(MCE_SELF_VECTOR);
460
461 /*
462 * Wait for idle afterwards again so that we don't leave the
463 * APIC in a non idle state because the normal APIC writes
464 * cannot exclude us.
465 */
466 apic_wait_icr_idle();
467#endif
468}
469
470DEFINE_PER_CPU(unsigned, mce_poll_count);
471
472/*
473 * Poll for corrected events or events that happened before reset.
474 * Those are just logged through /dev/mcelog.
475 *
476 * This is executed in standard interrupt context.
477 *
478 * Note: spec recommends to panic for fatal unsignalled
479 * errors here. However this would be quite problematic --
480 * we would need to reimplement the Monarch handling and
481 * it would mess up the exclusion between exception handler
 482 * and poll handler -- so we skip this for now.
 483 * These cases should not happen anyways, or only when the CPU
 484 * is already totally confused. In this case it's likely it will
485 * not fully execute the machine check handler either.
486 */
487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
488{
489 struct mce m;
490 int i;
491
492 __get_cpu_var(mce_poll_count)++;
493
494 mce_setup(&m);
495
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b))
499 continue;
500
501 m.misc = 0;
502 m.addr = 0;
503 m.bank = i;
504 m.tsc = 0;
505
506 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
508 if (!(m.status & MCI_STATUS_VAL))
509 continue;
510
511 /*
512 * Uncorrected or signalled events are handled by the exception
513 * handler when it is enabled, so don't process those here.
514 *
515 * TBD do the same check for MCI_STATUS_EN here?
516 */
517 if (!(flags & MCP_UC) &&
518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
519 continue;
520
521 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
523 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
525
526 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0;
528 /*
529 * Don't get the IP here because it's unlikely to
530 * have anything to do with the actual error location.
531 */
532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
533 mce_log(&m);
534 add_taint(TAINT_MACHINE_CHECK);
535 }
536
537 /*
538 * Clear state for this bank.
539 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
541 }
542
543 /*
544 * Don't clear MCG_STATUS here because it's only defined for
545 * exceptions.
546 */
547
548 sync_core();
549}
550EXPORT_SYMBOL_GPL(machine_check_poll);
551
552/*
553 * Do a quick check if any of the events requires a panic.
554 * This decides if we keep the events around or clear them.
555 */
556static int mce_no_way_out(struct mce *m, char **msg)
557{
558 int i;
559
560 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1;
564 }
565 return 0;
566}
567
568/*
569 * Variable to establish order between CPUs while scanning.
570 * Each CPU spins initially until executing is equal its number.
571 */
572static atomic_t mce_executing;
573
574/*
575 * Defines order of CPUs on entry. First CPU becomes Monarch.
576 */
577static atomic_t mce_callin;
578
579/*
580 * Check if a timeout waiting for other CPUs happened.
581 */
582static int mce_timed_out(u64 *t)
583{
584 /*
585 * The others already did panic for some reason.
586 * Bail out like in a timeout.
587 * rmb() to tell the compiler that system_state
588 * might have been modified by someone else.
589 */
590 rmb();
591 if (atomic_read(&mce_paniced))
592 wait_for_panic();
593 if (!monarch_timeout)
594 goto out;
595 if ((s64)*t < SPINUNIT) {
596 /* CHECKME: Make panic default for 1 too? */
597 if (tolerant < 1)
598 mce_panic("Timeout synchronizing machine check over CPUs",
599 NULL, NULL);
600 cpu_missing = 1;
601 return 1;
602 }
603 *t -= SPINUNIT;
604out:
605 touch_nmi_watchdog();
606 return 0;
607}
608
609/*
610 * The Monarch's reign. The Monarch is the CPU who entered
611 * the machine check handler first. It waits for the others to
612 * raise the exception too and then grades them. When any
613 * error is fatal panic. Only then let the others continue.
614 *
615 * The other CPUs entering the MCE handler will be controlled by the
616 * Monarch. They are called Subjects.
617 *
618 * This way we prevent any potential data corruption in a unrecoverable case
619 * and also makes sure always all CPU's errors are examined.
620 *
621 * Also this detects the case of an machine check event coming from outer
622 * space (not detected by any CPUs) In this case some external agent wants
623 * us to shut down, so panic too.
624 *
625 * The other CPUs might still decide to panic if the handler happens
626 * in a unrecoverable place, but in this case the system is in a semi-stable
627 * state and won't corrupt anything by itself. It's ok to let the others
628 * continue for a bit first.
629 *
630 * All the spin loops have timeouts; when a timeout happens a CPU
631 * typically elects itself to be Monarch.
632 */
633static void mce_reign(void)
634{
635 int cpu;
636 struct mce *m = NULL;
637 int global_worst = 0;
638 char *msg = NULL;
639 char *nmsg = NULL;
640
641 /*
642 * This CPU is the Monarch and the other CPUs have run
643 * through their handlers.
644 * Grade the severity of the errors of all the CPUs.
645 */
646 for_each_possible_cpu(cpu) {
647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
648 &nmsg);
649 if (severity > global_worst) {
650 msg = nmsg;
651 global_worst = severity;
652 m = &per_cpu(mces_seen, cpu);
653 }
654 }
655
656 /*
657 * Cannot recover? Panic here then.
658 * This dumps all the mces in the log buffer and stops the
659 * other CPUs.
660 */
661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
662 mce_panic("Fatal Machine check", m, msg);
663
664 /*
665 * For UC somewhere we let the CPU who detects it handle it.
666 * Also must let continue the others, otherwise the handling
667 * CPU could deadlock on a lock.
668 */
669
670 /*
671 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic.
673 */
674 if (!m && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL);
676
677 /*
678 * Now clear all the mces_seen so that they don't reappear on
679 * the next mce.
680 */
681 for_each_possible_cpu(cpu)
682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
683}
684
685static atomic_t global_nwo;
686
687/*
688 * Start of Monarch synchronization. This waits until all CPUs have
689 * entered the exception handler and then determines if any of them
690 * saw a fatal event that requires panic. Then it executes them
691 * in the entry order.
692 * TBD double check parallel CPU hotunplug
693 */
694static int mce_start(int no_way_out, int *order)
695{
696 int nwo;
697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699
700 if (!timeout) {
701 *order = -1;
702 return no_way_out;
703 }
704
705 atomic_add(no_way_out, &global_nwo);
706
707 /*
708 * Wait for everyone.
709 */
710 while (atomic_read(&mce_callin) != cpus) {
711 if (mce_timed_out(&timeout)) {
712 atomic_set(&global_nwo, 0);
713 *order = -1;
714 return no_way_out;
715 }
716 ndelay(SPINUNIT);
717 }
718
719 /*
720 * Cache the global no_way_out state.
721 */
722 nwo = atomic_read(&global_nwo);
723
724 /*
725 * Monarch starts executing now, the others wait.
726 */
727 if (*order == 1) {
728 atomic_set(&mce_executing, 1);
729 return nwo;
730 }
731
732 /*
733 * Now start the scanning loop one by one
734 * in the original callin order.
735 * This way when there are any shared banks it will
736 * be only seen by one CPU before cleared, avoiding duplicates.
737 */
738 while (atomic_read(&mce_executing) < *order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 *order = -1;
742 return no_way_out;
743 }
744 ndelay(SPINUNIT);
745 }
746 return nwo;
747}
748
749/*
750 * Synchronize between CPUs after main scanning loop.
751 * This invokes the bulk of the Monarch processing.
752 */
753static int mce_end(int order)
754{
755 int ret = -1;
756 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
757
758 if (!timeout)
759 goto reset;
760 if (order < 0)
761 goto reset;
762
763 /*
764 * Allow others to run.
765 */
766 atomic_inc(&mce_executing);
767
768 if (order == 1) {
769 /* CHECKME: Can this race with a parallel hotplug? */
770 int cpus = num_online_cpus();
771
772 /*
773 * Monarch: Wait for everyone to go through their scanning
774 * loops.
775 */
776 while (atomic_read(&mce_executing) <= cpus) {
777 if (mce_timed_out(&timeout))
778 goto reset;
779 ndelay(SPINUNIT);
780 }
781
782 mce_reign();
783 barrier();
784 ret = 0;
785 } else {
786 /*
787 * Subject: Wait for Monarch to finish.
788 */
789 while (atomic_read(&mce_executing) != 0) {
790 if (mce_timed_out(&timeout))
791 goto reset;
792 ndelay(SPINUNIT);
793 }
794
795 /*
796 * Don't reset anything. That's done by the Monarch.
797 */
798 return 0;
799 }
800
801 /*
802 * Reset all global state.
803 */
804reset:
805 atomic_set(&global_nwo, 0);
806 atomic_set(&mce_callin, 0);
807 barrier();
808
809 /*
810 * Let others run again.
811 */
812 atomic_set(&mce_executing, 0);
813 return ret;
814}
815
816/*
817 * Check if the address reported by the CPU is in a format we can parse.
818 * It would be possible to add code for most other cases, but all would
819 * be somewhat complicated (e.g. segment offset would require an instruction
820 * parser). So only support physical addresses upto page granuality for now.
821 */
822static int mce_usable_address(struct mce *m)
823{
824 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
825 return 0;
826 if ((m->misc & 0x3f) > PAGE_SHIFT)
827 return 0;
828 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
829 return 0;
830 return 1;
831}
832
833static void mce_clear_state(unsigned long *toclear)
834{
835 int i;
836
837 for (i = 0; i < banks; i++) {
838 if (test_bit(i, toclear))
839 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
840 }
841}
842
843/*
844 * The actual machine check handler. This only handles real
845 * exceptions when something got corrupted coming in through int 18.
846 *
847 * This is executed in NMI context not subject to normal locking rules. This
848 * implies that most kernel services cannot be safely used. Don't even
849 * think about putting a printk in there!
850 *
851 * On Intel systems this is entered on all CPUs in parallel through
852 * MCE broadcast. However some CPUs might be broken beyond repair,
853 * so be always careful when synchronizing with others.
854 */
855void do_machine_check(struct pt_regs *regs, long error_code)
856{
857 struct mce m, *final;
858 int i;
859 int worst = 0;
860 int severity;
861 /*
862 * Establish sequential order between the CPUs entering the machine
863 * check handler.
864 */
865 int order;
866
867 /*
868 * If no_way_out gets set, there is no safe way to recover from this
869 * MCE. If tolerant is cranked up, we'll try anyway.
870 */
871 int no_way_out = 0;
872 /*
873 * If kill_it gets set, there might be a way to recover from this
874 * error.
875 */
876 int kill_it = 0;
877 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
878 char *msg = "Unknown";
879
880 atomic_inc(&mce_entry);
881
882 __get_cpu_var(mce_exception_count)++;
883
884 if (notify_die(DIE_NMI, "machine check", regs, error_code,
885 18, SIGKILL) == NOTIFY_STOP)
886 goto out;
887 if (!banks)
888 goto out;
889
890 order = atomic_add_return(1, &mce_callin);
891 mce_setup(&m);
892
893 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
894 no_way_out = mce_no_way_out(&m, &msg);
895
896 final = &__get_cpu_var(mces_seen);
897 *final = m;
898
899 barrier();
900
901 /*
902 * When no restart IP must always kill or panic.
903 */
904 if (!(m.mcgstatus & MCG_STATUS_RIPV))
905 kill_it = 1;
906
907 /*
908 * Go through all the banks in exclusion of the other CPUs.
909 * This way we don't report duplicated events on shared banks
910 * because the first one to see it will clear it.
911 */
912 no_way_out = mce_start(no_way_out, &order);
913 for (i = 0; i < banks; i++) {
914 __clear_bit(i, toclear);
915 if (!bank[i])
916 continue;
917
918 m.misc = 0;
919 m.addr = 0;
920 m.bank = i;
921
922 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
923 if ((m.status & MCI_STATUS_VAL) == 0)
924 continue;
925
926 /*
927 * Non uncorrected or non signaled errors are handled by
928 * machine_check_poll. Leave them alone, unless this panics.
929 */
930 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
931 !no_way_out)
932 continue;
933
934 /*
935 * Set taint even when machine check was not enabled.
936 */
937 add_taint(TAINT_MACHINE_CHECK);
938
939 severity = mce_severity(&m, tolerant, NULL);
940
941 /*
942 * When machine check was for corrected handler don't touch,
943 * unless we're panicing.
944 */
945 if (severity == MCE_KEEP_SEVERITY && !no_way_out)
946 continue;
947 __set_bit(i, toclear);
948 if (severity == MCE_NO_SEVERITY) {
949 /*
950 * Machine check event was not enabled. Clear, but
951 * ignore.
952 */
953 continue;
954 }
955
956 /*
957 * Kill on action required.
958 */
959 if (severity == MCE_AR_SEVERITY)
960 kill_it = 1;
961
962 if (m.status & MCI_STATUS_MISCV)
963 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
964 if (m.status & MCI_STATUS_ADDRV)
965 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
966
967 /*
968 * Action optional error. Queue address for later processing.
969 * When the ring overflows we just ignore the AO error.
970 * RED-PEN add some logging mechanism when
971 * usable_address or mce_add_ring fails.
972 * RED-PEN don't ignore overflow for tolerant == 0
973 */
974 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
975 mce_ring_add(m.addr >> PAGE_SHIFT);
976
977 mce_get_rip(&m, regs);
978 mce_log(&m);
979
980 if (severity > worst) {
981 *final = m;
982 worst = severity;
983 }
984 }
985
986 if (!no_way_out)
987 mce_clear_state(toclear);
988
989 /*
990 * Do most of the synchronization with other CPUs.
991 * When there's any problem use only local no_way_out state.
992 */
993 if (mce_end(order) < 0)
994 no_way_out = worst >= MCE_PANIC_SEVERITY;
995
996 /*
997 * If we have decided that we just CAN'T continue, and the user
998 * has not set tolerant to an insane level, give up and die.
999 *
1000 * This is mainly used in the case when the system doesn't
1001 * support MCE broadcasting or it has been disabled.
1002 */
1003 if (no_way_out && tolerant < 3)
1004 mce_panic("Fatal machine check on current CPU", final, msg);
1005
1006 /*
1007 * If the error seems to be unrecoverable, something should be
1008 * done. Try to kill as little as possible. If we can kill just
1009 * one task, do that. If the user has set the tolerance very
1010 * high, don't try to do anything at all.
1011 */
1012
1013 if (kill_it && tolerant < 3)
1014 force_sig(SIGBUS, current);
1015
1016 /* notify userspace ASAP */
1017 set_thread_flag(TIF_MCE_NOTIFY);
1018
1019 if (worst > 0)
1020 mce_report_event(regs);
1021 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1022out:
1023 atomic_dec(&mce_entry);
1024 sync_core();
1025}
1026EXPORT_SYMBOL_GPL(do_machine_check);
1027
1028/* dummy to break dependency. actual code is in mm/memory-failure.c */
1029void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1030{
1031 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1032}
1033
1034/*
1035 * Called after mce notification in process context. This code
1036 * is allowed to sleep. Call the high level VM handler to process
1037 * any corrupted pages.
1038 * Assume that the work queue code only calls this one at a time
1039 * per CPU.
1040 * Note we don't disable preemption, so this code might run on the wrong
1041 * CPU. In this case the event is picked up by the scheduled work queue.
1042 * This is merely a fast path to expedite processing in some common
1043 * cases.
1044 */
1045void mce_notify_process(void)
1046{
1047 unsigned long pfn;
1048 mce_notify_irq();
1049 while (mce_ring_get(&pfn))
1050 memory_failure(pfn, MCE_VECTOR);
1051}
1052
1053static void mce_process_work(struct work_struct *dummy)
1054{
1055 mce_notify_process();
1056}
1057
1058#ifdef CONFIG_X86_MCE_INTEL
1059/***
1060 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1061 * @cpu: The CPU on which the event occurred.
1062 * @status: Event status information
1063 *
1064 * This function should be called by the thermal interrupt after the
1065 * event has been processed and the decision was made to log the event
1066 * further.
1067 *
1068 * The status parameter will be saved to the 'status' field of 'struct mce'
1069 * and historically has been the register value of the
1070 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1071 */
1072void mce_log_therm_throt_event(__u64 status)
1073{
1074 struct mce m;
1075
1076 mce_setup(&m);
1077 m.bank = MCE_THERMAL_BANK;
1078 m.status = status;
1079 mce_log(&m);
1080}
1081#endif /* CONFIG_X86_MCE_INTEL */
1082
1083/*
1084 * Periodic polling timer for "silent" machine check errors. If the
1085 * poller finds an MCE, poll 2x faster. When the poller finds no more
1086 * errors, poll 2x slower (up to check_interval seconds).
1087 */
1088static int check_interval = 5 * 60; /* 5 minutes */
1089
1090static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
1091static DEFINE_PER_CPU(struct timer_list, mce_timer);
1092
1093static void mcheck_timer(unsigned long data)
1094{
1095 struct timer_list *t = &per_cpu(mce_timer, data);
1096 int *n;
1097
1098 WARN_ON(smp_processor_id() != data);
1099
1100 if (mce_available(&current_cpu_data)) {
1101 machine_check_poll(MCP_TIMESTAMP,
1102 &__get_cpu_var(mce_poll_banks));
1103 }
1104
1105 /*
1106 * Alert userspace if needed. If we logged an MCE, reduce the
1107 * polling interval, otherwise increase the polling interval.
1108 */
1109 n = &__get_cpu_var(next_interval);
1110 if (mce_notify_irq())
1111 *n = max(*n/2, HZ/100);
1112 else
1113 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1114
1115 t->expires = jiffies + *n;
1116 add_timer(t);
1117}
1118
1119static void mce_do_trigger(struct work_struct *work)
1120{
1121 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
1122}
1123
1124static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1125
1126/*
1127 * Notify the user(s) about new machine check events.
1128 * Can be called from interrupt context, but not from machine check/NMI
1129 * context.
1130 */
1131int mce_notify_irq(void)
1132{
1133 /* Not more than two messages every minute */
1134 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1135
1136 clear_thread_flag(TIF_MCE_NOTIFY);
1137
1138 if (test_and_clear_bit(0, &notify_user)) {
1139 wake_up_interruptible(&mce_wait);
1140
1141 /*
1142 * There is no risk of missing notifications because
1143 * work_pending is always cleared before the function is
1144 * executed.
1145 */
1146 if (trigger[0] && !work_pending(&mce_trigger_work))
1147 schedule_work(&mce_trigger_work);
1148
1149 if (__ratelimit(&ratelimit))
1150 printk(KERN_INFO "Machine check events logged\n");
1151
1152 return 1;
1153 }
1154 return 0;
1155}
1156EXPORT_SYMBOL_GPL(mce_notify_irq);
1157
1158/*
1159 * Initialize Machine Checks for a CPU.
1160 */
1161static int mce_cap_init(void)
1162{
1163 unsigned b;
1164 u64 cap;
1165
1166 rdmsrl(MSR_IA32_MCG_CAP, cap);
1167
1168 b = cap & MCG_BANKCNT_MASK;
1169 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1170
1171 if (b > MAX_NR_BANKS) {
1172 printk(KERN_WARNING
1173 "MCE: Using only %u machine check banks out of %u\n",
1174 MAX_NR_BANKS, b);
1175 b = MAX_NR_BANKS;
1176 }
1177
1178 /* Don't support asymmetric configurations today */
1179 WARN_ON(banks != 0 && b != banks);
1180 banks = b;
1181 if (!bank) {
1182 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1183 if (!bank)
1184 return -ENOMEM;
1185 memset(bank, 0xff, banks * sizeof(u64));
1186 }
1187
1188 /* Use accurate RIP reporting if available. */
1189 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1190 rip_msr = MSR_IA32_MCG_EIP;
1191
1192 if (cap & MCG_SER_P)
1193 mce_ser = 1;
1194
1195 return 0;
1196}
1197
1198static void mce_init(void)
1199{
1200 mce_banks_t all_banks;
1201 u64 cap;
1202 int i;
1203
1204 /*
1205 * Log the machine checks left over from the previous reset.
1206 */
1207 bitmap_fill(all_banks, MAX_NR_BANKS);
1208 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1209
1210 set_in_cr4(X86_CR4_MCE);
1211
1212 rdmsrl(MSR_IA32_MCG_CAP, cap);
1213 if (cap & MCG_CTL_P)
1214 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1215
1216 for (i = 0; i < banks; i++) {
1217 if (skip_bank_init(i))
1218 continue;
1219 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1220 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1221 }
1222}
1223
1224/* Add per CPU specific workarounds here */
1225static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1226{
1227 /* This should be disabled by the BIOS, but isn't always */
1228 if (c->x86_vendor == X86_VENDOR_AMD) {
1229 if (c->x86 == 15 && banks > 4) {
1230 /*
1231 * disable GART TBL walk error reporting, which
1232 * trips off incorrectly with the IOMMU & 3ware
1233 * & Cerberus:
1234 */
1235 clear_bit(10, (unsigned long *)&bank[4]);
1236 }
1237 if (c->x86 <= 17 && mce_bootlog < 0) {
1238 /*
1239 * Lots of broken BIOS around that don't clear them
1240 * by default and leave crap in there. Don't log:
1241 */
1242 mce_bootlog = 0;
1243 }
1244 /*
1245 * Various K7s with broken bank 0 around. Always disable
1246 * by default.
1247 */
1248 if (c->x86 == 6)
1249 bank[0] = 0;
1250 }
1251
1252 if (c->x86_vendor == X86_VENDOR_INTEL) {
1253 /*
1254 * SDM documents that on family 6 bank 0 should not be written
1255 * because it aliases to another special BIOS controlled
1256 * register.
1257 * But it's not aliased anymore on model 0x1a+
1258 * Don't ignore bank 0 completely because there could be a
1259 * valid event later, merely don't write CTL0.
1260 */
1261
1262 if (c->x86 == 6 && c->x86_model < 0x1A)
1263 __set_bit(0, &dont_init_banks);
1264
1265 /*
1266 * All newer Intel systems support MCE broadcasting. Enable
1267 * synchronization with a one second timeout.
1268 */
1269 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1270 monarch_timeout < 0)
1271 monarch_timeout = USEC_PER_SEC;
1272 }
1273 if (monarch_timeout < 0)
1274 monarch_timeout = 0;
1275 if (mce_bootlog != 0)
1276 mce_panic_timeout = 30;
1277}
1278
1279static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1280{
1281 if (c->x86 != 5)
1282 return;
1283 switch (c->x86_vendor) {
1284 case X86_VENDOR_INTEL:
1285 if (mce_p5_enabled())
1286 intel_p5_mcheck_init(c);
1287 break;
1288 case X86_VENDOR_CENTAUR:
1289 winchip_mcheck_init(c);
1290 break;
1291 }
1292}
1293
1294static void mce_cpu_features(struct cpuinfo_x86 *c)
1295{
1296 switch (c->x86_vendor) {
1297 case X86_VENDOR_INTEL:
1298 mce_intel_feature_init(c);
1299 break;
1300 case X86_VENDOR_AMD:
1301 mce_amd_feature_init(c);
1302 break;
1303 default:
1304 break;
1305 }
1306}
1307
1308static void mce_init_timer(void)
1309{
1310 struct timer_list *t = &__get_cpu_var(mce_timer);
1311 int *n = &__get_cpu_var(next_interval);
1312
1313 if (mce_ignore_ce)
1314 return;
1315
1316 *n = check_interval * HZ;
1317 if (!*n)
1318 return;
1319 setup_timer(t, mcheck_timer, smp_processor_id());
1320 t->expires = round_jiffies(jiffies + *n);
1321 add_timer(t);
1322}
1323
1324/*
1325 * Called for each booted CPU to set up machine checks.
1326 * Must be called with preempt off:
1327 */
1328void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1329{
1330 if (mce_disabled)
1331 return;
1332
1333 mce_ancient_init(c);
1334
1335 if (!mce_available(c))
1336 return;
1337
1338 if (mce_cap_init() < 0) {
1339 mce_disabled = 1;
1340 return;
1341 }
1342 mce_cpu_quirks(c);
1343
1344 machine_check_vector = do_machine_check;
1345
1346 mce_init();
1347 mce_cpu_features(c);
1348 mce_init_timer();
1349 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1350}
1351
1352/*
1353 * Character device to read and clear the MCE log.
1354 */
1355
1356static DEFINE_SPINLOCK(mce_state_lock);
1357static int open_count; /* #times opened */
1358static int open_exclu; /* already open exclusive? */
1359
1360static int mce_open(struct inode *inode, struct file *file)
1361{
1362 spin_lock(&mce_state_lock);
1363
1364 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1365 spin_unlock(&mce_state_lock);
1366
1367 return -EBUSY;
1368 }
1369
1370 if (file->f_flags & O_EXCL)
1371 open_exclu = 1;
1372 open_count++;
1373
1374 spin_unlock(&mce_state_lock);
1375
1376 return nonseekable_open(inode, file);
1377}
1378
1379static int mce_release(struct inode *inode, struct file *file)
1380{
1381 spin_lock(&mce_state_lock);
1382
1383 open_count--;
1384 open_exclu = 0;
1385
1386 spin_unlock(&mce_state_lock);
1387
1388 return 0;
1389}
1390
1391static void collect_tscs(void *data)
1392{
1393 unsigned long *cpu_tsc = (unsigned long *)data;
1394
1395 rdtscll(cpu_tsc[smp_processor_id()]);
1396}
1397
1398static DEFINE_MUTEX(mce_read_mutex);
1399
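/*
 * Overview of the read path below: entries already published through
 * mcelog.next are copied out and the log is reset; after a
 * synchronize_sched(), any entries that were still being written
 * (finished set, with a TSC older than the per-CPU timestamps gathered
 * by collect_tscs()) are copied out in a second pass.
 */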
1400static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1401 loff_t *off)
1402{
1403 char __user *buf = ubuf;
1404 unsigned long *cpu_tsc;
1405 unsigned prev, next;
1406 int i, err;
1407
1408 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1409 if (!cpu_tsc)
1410 return -ENOMEM;
1411
1412 mutex_lock(&mce_read_mutex);
1413 next = rcu_dereference(mcelog.next);
1414
1415 /* Only supports full reads right now */
1416 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1417 mutex_unlock(&mce_read_mutex);
1418 kfree(cpu_tsc);
1419
1420 return -EINVAL;
1421 }
1422
1423 err = 0;
1424 prev = 0;
1425 do {
1426 for (i = prev; i < next; i++) {
1427 unsigned long start = jiffies;
1428
1429 while (!mcelog.entry[i].finished) {
1430 if (time_after_eq(jiffies, start + 2)) {
1431 memset(mcelog.entry + i, 0,
1432 sizeof(struct mce));
1433 goto timeout;
1434 }
1435 cpu_relax();
1436 }
1437 smp_rmb();
1438 err |= copy_to_user(buf, mcelog.entry + i,
1439 sizeof(struct mce));
1440 buf += sizeof(struct mce);
1441timeout:
1442 ;
1443 }
1444
1445 memset(mcelog.entry + prev, 0,
1446 (next - prev) * sizeof(struct mce));
1447 prev = next;
1448 next = cmpxchg(&mcelog.next, prev, 0);
1449 } while (next != prev);
1450
1451 synchronize_sched();
1452
1453 /*
1454 * Collect entries that were still getting written before the
1455 * synchronize.
1456 */
1457 on_each_cpu(collect_tscs, cpu_tsc, 1);
1458
1459 for (i = next; i < MCE_LOG_LEN; i++) {
1460 if (mcelog.entry[i].finished &&
1461 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1462 err |= copy_to_user(buf, mcelog.entry+i,
1463 sizeof(struct mce));
1464 smp_rmb();
1465 buf += sizeof(struct mce);
1466 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1467 }
1468 }
1469 mutex_unlock(&mce_read_mutex);
1470 kfree(cpu_tsc);
1471
1472 return err ? -EFAULT : buf - ubuf;
1473}
1474
1475static unsigned int mce_poll(struct file *file, poll_table *wait)
1476{
1477 poll_wait(file, &mce_wait, wait);
1478 if (rcu_dereference(mcelog.next))
1479 return POLLIN | POLLRDNORM;
1480 return 0;
1481}
1482
1483static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1484{
1485 int __user *p = (int __user *)arg;
1486
1487 if (!capable(CAP_SYS_ADMIN))
1488 return -EPERM;
1489
1490 switch (cmd) {
1491 case MCE_GET_RECORD_LEN:
1492 return put_user(sizeof(struct mce), p);
1493 case MCE_GET_LOG_LEN:
1494 return put_user(MCE_LOG_LEN, p);
1495 case MCE_GETCLEAR_FLAGS: {
1496 unsigned flags;
1497
1498 do {
1499 flags = mcelog.flags;
1500 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1501
1502 return put_user(flags, p);
1503 }
1504 default:
1505 return -ENOTTY;
1506 }
1507}
1508
1509/* Modified in mce-inject.c, so not static or const */
1510struct file_operations mce_chrdev_ops = {
1511 .open = mce_open,
1512 .release = mce_release,
1513 .read = mce_read,
1514 .poll = mce_poll,
1515 .unlocked_ioctl = mce_ioctl,
1516};
1517EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1518
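/*
 * The misc device below is what userspace typically sees as
 * /dev/mcelog (assuming the usual udev naming for misc minors);
 * mcelog(8) reads the binary struct mce records from it for decoding.
 */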
1519static struct miscdevice mce_log_device = {
1520 MISC_MCELOG_MINOR,
1521 "mcelog",
1522 &mce_chrdev_ops,
1523};
1524
1525/*
1526 * mce=off Disables machine check
1527 * mce=no_cmci Disables CMCI
1528 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1529 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1530 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1531 * monarchtimeout is how long to wait for other CPUs on machine
1532 * check, or 0 to not wait
1533 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1534 * mce=nobootlog Don't log MCEs from before booting.
1535 */
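/*
 * Illustrative examples of the syntax above (the values are made up):
 * "mce=2,1000000" sets tolerant=2 with a one second (1000000 us)
 * monarch timeout, while "mce=off" disables machine checks entirely.
 */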
1536static int __init mcheck_enable(char *str)
1537{
1538 if (*str == 0)
1539 enable_p5_mce();
1540 if (*str == '=')
1541 str++;
1542 if (!strcmp(str, "off"))
1543 mce_disabled = 1;
1544 else if (!strcmp(str, "no_cmci"))
1545 mce_cmci_disabled = 1;
1546 else if (!strcmp(str, "dont_log_ce"))
1547 mce_dont_log_ce = 1;
1548 else if (!strcmp(str, "ignore_ce"))
1549 mce_ignore_ce = 1;
1550 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1551 mce_bootlog = (str[0] == 'b');
1552 else if (isdigit(str[0])) {
1553 get_option(&str, &tolerant);
1554 if (*str == ',') {
1555 ++str;
1556 get_option(&str, &monarch_timeout);
1557 }
1558 } else {
1559 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1560 str);
1561 return 0;
1562 }
1563 return 1;
1564}
1565__setup("mce", mcheck_enable);
1566
1567/*
1568 * Sysfs support
1569 */
1570
1571/*
1572 * Disable machine checks on suspend and shutdown. We can't really handle
1573 * them later.
1574 */
1575static int mce_disable(void)
1576{
1577 int i;
1578
1579 for (i = 0; i < banks; i++) {
1580 if (!skip_bank_init(i))
1581 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1582 }
1583 return 0;
1584}
1585
1586static int mce_suspend(struct sys_device *dev, pm_message_t state)
1587{
1588 return mce_disable();
1589}
1590
1591static int mce_shutdown(struct sys_device *dev)
1592{
1593 return mce_disable();
1594}
1595
1596/*
1597 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1598 * Only one CPU is active at this time, the others get re-added later using
1599 * CPU hotplug:
1600 */
1601static int mce_resume(struct sys_device *dev)
1602{
1603 mce_init();
1604 mce_cpu_features(&current_cpu_data);
1605
1606 return 0;
1607}
1608
1609static void mce_cpu_restart(void *data)
1610{
1611 del_timer_sync(&__get_cpu_var(mce_timer));
1612 if (mce_available(&current_cpu_data))
1613 mce_init();
1614 mce_init_timer();
1615}
1616
1617/* Reinit MCEs after user configuration changes */
1618static void mce_restart(void)
1619{
1620 on_each_cpu(mce_cpu_restart, NULL, 1);
1621}
1622
1623static struct sysdev_class mce_sysclass = {
1624 .suspend = mce_suspend,
1625 .shutdown = mce_shutdown,
1626 .resume = mce_resume,
1627 .name = "machinecheck",
1628};
1629
1630DEFINE_PER_CPU(struct sys_device, mce_dev);
1631
1632__cpuinitdata
1633void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1634
1635static struct sysdev_attribute *bank_attrs;
1636
1637static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1638 char *buf)
1639{
1640 u64 b = bank[attr - bank_attrs];
1641
1642 return sprintf(buf, "%llx\n", b);
1643}
1644
1645static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1646 const char *buf, size_t size)
1647{
1648 u64 new;
1649
1650 if (strict_strtoull(buf, 0, &new) < 0)
1651 return -EINVAL;
1652
1653 bank[attr - bank_attrs] = new;
1654 mce_restart();
1655
1656 return size;
1657}
1658
1659static ssize_t
1660show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1661{
1662 strcpy(buf, trigger);
1663 strcat(buf, "\n");
1664 return strlen(trigger) + 1;
1665}
1666
1667static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1668 const char *buf, size_t siz)
1669{
1670 char *p;
1671 int len;
1672
1673 strncpy(trigger, buf, sizeof(trigger));
1674 trigger[sizeof(trigger)-1] = 0;
1675 len = strlen(trigger);
1676 p = strchr(trigger, '\n');
1677
1678	if (p)
1679 *p = 0;
1680
1681 return len;
1682}
1683
1684static ssize_t store_int_with_restart(struct sys_device *s,
1685 struct sysdev_attribute *attr,
1686 const char *buf, size_t size)
1687{
1688 ssize_t ret = sysdev_store_int(s, attr, buf, size);
1689 mce_restart();
1690 return ret;
1691}
1692
1693static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1694static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1695static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1696
1697static struct sysdev_ext_attribute attr_check_interval = {
1698 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1699 store_int_with_restart),
1700 &check_interval
1701};
1702
1703static struct sysdev_attribute *mce_attrs[] = {
1704 &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1705 &attr_monarch_timeout.attr,
1706 NULL
1707};
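/*
 * These attributes, plus the per-bank "bankN" files set up in
 * mce_init_banks(), are exposed through the per-CPU machinecheck
 * sysdev; with the usual sysdev layout that is something like
 * /sys/devices/system/machinecheck/machinecheck0/tolerant. Writing a
 * bank file (e.g. "echo 0 > .../bank4" to mask bank 4) or
 * check_interval triggers mce_restart() on all CPUs.
 */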
1708
1709static cpumask_var_t mce_dev_initialized;
1710
1711/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1712static __cpuinit int mce_create_device(unsigned int cpu)
1713{
1714 int err;
1715 int i;
1716
1717 if (!mce_available(&boot_cpu_data))
1718 return -EIO;
1719
1720 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1721 per_cpu(mce_dev, cpu).id = cpu;
1722 per_cpu(mce_dev, cpu).cls = &mce_sysclass;
1723
1724 err = sysdev_register(&per_cpu(mce_dev, cpu));
1725 if (err)
1726 return err;
1727
1728 for (i = 0; mce_attrs[i]; i++) {
1729 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1730 if (err)
1731 goto error;
1732 }
1733 for (i = 0; i < banks; i++) {
1734 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1735 &bank_attrs[i]);
1736 if (err)
1737 goto error2;
1738 }
1739 cpumask_set_cpu(cpu, mce_dev_initialized);
1740
1741 return 0;
1742error2:
1743 while (--i >= 0)
1744 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1745error:
1746 while (--i >= 0)
1747 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1748
1749 sysdev_unregister(&per_cpu(mce_dev, cpu));
1750
1751 return err;
1752}
1753
1754static __cpuinit void mce_remove_device(unsigned int cpu)
1755{
1756 int i;
1757
1758 if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1759 return;
1760
1761 for (i = 0; mce_attrs[i]; i++)
1762 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1763
1764 for (i = 0; i < banks; i++)
1765 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1766
1767 sysdev_unregister(&per_cpu(mce_dev, cpu));
1768 cpumask_clear_cpu(cpu, mce_dev_initialized);
1769}
1770
1771/* Make sure there are no machine checks on offlined CPUs. */
1772static void mce_disable_cpu(void *h)
1773{
1774 unsigned long action = *(unsigned long *)h;
1775 int i;
1776
1777 if (!mce_available(&current_cpu_data))
1778 return;
1779 if (!(action & CPU_TASKS_FROZEN))
1780 cmci_clear();
1781 for (i = 0; i < banks; i++) {
1782 if (!skip_bank_init(i))
1783 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1784 }
1785}
1786
1787static void mce_reenable_cpu(void *h)
1788{
1789 unsigned long action = *(unsigned long *)h;
1790 int i;
1791
1792 if (!mce_available(&current_cpu_data))
1793 return;
1794
1795 if (!(action & CPU_TASKS_FROZEN))
1796 cmci_reenable();
1797 for (i = 0; i < banks; i++) {
1798 if (!skip_bank_init(i))
1799 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1800 }
1801}
1802
1803/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1804static int __cpuinit
1805mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1806{
1807 unsigned int cpu = (unsigned long)hcpu;
1808 struct timer_list *t = &per_cpu(mce_timer, cpu);
1809
1810 switch (action) {
1811 case CPU_ONLINE:
1812 case CPU_ONLINE_FROZEN:
1813 mce_create_device(cpu);
1814 if (threshold_cpu_callback)
1815 threshold_cpu_callback(action, cpu);
1816 break;
1817 case CPU_DEAD:
1818 case CPU_DEAD_FROZEN:
1819 if (threshold_cpu_callback)
1820 threshold_cpu_callback(action, cpu);
1821 mce_remove_device(cpu);
1822 break;
1823 case CPU_DOWN_PREPARE:
1824 case CPU_DOWN_PREPARE_FROZEN:
1825 del_timer_sync(t);
1826 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1827 break;
1828 case CPU_DOWN_FAILED:
1829 case CPU_DOWN_FAILED_FROZEN:
1830		t->expires = round_jiffies(jiffies +
1831				per_cpu(next_interval, cpu));
1832 add_timer_on(t, cpu);
1833 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1834 break;
1835 case CPU_POST_DEAD:
1836 /* intentionally ignoring frozen here */
1837 cmci_rediscover(cpu);
1838 break;
1839 }
1840 return NOTIFY_OK;
1841}
1842
1843static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1844 .notifier_call = mce_cpu_callback,
1845};
1846
1847static __init int mce_init_banks(void)
1848{
1849 int i;
1850
1851 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1852 GFP_KERNEL);
1853 if (!bank_attrs)
1854 return -ENOMEM;
1855
1856 for (i = 0; i < banks; i++) {
1857 struct sysdev_attribute *a = &bank_attrs[i];
1858
1859 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1860 if (!a->attr.name)
1861 goto nomem;
1862
1863 a->attr.mode = 0644;
1864 a->show = show_bank;
1865 a->store = set_bank;
1866 }
1867 return 0;
1868
1869nomem:
1870 while (--i >= 0)
1871 kfree(bank_attrs[i].attr.name);
1872 kfree(bank_attrs);
1873 bank_attrs = NULL;
1874
1875 return -ENOMEM;
1876}
1877
1878static __init int mce_init_device(void)
1879{
1880 int err;
1881 int i = 0;
1882
1883 if (!mce_available(&boot_cpu_data))
1884 return -EIO;
1885
1886	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1887
1888 err = mce_init_banks();
1889 if (err)
1890 return err;
1891
1892 err = sysdev_class_register(&mce_sysclass);
1893 if (err)
1894 return err;
1895
1896 for_each_online_cpu(i) {
1897 err = mce_create_device(i);
1898 if (err)
1899 return err;
1900 }
1901
1902 register_hotcpu_notifier(&mce_cpu_notifier);
1903 misc_register(&mce_log_device);
1904
1905 return err;
1906}
1907
1908device_initcall(mce_init_device);
1909
1910#else /* CONFIG_X86_OLD_MCE: */
1911
1912int nr_mce_banks;
1913EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
1914
1915/* This has to be run for each processor */
1916void mcheck_init(struct cpuinfo_x86 *c)
1917{
1918 if (mce_disabled == 1)
1919 return;
1920
1921 switch (c->x86_vendor) {
1922 case X86_VENDOR_AMD:
1923 amd_mcheck_init(c);
1924 break;
1925
1926 case X86_VENDOR_INTEL:
1927 if (c->x86 == 5)
1928 intel_p5_mcheck_init(c);
1929 if (c->x86 == 6)
1930 intel_p6_mcheck_init(c);
1931 if (c->x86 == 15)
1932 intel_p4_mcheck_init(c);
1933 break;
1934
1935 case X86_VENDOR_CENTAUR:
1936 if (c->x86 == 5)
1937 winchip_mcheck_init(c);
1938 break;
1939
1940 default:
1941 break;
1942 }
1943 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1944}
1945
1946static int __init mcheck_enable(char *str)
1947{
1948 mce_disabled = -1;
1949 return 1;
1950}
1951
1952__setup("mce", mcheck_enable);
1953
1954#endif /* CONFIG_X86_OLD_MCE */
1955
1956/*
1957 * Old style boot options parsing. Only for compatibility.
1958 */
1959static int __init mcheck_disable(char *str)
1960{
1961 mce_disabled = 1;
1962 return 1;
1963}
1964__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index ae9f628838f1..84a552b458c8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -1,14 +1,38 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <asm/mce.h> 2#include <asm/mce.h>
3 3
4#ifdef CONFIG_X86_OLD_MCE
4void amd_mcheck_init(struct cpuinfo_x86 *c); 5void amd_mcheck_init(struct cpuinfo_x86 *c);
5void intel_p4_mcheck_init(struct cpuinfo_x86 *c); 6void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c); 7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8#endif
9
10#ifdef CONFIG_X86_ANCIENT_MCE
11void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c); 12void winchip_mcheck_init(struct cpuinfo_x86 *c);
13extern int mce_p5_enable;
14static inline int mce_p5_enabled(void) { return mce_p5_enable; }
15static inline void enable_p5_mce(void) { mce_p5_enable = 1; }
16#else
17static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
18static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
19static inline int mce_p5_enabled(void) { return 0; }
20static inline void enable_p5_mce(void) { }
21#endif
9 22
10/* Call the installed machine check handler for this CPU setup. */ 23/* Call the installed machine check handler for this CPU setup. */
11extern void (*machine_check_vector)(struct pt_regs *, long error_code); 24extern void (*machine_check_vector)(struct pt_regs *, long error_code);
12 25
26#ifdef CONFIG_X86_OLD_MCE
27
13extern int nr_mce_banks; 28extern int nr_mce_banks;
14 29
30void intel_set_thermal_handler(void);
31
32#else
33
34static inline void intel_set_thermal_handler(void) { }
35
36#endif
37
38void intel_init_thermal(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091d..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static void unexpected_machine_check(struct pt_regs *regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled == 1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86 == 5)
46 intel_p5_mcheck_init(c);
47 if (c->x86 == 6)
48 intel_p6_mcheck_init(c);
49 if (c->x86 == 15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86 == 5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static int __init mcheck_disable(char *str)
64{
65 mce_disabled = 1;
66 return 1;
67}
68
69static int __init mcheck_enable(char *str)
70{
71 mce_disabled = -1;
72 return 1;
73}
74
75__setup("nomce", mcheck_disable);
76__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 289cc4815028..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1188 +0,0 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/smp_lock.h>
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
21#include <linux/capability.h>
22#include <linux/cpu.h>
23#include <linux/percpu.h>
24#include <linux/poll.h>
25#include <linux/thread_info.h>
26#include <linux/ctype.h>
27#include <linux/kmod.h>
28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/mce.h>
35#include <asm/uaccess.h>
36#include <asm/smp.h>
37#include <asm/idle.h>
38
39#define MISC_MCELOG_MINOR 227
40
41atomic_t mce_entry;
42
43static int mce_dont_init;
44
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
52static int tolerant = 1;
53static int banks;
54static u64 *bank;
55static unsigned long notify_user;
56static int rip_msr;
57static int mce_bootlog = -1;
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
62
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
78/*
79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */
83
84static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN,
87};
88
89void mce_log(struct mce *mce)
90{
91 unsigned next, entry;
92 atomic_inc(&mce_events);
93 mce->finished = 0;
94 wmb();
95 for (;;) {
96 entry = rcu_dereference(mcelog.next);
97 for (;;) {
98 /* When the buffer fills up discard new entries. Assume
99 that the earlier errors are the more interesting. */
100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return;
103 }
104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) {
106 entry++;
107 continue;
108 }
109 break;
110 }
111 smp_rmb();
112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break;
115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb();
118 mcelog.entry[entry].finished = 1;
119 wmb();
120
121 set_bit(0, &notify_user);
122}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip);
137 printk("\n");
138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr)
141 printk("ADDR %llx ", m->addr);
142 if (m->misc)
143 printk("MISC %llx ", m->misc);
144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151{
152 int i;
153
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
157
158 if (time_before(tsc, start))
159 continue;
160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
166 panic(msg);
167}
168
169int mce_available(struct cpuinfo_x86 *c)
170{
171 if (mce_dont_init)
172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174}
175
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip;
180 m->cs = regs->cs;
181 } else {
182 m->ip = 0;
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0;
190 }
191}
192
193/*
194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245 }
246
247 /*
248 * Clear state for this bank.
249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 }
252
253 /*
254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions.
256 */
257}
258
259/*
260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18.
262 *
263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there!
266 */
267void do_machine_check(struct pt_regs * regs, long error_code)
268{
269 struct mce m, panicm;
270 u64 mcestart = 0;
271 int i;
272 int panicm_found = 0;
273 /*
274 * If no_way_out gets set, there is no safe way to recover from this
275 * MCE. If tolerant is cranked up, we'll try anyway.
276 */
277 int no_way_out = 0;
278 /*
279 * If kill_it gets set, there might be a way to recover from this
280 * error.
281 */
282 int kill_it = 0;
283 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
284
285 atomic_inc(&mce_entry);
286
287 if (notify_die(DIE_NMI, "machine check", regs, error_code,
288 18, SIGKILL) == NOTIFY_STOP)
289 goto out2;
290 if (!banks)
291 goto out2;
292
293 mce_setup(&m);
294
295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296 /* if the restart IP is not valid, we're done for */
297 if (!(m.mcgstatus & MCG_STATUS_RIPV))
298 no_way_out = 1;
299
300 rdtscll(mcestart);
301 barrier();
302
303 for (i = 0; i < banks; i++) {
304 __clear_bit(i, toclear);
305 if (!bank[i])
306 continue;
307
308 m.misc = 0;
309 m.addr = 0;
310 m.bank = i;
311
312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
313 if ((m.status & MCI_STATUS_VAL) == 0)
314 continue;
315
316 /*
317 * Non uncorrected errors are handled by machine_check_poll
318 * Leave them alone.
319 */
320 if ((m.status & MCI_STATUS_UC) == 0)
321 continue;
322
323 /*
324 * Set taint even when machine check was not enabled.
325 */
326 add_taint(TAINT_MACHINE_CHECK);
327
328 __set_bit(i, toclear);
329
330 if (m.status & MCI_STATUS_EN) {
331 /* if PCC was set, there's no way out */
332 no_way_out |= !!(m.status & MCI_STATUS_PCC);
333 /*
334 * If this error was uncorrectable and there was
335 * an overflow, we're in trouble. If no overflow,
336 * we might get away with just killing a task.
337 */
338 if (m.status & MCI_STATUS_UC) {
339 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
340 no_way_out = 1;
341 kill_it = 1;
342 }
343 } else {
344 /*
345 * Machine check event was not enabled. Clear, but
346 * ignore.
347 */
348 continue;
349 }
350
351 if (m.status & MCI_STATUS_MISCV)
352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
353 if (m.status & MCI_STATUS_ADDRV)
354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
355
356 mce_get_rip(&m, regs);
357 mce_log(&m);
358
359 /* Did this bank cause the exception? */
360 /* Assume that the bank with uncorrectable errors did it,
361 and that there is only a single one. */
362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
363 panicm = m;
364 panicm_found = 1;
365 }
366 }
367
368 /* If we didn't find an uncorrectable error, pick
369 the last one (shouldn't happen, just being safe). */
370 if (!panicm_found)
371 panicm = m;
372
373 /*
374 * If we have decided that we just CAN'T continue, and the user
375 * has not set tolerant to an insane level, give up and die.
376 */
377 if (no_way_out && tolerant < 3)
378 mce_panic("Machine check", &panicm, mcestart);
379
380 /*
381 * If the error seems to be unrecoverable, something should be
382 * done. Try to kill as little as possible. If we can kill just
383 * one task, do that. If the user has set the tolerance very
384 * high, don't try to do anything at all.
385 */
386 if (kill_it && tolerant < 3) {
387 int user_space = 0;
388
389 /*
390 * If the EIPV bit is set, it means the saved IP is the
391 * instruction which caused the MCE.
392 */
393 if (m.mcgstatus & MCG_STATUS_EIPV)
394 user_space = panicm.ip && (panicm.cs & 3);
395
396 /*
397 * If we know that the error was in user space, send a
398 * SIGBUS. Otherwise, panic if tolerance is low.
399 *
400 * force_sig() takes an awful lot of locks and has a slight
401 * risk of deadlocking.
402 */
403 if (user_space) {
404 force_sig(SIGBUS, current);
405 } else if (panic_on_oops || tolerant < 2) {
406 mce_panic("Uncorrected machine check",
407 &panicm, mcestart);
408 }
409 }
410
411 /* notify userspace ASAP */
412 set_thread_flag(TIF_MCE_NOTIFY);
413
414 /* the last thing we do is clear state */
415 for (i = 0; i < banks; i++) {
416 if (test_bit(i, toclear))
417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
418 }
419 wrmsrl(MSR_IA32_MCG_STATUS, 0);
420 out2:
421 atomic_dec(&mce_entry);
422}
423EXPORT_SYMBOL_GPL(do_machine_check);
424
425#ifdef CONFIG_X86_MCE_INTEL
426/***
427 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
428 * @cpu: The CPU on which the event occurred.
429 * @status: Event status information
430 *
431 * This function should be called by the thermal interrupt after the
432 * event has been processed and the decision was made to log the event
433 * further.
434 *
435 * The status parameter will be saved to the 'status' field of 'struct mce'
436 * and historically has been the register value of the
437 * MSR_IA32_THERMAL_STATUS (Intel) msr.
438 */
439void mce_log_therm_throt_event(__u64 status)
440{
441 struct mce m;
442
443 mce_setup(&m);
444 m.bank = MCE_THERMAL_BANK;
445 m.status = status;
446 mce_log(&m);
447}
448#endif /* CONFIG_X86_MCE_INTEL */
449
450/*
451 * Periodic polling timer for "silent" machine check errors. If the
452 * poller finds an MCE, poll 2x faster. When the poller finds no more
453 * errors, poll 2x slower (up to check_interval seconds).
454 */
455
456static int check_interval = 5 * 60; /* 5 minutes */
457static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
458static void mcheck_timer(unsigned long);
459static DEFINE_PER_CPU(struct timer_list, mce_timer);
460
461static void mcheck_timer(unsigned long data)
462{
463 struct timer_list *t = &per_cpu(mce_timer, data);
464 int *n;
465
466 WARN_ON(smp_processor_id() != data);
467
468 if (mce_available(&current_cpu_data))
469 machine_check_poll(MCP_TIMESTAMP,
470 &__get_cpu_var(mce_poll_banks));
471
472 /*
473 * Alert userspace if needed. If we logged an MCE, reduce the
474 * polling interval, otherwise increase the polling interval.
475 */
476 n = &__get_cpu_var(next_interval);
477 if (mce_notify_user()) {
478 *n = max(*n/2, HZ/100);
479 } else {
480 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
481 }
482
483 t->expires = jiffies + *n;
484 add_timer(t);
485}
486
487static void mce_do_trigger(struct work_struct *work)
488{
489 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
490}
491
492static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
493
494/*
495 * Notify the user(s) about new machine check events.
496 * Can be called from interrupt context, but not from machine check/NMI
497 * context.
498 */
499int mce_notify_user(void)
500{
501 /* Not more than two messages every minute */
502 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
503
504 clear_thread_flag(TIF_MCE_NOTIFY);
505 if (test_and_clear_bit(0, &notify_user)) {
506 wake_up_interruptible(&mce_wait);
507
508 /*
509 * There is no risk of missing notifications because
510 * work_pending is always cleared before the function is
511 * executed.
512 */
513 if (trigger[0] && !work_pending(&mce_trigger_work))
514 schedule_work(&mce_trigger_work);
515
516 if (__ratelimit(&ratelimit))
517 printk(KERN_INFO "Machine check events logged\n");
518
519 return 1;
520 }
521 return 0;
522}
523
524/* see if the idle task needs to notify userspace */
525static int
526mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
527{
528 /* IDLE_END should be safe - interrupts are back on */
529 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
530 mce_notify_user();
531
532 return NOTIFY_OK;
533}
534
535static struct notifier_block mce_idle_notifier = {
536 .notifier_call = mce_idle_callback,
537};
538
539static __init int periodic_mcheck_init(void)
540{
541 idle_notifier_register(&mce_idle_notifier);
542 return 0;
543}
544__initcall(periodic_mcheck_init);
545
546/*
547 * Initialize Machine Checks for a CPU.
548 */
549static int mce_cap_init(void)
550{
551 u64 cap;
552 unsigned b;
553
554 rdmsrl(MSR_IA32_MCG_CAP, cap);
555 b = cap & 0xff;
556 if (b > MAX_NR_BANKS) {
557 printk(KERN_WARNING
558 "MCE: Using only %u machine check banks out of %u\n",
559 MAX_NR_BANKS, b);
560 b = MAX_NR_BANKS;
561 }
562
563 /* Don't support asymmetric configurations today */
564 WARN_ON(banks != 0 && b != banks);
565 banks = b;
566 if (!bank) {
567 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
568 if (!bank)
569 return -ENOMEM;
570 memset(bank, 0xff, banks * sizeof(u64));
571 }
572
573 /* Use accurate RIP reporting if available. */
574 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
575 rip_msr = MSR_IA32_MCG_EIP;
576
577 return 0;
578}
579
580static void mce_init(void *dummy)
581{
582 u64 cap;
583 int i;
584 mce_banks_t all_banks;
585
586 /*
587 * Log the machine checks left over from the previous reset.
588 */
589 bitmap_fill(all_banks, MAX_NR_BANKS);
590 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
591
592 set_in_cr4(X86_CR4_MCE);
593
594 rdmsrl(MSR_IA32_MCG_CAP, cap);
595 if (cap & MCG_CTL_P)
596 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
597
598 for (i = 0; i < banks; i++) {
599 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
600 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
601 }
602}
603
604/* Add per CPU specific workarounds here */
605static void mce_cpu_quirks(struct cpuinfo_x86 *c)
606{
607 /* This should be disabled by the BIOS, but isn't always */
608 if (c->x86_vendor == X86_VENDOR_AMD) {
609 if (c->x86 == 15 && banks > 4)
610 /* disable GART TBL walk error reporting, which trips off
611 incorrectly with the IOMMU & 3ware & Cerberus. */
612 clear_bit(10, (unsigned long *)&bank[4]);
613 if(c->x86 <= 17 && mce_bootlog < 0)
614 /* Lots of broken BIOS around that don't clear them
615 by default and leave crap in there. Don't log. */
616 mce_bootlog = 0;
617 }
618
619}
620
621static void mce_cpu_features(struct cpuinfo_x86 *c)
622{
623 switch (c->x86_vendor) {
624 case X86_VENDOR_INTEL:
625 mce_intel_feature_init(c);
626 break;
627 case X86_VENDOR_AMD:
628 mce_amd_feature_init(c);
629 break;
630 default:
631 break;
632 }
633}
634
635static void mce_init_timer(void)
636{
637 struct timer_list *t = &__get_cpu_var(mce_timer);
638 int *n = &__get_cpu_var(next_interval);
639
640 *n = check_interval * HZ;
641 if (!*n)
642 return;
643 setup_timer(t, mcheck_timer, smp_processor_id());
644 t->expires = round_jiffies(jiffies + *n);
645 add_timer(t);
646}
647
648/*
649 * Called for each booted CPU to set up machine checks.
650 * Must be called with preempt off.
651 */
652void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
653{
654 if (!mce_available(c))
655 return;
656
657 if (mce_cap_init() < 0) {
658 mce_dont_init = 1;
659 return;
660 }
661 mce_cpu_quirks(c);
662
663 mce_init(NULL);
664 mce_cpu_features(c);
665 mce_init_timer();
666}
667
668/*
669 * Character device to read and clear the MCE log.
670 */
671
672static DEFINE_SPINLOCK(mce_state_lock);
673static int open_count; /* #times opened */
674static int open_exclu; /* already open exclusive? */
675
676static int mce_open(struct inode *inode, struct file *file)
677{
678 lock_kernel();
679 spin_lock(&mce_state_lock);
680
681 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
682 spin_unlock(&mce_state_lock);
683 unlock_kernel();
684 return -EBUSY;
685 }
686
687 if (file->f_flags & O_EXCL)
688 open_exclu = 1;
689 open_count++;
690
691 spin_unlock(&mce_state_lock);
692 unlock_kernel();
693
694 return nonseekable_open(inode, file);
695}
696
697static int mce_release(struct inode *inode, struct file *file)
698{
699 spin_lock(&mce_state_lock);
700
701 open_count--;
702 open_exclu = 0;
703
704 spin_unlock(&mce_state_lock);
705
706 return 0;
707}
708
709static void collect_tscs(void *data)
710{
711 unsigned long *cpu_tsc = (unsigned long *)data;
712
713 rdtscll(cpu_tsc[smp_processor_id()]);
714}
715
716static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
717 loff_t *off)
718{
719 unsigned long *cpu_tsc;
720 static DEFINE_MUTEX(mce_read_mutex);
721 unsigned prev, next;
722 char __user *buf = ubuf;
723 int i, err;
724
725 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
726 if (!cpu_tsc)
727 return -ENOMEM;
728
729 mutex_lock(&mce_read_mutex);
730 next = rcu_dereference(mcelog.next);
731
732 /* Only supports full reads right now */
733 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
734 mutex_unlock(&mce_read_mutex);
735 kfree(cpu_tsc);
736 return -EINVAL;
737 }
738
739 err = 0;
740 prev = 0;
741 do {
742 for (i = prev; i < next; i++) {
743 unsigned long start = jiffies;
744
745 while (!mcelog.entry[i].finished) {
746 if (time_after_eq(jiffies, start + 2)) {
747 memset(mcelog.entry + i, 0,
748 sizeof(struct mce));
749 goto timeout;
750 }
751 cpu_relax();
752 }
753 smp_rmb();
754 err |= copy_to_user(buf, mcelog.entry + i,
755 sizeof(struct mce));
756 buf += sizeof(struct mce);
757timeout:
758 ;
759 }
760
761 memset(mcelog.entry + prev, 0,
762 (next - prev) * sizeof(struct mce));
763 prev = next;
764 next = cmpxchg(&mcelog.next, prev, 0);
765 } while (next != prev);
766
767 synchronize_sched();
768
769 /*
770 * Collect entries that were still getting written before the
771 * synchronize.
772 */
773 on_each_cpu(collect_tscs, cpu_tsc, 1);
774 for (i = next; i < MCE_LOG_LEN; i++) {
775 if (mcelog.entry[i].finished &&
776 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
777 err |= copy_to_user(buf, mcelog.entry+i,
778 sizeof(struct mce));
779 smp_rmb();
780 buf += sizeof(struct mce);
781 memset(&mcelog.entry[i], 0, sizeof(struct mce));
782 }
783 }
784 mutex_unlock(&mce_read_mutex);
785 kfree(cpu_tsc);
786 return err ? -EFAULT : buf - ubuf;
787}
788
789static unsigned int mce_poll(struct file *file, poll_table *wait)
790{
791 poll_wait(file, &mce_wait, wait);
792 if (rcu_dereference(mcelog.next))
793 return POLLIN | POLLRDNORM;
794 return 0;
795}
796
797static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
798{
799 int __user *p = (int __user *)arg;
800
801 if (!capable(CAP_SYS_ADMIN))
802 return -EPERM;
803 switch (cmd) {
804 case MCE_GET_RECORD_LEN:
805 return put_user(sizeof(struct mce), p);
806 case MCE_GET_LOG_LEN:
807 return put_user(MCE_LOG_LEN, p);
808 case MCE_GETCLEAR_FLAGS: {
809 unsigned flags;
810
811 do {
812 flags = mcelog.flags;
813 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
814 return put_user(flags, p);
815 }
816 default:
817 return -ENOTTY;
818 }
819}
820
821static const struct file_operations mce_chrdev_ops = {
822 .open = mce_open,
823 .release = mce_release,
824 .read = mce_read,
825 .poll = mce_poll,
826 .unlocked_ioctl = mce_ioctl,
827};
828
829static struct miscdevice mce_log_device = {
830 MISC_MCELOG_MINOR,
831 "mcelog",
832 &mce_chrdev_ops,
833};
834
835/*
836 * Old style boot options parsing. Only for compatibility.
837 */
838static int __init mcheck_disable(char *str)
839{
840 mce_dont_init = 1;
841 return 1;
842}
843
844/* mce=off disables machine check.
845 mce=TOLERANCELEVEL (number, see above)
846 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
847 mce=nobootlog Don't log MCEs from before booting. */
848static int __init mcheck_enable(char *str)
849{
850 if (!strcmp(str, "off"))
851 mce_dont_init = 1;
852 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
853 mce_bootlog = str[0] == 'b';
854 else if (isdigit(str[0]))
855 get_option(&str, &tolerant);
856 else
857 printk("mce= argument %s ignored. Please use /sys", str);
858 return 1;
859}
860
861__setup("nomce", mcheck_disable);
862__setup("mce=", mcheck_enable);
863
864/*
865 * Sysfs support
866 */
867
868/*
869 * Disable machine checks on suspend and shutdown. We can't really handle
870 * them later.
871 */
872static int mce_disable(void)
873{
874 int i;
875
876 for (i = 0; i < banks; i++)
877 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
878 return 0;
879}
880
881static int mce_suspend(struct sys_device *dev, pm_message_t state)
882{
883 return mce_disable();
884}
885
886static int mce_shutdown(struct sys_device *dev)
887{
888 return mce_disable();
889}
890
891/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
892 Only one CPU is active at this time, the others get readded later using
893 CPU hotplug. */
894static int mce_resume(struct sys_device *dev)
895{
896 mce_init(NULL);
897 mce_cpu_features(&current_cpu_data);
898 return 0;
899}
900
901static void mce_cpu_restart(void *data)
902{
903 del_timer_sync(&__get_cpu_var(mce_timer));
904 if (mce_available(&current_cpu_data))
905 mce_init(NULL);
906 mce_init_timer();
907}
908
909/* Reinit MCEs after user configuration changes */
910static void mce_restart(void)
911{
912 on_each_cpu(mce_cpu_restart, NULL, 1);
913}
914
915static struct sysdev_class mce_sysclass = {
916 .suspend = mce_suspend,
917 .shutdown = mce_shutdown,
918 .resume = mce_resume,
919 .name = "machinecheck",
920};
921
922DEFINE_PER_CPU(struct sys_device, device_mce);
923void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
924
925/* Why are there no generic functions for this? */
926#define ACCESSOR(name, var, start) \
927 static ssize_t show_ ## name(struct sys_device *s, \
928 struct sysdev_attribute *attr, \
929 char *buf) { \
930 return sprintf(buf, "%lx\n", (unsigned long)var); \
931 } \
932 static ssize_t set_ ## name(struct sys_device *s, \
933 struct sysdev_attribute *attr, \
934 const char *buf, size_t siz) { \
935 char *end; \
936 unsigned long new = simple_strtoul(buf, &end, 0); \
937 if (end == buf) return -EINVAL; \
938 var = new; \
939 start; \
940 return end-buf; \
941 } \
942 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
943
944static struct sysdev_attribute *bank_attrs;
945
946static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
947 char *buf)
948{
949 u64 b = bank[attr - bank_attrs];
950 return sprintf(buf, "%llx\n", b);
951}
952
953static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
954 const char *buf, size_t siz)
955{
956 char *end;
957 u64 new = simple_strtoull(buf, &end, 0);
958 if (end == buf)
959 return -EINVAL;
960 bank[attr - bank_attrs] = new;
961 mce_restart();
962 return end-buf;
963}
964
965static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
966 char *buf)
967{
968 strcpy(buf, trigger);
969 strcat(buf, "\n");
970 return strlen(trigger) + 1;
971}
972
973static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
974 const char *buf,size_t siz)
975{
976 char *p;
977 int len;
978 strncpy(trigger, buf, sizeof(trigger));
979 trigger[sizeof(trigger)-1] = 0;
980 len = strlen(trigger);
981 p = strchr(trigger, '\n');
982 if (*p) *p = 0;
983 return len;
984}
985
986static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
987static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
988ACCESSOR(check_interval,check_interval,mce_restart())
989static struct sysdev_attribute *mce_attributes[] = {
990 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
991 NULL
992};
993
994static cpumask_var_t mce_device_initialized;
995
996/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
997static __cpuinit int mce_create_device(unsigned int cpu)
998{
999 int err;
1000 int i;
1001
1002 if (!mce_available(&boot_cpu_data))
1003 return -EIO;
1004
1005 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1006 per_cpu(device_mce,cpu).id = cpu;
1007 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1008
1009 err = sysdev_register(&per_cpu(device_mce,cpu));
1010 if (err)
1011 return err;
1012
1013 for (i = 0; mce_attributes[i]; i++) {
1014 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1015 mce_attributes[i]);
1016 if (err)
1017 goto error;
1018 }
1019 for (i = 0; i < banks; i++) {
1020 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1021 &bank_attrs[i]);
1022 if (err)
1023 goto error2;
1024 }
1025 cpumask_set_cpu(cpu, mce_device_initialized);
1026
1027 return 0;
1028error2:
1029 while (--i >= 0) {
1030 sysdev_remove_file(&per_cpu(device_mce, cpu),
1031 &bank_attrs[i]);
1032 }
1033error:
1034 while (--i >= 0) {
1035 sysdev_remove_file(&per_cpu(device_mce,cpu),
1036 mce_attributes[i]);
1037 }
1038 sysdev_unregister(&per_cpu(device_mce,cpu));
1039
1040 return err;
1041}
1042
1043static __cpuinit void mce_remove_device(unsigned int cpu)
1044{
1045 int i;
1046
1047 if (!cpumask_test_cpu(cpu, mce_device_initialized))
1048 return;
1049
1050 for (i = 0; mce_attributes[i]; i++)
1051 sysdev_remove_file(&per_cpu(device_mce,cpu),
1052 mce_attributes[i]);
1053 for (i = 0; i < banks; i++)
1054 sysdev_remove_file(&per_cpu(device_mce, cpu),
1055 &bank_attrs[i]);
1056 sysdev_unregister(&per_cpu(device_mce,cpu));
1057 cpumask_clear_cpu(cpu, mce_device_initialized);
1058}
1059
1060/* Make sure there are no machine checks on offlined CPUs. */
1061static void mce_disable_cpu(void *h)
1062{
1063 int i;
1064 unsigned long action = *(unsigned long *)h;
1065
1066 if (!mce_available(&current_cpu_data))
1067 return;
1068 if (!(action & CPU_TASKS_FROZEN))
1069 cmci_clear();
1070 for (i = 0; i < banks; i++)
1071 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1072}
1073
1074static void mce_reenable_cpu(void *h)
1075{
1076 int i;
1077 unsigned long action = *(unsigned long *)h;
1078
1079 if (!mce_available(&current_cpu_data))
1080 return;
1081 if (!(action & CPU_TASKS_FROZEN))
1082 cmci_reenable();
1083 for (i = 0; i < banks; i++)
1084 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1085}
1086
1087/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1088static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1089 unsigned long action, void *hcpu)
1090{
1091 unsigned int cpu = (unsigned long)hcpu;
1092 struct timer_list *t = &per_cpu(mce_timer, cpu);
1093
1094 switch (action) {
1095 case CPU_ONLINE:
1096 case CPU_ONLINE_FROZEN:
1097 mce_create_device(cpu);
1098 if (threshold_cpu_callback)
1099 threshold_cpu_callback(action, cpu);
1100 break;
1101 case CPU_DEAD:
1102 case CPU_DEAD_FROZEN:
1103 if (threshold_cpu_callback)
1104 threshold_cpu_callback(action, cpu);
1105 mce_remove_device(cpu);
1106 break;
1107 case CPU_DOWN_PREPARE:
1108 case CPU_DOWN_PREPARE_FROZEN:
1109 del_timer_sync(t);
1110 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1111 break;
1112 case CPU_DOWN_FAILED:
1113 case CPU_DOWN_FAILED_FROZEN:
1114 t->expires = round_jiffies(jiffies +
1115 __get_cpu_var(next_interval));
1116 add_timer_on(t, cpu);
1117 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1118 break;
1119 case CPU_POST_DEAD:
1120 /* intentionally ignoring frozen here */
1121 cmci_rediscover(cpu);
1122 break;
1123 }
1124 return NOTIFY_OK;
1125}
1126
1127static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1128 .notifier_call = mce_cpu_callback,
1129};
1130
1131static __init int mce_init_banks(void)
1132{
1133 int i;
1134
1135 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1136 GFP_KERNEL);
1137 if (!bank_attrs)
1138 return -ENOMEM;
1139
1140 for (i = 0; i < banks; i++) {
1141 struct sysdev_attribute *a = &bank_attrs[i];
1142 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1143 if (!a->attr.name)
1144 goto nomem;
1145 a->attr.mode = 0644;
1146 a->show = show_bank;
1147 a->store = set_bank;
1148 }
1149 return 0;
1150
1151nomem:
1152 while (--i >= 0)
1153 kfree(bank_attrs[i].attr.name);
1154 kfree(bank_attrs);
1155 bank_attrs = NULL;
1156 return -ENOMEM;
1157}
1158
1159static __init int mce_init_device(void)
1160{
1161 int err;
1162 int i = 0;
1163
1164 if (!mce_available(&boot_cpu_data))
1165 return -EIO;
1166
1167 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1168
1169 err = mce_init_banks();
1170 if (err)
1171 return err;
1172
1173 err = sysdev_class_register(&mce_sysclass);
1174 if (err)
1175 return err;
1176
1177 for_each_online_cpu(i) {
1178 err = mce_create_device(i);
1179 if (err)
1180 return err;
1181 }
1182
1183 register_hotcpu_notifier(&mce_cpu_notifier);
1184 misc_register(&mce_log_device);
1185 return err;
1186}
1187
1188device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 56dde9c4bc96..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -13,22 +13,22 @@
13 * 13 *
14 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
15 */ 15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h> 16#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h> 17#include <linux/notifier.h>
23#include <linux/sched.h> 18#include <linux/kobject.h>
24#include <linux/smp.h> 19#include <linux/percpu.h>
25#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/errno.h>
22#include <linux/sched.h>
26#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/init.h>
25#include <linux/cpu.h>
26#include <linux/smp.h>
27
27#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/idle.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
29#include <asm/msr.h> 31#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32 32
33#define PFX "mce_threshold: " 33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1" 34#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
48#define MCG_XBLK_ADDR 0xC0000400 48#define MCG_XBLK_ADDR 0xC0000400
49 49
50struct threshold_block { 50struct threshold_block {
51 unsigned int block; 51 unsigned int block;
52 unsigned int bank; 52 unsigned int bank;
53 unsigned int cpu; 53 unsigned int cpu;
54 u32 address; 54 u32 address;
55 u16 interrupt_enable; 55 u16 interrupt_enable;
56 u16 threshold_limit; 56 u16 threshold_limit;
57 struct kobject kobj; 57 struct kobject kobj;
58 struct list_head miscj; 58 struct list_head miscj;
59}; 59};
60 60
61/* defaults used early on boot */ 61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = { 62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0, 63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX, 64 .threshold_limit = THRESHOLD_MAX,
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
86 */ 86 */
87 87
88struct thresh_restart { 88struct thresh_restart {
89 struct threshold_block *b; 89 struct threshold_block *b;
90 int reset; 90 int reset;
91 u16 old_limit; 91 u16 old_limit;
92}; 92};
93 93
94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
110 } else if (tr->old_limit) { /* change limit w/o reset */ 110 } else if (tr->old_limit) { /* change limit w/o reset */
111 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 111 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
112 (tr->old_limit - tr->b->threshold_limit); 112 (tr->old_limit - tr->b->threshold_limit);
113
113 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 114 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
114 (new_count & THRESHOLD_MAX); 115 (new_count & THRESHOLD_MAX);
115 } 116 }
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
125/* cpu init entry point, called from mce.c with preempt off */ 126/* cpu init entry point, called from mce.c with preempt off */
126void mce_amd_feature_init(struct cpuinfo_x86 *c) 127void mce_amd_feature_init(struct cpuinfo_x86 *c)
127{ 128{
128 unsigned int bank, block;
129 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
130 u8 lvt_off;
131 u32 low = 0, high = 0, address = 0; 130 u32 low = 0, high = 0, address = 0;
131 unsigned int bank, block;
132 struct thresh_restart tr; 132 struct thresh_restart tr;
133 u8 lvt_off;
133 134
134 for (bank = 0; bank < NR_BANKS; ++bank) { 135 for (bank = 0; bank < NR_BANKS; ++bank) {
135 for (block = 0; block < NR_BLOCKS; ++block) { 136 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 if (!address) 141 if (!address)
141 break; 142 break;
142 address += MCG_XBLK_ADDR; 143 address += MCG_XBLK_ADDR;
143 } 144 } else
144 else
145 ++address; 145 ++address;
146 146
147 if (rdmsr_safe(address, &low, &high)) 147 if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
193 */ 193 */
194static void amd_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
195{ 195{
196 u32 low = 0, high = 0, address = 0;
196 unsigned int bank, block; 197 unsigned int bank, block;
197 struct mce m; 198 struct mce m;
198 u32 low = 0, high = 0, address = 0;
199 199
200 mce_setup(&m); 200 mce_setup(&m);
201 201
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) 204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
205 continue; 205 continue;
206 for (block = 0; block < NR_BLOCKS; ++block) { 206 for (block = 0; block < NR_BLOCKS; ++block) {
207 if (block == 0) 207 if (block == 0) {
208 address = MSR_IA32_MC0_MISC + bank * 4; 208 address = MSR_IA32_MC0_MISC + bank * 4;
209 else if (block == 1) { 209 } else if (block == 1) {
210 address = (low & MASK_BLKPTR_LO) >> 21; 210 address = (low & MASK_BLKPTR_LO) >> 21;
211 if (!address) 211 if (!address)
212 break; 212 break;
213 address += MCG_XBLK_ADDR; 213 address += MCG_XBLK_ADDR;
214 } 214 } else {
215 else
216 ++address; 215 ++address;
216 }
217 217
218 if (rdmsr_safe(address, &low, &high)) 218 if (rdmsr_safe(address, &low, &high))
219 break; 219 break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
229 (high & MASK_LOCKED_HI)) 229 (high & MASK_LOCKED_HI))
230 continue; 230 continue;
231 231
232 /* Log the machine check that caused the threshold 232 /*
233 event. */ 233 * Log the machine check that caused the threshold
234 * event.
235 */
234 machine_check_poll(MCP_TIMESTAMP, 236 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks)); 237 &__get_cpu_var(mce_poll_banks));
236 238
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
254 256
255struct threshold_attr { 257struct threshold_attr {
256 struct attribute attr; 258 struct attribute attr;
257 ssize_t(*show) (struct threshold_block *, char *); 259 ssize_t (*show) (struct threshold_block *, char *);
258 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 260 ssize_t (*store) (struct threshold_block *, const char *, size_t count);
259}; 261};
260 262
261#define SHOW_FIELDS(name) \ 263#define SHOW_FIELDS(name) \
262static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 264static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
263{ \ 265{ \
264 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 266 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
265} 267}
266SHOW_FIELDS(interrupt_enable) 268SHOW_FIELDS(interrupt_enable)
267SHOW_FIELDS(threshold_limit) 269SHOW_FIELDS(threshold_limit)
268 270
269static ssize_t store_interrupt_enable(struct threshold_block *b, 271static ssize_t
270 const char *buf, size_t count) 272store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
271{ 273{
272 char *end;
273 struct thresh_restart tr; 274 struct thresh_restart tr;
274 unsigned long new = simple_strtoul(buf, &end, 0); 275 unsigned long new;
275 if (end == buf) 276
277 if (strict_strtoul(buf, 0, &new) < 0)
276 return -EINVAL; 278 return -EINVAL;
279
277 b->interrupt_enable = !!new; 280 b->interrupt_enable = !!new;
278 281
279 tr.b = b; 282 tr.b = b;
280 tr.reset = 0; 283 tr.reset = 0;
281 tr.old_limit = 0; 284 tr.old_limit = 0;
285
282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 286 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 287
284 return end - buf; 288 return size;
285} 289}
286 290
287static ssize_t store_threshold_limit(struct threshold_block *b, 291static ssize_t
288 const char *buf, size_t count) 292store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
289{ 293{
290 char *end;
291 struct thresh_restart tr; 294 struct thresh_restart tr;
292 unsigned long new = simple_strtoul(buf, &end, 0); 295 unsigned long new;
293 if (end == buf) 296
297 if (strict_strtoul(buf, 0, &new) < 0)
294 return -EINVAL; 298 return -EINVAL;
299
295 if (new > THRESHOLD_MAX) 300 if (new > THRESHOLD_MAX)
296 new = THRESHOLD_MAX; 301 new = THRESHOLD_MAX;
297 if (new < 1) 302 if (new < 1)
298 new = 1; 303 new = 1;
304
299 tr.old_limit = b->threshold_limit; 305 tr.old_limit = b->threshold_limit;
300 b->threshold_limit = new; 306 b->threshold_limit = new;
301 tr.b = b; 307 tr.b = b;
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 309
304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 310 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 311
306 return end - buf; 312 return size;
307} 313}
308 314
309struct threshold_block_cross_cpu { 315struct threshold_block_cross_cpu {
310 struct threshold_block *tb; 316 struct threshold_block *tb;
311 long retval; 317 long retval;
312}; 318};
313 319
314static void local_error_count_handler(void *_tbcc) 320static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
338 return 1; 344 return 1;
339} 345}
340 346
341#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ 347#define RW_ATTR(val) \
342 .attr = {.name = __stringify(_name), .mode = _mode }, \ 348static struct threshold_attr val = { \
343 .show = _show, \ 349 .attr = {.name = __stringify(val), .mode = 0644 }, \
344 .store = _store, \ 350 .show = show_## val, \
351 .store = store_## val, \
345}; 352};
346 353
347#define RW_ATTR(name) \
348static struct threshold_attr name = \
349 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
350
351RW_ATTR(interrupt_enable); 354RW_ATTR(interrupt_enable);
352RW_ATTR(threshold_limit); 355RW_ATTR(threshold_limit);
353RW_ATTR(error_count); 356RW_ATTR(error_count);
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
359 NULL 362 NULL
360}; 363};
361 364
362#define to_block(k) container_of(k, struct threshold_block, kobj) 365#define to_block(k) container_of(k, struct threshold_block, kobj)
363#define to_attr(a) container_of(a, struct threshold_attr, attr) 366#define to_attr(a) container_of(a, struct threshold_attr, attr)
364 367
365static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 368static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
366{ 369{
367 struct threshold_block *b = to_block(kobj); 370 struct threshold_block *b = to_block(kobj);
368 struct threshold_attr *a = to_attr(attr); 371 struct threshold_attr *a = to_attr(attr);
369 ssize_t ret; 372 ssize_t ret;
373
370 ret = a->show ? a->show(b, buf) : -EIO; 374 ret = a->show ? a->show(b, buf) : -EIO;
375
371 return ret; 376 return ret;
372} 377}
373 378
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
377 struct threshold_block *b = to_block(kobj); 382 struct threshold_block *b = to_block(kobj);
378 struct threshold_attr *a = to_attr(attr); 383 struct threshold_attr *a = to_attr(attr);
379 ssize_t ret; 384 ssize_t ret;
385
380 ret = a->store ? a->store(b, buf, count) : -EIO; 386 ret = a->store ? a->store(b, buf, count) : -EIO;
387
381 return ret; 388 return ret;
382} 389}
383 390
384static struct sysfs_ops threshold_ops = { 391static struct sysfs_ops threshold_ops = {
385 .show = show, 392 .show = show,
386 .store = store, 393 .store = store,
387}; 394};
388 395
389static struct kobj_type threshold_ktype = { 396static struct kobj_type threshold_ktype = {
390 .sysfs_ops = &threshold_ops, 397 .sysfs_ops = &threshold_ops,
391 .default_attrs = default_attrs, 398 .default_attrs = default_attrs,
392}; 399};
393 400
394static __cpuinit int allocate_threshold_blocks(unsigned int cpu, 401static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
396 unsigned int block, 403 unsigned int block,
397 u32 address) 404 u32 address)
398{ 405{
399 int err;
400 u32 low, high;
401 struct threshold_block *b = NULL; 406 struct threshold_block *b = NULL;
407 u32 low, high;
408 int err;
402 409
403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 410 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
404 return 0; 411 return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
421 if (!b) 428 if (!b)
422 return -ENOMEM; 429 return -ENOMEM;
423 430
424 b->block = block; 431 b->block = block;
425 b->bank = bank; 432 b->bank = bank;
426 b->cpu = cpu; 433 b->cpu = cpu;
427 b->address = address; 434 b->address = address;
428 b->interrupt_enable = 0; 435 b->interrupt_enable = 0;
429 b->threshold_limit = THRESHOLD_MAX; 436 b->threshold_limit = THRESHOLD_MAX;
430 437
431 INIT_LIST_HEAD(&b->miscj); 438 INIT_LIST_HEAD(&b->miscj);
432 439
433 if (per_cpu(threshold_banks, cpu)[bank]->blocks) 440 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
434 list_add(&b->miscj, 441 list_add(&b->miscj,
435 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); 442 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
436 else 443 } else {
437 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 444 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
445 }
438 446
439 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 447 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
440 per_cpu(threshold_banks, cpu)[bank]->kobj, 448 per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
447 if (!address) 455 if (!address)
448 return 0; 456 return 0;
449 address += MCG_XBLK_ADDR; 457 address += MCG_XBLK_ADDR;
450 } else 458 } else {
451 ++address; 459 ++address;
460 }
452 461
453 err = allocate_threshold_blocks(cpu, bank, ++block, address); 462 err = allocate_threshold_blocks(cpu, bank, ++block, address);
454 if (err) 463 if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
500 if (!b) 509 if (!b)
501 goto out; 510 goto out;
502 511
503 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 512 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
504 b->kobj, name); 513 b->kobj, name);
505 if (err) 514 if (err)
506 goto out; 515 goto out;
507 516
508 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 517 cpumask_copy(b->cpus, cpu_core_mask(cpu));
509 per_cpu(threshold_banks, cpu)[bank] = b; 518 per_cpu(threshold_banks, cpu)[bank] = b;
519
510 goto out; 520 goto out;
511 } 521 }
512#endif 522#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
522 goto out; 532 goto out;
523 } 533 }
524 534
525 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 535 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
526 if (!b->kobj) 536 if (!b->kobj)
527 goto out_free; 537 goto out_free;
528 538
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
542 if (i == cpu) 552 if (i == cpu)
543 continue; 553 continue;
544 554
545 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 555 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
546 b->kobj, name); 556 b->kobj, name);
547 if (err) 557 if (err)
548 goto out; 558 goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
605 615
606static void threshold_remove_bank(unsigned int cpu, int bank) 616static void threshold_remove_bank(unsigned int cpu, int bank)
607{ 617{
608 int i = 0;
609 struct threshold_bank *b; 618 struct threshold_bank *b;
610 char name[32]; 619 char name[32];
620 int i = 0;
611 621
612 b = per_cpu(threshold_banks, cpu)[bank]; 622 b = per_cpu(threshold_banks, cpu)[bank];
613
614 if (!b) 623 if (!b)
615 return; 624 return;
616
617 if (!b->blocks) 625 if (!b->blocks)
618 goto free_out; 626 goto free_out;
619 627
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
622#ifdef CONFIG_SMP 630#ifdef CONFIG_SMP
623 /* sibling symlink */ 631 /* sibling symlink */
624 if (shared_bank[bank] && b->blocks->cpu != cpu) { 632 if (shared_bank[bank] && b->blocks->cpu != cpu) {
625 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); 633 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
626 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
635
627 return; 636 return;
628 } 637 }
629#endif 638#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
633 if (i == cpu) 642 if (i == cpu)
634 continue; 643 continue;
635 644
636 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); 645 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
637 per_cpu(threshold_banks, i)[bank] = NULL; 646 per_cpu(threshold_banks, i)[bank] = NULL;
638 } 647 }
639 648
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
659} 668}
660 669
661/* get notified when a cpu comes on/off */ 670/* get notified when a cpu comes on/off */
662static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, 671static void __cpuinit
663 unsigned int cpu) 672amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
664{ 673{
665 if (cpu >= NR_CPUS)
666 return;
667
668 switch (action) { 674 switch (action) {
669 case CPU_ONLINE: 675 case CPU_ONLINE:
670 case CPU_ONLINE_FROZEN: 676 case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
686 /* to hit CPUs online before the notifier is up */ 692 /* to hit CPUs online before the notifier is up */
687 for_each_online_cpu(lcpu) { 693 for_each_online_cpu(lcpu) {
688 int err = threshold_create_device(lcpu); 694 int err = threshold_create_device(lcpu);
695
689 if (err) 696 if (err)
690 return err; 697 return err;
691 } 698 }
692 threshold_cpu_callback = amd_64_threshold_cpu_callback; 699 threshold_cpu_callback = amd_64_threshold_cpu_callback;
700
693 return 0; 701 return 0;
694} 702}
695
696device_initcall(threshold_init_device); 703device_initcall(threshold_init_device);
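
Note: the (bank, block) -> MSR address walk above is duplicated between mce_amd_feature_init() and amd_threshold_interrupt(). A minimal illustrative helper expressing that walk on its own follows; it is not part of the patch, and merely reuses MSR_IA32_MC0_MISC plus the MASK_BLKPTR_LO / MCG_XBLK_ADDR constants defined earlier in mce_amd_64.c:

	/*
	 * Illustrative only -- not in the patch.  Returns the MCi_MISC-style MSR
	 * address for (bank, block), given the address and low word read for the
	 * previous block; 0 means the bank has no further blocks.
	 */
	static u32 threshold_block_address(unsigned int bank, unsigned int block,
					   u32 prev_address, u32 prev_low)
	{
		if (block == 0)
			return MSR_IA32_MC0_MISC + bank * 4;

		if (block == 1) {
			u32 address = (prev_low & MASK_BLKPTR_LO) >> 21;

			if (!address)
				return 0;
			return address + MCG_XBLK_ADDR;
		}

		/* blocks 2..NR_BLOCKS-1 are contiguous: */
		return prev_address + 1;
	}
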
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 000000000000..2b011d2d8579
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,74 @@
1/*
2 * Common code for Intel machine checks
3 */
4#include <linux/interrupt.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7#include <linux/init.h>
8#include <linux/smp.h>
9
10#include <asm/therm_throt.h>
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/apic.h>
14#include <asm/msr.h>
15
16#include "mce.h"
17
18void intel_init_thermal(struct cpuinfo_x86 *c)
19{
20 unsigned int cpu = smp_processor_id();
21 int tm2 = 0;
22 u32 l, h;
23
24 /* Thermal monitoring depends on ACPI and clock modulation*/
25 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
26 return;
27
28 /*
29 * First check if its enabled already, in which case there might
30 * be some SMM goo which handles it, so we can't even put a handler
31 * since it might be delivered via SMI already:
32 */
33 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
34 h = apic_read(APIC_LVTTHMR);
35 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
36 printk(KERN_DEBUG
37 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
38 return;
39 }
40
41 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
42 tm2 = 1;
43
44 /* Check whether a vector already exists */
45 if (h & APIC_VECTOR_MASK) {
46 printk(KERN_DEBUG
47 "CPU%d: Thermal LVT vector (%#x) already installed\n",
48 cpu, (h & APIC_VECTOR_MASK));
49 return;
50 }
51
52 /* We'll mask the thermal vector in the lapic till we're ready: */
53 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
54 apic_write(APIC_LVTTHMR, h);
55
56 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
57 wrmsr(MSR_IA32_THERM_INTERRUPT,
58 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
59
60 intel_set_thermal_handler();
61
62 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
63 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
64
65 /* Unmask the thermal vector: */
66 l = apic_read(APIC_LVTTHMR);
67 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
68
69 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
70 cpu, tm2 ? "TM2" : "TM1");
71
72 /* enable thermal throttle processing */
73 atomic_set(&therm_throt_en, 1);
74}
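
The shared routine above replaces the old magic numbers (msr_val & 1, l | 0x03) with named MSR bits. Those definitions are not part of these hunks; the assumed additions to asm/msr-index.h look roughly like this (the values here are an assumption, not quoted from the patch):

	/* MSR_IA32_THERM_STATUS bits: */
	#define THERM_STATUS_PROCHOT		(1 << 0)

	/* MSR_IA32_THERM_INTERRUPT bits: */
	#define THERM_INT_HIGH_ENABLE		(1 << 0)
	#define THERM_INT_LOW_ENABLE		(1 << 1)
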
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 65a0fceedcd7..f2ef6952c400 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -16,6 +16,8 @@
16#include <asm/idle.h> 16#include <asm/idle.h>
17#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
18 18
19#include "mce.h"
20
19asmlinkage void smp_thermal_interrupt(void) 21asmlinkage void smp_thermal_interrupt(void)
20{ 22{
21 __u64 msr_val; 23 __u64 msr_val;
@@ -26,67 +28,13 @@ asmlinkage void smp_thermal_interrupt(void)
26 irq_enter(); 28 irq_enter();
27 29
28 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 30 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
29 if (therm_throt_process(msr_val & 1)) 31 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
30 mce_log_therm_throt_event(msr_val); 32 mce_log_therm_throt_event(msr_val);
31 33
32 inc_irq_stat(irq_thermal_count); 34 inc_irq_stat(irq_thermal_count);
33 irq_exit(); 35 irq_exit();
34} 36}
35 37
36static void intel_init_thermal(struct cpuinfo_x86 *c)
37{
38 u32 l, h;
39 int tm2 = 0;
40 unsigned int cpu = smp_processor_id();
41
42 if (!cpu_has(c, X86_FEATURE_ACPI))
43 return;
44
45 if (!cpu_has(c, X86_FEATURE_ACC))
46 return;
47
48 /* first check if TM1 is already enabled by the BIOS, in which
49 * case there might be some SMM goo which handles it, so we can't even
50 * put a handler since it might be delivered via SMI already.
51 */
52 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
53 h = apic_read(APIC_LVTTHMR);
54 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
55 printk(KERN_DEBUG
56 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
57 return;
58 }
59
60 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
61 tm2 = 1;
62
63 if (h & APIC_VECTOR_MASK) {
64 printk(KERN_DEBUG
65 "CPU%d: Thermal LVT vector (%#x) already "
66 "installed\n", cpu, (h & APIC_VECTOR_MASK));
67 return;
68 }
69
70 h = THERMAL_APIC_VECTOR;
71 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
72 apic_write(APIC_LVTTHMR, h);
73
74 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
75 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
76
77 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
78 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
79
80 l = apic_read(APIC_LVTTHMR);
81 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
82 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
83 cpu, tm2 ? "TM2" : "TM1");
84
85 /* enable thermal throttle processing */
86 atomic_set(&therm_throt_en, 1);
87 return;
88}
89
90/* 38/*
91 * Support for Intel Correct Machine Check Interrupts. This allows 39 * Support for Intel Correct Machine Check Interrupts. This allows
92 * the CPU to raise an interrupt when a corrected machine check happened. 40 * the CPU to raise an interrupt when a corrected machine check happened.
@@ -108,6 +56,9 @@ static int cmci_supported(int *banks)
108{ 56{
109 u64 cap; 57 u64 cap;
110 58
59 if (mce_cmci_disabled || mce_ignore_ce)
60 return 0;
61
111 /* 62 /*
112 * Vendor check is not strictly needed, but the initial 63 * Vendor check is not strictly needed, but the initial
113 * initialization is vendor keyed and this 64 * initialization is vendor keyed and this
@@ -131,7 +82,7 @@ static int cmci_supported(int *banks)
131static void intel_threshold_interrupt(void) 82static void intel_threshold_interrupt(void)
132{ 83{
133 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 84 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
134 mce_notify_user(); 85 mce_notify_irq();
135} 86}
136 87
137static void print_update(char *type, int *hdr, int num) 88static void print_update(char *type, int *hdr, int num)
@@ -247,7 +198,7 @@ void cmci_rediscover(int dying)
247 return; 198 return;
248 cpumask_copy(old, &current->cpus_allowed); 199 cpumask_copy(old, &current->cpus_allowed);
249 200
250 for_each_online_cpu (cpu) { 201 for_each_online_cpu(cpu) {
251 if (cpu == dying) 202 if (cpu == dying)
252 continue; 203 continue;
253 if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) 204 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
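
With intel_init_thermal() factored out into mce_intel.c, the 64-bit Intel feature-init path keeps only the CMCI-specific parts. A sketch of what that call site is assumed to look like (mce_intel_feature_init() and intel_init_cmci() live elsewhere in mce_intel_64.c and are not shown in these hunks, so treat the body as an approximation):

	void mce_intel_feature_init(struct cpuinfo_x86 *c)
	{
		intel_init_thermal(c);	/* shared TM1/TM2 + thermal LVT setup */
		intel_init_cmci();	/* per-CPU CMCI discovery (assumed)    */
	}
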
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc9..70b710420f74 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,15 +6,14 @@
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
8 */ 8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h> 9#include <linux/interrupt.h>
16#include <linux/smp.h> 10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
17#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
18 17
19#include <asm/processor.h> 18#include <asm/processor.h>
20#include <asm/system.h> 19#include <asm/system.h>
@@ -22,9 +21,9 @@
22 21
23#include "mce.h" 22#include "mce.h"
24 23
25static int firstbank; 24static int firstbank;
26 25
27#define MCE_RATE 15*HZ /* timer rate is 15s */ 26#define MCE_RATE (15*HZ) /* timer rate is 15s */
28 27
29static void mce_checkregs(void *info) 28static void mce_checkregs(void *info)
30{ 29{
@@ -34,23 +33,24 @@ static void mce_checkregs(void *info)
34 for (i = firstbank; i < nr_mce_banks; i++) { 33 for (i = firstbank; i < nr_mce_banks; i++) {
35 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
36 35
37 if (high & (1<<31)) { 36 if (!(high & (1<<31)))
38 printk(KERN_INFO "MCE: The hardware reports a non " 37 continue;
39 "fatal, correctable incident occurred on " 38
40 "CPU %d.\n", 39 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
40 "correctable incident occurred on CPU %d.\n",
41 smp_processor_id()); 41 smp_processor_id());
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); 42
43 43 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
44 /* 44
45 * Scrub the error so we don't pick it up in MCE_RATE 45 /*
46 * seconds time. 46 * Scrub the error so we don't pick it up in MCE_RATE
47 */ 47 * seconds time:
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 48 */
49 49 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
50 /* Serialize */ 50
51 wmb(); 51 /* Serialize: */
52 add_taint(TAINT_MACHINE_CHECK); 52 wmb();
53 } 53 add_taint(TAINT_MACHINE_CHECK);
54 } 54 }
55} 55}
56 56
@@ -77,16 +77,17 @@ static int __init init_nonfatal_mce_checker(void)
77 77
78 /* Some Athlons misbehave when we frob bank 0 */ 78 /* Some Athlons misbehave when we frob bank 0 */
79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
80 boot_cpu_data.x86 == 6) 80 boot_cpu_data.x86 == 6)
81 firstbank = 1; 81 firstbank = 1;
82 else 82 else
83 firstbank = 0; 83 firstbank = 0;
84 84
85 /* 85 /*
86 * Check for non-fatal errors every MCE_RATE s 86 * Check for non-fatal errors every MCE_RATE s
87 */ 87 */
88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
89 printk(KERN_INFO "Machine check exception polling timer started.\n"); 89 printk(KERN_INFO "Machine check exception polling timer started.\n");
90
90 return 0; 91 return 0;
91} 92}
92module_init(init_nonfatal_mce_checker); 93module_init(init_nonfatal_mce_checker);
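
init_nonfatal_mce_checker() above only shows the first arming of the delayed work; the work function is expected to run mce_checkregs() on every CPU and then re-arm itself. A sketch of that shape, assuming the usual DECLARE_DELAYED_WORK() declaration earlier in non-fatal.c (not shown in the hunk):

	static void mce_work_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);

	static void mce_work_fn(struct work_struct *work)
	{
		/* Poll the banks on all CPUs, then re-arm for another MCE_RATE: */
		on_each_cpu(mce_checkregs, NULL, 1);
		schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
	}
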
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf382..82cee108a2d3 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -2,18 +2,17 @@
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4 4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h>
8#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/therm_throt.h>
11#include <asm/processor.h> 12#include <asm/processor.h>
12#include <asm/system.h> 13#include <asm/system.h>
13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15 15#include <asm/msr.h>
16#include <asm/therm_throt.h>
17 16
18#include "mce.h" 17#include "mce.h"
19 18
@@ -36,6 +35,7 @@ static int mce_num_extended_msrs;
36 35
37 36
38#ifdef CONFIG_X86_MCE_P4THERMAL 37#ifdef CONFIG_X86_MCE_P4THERMAL
38
39static void unexpected_thermal_interrupt(struct pt_regs *regs) 39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{ 40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
@@ -43,7 +43,7 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
43 add_taint(TAINT_MACHINE_CHECK); 43 add_taint(TAINT_MACHINE_CHECK);
44} 44}
45 45
46/* P4/Xeon Thermal transition interrupt handler */ 46/* P4/Xeon Thermal transition interrupt handler: */
47static void intel_thermal_interrupt(struct pt_regs *regs) 47static void intel_thermal_interrupt(struct pt_regs *regs)
48{ 48{
49 __u64 msr_val; 49 __u64 msr_val;
@@ -51,11 +51,12 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
51 ack_APIC_irq(); 51 ack_APIC_irq();
52 52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1); 54 therm_throt_process(msr_val & THERM_STATUS_PROCHOT);
55} 55}
56 56
57/* Thermal interrupt handler for this CPU setup */ 57/* Thermal interrupt handler for this CPU setup: */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; 58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) =
59 unexpected_thermal_interrupt;
59 60
60void smp_thermal_interrupt(struct pt_regs *regs) 61void smp_thermal_interrupt(struct pt_regs *regs)
61{ 62{
@@ -65,67 +66,15 @@ void smp_thermal_interrupt(struct pt_regs *regs)
65 irq_exit(); 66 irq_exit();
66} 67}
67 68
68/* P4/Xeon Thermal regulation detect and init */ 69void intel_set_thermal_handler(void)
69static void intel_init_thermal(struct cpuinfo_x86 *c)
70{ 70{
71 u32 l, h;
72 unsigned int cpu = smp_processor_id();
73
74 /* Thermal monitoring */
75 if (!cpu_has(c, X86_FEATURE_ACPI))
76 return; /* -ENODEV */
77
78 /* Clock modulation */
79 if (!cpu_has(c, X86_FEATURE_ACC))
80 return; /* -ENODEV */
81
82 /* first check if its enabled already, in which case there might
83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem.
85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR);
88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu);
91 return; /* -EBUSY */
92 }
93
94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n",
98 cpu, (h & APIC_VECTOR_MASK));
99 return; /* -EBUSY */
100 }
101
102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write(APIC_LVTTHMR, h);
106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109
110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt; 71 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119
120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1);
122 return;
123} 72}
124#endif /* CONFIG_X86_MCE_P4THERMAL */
125 73
74#endif /* CONFIG_X86_MCE_P4THERMAL */
126 75
127/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 76/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
128static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 77static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 78{
130 u32 h; 79 u32 h;
131 80
@@ -143,9 +92,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
143 92
144static void intel_machine_check(struct pt_regs *regs, long error_code) 93static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 94{
146 int recover = 1;
147 u32 alow, ahigh, high, low; 95 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 96 u32 mcgstl, mcgsth;
97 int recover = 1;
149 int i; 98 int i;
150 99
151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 100 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +106,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
157 106
158 if (mce_num_extended_msrs > 0) { 107 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 108 struct intel_mce_extended_msrs dbg;
109
160 intel_get_extended_msrs(&dbg); 110 intel_get_extended_msrs(&dbg);
111
161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" 112 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" 113 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 114 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +122,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
171 if (high & (1<<31)) { 122 if (high & (1<<31)) {
172 char misc[20]; 123 char misc[20];
173 char addr[24]; 124 char addr[24];
125
174 misc[0] = addr[0] = '\0'; 126 misc[0] = addr[0] = '\0';
175 if (high & (1<<29)) 127 if (high & (1<<29))
176 recover |= 1; 128 recover |= 1;
@@ -196,6 +148,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
196 panic("Unable to continue"); 148 panic("Unable to continue");
197 149
198 printk(KERN_EMERG "Attempting to continue.\n"); 150 printk(KERN_EMERG "Attempting to continue.\n");
151
199 /* 152 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 153 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
201 * recoverable/continuable.This will allow BIOS to look at the MSRs 154 * recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +170,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 170 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 171}
219 172
220
221void intel_p4_mcheck_init(struct cpuinfo_x86 *c) 173void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 174{
223 u32 l, h; 175 u32 l, h;
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69edc..015f481ab1b0 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,11 +2,10 @@
2 * P5 specific Machine Check Exception Reporting 2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -15,39 +14,58 @@
15 14
16#include "mce.h" 15#include "mce.h"
17 16
18/* Machine check handler for Pentium class Intel */ 17/* By default disabled */
18int mce_p5_enable;
19
20/* Machine check handler for Pentium class Intel CPUs: */
19static void pentium_machine_check(struct pt_regs *regs, long error_code) 21static void pentium_machine_check(struct pt_regs *regs, long error_code)
20{ 22{
21 u32 loaddr, hi, lotype; 23 u32 loaddr, hi, lotype;
24
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 25 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 26 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); 27
25 if (lotype&(1<<5)) 28 printk(KERN_EMERG
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); 29 "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
30 smp_processor_id(), loaddr, lotype);
31
32 if (lotype & (1<<5)) {
33 printk(KERN_EMERG
34 "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
35 smp_processor_id());
36 }
37
27 add_taint(TAINT_MACHINE_CHECK); 38 add_taint(TAINT_MACHINE_CHECK);
28} 39}
29 40
30/* Set up machine check reporting for processors with Intel style MCE */ 41/* Set up machine check reporting for processors with Intel style MCE: */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c) 42void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{ 43{
33 u32 l, h; 44 u32 l, h;
34 45
35 /*Check for MCE support */ 46 /* Check for MCE support: */
36 if (!cpu_has(c, X86_FEATURE_MCE)) 47 if (!cpu_has(c, X86_FEATURE_MCE))
37 return; 48 return;
38 49
39 /* Default P5 to off as its often misconnected */ 50#ifdef CONFIG_X86_OLD_MCE
51 /* Default P5 to off as its often misconnected: */
40 if (mce_disabled != -1) 52 if (mce_disabled != -1)
41 return; 53 return;
54#endif
55
42 machine_check_vector = pentium_machine_check; 56 machine_check_vector = pentium_machine_check;
57 /* Make sure the vector pointer is visible before we enable MCEs: */
43 wmb(); 58 wmb();
44 59
45 /* Read registers before enabling */ 60 /* Read registers before enabling: */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h); 61 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h); 62 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n"); 63 printk(KERN_INFO
64 "Intel old style machine check architecture supported.\n");
49 65
50 /* Enable MCE */ 66 /* Enable MCE: */
51 set_in_cr4(X86_CR4_MCE); 67 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); 68 printk(KERN_INFO
69 "Intel old style machine check reporting enabled on CPU#%d.\n",
70 smp_processor_id());
53} 71}
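
The new comment before wmb() makes the ordering rule explicit: install the handler pointer, make it globally visible, and only then enable machine checks in CR4, so an #MC that fires right after enable cannot observe a stale vector. The same pattern recurs in the P6 and WinChip init paths below; a condensed, family-neutral illustration (family_mcheck_init / family_machine_check are hypothetical names, machine_check_vector is the pointer declared in mce.h):

	static void family_machine_check(struct pt_regs *regs, long error_code);

	void family_mcheck_init(struct cpuinfo_x86 *c)
	{
		if (!cpu_has(c, X86_FEATURE_MCE))
			return;

		machine_check_vector = family_machine_check;	/* install handler     */
		wmb();						/* publish it ...      */

		set_in_cr4(X86_CR4_MCE);			/* ... then enable #MC */
	}
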
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434b..43c24e667457 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,11 +2,10 @@
2 * P6 specific Machine Check Exception Reporting 2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
@@ -18,9 +17,9 @@
18/* Machine Check Handler For PII/PIII */ 17/* Machine Check Handler For PII/PIII */
19static void intel_machine_check(struct pt_regs *regs, long error_code) 18static void intel_machine_check(struct pt_regs *regs, long error_code)
20{ 19{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 20 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 21 u32 mcgstl, mcgsth;
22 int recover = 1;
24 int i; 23 int i;
25 24
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 25 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +34,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
35 if (high & (1<<31)) { 34 if (high & (1<<31)) {
36 char misc[20]; 35 char misc[20];
37 char addr[24]; 36 char addr[24];
38 misc[0] = addr[0] = '\0'; 37
38 misc[0] = '\0';
39 addr[0] = '\0';
40
39 if (high & (1<<29)) 41 if (high & (1<<29))
40 recover |= 1; 42 recover |= 1;
41 if (high & (1<<25)) 43 if (high & (1<<25))
42 recover |= 2; 44 recover |= 2;
43 high &= ~(1<<31); 45 high &= ~(1<<31);
46
44 if (high & (1<<27)) { 47 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 48 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 49 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +52,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 52 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 53 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 54 }
55
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 56 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 57 smp_processor_id(), i, high, low, misc, addr);
54 } 58 }
@@ -63,16 +67,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
63 /* 67 /*
64 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 68 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
65 * recoverable/continuable.This will allow BIOS to look at the MSRs 69 * recoverable/continuable.This will allow BIOS to look at the MSRs
66 * for errors if the OS could not log the error. 70 * for errors if the OS could not log the error:
67 */ 71 */
68 for (i = 0; i < nr_mce_banks; i++) { 72 for (i = 0; i < nr_mce_banks; i++) {
69 unsigned int msr; 73 unsigned int msr;
74
70 msr = MSR_IA32_MC0_STATUS+i*4; 75 msr = MSR_IA32_MC0_STATUS+i*4;
71 rdmsr(msr, low, high); 76 rdmsr(msr, low, high);
72 if (high & (1<<31)) { 77 if (high & (1<<31)) {
73 /* Clear it */ 78 /* Clear it: */
74 wrmsr(msr, 0UL, 0UL); 79 wrmsr(msr, 0UL, 0UL);
75 /* Serialize */ 80 /* Serialize: */
76 wmb(); 81 wmb();
77 add_taint(TAINT_MACHINE_CHECK); 82 add_taint(TAINT_MACHINE_CHECK);
78 } 83 }
@@ -81,7 +86,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
81 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 86 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
82} 87}
83 88
84/* Set up machine check reporting for processors with Intel style MCE */ 89/* Set up machine check reporting for processors with Intel style MCE: */
85void intel_p6_mcheck_init(struct cpuinfo_x86 *c) 90void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
86{ 91{
87 u32 l, h; 92 u32 l, h;
@@ -97,6 +102,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
97 102
98 /* Ok machine check is available */ 103 /* Ok machine check is available */
99 machine_check_vector = intel_machine_check; 104 machine_check_vector = intel_machine_check;
105 /* Make sure the vector pointer is visible before we enable MCEs: */
100 wmb(); 106 wmb();
101 107
102 printk(KERN_INFO "Intel machine check architecture supported.\n"); 108 printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b9..7b1ae2e20ba5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 *
3 * Thermal throttle event support code (such as syslog messaging and rate 2 * Thermal throttle event support code (such as syslog messaging and rate
4 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). 3 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
4 *
5 * This allows consistent reporting of CPU thermal throttle events. 5 * This allows consistent reporting of CPU thermal throttle events.
6 * 6 *
7 * Maintains a counter in /sys that keeps track of the number of thermal 7 * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,43 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16 16#include <linux/notifier.h>
17#include <linux/jiffies.h>
17#include <linux/percpu.h> 18#include <linux/percpu.h>
18#include <linux/sysdev.h> 19#include <linux/sysdev.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <asm/cpu.h> 21
21#include <linux/notifier.h>
22#include <linux/jiffies.h>
23#include <asm/therm_throt.h> 22#include <asm/therm_throt.h>
24 23
25/* How long to wait between reporting thermal events */ 24/* How long to wait between reporting thermal events */
26#define CHECK_INTERVAL (300 * HZ) 25#define CHECK_INTERVAL (300 * HZ)
27 26
28static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 27static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
29static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 28static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
30atomic_t therm_throt_en = ATOMIC_INIT(0); 29
30atomic_t therm_throt_en = ATOMIC_INIT(0);
31 31
32#ifdef CONFIG_SYSFS 32#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 33#define define_therm_throt_sysdev_one_ro(_name) \
34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \ 38 struct sysdev_attribute *attr, \
39 char *buf) \ 39 char *buf) \
40{ \ 40{ \
41 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
42 ssize_t ret; \ 42 ssize_t ret; \
43 \ 43 \
44 preempt_disable(); /* CPU hotplug */ \ 44 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \ 45 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \ 46 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \ 47 per_cpu(thermal_throttle_##name, cpu)); \
48 else \ 48 else \
49 ret = 0; \ 49 ret = 0; \
50 preempt_enable(); \ 50 preempt_enable(); \
51 \ 51 \
52 return ret; \ 52 return ret; \
53} 53}
54 54
55define_therm_throt_sysdev_show_func(count); 55define_therm_throt_sysdev_show_func(count);
@@ -61,8 +61,8 @@ static struct attribute *thermal_throttle_attrs[] = {
61}; 61};
62 62
63static struct attribute_group thermal_throttle_attr_group = { 63static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs, 64 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle" 65 .name = "thermal_throttle"
66}; 66};
67#endif /* CONFIG_SYSFS */ 67#endif /* CONFIG_SYSFS */
68 68
@@ -110,10 +110,11 @@ int therm_throt_process(int curr)
110} 110}
111 111
112#ifdef CONFIG_SYSFS 112#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */ 113/* Add/Remove thermal_throttle interface for CPU device: */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{ 115{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj,
117 &thermal_throttle_attr_group);
117} 118}
118 119
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 120static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +122,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
121 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 122 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122} 123}
123 124
124/* Mutex protecting device creation against CPU hotplug */ 125/* Mutex protecting device creation against CPU hotplug: */
125static DEFINE_MUTEX(therm_cpu_lock); 126static DEFINE_MUTEX(therm_cpu_lock);
126 127
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 128/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, 129static __cpuinit int
129 unsigned long action, 130thermal_throttle_cpu_callback(struct notifier_block *nfb,
130 void *hcpu) 131 unsigned long action,
132 void *hcpu)
131{ 133{
132 unsigned int cpu = (unsigned long)hcpu; 134 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev; 135 struct sys_device *sys_dev;
134 int err = 0; 136 int err = 0;
135 137
136 sys_dev = get_cpu_sysdev(cpu); 138 sys_dev = get_cpu_sysdev(cpu);
139
137 switch (action) { 140 switch (action) {
138 case CPU_UP_PREPARE: 141 case CPU_UP_PREPARE:
139 case CPU_UP_PREPARE_FROZEN: 142 case CPU_UP_PREPARE_FROZEN:
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f78..d746df2909c9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
17 17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt; 18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void mce_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle(); 22 exit_idle();
23 irq_enter(); 23 irq_enter();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811d..81b02487090b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,11 +2,10 @@
2 * IDT Winchip specific Machine Check Exception Reporting 2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10 9
11#include <asm/processor.h> 10#include <asm/processor.h>
12#include <asm/system.h> 11#include <asm/system.h>
@@ -14,7 +13,7 @@
14 13
15#include "mce.h" 14#include "mce.h"
16 15
17/* Machine check handler for WinChip C6 */ 16/* Machine check handler for WinChip C6: */
18static void winchip_machine_check(struct pt_regs *regs, long error_code) 17static void winchip_machine_check(struct pt_regs *regs, long error_code)
19{ 18{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 19 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +24,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
25void winchip_mcheck_init(struct cpuinfo_x86 *c) 24void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{ 25{
27 u32 lo, hi; 26 u32 lo, hi;
27
28 machine_check_vector = winchip_machine_check; 28 machine_check_vector = winchip_machine_check;
29 /* Make sure the vector pointer is visible before we enable MCEs: */
29 wmb(); 30 wmb();
31
30 rdmsr(MSR_IDT_FCR1, lo, hi); 32 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ 33 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo &= ~(1<<4); /* Enable MCE */ 34 lo &= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi); 35 wrmsr(MSR_IDT_FCR1, lo, hi);
36
34 set_in_cr4(X86_CR4_MCE); 37 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); 38
39 printk(KERN_INFO
40 "Winchip machine check reporting enabled on CPU#0.\n");
36} 41}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a4742a340d8d..de74f0a3e0ed 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -963,6 +963,8 @@ END(\sym)
963#ifdef CONFIG_SMP 963#ifdef CONFIG_SMP
964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ 964apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 965 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
966apicinterrupt REBOOT_VECTOR \
967 reboot_interrupt smp_reboot_interrupt
966#endif 968#endif
967 969
968#ifdef CONFIG_X86_UV 970#ifdef CONFIG_X86_UV
@@ -994,10 +996,15 @@ apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
994#endif 996#endif
995 997
996apicinterrupt THRESHOLD_APIC_VECTOR \ 998apicinterrupt THRESHOLD_APIC_VECTOR \
997 threshold_interrupt mce_threshold_interrupt 999 threshold_interrupt smp_threshold_interrupt
998apicinterrupt THERMAL_APIC_VECTOR \ 1000apicinterrupt THERMAL_APIC_VECTOR \
999 thermal_interrupt smp_thermal_interrupt 1001 thermal_interrupt smp_thermal_interrupt
1000 1002
1003#ifdef CONFIG_X86_MCE
1004apicinterrupt MCE_SELF_VECTOR \
1005 mce_self_interrupt smp_mce_self_interrupt
1006#endif
1007
1001#ifdef CONFIG_SMP 1008#ifdef CONFIG_SMP
1002apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ 1009apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1003 call_function_single_interrupt smp_call_function_single_interrupt 1010 call_function_single_interrupt smp_call_function_single_interrupt
@@ -1379,7 +1386,7 @@ errorentry xen_stack_segment do_stack_segment
1379errorentry general_protection do_general_protection 1386errorentry general_protection do_general_protection
1380errorentry page_fault do_page_fault 1387errorentry page_fault do_page_fault
1381#ifdef CONFIG_X86_MCE 1388#ifdef CONFIG_X86_MCE
1382paranoidzeroentry machine_check do_machine_check 1389paranoidzeroentry machine_check *machine_check_vector(%rip)
1383#endif 1390#endif
1384 1391
1385 /* 1392 /*
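
The 64-bit #MC entry above now dispatches through a function pointer instead of calling do_machine_check directly, matching the indirection the 32-bit entry code has long used. The pointer itself is defined in mce.c rather than in these hunks; its assumed shape, for reference:

	/* Assumed definition (arch/x86/kernel/cpu/mcheck/mce.c): */
	void (*machine_check_vector)(struct pt_regs *, long error_code) =
							unexpected_machine_check;
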
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 38287b5f116e..b0cdde6932f5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
12#include <asm/io_apic.h> 12#include <asm/io_apic.h>
13#include <asm/irq.h> 13#include <asm/irq.h>
14#include <asm/idle.h> 14#include <asm/idle.h>
15#include <asm/mce.h>
15#include <asm/hw_irq.h> 16#include <asm/hw_irq.h>
16 17
17atomic_t irq_err_count; 18atomic_t irq_err_count;
@@ -96,13 +97,23 @@ static int show_other_interrupts(struct seq_file *p, int prec)
96 for_each_online_cpu(j) 97 for_each_online_cpu(j)
97 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
98 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
99# ifdef CONFIG_X86_64 100# ifdef CONFIG_X86_MCE_THRESHOLD
100 seq_printf(p, "%*s: ", prec, "THR"); 101 seq_printf(p, "%*s: ", prec, "THR");
101 for_each_online_cpu(j) 102 for_each_online_cpu(j)
102 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
103 seq_printf(p, " Threshold APIC interrupts\n"); 104 seq_printf(p, " Threshold APIC interrupts\n");
104# endif 105# endif
105#endif 106#endif
107#ifdef CONFIG_X86_NEW_MCE
108 seq_printf(p, "%*s: ", prec, "MCE");
109 for_each_online_cpu(j)
110 seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
111 seq_printf(p, " Machine check exceptions\n");
112 seq_printf(p, "%*s: ", prec, "MCP");
113 for_each_online_cpu(j)
114 seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
115 seq_printf(p, " Machine check polls\n");
116#endif
106 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); 117 seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
107#if defined(CONFIG_X86_IO_APIC) 118#if defined(CONFIG_X86_IO_APIC)
108 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); 119 seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -185,10 +196,14 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
185#endif 196#endif
186#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_MCE
187 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
188# ifdef CONFIG_X86_64 199# ifdef CONFIG_X86_MCE_THRESHOLD
189 sum += irq_stats(cpu)->irq_threshold_count; 200 sum += irq_stats(cpu)->irq_threshold_count;
190# endif 201# endif
191#endif 202#endif
203#ifdef CONFIG_X86_NEW_MCE
204 sum += per_cpu(mce_exception_count, cpu);
205 sum += per_cpu(mce_poll_count, cpu);
206#endif
192 return sum; 207 return sum;
193} 208}
194 209
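
The two new /proc/interrupts rows (MCE, MCP) read per-CPU counters maintained by the core MCE code; their declarations and increment sites are not in these hunks. Assumed shape, for reference only:

	/* asm/mce.h (assumed): */
	DECLARE_PER_CPU(unsigned, mce_exception_count);
	DECLARE_PER_CPU(unsigned, mce_poll_count);

	/*
	 * mce.c (assumed): do_machine_check() and machine_check_poll() each
	 * bump their counter, e.g.:
	 *
	 *	__get_cpu_var(mce_exception_count)++;
	 */
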
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 267c6624c77f..696f0e475c2d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -173,6 +173,9 @@ static void __init smp_intr_init(void)
173 /* Low priority IPI to cleanup after moving an irq */ 173 /* Low priority IPI to cleanup after moving an irq */
174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 174 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 175 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
176
177 /* IPI used for rebooting/stopping */
178 alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
176#endif 179#endif
177#endif /* CONFIG_SMP */ 180#endif /* CONFIG_SMP */
178} 181}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 0a813b17b172..4c578751e94e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -24,11 +24,11 @@
24#include <asm/ucontext.h> 24#include <asm/ucontext.h>
25#include <asm/i387.h> 25#include <asm/i387.h>
26#include <asm/vdso.h> 26#include <asm/vdso.h>
27#include <asm/mce.h>
27 28
28#ifdef CONFIG_X86_64 29#ifdef CONFIG_X86_64
29#include <asm/proto.h> 30#include <asm/proto.h>
30#include <asm/ia32_unistd.h> 31#include <asm/ia32_unistd.h>
31#include <asm/mce.h>
32#endif /* CONFIG_X86_64 */ 32#endif /* CONFIG_X86_64 */
33 33
34#include <asm/syscall.h> 34#include <asm/syscall.h>
@@ -856,10 +856,10 @@ static void do_signal(struct pt_regs *regs)
856void 856void
857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 857do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
858{ 858{
859#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) 859#ifdef CONFIG_X86_NEW_MCE
860 /* notify userspace of pending MCEs */ 860 /* notify userspace of pending MCEs */
861 if (thread_info_flags & _TIF_MCE_NOTIFY) 861 if (thread_info_flags & _TIF_MCE_NOTIFY)
862 mce_notify_user(); 862 mce_notify_process();
863#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 863#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 863#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 863#endif /* CONFIG_X86_NEW_MCE */
864 864
865 /* deal with pending signal delivery */ 865 /* deal with pending signal delivery */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 28f5fb495a66..ec1de97600e7 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -150,14 +150,40 @@ void native_send_call_func_ipi(const struct cpumask *mask)
150 * this function calls the 'stop' function on all other CPUs in the system. 150 * this function calls the 'stop' function on all other CPUs in the system.
151 */ 151 */
152 152
153asmlinkage void smp_reboot_interrupt(void)
154{
155 ack_APIC_irq();
156 irq_enter();
157 stop_this_cpu(NULL);
158 irq_exit();
159}
160
153static void native_smp_send_stop(void) 161static void native_smp_send_stop(void)
154{ 162{
155 unsigned long flags; 163 unsigned long flags;
164 unsigned long wait;
156 165
157 if (reboot_force) 166 if (reboot_force)
158 return; 167 return;
159 168
160 smp_call_function(stop_this_cpu, NULL, 0); 169 /*
170 * Use an own vector here because smp_call_function
171 * does lots of things not suitable in a panic situation.
172 * On most systems we could also use an NMI here,
173 * but there are a few systems around where NMI
174 * is problematic so stay with an non NMI for now
175 * (this implies we cannot stop CPUs spinning with irq off
176 * currently)
177 */
178 if (num_online_cpus() > 1) {
179 apic->send_IPI_allbutself(REBOOT_VECTOR);
180
181 /* Don't wait longer than a second */
182 wait = USEC_PER_SEC;
183 while (num_online_cpus() > 1 && wait--)
184 udelay(1);
185 }
186
161 local_irq_save(flags); 187 local_irq_save(flags);
162 disable_local_APIC(); 188 disable_local_APIC();
163 local_irq_restore(flags); 189 local_irq_restore(flags);
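
native_smp_send_stop() now raises a dedicated vector rather than going through smp_call_function(), which does too much to be safe in a panic context. Besides the handler and stubs shown in the smp.c, irqinit.c and entry_64.S hunks, the vector number itself has to be reserved in asm/irq_vectors.h; roughly (the exact value is an assumption, not taken from the patch):

	/* asm/irq_vectors.h (assumed addition): */
	#define REBOOT_VECTOR		0xf8
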
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 07d60c870ce2..1e1e27b7d438 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -798,15 +798,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
798 798
799 return new_kesp; 799 return new_kesp;
800} 800}
801#else 801#endif
802
802asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) 803asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
803{ 804{
804} 805}
805 806
806asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) 807asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
807{ 808{
808} 809}
809#endif
810 810
811/* 811/*
812 * 'math_state_restore()' saves the current math information in the 812 * 'math_state_restore()' saves the current math information in the