Diffstat (limited to 'arch/x86/kernel/cpu/mcheck')
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 13
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 45
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 127
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 218
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 2049
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h | 14
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 76
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 1187
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c (renamed from arch/x86/kernel/cpu/mcheck/mce_amd_64.c) | 203
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c (renamed from arch/x86/kernel/cpu/mcheck/mce_intel_64.c) | 84
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 60
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 112
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 51
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 29
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 177
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 20
18 files changed, 2798 insertions(+), 1684 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index b2f89829bbe8..188a1ca5ad2b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,7 +1,12 @@
-obj-y = mce_$(BITS).o therm_throt.o
+obj-y = mce.o
 
-obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
-obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
-obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o
+obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o
+obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
 obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
 obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
+
+obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index dd3af6e7b39a..b945d5dbc609 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -2,25 +2,23 @@
  * Athlon specific Machine Check Exception Reporting
  * (C) Copyright 2002 Dave Jones <davej@redhat.com>
  */
-
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
+#include <asm/mce.h>
 #include <asm/msr.h>
 
-#include "mce.h"
-
-/* Machine Check Handler For AMD Athlon/Duron */
+/* Machine Check Handler For AMD Athlon/Duron: */
 static void k7_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
+	int recover = 1;
 	int i;
 
 	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -32,15 +30,19 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 
 	for (i = 1; i < nr_mce_banks; i++) {
 		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
-		if (high&(1<<31)) {
+		if (high & (1<<31)) {
 			char misc[20];
 			char addr[24];
-			misc[0] = addr[0] = '\0';
+
+			misc[0] = '\0';
+			addr[0] = '\0';
+
 			if (high & (1<<29))
 				recover |= 1;
 			if (high & (1<<25))
 				recover |= 2;
 			high &= ~(1<<31);
+
 			if (high & (1<<27)) {
 				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
 				snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,27 +51,31 @@ static void k7_machine_check(struct pt_regs *regs, long error_code)
 				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
 				snprintf(addr, 24, " at %08x%08x", ahigh, alow);
 			}
+
 			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
 				smp_processor_id(), i, high, low, misc, addr);
-			/* Clear it */
+
+			/* Clear it: */
 			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-			/* Serialize */
+			/* Serialize: */
 			wmb();
 			add_taint(TAINT_MACHINE_CHECK);
 		}
 	}
 
-	if (recover&2)
+	if (recover & 2)
 		panic("CPU context corrupt");
-	if (recover&1)
+	if (recover & 1)
 		panic("Unable to continue");
+
 	printk(KERN_EMERG "Attempting to continue.\n");
+
 	mcgstl &= ~(1<<2);
 	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
-/* AMD K7 machine check is Intel like */
+/* AMD K7 machine check is Intel like: */
 void amd_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -79,21 +85,26 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 		return;
 
 	machine_check_vector = k7_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
 	wmb();
 
 	printk(KERN_INFO "Intel machine check architecture supported.\n");
+
 	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 	nr_mce_banks = l & 0xff;
 
-	/* Clear status for MC index 0 separately, we don't touch CTL,
-	 * as some K7 Athlons cause spurious MCEs when its enabled. */
+	/*
+	 * Clear status for MC index 0 separately, we don't touch CTL,
+	 * as some K7 Athlons cause spurious MCEs when its enabled:
+	 */
 	if (boot_cpu_data.x86 == 6) {
 		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
 		i = 1;
 	} else
 		i = 0;
+
 	for (; i < nr_mce_banks; i++) {
 		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
 		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000000..a3a235a53f09
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,127 @@
1/*
2 * Machine check injection support.
3 * Copyright 2008 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Authors:
11 * Andi Kleen
12 * Ying Huang
13 */
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/timer.h>
17#include <linux/kernel.h>
18#include <linux/string.h>
19#include <linux/fs.h>
20#include <linux/smp.h>
21#include <asm/mce.h>
22
23/* Update fake mce registers on current CPU. */
24static void inject_mce(struct mce *m)
25{
26 struct mce *i = &per_cpu(injectm, m->extcpu);
27
28 /* Make sure noone reads partially written injectm */
29 i->finished = 0;
30 mb();
31 m->finished = 0;
32 /* First set the fields after finished */
33 i->extcpu = m->extcpu;
34 mb();
35 /* Now write record in order, finished last (except above) */
36 memcpy(i, m, sizeof(struct mce));
37 /* Finally activate it */
38 mb();
39 i->finished = 1;
40}
41
42struct delayed_mce {
43 struct timer_list timer;
44 struct mce m;
45};
46
47/* Inject mce on current CPU */
48static void raise_mce(unsigned long data)
49{
50 struct delayed_mce *dm = (struct delayed_mce *)data;
51 struct mce *m = &dm->m;
52 int cpu = m->extcpu;
53
54 inject_mce(m);
55 if (m->status & MCI_STATUS_UC) {
56 struct pt_regs regs;
57 memset(&regs, 0, sizeof(struct pt_regs));
58 regs.ip = m->ip;
59 regs.cs = m->cs;
60 printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
61 do_machine_check(&regs, 0);
62 printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
63 } else {
64 mce_banks_t b;
65 memset(&b, 0xff, sizeof(mce_banks_t));
66 printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
67 machine_check_poll(0, &b);
68 mce_notify_irq();
69 printk(KERN_INFO "Finished machine check poll on CPU %d\n",
70 cpu);
71 }
72 kfree(dm);
73}
74
75/* Error injection interface */
76static ssize_t mce_write(struct file *filp, const char __user *ubuf,
77 size_t usize, loff_t *off)
78{
79 struct delayed_mce *dm;
80 struct mce m;
81
82 if (!capable(CAP_SYS_ADMIN))
83 return -EPERM;
84 /*
85 * There are some cases where real MSR reads could slip
86 * through.
87 */
88 if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
89 return -EIO;
90
91 if ((unsigned long)usize > sizeof(struct mce))
92 usize = sizeof(struct mce);
93 if (copy_from_user(&m, ubuf, usize))
94 return -EFAULT;
95
96 if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
97 return -EINVAL;
98
99 dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
100 if (!dm)
101 return -ENOMEM;
102
103 /*
104 * Need to give user space some time to set everything up,
105 * so do it a jiffie or two later everywhere.
106 * Should we use a hrtimer here for better synchronization?
107 */
108 memcpy(&dm->m, &m, sizeof(struct mce));
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize;
113}
114
115static int inject_init(void)
116{
117 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write;
119 return 0;
120}
121
122module_init(inject_init);
123/*
124 * Cannot tolerate unloading currently because we cannot
125 * guarantee all openers of mce_chrdev will get a reference to us.
126 */
127MODULE_LICENSE("GPL");
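
The new mce-inject.c above hooks the write() path of the /dev/mcelog character device: a CAP_SYS_ADMIN process writes a binary struct mce describing the fake error, and a timer then raises it on the target CPU a couple of jiffies later (as an exception for UC errors, as a poll otherwise). The fragment below is only a minimal user-space sketch of that interface, assuming the struct mce record layout exported by this tree's <asm/mce.h>; in practice injection is normally driven by the mce-inject utility rather than hand-written code.

/*
 * Hypothetical sketch: inject a *corrected* error on CPU 0 by writing a
 * struct mce record to /dev/mcelog.  Assumes <asm/mce.h> exports the same
 * struct mce layout as the kernel above; the status bits are redefined
 * locally in case the header does not expose them to user space.
 */
#include <asm/mce.h>		/* struct mce (/dev/mcelog record format) */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef MCI_STATUS_VAL
#define MCI_STATUS_VAL	(1ULL << 63)	/* valid error, mirrors the kernel bit */
#endif
#ifndef MCI_STATUS_EN
#define MCI_STATUS_EN	(1ULL << 60)	/* error reporting enabled */
#endif

int main(void)
{
	struct mce m;
	int fd = open("/dev/mcelog", O_RDWR);

	if (fd < 0) {
		perror("open /dev/mcelog");
		return 1;
	}

	memset(&m, 0, sizeof(m));
	m.extcpu = 0;					/* CPU that should see the error */
	m.bank   = 1;					/* MCA bank to fake */
	m.status = MCI_STATUS_VAL | MCI_STATUS_EN;	/* corrected: UC bit left clear */

	/* mce_write() copies the record and arms a timer on m.extcpu */
	if (write(fd, &m, sizeof(m)) < 0)
		perror("write /dev/mcelog");

	close(fd);
	return 0;
}
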
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000000..54dcb8ff12e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,15 @@
1#include <asm/mce.h>
2
3enum severity_level {
4 MCE_NO_SEVERITY,
5 MCE_KEEP_SEVERITY,
6 MCE_SOME_SEVERITY,
7 MCE_AO_SEVERITY,
8 MCE_UC_SEVERITY,
9 MCE_AR_SEVERITY,
10 MCE_PANIC_SEVERITY,
11};
12
13int mce_severity(struct mce *a, int tolerant, char **msg);
14
15extern int mce_ser;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000000..ff0807f97056
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,218 @@
1/*
2 * MCE grading rules.
3 * Copyright 2008, 2009 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 *
10 * Author: Andi Kleen
11 */
12#include <linux/kernel.h>
13#include <linux/seq_file.h>
14#include <linux/init.h>
15#include <linux/debugfs.h>
16#include <asm/mce.h>
17
18#include "mce-internal.h"
19
20/*
21 * Grade an mce by severity. In general the most severe ones are processed
22 * first. Since there are quite a lot of combinations test the bits in a
23 * table-driven way. The rules are simply processed in order, first
24 * match wins.
25 *
26 * Note this is only used for machine check exceptions, the corrected
27 * errors use much simpler rules. The exceptions still check for the corrected
28 * errors, but only to leave them alone for the CMCI handler (except for
29 * panic situations)
30 */
31
32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34
35static struct severity {
36 u64 mask;
37 u64 result;
38 unsigned char sev;
39 unsigned char mcgmask;
40 unsigned char mcgres;
41 unsigned char ser;
42 unsigned char context;
43 unsigned char covered;
44 char *msg;
45} severities[] = {
46#define KERNEL .context = IN_KERNEL
47#define USER .context = IN_USER
48#define SER .ser = SER_REQUIRED
49#define NOSER .ser = NO_SER
50#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
51#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
52#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
53#define MCGMASK(x, res, s, m, r...) \
54 { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
55#define MASK(x, y, s, m, r...) \
56 { .mask = x, .result = y, SEV(s), .msg = m, ## r }
57#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
58#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
59#define MCACOD 0xffff
60
61 BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
62 BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
63 BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
64 /* When MCIP is not set something is very confused */
65 MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
66 /* Neither return not error IP -- no chance to recover -> PANIC */
67 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
68 "Neither restart nor error IP"),
69 MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
70 KERNEL),
71 BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
72 MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
73 "Spurious not enabled", SER),
74
75 /* ignore OVER for UCNA */
76 MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
77 "Uncorrected no action required", SER),
78 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
79 "Illegal combination (UCNA with AR=1)", SER),
80 MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
81
82 /* AR add known MCACODs here */
83 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
84 "Action required with lost events", SER),
85 MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
86 "Action required; unknown MCACOD", SER),
87
88 /* known AO MCACODs: */
89 MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
90 "Action optional: memory scrubbing error", SER),
91 MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
92 "Action optional: last level cache writeback error", SER),
93
94 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
95 "Action optional unknown MCACOD", SER),
96 MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
97 "Action optional with lost events", SER),
98 BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
99 BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
100 BITSET(0, SOME, "No match") /* always matches. keep at end */
101};
102
103/*
104 * If the EIPV bit is set, it means the saved IP is the
105 * instruction which caused the MCE.
106 */
107static int error_context(struct mce *m)
108{
109 if (m->mcgstatus & MCG_STATUS_EIPV)
110 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
111 /* Unknown, assume kernel */
112 return IN_KERNEL;
113}
114
115int mce_severity(struct mce *a, int tolerant, char **msg)
116{
117 enum context ctx = error_context(a);
118 struct severity *s;
119
120 for (s = severities;; s++) {
121 if ((a->status & s->mask) != s->result)
122 continue;
123 if ((a->mcgstatus & s->mcgmask) != s->mcgres)
124 continue;
125 if (s->ser == SER_REQUIRED && !mce_ser)
126 continue;
127 if (s->ser == NO_SER && mce_ser)
128 continue;
129 if (s->context && ctx != s->context)
130 continue;
131 if (msg)
132 *msg = s->msg;
133 s->covered = 1;
134 if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
135 if (panic_on_oops || tolerant < 1)
136 return MCE_PANIC_SEVERITY;
137 }
138 return s->sev;
139 }
140}
141
142static void *s_start(struct seq_file *f, loff_t *pos)
143{
144 if (*pos >= ARRAY_SIZE(severities))
145 return NULL;
146 return &severities[*pos];
147}
148
149static void *s_next(struct seq_file *f, void *data, loff_t *pos)
150{
151 if (++(*pos) >= ARRAY_SIZE(severities))
152 return NULL;
153 return &severities[*pos];
154}
155
156static void s_stop(struct seq_file *f, void *data)
157{
158}
159
160static int s_show(struct seq_file *f, void *data)
161{
162 struct severity *ser = data;
163 seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
164 return 0;
165}
166
167static const struct seq_operations severities_seq_ops = {
168 .start = s_start,
169 .next = s_next,
170 .stop = s_stop,
171 .show = s_show,
172};
173
174static int severities_coverage_open(struct inode *inode, struct file *file)
175{
176 return seq_open(file, &severities_seq_ops);
177}
178
179static ssize_t severities_coverage_write(struct file *file,
180 const char __user *ubuf,
181 size_t count, loff_t *ppos)
182{
183 int i;
184 for (i = 0; i < ARRAY_SIZE(severities); i++)
185 severities[i].covered = 0;
186 return count;
187}
188
189static const struct file_operations severities_coverage_fops = {
190 .open = severities_coverage_open,
191 .release = seq_release,
192 .read = seq_read,
193 .write = severities_coverage_write,
194};
195
196static int __init severities_debugfs_init(void)
197{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199
200 dmce = debugfs_create_dir("mce", NULL);
201 if (dmce == NULL)
202 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage",
204 0444, dmce, NULL,
205 &severities_coverage_fops);
206 if (fseverities_coverage == NULL)
207 goto err_out;
208
209 return 0;
210
211err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM;
217}
218late_initcall(severities_debugfs_init);
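
The severities[] table above is scanned strictly in order, first match wins: a rule applies when (status & mask) == result, (mcgstatus & mcgmask) == mcgres, its SER annotation matches whether the CPU supports software error recovery, and its context (kernel vs. user) agrees. The fragment below is a self-contained toy model of that matching scheme only; the bits, rules and severity numbers are invented for illustration and are not the kernel's table.

/*
 * Toy model of the table-driven, first-match-wins grading used by
 * mce_severity().  Bit values and rules are made up; the real table also
 * qualifies matches on mcgstatus, execution context and SER capability.
 */
#include <stdint.h>
#include <stdio.h>

#define ST_VAL	(1u << 3)	/* record is valid */
#define ST_UC	(1u << 2)	/* uncorrected */
#define ST_EN	(1u << 1)	/* reporting enabled */
#define ST_PCC	(1u << 0)	/* processor context corrupt */

struct rule {
	uint32_t	mask;	/* which status bits to look at */
	uint32_t	result;	/* value those bits must have */
	int		sev;
	const char	*msg;
};

static const struct rule rules[] = {
	{ ST_VAL, 0,      0, "Invalid (VAL clear)"       },
	{ ST_PCC, ST_PCC, 4, "Processor context corrupt" },
	{ ST_UC,  0,      1, "Corrected error"           },
	{ ST_UC,  ST_UC,  3, "Uncorrected"               },
	{ 0,      0,      2, "No match (catch-all)"      },	/* keep last */
};

static int grade(uint32_t status, const char **msg)
{
	const struct rule *r;

	for (r = rules; ; r++) {
		if ((status & r->mask) != r->result)
			continue;
		*msg = r->msg;		/* first matching rule wins */
		return r->sev;
	}
}

int main(void)
{
	const char *msg;
	uint32_t status = ST_VAL | ST_EN | ST_UC | ST_PCC;
	int sev = grade(status, &msg);

	/* PCC outranks plain UC only because its rule is listed first */
	printf("severity %d: %s\n", sev, msg);
	return 0;
}
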
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000000..284d1de968bc
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,2049 @@
1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10#include <linux/thread_info.h>
11#include <linux/capability.h>
12#include <linux/miscdevice.h>
13#include <linux/interrupt.h>
14#include <linux/ratelimit.h>
15#include <linux/kallsyms.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/sysdev.h>
24#include <linux/delay.h>
25#include <linux/ctype.h>
26#include <linux/sched.h>
27#include <linux/sysfs.h>
28#include <linux/types.h>
29#include <linux/init.h>
30#include <linux/kmod.h>
31#include <linux/poll.h>
32#include <linux/nmi.h>
33#include <linux/cpu.h>
34#include <linux/smp.h>
35#include <linux/fs.h>
36#include <linux/mm.h>
37
38#include <asm/processor.h>
39#include <asm/hw_irq.h>
40#include <asm/apic.h>
41#include <asm/idle.h>
42#include <asm/ipi.h>
43#include <asm/mce.h>
44#include <asm/msr.h>
45
46#include "mce-internal.h"
47
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52 smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57 unexpected_machine_check;
58
59int mce_disabled __read_mostly;
60
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR 227
64
65#define SPINUNIT 100 /* 100ns */
66
67atomic_t mce_entry;
68
69DEFINE_PER_CPU(unsigned, mce_exception_count);
70
71/*
72 * Tolerant levels:
73 * 0: always panic on uncorrected errors, log corrected errors
74 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
75 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
76 * 3: never panic or SIGBUS, log all errors (for testing only)
77 */
78static int tolerant __read_mostly = 1;
79static int banks __read_mostly;
80static u64 *bank __read_mostly;
81static int rip_msr __read_mostly;
82static int mce_bootlog __read_mostly = -1;
83static int monarch_timeout __read_mostly = -1;
84static int mce_panic_timeout __read_mostly;
85static int mce_dont_log_ce __read_mostly;
86int mce_cmci_disabled __read_mostly;
87int mce_ignore_ce __read_mostly;
88int mce_ser __read_mostly;
89
90/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify;
92static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL };
94
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing;
100
101
102/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105};
106
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work);
113
114/* Do initial initialization of a struct mce */
115void mce_setup(struct mce *m)
116{
117 memset(m, 0, sizeof(struct mce));
118 m->cpu = m->extcpu = smp_processor_id();
119 rdtscll(m->tsc);
120 /* We hope get_seconds stays lockless */
121 m->time = get_seconds();
122 m->cpuvendor = boot_cpu_data.x86_vendor;
123 m->cpuid = cpuid_eax(1);
124#ifdef CONFIG_SMP
125 m->socketid = cpu_data(m->extcpu).phys_proc_id;
126#endif
127 m->apicid = cpu_data(m->extcpu).initial_apicid;
128 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
129}
130
131DEFINE_PER_CPU(struct mce, injectm);
132EXPORT_PER_CPU_SYMBOL_GPL(injectm);
133
134/*
135 * Lockless MCE logging infrastructure.
136 * This avoids deadlocks on printk locks without having to break locks. Also
137 * separate MCEs from kernel messages to avoid bogus bug reports.
138 */
139
140static struct mce_log mcelog = {
141 .signature = MCE_LOG_SIGNATURE,
142 .len = MCE_LOG_LEN,
143 .recordlen = sizeof(struct mce),
144};
145
146void mce_log(struct mce *mce)
147{
148 unsigned next, entry;
149
150 mce->finished = 0;
151 wmb();
152 for (;;) {
153 entry = rcu_dereference(mcelog.next);
154 for (;;) {
155 /*
156 * When the buffer fills up discard new entries.
157 * Assume that the earlier errors are the more
158 * interesting ones:
159 */
160 if (entry >= MCE_LOG_LEN) {
161 set_bit(MCE_OVERFLOW,
162 (unsigned long *)&mcelog.flags);
163 return;
164 }
165 /* Old left over entry. Skip: */
166 if (mcelog.entry[entry].finished) {
167 entry++;
168 continue;
169 }
170 break;
171 }
172 smp_rmb();
173 next = entry + 1;
174 if (cmpxchg(&mcelog.next, entry, next) == entry)
175 break;
176 }
177 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
178 wmb();
179 mcelog.entry[entry].finished = 1;
180 wmb();
181
182 mce->finished = 1;
183 set_bit(0, &mce_need_notify);
184}
185
186static void print_mce(struct mce *m)
187{
188 printk(KERN_EMERG
189 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
190 m->extcpu, m->mcgstatus, m->bank, m->status);
191 if (m->ip) {
192 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
193 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
194 m->cs, m->ip);
195 if (m->cs == __KERNEL_CS)
196 print_symbol("{%s}", m->ip);
197 printk("\n");
198 }
199 printk(KERN_EMERG "TSC %llx ", m->tsc);
200 if (m->addr)
201 printk("ADDR %llx ", m->addr);
202 if (m->misc)
203 printk("MISC %llx ", m->misc);
204 printk("\n");
205 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
206 m->cpuvendor, m->cpuid, m->time, m->socketid,
207 m->apicid);
208}
209
210static void print_mce_head(void)
211{
212 printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
213}
214
215static void print_mce_tail(void)
216{
217 printk(KERN_EMERG "This is not a software problem!\n"
218 KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
219}
220
221#define PANIC_TIMEOUT 5 /* 5 seconds */
222
223static atomic_t mce_paniced;
224
225/* Panic in progress. Enable interrupts and wait for final IPI */
226static void wait_for_panic(void)
227{
228 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
229 preempt_disable();
230 local_irq_enable();
231 while (timeout-- > 0)
232 udelay(1);
233 if (panic_timeout == 0)
234 panic_timeout = mce_panic_timeout;
235 panic("Panicing machine check CPU died");
236}
237
238static void mce_panic(char *msg, struct mce *final, char *exp)
239{
240 int i;
241
242 /*
243 * Make sure only one CPU runs in machine check panic
244 */
245 if (atomic_add_return(1, &mce_paniced) > 1)
246 wait_for_panic();
247 barrier();
248
249 bust_spinlocks(1);
250 console_verbose();
251 print_mce_head();
252 /* First print corrected ones that are still unlogged */
253 for (i = 0; i < MCE_LOG_LEN; i++) {
254 struct mce *m = &mcelog.entry[i];
255 if (!(m->status & MCI_STATUS_VAL))
256 continue;
257 if (!(m->status & MCI_STATUS_UC))
258 print_mce(m);
259 }
260 /* Now print uncorrected but with the final one last */
261 for (i = 0; i < MCE_LOG_LEN; i++) {
262 struct mce *m = &mcelog.entry[i];
263 if (!(m->status & MCI_STATUS_VAL))
264 continue;
265 if (!(m->status & MCI_STATUS_UC))
266 continue;
267 if (!final || memcmp(m, final, sizeof(struct mce)))
268 print_mce(m);
269 }
270 if (final)
271 print_mce(final);
272 if (cpu_missing)
273 printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
274 print_mce_tail();
275 if (exp)
276 printk(KERN_EMERG "Machine check: %s\n", exp);
277 if (panic_timeout == 0)
278 panic_timeout = mce_panic_timeout;
279 panic(msg);
280}
281
282/* Support code for software error injection */
283
284static int msr_to_offset(u32 msr)
285{
286 unsigned bank = __get_cpu_var(injectm.bank);
287 if (msr == rip_msr)
288 return offsetof(struct mce, ip);
289 if (msr == MSR_IA32_MC0_STATUS + bank*4)
290 return offsetof(struct mce, status);
291 if (msr == MSR_IA32_MC0_ADDR + bank*4)
292 return offsetof(struct mce, addr);
293 if (msr == MSR_IA32_MC0_MISC + bank*4)
294 return offsetof(struct mce, misc);
295 if (msr == MSR_IA32_MCG_STATUS)
296 return offsetof(struct mce, mcgstatus);
297 return -1;
298}
299
300/* MSR access wrappers used for error injection */
301static u64 mce_rdmsrl(u32 msr)
302{
303 u64 v;
304 if (__get_cpu_var(injectm).finished) {
305 int offset = msr_to_offset(msr);
306 if (offset < 0)
307 return 0;
308 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
309 }
310 rdmsrl(msr, v);
311 return v;
312}
313
314static void mce_wrmsrl(u32 msr, u64 v)
315{
316 if (__get_cpu_var(injectm).finished) {
317 int offset = msr_to_offset(msr);
318 if (offset >= 0)
319 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
320 return;
321 }
322 wrmsrl(msr, v);
323}
324
325/*
326 * Simple lockless ring to communicate PFNs from the exception handler with the
327 * process context work function. This is vastly simplified because there's
328 * only a single reader and a single writer.
329 */
330#define MCE_RING_SIZE 16 /* we use one entry less */
331
332struct mce_ring {
333 unsigned short start;
334 unsigned short end;
335 unsigned long ring[MCE_RING_SIZE];
336};
337static DEFINE_PER_CPU(struct mce_ring, mce_ring);
338
339/* Runs with CPU affinity in workqueue */
340static int mce_ring_empty(void)
341{
342 struct mce_ring *r = &__get_cpu_var(mce_ring);
343
344 return r->start == r->end;
345}
346
347static int mce_ring_get(unsigned long *pfn)
348{
349 struct mce_ring *r;
350 int ret = 0;
351
352 *pfn = 0;
353 get_cpu();
354 r = &__get_cpu_var(mce_ring);
355 if (r->start == r->end)
356 goto out;
357 *pfn = r->ring[r->start];
358 r->start = (r->start + 1) % MCE_RING_SIZE;
359 ret = 1;
360out:
361 put_cpu();
362 return ret;
363}
364
365/* Always runs in MCE context with preempt off */
366static int mce_ring_add(unsigned long pfn)
367{
368 struct mce_ring *r = &__get_cpu_var(mce_ring);
369 unsigned next;
370
371 next = (r->end + 1) % MCE_RING_SIZE;
372 if (next == r->start)
373 return -1;
374 r->ring[r->end] = pfn;
375 wmb();
376 r->end = next;
377 return 0;
378}
379
380int mce_available(struct cpuinfo_x86 *c)
381{
382 if (mce_disabled)
383 return 0;
384 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
385}
386
387static void mce_schedule_work(void)
388{
389 if (!mce_ring_empty()) {
390 struct work_struct *work = &__get_cpu_var(mce_work);
391 if (!work_pending(work))
392 schedule_work(work);
393 }
394}
395
396/*
397 * Get the address of the instruction at the time of the machine check
398 * error.
399 */
400static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
401{
402
403 if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
404 m->ip = regs->ip;
405 m->cs = regs->cs;
406 } else {
407 m->ip = 0;
408 m->cs = 0;
409 }
410 if (rip_msr)
411 m->ip = mce_rdmsrl(rip_msr);
412}
413
414#ifdef CONFIG_X86_LOCAL_APIC
415/*
416 * Called after interrupts have been reenabled again
417 * when a MCE happened during an interrupts off region
418 * in the kernel.
419 */
420asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
421{
422 ack_APIC_irq();
423 exit_idle();
424 irq_enter();
425 mce_notify_irq();
426 mce_schedule_work();
427 irq_exit();
428}
429#endif
430
431static void mce_report_event(struct pt_regs *regs)
432{
433 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
434 mce_notify_irq();
435 /*
436 * Triggering the work queue here is just an insurance
437 * policy in case the syscall exit notify handler
438 * doesn't run soon enough or ends up running on the
439 * wrong CPU (can happen when audit sleeps)
440 */
441 mce_schedule_work();
442 return;
443 }
444
445#ifdef CONFIG_X86_LOCAL_APIC
446 /*
447 * Without APIC do not notify. The event will be picked
448 * up eventually.
449 */
450 if (!cpu_has_apic)
451 return;
452
453 /*
454 * When interrupts are disabled we cannot use
455 * kernel services safely. Trigger an self interrupt
456 * through the APIC to instead do the notification
457 * after interrupts are reenabled again.
458 */
459 apic->send_IPI_self(MCE_SELF_VECTOR);
460
461 /*
462 * Wait for idle afterwards again so that we don't leave the
463 * APIC in a non idle state because the normal APIC writes
464 * cannot exclude us.
465 */
466 apic_wait_icr_idle();
467#endif
468}
469
470DEFINE_PER_CPU(unsigned, mce_poll_count);
471
472/*
473 * Poll for corrected events or events that happened before reset.
474 * Those are just logged through /dev/mcelog.
475 *
476 * This is executed in standard interrupt context.
477 *
478 * Note: spec recommends to panic for fatal unsignalled
479 * errors here. However this would be quite problematic --
480 * we would need to reimplement the Monarch handling and
481 * it would mess up the exclusion between exception handler
482 * and poll hander -- * so we skip this for now.
483 * These cases should not happen anyways, or only when the CPU
484 * is already totally * confused. In this case it's likely it will
485 * not fully execute the machine check handler either.
486 */
487void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
488{
489 struct mce m;
490 int i;
491
492 __get_cpu_var(mce_poll_count)++;
493
494 mce_setup(&m);
495
496 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
497 for (i = 0; i < banks; i++) {
498 if (!bank[i] || !test_bit(i, *b))
499 continue;
500
501 m.misc = 0;
502 m.addr = 0;
503 m.bank = i;
504 m.tsc = 0;
505
506 barrier();
507 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
508 if (!(m.status & MCI_STATUS_VAL))
509 continue;
510
511 /*
512 * Uncorrected or signalled events are handled by the exception
513 * handler when it is enabled, so don't process those here.
514 *
515 * TBD do the same check for MCI_STATUS_EN here?
516 */
517 if (!(flags & MCP_UC) &&
518 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
519 continue;
520
521 if (m.status & MCI_STATUS_MISCV)
522 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
523 if (m.status & MCI_STATUS_ADDRV)
524 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
525
526 if (!(flags & MCP_TIMESTAMP))
527 m.tsc = 0;
528 /*
529 * Don't get the IP here because it's unlikely to
530 * have anything to do with the actual error location.
531 */
532 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
533 mce_log(&m);
534 add_taint(TAINT_MACHINE_CHECK);
535 }
536
537 /*
538 * Clear state for this bank.
539 */
540 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
541 }
542
543 /*
544 * Don't clear MCG_STATUS here because it's only defined for
545 * exceptions.
546 */
547
548 sync_core();
549}
550EXPORT_SYMBOL_GPL(machine_check_poll);
551
552/*
553 * Do a quick check if any of the events requires a panic.
554 * This decides if we keep the events around or clear them.
555 */
556static int mce_no_way_out(struct mce *m, char **msg)
557{
558 int i;
559
560 for (i = 0; i < banks; i++) {
561 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
562 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
563 return 1;
564 }
565 return 0;
566}
567
568/*
569 * Variable to establish order between CPUs while scanning.
570 * Each CPU spins initially until executing is equal its number.
571 */
572static atomic_t mce_executing;
573
574/*
575 * Defines order of CPUs on entry. First CPU becomes Monarch.
576 */
577static atomic_t mce_callin;
578
579/*
580 * Check if a timeout waiting for other CPUs happened.
581 */
582static int mce_timed_out(u64 *t)
583{
584 /*
585 * The others already did panic for some reason.
586 * Bail out like in a timeout.
587 * rmb() to tell the compiler that system_state
588 * might have been modified by someone else.
589 */
590 rmb();
591 if (atomic_read(&mce_paniced))
592 wait_for_panic();
593 if (!monarch_timeout)
594 goto out;
595 if ((s64)*t < SPINUNIT) {
596 /* CHECKME: Make panic default for 1 too? */
597 if (tolerant < 1)
598 mce_panic("Timeout synchronizing machine check over CPUs",
599 NULL, NULL);
600 cpu_missing = 1;
601 return 1;
602 }
603 *t -= SPINUNIT;
604out:
605 touch_nmi_watchdog();
606 return 0;
607}
608
609/*
610 * The Monarch's reign. The Monarch is the CPU who entered
611 * the machine check handler first. It waits for the others to
612 * raise the exception too and then grades them. When any
613 * error is fatal panic. Only then let the others continue.
614 *
615 * The other CPUs entering the MCE handler will be controlled by the
616 * Monarch. They are called Subjects.
617 *
618 * This way we prevent any potential data corruption in a unrecoverable case
619 * and also makes sure always all CPU's errors are examined.
620 *
621 * Also this detects the case of an machine check event coming from outer
622 * space (not detected by any CPUs) In this case some external agent wants
623 * us to shut down, so panic too.
624 *
625 * The other CPUs might still decide to panic if the handler happens
626 * in a unrecoverable place, but in this case the system is in a semi-stable
627 * state and won't corrupt anything by itself. It's ok to let the others
628 * continue for a bit first.
629 *
630 * All the spin loops have timeouts; when a timeout happens a CPU
631 * typically elects itself to be Monarch.
632 */
633static void mce_reign(void)
634{
635 int cpu;
636 struct mce *m = NULL;
637 int global_worst = 0;
638 char *msg = NULL;
639 char *nmsg = NULL;
640
641 /*
642 * This CPU is the Monarch and the other CPUs have run
643 * through their handlers.
644 * Grade the severity of the errors of all the CPUs.
645 */
646 for_each_possible_cpu(cpu) {
647 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
648 &nmsg);
649 if (severity > global_worst) {
650 msg = nmsg;
651 global_worst = severity;
652 m = &per_cpu(mces_seen, cpu);
653 }
654 }
655
656 /*
657 * Cannot recover? Panic here then.
658 * This dumps all the mces in the log buffer and stops the
659 * other CPUs.
660 */
661 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
662 mce_panic("Fatal Machine check", m, msg);
663
664 /*
665 * For UC somewhere we let the CPU who detects it handle it.
666 * Also must let continue the others, otherwise the handling
667 * CPU could deadlock on a lock.
668 */
669
670 /*
671 * No machine check event found. Must be some external
672 * source or one CPU is hung. Panic.
673 */
674 if (!m && tolerant < 3)
675 mce_panic("Machine check from unknown source", NULL, NULL);
676
677 /*
678 * Now clear all the mces_seen so that they don't reappear on
679 * the next mce.
680 */
681 for_each_possible_cpu(cpu)
682 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
683}
684
685static atomic_t global_nwo;
686
687/*
688 * Start of Monarch synchronization. This waits until all CPUs have
689 * entered the exception handler and then determines if any of them
690 * saw a fatal event that requires panic. Then it executes them
691 * in the entry order.
692 * TBD double check parallel CPU hotunplug
693 */
694static int mce_start(int *no_way_out)
695{
696 int order;
697 int cpus = num_online_cpus();
698 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
699
700 if (!timeout)
701 return -1;
702
703 atomic_add(*no_way_out, &global_nwo);
704 /*
705 * global_nwo should be updated before mce_callin
706 */
707 smp_wmb();
708 order = atomic_add_return(1, &mce_callin);
709
710 /*
711 * Wait for everyone.
712 */
713 while (atomic_read(&mce_callin) != cpus) {
714 if (mce_timed_out(&timeout)) {
715 atomic_set(&global_nwo, 0);
716 return -1;
717 }
718 ndelay(SPINUNIT);
719 }
720
721 /*
722 * mce_callin should be read before global_nwo
723 */
724 smp_rmb();
725
726 if (order == 1) {
727 /*
728 * Monarch: Starts executing now, the others wait.
729 */
730 atomic_set(&mce_executing, 1);
731 } else {
732 /*
733 * Subject: Now start the scanning loop one by one in
734 * the original callin order.
735 * This way when there are any shared banks it will be
736 * only seen by one CPU before cleared, avoiding duplicates.
737 */
738 while (atomic_read(&mce_executing) < order) {
739 if (mce_timed_out(&timeout)) {
740 atomic_set(&global_nwo, 0);
741 return -1;
742 }
743 ndelay(SPINUNIT);
744 }
745 }
746
747 /*
748 * Cache the global no_way_out state.
749 */
750 *no_way_out = atomic_read(&global_nwo);
751
752 return order;
753}
754
755/*
756 * Synchronize between CPUs after main scanning loop.
757 * This invokes the bulk of the Monarch processing.
758 */
759static int mce_end(int order)
760{
761 int ret = -1;
762 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
763
764 if (!timeout)
765 goto reset;
766 if (order < 0)
767 goto reset;
768
769 /*
770 * Allow others to run.
771 */
772 atomic_inc(&mce_executing);
773
774 if (order == 1) {
775 /* CHECKME: Can this race with a parallel hotplug? */
776 int cpus = num_online_cpus();
777
778 /*
779 * Monarch: Wait for everyone to go through their scanning
780 * loops.
781 */
782 while (atomic_read(&mce_executing) <= cpus) {
783 if (mce_timed_out(&timeout))
784 goto reset;
785 ndelay(SPINUNIT);
786 }
787
788 mce_reign();
789 barrier();
790 ret = 0;
791 } else {
792 /*
793 * Subject: Wait for Monarch to finish.
794 */
795 while (atomic_read(&mce_executing) != 0) {
796 if (mce_timed_out(&timeout))
797 goto reset;
798 ndelay(SPINUNIT);
799 }
800
801 /*
802 * Don't reset anything. That's done by the Monarch.
803 */
804 return 0;
805 }
806
807 /*
808 * Reset all global state.
809 */
810reset:
811 atomic_set(&global_nwo, 0);
812 atomic_set(&mce_callin, 0);
813 barrier();
814
815 /*
816 * Let others run again.
817 */
818 atomic_set(&mce_executing, 0);
819 return ret;
820}
821
822/*
823 * Check if the address reported by the CPU is in a format we can parse.
824 * It would be possible to add code for most other cases, but all would
825 * be somewhat complicated (e.g. segment offset would require an instruction
826 * parser). So only support physical addresses upto page granuality for now.
827 */
828static int mce_usable_address(struct mce *m)
829{
830 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
831 return 0;
832 if ((m->misc & 0x3f) > PAGE_SHIFT)
833 return 0;
834 if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
835 return 0;
836 return 1;
837}
838
839static void mce_clear_state(unsigned long *toclear)
840{
841 int i;
842
843 for (i = 0; i < banks; i++) {
844 if (test_bit(i, toclear))
845 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
846 }
847}
848
849/*
850 * The actual machine check handler. This only handles real
851 * exceptions when something got corrupted coming in through int 18.
852 *
853 * This is executed in NMI context not subject to normal locking rules. This
854 * implies that most kernel services cannot be safely used. Don't even
855 * think about putting a printk in there!
856 *
857 * On Intel systems this is entered on all CPUs in parallel through
858 * MCE broadcast. However some CPUs might be broken beyond repair,
859 * so be always careful when synchronizing with others.
860 */
861void do_machine_check(struct pt_regs *regs, long error_code)
862{
863 struct mce m, *final;
864 int i;
865 int worst = 0;
866 int severity;
867 /*
868 * Establish sequential order between the CPUs entering the machine
869 * check handler.
870 */
871 int order;
872 /*
873 * If no_way_out gets set, there is no safe way to recover from this
874 * MCE. If tolerant is cranked up, we'll try anyway.
875 */
876 int no_way_out = 0;
877 /*
878 * If kill_it gets set, there might be a way to recover from this
879 * error.
880 */
881 int kill_it = 0;
882 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
883 char *msg = "Unknown";
884
885 atomic_inc(&mce_entry);
886
887 __get_cpu_var(mce_exception_count)++;
888
889 if (notify_die(DIE_NMI, "machine check", regs, error_code,
890 18, SIGKILL) == NOTIFY_STOP)
891 goto out;
892 if (!banks)
893 goto out;
894
895 mce_setup(&m);
896
897 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
898 no_way_out = mce_no_way_out(&m, &msg);
899
900 final = &__get_cpu_var(mces_seen);
901 *final = m;
902
903 barrier();
904
905 /*
906 * When no restart IP must always kill or panic.
907 */
908 if (!(m.mcgstatus & MCG_STATUS_RIPV))
909 kill_it = 1;
910
911 /*
912 * Go through all the banks in exclusion of the other CPUs.
913 * This way we don't report duplicated events on shared banks
914 * because the first one to see it will clear it.
915 */
916 order = mce_start(&no_way_out);
917 for (i = 0; i < banks; i++) {
918 __clear_bit(i, toclear);
919 if (!bank[i])
920 continue;
921
922 m.misc = 0;
923 m.addr = 0;
924 m.bank = i;
925
926 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
927 if ((m.status & MCI_STATUS_VAL) == 0)
928 continue;
929
930 /*
931 * Non uncorrected or non signaled errors are handled by
932 * machine_check_poll. Leave them alone, unless this panics.
933 */
934 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
935 !no_way_out)
936 continue;
937
938 /*
939 * Set taint even when machine check was not enabled.
940 */
941 add_taint(TAINT_MACHINE_CHECK);
942
943 severity = mce_severity(&m, tolerant, NULL);
944
945 /*
946 * When machine check was for corrected handler don't touch,
947 * unless we're panicing.
948 */
949 if (severity == MCE_KEEP_SEVERITY && !no_way_out)
950 continue;
951 __set_bit(i, toclear);
952 if (severity == MCE_NO_SEVERITY) {
953 /*
954 * Machine check event was not enabled. Clear, but
955 * ignore.
956 */
957 continue;
958 }
959
960 /*
961 * Kill on action required.
962 */
963 if (severity == MCE_AR_SEVERITY)
964 kill_it = 1;
965
966 if (m.status & MCI_STATUS_MISCV)
967 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
968 if (m.status & MCI_STATUS_ADDRV)
969 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
970
971 /*
972 * Action optional error. Queue address for later processing.
973 * When the ring overflows we just ignore the AO error.
974 * RED-PEN add some logging mechanism when
975 * usable_address or mce_add_ring fails.
976 * RED-PEN don't ignore overflow for tolerant == 0
977 */
978 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
979 mce_ring_add(m.addr >> PAGE_SHIFT);
980
981 mce_get_rip(&m, regs);
982 mce_log(&m);
983
984 if (severity > worst) {
985 *final = m;
986 worst = severity;
987 }
988 }
989
990 if (!no_way_out)
991 mce_clear_state(toclear);
992
993 /*
994 * Do most of the synchronization with other CPUs.
995 * When there's any problem use only local no_way_out state.
996 */
997 if (mce_end(order) < 0)
998 no_way_out = worst >= MCE_PANIC_SEVERITY;
999
1000 /*
1001 * If we have decided that we just CAN'T continue, and the user
1002 * has not set tolerant to an insane level, give up and die.
1003 *
1004 * This is mainly used in the case when the system doesn't
1005 * support MCE broadcasting or it has been disabled.
1006 */
1007 if (no_way_out && tolerant < 3)
1008 mce_panic("Fatal machine check on current CPU", final, msg);
1009
1010 /*
1011 * If the error seems to be unrecoverable, something should be
1012 * done. Try to kill as little as possible. If we can kill just
1013 * one task, do that. If the user has set the tolerance very
1014 * high, don't try to do anything at all.
1015 */
1016
1017 if (kill_it && tolerant < 3)
1018 force_sig(SIGBUS, current);
1019
1020 /* notify userspace ASAP */
1021 set_thread_flag(TIF_MCE_NOTIFY);
1022
1023 if (worst > 0)
1024 mce_report_event(regs);
1025 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1026out:
1027 atomic_dec(&mce_entry);
1028 sync_core();
1029}
1030EXPORT_SYMBOL_GPL(do_machine_check);
1031
1032/* dummy to break dependency. actual code is in mm/memory-failure.c */
1033void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1034{
1035 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1036}
1037
1038/*
1039 * Called after mce notification in process context. This code
1040 * is allowed to sleep. Call the high level VM handler to process
1041 * any corrupted pages.
1042 * Assume that the work queue code only calls this one at a time
1043 * per CPU.
1044 * Note we don't disable preemption, so this code might run on the wrong
1045 * CPU. In this case the event is picked up by the scheduled work queue.
1046 * This is merely a fast path to expedite processing in some common
1047 * cases.
1048 */
1049void mce_notify_process(void)
1050{
1051 unsigned long pfn;
1052 mce_notify_irq();
1053 while (mce_ring_get(&pfn))
1054 memory_failure(pfn, MCE_VECTOR);
1055}
1056
1057static void mce_process_work(struct work_struct *dummy)
1058{
1059 mce_notify_process();
1060}
1061
1062#ifdef CONFIG_X86_MCE_INTEL
1063/***
1064 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1065 * @cpu: The CPU on which the event occurred.
1066 * @status: Event status information
1067 *
1068 * This function should be called by the thermal interrupt after the
1069 * event has been processed and the decision was made to log the event
1070 * further.
1071 *
1072 * The status parameter will be saved to the 'status' field of 'struct mce'
1073 * and historically has been the register value of the
1074 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1075 */
1076void mce_log_therm_throt_event(__u64 status)
1077{
1078 struct mce m;
1079
1080 mce_setup(&m);
1081 m.bank = MCE_THERMAL_BANK;
1082 m.status = status;
1083 mce_log(&m);
1084}
1085#endif /* CONFIG_X86_MCE_INTEL */
1086
1087/*
1088 * Periodic polling timer for "silent" machine check errors. If the
1089 * poller finds an MCE, poll 2x faster. When the poller finds no more
1090 * errors, poll 2x slower (up to check_interval seconds).
1091 */
1092static int check_interval = 5 * 60; /* 5 minutes */
1093
1094static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
1095static DEFINE_PER_CPU(struct timer_list, mce_timer);
1096
1097static void mcheck_timer(unsigned long data)
1098{
1099 struct timer_list *t = &per_cpu(mce_timer, data);
1100 int *n;
1101
1102 WARN_ON(smp_processor_id() != data);
1103
1104 if (mce_available(&current_cpu_data)) {
1105 machine_check_poll(MCP_TIMESTAMP,
1106 &__get_cpu_var(mce_poll_banks));
1107 }
1108
1109 /*
1110 * Alert userspace if needed. If we logged an MCE, reduce the
1111 * polling interval, otherwise increase the polling interval.
1112 */
1113 n = &__get_cpu_var(next_interval);
1114 if (mce_notify_irq())
1115 *n = max(*n/2, HZ/100);
1116 else
1117 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1118
1119 t->expires = jiffies + *n;
1120 add_timer(t);
1121}
1122
1123static void mce_do_trigger(struct work_struct *work)
1124{
1125 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1126}
1127
1128static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1129
1130/*
1131 * Notify the user(s) about new machine check events.
1132 * Can be called from interrupt context, but not from machine check/NMI
1133 * context.
1134 */
1135int mce_notify_irq(void)
1136{
1137 /* Not more than two messages every minute */
1138 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1139
1140 clear_thread_flag(TIF_MCE_NOTIFY);
1141
1142 if (test_and_clear_bit(0, &mce_need_notify)) {
1143 wake_up_interruptible(&mce_wait);
1144
1145 /*
1146 * There is no risk of missing notifications because
1147 * work_pending is always cleared before the function is
1148 * executed.
1149 */
1150 if (mce_helper[0] && !work_pending(&mce_trigger_work))
1151 schedule_work(&mce_trigger_work);
1152
1153 if (__ratelimit(&ratelimit))
1154 printk(KERN_INFO "Machine check events logged\n");
1155
1156 return 1;
1157 }
1158 return 0;
1159}
1160EXPORT_SYMBOL_GPL(mce_notify_irq);
1161
1162/*
1163 * Initialize Machine Checks for a CPU.
1164 */
1165static int mce_cap_init(void)
1166{
1167 unsigned b;
1168 u64 cap;
1169
1170 rdmsrl(MSR_IA32_MCG_CAP, cap);
1171
1172 b = cap & MCG_BANKCNT_MASK;
1173 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1174
1175 if (b > MAX_NR_BANKS) {
1176 printk(KERN_WARNING
1177 "MCE: Using only %u machine check banks out of %u\n",
1178 MAX_NR_BANKS, b);
1179 b = MAX_NR_BANKS;
1180 }
1181
1182 /* Don't support asymmetric configurations today */
1183 WARN_ON(banks != 0 && b != banks);
1184 banks = b;
1185 if (!bank) {
1186 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
1187 if (!bank)
1188 return -ENOMEM;
1189 memset(bank, 0xff, banks * sizeof(u64));
1190 }
1191
1192 /* Use accurate RIP reporting if available. */
1193 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1194 rip_msr = MSR_IA32_MCG_EIP;
1195
1196 if (cap & MCG_SER_P)
1197 mce_ser = 1;
1198
1199 return 0;
1200}
1201
1202static void mce_init(void)
1203{
1204 mce_banks_t all_banks;
1205 u64 cap;
1206 int i;
1207
1208 /*
1209 * Log the machine checks left over from the previous reset.
1210 */
1211 bitmap_fill(all_banks, MAX_NR_BANKS);
1212 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1213
1214 set_in_cr4(X86_CR4_MCE);
1215
1216 rdmsrl(MSR_IA32_MCG_CAP, cap);
1217 if (cap & MCG_CTL_P)
1218 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1219
1220 for (i = 0; i < banks; i++) {
1221 if (skip_bank_init(i))
1222 continue;
1223 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
1224 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
1225 }
1226}
1227
1228/* Add per CPU specific workarounds here */
1229static void mce_cpu_quirks(struct cpuinfo_x86 *c)
1230{
1231 /* This should be disabled by the BIOS, but isn't always */
1232 if (c->x86_vendor == X86_VENDOR_AMD) {
1233 if (c->x86 == 15 && banks > 4) {
1234 /*
1235 * disable GART TBL walk error reporting, which
1236 * trips off incorrectly with the IOMMU & 3ware
1237 * & Cerberus:
1238 */
1239 clear_bit(10, (unsigned long *)&bank[4]);
1240 }
1241 if (c->x86 <= 17 && mce_bootlog < 0) {
1242 /*
1243 * Lots of broken BIOS around that don't clear them
1244 * by default and leave crap in there. Don't log:
1245 */
1246 mce_bootlog = 0;
1247 }
1248 /*
1249 * Various K7s with broken bank 0 around. Always disable
1250 * by default.
1251 */
1252 if (c->x86 == 6 && banks > 0)
1253 bank[0] = 0;
1254 }
1255
1256 if (c->x86_vendor == X86_VENDOR_INTEL) {
1257 /*
1258 * SDM documents that on family 6 bank 0 should not be written
1259 * because it aliases to another special BIOS controlled
1260 * register.
1261 * But it's not aliased anymore on model 0x1a+
1262 * Don't ignore bank 0 completely because there could be a
1263 * valid event later, merely don't write CTL0.
1264 */
1265
1266 if (c->x86 == 6 && c->x86_model < 0x1A)
1267 __set_bit(0, &dont_init_banks);
1268
1269 /*
1270 * All newer Intel systems support MCE broadcasting. Enable
1271 * synchronization with a one second timeout.
1272 */
1273 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1274 monarch_timeout < 0)
1275 monarch_timeout = USEC_PER_SEC;
1276 }
1277 if (monarch_timeout < 0)
1278 monarch_timeout = 0;
1279 if (mce_bootlog != 0)
1280 mce_panic_timeout = 30;
1281}
1282
1283static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1284{
1285 if (c->x86 != 5)
1286 return;
1287 switch (c->x86_vendor) {
1288 case X86_VENDOR_INTEL:
1289 intel_p5_mcheck_init(c);
1290 break;
1291 case X86_VENDOR_CENTAUR:
1292 winchip_mcheck_init(c);
1293 break;
1294 }
1295}
1296
1297static void mce_cpu_features(struct cpuinfo_x86 *c)
1298{
1299 switch (c->x86_vendor) {
1300 case X86_VENDOR_INTEL:
1301 mce_intel_feature_init(c);
1302 break;
1303 case X86_VENDOR_AMD:
1304 mce_amd_feature_init(c);
1305 break;
1306 default:
1307 break;
1308 }
1309}
1310
1311static void mce_init_timer(void)
1312{
1313 struct timer_list *t = &__get_cpu_var(mce_timer);
1314 int *n = &__get_cpu_var(next_interval);
1315
1316 if (mce_ignore_ce)
1317 return;
1318
1319 *n = check_interval * HZ;
1320 if (!*n)
1321 return;
1322 setup_timer(t, mcheck_timer, smp_processor_id());
1323 t->expires = round_jiffies(jiffies + *n);
1324 add_timer(t);
1325}
1326
1327/*
1328 * Called for each booted CPU to set up machine checks.
1329 * Must be called with preempt off:
1330 */
1331void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1332{
1333 if (mce_disabled)
1334 return;
1335
1336 mce_ancient_init(c);
1337
1338 if (!mce_available(c))
1339 return;
1340
1341 if (mce_cap_init() < 0) {
1342 mce_disabled = 1;
1343 return;
1344 }
1345 mce_cpu_quirks(c);
1346
1347 machine_check_vector = do_machine_check;
1348
1349 mce_init();
1350 mce_cpu_features(c);
1351 mce_init_timer();
1352 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1353}
1354
1355/*
1356 * Character device to read and clear the MCE log.
1357 */
1358
1359static DEFINE_SPINLOCK(mce_state_lock);
1360static int open_count; /* #times opened */
1361static int open_exclu; /* already open exclusive? */
1362
1363static int mce_open(struct inode *inode, struct file *file)
1364{
1365 spin_lock(&mce_state_lock);
1366
1367 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1368 spin_unlock(&mce_state_lock);
1369
1370 return -EBUSY;
1371 }
1372
1373 if (file->f_flags & O_EXCL)
1374 open_exclu = 1;
1375 open_count++;
1376
1377 spin_unlock(&mce_state_lock);
1378
1379 return nonseekable_open(inode, file);
1380}
1381
1382static int mce_release(struct inode *inode, struct file *file)
1383{
1384 spin_lock(&mce_state_lock);
1385
1386 open_count--;
1387 open_exclu = 0;
1388
1389 spin_unlock(&mce_state_lock);
1390
1391 return 0;
1392}
1393
1394static void collect_tscs(void *data)
1395{
1396 unsigned long *cpu_tsc = (unsigned long *)data;
1397
1398 rdtscll(cpu_tsc[smp_processor_id()]);
1399}
1400
1401static DEFINE_MUTEX(mce_read_mutex);
1402
1403static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1404 loff_t *off)
1405{
1406 char __user *buf = ubuf;
1407 unsigned long *cpu_tsc;
1408 unsigned prev, next;
1409 int i, err;
1410
1411 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1412 if (!cpu_tsc)
1413 return -ENOMEM;
1414
1415 mutex_lock(&mce_read_mutex);
1416 next = rcu_dereference(mcelog.next);
1417
1418 /* Only supports full reads right now */
1419 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
1420 mutex_unlock(&mce_read_mutex);
1421 kfree(cpu_tsc);
1422
1423 return -EINVAL;
1424 }
1425
1426 err = 0;
1427 prev = 0;
1428 do {
1429 for (i = prev; i < next; i++) {
1430 unsigned long start = jiffies;
1431
1432 while (!mcelog.entry[i].finished) {
1433 if (time_after_eq(jiffies, start + 2)) {
1434 memset(mcelog.entry + i, 0,
1435 sizeof(struct mce));
1436 goto timeout;
1437 }
1438 cpu_relax();
1439 }
1440 smp_rmb();
1441 err |= copy_to_user(buf, mcelog.entry + i,
1442 sizeof(struct mce));
1443 buf += sizeof(struct mce);
1444timeout:
1445 ;
1446 }
1447
1448 memset(mcelog.entry + prev, 0,
1449 (next - prev) * sizeof(struct mce));
1450 prev = next;
1451 next = cmpxchg(&mcelog.next, prev, 0);
1452 } while (next != prev);
1453
1454 synchronize_sched();
1455
1456 /*
1457 * Collect entries that were still getting written before the
1458 * synchronize.
1459 */
1460 on_each_cpu(collect_tscs, cpu_tsc, 1);
1461
1462 for (i = next; i < MCE_LOG_LEN; i++) {
1463 if (mcelog.entry[i].finished &&
1464 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1465 err |= copy_to_user(buf, mcelog.entry+i,
1466 sizeof(struct mce));
1467 smp_rmb();
1468 buf += sizeof(struct mce);
1469 memset(&mcelog.entry[i], 0, sizeof(struct mce));
1470 }
1471 }
1472 mutex_unlock(&mce_read_mutex);
1473 kfree(cpu_tsc);
1474
1475 return err ? -EFAULT : buf - ubuf;
1476}
1477
1478static unsigned int mce_poll(struct file *file, poll_table *wait)
1479{
1480 poll_wait(file, &mce_wait, wait);
1481 if (rcu_dereference(mcelog.next))
1482 return POLLIN | POLLRDNORM;
1483 return 0;
1484}
1485
1486static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1487{
1488 int __user *p = (int __user *)arg;
1489
1490 if (!capable(CAP_SYS_ADMIN))
1491 return -EPERM;
1492
1493 switch (cmd) {
1494 case MCE_GET_RECORD_LEN:
1495 return put_user(sizeof(struct mce), p);
1496 case MCE_GET_LOG_LEN:
1497 return put_user(MCE_LOG_LEN, p);
1498 case MCE_GETCLEAR_FLAGS: {
1499 unsigned flags;
1500
1501 do {
1502 flags = mcelog.flags;
1503 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1504
1505 return put_user(flags, p);
1506 }
1507 default:
1508 return -ENOTTY;
1509 }
1510}
1511
1512/* Modified in mce-inject.c, so not static or const */
1513struct file_operations mce_chrdev_ops = {
1514 .open = mce_open,
1515 .release = mce_release,
1516 .read = mce_read,
1517 .poll = mce_poll,
1518 .unlocked_ioctl = mce_ioctl,
1519};
1520EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1521
1522static struct miscdevice mce_log_device = {
1523 MISC_MCELOG_MINOR,
1524 "mcelog",
1525 &mce_chrdev_ops,
1526};
1527
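For context, a minimal userspace sketch of how this character-device interface is typically consumed (a hypothetical illustration, not part of this patch; mcelog(8) is the real consumer and carries its own copies of the ioctl constants assumed here to come from <asm/mce.h>): query the record and log sizes with the ioctls handled above, then issue one full-sized read, since mce_read() rejects partial reads with -EINVAL.

/* Hypothetical illustration only -- not part of this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/mce.h>	/* MCE_GET_RECORD_LEN, MCE_GET_LOG_LEN (assumed visible here) */

int main(void)
{
	int fd, recordlen, loglen;
	char *buf;
	ssize_t n;

	fd = open("/dev/mcelog", O_RDONLY);
	if (fd < 0)
		return 1;

	if (ioctl(fd, MCE_GET_RECORD_LEN, &recordlen) < 0 ||
	    ioctl(fd, MCE_GET_LOG_LEN, &loglen) < 0)
		return 1;

	/* mce_read() only accepts reads that cover the whole log. */
	buf = malloc((size_t)recordlen * loglen);
	if (!buf)
		return 1;

	n = read(fd, buf, (size_t)recordlen * loglen);
	if (n > 0)
		printf("drained %ld bytes (%ld records)\n",
		       (long)n, (long)(n / recordlen));

	free(buf);
	close(fd);
	return 0;
}

The full-read requirement matches the drain loop in mce_read() above, which zeroes entries and resets mcelog.next as it copies them out.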
1528/*
1529 * mce=off Disables machine check
1530 * mce=no_cmci Disables CMCI
1531 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1532 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1533 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1534 * monarchtimeout is how long to wait for other CPUs on machine
1535 * check, or 0 to not wait
1536 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1537 * mce=nobootlog Don't log MCEs from before booting.
1538 */
1539static int __init mcheck_enable(char *str)
1540{
1541 if (*str == 0)
1542 enable_p5_mce();
1543 if (*str == '=')
1544 str++;
1545 if (!strcmp(str, "off"))
1546 mce_disabled = 1;
1547 else if (!strcmp(str, "no_cmci"))
1548 mce_cmci_disabled = 1;
1549 else if (!strcmp(str, "dont_log_ce"))
1550 mce_dont_log_ce = 1;
1551 else if (!strcmp(str, "ignore_ce"))
1552 mce_ignore_ce = 1;
1553 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1554 mce_bootlog = (str[0] == 'b');
1555 else if (isdigit(str[0])) {
1556 get_option(&str, &tolerant);
1557 if (*str == ',') {
1558 ++str;
1559 get_option(&str, &monarch_timeout);
1560 }
1561 } else {
1562 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1563 str);
1564 return 0;
1565 }
1566 return 1;
1567}
1568__setup("mce", mcheck_enable);
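As a worked example of the parsing above (hypothetical values): booting with mce=2,500000 falls through to the isdigit() branch, so get_option() sets tolerant to 2 and, after the comma, monarch_timeout to 500000 microseconds; mce=no_cmci and mce=ignore_ce set the corresponding flags, and an unrecognized string is reported and ignored.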
1569
1570/*
1571 * Sysfs support
1572 */
1573
1574/*
1575 * Disable machine checks on suspend and shutdown. We can't really handle
1576 * them later.
1577 */
1578static int mce_disable(void)
1579{
1580 int i;
1581
1582 for (i = 0; i < banks; i++) {
1583 if (!skip_bank_init(i))
1584 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1585 }
1586 return 0;
1587}
1588
1589static int mce_suspend(struct sys_device *dev, pm_message_t state)
1590{
1591 return mce_disable();
1592}
1593
1594static int mce_shutdown(struct sys_device *dev)
1595{
1596 return mce_disable();
1597}
1598
1599/*
1600 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1601 * Only one CPU is active at this time, the others get re-added later using
1602 * CPU hotplug:
1603 */
1604static int mce_resume(struct sys_device *dev)
1605{
1606 mce_init();
1607 mce_cpu_features(&current_cpu_data);
1608
1609 return 0;
1610}
1611
1612static void mce_cpu_restart(void *data)
1613{
1614 del_timer_sync(&__get_cpu_var(mce_timer));
1615 if (!mce_available(&current_cpu_data))
1616 return;
1617 mce_init();
1618 mce_init_timer();
1619}
1620
1621/* Reinit MCEs after user configuration changes */
1622static void mce_restart(void)
1623{
1624 on_each_cpu(mce_cpu_restart, NULL, 1);
1625}
1626
1627/* Toggle features for corrected errors */
1628static void mce_disable_ce(void *all)
1629{
1630 if (!mce_available(&current_cpu_data))
1631 return;
1632 if (all)
1633 del_timer_sync(&__get_cpu_var(mce_timer));
1634 cmci_clear();
1635}
1636
1637static void mce_enable_ce(void *all)
1638{
1639 if (!mce_available(&current_cpu_data))
1640 return;
1641 cmci_reenable();
1642 cmci_recheck();
1643 if (all)
1644 mce_init_timer();
1645}
1646
1647static struct sysdev_class mce_sysclass = {
1648 .suspend = mce_suspend,
1649 .shutdown = mce_shutdown,
1650 .resume = mce_resume,
1651 .name = "machinecheck",
1652};
1653
1654DEFINE_PER_CPU(struct sys_device, mce_dev);
1655
1656__cpuinitdata
1657void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1658
1659static struct sysdev_attribute *bank_attrs;
1660
1661static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1662 char *buf)
1663{
1664 u64 b = bank[attr - bank_attrs];
1665
1666 return sprintf(buf, "%llx\n", b);
1667}
1668
1669static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1670 const char *buf, size_t size)
1671{
1672 u64 new;
1673
1674 if (strict_strtoull(buf, 0, &new) < 0)
1675 return -EINVAL;
1676
1677 bank[attr - bank_attrs] = new;
1678 mce_restart();
1679
1680 return size;
1681}
1682
1683static ssize_t
1684show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1685{
1686 strcpy(buf, mce_helper);
1687 strcat(buf, "\n");
1688 return strlen(mce_helper) + 1;
1689}
1690
1691static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1692 const char *buf, size_t siz)
1693{
1694 char *p;
1695 int len;
1696
1697 strncpy(mce_helper, buf, sizeof(mce_helper));
1698 mce_helper[sizeof(mce_helper)-1] = 0;
1699 len = strlen(mce_helper);
1700 p = strchr(mce_helper, '\n');
1701
1702	if (p)
1703		*p = 0;
1704
1705 return len;
1706}
1707
1708static ssize_t set_ignore_ce(struct sys_device *s,
1709 struct sysdev_attribute *attr,
1710 const char *buf, size_t size)
1711{
1712 u64 new;
1713
1714 if (strict_strtoull(buf, 0, &new) < 0)
1715 return -EINVAL;
1716
1717 if (mce_ignore_ce ^ !!new) {
1718 if (new) {
1719 /* disable ce features */
1720 on_each_cpu(mce_disable_ce, (void *)1, 1);
1721 mce_ignore_ce = 1;
1722 } else {
1723 /* enable ce features */
1724 mce_ignore_ce = 0;
1725 on_each_cpu(mce_enable_ce, (void *)1, 1);
1726 }
1727 }
1728 return size;
1729}
1730
1731static ssize_t set_cmci_disabled(struct sys_device *s,
1732 struct sysdev_attribute *attr,
1733 const char *buf, size_t size)
1734{
1735 u64 new;
1736
1737 if (strict_strtoull(buf, 0, &new) < 0)
1738 return -EINVAL;
1739
1740 if (mce_cmci_disabled ^ !!new) {
1741 if (new) {
1742 /* disable cmci */
1743 on_each_cpu(mce_disable_ce, NULL, 1);
1744 mce_cmci_disabled = 1;
1745 } else {
1746 /* enable cmci */
1747 mce_cmci_disabled = 0;
1748 on_each_cpu(mce_enable_ce, NULL, 1);
1749 }
1750 }
1751 return size;
1752}
1753
1754static ssize_t store_int_with_restart(struct sys_device *s,
1755 struct sysdev_attribute *attr,
1756 const char *buf, size_t size)
1757{
1758 ssize_t ret = sysdev_store_int(s, attr, buf, size);
1759 mce_restart();
1760 return ret;
1761}
1762
1763static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1764static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1765static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1766static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1767
1768static struct sysdev_ext_attribute attr_check_interval = {
1769 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1770 store_int_with_restart),
1771 &check_interval
1772};
1773
1774static struct sysdev_ext_attribute attr_ignore_ce = {
1775 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1776 &mce_ignore_ce
1777};
1778
1779static struct sysdev_ext_attribute attr_cmci_disabled = {
1780 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1781 &mce_cmci_disabled
1782};
1783
1784static struct sysdev_attribute *mce_attrs[] = {
1785 &attr_tolerant.attr,
1786 &attr_check_interval.attr,
1787 &attr_trigger,
1788 &attr_monarch_timeout.attr,
1789 &attr_dont_log_ce.attr,
1790 &attr_ignore_ce.attr,
1791 &attr_cmci_disabled.attr,
1792 NULL
1793};
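With the sysdev class named "machinecheck" and one sysdev registered per CPU, these attributes (plus the per-bank bankN files created below) appear under /sys/devices/system/machinecheck/machinecheckN/, which is the interface the "Please use /sys" message in mcheck_enable() refers to.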
1794
1795static cpumask_var_t mce_dev_initialized;
1796
1797/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1798static __cpuinit int mce_create_device(unsigned int cpu)
1799{
1800 int err;
1801 int i, j;
1802
1803 if (!mce_available(&boot_cpu_data))
1804 return -EIO;
1805
1806 memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1807 per_cpu(mce_dev, cpu).id = cpu;
1808 per_cpu(mce_dev, cpu).cls = &mce_sysclass;
1809
1810 err = sysdev_register(&per_cpu(mce_dev, cpu));
1811 if (err)
1812 return err;
1813
1814 for (i = 0; mce_attrs[i]; i++) {
1815 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1816 if (err)
1817 goto error;
1818 }
1819 for (j = 0; j < banks; j++) {
1820 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1821 &bank_attrs[j]);
1822 if (err)
1823 goto error2;
1824 }
1825 cpumask_set_cpu(cpu, mce_dev_initialized);
1826
1827 return 0;
1828error2:
1829 while (--j >= 0)
1830 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]);
1831error:
1832 while (--i >= 0)
1833 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1834
1835 sysdev_unregister(&per_cpu(mce_dev, cpu));
1836
1837 return err;
1838}
1839
1840static __cpuinit void mce_remove_device(unsigned int cpu)
1841{
1842 int i;
1843
1844 if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1845 return;
1846
1847 for (i = 0; mce_attrs[i]; i++)
1848 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1849
1850 for (i = 0; i < banks; i++)
1851 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1852
1853 sysdev_unregister(&per_cpu(mce_dev, cpu));
1854 cpumask_clear_cpu(cpu, mce_dev_initialized);
1855}
1856
1857/* Make sure there are no machine checks on offlined CPUs. */
1858static void mce_disable_cpu(void *h)
1859{
1860 unsigned long action = *(unsigned long *)h;
1861 int i;
1862
1863 if (!mce_available(&current_cpu_data))
1864 return;
1865 if (!(action & CPU_TASKS_FROZEN))
1866 cmci_clear();
1867 for (i = 0; i < banks; i++) {
1868 if (!skip_bank_init(i))
1869 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1870 }
1871}
1872
1873static void mce_reenable_cpu(void *h)
1874{
1875 unsigned long action = *(unsigned long *)h;
1876 int i;
1877
1878 if (!mce_available(&current_cpu_data))
1879 return;
1880
1881 if (!(action & CPU_TASKS_FROZEN))
1882 cmci_reenable();
1883 for (i = 0; i < banks; i++) {
1884 if (!skip_bank_init(i))
1885 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1886 }
1887}
1888
1889/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1890static int __cpuinit
1891mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1892{
1893 unsigned int cpu = (unsigned long)hcpu;
1894 struct timer_list *t = &per_cpu(mce_timer, cpu);
1895
1896 switch (action) {
1897 case CPU_ONLINE:
1898 case CPU_ONLINE_FROZEN:
1899 mce_create_device(cpu);
1900 if (threshold_cpu_callback)
1901 threshold_cpu_callback(action, cpu);
1902 break;
1903 case CPU_DEAD:
1904 case CPU_DEAD_FROZEN:
1905 if (threshold_cpu_callback)
1906 threshold_cpu_callback(action, cpu);
1907 mce_remove_device(cpu);
1908 break;
1909 case CPU_DOWN_PREPARE:
1910 case CPU_DOWN_PREPARE_FROZEN:
1911 del_timer_sync(t);
1912 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1913 break;
1914 case CPU_DOWN_FAILED:
1915 case CPU_DOWN_FAILED_FROZEN:
1916		t->expires = round_jiffies(jiffies +
1917				per_cpu(next_interval, cpu));
1918 add_timer_on(t, cpu);
1919 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1920 break;
1921 case CPU_POST_DEAD:
1922 /* intentionally ignoring frozen here */
1923 cmci_rediscover(cpu);
1924 break;
1925 }
1926 return NOTIFY_OK;
1927}
1928
1929static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1930 .notifier_call = mce_cpu_callback,
1931};
1932
1933static __init int mce_init_banks(void)
1934{
1935 int i;
1936
1937 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1938 GFP_KERNEL);
1939 if (!bank_attrs)
1940 return -ENOMEM;
1941
1942 for (i = 0; i < banks; i++) {
1943 struct sysdev_attribute *a = &bank_attrs[i];
1944
1945 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1946 if (!a->attr.name)
1947 goto nomem;
1948
1949 a->attr.mode = 0644;
1950 a->show = show_bank;
1951 a->store = set_bank;
1952 }
1953 return 0;
1954
1955nomem:
1956 while (--i >= 0)
1957 kfree(bank_attrs[i].attr.name);
1958 kfree(bank_attrs);
1959 bank_attrs = NULL;
1960
1961 return -ENOMEM;
1962}
1963
1964static __init int mce_init_device(void)
1965{
1966 int err;
1967 int i = 0;
1968
1969 if (!mce_available(&boot_cpu_data))
1970 return -EIO;
1971
1972 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1973
1974 err = mce_init_banks();
1975 if (err)
1976 return err;
1977
1978 err = sysdev_class_register(&mce_sysclass);
1979 if (err)
1980 return err;
1981
1982 for_each_online_cpu(i) {
1983 err = mce_create_device(i);
1984 if (err)
1985 return err;
1986 }
1987
1988 register_hotcpu_notifier(&mce_cpu_notifier);
1989 misc_register(&mce_log_device);
1990
1991 return err;
1992}
1993
1994device_initcall(mce_init_device);
1995
1996#else /* CONFIG_X86_OLD_MCE: */
1997
1998int nr_mce_banks;
1999EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
2000
2001/* This has to be run for each processor */
2002void mcheck_init(struct cpuinfo_x86 *c)
2003{
2004 if (mce_disabled)
2005 return;
2006
2007 switch (c->x86_vendor) {
2008 case X86_VENDOR_AMD:
2009 amd_mcheck_init(c);
2010 break;
2011
2012 case X86_VENDOR_INTEL:
2013 if (c->x86 == 5)
2014 intel_p5_mcheck_init(c);
2015 if (c->x86 == 6)
2016 intel_p6_mcheck_init(c);
2017 if (c->x86 == 15)
2018 intel_p4_mcheck_init(c);
2019 break;
2020
2021 case X86_VENDOR_CENTAUR:
2022 if (c->x86 == 5)
2023 winchip_mcheck_init(c);
2024 break;
2025
2026 default:
2027 break;
2028 }
2029 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
2030}
2031
2032static int __init mcheck_enable(char *str)
2033{
2034 mce_p5_enabled = 1;
2035 return 1;
2036}
2037__setup("mce", mcheck_enable);
2038
2039#endif /* CONFIG_X86_OLD_MCE */
2040
2041/*
2042 * Old style boot options parsing. Only for compatibility.
2043 */
2044static int __init mcheck_disable(char *str)
2045{
2046 mce_disabled = 1;
2047 return 1;
2048}
2049__setup("nomce", mcheck_disable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
deleted file mode 100644
index ae9f628838f1..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#include <linux/init.h>
2#include <asm/mce.h>
3
4void amd_mcheck_init(struct cpuinfo_x86 *c);
5void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
6void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
7void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c);
9
10/* Call the installed machine check handler for this CPU setup. */
11extern void (*machine_check_vector)(struct pt_regs *, long error_code);
12
13extern int nr_mce_banks;
14
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
deleted file mode 100644
index 3552119b091d..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>, Dave Jones <davej@redhat.com>
4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12
13#include <asm/processor.h>
14#include <asm/system.h>
15#include <asm/mce.h>
16
17#include "mce.h"
18
19int mce_disabled;
20int nr_mce_banks;
21
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23
24/* Handle unconfigured int18 (should never happen) */
25static void unexpected_machine_check(struct pt_regs *regs, long error_code)
26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28}
29
30/* Call the installed machine check handler for this CPU setup. */
31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32
33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c)
35{
36 if (mce_disabled == 1)
37 return;
38
39 switch (c->x86_vendor) {
40 case X86_VENDOR_AMD:
41 amd_mcheck_init(c);
42 break;
43
44 case X86_VENDOR_INTEL:
45 if (c->x86 == 5)
46 intel_p5_mcheck_init(c);
47 if (c->x86 == 6)
48 intel_p6_mcheck_init(c);
49 if (c->x86 == 15)
50 intel_p4_mcheck_init(c);
51 break;
52
53 case X86_VENDOR_CENTAUR:
54 if (c->x86 == 5)
55 winchip_mcheck_init(c);
56 break;
57
58 default:
59 break;
60 }
61}
62
63static int __init mcheck_disable(char *str)
64{
65 mce_disabled = 1;
66 return 1;
67}
68
69static int __init mcheck_enable(char *str)
70{
71 mce_disabled = -1;
72 return 1;
73}
74
75__setup("nomce", mcheck_disable);
76__setup("mce", mcheck_enable);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
deleted file mode 100644
index 09dd1d414fc3..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ /dev/null
@@ -1,1187 +0,0 @@
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 * Copyright 2008 Intel Corporation
7 * Author: Andi Kleen
8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/sched.h>
14#include <linux/smp_lock.h>
15#include <linux/string.h>
16#include <linux/rcupdate.h>
17#include <linux/kallsyms.h>
18#include <linux/sysdev.h>
19#include <linux/miscdevice.h>
20#include <linux/fs.h>
21#include <linux/capability.h>
22#include <linux/cpu.h>
23#include <linux/percpu.h>
24#include <linux/poll.h>
25#include <linux/thread_info.h>
26#include <linux/ctype.h>
27#include <linux/kmod.h>
28#include <linux/kdebug.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31#include <linux/ratelimit.h>
32#include <asm/processor.h>
33#include <asm/msr.h>
34#include <asm/mce.h>
35#include <asm/uaccess.h>
36#include <asm/smp.h>
37#include <asm/idle.h>
38
39#define MISC_MCELOG_MINOR 227
40
41atomic_t mce_entry;
42
43static int mce_dont_init;
44
45/*
46 * Tolerant levels:
47 * 0: always panic on uncorrected errors, log corrected errors
48 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
49 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
50 * 3: never panic or SIGBUS, log all errors (for testing only)
51 */
52static int tolerant = 1;
53static int banks;
54static u64 *bank;
55static unsigned long notify_user;
56static int rip_msr;
57static int mce_bootlog = -1;
58static atomic_t mce_events;
59
60static char trigger[128];
61static char *trigger_argv[2] = { trigger, NULL };
62
63static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
64
65/* MCA banks polled by the period polling timer for corrected events */
66DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
67 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
68};
69
70/* Do initial initialization of a struct mce */
71void mce_setup(struct mce *m)
72{
73 memset(m, 0, sizeof(struct mce));
74 m->cpu = smp_processor_id();
75 rdtscll(m->tsc);
76}
77
78/*
79 * Lockless MCE logging infrastructure.
80 * This avoids deadlocks on printk locks without having to break locks. Also
81 * separate MCEs from kernel messages to avoid bogus bug reports.
82 */
83
84static struct mce_log mcelog = {
85 MCE_LOG_SIGNATURE,
86 MCE_LOG_LEN,
87};
88
89void mce_log(struct mce *mce)
90{
91 unsigned next, entry;
92 atomic_inc(&mce_events);
93 mce->finished = 0;
94 wmb();
95 for (;;) {
96 entry = rcu_dereference(mcelog.next);
97 for (;;) {
98 /* When the buffer fills up discard new entries. Assume
99 that the earlier errors are the more interesting. */
100 if (entry >= MCE_LOG_LEN) {
101 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
102 return;
103 }
104 /* Old left over entry. Skip. */
105 if (mcelog.entry[entry].finished) {
106 entry++;
107 continue;
108 }
109 break;
110 }
111 smp_rmb();
112 next = entry + 1;
113 if (cmpxchg(&mcelog.next, entry, next) == entry)
114 break;
115 }
116 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
117 wmb();
118 mcelog.entry[entry].finished = 1;
119 wmb();
120
121 set_bit(0, &notify_user);
122}
123
124static void print_mce(struct mce *m)
125{
126 printk(KERN_EMERG "\n"
127 KERN_EMERG "HARDWARE ERROR\n"
128 KERN_EMERG
129 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
130 m->cpu, m->mcgstatus, m->bank, m->status);
131 if (m->ip) {
132 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
133 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
134 m->cs, m->ip);
135 if (m->cs == __KERNEL_CS)
136 print_symbol("{%s}", m->ip);
137 printk("\n");
138 }
139 printk(KERN_EMERG "TSC %llx ", m->tsc);
140 if (m->addr)
141 printk("ADDR %llx ", m->addr);
142 if (m->misc)
143 printk("MISC %llx ", m->misc);
144 printk("\n");
145 printk(KERN_EMERG "This is not a software problem!\n");
146 printk(KERN_EMERG "Run through mcelog --ascii to decode "
147 "and contact your hardware vendor\n");
148}
149
150static void mce_panic(char *msg, struct mce *backup, unsigned long start)
151{
152 int i;
153
154 oops_begin();
155 for (i = 0; i < MCE_LOG_LEN; i++) {
156 unsigned long tsc = mcelog.entry[i].tsc;
157
158 if (time_before(tsc, start))
159 continue;
160 print_mce(&mcelog.entry[i]);
161 if (backup && mcelog.entry[i].tsc == backup->tsc)
162 backup = NULL;
163 }
164 if (backup)
165 print_mce(backup);
166 panic(msg);
167}
168
169int mce_available(struct cpuinfo_x86 *c)
170{
171 if (mce_dont_init)
172 return 0;
173 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
174}
175
176static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
177{
178 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
179 m->ip = regs->ip;
180 m->cs = regs->cs;
181 } else {
182 m->ip = 0;
183 m->cs = 0;
184 }
185 if (rip_msr) {
186 /* Assume the RIP in the MSR is exact. Is this true? */
187 m->mcgstatus |= MCG_STATUS_EIPV;
188 rdmsrl(rip_msr, m->ip);
189 m->cs = 0;
190 }
191}
192
193/*
194 * Poll for corrected events or events that happened before reset.
195 * Those are just logged through /dev/mcelog.
196 *
197 * This is executed in standard interrupt context.
198 */
199void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
200{
201 struct mce m;
202 int i;
203
204 mce_setup(&m);
205
206 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
207 for (i = 0; i < banks; i++) {
208 if (!bank[i] || !test_bit(i, *b))
209 continue;
210
211 m.misc = 0;
212 m.addr = 0;
213 m.bank = i;
214 m.tsc = 0;
215
216 barrier();
217 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
218 if (!(m.status & MCI_STATUS_VAL))
219 continue;
220
221 /*
222 * Uncorrected events are handled by the exception handler
223 * when it is enabled. But when the exception is disabled log
224 * everything.
225 *
226 * TBD do the same check for MCI_STATUS_EN here?
227 */
228 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
229 continue;
230
231 if (m.status & MCI_STATUS_MISCV)
232 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
233 if (m.status & MCI_STATUS_ADDRV)
234 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235
236 if (!(flags & MCP_TIMESTAMP))
237 m.tsc = 0;
238 /*
239 * Don't get the IP here because it's unlikely to
240 * have anything to do with the actual error location.
241 */
242 if (!(flags & MCP_DONTLOG)) {
243 mce_log(&m);
244 add_taint(TAINT_MACHINE_CHECK);
245 }
246
247 /*
248 * Clear state for this bank.
249 */
250 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
251 }
252
253 /*
254 * Don't clear MCG_STATUS here because it's only defined for
255 * exceptions.
256 */
257}
258
259/*
260 * The actual machine check handler. This only handles real
261 * exceptions when something got corrupted coming in through int 18.
262 *
263 * This is executed in NMI context not subject to normal locking rules. This
264 * implies that most kernel services cannot be safely used. Don't even
265 * think about putting a printk in there!
266 */
267void do_machine_check(struct pt_regs * regs, long error_code)
268{
269 struct mce m, panicm;
270 u64 mcestart = 0;
271 int i;
272 int panicm_found = 0;
273 /*
274 * If no_way_out gets set, there is no safe way to recover from this
275 * MCE. If tolerant is cranked up, we'll try anyway.
276 */
277 int no_way_out = 0;
278 /*
279 * If kill_it gets set, there might be a way to recover from this
280 * error.
281 */
282 int kill_it = 0;
283 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
284
285 atomic_inc(&mce_entry);
286
287 if (notify_die(DIE_NMI, "machine check", regs, error_code,
288 18, SIGKILL) == NOTIFY_STOP)
289 goto out2;
290 if (!banks)
291 goto out2;
292
293 mce_setup(&m);
294
295 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
296 /* if the restart IP is not valid, we're done for */
297 if (!(m.mcgstatus & MCG_STATUS_RIPV))
298 no_way_out = 1;
299
300 rdtscll(mcestart);
301 barrier();
302
303 for (i = 0; i < banks; i++) {
304 __clear_bit(i, toclear);
305 if (!bank[i])
306 continue;
307
308 m.misc = 0;
309 m.addr = 0;
310 m.bank = i;
311
312 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
313 if ((m.status & MCI_STATUS_VAL) == 0)
314 continue;
315
316 /*
317 * Non uncorrected errors are handled by machine_check_poll
318 * Leave them alone.
319 */
320 if ((m.status & MCI_STATUS_UC) == 0)
321 continue;
322
323 /*
324 * Set taint even when machine check was not enabled.
325 */
326 add_taint(TAINT_MACHINE_CHECK);
327
328 __set_bit(i, toclear);
329
330 if (m.status & MCI_STATUS_EN) {
331 /* if PCC was set, there's no way out */
332 no_way_out |= !!(m.status & MCI_STATUS_PCC);
333 /*
334 * If this error was uncorrectable and there was
335 * an overflow, we're in trouble. If no overflow,
336 * we might get away with just killing a task.
337 */
338 if (m.status & MCI_STATUS_UC) {
339 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
340 no_way_out = 1;
341 kill_it = 1;
342 }
343 } else {
344 /*
345 * Machine check event was not enabled. Clear, but
346 * ignore.
347 */
348 continue;
349 }
350
351 if (m.status & MCI_STATUS_MISCV)
352 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
353 if (m.status & MCI_STATUS_ADDRV)
354 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
355
356 mce_get_rip(&m, regs);
357 mce_log(&m);
358
359 /* Did this bank cause the exception? */
360 /* Assume that the bank with uncorrectable errors did it,
361 and that there is only a single one. */
362 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
363 panicm = m;
364 panicm_found = 1;
365 }
366 }
367
368 /* If we didn't find an uncorrectable error, pick
369 the last one (shouldn't happen, just being safe). */
370 if (!panicm_found)
371 panicm = m;
372
373 /*
374 * If we have decided that we just CAN'T continue, and the user
375 * has not set tolerant to an insane level, give up and die.
376 */
377 if (no_way_out && tolerant < 3)
378 mce_panic("Machine check", &panicm, mcestart);
379
380 /*
381 * If the error seems to be unrecoverable, something should be
382 * done. Try to kill as little as possible. If we can kill just
383 * one task, do that. If the user has set the tolerance very
384 * high, don't try to do anything at all.
385 */
386 if (kill_it && tolerant < 3) {
387 int user_space = 0;
388
389 /*
390 * If the EIPV bit is set, it means the saved IP is the
391 * instruction which caused the MCE.
392 */
393 if (m.mcgstatus & MCG_STATUS_EIPV)
394 user_space = panicm.ip && (panicm.cs & 3);
395
396 /*
397 * If we know that the error was in user space, send a
398 * SIGBUS. Otherwise, panic if tolerance is low.
399 *
400 * force_sig() takes an awful lot of locks and has a slight
401 * risk of deadlocking.
402 */
403 if (user_space) {
404 force_sig(SIGBUS, current);
405 } else if (panic_on_oops || tolerant < 2) {
406 mce_panic("Uncorrected machine check",
407 &panicm, mcestart);
408 }
409 }
410
411 /* notify userspace ASAP */
412 set_thread_flag(TIF_MCE_NOTIFY);
413
414 /* the last thing we do is clear state */
415 for (i = 0; i < banks; i++) {
416 if (test_bit(i, toclear))
417 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
418 }
419 wrmsrl(MSR_IA32_MCG_STATUS, 0);
420 out2:
421 atomic_dec(&mce_entry);
422}
423
424#ifdef CONFIG_X86_MCE_INTEL
425/***
426 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
427 * @cpu: The CPU on which the event occurred.
428 * @status: Event status information
429 *
430 * This function should be called by the thermal interrupt after the
431 * event has been processed and the decision was made to log the event
432 * further.
433 *
434 * The status parameter will be saved to the 'status' field of 'struct mce'
435 * and historically has been the register value of the
436 * MSR_IA32_THERMAL_STATUS (Intel) msr.
437 */
438void mce_log_therm_throt_event(__u64 status)
439{
440 struct mce m;
441
442 mce_setup(&m);
443 m.bank = MCE_THERMAL_BANK;
444 m.status = status;
445 mce_log(&m);
446}
447#endif /* CONFIG_X86_MCE_INTEL */
448
449/*
450 * Periodic polling timer for "silent" machine check errors. If the
451 * poller finds an MCE, poll 2x faster. When the poller finds no more
452 * errors, poll 2x slower (up to check_interval seconds).
453 */
454
455static int check_interval = 5 * 60; /* 5 minutes */
456static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
457static void mcheck_timer(unsigned long);
458static DEFINE_PER_CPU(struct timer_list, mce_timer);
459
460static void mcheck_timer(unsigned long data)
461{
462 struct timer_list *t = &per_cpu(mce_timer, data);
463 int *n;
464
465 WARN_ON(smp_processor_id() != data);
466
467 if (mce_available(&current_cpu_data))
468 machine_check_poll(MCP_TIMESTAMP,
469 &__get_cpu_var(mce_poll_banks));
470
471 /*
472 * Alert userspace if needed. If we logged an MCE, reduce the
473 * polling interval, otherwise increase the polling interval.
474 */
475 n = &__get_cpu_var(next_interval);
476 if (mce_notify_user()) {
477 *n = max(*n/2, HZ/100);
478 } else {
479 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
480 }
481
482 t->expires = jiffies + *n;
483 add_timer(t);
484}
485
486static void mce_do_trigger(struct work_struct *work)
487{
488 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
489}
490
491static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
492
493/*
494 * Notify the user(s) about new machine check events.
495 * Can be called from interrupt context, but not from machine check/NMI
496 * context.
497 */
498int mce_notify_user(void)
499{
500 /* Not more than two messages every minute */
501 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
502
503 clear_thread_flag(TIF_MCE_NOTIFY);
504 if (test_and_clear_bit(0, &notify_user)) {
505 wake_up_interruptible(&mce_wait);
506
507 /*
508 * There is no risk of missing notifications because
509 * work_pending is always cleared before the function is
510 * executed.
511 */
512 if (trigger[0] && !work_pending(&mce_trigger_work))
513 schedule_work(&mce_trigger_work);
514
515 if (__ratelimit(&ratelimit))
516 printk(KERN_INFO "Machine check events logged\n");
517
518 return 1;
519 }
520 return 0;
521}
522
523/* see if the idle task needs to notify userspace */
524static int
525mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
526{
527 /* IDLE_END should be safe - interrupts are back on */
528 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
529 mce_notify_user();
530
531 return NOTIFY_OK;
532}
533
534static struct notifier_block mce_idle_notifier = {
535 .notifier_call = mce_idle_callback,
536};
537
538static __init int periodic_mcheck_init(void)
539{
540 idle_notifier_register(&mce_idle_notifier);
541 return 0;
542}
543__initcall(periodic_mcheck_init);
544
545/*
546 * Initialize Machine Checks for a CPU.
547 */
548static int mce_cap_init(void)
549{
550 u64 cap;
551 unsigned b;
552
553 rdmsrl(MSR_IA32_MCG_CAP, cap);
554 b = cap & 0xff;
555 if (b > MAX_NR_BANKS) {
556 printk(KERN_WARNING
557 "MCE: Using only %u machine check banks out of %u\n",
558 MAX_NR_BANKS, b);
559 b = MAX_NR_BANKS;
560 }
561
562 /* Don't support asymmetric configurations today */
563 WARN_ON(banks != 0 && b != banks);
564 banks = b;
565 if (!bank) {
566 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
567 if (!bank)
568 return -ENOMEM;
569 memset(bank, 0xff, banks * sizeof(u64));
570 }
571
572 /* Use accurate RIP reporting if available. */
573 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
574 rip_msr = MSR_IA32_MCG_EIP;
575
576 return 0;
577}
578
579static void mce_init(void *dummy)
580{
581 u64 cap;
582 int i;
583 mce_banks_t all_banks;
584
585 /*
586 * Log the machine checks left over from the previous reset.
587 */
588 bitmap_fill(all_banks, MAX_NR_BANKS);
589 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
590
591 set_in_cr4(X86_CR4_MCE);
592
593 rdmsrl(MSR_IA32_MCG_CAP, cap);
594 if (cap & MCG_CTL_P)
595 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
596
597 for (i = 0; i < banks; i++) {
598 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
599 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
600 }
601}
602
603/* Add per CPU specific workarounds here */
604static void mce_cpu_quirks(struct cpuinfo_x86 *c)
605{
606 /* This should be disabled by the BIOS, but isn't always */
607 if (c->x86_vendor == X86_VENDOR_AMD) {
608 if (c->x86 == 15 && banks > 4)
609 /* disable GART TBL walk error reporting, which trips off
610 incorrectly with the IOMMU & 3ware & Cerberus. */
611 clear_bit(10, (unsigned long *)&bank[4]);
612 if(c->x86 <= 17 && mce_bootlog < 0)
613 /* Lots of broken BIOS around that don't clear them
614 by default and leave crap in there. Don't log. */
615 mce_bootlog = 0;
616 }
617
618}
619
620static void mce_cpu_features(struct cpuinfo_x86 *c)
621{
622 switch (c->x86_vendor) {
623 case X86_VENDOR_INTEL:
624 mce_intel_feature_init(c);
625 break;
626 case X86_VENDOR_AMD:
627 mce_amd_feature_init(c);
628 break;
629 default:
630 break;
631 }
632}
633
634static void mce_init_timer(void)
635{
636 struct timer_list *t = &__get_cpu_var(mce_timer);
637 int *n = &__get_cpu_var(next_interval);
638
639 *n = check_interval * HZ;
640 if (!*n)
641 return;
642 setup_timer(t, mcheck_timer, smp_processor_id());
643 t->expires = round_jiffies(jiffies + *n);
644 add_timer(t);
645}
646
647/*
648 * Called for each booted CPU to set up machine checks.
649 * Must be called with preempt off.
650 */
651void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
652{
653 if (!mce_available(c))
654 return;
655
656 if (mce_cap_init() < 0) {
657 mce_dont_init = 1;
658 return;
659 }
660 mce_cpu_quirks(c);
661
662 mce_init(NULL);
663 mce_cpu_features(c);
664 mce_init_timer();
665}
666
667/*
668 * Character device to read and clear the MCE log.
669 */
670
671static DEFINE_SPINLOCK(mce_state_lock);
672static int open_count; /* #times opened */
673static int open_exclu; /* already open exclusive? */
674
675static int mce_open(struct inode *inode, struct file *file)
676{
677 lock_kernel();
678 spin_lock(&mce_state_lock);
679
680 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
681 spin_unlock(&mce_state_lock);
682 unlock_kernel();
683 return -EBUSY;
684 }
685
686 if (file->f_flags & O_EXCL)
687 open_exclu = 1;
688 open_count++;
689
690 spin_unlock(&mce_state_lock);
691 unlock_kernel();
692
693 return nonseekable_open(inode, file);
694}
695
696static int mce_release(struct inode *inode, struct file *file)
697{
698 spin_lock(&mce_state_lock);
699
700 open_count--;
701 open_exclu = 0;
702
703 spin_unlock(&mce_state_lock);
704
705 return 0;
706}
707
708static void collect_tscs(void *data)
709{
710 unsigned long *cpu_tsc = (unsigned long *)data;
711
712 rdtscll(cpu_tsc[smp_processor_id()]);
713}
714
715static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
716 loff_t *off)
717{
718 unsigned long *cpu_tsc;
719 static DEFINE_MUTEX(mce_read_mutex);
720 unsigned prev, next;
721 char __user *buf = ubuf;
722 int i, err;
723
724 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
725 if (!cpu_tsc)
726 return -ENOMEM;
727
728 mutex_lock(&mce_read_mutex);
729 next = rcu_dereference(mcelog.next);
730
731 /* Only supports full reads right now */
732 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
733 mutex_unlock(&mce_read_mutex);
734 kfree(cpu_tsc);
735 return -EINVAL;
736 }
737
738 err = 0;
739 prev = 0;
740 do {
741 for (i = prev; i < next; i++) {
742 unsigned long start = jiffies;
743
744 while (!mcelog.entry[i].finished) {
745 if (time_after_eq(jiffies, start + 2)) {
746 memset(mcelog.entry + i, 0,
747 sizeof(struct mce));
748 goto timeout;
749 }
750 cpu_relax();
751 }
752 smp_rmb();
753 err |= copy_to_user(buf, mcelog.entry + i,
754 sizeof(struct mce));
755 buf += sizeof(struct mce);
756timeout:
757 ;
758 }
759
760 memset(mcelog.entry + prev, 0,
761 (next - prev) * sizeof(struct mce));
762 prev = next;
763 next = cmpxchg(&mcelog.next, prev, 0);
764 } while (next != prev);
765
766 synchronize_sched();
767
768 /*
769 * Collect entries that were still getting written before the
770 * synchronize.
771 */
772 on_each_cpu(collect_tscs, cpu_tsc, 1);
773 for (i = next; i < MCE_LOG_LEN; i++) {
774 if (mcelog.entry[i].finished &&
775 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
776 err |= copy_to_user(buf, mcelog.entry+i,
777 sizeof(struct mce));
778 smp_rmb();
779 buf += sizeof(struct mce);
780 memset(&mcelog.entry[i], 0, sizeof(struct mce));
781 }
782 }
783 mutex_unlock(&mce_read_mutex);
784 kfree(cpu_tsc);
785 return err ? -EFAULT : buf - ubuf;
786}
787
788static unsigned int mce_poll(struct file *file, poll_table *wait)
789{
790 poll_wait(file, &mce_wait, wait);
791 if (rcu_dereference(mcelog.next))
792 return POLLIN | POLLRDNORM;
793 return 0;
794}
795
796static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
797{
798 int __user *p = (int __user *)arg;
799
800 if (!capable(CAP_SYS_ADMIN))
801 return -EPERM;
802 switch (cmd) {
803 case MCE_GET_RECORD_LEN:
804 return put_user(sizeof(struct mce), p);
805 case MCE_GET_LOG_LEN:
806 return put_user(MCE_LOG_LEN, p);
807 case MCE_GETCLEAR_FLAGS: {
808 unsigned flags;
809
810 do {
811 flags = mcelog.flags;
812 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
813 return put_user(flags, p);
814 }
815 default:
816 return -ENOTTY;
817 }
818}
819
820static const struct file_operations mce_chrdev_ops = {
821 .open = mce_open,
822 .release = mce_release,
823 .read = mce_read,
824 .poll = mce_poll,
825 .unlocked_ioctl = mce_ioctl,
826};
827
828static struct miscdevice mce_log_device = {
829 MISC_MCELOG_MINOR,
830 "mcelog",
831 &mce_chrdev_ops,
832};
833
834/*
835 * Old style boot options parsing. Only for compatibility.
836 */
837static int __init mcheck_disable(char *str)
838{
839 mce_dont_init = 1;
840 return 1;
841}
842
843/* mce=off disables machine check.
844 mce=TOLERANCELEVEL (number, see above)
845 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
846 mce=nobootlog Don't log MCEs from before booting. */
847static int __init mcheck_enable(char *str)
848{
849 if (!strcmp(str, "off"))
850 mce_dont_init = 1;
851 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
852 mce_bootlog = str[0] == 'b';
853 else if (isdigit(str[0]))
854 get_option(&str, &tolerant);
855 else
856 printk("mce= argument %s ignored. Please use /sys", str);
857 return 1;
858}
859
860__setup("nomce", mcheck_disable);
861__setup("mce=", mcheck_enable);
862
863/*
864 * Sysfs support
865 */
866
867/*
868 * Disable machine checks on suspend and shutdown. We can't really handle
869 * them later.
870 */
871static int mce_disable(void)
872{
873 int i;
874
875 for (i = 0; i < banks; i++)
876 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
877 return 0;
878}
879
880static int mce_suspend(struct sys_device *dev, pm_message_t state)
881{
882 return mce_disable();
883}
884
885static int mce_shutdown(struct sys_device *dev)
886{
887 return mce_disable();
888}
889
890/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 891 Only one CPU is active at this time, the others get re-added later using
892 CPU hotplug. */
893static int mce_resume(struct sys_device *dev)
894{
895 mce_init(NULL);
896 mce_cpu_features(&current_cpu_data);
897 return 0;
898}
899
900static void mce_cpu_restart(void *data)
901{
902 del_timer_sync(&__get_cpu_var(mce_timer));
903 if (mce_available(&current_cpu_data))
904 mce_init(NULL);
905 mce_init_timer();
906}
907
908/* Reinit MCEs after user configuration changes */
909static void mce_restart(void)
910{
911 on_each_cpu(mce_cpu_restart, NULL, 1);
912}
913
914static struct sysdev_class mce_sysclass = {
915 .suspend = mce_suspend,
916 .shutdown = mce_shutdown,
917 .resume = mce_resume,
918 .name = "machinecheck",
919};
920
921DEFINE_PER_CPU(struct sys_device, device_mce);
922void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
923
924/* Why are there no generic functions for this? */
925#define ACCESSOR(name, var, start) \
926 static ssize_t show_ ## name(struct sys_device *s, \
927 struct sysdev_attribute *attr, \
928 char *buf) { \
929 return sprintf(buf, "%lx\n", (unsigned long)var); \
930 } \
931 static ssize_t set_ ## name(struct sys_device *s, \
932 struct sysdev_attribute *attr, \
933 const char *buf, size_t siz) { \
934 char *end; \
935 unsigned long new = simple_strtoul(buf, &end, 0); \
936 if (end == buf) return -EINVAL; \
937 var = new; \
938 start; \
939 return end-buf; \
940 } \
941 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
942
943static struct sysdev_attribute *bank_attrs;
944
945static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
946 char *buf)
947{
948 u64 b = bank[attr - bank_attrs];
949 return sprintf(buf, "%llx\n", b);
950}
951
952static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
953 const char *buf, size_t siz)
954{
955 char *end;
956 u64 new = simple_strtoull(buf, &end, 0);
957 if (end == buf)
958 return -EINVAL;
959 bank[attr - bank_attrs] = new;
960 mce_restart();
961 return end-buf;
962}
963
964static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
965 char *buf)
966{
967 strcpy(buf, trigger);
968 strcat(buf, "\n");
969 return strlen(trigger) + 1;
970}
971
972static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
973 const char *buf,size_t siz)
974{
975 char *p;
976 int len;
977 strncpy(trigger, buf, sizeof(trigger));
978 trigger[sizeof(trigger)-1] = 0;
979 len = strlen(trigger);
980 p = strchr(trigger, '\n');
981 if (*p) *p = 0;
982 return len;
983}
984
985static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
986static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
987ACCESSOR(check_interval,check_interval,mce_restart())
988static struct sysdev_attribute *mce_attributes[] = {
989 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
990 NULL
991};
992
993static cpumask_var_t mce_device_initialized;
994
995/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
996static __cpuinit int mce_create_device(unsigned int cpu)
997{
998 int err;
999 int i;
1000
1001 if (!mce_available(&boot_cpu_data))
1002 return -EIO;
1003
1004 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
1005 per_cpu(device_mce,cpu).id = cpu;
1006 per_cpu(device_mce,cpu).cls = &mce_sysclass;
1007
1008 err = sysdev_register(&per_cpu(device_mce,cpu));
1009 if (err)
1010 return err;
1011
1012 for (i = 0; mce_attributes[i]; i++) {
1013 err = sysdev_create_file(&per_cpu(device_mce,cpu),
1014 mce_attributes[i]);
1015 if (err)
1016 goto error;
1017 }
1018 for (i = 0; i < banks; i++) {
1019 err = sysdev_create_file(&per_cpu(device_mce, cpu),
1020 &bank_attrs[i]);
1021 if (err)
1022 goto error2;
1023 }
1024 cpumask_set_cpu(cpu, mce_device_initialized);
1025
1026 return 0;
1027error2:
1028 while (--i >= 0) {
1029 sysdev_remove_file(&per_cpu(device_mce, cpu),
1030 &bank_attrs[i]);
1031 }
1032error:
1033 while (--i >= 0) {
1034 sysdev_remove_file(&per_cpu(device_mce,cpu),
1035 mce_attributes[i]);
1036 }
1037 sysdev_unregister(&per_cpu(device_mce,cpu));
1038
1039 return err;
1040}
1041
1042static __cpuinit void mce_remove_device(unsigned int cpu)
1043{
1044 int i;
1045
1046 if (!cpumask_test_cpu(cpu, mce_device_initialized))
1047 return;
1048
1049 for (i = 0; mce_attributes[i]; i++)
1050 sysdev_remove_file(&per_cpu(device_mce,cpu),
1051 mce_attributes[i]);
1052 for (i = 0; i < banks; i++)
1053 sysdev_remove_file(&per_cpu(device_mce, cpu),
1054 &bank_attrs[i]);
1055 sysdev_unregister(&per_cpu(device_mce,cpu));
1056 cpumask_clear_cpu(cpu, mce_device_initialized);
1057}
1058
1059/* Make sure there are no machine checks on offlined CPUs. */
1060static void mce_disable_cpu(void *h)
1061{
1062 int i;
1063 unsigned long action = *(unsigned long *)h;
1064
1065 if (!mce_available(&current_cpu_data))
1066 return;
1067 if (!(action & CPU_TASKS_FROZEN))
1068 cmci_clear();
1069 for (i = 0; i < banks; i++)
1070 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1071}
1072
1073static void mce_reenable_cpu(void *h)
1074{
1075 int i;
1076 unsigned long action = *(unsigned long *)h;
1077
1078 if (!mce_available(&current_cpu_data))
1079 return;
1080 if (!(action & CPU_TASKS_FROZEN))
1081 cmci_reenable();
1082 for (i = 0; i < banks; i++)
1083 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1084}
1085
1086/* Get notified when a cpu comes on/off. Be hotplug friendly. */
1087static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
1088 unsigned long action, void *hcpu)
1089{
1090 unsigned int cpu = (unsigned long)hcpu;
1091 struct timer_list *t = &per_cpu(mce_timer, cpu);
1092
1093 switch (action) {
1094 case CPU_ONLINE:
1095 case CPU_ONLINE_FROZEN:
1096 mce_create_device(cpu);
1097 if (threshold_cpu_callback)
1098 threshold_cpu_callback(action, cpu);
1099 break;
1100 case CPU_DEAD:
1101 case CPU_DEAD_FROZEN:
1102 if (threshold_cpu_callback)
1103 threshold_cpu_callback(action, cpu);
1104 mce_remove_device(cpu);
1105 break;
1106 case CPU_DOWN_PREPARE:
1107 case CPU_DOWN_PREPARE_FROZEN:
1108 del_timer_sync(t);
1109 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1110 break;
1111 case CPU_DOWN_FAILED:
1112 case CPU_DOWN_FAILED_FROZEN:
1113 t->expires = round_jiffies(jiffies +
1114 __get_cpu_var(next_interval));
1115 add_timer_on(t, cpu);
1116 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1117 break;
1118 case CPU_POST_DEAD:
1119 /* intentionally ignoring frozen here */
1120 cmci_rediscover(cpu);
1121 break;
1122 }
1123 return NOTIFY_OK;
1124}
1125
1126static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1127 .notifier_call = mce_cpu_callback,
1128};
1129
1130static __init int mce_init_banks(void)
1131{
1132 int i;
1133
1134 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1135 GFP_KERNEL);
1136 if (!bank_attrs)
1137 return -ENOMEM;
1138
1139 for (i = 0; i < banks; i++) {
1140 struct sysdev_attribute *a = &bank_attrs[i];
1141 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
1142 if (!a->attr.name)
1143 goto nomem;
1144 a->attr.mode = 0644;
1145 a->show = show_bank;
1146 a->store = set_bank;
1147 }
1148 return 0;
1149
1150nomem:
1151 while (--i >= 0)
1152 kfree(bank_attrs[i].attr.name);
1153 kfree(bank_attrs);
1154 bank_attrs = NULL;
1155 return -ENOMEM;
1156}
1157
1158static __init int mce_init_device(void)
1159{
1160 int err;
1161 int i = 0;
1162
1163 if (!mce_available(&boot_cpu_data))
1164 return -EIO;
1165
1166 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
1167
1168 err = mce_init_banks();
1169 if (err)
1170 return err;
1171
1172 err = sysdev_class_register(&mce_sysclass);
1173 if (err)
1174 return err;
1175
1176 for_each_online_cpu(i) {
1177 err = mce_create_device(i);
1178 if (err)
1179 return err;
1180 }
1181
1182 register_hotcpu_notifier(&mce_cpu_notifier);
1183 misc_register(&mce_log_device);
1184 return err;
1185}
1186
1187device_initcall(mce_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 56dde9c4bc96..ddae21620bda 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -13,22 +13,22 @@
13 * 13 *
14 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
15 */ 15 */
16
17#include <linux/cpu.h>
18#include <linux/errno.h>
19#include <linux/init.h>
20#include <linux/interrupt.h> 16#include <linux/interrupt.h>
21#include <linux/kobject.h>
22#include <linux/notifier.h> 17#include <linux/notifier.h>
23#include <linux/sched.h> 18#include <linux/kobject.h>
24#include <linux/smp.h> 19#include <linux/percpu.h>
25#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/errno.h>
22#include <linux/sched.h>
26#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/init.h>
25#include <linux/cpu.h>
26#include <linux/smp.h>
27
27#include <asm/apic.h> 28#include <asm/apic.h>
29#include <asm/idle.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
29#include <asm/msr.h> 31#include <asm/msr.h>
30#include <asm/percpu.h>
31#include <asm/idle.h>
32 32
33#define PFX "mce_threshold: " 33#define PFX "mce_threshold: "
34#define VERSION "version 1.1.1" 34#define VERSION "version 1.1.1"
@@ -48,26 +48,26 @@
48#define MCG_XBLK_ADDR 0xC0000400 48#define MCG_XBLK_ADDR 0xC0000400
49 49
50struct threshold_block { 50struct threshold_block {
51 unsigned int block; 51 unsigned int block;
52 unsigned int bank; 52 unsigned int bank;
53 unsigned int cpu; 53 unsigned int cpu;
54 u32 address; 54 u32 address;
55 u16 interrupt_enable; 55 u16 interrupt_enable;
56 u16 threshold_limit; 56 u16 threshold_limit;
57 struct kobject kobj; 57 struct kobject kobj;
58 struct list_head miscj; 58 struct list_head miscj;
59}; 59};
60 60
61/* defaults used early on boot */ 61/* defaults used early on boot */
62static struct threshold_block threshold_defaults = { 62static struct threshold_block threshold_defaults = {
63 .interrupt_enable = 0, 63 .interrupt_enable = 0,
64 .threshold_limit = THRESHOLD_MAX, 64 .threshold_limit = THRESHOLD_MAX,
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -86,9 +86,9 @@ static void amd_threshold_interrupt(void);
86 */ 86 */
87 87
88struct thresh_restart { 88struct thresh_restart {
89 struct threshold_block *b; 89 struct threshold_block *b;
90 int reset; 90 int reset;
91 u16 old_limit; 91 u16 old_limit;
92}; 92};
93 93
94/* must be called with correct cpu affinity */ 94/* must be called with correct cpu affinity */
@@ -110,6 +110,7 @@ static void threshold_restart_bank(void *_tr)
110 } else if (tr->old_limit) { /* change limit w/o reset */ 110 } else if (tr->old_limit) { /* change limit w/o reset */
111 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 111 int new_count = (mci_misc_hi & THRESHOLD_MAX) +
112 (tr->old_limit - tr->b->threshold_limit); 112 (tr->old_limit - tr->b->threshold_limit);
113
113 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 114 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
114 (new_count & THRESHOLD_MAX); 115 (new_count & THRESHOLD_MAX);
115 } 116 }
@@ -125,11 +126,11 @@ static void threshold_restart_bank(void *_tr)
125/* cpu init entry point, called from mce.c with preempt off */ 126/* cpu init entry point, called from mce.c with preempt off */
126void mce_amd_feature_init(struct cpuinfo_x86 *c) 127void mce_amd_feature_init(struct cpuinfo_x86 *c)
127{ 128{
128 unsigned int bank, block;
129 unsigned int cpu = smp_processor_id(); 129 unsigned int cpu = smp_processor_id();
130 u8 lvt_off;
131 u32 low = 0, high = 0, address = 0; 130 u32 low = 0, high = 0, address = 0;
131 unsigned int bank, block;
132 struct thresh_restart tr; 132 struct thresh_restart tr;
133 u8 lvt_off;
133 134
134 for (bank = 0; bank < NR_BANKS; ++bank) { 135 for (bank = 0; bank < NR_BANKS; ++bank) {
135 for (block = 0; block < NR_BLOCKS; ++block) { 136 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -140,8 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
140 if (!address) 141 if (!address)
141 break; 142 break;
142 address += MCG_XBLK_ADDR; 143 address += MCG_XBLK_ADDR;
143 } 144 } else
144 else
145 ++address; 145 ++address;
146 146
147 if (rdmsr_safe(address, &low, &high)) 147 if (rdmsr_safe(address, &low, &high))
@@ -193,9 +193,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
193 */ 193 */
194static void amd_threshold_interrupt(void) 194static void amd_threshold_interrupt(void)
195{ 195{
196 u32 low = 0, high = 0, address = 0;
196 unsigned int bank, block; 197 unsigned int bank, block;
197 struct mce m; 198 struct mce m;
198 u32 low = 0, high = 0, address = 0;
199 199
200 mce_setup(&m); 200 mce_setup(&m);
201 201
@@ -204,16 +204,16 @@ static void amd_threshold_interrupt(void)
204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) 204 if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
205 continue; 205 continue;
206 for (block = 0; block < NR_BLOCKS; ++block) { 206 for (block = 0; block < NR_BLOCKS; ++block) {
207 if (block == 0) 207 if (block == 0) {
208 address = MSR_IA32_MC0_MISC + bank * 4; 208 address = MSR_IA32_MC0_MISC + bank * 4;
209 else if (block == 1) { 209 } else if (block == 1) {
210 address = (low & MASK_BLKPTR_LO) >> 21; 210 address = (low & MASK_BLKPTR_LO) >> 21;
211 if (!address) 211 if (!address)
212 break; 212 break;
213 address += MCG_XBLK_ADDR; 213 address += MCG_XBLK_ADDR;
214 } 214 } else {
215 else
216 ++address; 215 ++address;
216 }
217 217
218 if (rdmsr_safe(address, &low, &high)) 218 if (rdmsr_safe(address, &low, &high))
219 break; 219 break;
@@ -229,8 +229,10 @@ static void amd_threshold_interrupt(void)
229 (high & MASK_LOCKED_HI)) 229 (high & MASK_LOCKED_HI))
230 continue; 230 continue;
231 231
232 /* Log the machine check that caused the threshold 232 /*
233 event. */ 233 * Log the machine check that caused the threshold
234 * event.
235 */
234 machine_check_poll(MCP_TIMESTAMP, 236 machine_check_poll(MCP_TIMESTAMP,
235 &__get_cpu_var(mce_poll_banks)); 237 &__get_cpu_var(mce_poll_banks));
236 238
@@ -254,48 +256,52 @@ static void amd_threshold_interrupt(void)
254 256
255struct threshold_attr { 257struct threshold_attr {
256 struct attribute attr; 258 struct attribute attr;
257 ssize_t(*show) (struct threshold_block *, char *); 259 ssize_t (*show) (struct threshold_block *, char *);
258 ssize_t(*store) (struct threshold_block *, const char *, size_t count); 260 ssize_t (*store) (struct threshold_block *, const char *, size_t count);
259}; 261};
260 262
261#define SHOW_FIELDS(name) \ 263#define SHOW_FIELDS(name) \
262static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ 264static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
263{ \ 265{ \
264 return sprintf(buf, "%lx\n", (unsigned long) b->name); \ 266 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
265} 267}
266SHOW_FIELDS(interrupt_enable) 268SHOW_FIELDS(interrupt_enable)
267SHOW_FIELDS(threshold_limit) 269SHOW_FIELDS(threshold_limit)
268 270
269static ssize_t store_interrupt_enable(struct threshold_block *b, 271static ssize_t
270 const char *buf, size_t count) 272store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
271{ 273{
272 char *end;
273 struct thresh_restart tr; 274 struct thresh_restart tr;
274 unsigned long new = simple_strtoul(buf, &end, 0); 275 unsigned long new;
275 if (end == buf) 276
277 if (strict_strtoul(buf, 0, &new) < 0)
276 return -EINVAL; 278 return -EINVAL;
279
277 b->interrupt_enable = !!new; 280 b->interrupt_enable = !!new;
278 281
279 tr.b = b; 282 tr.b = b;
280 tr.reset = 0; 283 tr.reset = 0;
281 tr.old_limit = 0; 284 tr.old_limit = 0;
285
282 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 286 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
283 287
284 return end - buf; 288 return size;
285} 289}
286 290
287static ssize_t store_threshold_limit(struct threshold_block *b, 291static ssize_t
288 const char *buf, size_t count) 292store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
289{ 293{
290 char *end;
291 struct thresh_restart tr; 294 struct thresh_restart tr;
292 unsigned long new = simple_strtoul(buf, &end, 0); 295 unsigned long new;
293 if (end == buf) 296
297 if (strict_strtoul(buf, 0, &new) < 0)
294 return -EINVAL; 298 return -EINVAL;
299
295 if (new > THRESHOLD_MAX) 300 if (new > THRESHOLD_MAX)
296 new = THRESHOLD_MAX; 301 new = THRESHOLD_MAX;
297 if (new < 1) 302 if (new < 1)
298 new = 1; 303 new = 1;
304
299 tr.old_limit = b->threshold_limit; 305 tr.old_limit = b->threshold_limit;
300 b->threshold_limit = new; 306 b->threshold_limit = new;
301 tr.b = b; 307 tr.b = b;
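Both store handlers now share the same parse-and-commit shape: strict_strtoul() rejects anything that is not a clean number (so the old end-pointer check disappears), and the handler returns the full write size on success instead of end - buf. A sketch of the resulting pattern, using a hypothetical attribute for illustration:

/* Sketch of the converted sysfs store pattern (hypothetical attribute). */
static ssize_t store_example(struct threshold_block *b,
                             const char *buf, size_t size)
{
        unsigned long new;

        if (strict_strtoul(buf, 0, &new) < 0)   /* malformed input */
                return -EINVAL;

        if (new > THRESHOLD_MAX)
                new = THRESHOLD_MAX;
        if (new < 1)
                new = 1;

        b->threshold_limit = new;

        return size;                            /* whole write consumed */
}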
@@ -303,12 +309,12 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
303 309
304 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 310 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
305 311
306 return end - buf; 312 return size;
307} 313}
308 314
309struct threshold_block_cross_cpu { 315struct threshold_block_cross_cpu {
310 struct threshold_block *tb; 316 struct threshold_block *tb;
311 long retval; 317 long retval;
312}; 318};
313 319
314static void local_error_count_handler(void *_tbcc) 320static void local_error_count_handler(void *_tbcc)
@@ -338,16 +344,13 @@ static ssize_t store_error_count(struct threshold_block *b,
338 return 1; 344 return 1;
339} 345}
340 346
341#define THRESHOLD_ATTR(_name,_mode,_show,_store) { \ 347#define RW_ATTR(val) \
342 .attr = {.name = __stringify(_name), .mode = _mode }, \ 348static struct threshold_attr val = { \
343 .show = _show, \ 349 .attr = {.name = __stringify(val), .mode = 0644 }, \
344 .store = _store, \ 350 .show = show_## val, \
351 .store = store_## val, \
345}; 352};
346 353
347#define RW_ATTR(name) \
348static struct threshold_attr name = \
349 THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
350
351RW_ATTR(interrupt_enable); 354RW_ATTR(interrupt_enable);
352RW_ATTR(threshold_limit); 355RW_ATTR(threshold_limit);
353RW_ATTR(error_count); 356RW_ATTR(error_count);
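The consolidated RW_ATTR() macro now emits the whole attribute in one step; RW_ATTR(threshold_limit), for example, expands to roughly:

static struct threshold_attr threshold_limit = {
        .attr   = { .name = "threshold_limit", .mode = 0644 },
        .show   = show_threshold_limit,
        .store  = store_threshold_limit,
};

which is what the old THRESHOLD_ATTR/RW_ATTR pair produced, minus the extra level of indirection.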
@@ -359,15 +362,17 @@ static struct attribute *default_attrs[] = {
359 NULL 362 NULL
360}; 363};
361 364
362#define to_block(k) container_of(k, struct threshold_block, kobj) 365#define to_block(k) container_of(k, struct threshold_block, kobj)
363#define to_attr(a) container_of(a, struct threshold_attr, attr) 366#define to_attr(a) container_of(a, struct threshold_attr, attr)
364 367
365static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 368static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
366{ 369{
367 struct threshold_block *b = to_block(kobj); 370 struct threshold_block *b = to_block(kobj);
368 struct threshold_attr *a = to_attr(attr); 371 struct threshold_attr *a = to_attr(attr);
369 ssize_t ret; 372 ssize_t ret;
373
370 ret = a->show ? a->show(b, buf) : -EIO; 374 ret = a->show ? a->show(b, buf) : -EIO;
375
371 return ret; 376 return ret;
372} 377}
373 378
@@ -377,18 +382,20 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
377 struct threshold_block *b = to_block(kobj); 382 struct threshold_block *b = to_block(kobj);
378 struct threshold_attr *a = to_attr(attr); 383 struct threshold_attr *a = to_attr(attr);
379 ssize_t ret; 384 ssize_t ret;
385
380 ret = a->store ? a->store(b, buf, count) : -EIO; 386 ret = a->store ? a->store(b, buf, count) : -EIO;
387
381 return ret; 388 return ret;
382} 389}
383 390
384static struct sysfs_ops threshold_ops = { 391static struct sysfs_ops threshold_ops = {
385 .show = show, 392 .show = show,
386 .store = store, 393 .store = store,
387}; 394};
388 395
389static struct kobj_type threshold_ktype = { 396static struct kobj_type threshold_ktype = {
390 .sysfs_ops = &threshold_ops, 397 .sysfs_ops = &threshold_ops,
391 .default_attrs = default_attrs, 398 .default_attrs = default_attrs,
392}; 399};
393 400
394static __cpuinit int allocate_threshold_blocks(unsigned int cpu, 401static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
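The show()/store() wrappers only receive the raw kobject and attribute pointers from sysfs; the to_block()/to_attr() macros recover the enclosing structures with container_of(), which boils down to pointer arithmetic over offsetof(). A simplified sketch (the real macro in <linux/kernel.h> adds type checking):

/* Simplified container_of(): step back from a member to its enclosing object. */
#define container_of_sketch(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/*
 * So inside show()/store():
 *   struct threshold_block *b = container_of(kobj, struct threshold_block, kobj);
 *   struct threshold_attr  *a = container_of(attr, struct threshold_attr, attr);
 */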
@@ -396,9 +403,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
396 unsigned int block, 403 unsigned int block,
397 u32 address) 404 u32 address)
398{ 405{
399 int err;
400 u32 low, high;
401 struct threshold_block *b = NULL; 406 struct threshold_block *b = NULL;
407 u32 low, high;
408 int err;
402 409
403 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) 410 if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
404 return 0; 411 return 0;
@@ -421,20 +428,21 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
421 if (!b) 428 if (!b)
422 return -ENOMEM; 429 return -ENOMEM;
423 430
424 b->block = block; 431 b->block = block;
425 b->bank = bank; 432 b->bank = bank;
426 b->cpu = cpu; 433 b->cpu = cpu;
427 b->address = address; 434 b->address = address;
428 b->interrupt_enable = 0; 435 b->interrupt_enable = 0;
429 b->threshold_limit = THRESHOLD_MAX; 436 b->threshold_limit = THRESHOLD_MAX;
430 437
431 INIT_LIST_HEAD(&b->miscj); 438 INIT_LIST_HEAD(&b->miscj);
432 439
433 if (per_cpu(threshold_banks, cpu)[bank]->blocks) 440 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
434 list_add(&b->miscj, 441 list_add(&b->miscj,
435 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); 442 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
436 else 443 } else {
437 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 444 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
445 }
438 446
439 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 447 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
440 per_cpu(threshold_banks, cpu)[bank]->kobj, 448 per_cpu(threshold_banks, cpu)[bank]->kobj,
@@ -447,8 +455,9 @@ recurse:
447 if (!address) 455 if (!address)
448 return 0; 456 return 0;
449 address += MCG_XBLK_ADDR; 457 address += MCG_XBLK_ADDR;
450 } else 458 } else {
451 ++address; 459 ++address;
460 }
452 461
453 err = allocate_threshold_blocks(cpu, bank, ++block, address); 462 err = allocate_threshold_blocks(cpu, bank, ++block, address);
454 if (err) 463 if (err)
@@ -500,13 +509,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
500 if (!b) 509 if (!b)
501 goto out; 510 goto out;
502 511
503 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 512 err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
504 b->kobj, name); 513 b->kobj, name);
505 if (err) 514 if (err)
506 goto out; 515 goto out;
507 516
508 cpumask_copy(b->cpus, cpu_core_mask(cpu)); 517 cpumask_copy(b->cpus, cpu_core_mask(cpu));
509 per_cpu(threshold_banks, cpu)[bank] = b; 518 per_cpu(threshold_banks, cpu)[bank] = b;
519
510 goto out; 520 goto out;
511 } 521 }
512#endif 522#endif
@@ -522,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
522 goto out; 532 goto out;
523 } 533 }
524 534
525 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 535 b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
526 if (!b->kobj) 536 if (!b->kobj)
527 goto out_free; 537 goto out_free;
528 538
@@ -542,7 +552,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
542 if (i == cpu) 552 if (i == cpu)
543 continue; 553 continue;
544 554
545 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 555 err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
546 b->kobj, name); 556 b->kobj, name);
547 if (err) 557 if (err)
548 goto out; 558 goto out;
@@ -605,15 +615,13 @@ static void deallocate_threshold_block(unsigned int cpu,
605 615
606static void threshold_remove_bank(unsigned int cpu, int bank) 616static void threshold_remove_bank(unsigned int cpu, int bank)
607{ 617{
608 int i = 0;
609 struct threshold_bank *b; 618 struct threshold_bank *b;
610 char name[32]; 619 char name[32];
620 int i = 0;
611 621
612 b = per_cpu(threshold_banks, cpu)[bank]; 622 b = per_cpu(threshold_banks, cpu)[bank];
613
614 if (!b) 623 if (!b)
615 return; 624 return;
616
617 if (!b->blocks) 625 if (!b->blocks)
618 goto free_out; 626 goto free_out;
619 627
@@ -622,8 +630,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
622#ifdef CONFIG_SMP 630#ifdef CONFIG_SMP
623 /* sibling symlink */ 631 /* sibling symlink */
624 if (shared_bank[bank] && b->blocks->cpu != cpu) { 632 if (shared_bank[bank] && b->blocks->cpu != cpu) {
625 sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); 633 sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
626 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
635
627 return; 636 return;
628 } 637 }
629#endif 638#endif
@@ -633,7 +642,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
633 if (i == cpu) 642 if (i == cpu)
634 continue; 643 continue;
635 644
636 sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); 645 sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
637 per_cpu(threshold_banks, i)[bank] = NULL; 646 per_cpu(threshold_banks, i)[bank] = NULL;
638 } 647 }
639 648
@@ -659,12 +668,9 @@ static void threshold_remove_device(unsigned int cpu)
659} 668}
660 669
661/* get notified when a cpu comes on/off */ 670/* get notified when a cpu comes on/off */
662static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, 671static void __cpuinit
663 unsigned int cpu) 672amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
664{ 673{
665 if (cpu >= NR_CPUS)
666 return;
667
668 switch (action) { 674 switch (action) {
669 case CPU_ONLINE: 675 case CPU_ONLINE:
670 case CPU_ONLINE_FROZEN: 676 case CPU_ONLINE_FROZEN:
@@ -686,11 +692,12 @@ static __init int threshold_init_device(void)
686 /* to hit CPUs online before the notifier is up */ 692 /* to hit CPUs online before the notifier is up */
687 for_each_online_cpu(lcpu) { 693 for_each_online_cpu(lcpu) {
688 int err = threshold_create_device(lcpu); 694 int err = threshold_create_device(lcpu);
695
689 if (err) 696 if (err)
690 return err; 697 return err;
691 } 698 }
692 threshold_cpu_callback = amd_64_threshold_cpu_callback; 699 threshold_cpu_callback = amd_64_threshold_cpu_callback;
700
693 return 0; 701 return 0;
694} 702}
695
696device_initcall(threshold_init_device); 703device_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index cef3ee30744b..e1acec0f7a32 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,85 +8,10 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <asm/processor.h>
12#include <asm/apic.h> 11#include <asm/apic.h>
12#include <asm/processor.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/mce.h> 14#include <asm/mce.h>
15#include <asm/hw_irq.h>
16#include <asm/idle.h>
17#include <asm/therm_throt.h>
18#include <asm/apic.h>
19
20asmlinkage void smp_thermal_interrupt(void)
21{
22 __u64 msr_val;
23
24 ack_APIC_irq();
25
26 exit_idle();
27 irq_enter();
28
29 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
30 if (therm_throt_process(msr_val & 1))
31 mce_log_therm_throt_event(msr_val);
32
33 inc_irq_stat(irq_thermal_count);
34 irq_exit();
35}
36
37static void intel_init_thermal(struct cpuinfo_x86 *c)
38{
39 u32 l, h;
40 int tm2 = 0;
41 unsigned int cpu = smp_processor_id();
42
43 if (!cpu_has(c, X86_FEATURE_ACPI))
44 return;
45
46 if (!cpu_has(c, X86_FEATURE_ACC))
47 return;
48
49 /* first check if TM1 is already enabled by the BIOS, in which
50 * case there might be some SMM goo which handles it, so we can't even
51 * put a handler since it might be delivered via SMI already.
52 */
53 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
54 h = apic_read(APIC_LVTTHMR);
55 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
56 printk(KERN_DEBUG
57 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
58 return;
59 }
60
61 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
62 tm2 = 1;
63
64 if (h & APIC_VECTOR_MASK) {
65 printk(KERN_DEBUG
66 "CPU%d: Thermal LVT vector (%#x) already "
67 "installed\n", cpu, (h & APIC_VECTOR_MASK));
68 return;
69 }
70
71 h = THERMAL_APIC_VECTOR;
72 h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
73 apic_write(APIC_LVTTHMR, h);
74
75 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
76 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
77
78 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
79 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
80
81 l = apic_read(APIC_LVTTHMR);
82 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
83 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
84 cpu, tm2 ? "TM2" : "TM1");
85
86 /* enable thermal throttle processing */
87 atomic_set(&therm_throt_en, 1);
88 return;
89}
90 15
91/* 16/*
92 * Support for Intel Correct Machine Check Interrupts. This allows 17 * Support for Intel Correct Machine Check Interrupts. This allows
@@ -109,6 +34,9 @@ static int cmci_supported(int *banks)
109{ 34{
110 u64 cap; 35 u64 cap;
111 36
37 if (mce_cmci_disabled || mce_ignore_ce)
38 return 0;
39
112 /* 40 /*
113 * Vendor check is not strictly needed, but the initial 41 * Vendor check is not strictly needed, but the initial
114 * initialization is vendor keyed and this 42 * initialization is vendor keyed and this
@@ -132,7 +60,7 @@ static int cmci_supported(int *banks)
132static void intel_threshold_interrupt(void) 60static void intel_threshold_interrupt(void)
133{ 61{
134 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 62 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
135 mce_notify_user(); 63 mce_notify_irq();
136} 64}
137 65
138static void print_update(char *type, int *hdr, int num) 66static void print_update(char *type, int *hdr, int num)
@@ -248,7 +176,7 @@ void cmci_rediscover(int dying)
248 return; 176 return;
249 cpumask_copy(old, &current->cpus_allowed); 177 cpumask_copy(old, &current->cpus_allowed);
250 178
251 for_each_online_cpu (cpu) { 179 for_each_online_cpu(cpu) {
252 if (cpu == dying) 180 if (cpu == dying)
253 continue; 181 continue;
254 if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) 182 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
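cmci_supported() now honours the mce=no_cmci and mce=ignore_ce boot options before touching any MSR. After those early-outs the probe itself is unchanged; a sketch of its shape, where MCG_CMCI_P is the CMCI capability bit in IA32_MCG_CAP and the low byte of that MSR carries the bank count (constant names as used by the surrounding code):

/* Sketch of the CMCI capability probe, after the new early-outs. */
static int cmci_supported_sketch(int *banks)
{
        u64 cap;

        if (mce_cmci_disabled || mce_ignore_ce)         /* mce= boot options */
                return 0;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
                return 0;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);

        return !!(cap & MCG_CMCI_P);                    /* CMCI advertised? */
}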
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index a74af128efc9..f5f2d6f71fb6 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -6,25 +6,23 @@
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
8 */ 8 */
9
10#include <linux/init.h>
11#include <linux/types.h>
12#include <linux/kernel.h>
13#include <linux/jiffies.h>
14#include <linux/workqueue.h>
15#include <linux/interrupt.h> 9#include <linux/interrupt.h>
16#include <linux/smp.h> 10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
17#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
18 17
19#include <asm/processor.h> 18#include <asm/processor.h>
20#include <asm/system.h> 19#include <asm/system.h>
20#include <asm/mce.h>
21#include <asm/msr.h> 21#include <asm/msr.h>
22 22
23#include "mce.h" 23static int firstbank;
24 24
25static int firstbank; 25#define MCE_RATE (15*HZ) /* timer rate is 15s */
26
27#define MCE_RATE 15*HZ /* timer rate is 15s */
28 26
29static void mce_checkregs(void *info) 27static void mce_checkregs(void *info)
30{ 28{
@@ -34,23 +32,24 @@ static void mce_checkregs(void *info)
34 for (i = firstbank; i < nr_mce_banks; i++) { 32 for (i = firstbank; i < nr_mce_banks; i++) {
35 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); 33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
36 34
37 if (high & (1<<31)) { 35 if (!(high & (1<<31)))
38 printk(KERN_INFO "MCE: The hardware reports a non " 36 continue;
39 "fatal, correctable incident occurred on " 37
40 "CPU %d.\n", 38 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
39 "correctable incident occurred on CPU %d.\n",
41 smp_processor_id()); 40 smp_processor_id());
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); 41
43 42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
44 /* 43
45 * Scrub the error so we don't pick it up in MCE_RATE 44 /*
46 * seconds time. 45 * Scrub the error so we don't pick it up in MCE_RATE
47 */ 46 * seconds time:
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 47 */
49 48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
50 /* Serialize */ 49
51 wmb(); 50 /* Serialize: */
52 add_taint(TAINT_MACHINE_CHECK); 51 wmb();
53 } 52 add_taint(TAINT_MACHINE_CHECK);
54 } 53 }
55} 54}
56 55
@@ -77,16 +76,17 @@ static int __init init_nonfatal_mce_checker(void)
77 76
78 /* Some Athlons misbehave when we frob bank 0 */ 77 /* Some Athlons misbehave when we frob bank 0 */
79 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
80 boot_cpu_data.x86 == 6) 79 boot_cpu_data.x86 == 6)
81 firstbank = 1; 80 firstbank = 1;
82 else 81 else
83 firstbank = 0; 82 firstbank = 0;
84 83
85 /* 84 /*
86 * Check for non-fatal errors every MCE_RATE s 85 * Check for non-fatal errors every MCE_RATE s
87 */ 86 */
88 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); 87 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
89 printk(KERN_INFO "Machine check exception polling timer started.\n"); 88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89
90 return 0; 90 return 0;
91} 91}
92module_init(init_nonfatal_mce_checker); 92module_init(init_nonfatal_mce_checker);
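The 15-second poll is driven by a self-re-arming delayed work item: mce_checkregs() runs on every CPU, then the work is scheduled again MCE_RATE jiffies out, with round_jiffies_relative() batching it alongside other timers. The work callback is untouched by this patch and looks roughly like this:

static void mce_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);

static void mce_work_fn(struct work_struct *work)
{
        on_each_cpu(mce_checkregs, NULL, 1);            /* poll the banks on every CPU */
        schedule_delayed_work(&mce_work,
                              round_jiffies_relative(MCE_RATE));       /* re-arm */
}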
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f53bdcbaf382..4482aea9aa2e 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -1,21 +1,14 @@
1/* 1/*
2 * P4 specific Machine Check Exception Reporting 2 * P4 specific Machine Check Exception Reporting
3 */ 3 */
4
5#include <linux/init.h>
6#include <linux/types.h>
7#include <linux/kernel.h> 4#include <linux/kernel.h>
8#include <linux/interrupt.h> 5#include <linux/types.h>
6#include <linux/init.h>
9#include <linux/smp.h> 7#include <linux/smp.h>
10 8
11#include <asm/processor.h> 9#include <asm/processor.h>
12#include <asm/system.h> 10#include <asm/mce.h>
13#include <asm/msr.h> 11#include <asm/msr.h>
14#include <asm/apic.h>
15
16#include <asm/therm_throt.h>
17
18#include "mce.h"
19 12
20/* as supported by the P4/Xeon family */ 13/* as supported by the P4/Xeon family */
21struct intel_mce_extended_msrs { 14struct intel_mce_extended_msrs {
@@ -34,98 +27,8 @@ struct intel_mce_extended_msrs {
34 27
35static int mce_num_extended_msrs; 28static int mce_num_extended_msrs;
36 29
37
38#ifdef CONFIG_X86_MCE_P4THERMAL
39static void unexpected_thermal_interrupt(struct pt_regs *regs)
40{
41 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
42 smp_processor_id());
43 add_taint(TAINT_MACHINE_CHECK);
44}
45
46/* P4/Xeon Thermal transition interrupt handler */
47static void intel_thermal_interrupt(struct pt_regs *regs)
48{
49 __u64 msr_val;
50
51 ack_APIC_irq();
52
53 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
54 therm_throt_process(msr_val & 0x1);
55}
56
57/* Thermal interrupt handler for this CPU setup */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
59
60void smp_thermal_interrupt(struct pt_regs *regs)
61{
62 irq_enter();
63 vendor_thermal_interrupt(regs);
64 __get_cpu_var(irq_stat).irq_thermal_count++;
65 irq_exit();
66}
67
68/* P4/Xeon Thermal regulation detect and init */
69static void intel_init_thermal(struct cpuinfo_x86 *c)
70{
71 u32 l, h;
72 unsigned int cpu = smp_processor_id();
73
74 /* Thermal monitoring */
75 if (!cpu_has(c, X86_FEATURE_ACPI))
76 return; /* -ENODEV */
77
78 /* Clock modulation */
79 if (!cpu_has(c, X86_FEATURE_ACC))
80 return; /* -ENODEV */
81
82 /* first check if its enabled already, in which case there might
83 * be some SMM goo which handles it, so we can't even put a handler
84 * since it might be delivered via SMI already -zwanem.
85 */
86 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
87 h = apic_read(APIC_LVTTHMR);
88 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
89 printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
90 cpu);
91 return; /* -EBUSY */
92 }
93
94 /* check whether a vector already exists, temporarily masked? */
95 if (h & APIC_VECTOR_MASK) {
96 printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
97 "installed\n",
98 cpu, (h & APIC_VECTOR_MASK));
99 return; /* -EBUSY */
100 }
101
102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write(APIC_LVTTHMR, h);
106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
109
110 /* ok we're good to go... */
111 vendor_thermal_interrupt = intel_thermal_interrupt;
112
113 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
114 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
115
116 l = apic_read(APIC_LVTTHMR);
117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119
120 /* enable thermal throttle processing */
121 atomic_set(&therm_throt_en, 1);
122 return;
123}
124#endif /* CONFIG_X86_MCE_P4THERMAL */
125
126
127/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ 30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
128static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) 31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
129{ 32{
130 u32 h; 33 u32 h;
131 34
@@ -143,9 +46,9 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
143 46
144static void intel_machine_check(struct pt_regs *regs, long error_code) 47static void intel_machine_check(struct pt_regs *regs, long error_code)
145{ 48{
146 int recover = 1;
147 u32 alow, ahigh, high, low; 49 u32 alow, ahigh, high, low;
148 u32 mcgstl, mcgsth; 50 u32 mcgstl, mcgsth;
51 int recover = 1;
149 int i; 52 int i;
150 53
151 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 54 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -157,7 +60,9 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
157 60
158 if (mce_num_extended_msrs > 0) { 61 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 62 struct intel_mce_extended_msrs dbg;
63
160 intel_get_extended_msrs(&dbg); 64 intel_get_extended_msrs(&dbg);
65
161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" 66 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" 67 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 68 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
@@ -171,6 +76,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
171 if (high & (1<<31)) { 76 if (high & (1<<31)) {
172 char misc[20]; 77 char misc[20];
173 char addr[24]; 78 char addr[24];
79
174 misc[0] = addr[0] = '\0'; 80 misc[0] = addr[0] = '\0';
175 if (high & (1<<29)) 81 if (high & (1<<29))
176 recover |= 1; 82 recover |= 1;
@@ -196,6 +102,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
196 panic("Unable to continue"); 102 panic("Unable to continue");
197 103
198 printk(KERN_EMERG "Attempting to continue.\n"); 104 printk(KERN_EMERG "Attempting to continue.\n");
105
199 /* 106 /*
200 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 107 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
201 * recoverable/continuable.This will allow BIOS to look at the MSRs 108 * recoverable/continuable.This will allow BIOS to look at the MSRs
@@ -217,7 +124,6 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
217 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 124 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
218} 125}
219 126
220
221void intel_p4_mcheck_init(struct cpuinfo_x86 *c) 127void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
222{ 128{
223 u32 l, h; 129 u32 l, h;
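The bit tests in intel_machine_check() are made against the high 32-bit word of each IA32_MCi_STATUS register, so the architectural bit numbers are offset by 32. Spelled out with hypothetical macro names (not definitions from the file):

/* High-word views of the IA32_MCi_STATUS flags tested above (bit - 32). */
#define MCI_STATUS_VAL_HI       (1U << 31)      /* bit 63: register contains valid data */
#define MCI_STATUS_UC_HI        (1U << 29)      /* bit 61: uncorrected error            */
#define MCI_STATUS_MISCV_HI     (1U << 27)      /* bit 59: MCi_MISC register is valid   */
#define MCI_STATUS_ADDRV_HI     (1U << 26)      /* bit 58: MCi_ADDR register is valid   */
#define MCI_STATUS_PCC_HI       (1U << 25)      /* bit 57: processor context corrupt    */

This is why (1<<29) feeds recover |= 1 (an uncorrected error) and (1<<25) feeds recover |= 2 (context corrupt, not safe to continue).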
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index c9f77ea69edc..5c0e6533d9bc 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -2,52 +2,67 @@
2 * P5 specific Machine Check Exception Reporting 2 * P5 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
13#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15 15
16#include "mce.h" 16/* By default disabled */
17int mce_p5_enabled __read_mostly;
17 18
18/* Machine check handler for Pentium class Intel */ 19/* Machine check handler for Pentium class Intel CPUs: */
19static void pentium_machine_check(struct pt_regs *regs, long error_code) 20static void pentium_machine_check(struct pt_regs *regs, long error_code)
20{ 21{
21 u32 loaddr, hi, lotype; 22 u32 loaddr, hi, lotype;
23
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 24 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
23 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); 25 rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
24 printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); 26
25 if (lotype&(1<<5)) 27 printk(KERN_EMERG
26 printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); 28 "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
29 smp_processor_id(), loaddr, lotype);
30
31 if (lotype & (1<<5)) {
32 printk(KERN_EMERG
33 "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
34 smp_processor_id());
35 }
36
27 add_taint(TAINT_MACHINE_CHECK); 37 add_taint(TAINT_MACHINE_CHECK);
28} 38}
29 39
30/* Set up machine check reporting for processors with Intel style MCE */ 40/* Set up machine check reporting for processors with Intel style MCE: */
31void intel_p5_mcheck_init(struct cpuinfo_x86 *c) 41void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
32{ 42{
33 u32 l, h; 43 u32 l, h;
34 44
35 /*Check for MCE support */ 45 /* Default P5 to off as its often misconnected: */
36 if (!cpu_has(c, X86_FEATURE_MCE)) 46 if (!mce_p5_enabled)
37 return; 47 return;
38 48
39 /* Default P5 to off as its often misconnected */ 49 /* Check for MCE support: */
40 if (mce_disabled != -1) 50 if (!cpu_has(c, X86_FEATURE_MCE))
41 return; 51 return;
52
42 machine_check_vector = pentium_machine_check; 53 machine_check_vector = pentium_machine_check;
54 /* Make sure the vector pointer is visible before we enable MCEs: */
43 wmb(); 55 wmb();
44 56
45 /* Read registers before enabling */ 57 /* Read registers before enabling: */
46 rdmsr(MSR_IA32_P5_MC_ADDR, l, h); 58 rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
47 rdmsr(MSR_IA32_P5_MC_TYPE, l, h); 59 rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
48 printk(KERN_INFO "Intel old style machine check architecture supported.\n"); 60 printk(KERN_INFO
61 "Intel old style machine check architecture supported.\n");
49 62
50 /* Enable MCE */ 63 /* Enable MCE: */
51 set_in_cr4(X86_CR4_MCE); 64 set_in_cr4(X86_CR4_MCE);
52 printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); 65 printk(KERN_INFO
66 "Intel old style machine check reporting enabled on CPU#%d.\n",
67 smp_processor_id());
53} 68}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 2ac52d7b434b..01e4f8178183 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -2,25 +2,23 @@
2 * P6 specific Machine Check Exception Reporting 2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10#include <linux/smp.h> 9#include <linux/smp.h>
11 10
12#include <asm/processor.h> 11#include <asm/processor.h>
13#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15 15
16#include "mce.h"
17
18/* Machine Check Handler For PII/PIII */ 16/* Machine Check Handler For PII/PIII */
19static void intel_machine_check(struct pt_regs *regs, long error_code) 17static void intel_machine_check(struct pt_regs *regs, long error_code)
20{ 18{
21 int recover = 1;
22 u32 alow, ahigh, high, low; 19 u32 alow, ahigh, high, low;
23 u32 mcgstl, mcgsth; 20 u32 mcgstl, mcgsth;
21 int recover = 1;
24 int i; 22 int i;
25 23
26 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
@@ -35,12 +33,16 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
35 if (high & (1<<31)) { 33 if (high & (1<<31)) {
36 char misc[20]; 34 char misc[20];
37 char addr[24]; 35 char addr[24];
38 misc[0] = addr[0] = '\0'; 36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
39 if (high & (1<<29)) 40 if (high & (1<<29))
40 recover |= 1; 41 recover |= 1;
41 if (high & (1<<25)) 42 if (high & (1<<25))
42 recover |= 2; 43 recover |= 2;
43 high &= ~(1<<31); 44 high &= ~(1<<31);
45
44 if (high & (1<<27)) { 46 if (high & (1<<27)) {
45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); 47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow); 48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
@@ -49,6 +51,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
50 snprintf(addr, 24, " at %08x%08x", ahigh, alow); 52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
51 } 53 }
54
52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", 55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr); 56 smp_processor_id(), i, high, low, misc, addr);
54 } 57 }
@@ -63,16 +66,17 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
63 /* 66 /*
64 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 67 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
65 * recoverable/continuable.This will allow BIOS to look at the MSRs 68 * recoverable/continuable.This will allow BIOS to look at the MSRs
66 * for errors if the OS could not log the error. 69 * for errors if the OS could not log the error:
67 */ 70 */
68 for (i = 0; i < nr_mce_banks; i++) { 71 for (i = 0; i < nr_mce_banks; i++) {
69 unsigned int msr; 72 unsigned int msr;
73
70 msr = MSR_IA32_MC0_STATUS+i*4; 74 msr = MSR_IA32_MC0_STATUS+i*4;
71 rdmsr(msr, low, high); 75 rdmsr(msr, low, high);
72 if (high & (1<<31)) { 76 if (high & (1<<31)) {
73 /* Clear it */ 77 /* Clear it: */
74 wrmsr(msr, 0UL, 0UL); 78 wrmsr(msr, 0UL, 0UL);
75 /* Serialize */ 79 /* Serialize: */
76 wmb(); 80 wmb();
77 add_taint(TAINT_MACHINE_CHECK); 81 add_taint(TAINT_MACHINE_CHECK);
78 } 82 }
@@ -81,7 +85,7 @@ static void intel_machine_check(struct pt_regs *regs, long error_code)
81 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); 85 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
82} 86}
83 87
84/* Set up machine check reporting for processors with Intel style MCE */ 88/* Set up machine check reporting for processors with Intel style MCE: */
85void intel_p6_mcheck_init(struct cpuinfo_x86 *c) 89void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
86{ 90{
87 u32 l, h; 91 u32 l, h;
@@ -97,6 +101,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
97 101
98 /* Ok machine check is available */ 102 /* Ok machine check is available */
99 machine_check_vector = intel_machine_check; 103 machine_check_vector = intel_machine_check;
104 /* Make sure the vector pointer is visible before we enable MCEs: */
100 wmb(); 105 wmb();
101 106
102 printk(KERN_INFO "Intel machine check architecture supported.\n"); 107 printk(KERN_INFO "Intel machine check architecture supported.\n");
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index d5ae2243f0b9..bff8dd191dd5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 *
3 * Thermal throttle event support code (such as syslog messaging and rate 2 * Thermal throttle event support code (such as syslog messaging and rate
4 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). 3 * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
4 *
5 * This allows consistent reporting of CPU thermal throttle events. 5 * This allows consistent reporting of CPU thermal throttle events.
6 * 6 *
7 * Maintains a counter in /sys that keeps track of the number of thermal 7 * Maintains a counter in /sys that keeps track of the number of thermal
@@ -13,43 +13,53 @@
13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. 13 * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
14 * Inspired by Ross Biro's and Al Borchers' counter code. 14 * Inspired by Ross Biro's and Al Borchers' counter code.
15 */ 15 */
16 16#include <linux/interrupt.h>
17#include <linux/notifier.h>
18#include <linux/jiffies.h>
19#include <linux/kernel.h>
17#include <linux/percpu.h> 20#include <linux/percpu.h>
18#include <linux/sysdev.h> 21#include <linux/sysdev.h>
22#include <linux/types.h>
23#include <linux/init.h>
24#include <linux/smp.h>
19#include <linux/cpu.h> 25#include <linux/cpu.h>
20#include <asm/cpu.h> 26
21#include <linux/notifier.h> 27#include <asm/processor.h>
22#include <linux/jiffies.h> 28#include <asm/system.h>
23#include <asm/therm_throt.h> 29#include <asm/apic.h>
30#include <asm/idle.h>
31#include <asm/mce.h>
32#include <asm/msr.h>
24 33
25/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
26#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
27 36
28static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
29static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
30atomic_t therm_throt_en = ATOMIC_INIT(0); 39
40static atomic_t therm_throt_en = ATOMIC_INIT(0);
31 41
32#ifdef CONFIG_SYSFS 42#ifdef CONFIG_SYSFS
33#define define_therm_throt_sysdev_one_ro(_name) \ 43#define define_therm_throt_sysdev_one_ro(_name) \
34 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 44 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
35 45
36#define define_therm_throt_sysdev_show_func(name) \ 46#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 47static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \ 48 struct sysdev_attribute *attr, \
39 char *buf) \ 49 char *buf) \
40{ \ 50{ \
41 unsigned int cpu = dev->id; \ 51 unsigned int cpu = dev->id; \
42 ssize_t ret; \ 52 ssize_t ret; \
43 \ 53 \
44 preempt_disable(); /* CPU hotplug */ \ 54 preempt_disable(); /* CPU hotplug */ \
45 if (cpu_online(cpu)) \ 55 if (cpu_online(cpu)) \
46 ret = sprintf(buf, "%lu\n", \ 56 ret = sprintf(buf, "%lu\n", \
47 per_cpu(thermal_throttle_##name, cpu)); \ 57 per_cpu(thermal_throttle_##name, cpu)); \
48 else \ 58 else \
49 ret = 0; \ 59 ret = 0; \
50 preempt_enable(); \ 60 preempt_enable(); \
51 \ 61 \
52 return ret; \ 62 return ret; \
53} 63}
54 64
55define_therm_throt_sysdev_show_func(count); 65define_therm_throt_sysdev_show_func(count);
@@ -61,8 +71,8 @@ static struct attribute *thermal_throttle_attrs[] = {
61}; 71};
62 72
63static struct attribute_group thermal_throttle_attr_group = { 73static struct attribute_group thermal_throttle_attr_group = {
64 .attrs = thermal_throttle_attrs, 74 .attrs = thermal_throttle_attrs,
65 .name = "thermal_throttle" 75 .name = "thermal_throttle"
66}; 76};
67#endif /* CONFIG_SYSFS */ 77#endif /* CONFIG_SYSFS */
68 78
@@ -82,7 +92,7 @@ static struct attribute_group thermal_throttle_attr_group = {
82 * 1 : Event should be logged further, and a message has been 92 * 1 : Event should be logged further, and a message has been
83 * printed to the syslog. 93 * printed to the syslog.
84 */ 94 */
85int therm_throt_process(int curr) 95static int therm_throt_process(int curr)
86{ 96{
87 unsigned int cpu = smp_processor_id(); 97 unsigned int cpu = smp_processor_id();
88 __u64 tmp_jiffs = get_jiffies_64(); 98 __u64 tmp_jiffs = get_jiffies_64();
@@ -110,10 +120,11 @@ int therm_throt_process(int curr)
110} 120}
111 121
112#ifdef CONFIG_SYSFS 122#ifdef CONFIG_SYSFS
113/* Add/Remove thermal_throttle interface for CPU device */ 123/* Add/Remove thermal_throttle interface for CPU device: */
114static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 124static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115{ 125{
116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 126 return sysfs_create_group(&sys_dev->kobj,
127 &thermal_throttle_attr_group);
117} 128}
118 129
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 130static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
@@ -121,19 +132,21 @@ static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
121 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 132 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
122} 133}
123 134
124/* Mutex protecting device creation against CPU hotplug */ 135/* Mutex protecting device creation against CPU hotplug: */
125static DEFINE_MUTEX(therm_cpu_lock); 136static DEFINE_MUTEX(therm_cpu_lock);
126 137
127/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 138/* Get notified when a cpu comes on/off. Be hotplug friendly. */
128static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, 139static __cpuinit int
129 unsigned long action, 140thermal_throttle_cpu_callback(struct notifier_block *nfb,
130 void *hcpu) 141 unsigned long action,
142 void *hcpu)
131{ 143{
132 unsigned int cpu = (unsigned long)hcpu; 144 unsigned int cpu = (unsigned long)hcpu;
133 struct sys_device *sys_dev; 145 struct sys_device *sys_dev;
134 int err = 0; 146 int err = 0;
135 147
136 sys_dev = get_cpu_sysdev(cpu); 148 sys_dev = get_cpu_sysdev(cpu);
149
137 switch (action) { 150 switch (action) {
138 case CPU_UP_PREPARE: 151 case CPU_UP_PREPARE:
139 case CPU_UP_PREPARE_FROZEN: 152 case CPU_UP_PREPARE_FROZEN:
@@ -183,6 +196,94 @@ static __init int thermal_throttle_init_device(void)
183 196
184 return 0; 197 return 0;
185} 198}
186
187device_initcall(thermal_throttle_init_device); 199device_initcall(thermal_throttle_init_device);
200
188#endif /* CONFIG_SYSFS */ 201#endif /* CONFIG_SYSFS */
202
203/* Thermal transition interrupt handler */
204static void intel_thermal_interrupt(void)
205{
206 __u64 msr_val;
207
208 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
209 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT))
210 mce_log_therm_throt_event(msr_val);
211}
212
213static void unexpected_thermal_interrupt(void)
214{
215 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
216 smp_processor_id());
217 add_taint(TAINT_MACHINE_CHECK);
218}
219
220static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
221
222asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
223{
224 exit_idle();
225 irq_enter();
226 inc_irq_stat(irq_thermal_count);
227 smp_thermal_vector();
228 irq_exit();
229 /* Ack only at the end to avoid potential reentry */
230 ack_APIC_irq();
231}
232
233void intel_init_thermal(struct cpuinfo_x86 *c)
234{
235 unsigned int cpu = smp_processor_id();
236 int tm2 = 0;
237 u32 l, h;
238
239 /* Thermal monitoring depends on ACPI and clock modulation*/
240 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
241 return;
242
243 /*
244 * First check if its enabled already, in which case there might
245 * be some SMM goo which handles it, so we can't even put a handler
246 * since it might be delivered via SMI already:
247 */
248 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
249 h = apic_read(APIC_LVTTHMR);
250 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
251 printk(KERN_DEBUG
252 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
253 return;
254 }
255
256 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
257 tm2 = 1;
258
259 /* Check whether a vector already exists */
260 if (h & APIC_VECTOR_MASK) {
261 printk(KERN_DEBUG
262 "CPU%d: Thermal LVT vector (%#x) already installed\n",
263 cpu, (h & APIC_VECTOR_MASK));
264 return;
265 }
266
267 /* We'll mask the thermal vector in the lapic till we're ready: */
268 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
269 apic_write(APIC_LVTTHMR, h);
270
271 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
272 wrmsr(MSR_IA32_THERM_INTERRUPT,
273 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
274
275 smp_thermal_vector = intel_thermal_interrupt;
276
277 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
278 wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
279
280 /* Unmask the thermal vector: */
281 l = apic_read(APIC_LVTTHMR);
282 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
283
284 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
285 cpu, tm2 ? "TM2" : "TM1");
286
287 /* enable thermal throttle processing */
288 atomic_set(&therm_throt_en, 1);
289}
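therm_throt_process() itself is only made static by this move; it counts every throttle event in the per-CPU thermal_throttle_count but rate-limits syslog output to one message per CPU per CHECK_INTERVAL (300 s) via the per-CPU next_check timestamp. A sketch of that rate-limiting logic (simplified; the real function also taints the kernel and reports when the temperature returns to normal):

/* Sketch of the per-CPU rate limiting in therm_throt_process(). */
static int therm_throt_process_sketch(int curr)
{
        unsigned int cpu = smp_processor_id();
        __u64 now = get_jiffies_64();

        if (curr)
                __get_cpu_var(thermal_throttle_count)++;        /* always count */

        if (time_before64(now, __get_cpu_var(next_check)))
                return 0;                                       /* rate-limited */

        __get_cpu_var(next_check) = now + CHECK_INTERVAL;

        if (curr) {
                printk(KERN_CRIT "CPU%d: Temperature above threshold, "
                       "cpu clock throttled (total events = %lu)\n",
                       cpu, __get_cpu_var(thermal_throttle_count));
                return 1;       /* caller then logs an MCE record */
        }

        return 0;
}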
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 23ee9e730f78..d746df2909c9 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -17,7 +17,7 @@ static void default_threshold_interrupt(void)
17 17
18void (*mce_threshold_vector)(void) = default_threshold_interrupt; 18void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void mce_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle(); 22 exit_idle();
23 irq_enter(); 23 irq_enter();
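The renamed smp_threshold_interrupt() entry point (now following the smp_*_interrupt naming used by the other APIC vectors in this series) still only indirects through mce_threshold_vector; the vendor code installs the real handler during feature init, as the AMD and Intel files earlier in this patch do. Roughly:

        /* In mce_amd_feature_init(): */
        mce_threshold_vector = amd_threshold_interrupt;

        /* In the Intel CMCI init path: */
        mce_threshold_vector = intel_threshold_interrupt;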
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2a043d89811d..54060f565974 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -2,19 +2,17 @@
2 * IDT Winchip specific Machine Check Exception Reporting 2 * IDT Winchip specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> 3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */ 4 */
5
6#include <linux/init.h>
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
10 9
11#include <asm/processor.h> 10#include <asm/processor.h>
12#include <asm/system.h> 11#include <asm/system.h>
12#include <asm/mce.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14 14
15#include "mce.h" 15/* Machine check handler for WinChip C6: */
16
17/* Machine check handler for WinChip C6 */
18static void winchip_machine_check(struct pt_regs *regs, long error_code) 16static void winchip_machine_check(struct pt_regs *regs, long error_code)
19{ 17{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 18 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
@@ -25,12 +23,18 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code)
25void winchip_mcheck_init(struct cpuinfo_x86 *c) 23void winchip_mcheck_init(struct cpuinfo_x86 *c)
26{ 24{
27 u32 lo, hi; 25 u32 lo, hi;
26
28 machine_check_vector = winchip_machine_check; 27 machine_check_vector = winchip_machine_check;
28 /* Make sure the vector pointer is visible before we enable MCEs: */
29 wmb(); 29 wmb();
30
30 rdmsr(MSR_IDT_FCR1, lo, hi); 31 rdmsr(MSR_IDT_FCR1, lo, hi);
31 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ 32 lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
32 lo &= ~(1<<4); /* Enable MCE */ 33 lo &= ~(1<<4); /* Enable MCE */
33 wrmsr(MSR_IDT_FCR1, lo, hi); 34 wrmsr(MSR_IDT_FCR1, lo, hi);
35
34 set_in_cr4(X86_CR4_MCE); 36 set_in_cr4(X86_CR4_MCE);
35 printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); 37
38 printk(KERN_INFO
39 "Winchip machine check reporting enabled on CPU#0.\n");
36} 40}