-rw-r--r--  arch/x86/include/asm/mce.h                 |  10
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c        |  16
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c  | 205
3 files changed, 228 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6fc5e07eca4f..563933e06a35 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -105,8 +105,16 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
 #endif
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -115,6 +123,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
+extern int mce_available(struct cpuinfo_x86 *c);
+
 void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index a8ff38bfa6ed..bfbd5323a635 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 	panic(msg);
 }
 
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mce_dont_init)
 		return 0;
@@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 static void mce_disable_cpu(void *h)
 {
 	int i;
+	unsigned long action = *(unsigned long *)h;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_clear();
 	for (i = 0; i < banks; i++)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 }
@@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h)
 static void mce_reenable_cpu(void *h)
 {
 	int i;
+	unsigned long action = *(unsigned long *)h;
 
 	if (!mce_available(&current_cpu_data))
 		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_reenable();
 	for (i = 0; i < banks; i++)
 		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
 }
@@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		del_timer_sync(t);
-		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
+		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		t->expires = round_jiffies_relative(jiffies + next_interval);
 		add_timer_on(t, cpu);
-		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
+		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+		break;
+	case CPU_POST_DEAD:
+		/* intentionally ignoring frozen here */
+		cmci_rediscover(cpu);
 		break;
 	}
 	return NOTIFY_OK;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 1b1491a76b55..a518ec8c6f89 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
 /*
  * Intel specific MCE features.
  * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -12,6 +14,7 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
+#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
@@ -84,7 +87,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	return;
 }
 
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static __cpuinit int cmci_supported(int *banks)
+{
+	u64 cap;
+
+	/*
+	 * Vendor check is not strictly needed, but the initial
+	 * initialization is vendor keyed and this
+	 * makes sure none of the backdoors are entered otherwise.
+	 */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+	if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+		return 0;
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+	return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+	if (*hdr == 0)
+		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+	*hdr = 1;
+	printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static __cpuinit void cmci_discover(int banks, int boot)
+{
+	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+	int hdr = 0;
+	int i;
+
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		u64 val;
+
+		if (test_bit(i, owned))
+			continue;
+
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Already owned by someone else? */
+		if (val & CMCI_EN) {
+			if (test_and_clear_bit(i, owned) || boot)
+				print_update("SHD", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+			continue;
+		}
+
+		val |= CMCI_EN | CMCI_THRESHOLD;
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Did the enable bit stick? -- the bank supports CMCI */
+		if (val & CMCI_EN) {
+			if (!test_and_set_bit(i, owned) || boot)
+				print_update("CMCI", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+		} else {
+			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+		}
+	}
+	spin_unlock(&cmci_discover_lock);
+	if (hdr)
+		printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+__cpuinit void cmci_recheck(void)
+{
+	unsigned long flags;
+	int banks;
+
+	if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+		return;
+	local_irq_save(flags);
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void __cpuexit cmci_clear(void)
+{
+	int i;
+	int banks;
+	u64 val;
+
+	if (!cmci_supported(&banks))
+		return;
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+			continue;
+		/* Disable CMCI */
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		__clear_bit(i, __get_cpu_var(mce_banks_owned));
+	}
+	spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void __cpuexit cmci_rediscover(int dying)
+{
+	int banks;
+	int cpu;
+	cpumask_var_t old;
+
+	if (!cmci_supported(&banks))
+		return;
+	if (!alloc_cpumask_var(&old, GFP_KERNEL))
+		return;
+	cpumask_copy(old, &current->cpus_allowed);
+
+	for_each_online_cpu (cpu) {
+		if (cpu == dying)
+			continue;
+		if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+			continue;
+		/* Recheck banks in case CPUs don't all have the same */
+		if (cmci_supported(&banks))
+			cmci_discover(banks, 0);
+	}
+
+	set_cpus_allowed_ptr(current, old);
+	free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+	int banks;
+	if (cmci_supported(&banks))
+		cmci_discover(banks, 0);
+}
+
+static __cpuinit void intel_init_cmci(void)
+{
+	int banks;
+
+	if (!cmci_supported(&banks))
+		return;
+
+	mce_threshold_vector = intel_threshold_interrupt;
+	cmci_discover(banks, 1);
+	/*
+	 * For CPU #0 this runs with still disabled APIC, but that's
+	 * ok because only the vector is set up. We still do another
+	 * check for the banks later for CPU #0 just to make sure
+	 * to not miss any events.
+	 */
+	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+	cmci_recheck();
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
+	intel_init_cmci();
 }
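Note: the core of cmci_discover() above is a try-enable-and-read-back handshake per bank: if CMCI_EN is already set the bank is owned by another CPU sharing it; otherwise write CMCI_EN plus a threshold and read the MSR back, and only if the bit sticks does this CPU own the bank, with polling disabled for it. The following is a minimal userspace sketch of that handshake, not kernel code: fake_msr[], rdmsr_mock(), wrmsr_mock() and NUM_BANKS are invented stand-ins for the IA32_MCi_CTL2 MSR accesses, and the behaviour of banks 2 and 3 is fabricated purely to exercise the two fallback paths.

/*
 * Hypothetical userspace mock of the per-bank CMCI ownership handshake.
 * On real hardware the MSR reads/writes would be rdmsrl()/wrmsrl() on
 * MSR_IA32_MC0_CTL2 + i, as in the patch above.
 */
#include <stdio.h>
#include <stdint.h>

#define CMCI_EN        (1ULL << 30)   /* CMCI enable bit in IA32_MCi_CTL2 */
#define CMCI_THRESHOLD 1              /* interrupt after one corrected error */
#define NUM_BANKS      4              /* mock bank count for illustration */

/* Mock MSR state: pretend bank 2 is already owned by another CPU. */
static uint64_t fake_msr[NUM_BANKS] = { 0, 0, CMCI_EN | 1, 0 };

static uint64_t rdmsr_mock(int bank)
{
	return fake_msr[bank];
}

/* Bank 3 plays the role of a bank without CMCI: it never latches CMCI_EN. */
static void wrmsr_mock(int bank, uint64_t val)
{
	if (bank == 3)
		val &= ~CMCI_EN;
	fake_msr[bank] = val;
}

int main(void)
{
	for (int i = 0; i < NUM_BANKS; i++) {
		uint64_t val = rdmsr_mock(i);

		if (val & CMCI_EN) {
			/* Already claimed elsewhere: treat as a shared bank. */
			printf("bank %d: SHD (owned by another CPU)\n", i);
			continue;
		}

		/* Try to claim the bank, then check whether the bit stuck. */
		wrmsr_mock(i, val | CMCI_EN | CMCI_THRESHOLD);
		val = rdmsr_mock(i);

		if (val & CMCI_EN)
			printf("bank %d: CMCI (claimed by this CPU)\n", i);
		else
			printf("bank %d: no CMCI, keep polling\n", i);
	}
	return 0;
}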