aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
author: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> 2013-07-01 11:38:47 -0400
committer: Tony Luck <tony.luck@intel.com> 2013-07-08 14:53:01 -0400
commit: c3d1fb567a634dcdff4c6f6095b2053260988336 (patch)
tree: be2b0f39a3b489c4aa4b39d6c527be0c63146b92 /arch/x86
parent: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376 (diff)
mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
The Corrected Machine Check structure (CMC) in HEST has a flag which can be set by the firmware to indicate to the OS that it prefers to process the corrected error events first. In this scenario, the OS is expected to not monitor for corrected errors (through CMCI/polling). Instead, the firmware notifies the OS on corrected error events through GHES. Linux already has support for GHES. This patch adds support for parsing the CMC structure and for disabling CMCI/polling if the firmware first flag is set. Further, the list of machine check bank structures at the end of the CMC structure is used to determine which MCA banks function in firmware first (FF) mode, so that we continue to monitor error events on the other banks.
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/include/asm/mce.h 3
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-internal.h 3
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 28
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_intel.c 42
4 files changed, 66 insertions, 10 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fa5f71e021d5..9c91683ab5e6 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -188,6 +188,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
188 const char __user *ubuf, 188 const char __user *ubuf,
189 size_t usize, loff_t *off)); 189 size_t usize, loff_t *off));
190 190
191/* Disable CMCI/polling for MCA bank claimed by firmware */
192extern void mce_disable_bank(int bank);
193
191/* 194/*
192 * Exception handler 195 * Exception handler
193 */ 196 */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 5b7d4fa5d3b7..09edd0b65fef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void); 25struct dentry *mce_get_debugfs_dir(void);
26 26
27extern struct mce_bank *mce_banks; 27extern struct mce_bank *mce_banks;
28extern mce_banks_t mce_banks_ce_disabled;
28 29
29#ifdef CONFIG_X86_MCE_INTEL 30#ifdef CONFIG_X86_MCE_INTEL
30unsigned long mce_intel_adjust_timer(unsigned long interval); 31unsigned long mce_intel_adjust_timer(unsigned long interval);
31void mce_intel_cmci_poll(void); 32void mce_intel_cmci_poll(void);
32void mce_intel_hcpu_update(unsigned long cpu); 33void mce_intel_hcpu_update(unsigned long cpu);
34void cmci_disable_bank(int bank);
33#else 35#else
34# define mce_intel_adjust_timer mce_adjust_timer_default 36# define mce_intel_adjust_timer mce_adjust_timer_default
35static inline void mce_intel_cmci_poll(void) { } 37static inline void mce_intel_cmci_poll(void) { }
36static inline void mce_intel_hcpu_update(unsigned long cpu) { } 38static inline void mce_intel_hcpu_update(unsigned long cpu) { }
39static inline void cmci_disable_bank(int bank) { }
37#endif 40#endif
38 41
39void mce_timer_kick(unsigned long interval); 42void mce_timer_kick(unsigned long interval);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9239504b41cb..5bf32c70a9c0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -94,6 +94,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
94 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 94 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
95}; 95};
96 96
97/*
98 * MCA banks controlled through firmware first for corrected errors.
99 * This is a global list of banks for which we won't enable CMCI and we
100 * won't poll. Firmware controls these banks and is responsible for
101 * reporting corrected errors through GHES. Uncorrected/recoverable
102 * errors are still notified through a machine check.
103 */
104mce_banks_t mce_banks_ce_disabled;
105
97static DEFINE_PER_CPU(struct work_struct, mce_work); 106static DEFINE_PER_CPU(struct work_struct, mce_work);
98 107
99static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); 108static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
@@ -1932,6 +1941,25 @@ static struct miscdevice mce_chrdev_device = {
1932 &mce_chrdev_ops, 1941 &mce_chrdev_ops,
1933}; 1942};
1934 1943
1944static void __mce_disable_bank(void *arg)
1945{
1946 int bank = *((int *)arg);
1947 __clear_bit(bank, __get_cpu_var(mce_poll_banks));
1948 cmci_disable_bank(bank);
1949}
1950
1951void mce_disable_bank(int bank)
1952{
1953 if (bank >= mca_cfg.banks) {
1954 pr_warn(FW_BUG
1955 "Ignoring request to disable invalid MCA bank %d.\n",
1956 bank);
1957 return;
1958 }
1959 set_bit(bank, mce_banks_ce_disabled);
1960 on_each_cpu(__mce_disable_bank, &bank, 1);
1961}
1962
1935/* 1963/*
1936 * mce=off Disables machine check 1964 * mce=off Disables machine check
1937 * mce=no_cmci Disables CMCI 1965 * mce=no_cmci Disables CMCI
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index ae1697c2afe3..488eae3ec3e2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -191,6 +191,10 @@ static void cmci_discover(int banks)
191 if (test_bit(i, owned)) 191 if (test_bit(i, owned))
192 continue; 192 continue;
193 193
194 /* Skip banks in firmware first mode */
195 if (test_bit(i, mce_banks_ce_disabled))
196 continue;
197
194 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 198 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
195 199
196 /* Already owned by someone else? */ 200 /* Already owned by someone else? */
@@ -259,6 +263,19 @@ void cmci_recheck(void)
259 local_irq_restore(flags); 263 local_irq_restore(flags);
260} 264}
261 265
266/* Caller must hold the lock on cmci_discover_lock */
267static void __cmci_disable_bank(int bank)
268{
269 u64 val;
270
271 if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
272 return;
273 rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
274 val &= ~MCI_CTL2_CMCI_EN;
275 wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
276 __clear_bit(bank, __get_cpu_var(mce_banks_owned));
277}
278
262/* 279/*
263 * Disable CMCI on this CPU for all banks it owns when it goes down. 280 * Disable CMCI on this CPU for all banks it owns when it goes down.
264 * This allows other CPUs to claim the banks on rediscovery. 281 * This allows other CPUs to claim the banks on rediscovery.
@@ -268,20 +285,12 @@ void cmci_clear(void)
268 unsigned long flags; 285 unsigned long flags;
269 int i; 286 int i;
270 int banks; 287 int banks;
271 u64 val;
272 288
273 if (!cmci_supported(&banks)) 289 if (!cmci_supported(&banks))
274 return; 290 return;
275 raw_spin_lock_irqsave(&cmci_discover_lock, flags); 291 raw_spin_lock_irqsave(&cmci_discover_lock, flags);
276 for (i = 0; i < banks; i++) { 292 for (i = 0; i < banks; i++)
277 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 293 __cmci_disable_bank(i);
278 continue;
279 /* Disable CMCI */
280 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
281 val &= ~MCI_CTL2_CMCI_EN;
282 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
283 __clear_bit(i, __get_cpu_var(mce_banks_owned));
284 }
285 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 294 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
286} 295}
287 296
@@ -315,6 +324,19 @@ void cmci_reenable(void)
315 cmci_discover(banks); 324 cmci_discover(banks);
316} 325}
317 326
327void cmci_disable_bank(int bank)
328{
329 int banks;
330 unsigned long flags;
331
332 if (!cmci_supported(&banks))
333 return;
334
335 raw_spin_lock_irqsave(&cmci_discover_lock, flags);
336 __cmci_disable_bank(bank);
337 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
338}
339
318static void intel_init_cmci(void) 340static void intel_init_cmci(void)
319{ 341{
320 int banks; 342 int banks;