aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2018-01-30 14:48:44 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-01-30 14:48:44 -0500
commit: a1c75e17e7d1306d35d51d3c330a13f42eba1d2d (patch)
tree: 8272fdb40804a572c36896c39728ab3e6d91c010
parent: d8b91dde38f4c43bd0bbbf17a90f735b16aaff2c (diff)
parent: 179eb850ac57c06edaed67fc744ba9d902172f96 (diff)
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Ingo Molnar:

 - various AMD SMCA error parsing/reporting improvements (Yazen Ghannam)

 - extend Intel CMCI error reporting to more cases (Xie XiuQi)

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/MCE: Make correctable error detection look at the Deferred bit
  x86/MCE: Report only DRAM ECC as memory errors on AMD systems
  x86/MCE/AMD: Define a function to get SMCA bank type
  x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems
  x86/MCE: Extend table to report action optional errors through CMCI too
-rw-r--r--arch/x86/include/asm/mce.h2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c17
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c29
4 files changed, 60 insertions, 14 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b1e8d8db921f..96ea4b5ba658 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -376,6 +376,7 @@ struct smca_bank {
376extern struct smca_bank smca_banks[MAX_NR_BANKS]; 376extern struct smca_bank smca_banks[MAX_NR_BANKS];
377 377
378extern const char *smca_get_long_name(enum smca_bank_types t); 378extern const char *smca_get_long_name(enum smca_bank_types t);
379extern bool amd_mce_is_memory_error(struct mce *m);
379 380
380extern int mce_threshold_create_device(unsigned int cpu); 381extern int mce_threshold_create_device(unsigned int cpu);
381extern int mce_threshold_remove_device(unsigned int cpu); 382extern int mce_threshold_remove_device(unsigned int cpu);
@@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
384 385
385static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; 386static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
386static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; 387static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
/*
 * Stub variant of amd_mce_is_memory_error(): ignores @m and always reports
 * "not a memory error".
 * NOTE(review): presumably selected when the AMD MCE code is not built;
 * confirm against the enclosing preprocessor conditional.
 */
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }
387 389
388#endif 390#endif
389 391
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 4ca632a06e0b..5bbd06f38ff6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -59,6 +59,7 @@ static struct severity {
59#define MCGMASK(x, y) .mcgmask = x, .mcgres = y 59#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
60#define MASK(x, y) .mask = x, .result = y 60#define MASK(x, y) .mask = x, .result = y
61#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 61#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
62#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
62#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 63#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
63#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) 64#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
64 65
@@ -101,6 +102,22 @@ static struct severity {
101 NOSER, BITCLR(MCI_STATUS_UC) 102 NOSER, BITCLR(MCI_STATUS_UC)
102 ), 103 ),
103 104
105 /*
106 * known AO MCACODs reported via MCE or CMC:
107 *
108 * SRAO could be signaled either via a machine check exception or
109 * CMCI with the corresponding bit S 1 or 0. So we don't need to
110 * check bit S for SRAO.
111 */
112 MCESEV(
113 AO, "Action optional: memory scrubbing error",
114 SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
115 ),
116 MCESEV(
117 AO, "Action optional: last level cache writeback error",
118 SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
119 ),
120
104 /* ignore OVER for UCNA */ 121 /* ignore OVER for UCNA */
105 MCESEV( 122 MCESEV(
106 UCNA, "Uncorrected no action required", 123 UCNA, "Uncorrected no action required",
@@ -149,15 +166,6 @@ static struct severity {
149 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) 166 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
150 ), 167 ),
151 168
152 /* known AO MCACODs: */
153 MCESEV(
154 AO, "Action optional: memory scrubbing error",
155 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
156 ),
157 MCESEV(
158 AO, "Action optional: last level cache writeback error",
159 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
160 ),
161 MCESEV( 169 MCESEV(
162 SOME, "Action optional: unknown MCACOD", 170 SOME, "Action optional: unknown MCACOD",
163 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) 171 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 868e412b4f0c..c3655e0fc156 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
503bool mce_is_memory_error(struct mce *m) 503bool mce_is_memory_error(struct mce *m)
504{ 504{
505 if (m->cpuvendor == X86_VENDOR_AMD) { 505 if (m->cpuvendor == X86_VENDOR_AMD) {
506 /* ErrCodeExt[20:16] */ 506 return amd_mce_is_memory_error(m);
507 u8 xec = (m->status >> 16) & 0x1f;
508 507
509 return (xec == 0x0 || xec == 0x8);
510 } else if (m->cpuvendor == X86_VENDOR_INTEL) { 508 } else if (m->cpuvendor == X86_VENDOR_INTEL) {
511 /* 509 /*
512 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes 510 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
@@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m)
530} 528}
531EXPORT_SYMBOL_GPL(mce_is_memory_error); 529EXPORT_SYMBOL_GPL(mce_is_memory_error);
532 530
531static bool mce_is_correctable(struct mce *m)
532{
533 if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
534 return false;
535
536 if (m->status & MCI_STATUS_UC)
537 return false;
538
539 return true;
540}
541
533static bool cec_add_mce(struct mce *m) 542static bool cec_add_mce(struct mce *m)
534{ 543{
535 if (!m) 544 if (!m)
@@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m)
537 546
538 /* We eat only correctable DRAM errors with usable addresses. */ 547 /* We eat only correctable DRAM errors with usable addresses. */
539 if (mce_is_memory_error(m) && 548 if (mce_is_memory_error(m) &&
540 !(m->status & MCI_STATUS_UC) && 549 mce_is_correctable(m) &&
541 mce_usable_address(m)) 550 mce_usable_address(m))
542 if (!cec_add_elem(m->addr >> PAGE_SHIFT)) 551 if (!cec_add_elem(m->addr >> PAGE_SHIFT))
543 return true; 552 return true;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 486f640b02ef..0f32ad242324 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
110} 110}
111EXPORT_SYMBOL_GPL(smca_get_long_name); 111EXPORT_SYMBOL_GPL(smca_get_long_name);
112 112
113static enum smca_bank_types smca_get_bank_type(struct mce *m)
114{
115 struct smca_bank *b;
116
117 if (m->bank >= N_SMCA_BANK_TYPES)
118 return N_SMCA_BANK_TYPES;
119
120 b = &smca_banks[m->bank];
121 if (!b->hwid)
122 return N_SMCA_BANK_TYPES;
123
124 return b->hwid->bank_type;
125}
126
113static struct smca_hwid smca_hwid_mcatypes[] = { 127static struct smca_hwid smca_hwid_mcatypes[] = {
114 /* { bank_type, hwid_mcatype, xec_bitmap } */ 128 /* { bank_type, hwid_mcatype, xec_bitmap } */
115 129
@@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
407 (deferred_error_int_vector != amd_deferred_error_interrupt)) 421 (deferred_error_int_vector != amd_deferred_error_interrupt))
408 deferred_error_int_vector = amd_deferred_error_interrupt; 422 deferred_error_int_vector = amd_deferred_error_interrupt;
409 423
410 low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; 424 if (!mce_flags.smca)
425 low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
426
411 wrmsr(MSR_CU_DEF_ERR, low, high); 427 wrmsr(MSR_CU_DEF_ERR, low, high);
412} 428}
413 429
@@ -738,6 +754,17 @@ out_err:
738} 754}
739EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); 755EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
740 756
757bool amd_mce_is_memory_error(struct mce *m)
758{
759 /* ErrCodeExt[20:16] */
760 u8 xec = (m->status >> 16) & 0x1f;
761
762 if (mce_flags.smca)
763 return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;
764
765 return m->bank == 4 && xec == 0x8;
766}
767
741static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) 768static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
742{ 769{
743 struct mce m; 770 struct mce m;