diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-30 14:48:44 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-30 14:48:44 -0500 |
commit | a1c75e17e7d1306d35d51d3c330a13f42eba1d2d (patch) | |
tree | 8272fdb40804a572c36896c39728ab3e6d91c010 | |
parent | d8b91dde38f4c43bd0bbbf17a90f735b16aaff2c (diff) | |
parent | 179eb850ac57c06edaed67fc744ba9d902172f96 (diff) |
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Ingo Molnar:
- various AMD SMCA error parsing/reporting improvements (Yazen Ghannam)
- extend Intel CMCI error reporting to more cases (Xie XiuQi)
* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/MCE: Make correctable error detection look at the Deferred bit
x86/MCE: Report only DRAM ECC as memory errors on AMD systems
x86/MCE/AMD: Define a function to get SMCA bank type
x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems
x86/MCE: Extend table to report action optional errors through CMCI too
-rw-r--r-- | arch/x86/include/asm/mce.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 26 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 17 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 |
4 files changed, 60 insertions, 14 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b1e8d8db921f..96ea4b5ba658 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -376,6 +376,7 @@ struct smca_bank { | |||
376 | extern struct smca_bank smca_banks[MAX_NR_BANKS]; | 376 | extern struct smca_bank smca_banks[MAX_NR_BANKS]; |
377 | 377 | ||
378 | extern const char *smca_get_long_name(enum smca_bank_types t); | 378 | extern const char *smca_get_long_name(enum smca_bank_types t); |
379 | extern bool amd_mce_is_memory_error(struct mce *m); | ||
379 | 380 | ||
380 | extern int mce_threshold_create_device(unsigned int cpu); | 381 | extern int mce_threshold_create_device(unsigned int cpu); |
381 | extern int mce_threshold_remove_device(unsigned int cpu); | 382 | extern int mce_threshold_remove_device(unsigned int cpu); |
@@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu); | |||
384 | 385 | ||
385 | static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; | 386 | static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; |
386 | static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; | 387 | static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; |
388 | static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; | ||
387 | 389 | ||
388 | #endif | 390 | #endif |
389 | 391 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 4ca632a06e0b..5bbd06f38ff6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -59,6 +59,7 @@ static struct severity { | |||
59 | #define MCGMASK(x, y) .mcgmask = x, .mcgres = y | 59 | #define MCGMASK(x, y) .mcgmask = x, .mcgres = y |
60 | #define MASK(x, y) .mask = x, .result = y | 60 | #define MASK(x, y) .mask = x, .result = y |
61 | #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) | 61 | #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) |
62 | #define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) | ||
62 | #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) | 63 | #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) |
63 | #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) | 64 | #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) |
64 | 65 | ||
@@ -101,6 +102,22 @@ static struct severity { | |||
101 | NOSER, BITCLR(MCI_STATUS_UC) | 102 | NOSER, BITCLR(MCI_STATUS_UC) |
102 | ), | 103 | ), |
103 | 104 | ||
105 | /* | ||
106 | * known AO MCACODs reported via MCE or CMC: | ||
107 | * | ||
108 | * SRAO can be signaled either via a machine check exception (S bit | ||
109 | * set) or via CMCI (S bit clear), so the S bit is deliberately left | ||
110 | * out of the mask for SRAO. | ||
111 | */ | ||
112 | MCESEV( | ||
113 | AO, "Action optional: memory scrubbing error", | ||
114 | SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) | ||
115 | ), | ||
116 | MCESEV( | ||
117 | AO, "Action optional: last level cache writeback error", | ||
118 | SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) | ||
119 | ), | ||
120 | |||
104 | /* ignore OVER for UCNA */ | 121 | /* ignore OVER for UCNA */ |
105 | MCESEV( | 122 | MCESEV( |
106 | UCNA, "Uncorrected no action required", | 123 | UCNA, "Uncorrected no action required", |
@@ -149,15 +166,6 @@ static struct severity { | |||
149 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) | 166 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) |
150 | ), | 167 | ), |
151 | 168 | ||
152 | /* known AO MCACODs: */ | ||
153 | MCESEV( | ||
154 | AO, "Action optional: memory scrubbing error", | ||
155 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) | ||
156 | ), | ||
157 | MCESEV( | ||
158 | AO, "Action optional: last level cache writeback error", | ||
159 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) | ||
160 | ), | ||
161 | MCESEV( | 169 | MCESEV( |
162 | SOME, "Action optional: unknown MCACOD", | 170 | SOME, "Action optional: unknown MCACOD", |
163 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) | 171 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 868e412b4f0c..c3655e0fc156 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m) | |||
503 | bool mce_is_memory_error(struct mce *m) | 503 | bool mce_is_memory_error(struct mce *m) |
504 | { | 504 | { |
505 | if (m->cpuvendor == X86_VENDOR_AMD) { | 505 | if (m->cpuvendor == X86_VENDOR_AMD) { |
506 | /* ErrCodeExt[20:16] */ | 506 | return amd_mce_is_memory_error(m); |
507 | u8 xec = (m->status >> 16) & 0x1f; | ||
508 | 507 | ||
509 | return (xec == 0x0 || xec == 0x8); | ||
510 | } else if (m->cpuvendor == X86_VENDOR_INTEL) { | 508 | } else if (m->cpuvendor == X86_VENDOR_INTEL) { |
511 | /* | 509 | /* |
512 | * Intel SDM Volume 3B - 15.9.2 Compound Error Codes | 510 | * Intel SDM Volume 3B - 15.9.2 Compound Error Codes |
@@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m) | |||
530 | } | 528 | } |
531 | EXPORT_SYMBOL_GPL(mce_is_memory_error); | 529 | EXPORT_SYMBOL_GPL(mce_is_memory_error); |
532 | 530 | ||
531 | static bool mce_is_correctable(struct mce *m) | ||
532 | { | ||
533 | if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) | ||
534 | return false; | ||
535 | |||
536 | if (m->status & MCI_STATUS_UC) | ||
537 | return false; | ||
538 | |||
539 | return true; | ||
540 | } | ||
541 | |||
533 | static bool cec_add_mce(struct mce *m) | 542 | static bool cec_add_mce(struct mce *m) |
534 | { | 543 | { |
535 | if (!m) | 544 | if (!m) |
@@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m) | |||
537 | 546 | ||
538 | /* We eat only correctable DRAM errors with usable addresses. */ | 547 | /* We eat only correctable DRAM errors with usable addresses. */ |
539 | if (mce_is_memory_error(m) && | 548 | if (mce_is_memory_error(m) && |
540 | !(m->status & MCI_STATUS_UC) && | 549 | mce_is_correctable(m) && |
541 | mce_usable_address(m)) | 550 | mce_usable_address(m)) |
542 | if (!cec_add_elem(m->addr >> PAGE_SHIFT)) | 551 | if (!cec_add_elem(m->addr >> PAGE_SHIFT)) |
543 | return true; | 552 | return true; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 486f640b02ef..0f32ad242324 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c | |||
@@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t) | |||
110 | } | 110 | } |
111 | EXPORT_SYMBOL_GPL(smca_get_long_name); | 111 | EXPORT_SYMBOL_GPL(smca_get_long_name); |
112 | 112 | ||
113 | static enum smca_bank_types smca_get_bank_type(struct mce *m) | ||
114 | { | ||
115 | struct smca_bank *b; | ||
116 | |||
117 | if (m->bank >= N_SMCA_BANK_TYPES) | ||
118 | return N_SMCA_BANK_TYPES; | ||
119 | |||
120 | b = &smca_banks[m->bank]; | ||
121 | if (!b->hwid) | ||
122 | return N_SMCA_BANK_TYPES; | ||
123 | |||
124 | return b->hwid->bank_type; | ||
125 | } | ||
126 | |||
113 | static struct smca_hwid smca_hwid_mcatypes[] = { | 127 | static struct smca_hwid smca_hwid_mcatypes[] = { |
114 | /* { bank_type, hwid_mcatype, xec_bitmap } */ | 128 | /* { bank_type, hwid_mcatype, xec_bitmap } */ |
115 | 129 | ||
@@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) | |||
407 | (deferred_error_int_vector != amd_deferred_error_interrupt)) | 421 | (deferred_error_int_vector != amd_deferred_error_interrupt)) |
408 | deferred_error_int_vector = amd_deferred_error_interrupt; | 422 | deferred_error_int_vector = amd_deferred_error_interrupt; |
409 | 423 | ||
410 | low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; | 424 | if (!mce_flags.smca) |
425 | low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; | ||
426 | |||
411 | wrmsr(MSR_CU_DEF_ERR, low, high); | 427 | wrmsr(MSR_CU_DEF_ERR, low, high); |
412 | } | 428 | } |
413 | 429 | ||
@@ -738,6 +754,17 @@ out_err: | |||
738 | } | 754 | } |
739 | EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); | 755 | EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); |
740 | 756 | ||
757 | bool amd_mce_is_memory_error(struct mce *m) | ||
758 | { | ||
759 | /* ErrCodeExt[20:16] */ | ||
760 | u8 xec = (m->status >> 16) & 0x1f; | ||
761 | |||
762 | if (mce_flags.smca) | ||
763 | return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0; | ||
764 | |||
765 | return m->bank == 4 && xec == 0x8; | ||
766 | } | ||
767 | |||
741 | static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) | 768 | static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) |
742 | { | 769 | { |
743 | struct mce m; | 770 | struct mce m; |