aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Yazen Ghannam <yazen.ghannam@amd.com> 2019-03-25 12:34:22 -0400
committer: Borislav Petkov <bp@suse.de> 2019-04-23 12:16:07 -0400
commit: 71a84402b93e5fbd8f817f40059c137e10171788 (patch)
tree: 09f7c0caf5c5175bf3f3e32cddccec64188bd084
parent: 45d4b7b9cb88526f6d5bd4c03efab88d75d10e4f (diff)
x86/MCE/AMD: Don't report L1 BTB MCA errors on some family 17h models
AMD family 17h Models 10h-2Fh may report a high number of L1 BTB MCA errors under certain conditions. The errors are benign and can safely be ignored. However, the high error rate may cause the MCA threshold counter to overflow causing a high rate of thresholding interrupts.

In addition, users may see the errors reported through the AMD MCE decoder module, even with the interrupt disabled, due to MCA polling.

Clear the "Counter Present" bit in the Instruction Fetch bank's MCA_MISC0 register. This will prevent enabling MCA thresholding on this bank which will prevent the high interrupt rate due to this error.

Define an AMD-specific function to filter these errors from the MCE event pool so that they don't get reported during early boot.

Rename filter function in EDAC/mce_amd to avoid a naming conflict, while at it.

[ bp: Move function prototype to the internal header and massage/cleanup, fix typos. ]

Reported-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "clemej@gmail.com" <clemej@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Pu Wen <puwen@hygon.cn>
Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: Shirish S <Shirish.S@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: x86-ml <x86@kernel.org>
Cc: <stable@vger.kernel.org> # 5.0.x: c95b323dcd35: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
Cc: <stable@vger.kernel.org> # 5.0.x: 30aa3d26edb0: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
Cc: <stable@vger.kernel.org> # 5.0.x: 9308fd407455: x86/MCE: Group AMD function prototypes in <asm/mce.h>
Cc: <stable@vger.kernel.org> # 5.0.x
Link: https://lkml.kernel.org/r/20190325163410.171021-2-Yazen.Ghannam@amd.com
-rw-r--r-- arch/x86/kernel/cpu/mce/amd.c | 52
-rw-r--r-- arch/x86/kernel/cpu/mce/core.c | 3
-rw-r--r-- arch/x86/kernel/cpu/mce/internal.h | 6
-rw-r--r-- drivers/edac/mce_amd.c | 4
4 files changed, 50 insertions, 15 deletions
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e64de5149e50..d904aafe6409 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -563,33 +563,59 @@ out:
563 return offset; 563 return offset;
564} 564}
565 565
566bool amd_filter_mce(struct mce *m)
567{
568 enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
569 struct cpuinfo_x86 *c = &boot_cpu_data;
570 u8 xec = (m->status >> 16) & 0x3F;
571
572 /* See Family 17h Models 10h-2Fh Erratum #1114. */
573 if (c->x86 == 0x17 &&
574 c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
575 bank_type == SMCA_IF && xec == 10)
576 return true;
577
578 return false;
579}
580
566/* 581/*
567 * Turn off MC4_MISC thresholding banks on all family 0x15 models since 582 * Turn off thresholding banks for the following conditions:
568 * they're not supported there. 583 * - MC4_MISC thresholding is not supported on Family 0x15.
584 * - Prevent possible spurious interrupts from the IF bank on Family 0x17
585 * Models 0x10-0x2F due to Erratum #1114.
569 */ 586 */
570void disable_err_thresholding(struct cpuinfo_x86 *c) 587void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
571{ 588{
572 int i; 589 int i, num_msrs;
573 u64 hwcr; 590 u64 hwcr;
574 bool need_toggle; 591 bool need_toggle;
575 u32 msrs[] = { 592 u32 msrs[NR_BLOCKS];
576 0x00000413, /* MC4_MISC0 */ 593
577 0xc0000408, /* MC4_MISC1 */ 594 if (c->x86 == 0x15 && bank == 4) {
578 }; 595 msrs[0] = 0x00000413; /* MC4_MISC0 */
596 msrs[1] = 0xc0000408; /* MC4_MISC1 */
597 num_msrs = 2;
598 } else if (c->x86 == 0x17 &&
599 (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
579 600
580 if (c->x86 != 0x15) 601 if (smca_get_bank_type(bank) != SMCA_IF)
602 return;
603
604 msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
605 num_msrs = 1;
606 } else {
581 return; 607 return;
608 }
582 609
583 rdmsrl(MSR_K7_HWCR, hwcr); 610 rdmsrl(MSR_K7_HWCR, hwcr);
584 611
585 /* McStatusWrEn has to be set */ 612 /* McStatusWrEn has to be set */
586 need_toggle = !(hwcr & BIT(18)); 613 need_toggle = !(hwcr & BIT(18));
587
588 if (need_toggle) 614 if (need_toggle)
589 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); 615 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
590 616
591 /* Clear CntP bit safely */ 617 /* Clear CntP bit safely */
592 for (i = 0; i < ARRAY_SIZE(msrs); i++) 618 for (i = 0; i < num_msrs; i++)
593 msr_clear_bit(msrs[i], 62); 619 msr_clear_bit(msrs[i], 62);
594 620
595 /* restore old settings */ 621 /* restore old settings */
@@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
604 unsigned int bank, block, cpu = smp_processor_id(); 630 unsigned int bank, block, cpu = smp_processor_id();
605 int offset = -1; 631 int offset = -1;
606 632
607 disable_err_thresholding(c);
608
609 for (bank = 0; bank < mca_cfg.banks; ++bank) { 633 for (bank = 0; bank < mca_cfg.banks; ++bank) {
610 if (mce_flags.smca) 634 if (mce_flags.smca)
611 smca_configure(bank, cpu); 635 smca_configure(bank, cpu);
612 636
637 disable_err_thresholding(c, bank);
638
613 for (block = 0; block < NR_BLOCKS; ++block) { 639 for (block = 0; block < NR_BLOCKS; ++block) {
614 address = get_block_address(address, low, high, bank, block); 640 address = get_block_address(address, low, high, bank, block);
615 if (!address) 641 if (!address)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 80b8c6bff8ed..5112a50e6486 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1777,6 +1777,9 @@ static void __mcheck_cpu_init_timer(void)
1777 1777
1778bool filter_mce(struct mce *m) 1778bool filter_mce(struct mce *m)
1779{ 1779{
1780 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1781 return amd_filter_mce(m);
1782
1780 return false; 1783 return false;
1781} 1784}
1782 1785
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b822a645395d..a34b55baa7aa 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -176,4 +176,10 @@ extern struct mca_msr_regs msr_ops;
176/* Decide whether to add MCE record to MCE event pool or filter it out. */ 176/* Decide whether to add MCE record to MCE event pool or filter it out. */
177extern bool filter_mce(struct mce *m); 177extern bool filter_mce(struct mce *m);
178 178
179#ifdef CONFIG_X86_MCE_AMD
180extern bool amd_filter_mce(struct mce *m);
181#else
182static inline bool amd_filter_mce(struct mce *m) { return false; };
183#endif
184
179#endif /* __X86_MCE_INTERNAL_H__ */ 185#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 0a1814dad6cf..bb0202ad7a13 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1004,7 +1004,7 @@ static inline void amd_decode_err_code(u16 ec)
1004/* 1004/*
1005 * Filter out unwanted MCE signatures here. 1005 * Filter out unwanted MCE signatures here.
1006 */ 1006 */
1007static bool amd_filter_mce(struct mce *m) 1007static bool ignore_mce(struct mce *m)
1008{ 1008{
1009 /* 1009 /*
1010 * NB GART TLB error reporting is disabled by default. 1010 * NB GART TLB error reporting is disabled by default.
@@ -1038,7 +1038,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1038 unsigned int fam = x86_family(m->cpuid); 1038 unsigned int fam = x86_family(m->cpuid);
1039 int ecc; 1039 int ecc;
1040 1040
1041 if (amd_filter_mce(m)) 1041 if (ignore_mce(m))
1042 return NOTIFY_STOP; 1042 return NOTIFY_STOP;
1043 1043
1044 pr_emerg(HW_ERR "%s\n", decode_error_status(m)); 1044 pr_emerg(HW_ERR "%s\n", decode_error_status(m));