diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-24 07:22:19 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-24 07:22:19 -0400 |
commit | a43de489934cadcbc4cc08a6590fdcc833768461 (patch) | |
tree | 61b688a0017e696bd1133d48a7c302d66bfa8ae2 | |
parent | 6242258b6b472f8fdd8ed9b735cc1190c185d16d (diff) | |
parent | 40c36e2741d7fe1e66d6ec55477ba5fd19c9c5d2 (diff) |
Merge branch 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull ras fixes from Thomas Gleixner:
"A set of fixes for RAS/MCE:
- Improve the error message when the kernel cannot recover from a MCE
so the maximum amount of information gets provided.
- Individually check MCE recovery features on Skylake CPUs instead of
assuming none when the CAPID0 register does not advertise the
general ability for recovery.
- Prevent MCE from outputting inconsistent messages which first show an
error location and then claim that the source is unknown.
- Prevent overwriting MCi_STATUS in the attempt to gather more
information when a fatal MCE has already been detected. This leads
to empty status values in the printout and failing to react
promptly on the fatal event"
* 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Fix incorrect "Machine check from unknown source" message
x86/mce: Do not overwrite MCi_STATUS in mce_no_way_out()
x86/mce: Check for alternate indication of machine check recovery on Skylake
x86/mce: Improve error message when kernel cannot recover
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 44 | ||||
-rw-r--r-- | arch/x86/kernel/quirks.c | 11 |
3 files changed, 42 insertions, 18 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 5bbd06f38ff6..f34d89c01edc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -160,6 +160,11 @@ static struct severity { | |||
160 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), | 160 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), |
161 | USER | 161 | USER |
162 | ), | 162 | ), |
163 | MCESEV( | ||
164 | PANIC, "Data load in unrecoverable area of kernel", | ||
165 | SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), | ||
166 | KERNEL | ||
167 | ), | ||
163 | #endif | 168 | #endif |
164 | MCESEV( | 169 | MCESEV( |
165 | PANIC, "Action required: unknown MCACOD", | 170 | PANIC, "Action required: unknown MCACOD", |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e4cf6ff1c2e1..c102ad51025e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll); | |||
772 | static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, | 772 | static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, |
773 | struct pt_regs *regs) | 773 | struct pt_regs *regs) |
774 | { | 774 | { |
775 | int i, ret = 0; | ||
776 | char *tmp; | 775 | char *tmp; |
776 | int i; | ||
777 | 777 | ||
778 | for (i = 0; i < mca_cfg.banks; i++) { | 778 | for (i = 0; i < mca_cfg.banks; i++) { |
779 | m->status = mce_rdmsrl(msr_ops.status(i)); | 779 | m->status = mce_rdmsrl(msr_ops.status(i)); |
780 | if (m->status & MCI_STATUS_VAL) { | 780 | if (!(m->status & MCI_STATUS_VAL)) |
781 | __set_bit(i, validp); | 781 | continue; |
782 | if (quirk_no_way_out) | 782 | |
783 | quirk_no_way_out(i, m, regs); | 783 | __set_bit(i, validp); |
784 | } | 784 | if (quirk_no_way_out) |
785 | quirk_no_way_out(i, m, regs); | ||
785 | 786 | ||
786 | if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { | 787 | if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { |
788 | mce_read_aux(m, i); | ||
787 | *msg = tmp; | 789 | *msg = tmp; |
788 | ret = 1; | 790 | return 1; |
789 | } | 791 | } |
790 | } | 792 | } |
791 | return ret; | 793 | return 0; |
792 | } | 794 | } |
793 | 795 | ||
794 | /* | 796 | /* |
@@ -1205,13 +1207,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1205 | lmce = m.mcgstatus & MCG_STATUS_LMCES; | 1207 | lmce = m.mcgstatus & MCG_STATUS_LMCES; |
1206 | 1208 | ||
1207 | /* | 1209 | /* |
1210 | * Local machine check may already know that we have to panic. | ||
1211 | * Broadcast machine check begins rendezvous in mce_start() | ||
1208 | * Go through all banks in exclusion of the other CPUs. This way we | 1212 | * Go through all banks in exclusion of the other CPUs. This way we |
1209 | * don't report duplicated events on shared banks because the first one | 1213 | * don't report duplicated events on shared banks because the first one |
1210 | * to see it will clear it. If this is a Local MCE, then no need to | 1214 | * to see it will clear it. |
1211 | * perform rendezvous. | ||
1212 | */ | 1215 | */ |
1213 | if (!lmce) | 1216 | if (lmce) { |
1217 | if (no_way_out) | ||
1218 | mce_panic("Fatal local machine check", &m, msg); | ||
1219 | } else { | ||
1214 | order = mce_start(&no_way_out); | 1220 | order = mce_start(&no_way_out); |
1221 | } | ||
1215 | 1222 | ||
1216 | for (i = 0; i < cfg->banks; i++) { | 1223 | for (i = 0; i < cfg->banks; i++) { |
1217 | __clear_bit(i, toclear); | 1224 | __clear_bit(i, toclear); |
@@ -1287,12 +1294,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
1287 | no_way_out = worst >= MCE_PANIC_SEVERITY; | 1294 | no_way_out = worst >= MCE_PANIC_SEVERITY; |
1288 | } else { | 1295 | } else { |
1289 | /* | 1296 | /* |
1290 | * Local MCE skipped calling mce_reign() | 1297 | * If there was a fatal machine check we should have |
1291 | * If we found a fatal error, we need to panic here. | 1298 | * already called mce_panic earlier in this function. |
1299 | * Since we re-read the banks, we might have found | ||
1300 | * something new. Check again to see if we found a | ||
1301 | * fatal error. We call "mce_severity()" again to | ||
1302 | * make sure we have the right "msg". | ||
1292 | */ | 1303 | */ |
1293 | if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) | 1304 | if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { |
1294 | mce_panic("Machine check from unknown source", | 1305 | mce_severity(&m, cfg->tolerant, &msg, true); |
1295 | NULL, NULL); | 1306 | mce_panic("Local fatal machine check!", &m, msg); |
1307 | } | ||
1296 | } | 1308 | } |
1297 | 1309 | ||
1298 | /* | 1310 | /* |
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 697a4ce04308..736348ead421 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c | |||
@@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) | |||
645 | /* Skylake */ | 645 | /* Skylake */ |
646 | static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) | 646 | static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) |
647 | { | 647 | { |
648 | u32 capid0; | 648 | u32 capid0, capid5; |
649 | 649 | ||
650 | pci_read_config_dword(pdev, 0x84, &capid0); | 650 | pci_read_config_dword(pdev, 0x84, &capid0); |
651 | pci_read_config_dword(pdev, 0x98, &capid5); | ||
651 | 652 | ||
652 | if ((capid0 & 0xc0) == 0xc0) | 653 | /* |
654 | * CAPID0{7:6} indicate whether this is an advanced RAS SKU | ||
655 | * CAPID5{8:5} indicate that various NVDIMM usage modes are | ||
656 | * enabled, so memory machine check recovery is also enabled. | ||
657 | */ | ||
658 | if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) | ||
653 | static_branch_inc(&mcsafe_key); | 659 | static_branch_inc(&mcsafe_key); |
660 | |||
654 | } | 661 | } |
655 | DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); | 662 | DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); |
656 | DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); | 663 | DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); |