diff options
| -rw-r--r-- | arch/x86/include/asm/mce.h | 4 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 4 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 23 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 60 | ||||
| -rw-r--r-- | drivers/edac/mce_amd.h | 3 |
5 files changed, 78 insertions, 16 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 276392f121fb..51b26e895933 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
| @@ -34,6 +34,10 @@ | |||
| 34 | #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ | 34 | #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ |
| 35 | #define MCI_STATUS_AR (1ULL<<55) /* Action required */ | 35 | #define MCI_STATUS_AR (1ULL<<55) /* Action required */ |
| 36 | 36 | ||
| 37 | /* AMD-specific bits */ | ||
| 38 | #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ | ||
| 39 | #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ | ||
| 40 | |||
| 37 | /* | 41 | /* |
| 38 | * Note that the full MCACOD field of IA32_MCi_STATUS MSR is | 42 | * Note that the full MCACOD field of IA32_MCi_STATUS MSR is |
| 39 | * bits 15:0. But bit 12 is the 'F' bit, defined for corrected | 43 | * bits 15:0. But bit 12 is the 'F' bit, defined for corrected |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 09edd0b65fef..10b46906767f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
| @@ -3,6 +3,8 @@ | |||
| 3 | 3 | ||
| 4 | enum severity_level { | 4 | enum severity_level { |
| 5 | MCE_NO_SEVERITY, | 5 | MCE_NO_SEVERITY, |
| 6 | MCE_DEFERRED_SEVERITY, | ||
| 7 | MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, | ||
| 6 | MCE_KEEP_SEVERITY, | 8 | MCE_KEEP_SEVERITY, |
| 7 | MCE_SOME_SEVERITY, | 9 | MCE_SOME_SEVERITY, |
| 8 | MCE_AO_SEVERITY, | 10 | MCE_AO_SEVERITY, |
| @@ -21,7 +23,7 @@ struct mce_bank { | |||
| 21 | char attrname[ATTR_LEN]; /* attribute name */ | 23 | char attrname[ATTR_LEN]; /* attribute name */ |
| 22 | }; | 24 | }; |
| 23 | 25 | ||
| 24 | int mce_severity(struct mce *a, int tolerant, char **msg); | 26 | int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); |
| 25 | struct dentry *mce_get_debugfs_dir(void); | 27 | struct dentry *mce_get_debugfs_dir(void); |
| 26 | 28 | ||
| 27 | extern struct mce_bank *mce_banks; | 29 | extern struct mce_bank *mce_banks; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c370e1c4468b..8bb433043a7f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | 31 | ||
| 32 | enum context { IN_KERNEL = 1, IN_USER = 2 }; | 32 | enum context { IN_KERNEL = 1, IN_USER = 2 }; |
| 33 | enum ser { SER_REQUIRED = 1, NO_SER = 2 }; | 33 | enum ser { SER_REQUIRED = 1, NO_SER = 2 }; |
| 34 | enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; | ||
| 34 | 35 | ||
| 35 | static struct severity { | 36 | static struct severity { |
| 36 | u64 mask; | 37 | u64 mask; |
| @@ -40,6 +41,7 @@ static struct severity { | |||
| 40 | unsigned char mcgres; | 41 | unsigned char mcgres; |
| 41 | unsigned char ser; | 42 | unsigned char ser; |
| 42 | unsigned char context; | 43 | unsigned char context; |
| 44 | unsigned char excp; | ||
| 43 | unsigned char covered; | 45 | unsigned char covered; |
| 44 | char *msg; | 46 | char *msg; |
| 45 | } severities[] = { | 47 | } severities[] = { |
| @@ -48,6 +50,8 @@ static struct severity { | |||
| 48 | #define USER .context = IN_USER | 50 | #define USER .context = IN_USER |
| 49 | #define SER .ser = SER_REQUIRED | 51 | #define SER .ser = SER_REQUIRED |
| 50 | #define NOSER .ser = NO_SER | 52 | #define NOSER .ser = NO_SER |
| 53 | #define EXCP .excp = EXCP_CONTEXT | ||
| 54 | #define NOEXCP .excp = NO_EXCP | ||
| 51 | #define BITCLR(x) .mask = x, .result = 0 | 55 | #define BITCLR(x) .mask = x, .result = 0 |
| 52 | #define BITSET(x) .mask = x, .result = x | 56 | #define BITSET(x) .mask = x, .result = x |
| 53 | #define MCGMASK(x, y) .mcgmask = x, .mcgres = y | 57 | #define MCGMASK(x, y) .mcgmask = x, .mcgres = y |
| @@ -62,7 +66,7 @@ static struct severity { | |||
| 62 | ), | 66 | ), |
| 63 | MCESEV( | 67 | MCESEV( |
| 64 | NO, "Not enabled", | 68 | NO, "Not enabled", |
| 65 | BITCLR(MCI_STATUS_EN) | 69 | EXCP, BITCLR(MCI_STATUS_EN) |
| 66 | ), | 70 | ), |
| 67 | MCESEV( | 71 | MCESEV( |
| 68 | PANIC, "Processor context corrupt", | 72 | PANIC, "Processor context corrupt", |
| @@ -71,16 +75,20 @@ static struct severity { | |||
| 71 | /* When MCIP is not set something is very confused */ | 75 | /* When MCIP is not set something is very confused */ |
| 72 | MCESEV( | 76 | MCESEV( |
| 73 | PANIC, "MCIP not set in MCA handler", | 77 | PANIC, "MCIP not set in MCA handler", |
| 74 | MCGMASK(MCG_STATUS_MCIP, 0) | 78 | EXCP, MCGMASK(MCG_STATUS_MCIP, 0) |
| 75 | ), | 79 | ), |
| 76 | /* Neither return nor error IP -- no chance to recover -> PANIC */ | 80 | /* Neither return nor error IP -- no chance to recover -> PANIC */ |
| 77 | MCESEV( | 81 | MCESEV( |
| 78 | PANIC, "Neither restart nor error IP", | 82 | PANIC, "Neither restart nor error IP", |
| 79 | MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) | 83 | EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) |
| 80 | ), | 84 | ), |
| 81 | MCESEV( | 85 | MCESEV( |
| 82 | PANIC, "In kernel and no restart IP", | 86 | PANIC, "In kernel and no restart IP", |
| 83 | KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) | 87 | EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) |
| 88 | ), | ||
| 89 | MCESEV( | ||
| 90 | DEFERRED, "Deferred error", | ||
| 91 | NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) | ||
| 84 | ), | 92 | ), |
| 85 | MCESEV( | 93 | MCESEV( |
| 86 | KEEP, "Corrected error", | 94 | KEEP, "Corrected error", |
| @@ -89,7 +97,7 @@ static struct severity { | |||
| 89 | 97 | ||
| 90 | /* ignore OVER for UCNA */ | 98 | /* ignore OVER for UCNA */ |
| 91 | MCESEV( | 99 | MCESEV( |
| 92 | KEEP, "Uncorrected no action required", | 100 | UCNA, "Uncorrected no action required", |
| 93 | SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) | 101 | SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) |
| 94 | ), | 102 | ), |
| 95 | MCESEV( | 103 | MCESEV( |
| @@ -178,8 +186,9 @@ static int error_context(struct mce *m) | |||
| 178 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | 186 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; |
| 179 | } | 187 | } |
| 180 | 188 | ||
| 181 | int mce_severity(struct mce *m, int tolerant, char **msg) | 189 | int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) |
| 182 | { | 190 | { |
| 191 | enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); | ||
| 183 | enum context ctx = error_context(m); | 192 | enum context ctx = error_context(m); |
| 184 | struct severity *s; | 193 | struct severity *s; |
| 185 | 194 | ||
| @@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg) | |||
| 194 | continue; | 203 | continue; |
| 195 | if (s->context && ctx != s->context) | 204 | if (s->context && ctx != s->context) |
| 196 | continue; | 205 | continue; |
| 206 | if (s->excp && excp != s->excp) | ||
| 207 | continue; | ||
| 197 | if (msg) | 208 | if (msg) |
| 198 | *msg = s->msg; | 209 | *msg = s->msg; |
| 199 | s->covered = 1; | 210 | s->covered = 1; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebfd..cfb16f631d52 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
| @@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i) | |||
| 575 | } | 575 | } |
| 576 | } | 576 | } |
| 577 | 577 | ||
| 578 | static bool memory_error(struct mce *m) | ||
| 579 | { | ||
| 580 | struct cpuinfo_x86 *c = &boot_cpu_data; | ||
| 581 | |||
| 582 | if (c->x86_vendor == X86_VENDOR_AMD) { | ||
| 583 | /* | ||
| 584 | * coming soon | ||
| 585 | */ | ||
| 586 | return false; | ||
| 587 | } else if (c->x86_vendor == X86_VENDOR_INTEL) { | ||
| 588 | /* | ||
| 589 | * Intel SDM Volume 3B - 15.9.2 Compound Error Codes | ||
| 590 | * | ||
| 591 | * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for | ||
| 592 | * indicating a memory error. Bit 8 is used for indicating a | ||
| 593 | * cache hierarchy error. The combination of bit 2 and bit 3 | ||
| 594 | * is used for indicating a `generic' cache hierarchy error. | ||
| 595 | * But we can't just blindly check the above bits, because if | ||
| 596 | * bit 11 is set, then it is a bus/interconnect error - and | ||
| 597 | * either way the above bits just give more detail on what | ||
| 598 | * bus/interconnect error happened. Note that bit 12 can be | ||
| 599 | * ignored, as it's the "filter" bit. | ||
| 600 | */ | ||
| 601 | return (m->status & 0xef80) == BIT(7) || | ||
| 602 | (m->status & 0xef00) == BIT(8) || | ||
| 603 | (m->status & 0xeffc) == 0xc; | ||
| 604 | } | ||
| 605 | |||
| 606 | return false; | ||
| 607 | } | ||
| 608 | |||
| 578 | DEFINE_PER_CPU(unsigned, mce_poll_count); | 609 | DEFINE_PER_CPU(unsigned, mce_poll_count); |
| 579 | 610 | ||
| 580 | /* | 611 | /* |
| @@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); | |||
| 595 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | 626 | void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) |
| 596 | { | 627 | { |
| 597 | struct mce m; | 628 | struct mce m; |
| 629 | int severity; | ||
| 598 | int i; | 630 | int i; |
| 599 | 631 | ||
| 600 | this_cpu_inc(mce_poll_count); | 632 | this_cpu_inc(mce_poll_count); |
| @@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) | |||
| 630 | 662 | ||
| 631 | if (!(flags & MCP_TIMESTAMP)) | 663 | if (!(flags & MCP_TIMESTAMP)) |
| 632 | m.tsc = 0; | 664 | m.tsc = 0; |
| 665 | |||
| 666 | severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); | ||
| 667 | |||
| 668 | /* | ||
| 669 | * In the cases where we don't have a valid address after all, | ||
| 670 | * do not add it into the ring buffer. | ||
| 671 | */ | ||
| 672 | if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { | ||
| 673 | if (m.status & MCI_STATUS_ADDRV) { | ||
| 674 | mce_ring_add(m.addr >> PAGE_SHIFT); | ||
| 675 | mce_schedule_work(); | ||
| 676 | } | ||
| 677 | } | ||
| 678 | |||
| 633 | /* | 679 | /* |
| 634 | * Don't get the IP here because it's unlikely to | 680 | * Don't get the IP here because it's unlikely to |
| 635 | * have anything to do with the actual error location. | 681 | * have anything to do with the actual error location. |
| @@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, | |||
| 668 | if (quirk_no_way_out) | 714 | if (quirk_no_way_out) |
| 669 | quirk_no_way_out(i, m, regs); | 715 | quirk_no_way_out(i, m, regs); |
| 670 | } | 716 | } |
| 671 | if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) | 717 | if (mce_severity(m, mca_cfg.tolerant, msg, true) >= |
| 718 | MCE_PANIC_SEVERITY) | ||
| 672 | ret = 1; | 719 | ret = 1; |
| 673 | } | 720 | } |
| 674 | return ret; | 721 | return ret; |
| @@ -754,7 +801,7 @@ static void mce_reign(void) | |||
| 754 | for_each_possible_cpu(cpu) { | 801 | for_each_possible_cpu(cpu) { |
| 755 | int severity = mce_severity(&per_cpu(mces_seen, cpu), | 802 | int severity = mce_severity(&per_cpu(mces_seen, cpu), |
| 756 | mca_cfg.tolerant, | 803 | mca_cfg.tolerant, |
| 757 | &nmsg); | 804 | &nmsg, true); |
| 758 | if (severity > global_worst) { | 805 | if (severity > global_worst) { |
| 759 | msg = nmsg; | 806 | msg = nmsg; |
| 760 | global_worst = severity; | 807 | global_worst = severity; |
| @@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1095 | */ | 1142 | */ |
| 1096 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); | 1143 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); |
| 1097 | 1144 | ||
| 1098 | severity = mce_severity(&m, cfg->tolerant, NULL); | 1145 | severity = mce_severity(&m, cfg->tolerant, NULL, true); |
| 1099 | 1146 | ||
| 1100 | /* | 1147 | /* |
| 1101 | * When machine check was for corrected handler don't touch, | 1148 | * When machine check was for corrected/deferred handler don't |
| 1102 | * unless we're panicking. | 1149 | * touch, unless we're panicking. |
| 1103 | */ | 1150 | */ |
| 1104 | if (severity == MCE_KEEP_SEVERITY && !no_way_out) | 1151 | if ((severity == MCE_KEEP_SEVERITY || |
| 1152 | severity == MCE_UCNA_SEVERITY) && !no_way_out) | ||
| 1105 | continue; | 1153 | continue; |
| 1106 | __set_bit(i, toclear); | 1154 | __set_bit(i, toclear); |
| 1107 | if (severity == MCE_NO_SEVERITY) { | 1155 | if (severity == MCE_NO_SEVERITY) { |
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 51b7e3a36e37..c2359a1ea6b3 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h | |||
| @@ -32,9 +32,6 @@ | |||
| 32 | #define R4(x) (((x) >> 4) & 0xf) | 32 | #define R4(x) (((x) >> 4) & 0xf) |
| 33 | #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") | 33 | #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") |
| 34 | 34 | ||
| 35 | #define MCI_STATUS_DEFERRED BIT_64(44) | ||
| 36 | #define MCI_STATUS_POISON BIT_64(43) | ||
| 37 | |||
| 38 | extern const char * const pp_msgs[]; | 35 | extern const char * const pp_msgs[]; |
| 39 | 36 | ||
| 40 | enum tt_ids { | 37 | enum tt_ids { |
