diff options
| author | Chen Yucong <slaoub@gmail.com> | 2014-11-17 21:09:19 -0500 |
|---|---|---|
| committer | Tony Luck <tony.luck@intel.com> | 2014-11-19 13:55:43 -0500 |
| commit | e3480271f59253cb60d030aa5e615bf00b731fea (patch) | |
| tree | 3a75fe5ed4dfa7788bfe82893ba6e8699794580a | |
| parent | 8dcf32ea220d87ca517e164de85d336480c9d172 (diff) | |
x86, mce, severity: Extend the mce_severity mechanism to handle UCNA/DEFERRED errors
Until now, the mce_severity mechanism could only classify the severity
of a UCNA error as MCE_KEEP_SEVERITY. It was also unable to filter
out DEFERRED errors on AMD platforms.
This patch extends the mce_severity mechanism to handle
UCNA/DEFERRED errors. To do this, the patch introduces a new
severity level, named both MCE_UCNA_SEVERITY and MCE_DEFERRED_SEVERITY.
In addition, mce_severity is specific to the machine check exception,
and it checks the MCIP/EIPV/RIPV bits. So that the mce_severity
mechanism can also be used in non-exception context, the patch introduces
a new argument (is_excp) for mce_severity. `is_excp` explicitly
specifies the calling context of mce_severity.
Reviewed-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Signed-off-by: Chen Yucong <slaoub@gmail.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
| -rw-r--r-- | arch/x86/include/asm/mce.h | 4 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 4 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 23 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 14 | ||||
| -rw-r--r-- | drivers/edac/mce_amd.h | 3 |
5 files changed, 32 insertions, 16 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 276392f121fb..51b26e895933 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
| @@ -34,6 +34,10 @@ | |||
| 34 | #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ | 34 | #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ |
| 35 | #define MCI_STATUS_AR (1ULL<<55) /* Action required */ | 35 | #define MCI_STATUS_AR (1ULL<<55) /* Action required */ |
| 36 | 36 | ||
| 37 | /* AMD-specific bits */ | ||
| 38 | #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ | ||
| 39 | #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ | ||
| 40 | |||
| 37 | /* | 41 | /* |
| 38 | * Note that the full MCACOD field of IA32_MCi_STATUS MSR is | 42 | * Note that the full MCACOD field of IA32_MCi_STATUS MSR is |
| 39 | * bits 15:0. But bit 12 is the 'F' bit, defined for corrected | 43 | * bits 15:0. But bit 12 is the 'F' bit, defined for corrected |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 09edd0b65fef..10b46906767f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
| @@ -3,6 +3,8 @@ | |||
| 3 | 3 | ||
| 4 | enum severity_level { | 4 | enum severity_level { |
| 5 | MCE_NO_SEVERITY, | 5 | MCE_NO_SEVERITY, |
| 6 | MCE_DEFERRED_SEVERITY, | ||
| 7 | MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, | ||
| 6 | MCE_KEEP_SEVERITY, | 8 | MCE_KEEP_SEVERITY, |
| 7 | MCE_SOME_SEVERITY, | 9 | MCE_SOME_SEVERITY, |
| 8 | MCE_AO_SEVERITY, | 10 | MCE_AO_SEVERITY, |
| @@ -21,7 +23,7 @@ struct mce_bank { | |||
| 21 | char attrname[ATTR_LEN]; /* attribute name */ | 23 | char attrname[ATTR_LEN]; /* attribute name */ |
| 22 | }; | 24 | }; |
| 23 | 25 | ||
| 24 | int mce_severity(struct mce *a, int tolerant, char **msg); | 26 | int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); |
| 25 | struct dentry *mce_get_debugfs_dir(void); | 27 | struct dentry *mce_get_debugfs_dir(void); |
| 26 | 28 | ||
| 27 | extern struct mce_bank *mce_banks; | 29 | extern struct mce_bank *mce_banks; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c370e1c4468b..8bb433043a7f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | 31 | ||
| 32 | enum context { IN_KERNEL = 1, IN_USER = 2 }; | 32 | enum context { IN_KERNEL = 1, IN_USER = 2 }; |
| 33 | enum ser { SER_REQUIRED = 1, NO_SER = 2 }; | 33 | enum ser { SER_REQUIRED = 1, NO_SER = 2 }; |
| 34 | enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; | ||
| 34 | 35 | ||
| 35 | static struct severity { | 36 | static struct severity { |
| 36 | u64 mask; | 37 | u64 mask; |
| @@ -40,6 +41,7 @@ static struct severity { | |||
| 40 | unsigned char mcgres; | 41 | unsigned char mcgres; |
| 41 | unsigned char ser; | 42 | unsigned char ser; |
| 42 | unsigned char context; | 43 | unsigned char context; |
| 44 | unsigned char excp; | ||
| 43 | unsigned char covered; | 45 | unsigned char covered; |
| 44 | char *msg; | 46 | char *msg; |
| 45 | } severities[] = { | 47 | } severities[] = { |
| @@ -48,6 +50,8 @@ static struct severity { | |||
| 48 | #define USER .context = IN_USER | 50 | #define USER .context = IN_USER |
| 49 | #define SER .ser = SER_REQUIRED | 51 | #define SER .ser = SER_REQUIRED |
| 50 | #define NOSER .ser = NO_SER | 52 | #define NOSER .ser = NO_SER |
| 53 | #define EXCP .excp = EXCP_CONTEXT | ||
| 54 | #define NOEXCP .excp = NO_EXCP | ||
| 51 | #define BITCLR(x) .mask = x, .result = 0 | 55 | #define BITCLR(x) .mask = x, .result = 0 |
| 52 | #define BITSET(x) .mask = x, .result = x | 56 | #define BITSET(x) .mask = x, .result = x |
| 53 | #define MCGMASK(x, y) .mcgmask = x, .mcgres = y | 57 | #define MCGMASK(x, y) .mcgmask = x, .mcgres = y |
| @@ -62,7 +66,7 @@ static struct severity { | |||
| 62 | ), | 66 | ), |
| 63 | MCESEV( | 67 | MCESEV( |
| 64 | NO, "Not enabled", | 68 | NO, "Not enabled", |
| 65 | BITCLR(MCI_STATUS_EN) | 69 | EXCP, BITCLR(MCI_STATUS_EN) |
| 66 | ), | 70 | ), |
| 67 | MCESEV( | 71 | MCESEV( |
| 68 | PANIC, "Processor context corrupt", | 72 | PANIC, "Processor context corrupt", |
| @@ -71,16 +75,20 @@ static struct severity { | |||
| 71 | /* When MCIP is not set something is very confused */ | 75 | /* When MCIP is not set something is very confused */ |
| 72 | MCESEV( | 76 | MCESEV( |
| 73 | PANIC, "MCIP not set in MCA handler", | 77 | PANIC, "MCIP not set in MCA handler", |
| 74 | MCGMASK(MCG_STATUS_MCIP, 0) | 78 | EXCP, MCGMASK(MCG_STATUS_MCIP, 0) |
| 75 | ), | 79 | ), |
| 76 | /* Neither return not error IP -- no chance to recover -> PANIC */ | 80 | /* Neither return not error IP -- no chance to recover -> PANIC */ |
| 77 | MCESEV( | 81 | MCESEV( |
| 78 | PANIC, "Neither restart nor error IP", | 82 | PANIC, "Neither restart nor error IP", |
| 79 | MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) | 83 | EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) |
| 80 | ), | 84 | ), |
| 81 | MCESEV( | 85 | MCESEV( |
| 82 | PANIC, "In kernel and no restart IP", | 86 | PANIC, "In kernel and no restart IP", |
| 83 | KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) | 87 | EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) |
| 88 | ), | ||
| 89 | MCESEV( | ||
| 90 | DEFERRED, "Deferred error", | ||
| 91 | NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) | ||
| 84 | ), | 92 | ), |
| 85 | MCESEV( | 93 | MCESEV( |
| 86 | KEEP, "Corrected error", | 94 | KEEP, "Corrected error", |
| @@ -89,7 +97,7 @@ static struct severity { | |||
| 89 | 97 | ||
| 90 | /* ignore OVER for UCNA */ | 98 | /* ignore OVER for UCNA */ |
| 91 | MCESEV( | 99 | MCESEV( |
| 92 | KEEP, "Uncorrected no action required", | 100 | UCNA, "Uncorrected no action required", |
| 93 | SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) | 101 | SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) |
| 94 | ), | 102 | ), |
| 95 | MCESEV( | 103 | MCESEV( |
| @@ -178,8 +186,9 @@ static int error_context(struct mce *m) | |||
| 178 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | 186 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; |
| 179 | } | 187 | } |
| 180 | 188 | ||
| 181 | int mce_severity(struct mce *m, int tolerant, char **msg) | 189 | int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) |
| 182 | { | 190 | { |
| 191 | enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); | ||
| 183 | enum context ctx = error_context(m); | 192 | enum context ctx = error_context(m); |
| 184 | struct severity *s; | 193 | struct severity *s; |
| 185 | 194 | ||
| @@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg) | |||
| 194 | continue; | 203 | continue; |
| 195 | if (s->context && ctx != s->context) | 204 | if (s->context && ctx != s->context) |
| 196 | continue; | 205 | continue; |
| 206 | if (s->excp && excp != s->excp) | ||
| 207 | continue; | ||
| 197 | if (msg) | 208 | if (msg) |
| 198 | *msg = s->msg; | 209 | *msg = s->msg; |
| 199 | s->covered = 1; | 210 | s->covered = 1; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebfd..453e9bf90968 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
| @@ -668,7 +668,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, | |||
| 668 | if (quirk_no_way_out) | 668 | if (quirk_no_way_out) |
| 669 | quirk_no_way_out(i, m, regs); | 669 | quirk_no_way_out(i, m, regs); |
| 670 | } | 670 | } |
| 671 | if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) | 671 | if (mce_severity(m, mca_cfg.tolerant, msg, true) >= |
| 672 | MCE_PANIC_SEVERITY) | ||
| 672 | ret = 1; | 673 | ret = 1; |
| 673 | } | 674 | } |
| 674 | return ret; | 675 | return ret; |
| @@ -754,7 +755,7 @@ static void mce_reign(void) | |||
| 754 | for_each_possible_cpu(cpu) { | 755 | for_each_possible_cpu(cpu) { |
| 755 | int severity = mce_severity(&per_cpu(mces_seen, cpu), | 756 | int severity = mce_severity(&per_cpu(mces_seen, cpu), |
| 756 | mca_cfg.tolerant, | 757 | mca_cfg.tolerant, |
| 757 | &nmsg); | 758 | &nmsg, true); |
| 758 | if (severity > global_worst) { | 759 | if (severity > global_worst) { |
| 759 | msg = nmsg; | 760 | msg = nmsg; |
| 760 | global_worst = severity; | 761 | global_worst = severity; |
| @@ -1095,13 +1096,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) | |||
| 1095 | */ | 1096 | */ |
| 1096 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); | 1097 | add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); |
| 1097 | 1098 | ||
| 1098 | severity = mce_severity(&m, cfg->tolerant, NULL); | 1099 | severity = mce_severity(&m, cfg->tolerant, NULL, true); |
| 1099 | 1100 | ||
| 1100 | /* | 1101 | /* |
| 1101 | * When machine check was for corrected handler don't touch, | 1102 | * When machine check was for corrected/deferred handler don't |
| 1102 | * unless we're panicing. | 1103 | * touch, unless we're panicing. |
| 1103 | */ | 1104 | */ |
| 1104 | if (severity == MCE_KEEP_SEVERITY && !no_way_out) | 1105 | if ((severity == MCE_KEEP_SEVERITY || |
| 1106 | severity == MCE_UCNA_SEVERITY) && !no_way_out) | ||
| 1105 | continue; | 1107 | continue; |
| 1106 | __set_bit(i, toclear); | 1108 | __set_bit(i, toclear); |
| 1107 | if (severity == MCE_NO_SEVERITY) { | 1109 | if (severity == MCE_NO_SEVERITY) { |
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 51b7e3a36e37..c2359a1ea6b3 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h | |||
| @@ -32,9 +32,6 @@ | |||
| 32 | #define R4(x) (((x) >> 4) & 0xf) | 32 | #define R4(x) (((x) >> 4) & 0xf) |
| 33 | #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") | 33 | #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") |
| 34 | 34 | ||
| 35 | #define MCI_STATUS_DEFERRED BIT_64(44) | ||
| 36 | #define MCI_STATUS_POISON BIT_64(43) | ||
| 37 | |||
| 38 | extern const char * const pp_msgs[]; | 35 | extern const char * const pp_msgs[]; |
| 39 | 36 | ||
| 40 | enum tt_ids { | 37 | enum tt_ids { |
