diff options
author | Ingo Molnar <mingo@kernel.org> | 2015-03-31 04:47:18 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2015-03-31 04:47:18 -0400 |
commit | f5c8a104116a56503b6e824e7782b2e805b29abb (patch) | |
tree | 3da03b9d29bc15d06f19348f34a72d29f5a2ab49 | |
parent | c9ce8712838e48bf356144122c5ecdcdac5d1829 (diff) | |
parent | 43eaa2a1ad70d72876cdbb2eb5450a2665e4770f (diff) |
Merge tag 'amd_severity' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras
Pull RAS update from Borislav Petkov:
"This has been long in the making - an AMD-specific MCE-severity grading
function. And it is actually readable at a quick glance. Further error
recovery actions will be based on its output.
Patches tested on every relevant AMD family out there."
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | arch/x86/include/asm/mce.h | 8 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 67 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 10 |
4 files changed, 85 insertions, 2 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index fd38a23e729f..1f5a86d518db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h | |||
@@ -116,6 +116,12 @@ struct mca_config { | |||
116 | u32 rip_msr; | 116 | u32 rip_msr; |
117 | }; | 117 | }; |
118 | 118 | ||
119 | struct mce_vendor_flags { | ||
120 | __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ | ||
121 | __reserved_0 : 63; | ||
122 | }; | ||
123 | extern struct mce_vendor_flags mce_flags; | ||
124 | |||
119 | extern struct mca_config mca_cfg; | 125 | extern struct mca_config mca_cfg; |
120 | extern void mce_register_decode_chain(struct notifier_block *nb); | 126 | extern void mce_register_decode_chain(struct notifier_block *nb); |
121 | extern void mce_unregister_decode_chain(struct notifier_block *nb); | 127 | extern void mce_unregister_decode_chain(struct notifier_block *nb); |
@@ -128,9 +134,11 @@ extern int mce_p5_enabled; | |||
128 | #ifdef CONFIG_X86_MCE | 134 | #ifdef CONFIG_X86_MCE |
129 | int mcheck_init(void); | 135 | int mcheck_init(void); |
130 | void mcheck_cpu_init(struct cpuinfo_x86 *c); | 136 | void mcheck_cpu_init(struct cpuinfo_x86 *c); |
137 | void mcheck_vendor_init_severity(void); | ||
131 | #else | 138 | #else |
132 | static inline int mcheck_init(void) { return 0; } | 139 | static inline int mcheck_init(void) { return 0; } |
133 | static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} | 140 | static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} |
141 | static inline void mcheck_vendor_init_severity(void) {} | ||
134 | #endif | 142 | #endif |
135 | 143 | ||
136 | #ifdef CONFIG_X86_ANCIENT_MCE | 144 | #ifdef CONFIG_X86_ANCIENT_MCE |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index e12f0bfb45c1..fe32074b865b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h | |||
@@ -24,7 +24,7 @@ struct mce_bank { | |||
24 | char attrname[ATTR_LEN]; /* attribute name */ | 24 | char attrname[ATTR_LEN]; /* attribute name */ |
25 | }; | 25 | }; |
26 | 26 | ||
27 | int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); | 27 | extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); |
28 | struct dentry *mce_get_debugfs_dir(void); | 28 | struct dentry *mce_get_debugfs_dir(void); |
29 | 29 | ||
30 | extern struct mce_bank *mce_banks; | 30 | extern struct mce_bank *mce_banks; |
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8bb433043a7f..155c9261d3ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c | |||
@@ -186,7 +186,62 @@ static int error_context(struct mce *m) | |||
186 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | 186 | return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; |
187 | } | 187 | } |
188 | 188 | ||
189 | int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) | 189 | /* |
190 | * See AMD Error Scope Hierarchy table in a newer BKDG. For example | ||
191 | * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" | ||
192 | */ | ||
193 | static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) | ||
194 | { | ||
195 | enum context ctx = error_context(m); | ||
196 | |||
197 | /* Processor Context Corrupt, no need to fumble too much, die! */ | ||
198 | if (m->status & MCI_STATUS_PCC) | ||
199 | return MCE_PANIC_SEVERITY; | ||
200 | |||
201 | if (m->status & MCI_STATUS_UC) { | ||
202 | |||
203 | /* | ||
204 | * On older systems where overflow_recov flag is not present, we | ||
205 | * should simply panic if an error overflow occurs. If | ||
206 | * overflow_recov flag is present and set, then software can try | ||
207 | * to at least kill process to prolong system operation. | ||
208 | */ | ||
209 | if (mce_flags.overflow_recov) { | ||
210 | /* software can try to contain */ | ||
211 | if (!(m->mcgstatus & MCG_STATUS_RIPV)) | ||
212 | if (ctx == IN_KERNEL) | ||
213 | return MCE_PANIC_SEVERITY; | ||
214 | |||
215 | /* kill current process */ | ||
216 | return MCE_AR_SEVERITY; | ||
217 | } else { | ||
218 | /* at least one error was not logged */ | ||
219 | if (m->status & MCI_STATUS_OVER) | ||
220 | return MCE_PANIC_SEVERITY; | ||
221 | } | ||
222 | |||
223 | /* | ||
224 | * For any other case, return MCE_UC_SEVERITY so that we log the | ||
225 | * error and exit #MC handler. | ||
226 | */ | ||
227 | return MCE_UC_SEVERITY; | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * deferred error: poll handler catches these and adds to mce_ring so | ||
232 | * memory-failure can take recovery actions. | ||
233 | */ | ||
234 | if (m->status & MCI_STATUS_DEFERRED) | ||
235 | return MCE_DEFERRED_SEVERITY; | ||
236 | |||
237 | /* | ||
238 | * corrected error: poll handler catches these and passes responsibility | ||
239 | * of decoding the error to EDAC | ||
240 | */ | ||
241 | return MCE_KEEP_SEVERITY; | ||
242 | } | ||
243 | |||
244 | static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) | ||
190 | { | 245 | { |
191 | enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); | 246 | enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); |
192 | enum context ctx = error_context(m); | 247 | enum context ctx = error_context(m); |
@@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) | |||
216 | } | 271 | } |
217 | } | 272 | } |
218 | 273 | ||
274 | /* Default to mce_severity_intel */ | ||
275 | int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = | ||
276 | mce_severity_intel; | ||
277 | |||
278 | void __init mcheck_vendor_init_severity(void) | ||
279 | { | ||
280 | if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
281 | mce_severity = mce_severity_amd; | ||
282 | } | ||
283 | |||
219 | #ifdef CONFIG_DEBUG_FS | 284 | #ifdef CONFIG_DEBUG_FS |
220 | static void *s_start(struct seq_file *f, loff_t *pos) | 285 | static void *s_start(struct seq_file *f, loff_t *pos) |
221 | { | 286 | { |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8548b714a16b..c7df30748629 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); | |||
64 | DEFINE_PER_CPU(unsigned, mce_exception_count); | 64 | DEFINE_PER_CPU(unsigned, mce_exception_count); |
65 | 65 | ||
66 | struct mce_bank *mce_banks __read_mostly; | 66 | struct mce_bank *mce_banks __read_mostly; |
67 | struct mce_vendor_flags mce_flags __read_mostly; | ||
67 | 68 | ||
68 | struct mca_config mca_cfg __read_mostly = { | 69 | struct mca_config mca_cfg __read_mostly = { |
69 | .bootlog = -1, | 70 | .bootlog = -1, |
@@ -1535,6 +1536,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||
1535 | mce_banks[0].ctl = 0; | 1536 | mce_banks[0].ctl = 0; |
1536 | 1537 | ||
1537 | /* | 1538 | /* |
1539 | * overflow_recov is supported for F15h Models 00h-0fh | ||
1540 | * even though we don't have a CPUID bit for it. | ||
1541 | */ | ||
1542 | if (c->x86 == 0x15 && c->x86_model <= 0xf) | ||
1543 | mce_flags.overflow_recov = 1; | ||
1544 | |||
1545 | /* | ||
1538 | * Turn off MC4_MISC thresholding banks on those models since | 1546 | * Turn off MC4_MISC thresholding banks on those models since |
1539 | * they're not supported there. | 1547 | * they're not supported there. |
1540 | */ | 1548 | */ |
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) | |||
1633 | break; | 1641 | break; |
1634 | case X86_VENDOR_AMD: | 1642 | case X86_VENDOR_AMD: |
1635 | mce_amd_feature_init(c); | 1643 | mce_amd_feature_init(c); |
1644 | mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; | ||
1636 | break; | 1645 | break; |
1637 | default: | 1646 | default: |
1638 | break; | 1647 | break; |
@@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable); | |||
2017 | int __init mcheck_init(void) | 2026 | int __init mcheck_init(void) |
2018 | { | 2027 | { |
2019 | mcheck_intel_therm_init(); | 2028 | mcheck_intel_therm_init(); |
2029 | mcheck_vendor_init_severity(); | ||
2020 | 2030 | ||
2021 | return 0; | 2031 | return 0; |
2022 | } | 2032 | } |