author     Ingo Molnar <mingo@kernel.org>    2015-03-31 04:47:18 -0400
committer  Ingo Molnar <mingo@kernel.org>    2015-03-31 04:47:18 -0400
commit     f5c8a104116a56503b6e824e7782b2e805b29abb (patch)
tree       3da03b9d29bc15d06f19348f34a72d29f5a2ab49
parent     c9ce8712838e48bf356144122c5ecdcdac5d1829 (diff)
parent     43eaa2a1ad70d72876cdbb2eb5450a2665e4770f (diff)
Merge tag 'amd_severity' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras
Pull RAS update from Borislav Petkov:

 "This has been long in the making - an AMD-specific MCE-severity
  grading function. And it is actually readable at a quick glance.
  Further error recovery actions will be based on its output.

  Patches tested on every relevant AMD family out there."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
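The heart of the series shows up in the mce-severity.c hunk further down: mce_severity changes from a plain function into a function pointer that defaults to the Intel grader and is switched to the new AMD grader once at boot. As a reading aid, here is a minimal user-space model of that dispatch pattern; the enum, the fake vendor flag and main() are illustrative stand-ins, not kernel code.

/* Simplified model of the per-vendor severity dispatch introduced below.
 * Everything here is illustrative; only the shape mirrors the kernel code. */
#include <stdio.h>

enum severity { SEV_KEEP, SEV_DEFERRED, SEV_UC, SEV_AR, SEV_PANIC };

struct fake_mce { unsigned long long status, mcgstatus; };

/* Placeholder graders standing in for mce_severity_intel()/mce_severity_amd() */
static int grade_intel(struct fake_mce *m) { (void)m; return SEV_KEEP; }
static int grade_amd(struct fake_mce *m)   { (void)m; return SEV_KEEP; }

/* Default to the Intel grader, as mce-severity.c does */
static int (*grade)(struct fake_mce *m) = grade_intel;

/* Stand-in for mcheck_vendor_init_severity(): choose the grader once at init */
static void init_severity(int vendor_is_amd)
{
        if (vendor_is_amd)
                grade = grade_amd;
}

int main(void)
{
        struct fake_mce m = { 0, 0 };

        init_severity(1);        /* pretend the boot CPU is an AMD part */
        printf("severity = %d\n", grade(&m));
        return 0;
}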
-rw-r--r--  arch/x86/include/asm/mce.h                |  8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h |  2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 67
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c          | 10
4 files changed, 85 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fd38a23e729f..1f5a86d518db 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
         u32 rip_msr;
 };
 
+struct mce_vendor_flags {
+        __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
+              __reserved_0   : 63;
+};
+extern struct mce_vendor_flags mce_flags;
+
 extern struct mca_config mca_cfg;
 extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
 void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_vendor_init_severity(void);
 #else
 static inline int mcheck_init(void) { return 0; }
 static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_vendor_init_severity(void) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index e12f0bfb45c1..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -24,7 +24,7 @@ struct mce_bank {
         char attrname[ATTR_LEN]; /* attribute name */
 };
 
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 struct dentry *mce_get_debugfs_dir(void);
 
 extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..155c9261d3ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,62 @@ static int error_context(struct mce *m)
         return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
 }
 
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+        enum context ctx = error_context(m);
+
+        /* Processor Context Corrupt, no need to fumble too much, die! */
+        if (m->status & MCI_STATUS_PCC)
+                return MCE_PANIC_SEVERITY;
+
+        if (m->status & MCI_STATUS_UC) {
+
+                /*
+                 * On older systems where overflow_recov flag is not present, we
+                 * should simply panic if an error overflow occurs. If
+                 * overflow_recov flag is present and set, then software can try
+                 * to at least kill process to prolong system operation.
+                 */
+                if (mce_flags.overflow_recov) {
+                        /* software can try to contain */
+                        if (!(m->mcgstatus & MCG_STATUS_RIPV))
+                                if (ctx == IN_KERNEL)
+                                        return MCE_PANIC_SEVERITY;
+
+                        /* kill current process */
+                        return MCE_AR_SEVERITY;
+                } else {
+                        /* at least one error was not logged */
+                        if (m->status & MCI_STATUS_OVER)
+                                return MCE_PANIC_SEVERITY;
+                }
+
+                /*
+                 * For any other case, return MCE_UC_SEVERITY so that we log the
+                 * error and exit #MC handler.
+                 */
+                return MCE_UC_SEVERITY;
+        }
+
+        /*
+         * deferred error: poll handler catches these and adds to mce_ring so
+         * memory-failure can take recovery actions.
+         */
+        if (m->status & MCI_STATUS_DEFERRED)
+                return MCE_DEFERRED_SEVERITY;
+
+        /*
+         * corrected error: poll handler catches these and passes responsibility
+         * of decoding the error to EDAC
+         */
+        return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
 {
         enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
         enum context ctx = error_context(m);
@@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
         }
 }
 
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+                    mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+                mce_severity = mce_severity_amd;
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void *s_start(struct seq_file *f, loff_t *pos)
 {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8548b714a16b..c7df30748629 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
 struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
 
 struct mca_config mca_cfg __read_mostly = {
         .bootlog = -1,
@@ -1535,6 +1536,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
                         mce_banks[0].ctl = 0;
 
                 /*
+                 * overflow_recov is supported for F15h Models 00h-0fh
+                 * even though we don't have a CPUID bit for it.
+                 */
+                if (c->x86 == 0x15 && c->x86_model <= 0xf)
+                        mce_flags.overflow_recov = 1;
+
+                /*
                  * Turn off MC4_MISC thresholding banks on those models since
                  * they're not supported there.
                  */
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
                 break;
         case X86_VENDOR_AMD:
                 mce_amd_feature_init(c);
+                mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
                 break;
         default:
                 break;
@@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
         mcheck_intel_therm_init();
+        mcheck_vendor_init_severity();
 
         return 0;
 }
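For the curious, the overflow_recov flag that the quirk code and __mcheck_cpu_init_vendor() set above comes from CPUID leaf 0x80000007, EBX bit 0. A quick way to inspect the same bit outside the kernel is the compiler's <cpuid.h> helper; the small program below is only an illustration of where the flag originates, not part of the patch.

/* Read CPUID leaf 0x80000007 and report EBX bit 0 (MCA overflow recovery).
 * Needs GCC or Clang on x86; purely illustrative. */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 0x80000007 not supported");
                return 1;
        }

        printf("overflow_recov: %s\n", (ebx & 0x1) ? "yes" : "no");
        return 0;
}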