aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com> 2015-03-23 11:42:52 -0400
committer Borislav Petkov <bp@suse.de> 2015-03-24 07:13:34 -0400
commit bf80bbd7dcf525e41e0673fbaa8cd21d2344b460 (patch)
tree 5a07598febcb55aa340dc710248f6f22920416b2
parent c9ce8712838e48bf356144122c5ecdcdac5d1829 (diff)
x86/mce: Add an AMD severities-grading function
Add a severities function that caters to AMD processors. This allows us
to do some vendor-specific work within the function if necessary.

Also, introduce a vendor flag bitfield for vendor-specific settings. The
severities code uses this to define error scope based on the presence
of the flags field.

This is based off of work by Boris Petkov.

Testing details:
Fam10h, Model 9h (Greyhound)
Fam15h: Models 0h-0fh (Orochi), 30h-3fh (Kaveri) and 60h-6fh (Carrizo),
Fam16h Model 00h-0fh (Kabini)

Boris:
Intel SNB
AMD K8 (JH-E0)

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Chen Yucong <slaoub@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: linux-edac@vger.kernel.org
Link: http://lkml.kernel.org/r/1427125373-2918-2-git-send-email-Aravind.Gopalakrishnan@amd.com
[ Fixup build, clean up comments. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
-rw-r--r-- arch/x86/include/asm/mce.h 6
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-severity.c 56
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 9
3 files changed, 71 insertions, 0 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fd38a23e729f..b574fbf62d39 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
116 u32 rip_msr; 116 u32 rip_msr;
117}; 117};
118 118
/*
 * Annotation (not part of the numbered patch lines):
 * Vendor-specific machine-check feature flags, packed into a single
 * 64-bit bitfield. Only bit 0 (overflow_recov) is defined; the other 63
 * bits are reserved.
 *
 * In this patch overflow_recov is written in two places in mce.c: it is
 * forced to 1 for family 0x15, models <= 0xf, and it is assigned from
 * bit 0 of cpuid_ebx(0x80000007) in the X86_VENDOR_AMD vendor-init case.
 * mce_severity_amd() reads it to choose how uncorrected errors are graded.
 *
 * NOTE(review): the vendor-init assignment is unconditional, so if it runs
 * after the quirk code it overwrites the forced 1 -- confirm the call
 * order in the CPU init path.
 */
119struct mce_vendor_flags {
120 __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
121 __reserved_0 : 63;
122};
123extern struct mce_vendor_flags mce_flags;
124
119extern struct mca_config mca_cfg; 125extern struct mca_config mca_cfg;
120extern void mce_register_decode_chain(struct notifier_block *nb); 126extern void mce_register_decode_chain(struct notifier_block *nb);
121extern void mce_unregister_decode_chain(struct notifier_block *nb); 127extern void mce_unregister_decode_chain(struct notifier_block *nb);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..e16f3f201e06 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,12 +186,68 @@ static int error_context(struct mce *m)
186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; 186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
187} 187}
188 188
/*
 * Annotation (not part of the numbered patch lines):
 * mce_severity_amd() grades one machine-check record for AMD CPUs and
 * returns one of the MCE_*_SEVERITY values.
 *
 * @m:   record to grade; only m->status and m->mcgstatus are read here.
 * @ctx: IN_USER or IN_KERNEL, as computed by error_context() in the caller.
 *
 * Checks are made in this order, first match wins:
 *   - MCI_STATUS_PCC set                       -> MCE_PANIC_SEVERITY
 *   - MCI_STATUS_UC set, overflow_recov set:
 *       MCG_STATUS_RIPV clear and IN_KERNEL    -> MCE_PANIC_SEVERITY
 *       anything else                          -> MCE_AR_SEVERITY
 *   - MCI_STATUS_UC set, overflow_recov clear:
 *       MCI_STATUS_OVER set                    -> MCE_PANIC_SEVERITY
 *       otherwise                              -> MCE_UC_SEVERITY
 *   - MCI_STATUS_DEFERRED set                  -> MCE_DEFERRED_SEVERITY
 *   - none of the above                        -> MCE_KEEP_SEVERITY
 *
 * Note that MCE_UC_SEVERITY is therefore only reachable when
 * mce_flags.overflow_recov is clear.
 */
189/*
190 * See AMD Error Scope Hierarchy table in a newer BKDG. For example
191 * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
192 */
193static int mce_severity_amd(struct mce *m, enum context ctx)
194{
195 /* Processor Context Corrupt, no need to fumble too much, die! */
196 if (m->status & MCI_STATUS_PCC)
197 return MCE_PANIC_SEVERITY;
198
199 if (m->status & MCI_STATUS_UC) {
200
201 /*
202 * On older systems where overflow_recov flag is not present, we
203 * should simply panic if an error overflow occurs. If
204 * overflow_recov flag is present and set, then software can try
205 * to at least kill process to prolong system operation.
206 */
207 if (mce_flags.overflow_recov) {
208 /* software can try to contain */
209 if (!(m->mcgstatus & MCG_STATUS_RIPV))
210 if (ctx == IN_KERNEL)
211 return MCE_PANIC_SEVERITY;
212
213 /* kill current process */
214 return MCE_AR_SEVERITY;
215 } else {
216 /* at least one error was not logged */
217 if (m->status & MCI_STATUS_OVER)
218 return MCE_PANIC_SEVERITY;
219 }
220
221 /*
222 * For any other case, return MCE_UC_SEVERITY so that we log the
223 * error and exit #MC handler.
224 */
225 return MCE_UC_SEVERITY;
226 }
227
228 /*
229 * deferred error: poll handler catches these and adds to mce_ring so
230 * memory-failure can take recovery actions.
231 */
232 if (m->status & MCI_STATUS_DEFERRED)
233 return MCE_DEFERRED_SEVERITY;
234
235 /*
236 * corrected error: poll handler catches these and passes responsibility
237 * of decoding the error to EDAC
238 */
239 return MCE_KEEP_SEVERITY;
240}
241
189int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) 242int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
190{ 243{
191 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); 244 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
192 enum context ctx = error_context(m); 245 enum context ctx = error_context(m);
193 struct severity *s; 246 struct severity *s;
194 247
248 if (m->cpuvendor == X86_VENDOR_AMD)
249 return mce_severity_amd(m, ctx);
250
195 for (s = severities;; s++) { 251 for (s = severities;; s++) {
196 if ((m->status & s->mask) != s->result) 252 if ((m->status & s->mask) != s->result)
197 continue; 253 continue;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8548b714a16b..1189f1150a19 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
64DEFINE_PER_CPU(unsigned, mce_exception_count); 64DEFINE_PER_CPU(unsigned, mce_exception_count);
65 65
66struct mce_bank *mce_banks __read_mostly; 66struct mce_bank *mce_banks __read_mostly;
67struct mce_vendor_flags mce_flags __read_mostly;
67 68
68struct mca_config mca_cfg __read_mostly = { 69struct mca_config mca_cfg __read_mostly = {
69 .bootlog = -1, 70 .bootlog = -1,
@@ -1535,6 +1536,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1535 mce_banks[0].ctl = 0; 1536 mce_banks[0].ctl = 0;
1536 1537
1537 /* 1538 /*
1539 * overflow_recov is supported for F15h Models 00h-0fh
1540 * even though we don't have a CPUID bit for it.
1541 */
1542 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1543 mce_flags.overflow_recov = 1;
1544
1545 /*
1538 * Turn off MC4_MISC thresholding banks on those models since 1546 * Turn off MC4_MISC thresholding banks on those models since
1539 * they're not supported there. 1547 * they're not supported there.
1540 */ 1548 */
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1633 break; 1641 break;
1634 case X86_VENDOR_AMD: 1642 case X86_VENDOR_AMD:
1635 mce_amd_feature_init(c); 1643 mce_amd_feature_init(c);
1644 mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
1636 break; 1645 break;
1637 default: 1646 default:
1638 break; 1647 break;