aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/mce.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c23
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c60
-rw-r--r--drivers/edac/mce_amd.h3
5 files changed, 78 insertions, 16 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 276392f121fb..51b26e895933 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -34,6 +34,10 @@
34#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ 34#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
35#define MCI_STATUS_AR (1ULL<<55) /* Action required */ 35#define MCI_STATUS_AR (1ULL<<55) /* Action required */
36 36
37/* AMD-specific bits */
38#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
39#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
40
37/* 41/*
38 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is 42 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
39 * bits 15:0. But bit 12 is the 'F' bit, defined for corrected 43 * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 09edd0b65fef..10b46906767f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -3,6 +3,8 @@
3 3
4enum severity_level { 4enum severity_level {
5 MCE_NO_SEVERITY, 5 MCE_NO_SEVERITY,
6 MCE_DEFERRED_SEVERITY,
7 MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
6 MCE_KEEP_SEVERITY, 8 MCE_KEEP_SEVERITY,
7 MCE_SOME_SEVERITY, 9 MCE_SOME_SEVERITY,
8 MCE_AO_SEVERITY, 10 MCE_AO_SEVERITY,
@@ -21,7 +23,7 @@ struct mce_bank {
21 char attrname[ATTR_LEN]; /* attribute name */ 23 char attrname[ATTR_LEN]; /* attribute name */
22}; 24};
23 25
24int mce_severity(struct mce *a, int tolerant, char **msg); 26int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
25struct dentry *mce_get_debugfs_dir(void); 27struct dentry *mce_get_debugfs_dir(void);
26 28
27extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index c370e1c4468b..8bb433043a7f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -31,6 +31,7 @@
31 31
32enum context { IN_KERNEL = 1, IN_USER = 2 }; 32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 }; 33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
34 35
35static struct severity { 36static struct severity {
36 u64 mask; 37 u64 mask;
@@ -40,6 +41,7 @@ static struct severity {
40 unsigned char mcgres; 41 unsigned char mcgres;
41 unsigned char ser; 42 unsigned char ser;
42 unsigned char context; 43 unsigned char context;
44 unsigned char excp;
43 unsigned char covered; 45 unsigned char covered;
44 char *msg; 46 char *msg;
45} severities[] = { 47} severities[] = {
@@ -48,6 +50,8 @@ static struct severity {
48#define USER .context = IN_USER 50#define USER .context = IN_USER
49#define SER .ser = SER_REQUIRED 51#define SER .ser = SER_REQUIRED
50#define NOSER .ser = NO_SER 52#define NOSER .ser = NO_SER
53#define EXCP .excp = EXCP_CONTEXT
54#define NOEXCP .excp = NO_EXCP
51#define BITCLR(x) .mask = x, .result = 0 55#define BITCLR(x) .mask = x, .result = 0
52#define BITSET(x) .mask = x, .result = x 56#define BITSET(x) .mask = x, .result = x
53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y 57#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
@@ -62,7 +66,7 @@ static struct severity {
62 ), 66 ),
63 MCESEV( 67 MCESEV(
64 NO, "Not enabled", 68 NO, "Not enabled",
65 BITCLR(MCI_STATUS_EN) 69 EXCP, BITCLR(MCI_STATUS_EN)
66 ), 70 ),
67 MCESEV( 71 MCESEV(
68 PANIC, "Processor context corrupt", 72 PANIC, "Processor context corrupt",
@@ -71,16 +75,20 @@ static struct severity {
71 /* When MCIP is not set something is very confused */ 75 /* When MCIP is not set something is very confused */
72 MCESEV( 76 MCESEV(
73 PANIC, "MCIP not set in MCA handler", 77 PANIC, "MCIP not set in MCA handler",
74 MCGMASK(MCG_STATUS_MCIP, 0) 78 EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
75 ), 79 ),
76 /* Neither return nor error IP -- no chance to recover -> PANIC */ 80 /* Neither return nor error IP -- no chance to recover -> PANIC */
77 MCESEV( 81 MCESEV(
78 PANIC, "Neither restart nor error IP", 82 PANIC, "Neither restart nor error IP",
79 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) 83 EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
80 ), 84 ),
81 MCESEV( 85 MCESEV(
82 PANIC, "In kernel and no restart IP", 86 PANIC, "In kernel and no restart IP",
83 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) 87 EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
88 ),
89 MCESEV(
90 DEFERRED, "Deferred error",
91 NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
84 ), 92 ),
85 MCESEV( 93 MCESEV(
86 KEEP, "Corrected error", 94 KEEP, "Corrected error",
@@ -89,7 +97,7 @@ static struct severity {
89 97
90 /* ignore OVER for UCNA */ 98 /* ignore OVER for UCNA */
91 MCESEV( 99 MCESEV(
92 KEEP, "Uncorrected no action required", 100 UCNA, "Uncorrected no action required",
93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) 101 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
94 ), 102 ),
95 MCESEV( 103 MCESEV(
@@ -178,8 +186,9 @@ static int error_context(struct mce *m)
178 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; 186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
179} 187}
180 188
181int mce_severity(struct mce *m, int tolerant, char **msg) 189int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
182{ 190{
191 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
183 enum context ctx = error_context(m); 192 enum context ctx = error_context(m);
184 struct severity *s; 193 struct severity *s;
185 194
@@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
194 continue; 203 continue;
195 if (s->context && ctx != s->context) 204 if (s->context && ctx != s->context)
196 continue; 205 continue;
206 if (s->excp && excp != s->excp)
207 continue;
197 if (msg) 208 if (msg)
198 *msg = s->msg; 209 *msg = s->msg;
199 s->covered = 1; 210 s->covered = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 61a9668cebfd..cfb16f631d52 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i)
575 } 575 }
576} 576}
577 577
578static bool memory_error(struct mce *m)
579{
580 struct cpuinfo_x86 *c = &boot_cpu_data;
581
582 if (c->x86_vendor == X86_VENDOR_AMD) {
583 /*
584 * AMD memory error detection is not implemented yet
585 */
586 return false;
587 } else if (c->x86_vendor == X86_VENDOR_INTEL) {
588 /*
589 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
590 *
591 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
592 * indicating a memory error. Bit 8 is used for indicating a
593 * cache hierarchy error. The combination of bit 2 and bit 3
594 * is used for indicating a `generic' cache hierarchy error.
595 * But we can't just blindly check the above bits, because if
596 * bit 11 is set, then it is a bus/interconnect error - and
597 * either way the above bits just give more detail on what
598 * bus/interconnect error happened. Note that bit 12 can be
599 * ignored, as it's the "filter" bit.
600 */
601 return (m->status & 0xef80) == BIT(7) ||
602 (m->status & 0xef00) == BIT(8) ||
603 (m->status & 0xeffc) == 0xc;
604 }
605
606 return false;
607}
608
578DEFINE_PER_CPU(unsigned, mce_poll_count); 609DEFINE_PER_CPU(unsigned, mce_poll_count);
579 610
580/* 611/*
@@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
595void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 626void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
596{ 627{
597 struct mce m; 628 struct mce m;
629 int severity;
598 int i; 630 int i;
599 631
600 this_cpu_inc(mce_poll_count); 632 this_cpu_inc(mce_poll_count);
@@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
630 662
631 if (!(flags & MCP_TIMESTAMP)) 663 if (!(flags & MCP_TIMESTAMP))
632 m.tsc = 0; 664 m.tsc = 0;
665
666 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
667
668 /*
669 * In the cases where we don't have a valid address after all,
670 * do not add it into the ring buffer.
671 */
672 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
673 if (m.status & MCI_STATUS_ADDRV) {
674 mce_ring_add(m.addr >> PAGE_SHIFT);
675 mce_schedule_work();
676 }
677 }
678
633 /* 679 /*
634 * Don't get the IP here because it's unlikely to 680 * Don't get the IP here because it's unlikely to
635 * have anything to do with the actual error location. 681 * have anything to do with the actual error location.
@@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
668 if (quirk_no_way_out) 714 if (quirk_no_way_out)
669 quirk_no_way_out(i, m, regs); 715 quirk_no_way_out(i, m, regs);
670 } 716 }
671 if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) 717 if (mce_severity(m, mca_cfg.tolerant, msg, true) >=
718 MCE_PANIC_SEVERITY)
672 ret = 1; 719 ret = 1;
673 } 720 }
674 return ret; 721 return ret;
@@ -754,7 +801,7 @@ static void mce_reign(void)
754 for_each_possible_cpu(cpu) { 801 for_each_possible_cpu(cpu) {
755 int severity = mce_severity(&per_cpu(mces_seen, cpu), 802 int severity = mce_severity(&per_cpu(mces_seen, cpu),
756 mca_cfg.tolerant, 803 mca_cfg.tolerant,
757 &nmsg); 804 &nmsg, true);
758 if (severity > global_worst) { 805 if (severity > global_worst) {
759 msg = nmsg; 806 msg = nmsg;
760 global_worst = severity; 807 global_worst = severity;
@@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1095 */ 1142 */
1096 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 1143 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1097 1144
1098 severity = mce_severity(&m, cfg->tolerant, NULL); 1145 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1099 1146
1100 /* 1147 /*
1101 * When machine check was for corrected handler don't touch, 1148 * When machine check was for corrected/deferred handler don't
1102 * unless we're panicking. 1149 * touch, unless we're panicking.
1103 */ 1150 */
1104 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 1151 if ((severity == MCE_KEEP_SEVERITY ||
1152 severity == MCE_UCNA_SEVERITY) && !no_way_out)
1105 continue; 1153 continue;
1106 __set_bit(i, toclear); 1154 __set_bit(i, toclear);
1107 if (severity == MCE_NO_SEVERITY) { 1155 if (severity == MCE_NO_SEVERITY) {
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h
index 51b7e3a36e37..c2359a1ea6b3 100644
--- a/drivers/edac/mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -32,9 +32,6 @@
32#define R4(x) (((x) >> 4) & 0xf) 32#define R4(x) (((x) >> 4) & 0xf)
33#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") 33#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!")
34 34
35#define MCI_STATUS_DEFERRED BIT_64(44)
36#define MCI_STATUS_POISON BIT_64(43)
37
38extern const char * const pp_msgs[]; 35extern const char * const pp_msgs[];
39 36
40enum tt_ids { 37enum tt_ids {