aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Thomas Gleixner <tglx@linutronix.de> 2014-11-21 09:30:30 -0500
committer Thomas Gleixner <tglx@linutronix.de> 2014-11-21 09:31:35 -0500
commit b9e6df0a2dccdcd22090cdd140a1bfd8ae2e508b (patch)
tree 6329a4c117ccc0169a487798cdfddbaaf702130c
parent 8dcf32ea220d87ca517e164de85d336480c9d172 (diff)
parent fa92c58694268a7e9f7fa2c6881c1482221c2788 (diff)
Merge tag 'please-pull-ucna' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras
Merge RAS updates from Tony Luck: "Handle all uncorrected error reports in the same way (soft offline the page). We used to only do that for SRAO (software recoverable action optional) machine checks, but it makes sense to also do it for UCNA (UnCorrected No Action) logs found by CMCI or polling."
-rw-r--r-- arch/x86/include/asm/mce.h 4
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-internal.h 4
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-severity.c 23
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 60
-rw-r--r-- drivers/edac/mce_amd.h 3
5 files changed, 78 insertions, 16 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 276392f121fb..51b26e895933 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -34,6 +34,10 @@
34#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ 34#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
35#define MCI_STATUS_AR (1ULL<<55) /* Action required */ 35#define MCI_STATUS_AR (1ULL<<55) /* Action required */
36 36
37/* AMD-specific bits */
38#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
39#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
40
37/* 41/*
38 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is 42 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
39 * bits 15:0. But bit 12 is the 'F' bit, defined for corrected 43 * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 09edd0b65fef..10b46906767f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -3,6 +3,8 @@
3 3
4enum severity_level { 4enum severity_level {
5 MCE_NO_SEVERITY, 5 MCE_NO_SEVERITY,
6 MCE_DEFERRED_SEVERITY,
7 MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
6 MCE_KEEP_SEVERITY, 8 MCE_KEEP_SEVERITY,
7 MCE_SOME_SEVERITY, 9 MCE_SOME_SEVERITY,
8 MCE_AO_SEVERITY, 10 MCE_AO_SEVERITY,
@@ -21,7 +23,7 @@ struct mce_bank {
21 char attrname[ATTR_LEN]; /* attribute name */ 23 char attrname[ATTR_LEN]; /* attribute name */
22}; 24};
23 25
24int mce_severity(struct mce *a, int tolerant, char **msg); 26int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
25struct dentry *mce_get_debugfs_dir(void); 27struct dentry *mce_get_debugfs_dir(void);
26 28
27extern struct mce_bank *mce_banks; 29extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index c370e1c4468b..8bb433043a7f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -31,6 +31,7 @@
31 31
32enum context { IN_KERNEL = 1, IN_USER = 2 }; 32enum context { IN_KERNEL = 1, IN_USER = 2 };
33enum ser { SER_REQUIRED = 1, NO_SER = 2 }; 33enum ser { SER_REQUIRED = 1, NO_SER = 2 };
34enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
34 35
35static struct severity { 36static struct severity {
36 u64 mask; 37 u64 mask;
@@ -40,6 +41,7 @@ static struct severity {
40 unsigned char mcgres; 41 unsigned char mcgres;
41 unsigned char ser; 42 unsigned char ser;
42 unsigned char context; 43 unsigned char context;
44 unsigned char excp;
43 unsigned char covered; 45 unsigned char covered;
44 char *msg; 46 char *msg;
45} severities[] = { 47} severities[] = {
@@ -48,6 +50,8 @@ static struct severity {
48#define USER .context = IN_USER 50#define USER .context = IN_USER
49#define SER .ser = SER_REQUIRED 51#define SER .ser = SER_REQUIRED
50#define NOSER .ser = NO_SER 52#define NOSER .ser = NO_SER
53#define EXCP .excp = EXCP_CONTEXT
54#define NOEXCP .excp = NO_EXCP
51#define BITCLR(x) .mask = x, .result = 0 55#define BITCLR(x) .mask = x, .result = 0
52#define BITSET(x) .mask = x, .result = x 56#define BITSET(x) .mask = x, .result = x
53#define MCGMASK(x, y) .mcgmask = x, .mcgres = y 57#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
@@ -62,7 +66,7 @@ static struct severity {
62 ), 66 ),
63 MCESEV( 67 MCESEV(
64 NO, "Not enabled", 68 NO, "Not enabled",
65 BITCLR(MCI_STATUS_EN) 69 EXCP, BITCLR(MCI_STATUS_EN)
66 ), 70 ),
67 MCESEV( 71 MCESEV(
68 PANIC, "Processor context corrupt", 72 PANIC, "Processor context corrupt",
@@ -71,16 +75,20 @@ static struct severity {
71 /* When MCIP is not set something is very confused */ 75 /* When MCIP is not set something is very confused */
72 MCESEV( 76 MCESEV(
73 PANIC, "MCIP not set in MCA handler", 77 PANIC, "MCIP not set in MCA handler",
74 MCGMASK(MCG_STATUS_MCIP, 0) 78 EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
75 ), 79 ),
76 /* Neither return not error IP -- no chance to recover -> PANIC */ 80 /* Neither return not error IP -- no chance to recover -> PANIC */
77 MCESEV( 81 MCESEV(
78 PANIC, "Neither restart nor error IP", 82 PANIC, "Neither restart nor error IP",
79 MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) 83 EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
80 ), 84 ),
81 MCESEV( 85 MCESEV(
82 PANIC, "In kernel and no restart IP", 86 PANIC, "In kernel and no restart IP",
83 KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) 87 EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
88 ),
89 MCESEV(
90 DEFERRED, "Deferred error",
91 NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
84 ), 92 ),
85 MCESEV( 93 MCESEV(
86 KEEP, "Corrected error", 94 KEEP, "Corrected error",
@@ -89,7 +97,7 @@ static struct severity {
89 97
90 /* ignore OVER for UCNA */ 98 /* ignore OVER for UCNA */
91 MCESEV( 99 MCESEV(
92 KEEP, "Uncorrected no action required", 100 UCNA, "Uncorrected no action required",
93 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) 101 SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
94 ), 102 ),
95 MCESEV( 103 MCESEV(
@@ -178,8 +186,9 @@ static int error_context(struct mce *m)
178 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; 186 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
179} 187}
180 188
181int mce_severity(struct mce *m, int tolerant, char **msg) 189int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
182{ 190{
191 enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
183 enum context ctx = error_context(m); 192 enum context ctx = error_context(m);
184 struct severity *s; 193 struct severity *s;
185 194
@@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
194 continue; 203 continue;
195 if (s->context && ctx != s->context) 204 if (s->context && ctx != s->context)
196 continue; 205 continue;
206 if (s->excp && excp != s->excp)
207 continue;
197 if (msg) 208 if (msg)
198 *msg = s->msg; 209 *msg = s->msg;
199 s->covered = 1; 210 s->covered = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 61a9668cebfd..cfb16f631d52 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i)
575 } 575 }
576} 576}
577 577
578static bool memory_error(struct mce *m)
579{
580 struct cpuinfo_x86 *c = &boot_cpu_data;
581
582 if (c->x86_vendor == X86_VENDOR_AMD) {
583 /*
584 * coming soon
585 */
586 return false;
587 } else if (c->x86_vendor == X86_VENDOR_INTEL) {
588 /*
589 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
590 *
591 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
592 * indicating a memory error. Bit 8 is used for indicating a
593 * cache hierarchy error. The combination of bit 2 and bit 3
594 * is used for indicating a `generic' cache hierarchy error
595 * But we can't just blindly check the above bits, because if
596 * bit 11 is set, then it is a bus/interconnect error - and
597 * either way the above bits just gives more detail on what
598 * bus/interconnect error happened. Note that bit 12 can be
599 * ignored, as it's the "filter" bit.
600 */
601 return (m->status & 0xef80) == BIT(7) ||
602 (m->status & 0xef00) == BIT(8) ||
603 (m->status & 0xeffc) == 0xc;
604 }
605
606 return false;
607}
608
578DEFINE_PER_CPU(unsigned, mce_poll_count); 609DEFINE_PER_CPU(unsigned, mce_poll_count);
579 610
580/* 611/*
@@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
595void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 626void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
596{ 627{
597 struct mce m; 628 struct mce m;
629 int severity;
598 int i; 630 int i;
599 631
600 this_cpu_inc(mce_poll_count); 632 this_cpu_inc(mce_poll_count);
@@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
630 662
631 if (!(flags & MCP_TIMESTAMP)) 663 if (!(flags & MCP_TIMESTAMP))
632 m.tsc = 0; 664 m.tsc = 0;
665
666 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
667
668 /*
669 * In the cases where we don't have a valid address after all,
670 * do not add it into the ring buffer.
671 */
672 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
673 if (m.status & MCI_STATUS_ADDRV) {
674 mce_ring_add(m.addr >> PAGE_SHIFT);
675 mce_schedule_work();
676 }
677 }
678
633 /* 679 /*
634 * Don't get the IP here because it's unlikely to 680 * Don't get the IP here because it's unlikely to
635 * have anything to do with the actual error location. 681 * have anything to do with the actual error location.
@@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
668 if (quirk_no_way_out) 714 if (quirk_no_way_out)
669 quirk_no_way_out(i, m, regs); 715 quirk_no_way_out(i, m, regs);
670 } 716 }
671 if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) 717 if (mce_severity(m, mca_cfg.tolerant, msg, true) >=
718 MCE_PANIC_SEVERITY)
672 ret = 1; 719 ret = 1;
673 } 720 }
674 return ret; 721 return ret;
@@ -754,7 +801,7 @@ static void mce_reign(void)
754 for_each_possible_cpu(cpu) { 801 for_each_possible_cpu(cpu) {
755 int severity = mce_severity(&per_cpu(mces_seen, cpu), 802 int severity = mce_severity(&per_cpu(mces_seen, cpu),
756 mca_cfg.tolerant, 803 mca_cfg.tolerant,
757 &nmsg); 804 &nmsg, true);
758 if (severity > global_worst) { 805 if (severity > global_worst) {
759 msg = nmsg; 806 msg = nmsg;
760 global_worst = severity; 807 global_worst = severity;
@@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1095 */ 1142 */
1096 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); 1143 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1097 1144
1098 severity = mce_severity(&m, cfg->tolerant, NULL); 1145 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1099 1146
1100 /* 1147 /*
1101 * When machine check was for corrected handler don't touch, 1148 * When machine check was for corrected/deferred handler don't
1102 * unless we're panicing. 1149 * touch, unless we're panicing.
1103 */ 1150 */
1104 if (severity == MCE_KEEP_SEVERITY && !no_way_out) 1151 if ((severity == MCE_KEEP_SEVERITY ||
1152 severity == MCE_UCNA_SEVERITY) && !no_way_out)
1105 continue; 1153 continue;
1106 __set_bit(i, toclear); 1154 __set_bit(i, toclear);
1107 if (severity == MCE_NO_SEVERITY) { 1155 if (severity == MCE_NO_SEVERITY) {
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h
index 51b7e3a36e37..c2359a1ea6b3 100644
--- a/drivers/edac/mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -32,9 +32,6 @@
32#define R4(x) (((x) >> 4) & 0xf) 32#define R4(x) (((x) >> 4) & 0xf)
33#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") 33#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!")
34 34
35#define MCI_STATUS_DEFERRED BIT_64(44)
36#define MCI_STATUS_POISON BIT_64(43)
37
38extern const char * const pp_msgs[]; 35extern const char * const pp_msgs[];
39 36
40enum tt_ids { 37enum tt_ids {