aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>2016-03-07 08:02:18 -0500
committerIngo Molnar <mingo@kernel.org>2016-03-08 05:48:14 -0500
commitbe0aec23bf4624fd55650629fe8df20483487049 (patch)
tree2544c2eee3714bdc6176e6a400f158e127f01aa1
parentadc53f2e0ae2fcff10a4b981df14729ffb1482fc (diff)
x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors
For Scalable MCA enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. [ Minor fixups. ] Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com> Signed-off-by: Borislav Petkov <bp@suse.de> Cc: Borislav Petkov <bp@alien8.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony Luck <tony.luck@intel.com> Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--arch/x86/include/asm/mce.h59
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c29
-rw-r--r--drivers/edac/mce_amd.c335
3 files changed, 420 insertions, 3 deletions
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 80ba0a8d6d06..9c467fe00551 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -42,6 +42,18 @@
42/* AMD-specific bits */ 42/* AMD-specific bits */
43#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ 43#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
44#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ 44#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
/* MCx_STATUS bit 55: Task Context Corrupt (TCC). */
#define MCI_STATUS_TCC		(1ULL << 55)

/*
 * MCA extensions (McaX).
 *
 * When the McaX field is set, the bank supports the MCA extensions:
 *  - the Deferred error interrupt type can be specified per bank;
 *  - MCx_MISC0[BlkPtr] indicates the presence of extended MISC registers,
 *    but it must not be used to determine MSR numbers;
 *  - the TCC bit is present in MCx_STATUS.
 */
#define MCI_CONFIG_MCAX		0x1

/* Field masks used to extract McaType and HWID from an MCx_IPID value. */
#define MCI_IPID_MCATYPE	0xFFFF0000
#define MCI_IPID_HWID		0xFFF
45 57
46/* 58/*
47 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is 59 * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -93,7 +105,9 @@
93 105
94/* AMD Scalable MCA */ 106/* AMD Scalable MCA */
95#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 107#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
108#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
96#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) 109#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
110#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
97 111
98/* 112/*
99 * This structure contains all data related to the MCE log. Also 113 * This structure contains all data related to the MCE log. Also
@@ -291,4 +305,49 @@ struct cper_sec_mem_err;
291extern void apei_mce_report_mem_error(int corrected, 305extern void apei_mce_report_mem_error(int corrected,
292 struct cper_sec_mem_err *mem_err); 306 struct cper_sec_mem_err *mem_err);
293 307
308/*
309 * Enumerate new IP types and HWID values in AMD processors which support
310 * Scalable MCA.
311 */
312#ifdef CONFIG_X86_MCE_AMD
/* IP block types that can own an MCA bank on Scalable MCA systems. */
enum amd_ip_types {
	SMCA_F17H_CORE	= 0,	/* Core errors */
	SMCA_DF		= 1,	/* Data Fabric */
	SMCA_UMC	= 2,	/* Unified Memory Controller */
	SMCA_PB		= 3,	/* Parameter Block */
	SMCA_PSP	= 4,	/* Platform Security Processor */
	SMCA_SMU	= 5,	/* System Management Unit */
	N_AMD_IP_TYPES		/* number of IP types; keep last */
};

/* Associates an IP block's printable name with its HWID value. */
struct amd_hwid {
	const char *name;	/* short name of the IP block */
	unsigned int hwid;	/* HWID value identifying the IP block */
};

/* HWID table, indexed by enum amd_ip_types. */
extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
329
/* MCA block types found within the core IP. */
enum amd_core_mca_blocks {
	SMCA_LS		= 0,	/* Load Store */
	SMCA_IF		= 1,	/* Instruction Fetch */
	SMCA_L2_CACHE	= 2,	/* L2 cache */
	SMCA_DE		= 3,	/* Decoder unit */
	RES		= 4,	/* Reserved */
	SMCA_EX		= 5,	/* Execution unit */
	SMCA_FP		= 6,	/* Floating Point */
	SMCA_L3_CACHE	= 7,	/* L3 cache */
	N_CORE_MCA_BLOCKS	/* number of core blocks; keep last */
};

/* Printable block names, indexed by enum amd_core_mca_blocks. */
extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
343
/* MCA block types found within the Data Fabric IP. */
enum amd_df_mca_blocks {
	SMCA_CS		= 0,	/* Coherent Slave */
	SMCA_PIE	= 1,	/* Power management, Interrupts, etc */
	N_DF_BLOCKS		/* number of Data Fabric blocks; keep last */
};

/* Printable block names, indexed by enum amd_df_mca_blocks. */
extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
351#endif
352
294#endif /* _ASM_X86_MCE_H */ 353#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 88de27bd5797..ee487a93ebe7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -71,6 +71,35 @@ static const char * const th_names[] = {
71 "execution_unit", 71 "execution_unit",
72}; 72};
73 73
74/* Define HWID to IP type mappings for Scalable MCA */
75struct amd_hwid amd_hwids[] = {
76 [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
77 [SMCA_DF] = { "data_fabric", 0x2E },
78 [SMCA_UMC] = { "umc", 0x96 },
79 [SMCA_PB] = { "param_block", 0x5 },
80 [SMCA_PSP] = { "psp", 0xFF },
81 [SMCA_SMU] = { "smu", 0x1 },
82};
83EXPORT_SYMBOL_GPL(amd_hwids);
84
85const char * const amd_core_mcablock_names[] = {
86 [SMCA_LS] = "load_store",
87 [SMCA_IF] = "insn_fetch",
88 [SMCA_L2_CACHE] = "l2_cache",
89 [SMCA_DE] = "decode_unit",
90 [RES] = "",
91 [SMCA_EX] = "execution_unit",
92 [SMCA_FP] = "floating_point",
93 [SMCA_L3_CACHE] = "l3_cache",
94};
95EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
96
97const char * const amd_df_mcablock_names[] = {
98 [SMCA_CS] = "coherent_slave",
99 [SMCA_PIE] = "pie",
100};
101EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
102
74static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); 103static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
75static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 104static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
76 105
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index e3a945ce374b..49768c08ac07 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
147 "Status Register File", 147 "Status Register File",
148}; 148};
149 149
150/* Scalable MCA error strings */
/*
 * Load-Store (LS) block error descriptions, indexed by the extended error
 * code (xec). Entry 0x04 is reserved; the decoder rejects that code before
 * indexing this table.
 */
static const char * const f17h_ls_mce_desc[] = {
	[0x00] = "Load queue parity",
	[0x01] = "Store queue parity",
	[0x02] = "Miss address buffer payload parity",
	[0x03] = "L1 TLB parity",
	[0x04] = "",						/* reserved */
	[0x05] = "DC tag error type 6",
	[0x06] = "DC tag error type 1",
	[0x07] = "Internal error type 1",
	[0x08] = "Internal error type 2",
	[0x09] = "Sys Read data error thread 0",
	[0x0a] = "Sys read data error thread 1",
	[0x0b] = "DC tag error type 2",
	[0x0c] = "DC data error type 1 (poison consumption)",	/* was misspelled "comsumption" */
	[0x0d] = "DC data error type 2",
	[0x0e] = "DC data error type 3",
	[0x0f] = "DC tag error type 4",
	[0x10] = "L2 TLB parity",
	[0x11] = "PDC parity error",
	[0x12] = "DC tag error type 3",
	[0x13] = "DC tag error type 5",
	[0x14] = "L2 fill data error",
};
174
/* Instruction Fetch (IF) block error descriptions, indexed by extended error code (xec). */
static const char * const f17h_if_mce_desc[] = {
	[0x0] = "microtag probe port parity error",
	[0x1] = "IC microtag or full tag multi-hit error",
	[0x2] = "IC full tag parity",
	[0x3] = "IC data array parity",
	[0x4] = "Decoupling queue phys addr parity error",
	[0x5] = "L0 ITLB parity error",
	[0x6] = "L1 ITLB parity error",
	[0x7] = "L2 ITLB parity error",
	[0x8] = "BPQ snoop parity on Thread 0",
	[0x9] = "BPQ snoop parity on Thread 1",
	[0xa] = "L1 BTB multi-match error",
	[0xb] = "L2 BTB multi-match error",
};
189
/* L2 cache block error descriptions, indexed by extended error code (xec). */
static const char * const f17h_l2_mce_desc[] = {
	[0x0] = "L2M tag multi-way-hit error",
	[0x1] = "L2M tag ECC error",
	[0x2] = "L2M data ECC error",
	[0x3] = "HW assert",
};
196
/* Decode unit (DE) block error descriptions, indexed by extended error code (xec). */
static const char * const f17h_de_mce_desc[] = {
	[0x0] = "uop cache tag parity error",
	[0x1] = "uop cache data parity error",
	[0x2] = "Insn buffer parity error",
	[0x3] = "Insn dispatch queue parity error",
	[0x4] = "Fetch address FIFO parity",
	[0x5] = "Patch RAM data parity",
	[0x6] = "Patch RAM sequencer parity",
	[0x7] = "uop buffer parity",
};
207
/* Execution unit (EX) block error descriptions, indexed by extended error code (xec). */
static const char * const f17h_ex_mce_desc[] = {
	[0x0] = "Watchdog timeout error",
	[0x1] = "Phy register file parity",
	[0x2] = "Flag register file parity",
	[0x3] = "Immediate displacement register file parity",
	[0x4] = "Address generator payload parity",
	[0x5] = "EX payload parity",
	[0x6] = "Checkpoint queue parity",
	[0x7] = "Retire dispatch queue parity",
};
218
/* Floating Point (FP) block error descriptions, indexed by extended error code (xec). */
static const char * const f17h_fp_mce_desc[] = {
	[0x0] = "Physical register file parity",
	[0x1] = "Freelist parity error",
	[0x2] = "Schedule queue parity",
	[0x3] = "NSQ parity error",
	[0x4] = "Retire queue parity",
	[0x5] = "Status register file parity",
};
227
/* L3 cache block error descriptions, indexed by extended error code (xec). */
static const char * const f17h_l3_mce_desc[] = {
	[0x0] = "Shadow tag macro ECC error",
	[0x1] = "Shadow tag macro multi-way-hit error",
	[0x2] = "L3M tag ECC error",
	[0x3] = "L3M tag multi-way-hit error",
	[0x4] = "L3M data ECC error",
	[0x5] = "XI parity, L3 fill done channel error",
	[0x6] = "L3 victim queue parity",
	[0x7] = "L3 HW assert",
};
238
/* Data Fabric Coherent Slave (CS) error descriptions, indexed by extended error code (xec). */
static const char * const f17h_cs_mce_desc[] = {
	[0x0] = "Illegal request from transport layer",
	[0x1] = "Address violation",
	[0x2] = "Security violation",
	[0x3] = "Illegal response from transport layer",
	[0x4] = "Unexpected response",
	[0x5] = "Parity error on incoming request or probe response data",
	[0x6] = "Parity error on incoming read response data",
	[0x7] = "Atomic request parity",
	[0x8] = "ECC error on probe filter access",
};
250
/* Data Fabric PIE block error descriptions, indexed by extended error code (xec). */
static const char * const f17h_pie_mce_desc[] = {
	[0x0] = "HW assert",
	[0x1] = "Internal PIE register security violation",
	[0x2] = "Error on GMI link",
	[0x3] = "Poison data written to internal PIE register",
};
257
/* Unified Memory Controller (UMC) error descriptions, indexed by extended error code (xec). */
static const char * const f17h_umc_mce_desc[] = {
	[0x0] = "DRAM ECC error",
	[0x1] = "Data poison error on DRAM",
	[0x2] = "SDP parity error",
	[0x3] = "Advanced peripheral bus error",
	[0x4] = "Command/address parity error",
	[0x5] = "Write data CRC error",
};
266
/* Parameter Block (PB) error descriptions, indexed by extended error code (xec). */
static const char * const f17h_pb_mce_desc[] = {
	[0x0] = "Parameter Block RAM ECC error",
};
270
/* Platform Security Processor (PSP) error descriptions, indexed by extended error code (xec). */
static const char * const f17h_psp_mce_desc[] = {
	[0x0] = "PSP RAM ECC or parity error",
};
274
/* System Management Unit (SMU) error descriptions, indexed by extended error code (xec). */
static const char * const f17h_smu_mce_desc[] = {
	[0x0] = "SMU RAM ECC or parity error",
};
278
150static bool f12h_mc0_mce(u16 ec, u8 xec) 279static bool f12h_mc0_mce(u16 ec, u8 xec)
151{ 280{
152 bool ret = false; 281 bool ret = false;
@@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m)
691 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); 820 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
692} 821}
693 822
823static void decode_f17h_core_errors(const char *ip_name, u8 xec,
824 unsigned int mca_type)
825{
826 const char * const *error_desc_array;
827 size_t len;
828
829 pr_emerg(HW_ERR "%s Error: ", ip_name);
830
831 switch (mca_type) {
832 case SMCA_LS:
833 error_desc_array = f17h_ls_mce_desc;
834 len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
835
836 if (xec == 0x4) {
837 pr_cont("Unrecognized LS MCA error code.\n");
838 return;
839 }
840 break;
841
842 case SMCA_IF:
843 error_desc_array = f17h_if_mce_desc;
844 len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
845 break;
846
847 case SMCA_L2_CACHE:
848 error_desc_array = f17h_l2_mce_desc;
849 len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
850 break;
851
852 case SMCA_DE:
853 error_desc_array = f17h_de_mce_desc;
854 len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
855 break;
856
857 case SMCA_EX:
858 error_desc_array = f17h_ex_mce_desc;
859 len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
860 break;
861
862 case SMCA_FP:
863 error_desc_array = f17h_fp_mce_desc;
864 len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
865 break;
866
867 case SMCA_L3_CACHE:
868 error_desc_array = f17h_l3_mce_desc;
869 len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
870 break;
871
872 default:
873 pr_cont("Corrupted MCA core error info.\n");
874 return;
875 }
876
877 if (xec > len) {
878 pr_cont("Unrecognized %s MCA bank error code.\n",
879 amd_core_mcablock_names[mca_type]);
880 return;
881 }
882
883 pr_cont("%s.\n", error_desc_array[xec]);
884}
885
886static void decode_df_errors(u8 xec, unsigned int mca_type)
887{
888 const char * const *error_desc_array;
889 size_t len;
890
891 pr_emerg(HW_ERR "Data Fabric Error: ");
892
893 switch (mca_type) {
894 case SMCA_CS:
895 error_desc_array = f17h_cs_mce_desc;
896 len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
897 break;
898
899 case SMCA_PIE:
900 error_desc_array = f17h_pie_mce_desc;
901 len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
902 break;
903
904 default:
905 pr_cont("Corrupted MCA Data Fabric info.\n");
906 return;
907 }
908
909 if (xec > len) {
910 pr_cont("Unrecognized %s MCA bank error code.\n",
911 amd_df_mcablock_names[mca_type]);
912 return;
913 }
914
915 pr_cont("%s.\n", error_desc_array[xec]);
916}
917
918/* Decode errors according to Scalable MCA specification */
919static void decode_smca_errors(struct mce *m)
920{
921 u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
922 unsigned int hwid, mca_type, i;
923 u8 xec = XEC(m->status, xec_mask);
924 const char * const *error_desc_array;
925 const char *ip_name;
926 u32 low, high;
927 size_t len;
928
929 if (rdmsr_safe(addr, &low, &high)) {
930 pr_emerg("Invalid IP block specified, error information is unreliable.\n");
931 return;
932 }
933
934 hwid = high & MCI_IPID_HWID;
935 mca_type = (high & MCI_IPID_MCATYPE) >> 16;
936
937 pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
938
939 /*
940 * Based on hwid and mca_type values, decode errors from respective IPs.
941 * Note: mca_type values make sense only in the context of an hwid.
942 */
943 for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
944 if (amd_hwids[i].hwid == hwid)
945 break;
946
947 switch (i) {
948 case SMCA_F17H_CORE:
949 ip_name = (mca_type == SMCA_L3_CACHE) ?
950 "L3 Cache" : "F17h Core";
951 return decode_f17h_core_errors(ip_name, xec, mca_type);
952 break;
953
954 case SMCA_DF:
955 return decode_df_errors(xec, mca_type);
956 break;
957
958 case SMCA_UMC:
959 error_desc_array = f17h_umc_mce_desc;
960 len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
961 break;
962
963 case SMCA_PB:
964 error_desc_array = f17h_pb_mce_desc;
965 len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
966 break;
967
968 case SMCA_PSP:
969 error_desc_array = f17h_psp_mce_desc;
970 len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
971 break;
972
973 case SMCA_SMU:
974 error_desc_array = f17h_smu_mce_desc;
975 len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
976 break;
977
978 default:
979 pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
980 return;
981 }
982
983 ip_name = amd_hwids[i].name;
984 pr_emerg(HW_ERR "%s Error: ", ip_name);
985
986 if (xec > len) {
987 pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
988 return;
989 }
990
991 pr_cont("%s.\n", error_desc_array[xec]);
992}
993
694static inline void amd_decode_err_code(u16 ec) 994static inline void amd_decode_err_code(u16 ec)
695{ 995{
696 if (INT_ERROR(ec)) { 996 if (INT_ERROR(ec)) {
@@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
752 struct mce *m = (struct mce *)data; 1052 struct mce *m = (struct mce *)data;
753 struct cpuinfo_x86 *c = &cpu_data(m->extcpu); 1053 struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
754 int ecc; 1054 int ecc;
1055 u32 ebx = cpuid_ebx(0x80000007);
755 1056
756 if (amd_filter_mce(m)) 1057 if (amd_filter_mce(m))
757 return NOTIFY_STOP; 1058 return NOTIFY_STOP;
@@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
769 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), 1070 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
770 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); 1071 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
771 1072
772 if (c->x86 == 0x15 || c->x86 == 0x16) 1073 if (c->x86 >= 0x15)
773 pr_cont("|%s|%s", 1074 pr_cont("|%s|%s",
774 ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), 1075 ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
775 ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); 1076 ((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
776 1077
1078 if (!!(ebx & BIT(3))) {
1079 u32 low, high;
1080 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1081
1082 if (!rdmsr_safe(addr, &low, &high) &&
1083 (low & MCI_CONFIG_MCAX))
1084 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1085 }
1086
777 /* do the two bits[14:13] together */ 1087 /* do the two bits[14:13] together */
778 ecc = (m->status >> 45) & 0x3; 1088 ecc = (m->status >> 45) & 0x3;
779 if (ecc) 1089 if (ecc)
@@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
784 if (m->status & MCI_STATUS_ADDRV) 1094 if (m->status & MCI_STATUS_ADDRV)
785 pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); 1095 pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);
786 1096
1097 if (!!(ebx & BIT(3))) {
1098 decode_smca_errors(m);
1099 goto err_code;
1100 }
1101
787 if (!fam_ops) 1102 if (!fam_ops)
788 goto err_code; 1103 goto err_code;
789 1104
@@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = {
834static int __init mce_amd_init(void) 1149static int __init mce_amd_init(void)
835{ 1150{
836 struct cpuinfo_x86 *c = &boot_cpu_data; 1151 struct cpuinfo_x86 *c = &boot_cpu_data;
1152 u32 ebx;
837 1153
838 if (c->x86_vendor != X86_VENDOR_AMD) 1154 if (c->x86_vendor != X86_VENDOR_AMD)
839 return -ENODEV; 1155 return -ENODEV;
@@ -888,10 +1204,18 @@ static int __init mce_amd_init(void)
888 fam_ops->mc2_mce = f16h_mc2_mce; 1204 fam_ops->mc2_mce = f16h_mc2_mce;
889 break; 1205 break;
890 1206
1207 case 0x17:
1208 ebx = cpuid_ebx(0x80000007);
1209 xec_mask = 0x3f;
1210 if (!(ebx & BIT(3))) {
1211 printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1212 goto err_out;
1213 }
1214 break;
1215
891 default: 1216 default:
892 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); 1217 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
893 kfree(fam_ops); 1218 goto err_out;
894 fam_ops = NULL;
895 } 1219 }
896 1220
897 pr_info("MCE: In-kernel MCE decoding enabled.\n"); 1221 pr_info("MCE: In-kernel MCE decoding enabled.\n");
@@ -899,6 +1223,11 @@ static int __init mce_amd_init(void)
899 mce_register_decode_chain(&amd_mce_dec_nb); 1223 mce_register_decode_chain(&amd_mce_dec_nb);
900 1224
901 return 0; 1225 return 0;
1226
1227err_out:
1228 kfree(fam_ops);
1229 fam_ops = NULL;
1230 return -EINVAL;
902} 1231}
903early_initcall(mce_amd_init); 1232early_initcall(mce_amd_init);
904 1233