aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chen, Gong <gong.chen@linux.intel.com> 2014-06-17 22:33:07 -0400
committer Tony Luck <tony.luck@intel.com> 2014-06-25 16:26:47 -0400
commit 2dfb7d51a61d7ca91b131c8db612f27d9390f2d5 (patch)
tree b2e9375f1ffaf2dc93418d78c27b6a13b34c8e88
parent d963cd95bea93b7db9390a71d1e2cabbb3b2c3ea (diff)
trace, RAS: Add eMCA trace event interface
Add trace interface to elaborate all H/W error related information.

Signed-off-by: Chen, Gong <gong.chen@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r-- drivers/acpi/Kconfig 4
-rw-r--r-- drivers/acpi/acpi_extlog.c 27
-rw-r--r-- drivers/firmware/efi/cper.c 45
-rw-r--r-- drivers/ras/ras.c 3
-rw-r--r-- include/linux/cper.h 23
-rw-r--r-- include/ras/ras_event.h 64
6 files changed, 158 insertions, 8 deletions
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index a34a22841002..206942b8d105 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -370,6 +370,7 @@ config ACPI_EXTLOG
370 tristate "Extended Error Log support" 370 tristate "Extended Error Log support"
371 depends on X86_MCE && X86_LOCAL_APIC 371 depends on X86_MCE && X86_LOCAL_APIC
372 select UEFI_CPER 372 select UEFI_CPER
373 select RAS
373 default n 374 default n
374 help 375 help
375 Certain usages such as Predictive Failure Analysis (PFA) require 376 Certain usages such as Predictive Failure Analysis (PFA) require
@@ -384,6 +385,7 @@ config ACPI_EXTLOG
384 385
385 Enhanced MCA Logging allows firmware to provide additional error 386 Enhanced MCA Logging allows firmware to provide additional error
386 information to system software, synchronous with MCE or CMCI. This 387 information to system software, synchronous with MCE or CMCI. This
387 driver adds support for that functionality. 388 driver adds support for that functionality with corresponding
389 tracepoint which carries that information to userspace.
388 390
389endif # ACPI 391endif # ACPI
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index 185334114d71..e61da957f30f 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -16,6 +16,7 @@
16#include <asm/mce.h> 16#include <asm/mce.h>
17 17
18#include "apei/apei-internal.h" 18#include "apei/apei-internal.h"
19#include <ras/ras_event.h>
19 20
20#define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */ 21#define EXT_ELOG_ENTRY_MASK GENMASK_ULL(51, 0) /* elog entry address mask */
21 22
@@ -137,8 +138,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
137 struct mce *mce = (struct mce *)data; 138 struct mce *mce = (struct mce *)data;
138 int bank = mce->bank; 139 int bank = mce->bank;
139 int cpu = mce->extcpu; 140 int cpu = mce->extcpu;
140 struct acpi_generic_status *estatus; 141 struct acpi_generic_status *estatus, *tmp;
141 int rc; 142 struct acpi_generic_data *gdata;
143 const uuid_le *fru_id = &NULL_UUID_LE;
144 char *fru_text = "";
145 uuid_le *sec_type;
146 static u32 err_seq;
142 147
143 estatus = extlog_elog_entry_check(cpu, bank); 148 estatus = extlog_elog_entry_check(cpu, bank);
144 if (estatus == NULL) 149 if (estatus == NULL)
@@ -148,7 +153,23 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
148 /* clear record status to enable BIOS to update it again */ 153 /* clear record status to enable BIOS to update it again */
149 estatus->block_status = 0; 154 estatus->block_status = 0;
150 155
151 rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu); 156 tmp = (struct acpi_generic_status *)elog_buf;
157 print_extlog_rcd(NULL, tmp, cpu);
158
159 /* log event via trace */
160 err_seq++;
161 gdata = (struct acpi_generic_data *)(tmp + 1);
162 if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
163 fru_id = (uuid_le *)gdata->fru_id;
164 if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
165 fru_text = gdata->fru_text;
166 sec_type = (uuid_le *)gdata->section_type;
167 if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
168 struct cper_sec_mem_err *mem = (void *)(gdata + 1);
169 if (gdata->error_data_length >= sizeof(*mem))
170 trace_extlog_mem_event(mem, err_seq, fru_id, fru_text,
171 (u8)gdata->error_severity);
172 }
152 173
153 return NOTIFY_STOP; 174 return NOTIFY_STOP;
154} 175}
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index ac33a9fed341..437e6fd47311 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -207,7 +207,7 @@ const char *cper_mem_err_type_str(unsigned int etype)
207} 207}
208EXPORT_SYMBOL_GPL(cper_mem_err_type_str); 208EXPORT_SYMBOL_GPL(cper_mem_err_type_str);
209 209
210static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg) 210static int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg)
211{ 211{
212 u32 len, n; 212 u32 len, n;
213 213
@@ -249,7 +249,7 @@ static int cper_mem_err_location(const struct cper_sec_mem_err *mem, char *msg)
249 return n; 249 return n;
250} 250}
251 251
252static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg) 252static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg)
253{ 253{
254 u32 len, n; 254 u32 len, n;
255 const char *bank = NULL, *device = NULL; 255 const char *bank = NULL, *device = NULL;
@@ -271,8 +271,44 @@ static int cper_dimm_err_location(const struct cper_sec_mem_err *mem, char *msg)
271 return n; 271 return n;
272} 272}
273 273
274void cper_mem_err_pack(const struct cper_sec_mem_err *mem,
275 struct cper_mem_err_compact *cmem)
276{
277 cmem->validation_bits = mem->validation_bits;
278 cmem->node = mem->node;
279 cmem->card = mem->card;
280 cmem->module = mem->module;
281 cmem->bank = mem->bank;
282 cmem->device = mem->device;
283 cmem->row = mem->row;
284 cmem->column = mem->column;
285 cmem->bit_pos = mem->bit_pos;
286 cmem->requestor_id = mem->requestor_id;
287 cmem->responder_id = mem->responder_id;
288 cmem->target_id = mem->target_id;
289 cmem->rank = mem->rank;
290 cmem->mem_array_handle = mem->mem_array_handle;
291 cmem->mem_dev_handle = mem->mem_dev_handle;
292}
293
294const char *cper_mem_err_unpack(struct trace_seq *p,
295 struct cper_mem_err_compact *cmem)
296{
297 const char *ret = p->buffer + p->len;
298
299 if (cper_mem_err_location(cmem, rcd_decode_str))
300 trace_seq_printf(p, "%s", rcd_decode_str);
301 if (cper_dimm_err_location(cmem, rcd_decode_str))
302 trace_seq_printf(p, "%s", rcd_decode_str);
303 trace_seq_putc(p, '\0');
304
305 return ret;
306}
307
274static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem) 308static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
275{ 309{
310 struct cper_mem_err_compact cmem;
311
276 if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS) 312 if (mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)
277 printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status); 313 printk("%s""error_status: 0x%016llx\n", pfx, mem->error_status);
278 if (mem->validation_bits & CPER_MEM_VALID_PA) 314 if (mem->validation_bits & CPER_MEM_VALID_PA)
@@ -281,14 +317,15 @@ static void cper_print_mem(const char *pfx, const struct cper_sec_mem_err *mem)
281 if (mem->validation_bits & CPER_MEM_VALID_PA_MASK) 317 if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
282 printk("%s""physical_address_mask: 0x%016llx\n", 318 printk("%s""physical_address_mask: 0x%016llx\n",
283 pfx, mem->physical_addr_mask); 319 pfx, mem->physical_addr_mask);
284 if (cper_mem_err_location(mem, rcd_decode_str)) 320 cper_mem_err_pack(mem, &cmem);
321 if (cper_mem_err_location(&cmem, rcd_decode_str))
285 printk("%s%s\n", pfx, rcd_decode_str); 322 printk("%s%s\n", pfx, rcd_decode_str);
286 if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { 323 if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
287 u8 etype = mem->error_type; 324 u8 etype = mem->error_type;
288 printk("%s""error_type: %d, %s\n", pfx, etype, 325 printk("%s""error_type: %d, %s\n", pfx, etype,
289 cper_mem_err_type_str(etype)); 326 cper_mem_err_type_str(etype));
290 } 327 }
291 if (cper_dimm_err_location(mem, rcd_decode_str)) 328 if (cper_dimm_err_location(&cmem, rcd_decode_str))
292 printk("%s%s\n", pfx, rcd_decode_str); 329 printk("%s%s\n", pfx, rcd_decode_str);
293} 330}
294 331
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 4cac43a1e25c..b67dd362b7b6 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -23,4 +23,7 @@ static int __init ras_init(void)
23} 23}
24subsys_initcall(ras_init); 24subsys_initcall(ras_init);
25 25
26#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
27EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
28#endif
26EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); 29EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
diff --git a/include/linux/cper.h b/include/linux/cper.h
index ed088b9c1298..76abba4b238e 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -22,6 +22,7 @@
22#define LINUX_CPER_H 22#define LINUX_CPER_H
23 23
24#include <linux/uuid.h> 24#include <linux/uuid.h>
25#include <linux/trace_seq.h>
25 26
26/* CPER record signature and the size */ 27/* CPER record signature and the size */
27#define CPER_SIG_RECORD "CPER" 28#define CPER_SIG_RECORD "CPER"
@@ -363,6 +364,24 @@ struct cper_sec_mem_err {
363 __u16 mem_dev_handle; /* module handle in UEFI 2.4 */ 364 __u16 mem_dev_handle; /* module handle in UEFI 2.4 */
364}; 365};
365 366
367struct cper_mem_err_compact {
368 __u64 validation_bits;
369 __u16 node;
370 __u16 card;
371 __u16 module;
372 __u16 bank;
373 __u16 device;
374 __u16 row;
375 __u16 column;
376 __u16 bit_pos;
377 __u64 requestor_id;
378 __u64 responder_id;
379 __u64 target_id;
380 __u16 rank;
381 __u16 mem_array_handle;
382 __u16 mem_dev_handle;
383};
384
366struct cper_sec_pcie { 385struct cper_sec_pcie {
367 __u64 validation_bits; 386 __u64 validation_bits;
368 __u32 port_type; 387 __u32 port_type;
@@ -406,5 +425,9 @@ const char *cper_severity_str(unsigned int);
406const char *cper_mem_err_type_str(unsigned int); 425const char *cper_mem_err_type_str(unsigned int);
407void cper_print_bits(const char *prefix, unsigned int bits, 426void cper_print_bits(const char *prefix, unsigned int bits,
408 const char * const strs[], unsigned int strs_size); 427 const char * const strs[], unsigned int strs_size);
428void cper_mem_err_pack(const struct cper_sec_mem_err *,
429 struct cper_mem_err_compact *);
430const char *cper_mem_err_unpack(struct trace_seq *,
431 struct cper_mem_err_compact *);
409 432
410#endif 433#endif
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index acbcbb88eaaa..47da53c27ffa 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -9,6 +9,70 @@
9#include <linux/edac.h> 9#include <linux/edac.h>
10#include <linux/ktime.h> 10#include <linux/ktime.h>
11#include <linux/aer.h> 11#include <linux/aer.h>
12#include <linux/cper.h>
13
14/*
15 * MCE Extended Error Log trace event
16 *
17 * These events are generated when hardware detects a corrected or
18 * uncorrected event.
19 */
20
21/* memory trace event */
22
23#if defined(CONFIG_ACPI_EXTLOG) || defined(CONFIG_ACPI_EXTLOG_MODULE)
24TRACE_EVENT(extlog_mem_event,
25 TP_PROTO(struct cper_sec_mem_err *mem,
26 u32 err_seq,
27 const uuid_le *fru_id,
28 const char *fru_text,
29 u8 sev),
30
31 TP_ARGS(mem, err_seq, fru_id, fru_text, sev),
32
33 TP_STRUCT__entry(
34 __field(u32, err_seq)
35 __field(u8, etype)
36 __field(u8, sev)
37 __field(u64, pa)
38 __field(u8, pa_mask_lsb)
39 __field_struct(uuid_le, fru_id)
40 __string(fru_text, fru_text)
41 __field_struct(struct cper_mem_err_compact, data)
42 ),
43
44 TP_fast_assign(
45 __entry->err_seq = err_seq;
46 if (mem->validation_bits & CPER_MEM_VALID_ERROR_TYPE)
47 __entry->etype = mem->error_type;
48 else
49 __entry->etype = ~0;
50 __entry->sev = sev;
51 if (mem->validation_bits & CPER_MEM_VALID_PA)
52 __entry->pa = mem->physical_addr;
53 else
54 __entry->pa = ~0ull;
55
56 if (mem->validation_bits & CPER_MEM_VALID_PA_MASK)
57 __entry->pa_mask_lsb = (u8)__ffs64(mem->physical_addr_mask);
58 else
59 __entry->pa_mask_lsb = ~0;
60 __entry->fru_id = *fru_id;
61 __assign_str(fru_text, fru_text);
62 cper_mem_err_pack(mem, &__entry->data);
63 ),
64
65 TP_printk("{%d} %s error: %s physical addr: %016llx (mask lsb: %x) %sFRU: %pUl %.20s",
66 __entry->err_seq,
67 cper_severity_str(__entry->sev),
68 cper_mem_err_type_str(__entry->etype),
69 __entry->pa,
70 __entry->pa_mask_lsb,
71 cper_mem_err_unpack(p, &__entry->data),
72 &__entry->fru_id,
73 __get_str(fru_text))
74);
75#endif
12 76
13/* 77/*
14 * Hardware Events Report 78 * Hardware Events Report