about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- drivers/edac/edac_core.h 8
-rw-r--r-- drivers/edac/edac_mc.c 72
-rw-r--r-- include/ras/ras_event.h 102
3 files changed, 162 insertions, 20 deletions
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index f06ce9ab692c..740c7e22c023 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -463,12 +463,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
463 const unsigned long page_frame_number, 463 const unsigned long page_frame_number,
464 const unsigned long offset_in_page, 464 const unsigned long offset_in_page,
465 const unsigned long syndrome, 465 const unsigned long syndrome,
466 const int layer0, 466 const int top_layer,
467 const int layer1, 467 const int mid_layer,
468 const int layer2, 468 const int low_layer,
469 const char *msg, 469 const char *msg,
470 const char *other_detail, 470 const char *other_detail,
471 const void *mcelog); 471 const void *arch_log);
472 472
473/* 473/*
474 * edac_device APIs 474 * edac_device APIs
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 10f375032e96..ce25750a83f9 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -27,12 +27,17 @@
27#include <linux/list.h> 27#include <linux/list.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/edac.h> 29#include <linux/edac.h>
30#include <linux/bitops.h>
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/page.h> 32#include <asm/page.h>
32#include <asm/edac.h> 33#include <asm/edac.h>
33#include "edac_core.h" 34#include "edac_core.h"
34#include "edac_module.h" 35#include "edac_module.h"
35 36
37#define CREATE_TRACE_POINTS
38#define TRACE_INCLUDE_PATH ../../include/ras
39#include <ras/ras_event.h>
40
36/* lock to memory controller's control array */ 41/* lock to memory controller's control array */
37static DEFINE_MUTEX(mem_ctls_mutex); 42static DEFINE_MUTEX(mem_ctls_mutex);
38static LIST_HEAD(mc_devices); 43static LIST_HEAD(mc_devices);
@@ -384,6 +389,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
384 * which will perform kobj unregistration and the actual free 389 * which will perform kobj unregistration and the actual free
385 * will occur during the kobject callback operation 390 * will occur during the kobject callback operation
386 */ 391 */
392
387 return mci; 393 return mci;
388} 394}
389EXPORT_SYMBOL_GPL(edac_mc_alloc); 395EXPORT_SYMBOL_GPL(edac_mc_alloc);
@@ -902,19 +908,19 @@ static void edac_ce_error(struct mem_ctl_info *mci,
902 const bool enable_per_layer_report, 908 const bool enable_per_layer_report,
903 const unsigned long page_frame_number, 909 const unsigned long page_frame_number,
904 const unsigned long offset_in_page, 910 const unsigned long offset_in_page,
905 u32 grain) 911 long grain)
906{ 912{
907 unsigned long remapped_page; 913 unsigned long remapped_page;
908 914
909 if (edac_mc_get_log_ce()) { 915 if (edac_mc_get_log_ce()) {
910 if (other_detail && *other_detail) 916 if (other_detail && *other_detail)
911 edac_mc_printk(mci, KERN_WARNING, 917 edac_mc_printk(mci, KERN_WARNING,
912 "CE %s on %s (%s%s - %s)\n", 918 "CE %s on %s (%s %s - %s)\n",
913 msg, label, location, 919 msg, label, location,
914 detail, other_detail); 920 detail, other_detail);
915 else 921 else
916 edac_mc_printk(mci, KERN_WARNING, 922 edac_mc_printk(mci, KERN_WARNING,
917 "CE %s on %s (%s%s)\n", 923 "CE %s on %s (%s %s)\n",
918 msg, label, location, 924 msg, label, location,
919 detail); 925 detail);
920 } 926 }
@@ -953,12 +959,12 @@ static void edac_ue_error(struct mem_ctl_info *mci,
953 if (edac_mc_get_log_ue()) { 959 if (edac_mc_get_log_ue()) {
954 if (other_detail && *other_detail) 960 if (other_detail && *other_detail)
955 edac_mc_printk(mci, KERN_WARNING, 961 edac_mc_printk(mci, KERN_WARNING,
956 "UE %s on %s (%s%s - %s)\n", 962 "UE %s on %s (%s %s - %s)\n",
957 msg, label, location, detail, 963 msg, label, location, detail,
958 other_detail); 964 other_detail);
959 else 965 else
960 edac_mc_printk(mci, KERN_WARNING, 966 edac_mc_printk(mci, KERN_WARNING,
961 "UE %s on %s (%s%s)\n", 967 "UE %s on %s (%s %s)\n",
962 msg, label, location, detail); 968 msg, label, location, detail);
963 } 969 }
964 970
@@ -975,27 +981,50 @@ static void edac_ue_error(struct mem_ctl_info *mci,
975} 981}
976 982
977#define OTHER_LABEL " or " 983#define OTHER_LABEL " or "
984
985/**
986 * edac_mc_handle_error - reports a memory event to userspace
987 *
988 * @type: severity of the error (CE/UE/Fatal)
989 * @mci: a struct mem_ctl_info pointer
990 * @page_frame_number: mem page where the error occurred
991 * @offset_in_page: offset of the error inside the page
992 * @syndrome: ECC syndrome
993 * @top_layer: Memory layer[0] position
994 * @mid_layer: Memory layer[1] position
995 * @low_layer: Memory layer[2] position
996 * @msg: Message meaningful to the end users that
997 * explains the event
998 * @other_detail: Technical details about the event that
999 * may help hardware manufacturers and
1000 * EDAC developers to analyse the event
1001 * @arch_log: Architecture-specific struct that can
1002 * be used to add extended information to the
1003 * tracepoint, like dumping MCE registers.
1004 */
978void edac_mc_handle_error(const enum hw_event_mc_err_type type, 1005void edac_mc_handle_error(const enum hw_event_mc_err_type type,
979 struct mem_ctl_info *mci, 1006 struct mem_ctl_info *mci,
980 const unsigned long page_frame_number, 1007 const unsigned long page_frame_number,
981 const unsigned long offset_in_page, 1008 const unsigned long offset_in_page,
982 const unsigned long syndrome, 1009 const unsigned long syndrome,
983 const int layer0, 1010 const int top_layer,
984 const int layer1, 1011 const int mid_layer,
985 const int layer2, 1012 const int low_layer,
986 const char *msg, 1013 const char *msg,
987 const char *other_detail, 1014 const char *other_detail,
988 const void *mcelog) 1015 const void *arch_log)
989{ 1016{
990 /* FIXME: too much for stack: move it to some pre-allocated area */ 1017 /* FIXME: too much for stack: move it to some pre-allocated area */
991 char detail[80], location[80]; 1018 char detail[80], location[80];
992 char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms]; 1019 char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
993 char *p; 1020 char *p;
994 int row = -1, chan = -1; 1021 int row = -1, chan = -1;
995 int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 }; 1022 int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
996 int i; 1023 int i;
997 u32 grain; 1024 long grain;
998 bool enable_per_layer_report = false; 1025 bool enable_per_layer_report = false;
1026 u16 error_count; /* FIXME: make it a parameter */
1027 u8 grain_bits;
999 1028
1000 debugf3("MC%d: %s()\n", mci->mc_idx, __func__); 1029 debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
1001 1030
@@ -1045,11 +1074,11 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
1045 for (i = 0; i < mci->tot_dimms; i++) { 1074 for (i = 0; i < mci->tot_dimms; i++) {
1046 struct dimm_info *dimm = &mci->dimms[i]; 1075 struct dimm_info *dimm = &mci->dimms[i];
1047 1076
1048 if (layer0 >= 0 && layer0 != dimm->location[0]) 1077 if (top_layer >= 0 && top_layer != dimm->location[0])
1049 continue; 1078 continue;
1050 if (layer1 >= 0 && layer1 != dimm->location[1]) 1079 if (mid_layer >= 0 && mid_layer != dimm->location[1])
1051 continue; 1080 continue;
1052 if (layer2 >= 0 && layer2 != dimm->location[2]) 1081 if (low_layer >= 0 && low_layer != dimm->location[2])
1053 continue; 1082 continue;
1054 1083
1055 /* get the max grain, over the error match range */ 1084 /* get the max grain, over the error match range */
@@ -1120,11 +1149,22 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
1120 edac_layer_name[mci->layers[i].type], 1149 edac_layer_name[mci->layers[i].type],
1121 pos[i]); 1150 pos[i]);
1122 } 1151 }
1152 if (p > location)
1153 *(p - 1) = '\0';
1154
1155 /* Report the error via the trace interface */
1156
1157 error_count = 1; /* FIXME: allow change it */
1158 grain_bits = fls_long(grain) + 1;
1159 trace_mc_event(type, msg, label, error_count,
1160 mci->mc_idx, top_layer, mid_layer, low_layer,
1161 PAGES_TO_MiB(page_frame_number) | offset_in_page,
1162 grain_bits, syndrome, other_detail);
1123 1163
1124 /* Memory type dependent details about the error */ 1164 /* Memory type dependent details about the error */
1125 if (type == HW_EVENT_ERR_CORRECTED) { 1165 if (type == HW_EVENT_ERR_CORRECTED) {
1126 snprintf(detail, sizeof(detail), 1166 snprintf(detail, sizeof(detail),
1127 "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx", 1167 "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
1128 page_frame_number, offset_in_page, 1168 page_frame_number, offset_in_page,
1129 grain, syndrome); 1169 grain, syndrome);
1130 edac_ce_error(mci, pos, msg, location, label, detail, 1170 edac_ce_error(mci, pos, msg, location, label, detail,
@@ -1132,7 +1172,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
1132 page_frame_number, offset_in_page, grain); 1172 page_frame_number, offset_in_page, grain);
1133 } else { 1173 } else {
1134 snprintf(detail, sizeof(detail), 1174 snprintf(detail, sizeof(detail),
1135 "page:0x%lx offset:0x%lx grain:%d", 1175 "page:0x%lx offset:0x%lx grain:%ld",
1136 page_frame_number, offset_in_page, grain); 1176 page_frame_number, offset_in_page, grain);
1137 1177
1138 edac_ue_error(mci, pos, msg, location, label, detail, 1178 edac_ue_error(mci, pos, msg, location, label, detail,
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
new file mode 100644
index 000000000000..260470e72483
--- /dev/null
+++ b/include/ras/ras_event.h
@@ -0,0 +1,102 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ras
3#define TRACE_INCLUDE_FILE ras_event
4
5#if !defined(_TRACE_HW_EVENT_MC_H) || defined(TRACE_HEADER_MULTI_READ)
6#define _TRACE_HW_EVENT_MC_H
7
8#include <linux/tracepoint.h>
9#include <linux/edac.h>
10#include <linux/ktime.h>
11
12/*
13 * Hardware Events Report
14 *
15 * These events are generated when hardware detects a corrected or
16 * uncorrected error, and are meant to replace the current error-reporting
17 * APIs defined in both the EDAC and MCE subsystems.
18 *
19 * FIXME: Add events for handling memory errors originating from the
20 * MCE subsystem.
21 */
22
23/*
24 * Hardware-independent Memory Controller specific events
25 */
26
27/*
28 * Default error mechanisms for Memory Controller errors (CE and UE)
29 */
30TRACE_EVENT(mc_event,
31
32 TP_PROTO(const unsigned int err_type,
33 const char *error_msg,
34 const char *label,
35 const int error_count,
36 const u8 mc_index,
37 const s8 top_layer,
38 const s8 mid_layer,
39 const s8 low_layer,
40 unsigned long address,
41 const u8 grain_bits,
42 unsigned long syndrome,
43 const char *driver_detail),
44
45 TP_ARGS(err_type, error_msg, label, error_count, mc_index,
46 top_layer, mid_layer, low_layer, address, grain_bits,
47 syndrome, driver_detail),
48
49 TP_STRUCT__entry(
50 __field( unsigned int, error_type )
51 __string( msg, error_msg )
52 __string( label, label )
53 __field( u16, error_count )
54 __field( u8, mc_index )
55 __field( s8, top_layer )
56 __field( s8, middle_layer )
57 __field( s8, lower_layer )
58 __field( long, address )
59 __field( u8, grain_bits )
60 __field( long, syndrome )
61 __string( driver_detail, driver_detail )
62 ),
63
64 TP_fast_assign(
65 __entry->error_type = err_type;
66 __assign_str(msg, error_msg);
67 __assign_str(label, label);
68 __entry->error_count = error_count;
69 __entry->mc_index = mc_index;
70 __entry->top_layer = top_layer;
71 __entry->middle_layer = mid_layer;
72 __entry->lower_layer = low_layer;
73 __entry->address = address;
74 __entry->grain_bits = grain_bits;
75 __entry->syndrome = syndrome;
76 __assign_str(driver_detail, driver_detail);
77 ),
78
79 TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
80 __entry->error_count,
81 (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" :
82 ((__entry->error_type == HW_EVENT_ERR_FATAL) ?
83 "Fatal" : "Uncorrected"),
84 __entry->error_count > 1 ? "s" : "",
85 ((char *)__get_str(msg))[0] ? " " : "",
86 __get_str(msg),
87 __get_str(label),
88 __entry->mc_index,
89 __entry->top_layer,
90 __entry->middle_layer,
91 __entry->lower_layer,
92 __entry->address,
93 1 << __entry->grain_bits,
94 __entry->syndrome,
95 ((char *)__get_str(driver_detail))[0] ? " " : "",
96 __get_str(driver_detail))
97);
98
99#endif /* _TRACE_HW_EVENT_MC_H */
100
101/* This part must be outside protection */
102#include <trace/define_trace.h>