diff options
-rw-r--r-- | drivers/edac/edac_core.h | 8 | ||||
-rw-r--r-- | drivers/edac/edac_mc.c | 72 | ||||
-rw-r--r-- | include/ras/ras_event.h | 102 |
3 files changed, 162 insertions, 20 deletions
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index f06ce9ab692c..740c7e22c023 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h | |||
@@ -463,12 +463,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, | |||
463 | const unsigned long page_frame_number, | 463 | const unsigned long page_frame_number, |
464 | const unsigned long offset_in_page, | 464 | const unsigned long offset_in_page, |
465 | const unsigned long syndrome, | 465 | const unsigned long syndrome, |
466 | const int layer0, | 466 | const int top_layer, |
467 | const int layer1, | 467 | const int mid_layer, |
468 | const int layer2, | 468 | const int low_layer, |
469 | const char *msg, | 469 | const char *msg, |
470 | const char *other_detail, | 470 | const char *other_detail, |
471 | const void *mcelog); | 471 | const void *arch_log); |
472 | 472 | ||
473 | /* | 473 | /* |
474 | * edac_device APIs | 474 | * edac_device APIs |
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 10f375032e96..ce25750a83f9 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c | |||
@@ -27,12 +27,17 @@ | |||
27 | #include <linux/list.h> | 27 | #include <linux/list.h> |
28 | #include <linux/ctype.h> | 28 | #include <linux/ctype.h> |
29 | #include <linux/edac.h> | 29 | #include <linux/edac.h> |
30 | #include <linux/bitops.h> | ||
30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
31 | #include <asm/page.h> | 32 | #include <asm/page.h> |
32 | #include <asm/edac.h> | 33 | #include <asm/edac.h> |
33 | #include "edac_core.h" | 34 | #include "edac_core.h" |
34 | #include "edac_module.h" | 35 | #include "edac_module.h" |
35 | 36 | ||
37 | #define CREATE_TRACE_POINTS | ||
38 | #define TRACE_INCLUDE_PATH ../../include/ras | ||
39 | #include <ras/ras_event.h> | ||
40 | |||
36 | /* lock to memory controller's control array */ | 41 | /* lock to memory controller's control array */ |
37 | static DEFINE_MUTEX(mem_ctls_mutex); | 42 | static DEFINE_MUTEX(mem_ctls_mutex); |
38 | static LIST_HEAD(mc_devices); | 43 | static LIST_HEAD(mc_devices); |
@@ -384,6 +389,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num, | |||
384 | * which will perform kobj unregistration and the actual free | 389 | * which will perform kobj unregistration and the actual free |
385 | * will occur during the kobject callback operation | 390 | * will occur during the kobject callback operation |
386 | */ | 391 | */ |
392 | |||
387 | return mci; | 393 | return mci; |
388 | } | 394 | } |
389 | EXPORT_SYMBOL_GPL(edac_mc_alloc); | 395 | EXPORT_SYMBOL_GPL(edac_mc_alloc); |
@@ -902,19 +908,19 @@ static void edac_ce_error(struct mem_ctl_info *mci, | |||
902 | const bool enable_per_layer_report, | 908 | const bool enable_per_layer_report, |
903 | const unsigned long page_frame_number, | 909 | const unsigned long page_frame_number, |
904 | const unsigned long offset_in_page, | 910 | const unsigned long offset_in_page, |
905 | u32 grain) | 911 | long grain) |
906 | { | 912 | { |
907 | unsigned long remapped_page; | 913 | unsigned long remapped_page; |
908 | 914 | ||
909 | if (edac_mc_get_log_ce()) { | 915 | if (edac_mc_get_log_ce()) { |
910 | if (other_detail && *other_detail) | 916 | if (other_detail && *other_detail) |
911 | edac_mc_printk(mci, KERN_WARNING, | 917 | edac_mc_printk(mci, KERN_WARNING, |
912 | "CE %s on %s (%s%s - %s)\n", | 918 | "CE %s on %s (%s %s - %s)\n", |
913 | msg, label, location, | 919 | msg, label, location, |
914 | detail, other_detail); | 920 | detail, other_detail); |
915 | else | 921 | else |
916 | edac_mc_printk(mci, KERN_WARNING, | 922 | edac_mc_printk(mci, KERN_WARNING, |
917 | "CE %s on %s (%s%s)\n", | 923 | "CE %s on %s (%s %s)\n", |
918 | msg, label, location, | 924 | msg, label, location, |
919 | detail); | 925 | detail); |
920 | } | 926 | } |
@@ -953,12 +959,12 @@ static void edac_ue_error(struct mem_ctl_info *mci, | |||
953 | if (edac_mc_get_log_ue()) { | 959 | if (edac_mc_get_log_ue()) { |
954 | if (other_detail && *other_detail) | 960 | if (other_detail && *other_detail) |
955 | edac_mc_printk(mci, KERN_WARNING, | 961 | edac_mc_printk(mci, KERN_WARNING, |
956 | "UE %s on %s (%s%s - %s)\n", | 962 | "UE %s on %s (%s %s - %s)\n", |
957 | msg, label, location, detail, | 963 | msg, label, location, detail, |
958 | other_detail); | 964 | other_detail); |
959 | else | 965 | else |
960 | edac_mc_printk(mci, KERN_WARNING, | 966 | edac_mc_printk(mci, KERN_WARNING, |
961 | "UE %s on %s (%s%s)\n", | 967 | "UE %s on %s (%s %s)\n", |
962 | msg, label, location, detail); | 968 | msg, label, location, detail); |
963 | } | 969 | } |
964 | 970 | ||
@@ -975,27 +981,50 @@ static void edac_ue_error(struct mem_ctl_info *mci, | |||
975 | } | 981 | } |
976 | 982 | ||
977 | #define OTHER_LABEL " or " | 983 | #define OTHER_LABEL " or " |
984 | |||
985 | /** | ||
986 | * edac_mc_handle_error - reports a memory event to userspace | ||
987 | * | ||
988 | * @type: severity of the error (CE/UE/Fatal) | ||
989 | * @mci: a struct mem_ctl_info pointer | ||
990 | * @page_frame_number: mem page where the error occurred | ||
991 | * @offset_in_page: offset of the error inside the page | ||
992 | * @syndrome: ECC syndrome | ||
993 | * @top_layer: Memory layer[0] position | ||
994 | * @mid_layer: Memory layer[1] position | ||
995 | * @low_layer: Memory layer[2] position | ||
996 | * @msg: Message meaningful to the end users that | ||
997 | * explains the event | ||
998 | * @other_detail: Technical details about the event that | ||
999 | * may help hardware manufacturers and | ||
1000 | * EDAC developers to analyse the event | ||
1001 | * @arch_log: Architecture-specific struct that can | ||
1002 | * be used to add extended information to the | ||
1003 | * tracepoint, like dumping MCE registers. | ||
1004 | */ | ||
978 | void edac_mc_handle_error(const enum hw_event_mc_err_type type, | 1005 | void edac_mc_handle_error(const enum hw_event_mc_err_type type, |
979 | struct mem_ctl_info *mci, | 1006 | struct mem_ctl_info *mci, |
980 | const unsigned long page_frame_number, | 1007 | const unsigned long page_frame_number, |
981 | const unsigned long offset_in_page, | 1008 | const unsigned long offset_in_page, |
982 | const unsigned long syndrome, | 1009 | const unsigned long syndrome, |
983 | const int layer0, | 1010 | const int top_layer, |
984 | const int layer1, | 1011 | const int mid_layer, |
985 | const int layer2, | 1012 | const int low_layer, |
986 | const char *msg, | 1013 | const char *msg, |
987 | const char *other_detail, | 1014 | const char *other_detail, |
988 | const void *mcelog) | 1015 | const void *arch_log) |
989 | { | 1016 | { |
990 | /* FIXME: too much for stack: move it to some pre-allocated area */ | 1017 | /* FIXME: too much for stack: move it to some pre-allocated area */ |
991 | char detail[80], location[80]; | 1018 | char detail[80], location[80]; |
992 | char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms]; | 1019 | char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms]; |
993 | char *p; | 1020 | char *p; |
994 | int row = -1, chan = -1; | 1021 | int row = -1, chan = -1; |
995 | int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 }; | 1022 | int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; |
996 | int i; | 1023 | int i; |
997 | u32 grain; | 1024 | long grain; |
998 | bool enable_per_layer_report = false; | 1025 | bool enable_per_layer_report = false; |
1026 | u16 error_count; /* FIXME: make it a parameter */ | ||
1027 | u8 grain_bits; | ||
999 | 1028 | ||
1000 | debugf3("MC%d: %s()\n", mci->mc_idx, __func__); | 1029 | debugf3("MC%d: %s()\n", mci->mc_idx, __func__); |
1001 | 1030 | ||
@@ -1045,11 +1074,11 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, | |||
1045 | for (i = 0; i < mci->tot_dimms; i++) { | 1074 | for (i = 0; i < mci->tot_dimms; i++) { |
1046 | struct dimm_info *dimm = &mci->dimms[i]; | 1075 | struct dimm_info *dimm = &mci->dimms[i]; |
1047 | 1076 | ||
1048 | if (layer0 >= 0 && layer0 != dimm->location[0]) | 1077 | if (top_layer >= 0 && top_layer != dimm->location[0]) |
1049 | continue; | 1078 | continue; |
1050 | if (layer1 >= 0 && layer1 != dimm->location[1]) | 1079 | if (mid_layer >= 0 && mid_layer != dimm->location[1]) |
1051 | continue; | 1080 | continue; |
1052 | if (layer2 >= 0 && layer2 != dimm->location[2]) | 1081 | if (low_layer >= 0 && low_layer != dimm->location[2]) |
1053 | continue; | 1082 | continue; |
1054 | 1083 | ||
1055 | /* get the max grain, over the error match range */ | 1084 | /* get the max grain, over the error match range */ |
@@ -1120,11 +1149,22 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, | |||
1120 | edac_layer_name[mci->layers[i].type], | 1149 | edac_layer_name[mci->layers[i].type], |
1121 | pos[i]); | 1150 | pos[i]); |
1122 | } | 1151 | } |
1152 | if (p > location) | ||
1153 | *(p - 1) = '\0'; | ||
1154 | |||
1155 | /* Report the error via the trace interface */ | ||
1156 | |||
1157 | error_count = 1; /* FIXME: allow changing it */ | ||
1158 | grain_bits = fls_long(grain) + 1; | ||
1159 | trace_mc_event(type, msg, label, error_count, | ||
1160 | mci->mc_idx, top_layer, mid_layer, low_layer, | ||
1161 | PAGES_TO_MiB(page_frame_number) | offset_in_page, | ||
1162 | grain_bits, syndrome, other_detail); | ||
1123 | 1163 | ||
1124 | /* Memory type dependent details about the error */ | 1164 | /* Memory type dependent details about the error */ |
1125 | if (type == HW_EVENT_ERR_CORRECTED) { | 1165 | if (type == HW_EVENT_ERR_CORRECTED) { |
1126 | snprintf(detail, sizeof(detail), | 1166 | snprintf(detail, sizeof(detail), |
1127 | "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx", | 1167 | "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx", |
1128 | page_frame_number, offset_in_page, | 1168 | page_frame_number, offset_in_page, |
1129 | grain, syndrome); | 1169 | grain, syndrome); |
1130 | edac_ce_error(mci, pos, msg, location, label, detail, | 1170 | edac_ce_error(mci, pos, msg, location, label, detail, |
@@ -1132,7 +1172,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, | |||
1132 | page_frame_number, offset_in_page, grain); | 1172 | page_frame_number, offset_in_page, grain); |
1133 | } else { | 1173 | } else { |
1134 | snprintf(detail, sizeof(detail), | 1174 | snprintf(detail, sizeof(detail), |
1135 | "page:0x%lx offset:0x%lx grain:%d", | 1175 | "page:0x%lx offset:0x%lx grain:%ld", |
1136 | page_frame_number, offset_in_page, grain); | 1176 | page_frame_number, offset_in_page, grain); |
1137 | 1177 | ||
1138 | edac_ue_error(mci, pos, msg, location, label, detail, | 1178 | edac_ue_error(mci, pos, msg, location, label, detail, |
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h new file mode 100644 index 000000000000..260470e72483 --- /dev/null +++ b/include/ras/ras_event.h | |||
@@ -0,0 +1,102 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM ras | ||
3 | #define TRACE_INCLUDE_FILE ras_event | ||
4 | |||
5 | #if !defined(_TRACE_HW_EVENT_MC_H) || defined(TRACE_HEADER_MULTI_READ) | ||
6 | #define _TRACE_HW_EVENT_MC_H | ||
7 | |||
8 | #include <linux/tracepoint.h> | ||
9 | #include <linux/edac.h> | ||
10 | #include <linux/ktime.h> | ||
11 | |||
12 | /* | ||
13 | * Hardware Events Report | ||
14 | * | ||
15 | * These events are generated when hardware detects a corrected or | ||
16 | * uncorrected error, and are meant to replace the current error-reporting | ||
17 | * APIs defined in both the EDAC and MCE subsystems. | ||
18 | * | ||
19 | * FIXME: Add events for handling memory errors originating from the | ||
20 | * MCE subsystem. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * Hardware-independent Memory Controller specific events | ||
25 | */ | ||
26 | |||
27 | /* | ||
28 | * Default error mechanisms for Memory Controller errors (CE and UE) | ||
29 | */ | ||
30 | TRACE_EVENT(mc_event, | ||
31 | |||
32 | TP_PROTO(const unsigned int err_type, | ||
33 | const char *error_msg, | ||
34 | const char *label, | ||
35 | const int error_count, | ||
36 | const u8 mc_index, | ||
37 | const s8 top_layer, | ||
38 | const s8 mid_layer, | ||
39 | const s8 low_layer, | ||
40 | unsigned long address, | ||
41 | const u8 grain_bits, | ||
42 | unsigned long syndrome, | ||
43 | const char *driver_detail), | ||
44 | |||
45 | TP_ARGS(err_type, error_msg, label, error_count, mc_index, | ||
46 | top_layer, mid_layer, low_layer, address, grain_bits, | ||
47 | syndrome, driver_detail), | ||
48 | |||
49 | TP_STRUCT__entry( | ||
50 | __field( unsigned int, error_type ) | ||
51 | __string( msg, error_msg ) | ||
52 | __string( label, label ) | ||
53 | __field( u16, error_count ) | ||
54 | __field( u8, mc_index ) | ||
55 | __field( s8, top_layer ) | ||
56 | __field( s8, middle_layer ) | ||
57 | __field( s8, lower_layer ) | ||
58 | __field( long, address ) | ||
59 | __field( u8, grain_bits ) | ||
60 | __field( long, syndrome ) | ||
61 | __string( driver_detail, driver_detail ) | ||
62 | ), | ||
63 | |||
64 | TP_fast_assign( | ||
65 | __entry->error_type = err_type; | ||
66 | __assign_str(msg, error_msg); | ||
67 | __assign_str(label, label); | ||
68 | __entry->error_count = error_count; | ||
69 | __entry->mc_index = mc_index; | ||
70 | __entry->top_layer = top_layer; | ||
71 | __entry->middle_layer = mid_layer; | ||
72 | __entry->lower_layer = low_layer; | ||
73 | __entry->address = address; | ||
74 | __entry->grain_bits = grain_bits; | ||
75 | __entry->syndrome = syndrome; | ||
76 | __assign_str(driver_detail, driver_detail); | ||
77 | ), | ||
78 | |||
79 | TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)", | ||
80 | __entry->error_count, | ||
81 | (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" : | ||
82 | ((__entry->error_type == HW_EVENT_ERR_FATAL) ? | ||
83 | "Fatal" : "Uncorrected"), | ||
84 | __entry->error_count > 1 ? "s" : "", | ||
85 | ((char *)__get_str(msg))[0] ? " " : "", | ||
86 | __get_str(msg), | ||
87 | __get_str(label), | ||
88 | __entry->mc_index, | ||
89 | __entry->top_layer, | ||
90 | __entry->middle_layer, | ||
91 | __entry->lower_layer, | ||
92 | __entry->address, | ||
93 | 1 << __entry->grain_bits, | ||
94 | __entry->syndrome, | ||
95 | ((char *)__get_str(driver_detail))[0] ? " " : "", | ||
96 | __get_str(driver_detail)) | ||
97 | ); | ||
98 | |||
99 | #endif /* _TRACE_HW_EVENT_MC_H */ | ||
100 | |||
101 | /* This part must be outside protection */ | ||
102 | #include <trace/define_trace.h> | ||