author    Linus Torvalds <torvalds@linux-foundation.org>  2013-02-28 23:42:33 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-02-28 23:42:33 -0500
commit    ad6c2c2eb34f234d6253292b9b3c047614fbfe7e (patch)
tree      8ceb00db9874c09f3002b5ca579f1f9146b30a28 /drivers/edac
parent    19cc90f58d4f2538b4cf5371681a057d2e5209f2 (diff)
parent    b0769891ba7baa53f270dc70d71934748beb4c5b (diff)
Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac
Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab:
 "For:
   - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac);
   - error injection support for i5100, when EDAC debug is enabled;
   - fix edac when it is loaded builtin (early init for the subsystem);
   - a "Firmware First" EDAC driver, allowing ghes to report errors via
     EDAC (ghes-edac).

  With regards to ghes-edac, this fixes a longstanding BZ at Red Hat
  that happens with Nehalem and Sandy Bridge CPUs: when both GHES and
  i7core_edac or sb_edac are running, the error reports are
  unpredictable, as both BIOS and OS race to access the registers.
  With ghes-edac, the EDAC core will refuse to register any other
  concurrent memory error driver.

  This patchset moves the ghes struct definitions to a separate header
  file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to
  register/unregister and to report errors via ghes-edac.  Those
  changes were acked by ghes driver maintainer (Huang)."

* 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits)
  i5100_edac: convert to use simple_open()
  ghes_edac: fix to use list_for_each_entry_safe() when delete list items
  ghes_edac: Fix RAS tracing
  ghes_edac: Make it compliant with UEFI spec 2.3.1
  ghes_edac: Improve driver's printk messages
  ghes_edac: Don't credit the same memory dimm twice
  ghes_edac: do a better job of filling EDAC DIMM info
  ghes_edac: add support for reporting errors via EDAC
  ghes_edac: Register at EDAC core the BIOS report
  ghes: add the needed hooks for EDAC error report
  ghes: move structures/enum to a header file
  edac: add support for error type "Info"
  edac: add support for raw error reports
  edac: reduce stack pressure by using a pre-allocated buffer
  edac: lock module owner to avoid error report conflicts
  edac: remove proc_name from mci structure
  edac: add a new memory layer type
  edac: initialize the core earlier
  edac: better report error conditions in debug mode
  i5100_edac: Remove two checkpatch warnings
  ...
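The "refuse to register any other concurrent memory error driver" behaviour is implemented by the new edac_mc_owner check in edac_mc_add_mc() (see the edac_mc.c hunks below): the first registrant's mod_name becomes the owner, and later registrations from other modules fail. A minimal sketch of what a hardware-driven driver would see once ghes_edac owns the core; the probe function and driver name here are illustrative, not part of the patch:

/*
 * Sketch only: ghes_edac registered first, so edac_mc_owner points at its
 * mod_name and any other module's edac_mc_add_mc() now returns -EPERM.
 */
static int example_chipset_probe(struct mem_ctl_info *mci)
{
	int rc;

	mci->mod_name = "example_chipset_edac";	/* hypothetical driver */

	rc = edac_mc_add_mc(mci);
	if (rc < 0)	/* -EPERM while ghes_edac ("Firmware First") owns the core */
		return rc;

	return 0;
}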
Diffstat (limited to 'drivers/edac')
-rw-r--r--  drivers/edac/Kconfig          |  23
-rw-r--r--  drivers/edac/Makefile         |   1
-rw-r--r--  drivers/edac/edac_core.h      |   5
-rw-r--r--  drivers/edac/edac_mc.c        | 152
-rw-r--r--  drivers/edac/edac_mc_sysfs.c  |  36
-rw-r--r--  drivers/edac/edac_module.c    |   2
-rw-r--r--  drivers/edac/edac_pci_sysfs.c |   2
-rw-r--r--  drivers/edac/ghes_edac.c      | 537
-rw-r--r--  drivers/edac/i3200_edac.c     |  37
-rw-r--r--  drivers/edac/i5100_edac.c     | 178
-rw-r--r--  drivers/edac/i7core_edac.c    |   8
-rw-r--r--  drivers/edac/sb_edac.c        |   2
12 files changed, 899 insertions, 84 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index acb709bfac0f..e443f2c1dfd1 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -80,6 +80,29 @@ config EDAC_MM_EDAC
 	  occurred so that a particular failing memory module can be
 	  replaced.  If unsure, select 'Y'.
 
+config EDAC_GHES
+	bool "Output ACPI APEI/GHES BIOS detected errors via EDAC"
+	depends on ACPI_APEI_GHES && (EDAC_MM_EDAC=y)
+	default y
+	help
+	  Not all machines support hardware-driven error report. Some of those
+	  provide a BIOS-driven error report mechanism via ACPI, using the
+	  APEI/GHES driver. By enabling this option, the error reports provided
+	  by GHES are sent to userspace via the EDAC API.
+
+	  When this option is enabled, it will disable the hardware-driven
+	  mechanisms, if a GHES BIOS is detected, entering into the
+	  "Firmware First" mode.
+
+	  It should be noticed that keeping both GHES and a hardware-driven
+	  error mechanism won't work well, as BIOS will race with OS, while
+	  reading the error registers. So, if you want to not use "Firmware
+	  first" GHES error mechanism, you should disable GHES either at
+	  compilation time or by passing "ghes.disable=1" Kernel parameter
+	  at boot time.
+
+	  In doubt, say 'Y'.
+
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64) K8, F10h"
 	depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 5608a9ba61b7..4154ed6a02c6 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -16,6 +16,7 @@ ifdef CONFIG_PCI
 edac_core-y	+= edac_pci.o edac_pci_sysfs.o
 endif
 
+obj-$(CONFIG_EDAC_GHES)		+= ghes_edac.o
 obj-$(CONFIG_EDAC_MCE_INJ)	+= mce_amd_inj.o
 
 edac_mce_amd-y			:= mce_amd.o
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index 23bb99fa44f1..3c2625e7980d 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -453,6 +453,11 @@ extern struct mem_ctl_info *find_mci_by_dev(struct device *dev);
 extern struct mem_ctl_info *edac_mc_del_mc(struct device *dev);
 extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci,
 				      unsigned long page);
+
+void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
+			      struct mem_ctl_info *mci,
+			      struct edac_raw_error_desc *e);
+
 void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 			  struct mem_ctl_info *mci,
 			  const u16 error_count,
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index d1e9eb191f2b..cdb81aa73ab7 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -42,6 +42,12 @@
 static DEFINE_MUTEX(mem_ctls_mutex);
 static LIST_HEAD(mc_devices);
 
+/*
+ * Used to lock EDAC MC to just one module, avoiding two drivers e. g.
+ * apei/ghes and i7core_edac to be used at the same time.
+ */
+static void const *edac_mc_owner;
+
 unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
 				 unsigned len)
 {
@@ -441,13 +447,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
 
 	mci->op_state = OP_ALLOC;
 
-	/* at this point, the root kobj is valid, and in order to
-	 * 'free' the object, then the function:
-	 *	edac_mc_unregister_sysfs_main_kobj() must be called
-	 * which will perform kobj unregistration and the actual free
-	 * will occur during the kobject callback operation
-	 */
-
 	return mci;
 
 error:
@@ -666,9 +665,9 @@ fail1:
 	return 1;
 }
 
-static void del_mc_from_global_list(struct mem_ctl_info *mci)
+static int del_mc_from_global_list(struct mem_ctl_info *mci)
 {
-	atomic_dec(&edac_handlers);
+	int handlers = atomic_dec_return(&edac_handlers);
 	list_del_rcu(&mci->link);
 
 	/* these are for safe removal of devices from global list while
@@ -676,6 +675,8 @@ static void del_mc_from_global_list(struct mem_ctl_info *mci)
 	 */
 	synchronize_rcu();
 	INIT_LIST_HEAD(&mci->link);
+
+	return handlers;
 }
 
 /**
@@ -719,6 +720,7 @@ EXPORT_SYMBOL(edac_mc_find);
 /* FIXME - should a warning be printed if no error detection? correction? */
 int edac_mc_add_mc(struct mem_ctl_info *mci)
 {
+	int ret = -EINVAL;
 	edac_dbg(0, "\n");
 
 #ifdef CONFIG_EDAC_DEBUG
@@ -749,6 +751,11 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 #endif
 	mutex_lock(&mem_ctls_mutex);
 
+	if (edac_mc_owner && edac_mc_owner != mci->mod_name) {
+		ret = -EPERM;
+		goto fail0;
+	}
+
 	if (add_mc_to_global_list(mci))
 		goto fail0;
 
@@ -775,6 +782,8 @@ int edac_mc_add_mc(struct mem_ctl_info *mci)
 	edac_mc_printk(mci, KERN_INFO, "Giving out device to '%s' '%s':"
 		" DEV %s\n", mci->mod_name, mci->ctl_name, edac_dev_name(mci));
 
+	edac_mc_owner = mci->mod_name;
+
 	mutex_unlock(&mem_ctls_mutex);
 	return 0;
 
@@ -783,7 +792,7 @@ fail1:
 
 fail0:
 	mutex_unlock(&mem_ctls_mutex);
-	return 1;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(edac_mc_add_mc);
 
@@ -809,7 +818,8 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
 		return NULL;
 	}
 
-	del_mc_from_global_list(mci);
+	if (!del_mc_from_global_list(mci))
+		edac_mc_owner = NULL;
 	mutex_unlock(&mem_ctls_mutex);
 
 	/* flush workq processes */
@@ -907,6 +917,7 @@ const char *edac_layer_name[] = {
 	[EDAC_MC_LAYER_CHANNEL] = "channel",
 	[EDAC_MC_LAYER_SLOT] = "slot",
 	[EDAC_MC_LAYER_CHIP_SELECT] = "csrow",
+	[EDAC_MC_LAYER_ALL_MEM] = "memory",
 };
 EXPORT_SYMBOL_GPL(edac_layer_name);
 
@@ -1054,7 +1065,46 @@ static void edac_ue_error(struct mem_ctl_info *mci,
 	edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count);
 }
 
-#define OTHER_LABEL " or "
+/**
+ * edac_raw_mc_handle_error - reports a memory event to userspace without doing
+ *			      anything to discover the error location
+ *
+ * @type:		severity of the error (CE/UE/Fatal)
+ * @mci:		a struct mem_ctl_info pointer
+ * @e:			error description
+ *
+ * This raw function is used internally by edac_mc_handle_error(). It should
+ * only be called directly when the hardware error come directly from BIOS,
+ * like in the case of APEI GHES driver.
+ */
+void edac_raw_mc_handle_error(const enum hw_event_mc_err_type type,
+			      struct mem_ctl_info *mci,
+			      struct edac_raw_error_desc *e)
+{
+	char detail[80];
+	int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer };
+
+	/* Memory type dependent details about the error */
+	if (type == HW_EVENT_ERR_CORRECTED) {
+		snprintf(detail, sizeof(detail),
+			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
+			e->page_frame_number, e->offset_in_page,
+			e->grain, e->syndrome);
+		edac_ce_error(mci, e->error_count, pos, e->msg, e->location, e->label,
+			      detail, e->other_detail, e->enable_per_layer_report,
+			      e->page_frame_number, e->offset_in_page, e->grain);
+	} else {
+		snprintf(detail, sizeof(detail),
+			"page:0x%lx offset:0x%lx grain:%ld",
+			e->page_frame_number, e->offset_in_page, e->grain);
+
+		edac_ue_error(mci, e->error_count, pos, e->msg, e->location, e->label,
+			      detail, e->other_detail, e->enable_per_layer_report);
+	}
+
+
+}
+EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error);
 
 /**
  * edac_mc_handle_error - reports a memory event to userspace
@@ -1086,19 +1136,27 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 			  const char *msg,
 			  const char *other_detail)
 {
-	/* FIXME: too much for stack: move it to some pre-alocated area */
-	char detail[80], location[80];
-	char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
 	char *p;
 	int row = -1, chan = -1;
 	int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
-	int i;
-	long grain;
-	bool enable_per_layer_report = false;
+	int i, n_labels = 0;
 	u8 grain_bits;
+	struct edac_raw_error_desc *e = &mci->error_desc;
 
 	edac_dbg(3, "MC%d\n", mci->mc_idx);
 
+	/* Fills the error report buffer */
+	memset(e, 0, sizeof (*e));
+	e->error_count = error_count;
+	e->top_layer = top_layer;
+	e->mid_layer = mid_layer;
+	e->low_layer = low_layer;
+	e->page_frame_number = page_frame_number;
+	e->offset_in_page = offset_in_page;
+	e->syndrome = syndrome;
+	e->msg = msg;
+	e->other_detail = other_detail;
+
 	/*
 	 * Check if the event report is consistent and if the memory
 	 * location is known. If it is known, enable_per_layer_report will be
@@ -1121,7 +1179,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 			pos[i] = -1;
 		}
 		if (pos[i] >= 0)
-			enable_per_layer_report = true;
+			e->enable_per_layer_report = true;
 	}
 
 	/*
@@ -1135,8 +1193,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 	 * where each memory belongs to a separate channel within the same
 	 * branch.
	 */
-	grain = 0;
-	p = label;
+	p = e->label;
 	*p = '\0';
 
 	for (i = 0; i < mci->tot_dimms; i++) {
@@ -1150,8 +1207,8 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 			continue;
 
 		/* get the max grain, over the error match range */
-		if (dimm->grain > grain)
-			grain = dimm->grain;
+		if (dimm->grain > e->grain)
+			e->grain = dimm->grain;
 
 		/*
 		 * If the error is memory-controller wide, there's no need to
@@ -1159,8 +1216,13 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 		 * channel/memory controller/... may be affected.
 		 * Also, don't show errors for empty DIMM slots.
 		 */
-		if (enable_per_layer_report && dimm->nr_pages) {
-			if (p != label) {
+		if (e->enable_per_layer_report && dimm->nr_pages) {
+			if (n_labels >= EDAC_MAX_LABELS) {
+				e->enable_per_layer_report = false;
+				break;
+			}
+			n_labels++;
+			if (p != e->label) {
 				strcpy(p, OTHER_LABEL);
 				p += strlen(OTHER_LABEL);
 			}
@@ -1187,12 +1249,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 		}
 	}
 
-	if (!enable_per_layer_report) {
-		strcpy(label, "any memory");
+	if (!e->enable_per_layer_report) {
+		strcpy(e->label, "any memory");
 	} else {
 		edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan);
-		if (p == label)
-			strcpy(label, "unknown memory");
+		if (p == e->label)
+			strcpy(e->label, "unknown memory");
 		if (type == HW_EVENT_ERR_CORRECTED) {
 			if (row >= 0) {
 				mci->csrows[row]->ce_count += error_count;
@@ -1205,7 +1267,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 	}
 
 	/* Fill the RAM location data */
-	p = location;
+	p = e->location;
 
 	for (i = 0; i < mci->n_layers; i++) {
 		if (pos[i] < 0)
@@ -1215,32 +1277,16 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 			 edac_layer_name[mci->layers[i].type],
 			 pos[i]);
 	}
-	if (p > location)
+	if (p > e->location)
 		*(p - 1) = '\0';
 
 	/* Report the error via the trace interface */
-	grain_bits = fls_long(grain) + 1;
-	trace_mc_event(type, msg, label, error_count,
-		       mci->mc_idx, top_layer, mid_layer, low_layer,
-		       PAGES_TO_MiB(page_frame_number) | offset_in_page,
-		       grain_bits, syndrome, other_detail);
+	grain_bits = fls_long(e->grain) + 1;
+	trace_mc_event(type, e->msg, e->label, e->error_count,
+		       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
+		       PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page,
+		       grain_bits, e->syndrome, e->other_detail);
 
-	/* Memory type dependent details about the error */
-	if (type == HW_EVENT_ERR_CORRECTED) {
-		snprintf(detail, sizeof(detail),
-			"page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
-			page_frame_number, offset_in_page,
-			grain, syndrome);
-		edac_ce_error(mci, error_count, pos, msg, location, label,
-			      detail, other_detail, enable_per_layer_report,
-			      page_frame_number, offset_in_page, grain);
-	} else {
-		snprintf(detail, sizeof(detail),
-			"page:0x%lx offset:0x%lx grain:%ld",
-			page_frame_number, offset_in_page, grain);
-
-		edac_ue_error(mci, error_count, pos, msg, location, label,
-			      detail, other_detail, enable_per_layer_report);
-	}
+	edac_raw_mc_handle_error(type, mci, e);
 }
 EXPORT_SYMBOL_GPL(edac_mc_handle_error);
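The hunks above split the reporting path in two: edac_mc_handle_error() keeps doing the DIMM-label, grain and location discovery into the pre-allocated mci->error_desc buffer (no more large on-stack label array), then hands off to edac_raw_mc_handle_error(), which a firmware-first caller can also invoke directly with a descriptor it filled itself. A minimal sketch of that direct path, assuming the BIOS already decoded the error; every value below is illustrative:

/* Sketch: report a BIOS-decoded corrected error, skipping location discovery. */
static void example_report_from_bios(struct mem_ctl_info *mci)
{
	struct edac_raw_error_desc *e = &mci->error_desc;	/* pre-allocated buffer */

	memset(e, 0, sizeof(*e));
	e->error_count = 1;
	e->top_layer = e->mid_layer = e->low_layer = -1;	/* location unknown to the OS */
	e->page_frame_number = 0x12345;				/* illustrative values */
	e->offset_in_page = 0x40;
	e->grain = 128;
	e->msg = "Single-bit ECC";
	strcpy(e->label, "unknown label");

	edac_raw_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, e);
}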
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 0ca1ca71157f..4f4b6137d74e 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -7,7 +7,7 @@
  *
  * Written Doug Thompson <norsk5@xmission.com> www.softwarebitmaker.com
  *
- * (c) 2012 - Mauro Carvalho Chehab <mchehab@redhat.com>
+ * (c) 2012-2013 - Mauro Carvalho Chehab <mchehab@redhat.com>
  *	The entire API were re-written, and ported to use struct device
  *
  */
@@ -429,8 +429,12 @@ static int edac_create_csrow_objects(struct mem_ctl_info *mci)
 		if (!nr_pages_per_csrow(csrow))
 			continue;
 		err = edac_create_csrow_object(mci, mci->csrows[i], i);
-		if (err < 0)
+		if (err < 0) {
+			edac_dbg(1,
+				 "failure: create csrow objects for csrow %d\n",
+				 i);
 			goto error;
+		}
 	}
 	return 0;
 
@@ -677,9 +681,6 @@ static ssize_t mci_sdram_scrub_rate_store(struct device *dev,
 	unsigned long bandwidth = 0;
 	int new_bw = 0;
 
-	if (!mci->set_sdram_scrub_rate)
-		return -ENODEV;
-
 	if (strict_strtoul(data, 10, &bandwidth) < 0)
 		return -EINVAL;
 
@@ -703,9 +704,6 @@ static ssize_t mci_sdram_scrub_rate_show(struct device *dev,
 	struct mem_ctl_info *mci = to_mci(dev);
 	int bandwidth = 0;
 
-	if (!mci->get_sdram_scrub_rate)
-		return -ENODEV;
-
 	bandwidth = mci->get_sdram_scrub_rate(mci);
 	if (bandwidth < 0) {
 		edac_printk(KERN_DEBUG, EDAC_MC, "Error reading scrub rate\n");
@@ -866,8 +864,7 @@ DEVICE_ATTR(ce_count, S_IRUGO, mci_ce_count_show, NULL);
 DEVICE_ATTR(max_location, S_IRUGO, mci_max_location_show, NULL);
 
 /* memory scrubber attribute file */
-DEVICE_ATTR(sdram_scrub_rate, S_IRUGO | S_IWUSR, mci_sdram_scrub_rate_show,
-	mci_sdram_scrub_rate_store);
+DEVICE_ATTR(sdram_scrub_rate, 0, NULL, NULL);
 
 static struct attribute *mci_attrs[] = {
 	&dev_attr_reset_counters.attr,
@@ -878,7 +875,6 @@ static struct attribute *mci_attrs[] = {
 	&dev_attr_ce_noinfo_count.attr,
 	&dev_attr_ue_count.attr,
 	&dev_attr_ce_count.attr,
-	&dev_attr_sdram_scrub_rate.attr,
 	&dev_attr_max_location.attr,
 	NULL
 };
@@ -1007,11 +1003,28 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci)
 	edac_dbg(0, "creating device %s\n", dev_name(&mci->dev));
 	err = device_add(&mci->dev);
 	if (err < 0) {
+		edac_dbg(1, "failure: create device %s\n", dev_name(&mci->dev));
 		bus_unregister(&mci->bus);
 		kfree(mci->bus.name);
 		return err;
 	}
 
+	if (mci->set_sdram_scrub_rate || mci->get_sdram_scrub_rate) {
+		if (mci->get_sdram_scrub_rate) {
+			dev_attr_sdram_scrub_rate.attr.mode |= S_IRUGO;
+			dev_attr_sdram_scrub_rate.show = &mci_sdram_scrub_rate_show;
+		}
+		if (mci->set_sdram_scrub_rate) {
+			dev_attr_sdram_scrub_rate.attr.mode |= S_IWUSR;
+			dev_attr_sdram_scrub_rate.store = &mci_sdram_scrub_rate_store;
+		}
+		err = device_create_file(&mci->dev,
+					 &dev_attr_sdram_scrub_rate);
+		if (err) {
+			edac_dbg(1, "failure: create sdram_scrub_rate\n");
+			goto fail2;
+		}
+	}
 	/*
 	 * Create the dimm/rank devices
 	 */
@@ -1056,6 +1069,7 @@ fail:
 			continue;
 		device_unregister(&dimm->dev);
 	}
+fail2:
 	device_unregister(&mci->dev);
 	bus_unregister(&mci->bus);
 	kfree(mci->bus.name);
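With the change above, sdram_scrub_rate is no longer a static attribute: it is created during edac_create_sysfs_mci_device() only if the mci actually provides a get and/or set callback, and its mode/show/store bits are filled in from whichever callbacks exist, which is why the in-handler -ENODEV checks could be dropped. A driver opts in simply by setting the hooks before registration; a minimal sketch (the function names are illustrative, and the get callback is assumed to follow the usual int (*)(struct mem_ctl_info *) form used by the show path):

/* Sketch: exposing a read-only sdram_scrub_rate file. */
static int example_get_scrub_rate(struct mem_ctl_info *mci)
{
	return 0;	/* would return the configured scrub bandwidth */
}

static void example_setup_mci(struct mem_ctl_info *mci)
{
	/* Only the get hook is set, so the attribute is created with S_IRUGO only. */
	mci->get_sdram_scrub_rate = example_get_scrub_rate;
	/* Leaving set_sdram_scrub_rate NULL means no S_IWUSR bit and no store method. */
}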
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 12c951a2c33d..a66941fea5a4 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -146,7 +146,7 @@ static void __exit edac_exit(void)
 /*
  * Inform the kernel of our entry and exit points
  */
-module_init(edac_init);
+subsys_initcall(edac_init);
 module_exit(edac_exit);
 
 MODULE_LICENSE("GPL");
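Moving edac_init() from module_init() to subsys_initcall() is the "initialize the core earlier" change from the shortlog: when EDAC is built in, the core now comes up at the subsys initcall level, ahead of device-level initcalls, so builtin consumers (including ghes_edac, which registers from the GHES probe path) never call into an uninitialized core. A sketch of the resulting ordering; the consumer initcall name below is purely illustrative:

/* EDAC core (this patch): initialized at the subsys level. */
subsys_initcall(edac_init);

/* A builtin consumer at the normal device level now always runs afterwards. */
device_initcall(example_edac_consumer_init);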
diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c
index 0056c4dae9d5..e8658e451762 100644
--- a/drivers/edac/edac_pci_sysfs.c
+++ b/drivers/edac/edac_pci_sysfs.c
@@ -429,8 +429,8 @@ static void edac_pci_main_kobj_teardown(void)
 	if (atomic_dec_return(&edac_pci_sysfs_refcount) == 0) {
 		edac_dbg(0, "called kobject_put on main kobj\n");
 		kobject_put(edac_pci_top_main_kobj);
+		edac_put_sysfs_subsys();
 	}
-	edac_put_sysfs_subsys();
 }
 
 /*
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
new file mode 100644
index 000000000000..bb534670ec02
--- /dev/null
+++ b/drivers/edac/ghes_edac.c
@@ -0,0 +1,537 @@
1/*
2 * GHES/EDAC Linux driver
3 *
4 * This file may be distributed under the terms of the GNU General Public
5 * License version 2.
6 *
7 * Copyright (c) 2013 by Mauro Carvalho Chehab <mchehab@redhat.com>
8 *
9 * Red Hat Inc. http://www.redhat.com
10 */
11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14#include <acpi/ghes.h>
15#include <linux/edac.h>
16#include <linux/dmi.h>
17#include "edac_core.h"
18#include <ras/ras_event.h>
19
20#define GHES_EDAC_REVISION " Ver: 1.0.0"
21
22struct ghes_edac_pvt {
23 struct list_head list;
24 struct ghes *ghes;
25 struct mem_ctl_info *mci;
26
27 /* Buffers for the error handling routine */
28 char detail_location[240];
29 char other_detail[160];
30 char msg[80];
31};
32
33static LIST_HEAD(ghes_reglist);
34static DEFINE_MUTEX(ghes_edac_lock);
35static int ghes_edac_mc_num;
36
37
38/* Memory Device - Type 17 of SMBIOS spec */
39struct memdev_dmi_entry {
40 u8 type;
41 u8 length;
42 u16 handle;
43 u16 phys_mem_array_handle;
44 u16 mem_err_info_handle;
45 u16 total_width;
46 u16 data_width;
47 u16 size;
48 u8 form_factor;
49 u8 device_set;
50 u8 device_locator;
51 u8 bank_locator;
52 u8 memory_type;
53 u16 type_detail;
54 u16 speed;
55 u8 manufacturer;
56 u8 serial_number;
57 u8 asset_tag;
58 u8 part_number;
59 u8 attributes;
60 u32 extended_size;
61 u16 conf_mem_clk_speed;
62} __attribute__((__packed__));
63
64struct ghes_edac_dimm_fill {
65 struct mem_ctl_info *mci;
66 unsigned count;
67};
68
69char *memory_type[] = {
70 [MEM_EMPTY] = "EMPTY",
71 [MEM_RESERVED] = "RESERVED",
72 [MEM_UNKNOWN] = "UNKNOWN",
73 [MEM_FPM] = "FPM",
74 [MEM_EDO] = "EDO",
75 [MEM_BEDO] = "BEDO",
76 [MEM_SDR] = "SDR",
77 [MEM_RDR] = "RDR",
78 [MEM_DDR] = "DDR",
79 [MEM_RDDR] = "RDDR",
80 [MEM_RMBS] = "RMBS",
81 [MEM_DDR2] = "DDR2",
82 [MEM_FB_DDR2] = "FB_DDR2",
83 [MEM_RDDR2] = "RDDR2",
84 [MEM_XDR] = "XDR",
85 [MEM_DDR3] = "DDR3",
86 [MEM_RDDR3] = "RDDR3",
87};
88
89static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
90{
91 int *num_dimm = arg;
92
93 if (dh->type == DMI_ENTRY_MEM_DEVICE)
94 (*num_dimm)++;
95}
96
97static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
98{
99 struct ghes_edac_dimm_fill *dimm_fill = arg;
100 struct mem_ctl_info *mci = dimm_fill->mci;
101
102 if (dh->type == DMI_ENTRY_MEM_DEVICE) {
103 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
104 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
105 mci->n_layers,
106 dimm_fill->count, 0, 0);
107
108 if (entry->size == 0xffff) {
109 pr_info("Can't get DIMM%i size\n",
110 dimm_fill->count);
111 dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
112 } else if (entry->size == 0x7fff) {
113 dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
114 } else {
115 if (entry->size & 1 << 15)
116 dimm->nr_pages = MiB_TO_PAGES((entry->size &
117 0x7fff) << 10);
118 else
119 dimm->nr_pages = MiB_TO_PAGES(entry->size);
120 }
121
122 switch (entry->memory_type) {
123 case 0x12:
124 if (entry->type_detail & 1 << 13)
125 dimm->mtype = MEM_RDDR;
126 else
127 dimm->mtype = MEM_DDR;
128 break;
129 case 0x13:
130 if (entry->type_detail & 1 << 13)
131 dimm->mtype = MEM_RDDR2;
132 else
133 dimm->mtype = MEM_DDR2;
134 break;
135 case 0x14:
136 dimm->mtype = MEM_FB_DDR2;
137 break;
138 case 0x18:
139 if (entry->type_detail & 1 << 13)
140 dimm->mtype = MEM_RDDR3;
141 else
142 dimm->mtype = MEM_DDR3;
143 break;
144 default:
145 if (entry->type_detail & 1 << 6)
146 dimm->mtype = MEM_RMBS;
147 else if ((entry->type_detail & ((1 << 7) | (1 << 13)))
148 == ((1 << 7) | (1 << 13)))
149 dimm->mtype = MEM_RDR;
150 else if (entry->type_detail & 1 << 7)
151 dimm->mtype = MEM_SDR;
152 else if (entry->type_detail & 1 << 9)
153 dimm->mtype = MEM_EDO;
154 else
155 dimm->mtype = MEM_UNKNOWN;
156 }
157
158 /*
159 * Actually, we can only detect if the memory has bits for
160 * checksum or not
161 */
162 if (entry->total_width == entry->data_width)
163 dimm->edac_mode = EDAC_NONE;
164 else
165 dimm->edac_mode = EDAC_SECDED;
166
167 dimm->dtype = DEV_UNKNOWN;
168 dimm->grain = 128; /* Likely, worse case */
169
170 /*
171 * FIXME: It shouldn't be hard to also fill the DIMM labels
172 */
173
174 if (dimm->nr_pages) {
175 edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
176 dimm_fill->count, memory_type[dimm->mtype],
177 PAGES_TO_MiB(dimm->nr_pages),
178 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
179 edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
180 entry->memory_type, entry->type_detail,
181 entry->total_width, entry->data_width);
182 }
183
184 dimm_fill->count++;
185 }
186}
187
188void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
189 struct cper_sec_mem_err *mem_err)
190{
191 enum hw_event_mc_err_type type;
192 struct edac_raw_error_desc *e;
193 struct mem_ctl_info *mci;
194 struct ghes_edac_pvt *pvt = NULL;
195 char *p;
196 u8 grain_bits;
197
198 list_for_each_entry(pvt, &ghes_reglist, list) {
199 if (ghes == pvt->ghes)
200 break;
201 }
202 if (!pvt) {
203 pr_err("Internal error: Can't find EDAC structure\n");
204 return;
205 }
206 mci = pvt->mci;
207 e = &mci->error_desc;
208
209 /* Cleans the error report buffer */
210 memset(e, 0, sizeof (*e));
211 e->error_count = 1;
212 strcpy(e->label, "unknown label");
213 e->msg = pvt->msg;
214 e->other_detail = pvt->other_detail;
215 e->top_layer = -1;
216 e->mid_layer = -1;
217 e->low_layer = -1;
218 *pvt->other_detail = '\0';
219 *pvt->msg = '\0';
220
221 switch (sev) {
222 case GHES_SEV_CORRECTED:
223 type = HW_EVENT_ERR_CORRECTED;
224 break;
225 case GHES_SEV_RECOVERABLE:
226 type = HW_EVENT_ERR_UNCORRECTED;
227 break;
228 case GHES_SEV_PANIC:
229 type = HW_EVENT_ERR_FATAL;
230 break;
231 default:
232 case GHES_SEV_NO:
233 type = HW_EVENT_ERR_INFO;
234 }
235
236 edac_dbg(1, "error validation_bits: 0x%08llx\n",
237 (long long)mem_err->validation_bits);
238
239 /* Error type, mapped on e->msg */
240 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
241 p = pvt->msg;
242 switch (mem_err->error_type) {
243 case 0:
244 p += sprintf(p, "Unknown");
245 break;
246 case 1:
247 p += sprintf(p, "No error");
248 break;
249 case 2:
250 p += sprintf(p, "Single-bit ECC");
251 break;
252 case 3:
253 p += sprintf(p, "Multi-bit ECC");
254 break;
255 case 4:
256 p += sprintf(p, "Single-symbol ChipKill ECC");
257 break;
258 case 5:
259 p += sprintf(p, "Multi-symbol ChipKill ECC");
260 break;
261 case 6:
262 p += sprintf(p, "Master abort");
263 break;
264 case 7:
265 p += sprintf(p, "Target abort");
266 break;
267 case 8:
268 p += sprintf(p, "Parity Error");
269 break;
270 case 9:
271 p += sprintf(p, "Watchdog timeout");
272 break;
273 case 10:
274 p += sprintf(p, "Invalid address");
275 break;
276 case 11:
277 p += sprintf(p, "Mirror Broken");
278 break;
279 case 12:
280 p += sprintf(p, "Memory Sparing");
281 break;
282 case 13:
283 p += sprintf(p, "Scrub corrected error");
284 break;
285 case 14:
286 p += sprintf(p, "Scrub uncorrected error");
287 break;
288 case 15:
289 p += sprintf(p, "Physical Memory Map-out event");
290 break;
291 default:
292 p += sprintf(p, "reserved error (%d)",
293 mem_err->error_type);
294 }
295 } else {
296 strcpy(pvt->msg, "unknown error");
297 }
298
299 /* Error address */
300 if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
301 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
302 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
303 }
304
305 /* Error grain */
306 if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) {
307 e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
308 }
309
310 /* Memory error location, mapped on e->location */
311 p = e->location;
312 if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
313 p += sprintf(p, "node:%d ", mem_err->node);
314 if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
315 p += sprintf(p, "card:%d ", mem_err->card);
316 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
317 p += sprintf(p, "module:%d ", mem_err->module);
318 if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
319 p += sprintf(p, "bank:%d ", mem_err->bank);
320 if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
321 p += sprintf(p, "row:%d ", mem_err->row);
322 if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
323 p += sprintf(p, "col:%d ", mem_err->column);
324 if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
325 p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
326 if (p > e->location)
327 *(p - 1) = '\0';
328
329 /* All other fields are mapped on e->other_detail */
330 p = pvt->other_detail;
331 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
332 u64 status = mem_err->error_status;
333
334 p += sprintf(p, "status(0x%016llx): ", (long long)status);
335 switch ((status >> 8) & 0xff) {
336 case 1:
337 p += sprintf(p, "Error detected internal to the component ");
338 break;
339 case 16:
340 p += sprintf(p, "Error detected in the bus ");
341 break;
342 case 4:
343 p += sprintf(p, "Storage error in DRAM memory ");
344 break;
345 case 5:
346 p += sprintf(p, "Storage error in TLB ");
347 break;
348 case 6:
349 p += sprintf(p, "Storage error in cache ");
350 break;
351 case 7:
352 p += sprintf(p, "Error in one or more functional units ");
353 break;
354 case 8:
355 p += sprintf(p, "component failed self test ");
356 break;
357 case 9:
358 p += sprintf(p, "Overflow or undervalue of internal queue ");
359 break;
360 case 17:
361 p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
362 break;
363 case 18:
364 p += sprintf(p, "Improper access error ");
365 break;
366 case 19:
367 p += sprintf(p, "Access to a memory address which is not mapped to any component ");
368 break;
369 case 20:
370 p += sprintf(p, "Loss of Lockstep ");
371 break;
372 case 21:
373 p += sprintf(p, "Response not associated with a request ");
374 break;
375 case 22:
376 p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
377 break;
378 case 23:
379 p += sprintf(p, "Detection of a PATH_ERROR ");
380 break;
381 case 25:
382 p += sprintf(p, "Bus operation timeout ");
383 break;
384 case 26:
385 p += sprintf(p, "A read was issued to data that has been poisoned ");
386 break;
387 default:
388 p += sprintf(p, "reserved ");
389 break;
390 }
391 }
392 if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
393 p += sprintf(p, "requestorID: 0x%016llx ",
394 (long long)mem_err->requestor_id);
395 if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
396 p += sprintf(p, "responderID: 0x%016llx ",
397 (long long)mem_err->responder_id);
398 if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
399 p += sprintf(p, "targetID: 0x%016llx ",
400 (long long)mem_err->responder_id);
401 if (p > pvt->other_detail)
402 *(p - 1) = '\0';
403
404 /* Generate the trace event */
405 grain_bits = fls_long(e->grain);
406 sprintf(pvt->detail_location, "APEI location: %s %s",
407 e->location, e->other_detail);
408 trace_mc_event(type, e->msg, e->label, e->error_count,
409 mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
410 PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page,
411 grain_bits, e->syndrome, pvt->detail_location);
412
413 /* Report the error via EDAC API */
414 edac_raw_mc_handle_error(type, mci, e);
415}
416EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
417
418int ghes_edac_register(struct ghes *ghes, struct device *dev)
419{
420 bool fake = false;
421 int rc, num_dimm = 0;
422 struct mem_ctl_info *mci;
423 struct edac_mc_layer layers[1];
424 struct ghes_edac_pvt *pvt;
425 struct ghes_edac_dimm_fill dimm_fill;
426
427 /* Get the number of DIMMs */
428 dmi_walk(ghes_edac_count_dimms, &num_dimm);
429
430 /* Check if we've got a bogus BIOS */
431 if (num_dimm == 0) {
432 fake = true;
433 num_dimm = 1;
434 }
435
436 layers[0].type = EDAC_MC_LAYER_ALL_MEM;
437 layers[0].size = num_dimm;
438 layers[0].is_virt_csrow = true;
439
440 /*
441 * We need to serialize edac_mc_alloc() and edac_mc_add_mc(),
442 * to avoid duplicated memory controller numbers
443 */
444 mutex_lock(&ghes_edac_lock);
445 mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
446 sizeof(*pvt));
447 if (!mci) {
448 pr_info("Can't allocate memory for EDAC data\n");
449 mutex_unlock(&ghes_edac_lock);
450 return -ENOMEM;
451 }
452
453 pvt = mci->pvt_info;
454 memset(pvt, 0, sizeof(*pvt));
455 list_add_tail(&pvt->list, &ghes_reglist);
456 pvt->ghes = ghes;
457 pvt->mci = mci;
458 mci->pdev = dev;
459
460 mci->mtype_cap = MEM_FLAG_EMPTY;
461 mci->edac_ctl_cap = EDAC_FLAG_NONE;
462 mci->edac_cap = EDAC_FLAG_NONE;
463 mci->mod_name = "ghes_edac.c";
464 mci->mod_ver = GHES_EDAC_REVISION;
465 mci->ctl_name = "ghes_edac";
466 mci->dev_name = "ghes";
467
468 if (!ghes_edac_mc_num) {
469 if (!fake) {
470 pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
471 pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
472 pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
473 pr_info("If you find incorrect reports, please contact your hardware vendor\n");
474 pr_info("to correct its BIOS.\n");
475 pr_info("This system has %d DIMM sockets.\n",
476 num_dimm);
477 } else {
478 pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
479 pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
480 pr_info("work on such system. Use this driver with caution\n");
481 }
482 }
483
484 if (!fake) {
485 /*
486 * Fill DIMM info from DMI for the memory controller #0
487 *
488 * Keep it in blank for the other memory controllers, as
489 * there's no reliable way to properly credit each DIMM to
490 * the memory controller, as different BIOSes fill the
491 * DMI bank location fields on different ways
492 */
493 if (!ghes_edac_mc_num) {
494 dimm_fill.count = 0;
495 dimm_fill.mci = mci;
496 dmi_walk(ghes_edac_dmidecode, &dimm_fill);
497 }
498 } else {
499 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
500 mci->n_layers, 0, 0, 0);
501
502 dimm->nr_pages = 1;
503 dimm->grain = 128;
504 dimm->mtype = MEM_UNKNOWN;
505 dimm->dtype = DEV_UNKNOWN;
506 dimm->edac_mode = EDAC_SECDED;
507 }
508
509 rc = edac_mc_add_mc(mci);
510 if (rc < 0) {
511 pr_info("Can't register at EDAC core\n");
512 edac_mc_free(mci);
513 mutex_unlock(&ghes_edac_lock);
514 return -ENODEV;
515 }
516
517 ghes_edac_mc_num++;
518 mutex_unlock(&ghes_edac_lock);
519 return 0;
520}
521EXPORT_SYMBOL_GPL(ghes_edac_register);
522
523void ghes_edac_unregister(struct ghes *ghes)
524{
525 struct mem_ctl_info *mci;
526 struct ghes_edac_pvt *pvt, *tmp;
527
528 list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) {
529 if (ghes == pvt->ghes) {
530 mci = pvt->mci;
531 edac_mc_del_mc(mci->pdev);
532 edac_mc_free(mci);
533 list_del(&pvt->list);
534 }
535 }
536}
537EXPORT_SYMBOL_GPL(ghes_edac_unregister);
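The apei/ghes.c changes referenced in the merge description are outside this diffstat (which is limited to drivers/edac), but the three symbols exported above are the hooks it calls. A rough sketch of how the GHES driver is expected to wire them up; the surrounding function names are illustrative only:

/* Sketch of the apei/ghes.c side (not part of this drivers/edac diff). */
static int example_ghes_probe(struct ghes *ghes, struct device *dev)
{
	/* One EDAC memory controller is registered per GHES error source. */
	return ghes_edac_register(ghes, dev);
}

static void example_ghes_handle_memory(struct ghes *ghes, int sev,
				       struct cper_sec_mem_err *mem_err)
{
	/* Forward the CPER memory error record so EDAC can trace and count it. */
	ghes_edac_report_mem_error(ghes, sev, mem_err);
}

static void example_ghes_remove(struct ghes *ghes)
{
	ghes_edac_unregister(ghes);
}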
diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c
index 4e8337602e78..aa44c1718f50 100644
--- a/drivers/edac/i3200_edac.c
+++ b/drivers/edac/i3200_edac.c
@@ -106,16 +106,26 @@ static int nr_channels;
 
 static int how_many_channels(struct pci_dev *pdev)
 {
+	int n_channels;
+
 	unsigned char capid0_8b; /* 8th byte of CAPID0 */
 
 	pci_read_config_byte(pdev, I3200_CAPID0 + 8, &capid0_8b);
+
 	if (capid0_8b & 0x20) {	/* check DCD: Dual Channel Disable */
 		edac_dbg(0, "In single channel mode\n");
-		return 1;
+		n_channels = 1;
 	} else {
 		edac_dbg(0, "In dual channel mode\n");
-		return 2;
+		n_channels = 2;
 	}
+
+	if (capid0_8b & 0x10) /* check if both channels are filled */
+		edac_dbg(0, "2 DIMMS per channel disabled\n");
+	else
+		edac_dbg(0, "2 DIMMS per channel enabled\n");
+
+	return n_channels;
 }
 
 static unsigned long eccerrlog_syndrome(u64 log)
@@ -290,6 +300,8 @@ static void i3200_get_drbs(void __iomem *window,
 	for (i = 0; i < I3200_RANKS_PER_CHANNEL; i++) {
 		drbs[0][i] = readw(window + I3200_C0DRB + 2*i) & I3200_DRB_MASK;
 		drbs[1][i] = readw(window + I3200_C1DRB + 2*i) & I3200_DRB_MASK;
+
+		edac_dbg(0, "drb[0][%d] = %d, drb[1][%d] = %d\n", i, drbs[0][i], i, drbs[1][i]);
 	}
 }
 
@@ -311,6 +323,9 @@ static unsigned long drb_to_nr_pages(
 	int n;
 
 	n = drbs[channel][rank];
+	if (!n)
+		return 0;
+
 	if (rank > 0)
 		n -= drbs[channel][rank - 1];
 	if (stacked && (channel == 1) &&
@@ -377,19 +392,19 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx)
 	 * cumulative; the last one will contain the total memory
 	 * contained in all ranks.
 	 */
-	for (i = 0; i < mci->nr_csrows; i++) {
+	for (i = 0; i < I3200_DIMMS; i++) {
 		unsigned long nr_pages;
-		struct csrow_info *csrow = mci->csrows[i];
 
-		nr_pages = drb_to_nr_pages(drbs, stacked,
-			i / I3200_RANKS_PER_CHANNEL,
-			i % I3200_RANKS_PER_CHANNEL);
+		for (j = 0; j < nr_channels; j++) {
+			struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
+							       mci->n_layers, i, j, 0);
 
-		if (nr_pages == 0)
-			continue;
+			nr_pages = drb_to_nr_pages(drbs, stacked, j, i);
+			if (nr_pages == 0)
+				continue;
 
-		for (j = 0; j < nr_channels; j++) {
-			struct dimm_info *dimm = csrow->channels[j]->dimm;
+			edac_dbg(0, "csrow %d, channel %d%s, size = %ld Mb\n", i, j,
+				 stacked ? " (stacked)" : "", PAGES_TO_MiB(nr_pages));
 
 			dimm->nr_pages = nr_pages;
 			dimm->grain = nr_pages << PAGE_SHIFT;
diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c
index d6955b2cc99f..1b635178cc44 100644
--- a/drivers/edac/i5100_edac.c
+++ b/drivers/edac/i5100_edac.c
@@ -27,6 +27,7 @@
27#include <linux/edac.h> 27#include <linux/edac.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/debugfs.h>
30 31
31#include "edac_core.h" 32#include "edac_core.h"
32 33
@@ -68,6 +69,14 @@
68 I5100_FERR_NF_MEM_M1ERR_MASK) 69 I5100_FERR_NF_MEM_M1ERR_MASK)
69#define I5100_NERR_NF_MEM 0xa4 /* MC Next Non-Fatal Errors */ 70#define I5100_NERR_NF_MEM 0xa4 /* MC Next Non-Fatal Errors */
70#define I5100_EMASK_MEM 0xa8 /* MC Error Mask Register */ 71#define I5100_EMASK_MEM 0xa8 /* MC Error Mask Register */
72#define I5100_MEM0EINJMSK0 0x200 /* Injection Mask0 Register Channel 0 */
73#define I5100_MEM1EINJMSK0 0x208 /* Injection Mask0 Register Channel 1 */
74#define I5100_MEMXEINJMSK0_EINJEN (1 << 27)
75#define I5100_MEM0EINJMSK1 0x204 /* Injection Mask1 Register Channel 0 */
76#define I5100_MEM1EINJMSK1 0x206 /* Injection Mask1 Register Channel 1 */
77
78/* Device 19, Function 0 */
79#define I5100_DINJ0 0x9a
71 80
72/* device 21 and 22, func 0 */ 81/* device 21 and 22, func 0 */
73#define I5100_MTR_0 0x154 /* Memory Technology Registers 0-3 */ 82#define I5100_MTR_0 0x154 /* Memory Technology Registers 0-3 */
@@ -338,13 +347,26 @@ struct i5100_priv {
338 unsigned ranksperchan; /* number of ranks per channel */ 347 unsigned ranksperchan; /* number of ranks per channel */
339 348
340 struct pci_dev *mc; /* device 16 func 1 */ 349 struct pci_dev *mc; /* device 16 func 1 */
350 struct pci_dev *einj; /* device 19 func 0 */
341 struct pci_dev *ch0mm; /* device 21 func 0 */ 351 struct pci_dev *ch0mm; /* device 21 func 0 */
342 struct pci_dev *ch1mm; /* device 22 func 0 */ 352 struct pci_dev *ch1mm; /* device 22 func 0 */
343 353
344 struct delayed_work i5100_scrubbing; 354 struct delayed_work i5100_scrubbing;
345 int scrub_enable; 355 int scrub_enable;
356
357 /* Error injection */
358 u8 inject_channel;
359 u8 inject_hlinesel;
360 u8 inject_deviceptr1;
361 u8 inject_deviceptr2;
362 u16 inject_eccmask1;
363 u16 inject_eccmask2;
364
365 struct dentry *debugfs;
346}; 366};
347 367
368static struct dentry *i5100_debugfs;
369
348/* map a rank/chan to a slot number on the mainboard */ 370/* map a rank/chan to a slot number on the mainboard */
349static int i5100_rank_to_slot(const struct mem_ctl_info *mci, 371static int i5100_rank_to_slot(const struct mem_ctl_info *mci,
350 int chan, int rank) 372 int chan, int rank)
@@ -863,13 +885,126 @@ static void i5100_init_csrows(struct mem_ctl_info *mci)
863 } 885 }
864} 886}
865 887
888/****************************************************************************
889 * Error injection routines
890 ****************************************************************************/
891
892static void i5100_do_inject(struct mem_ctl_info *mci)
893{
894 struct i5100_priv *priv = mci->pvt_info;
895 u32 mask0;
896 u16 mask1;
897
898 /* MEM[1:0]EINJMSK0
899 * 31 - ADDRMATCHEN
900 * 29:28 - HLINESEL
901 * 00 Reserved
902 * 01 Lower half of cache line
903 * 10 Upper half of cache line
904 * 11 Both upper and lower parts of cache line
905 * 27 - EINJEN
906 * 25:19 - XORMASK1 for deviceptr1
907 * 9:5 - SEC2RAM for deviceptr2
908 * 4:0 - FIR2RAM for deviceptr1
909 */
910 mask0 = ((priv->inject_hlinesel & 0x3) << 28) |
911 I5100_MEMXEINJMSK0_EINJEN |
912 ((priv->inject_eccmask1 & 0xffff) << 10) |
913 ((priv->inject_deviceptr2 & 0x1f) << 5) |
914 (priv->inject_deviceptr1 & 0x1f);
915
916 /* MEM[1:0]EINJMSK1
917 * 15:0 - XORMASK2 for deviceptr2
918 */
919 mask1 = priv->inject_eccmask2;
920
921 if (priv->inject_channel == 0) {
922 pci_write_config_dword(priv->mc, I5100_MEM0EINJMSK0, mask0);
923 pci_write_config_word(priv->mc, I5100_MEM0EINJMSK1, mask1);
924 } else {
925 pci_write_config_dword(priv->mc, I5100_MEM1EINJMSK0, mask0);
926 pci_write_config_word(priv->mc, I5100_MEM1EINJMSK1, mask1);
927 }
928
929 /* Error Injection Response Function
930 * Intel 5100 Memory Controller Hub Chipset (318378) datasheet
931 * hints about this register but carry no data about them. All
932 * data regarding device 19 is based on experimentation and the
933 * Intel 7300 Chipset Memory Controller Hub (318082) datasheet
934 * which appears to be accurate for the i5100 in this area.
935 *
936 * The injection code don't work without setting this register.
937 * The register needs to be flipped off then on else the hardware
938 * will only preform the first injection.
939 *
940 * Stop condition bits 7:4
941 * 1010 - Stop after one injection
942 * 1011 - Never stop injecting faults
943 *
944 * Start condition bits 3:0
945 * 1010 - Never start
946 * 1011 - Start immediately
947 */
948 pci_write_config_byte(priv->einj, I5100_DINJ0, 0xaa);
949 pci_write_config_byte(priv->einj, I5100_DINJ0, 0xab);
950}
951
952#define to_mci(k) container_of(k, struct mem_ctl_info, dev)
953static ssize_t inject_enable_write(struct file *file, const char __user *data,
954 size_t count, loff_t *ppos)
955{
956 struct device *dev = file->private_data;
957 struct mem_ctl_info *mci = to_mci(dev);
958
959 i5100_do_inject(mci);
960
961 return count;
962}
963
964static const struct file_operations i5100_inject_enable_fops = {
965 .open = simple_open,
966 .write = inject_enable_write,
967 .llseek = generic_file_llseek,
968};
969
970static int i5100_setup_debugfs(struct mem_ctl_info *mci)
971{
972 struct i5100_priv *priv = mci->pvt_info;
973
974 if (!i5100_debugfs)
975 return -ENODEV;
976
977 priv->debugfs = debugfs_create_dir(mci->bus.name, i5100_debugfs);
978
979 if (!priv->debugfs)
980 return -ENOMEM;
981
982 debugfs_create_x8("inject_channel", S_IRUGO | S_IWUSR, priv->debugfs,
983 &priv->inject_channel);
984 debugfs_create_x8("inject_hlinesel", S_IRUGO | S_IWUSR, priv->debugfs,
985 &priv->inject_hlinesel);
986 debugfs_create_x8("inject_deviceptr1", S_IRUGO | S_IWUSR, priv->debugfs,
987 &priv->inject_deviceptr1);
988 debugfs_create_x8("inject_deviceptr2", S_IRUGO | S_IWUSR, priv->debugfs,
989 &priv->inject_deviceptr2);
990 debugfs_create_x16("inject_eccmask1", S_IRUGO | S_IWUSR, priv->debugfs,
991 &priv->inject_eccmask1);
992 debugfs_create_x16("inject_eccmask2", S_IRUGO | S_IWUSR, priv->debugfs,
993 &priv->inject_eccmask2);
994 debugfs_create_file("inject_enable", S_IWUSR, priv->debugfs,
995 &mci->dev, &i5100_inject_enable_fops);
996
997 return 0;
998
999}
1000
866static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id) 1001static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
867{ 1002{
868 int rc; 1003 int rc;
869 struct mem_ctl_info *mci; 1004 struct mem_ctl_info *mci;
870 struct edac_mc_layer layers[2]; 1005 struct edac_mc_layer layers[2];
871 struct i5100_priv *priv; 1006 struct i5100_priv *priv;
872 struct pci_dev *ch0mm, *ch1mm; 1007 struct pci_dev *ch0mm, *ch1mm, *einj;
873 int ret = 0; 1008 int ret = 0;
874 u32 dw; 1009 u32 dw;
875 int ranksperch; 1010 int ranksperch;
@@ -941,6 +1076,22 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
941 goto bail_disable_ch1; 1076 goto bail_disable_ch1;
942 } 1077 }
943 1078
1079
1080 /* device 19, func 0, Error injection */
1081 einj = pci_get_device_func(PCI_VENDOR_ID_INTEL,
1082 PCI_DEVICE_ID_INTEL_5100_19, 0);
1083 if (!einj) {
1084 ret = -ENODEV;
1085 goto bail_einj;
1086 }
1087
1088 rc = pci_enable_device(einj);
1089 if (rc < 0) {
1090 ret = rc;
1091 goto bail_disable_einj;
1092 }
1093
1094
944 mci->pdev = &pdev->dev; 1095 mci->pdev = &pdev->dev;
945 1096
946 priv = mci->pvt_info; 1097 priv = mci->pvt_info;
@@ -948,6 +1099,7 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
948 priv->mc = pdev; 1099 priv->mc = pdev;
949 priv->ch0mm = ch0mm; 1100 priv->ch0mm = ch0mm;
950 priv->ch1mm = ch1mm; 1101 priv->ch1mm = ch1mm;
1102 priv->einj = einj;
951 1103
952 INIT_DELAYED_WORK(&(priv->i5100_scrubbing), i5100_refresh_scrubbing); 1104 INIT_DELAYED_WORK(&(priv->i5100_scrubbing), i5100_refresh_scrubbing);
953 1105
@@ -975,6 +1127,13 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
975 mci->set_sdram_scrub_rate = i5100_set_scrub_rate; 1127 mci->set_sdram_scrub_rate = i5100_set_scrub_rate;
976 mci->get_sdram_scrub_rate = i5100_get_scrub_rate; 1128 mci->get_sdram_scrub_rate = i5100_get_scrub_rate;
977 1129
1130 priv->inject_channel = 0;
1131 priv->inject_hlinesel = 0;
1132 priv->inject_deviceptr1 = 0;
1133 priv->inject_deviceptr2 = 0;
1134 priv->inject_eccmask1 = 0;
1135 priv->inject_eccmask2 = 0;
1136
978 i5100_init_csrows(mci); 1137 i5100_init_csrows(mci);
979 1138
980 /* this strange construction seems to be in every driver, dunno why */ 1139 /* this strange construction seems to be in every driver, dunno why */
@@ -992,6 +1151,8 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
992 goto bail_scrub; 1151 goto bail_scrub;
993 } 1152 }
994 1153
1154 i5100_setup_debugfs(mci);
1155
995 return ret; 1156 return ret;
996 1157
997bail_scrub: 1158bail_scrub:
@@ -999,6 +1160,12 @@ bail_scrub:
999 cancel_delayed_work_sync(&(priv->i5100_scrubbing)); 1160 cancel_delayed_work_sync(&(priv->i5100_scrubbing));
1000 edac_mc_free(mci); 1161 edac_mc_free(mci);
1001 1162
1163bail_disable_einj:
1164 pci_disable_device(einj);
1165
1166bail_einj:
1167 pci_dev_put(einj);
1168
1002bail_disable_ch1: 1169bail_disable_ch1:
1003 pci_disable_device(ch1mm); 1170 pci_disable_device(ch1mm);
1004 1171
@@ -1030,14 +1197,18 @@ static void i5100_remove_one(struct pci_dev *pdev)
1030 1197
1031 priv = mci->pvt_info; 1198 priv = mci->pvt_info;
1032 1199
1200 debugfs_remove_recursive(priv->debugfs);
1201
1033 priv->scrub_enable = 0; 1202 priv->scrub_enable = 0;
1034 cancel_delayed_work_sync(&(priv->i5100_scrubbing)); 1203 cancel_delayed_work_sync(&(priv->i5100_scrubbing));
1035 1204
1036 pci_disable_device(pdev); 1205 pci_disable_device(pdev);
1037 pci_disable_device(priv->ch0mm); 1206 pci_disable_device(priv->ch0mm);
1038 pci_disable_device(priv->ch1mm); 1207 pci_disable_device(priv->ch1mm);
1208 pci_disable_device(priv->einj);
1039 pci_dev_put(priv->ch0mm); 1209 pci_dev_put(priv->ch0mm);
1040 pci_dev_put(priv->ch1mm); 1210 pci_dev_put(priv->ch1mm);
1211 pci_dev_put(priv->einj);
1041 1212
1042 edac_mc_free(mci); 1213 edac_mc_free(mci);
1043} 1214}
@@ -1060,13 +1231,16 @@ static int __init i5100_init(void)
1060{ 1231{
1061 int pci_rc; 1232 int pci_rc;
1062 1233
1063 pci_rc = pci_register_driver(&i5100_driver); 1234 i5100_debugfs = debugfs_create_dir("i5100_edac", NULL);
1064 1235
1236 pci_rc = pci_register_driver(&i5100_driver);
1065 return (pci_rc < 0) ? pci_rc : 0; 1237 return (pci_rc < 0) ? pci_rc : 0;
1066} 1238}
1067 1239
1068static void __exit i5100_exit(void) 1240static void __exit i5100_exit(void)
1069{ 1241{
1242 debugfs_remove(i5100_debugfs);
1243
1070 pci_unregister_driver(&i5100_driver); 1244 pci_unregister_driver(&i5100_driver);
1071} 1245}
1072 1246
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index e213d030b0dd..0ec3e95a12cd 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -420,21 +420,21 @@ static inline int numdimms(u32 dimms)
 
 static inline int numrank(u32 rank)
 {
-	static int ranks[4] = { 1, 2, 4, -EINVAL };
+	static const int ranks[] = { 1, 2, 4, -EINVAL };
 
 	return ranks[rank & 0x3];
 }
 
 static inline int numbank(u32 bank)
 {
-	static int banks[4] = { 4, 8, 16, -EINVAL };
+	static const int banks[] = { 4, 8, 16, -EINVAL };
 
 	return banks[bank & 0x3];
 }
 
 static inline int numrow(u32 row)
 {
-	static int rows[8] = {
+	static const int rows[] = {
 		1 << 12, 1 << 13, 1 << 14, 1 << 15,
 		1 << 16, -EINVAL, -EINVAL, -EINVAL,
 	};
@@ -444,7 +444,7 @@ static inline int numrow(u32 row)
 
 static inline int numcol(u32 col)
 {
-	static int cols[8] = {
+	static const int cols[] = {
 		1 << 10, 1 << 11, 1 << 12, -EINVAL,
 	};
 	return cols[col & 0x3];
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index da7e2986e3d5..57244f995614 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -639,7 +639,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci)
 	tmp_mb = (1 + pvt->tohm) >> 20;
 
 	mb = div_u64_rem(tmp_mb, 1000, &kb);
-	edac_dbg(0, "TOHM: %u.%03u GB (0x%016Lx)", mb, kb, (u64)pvt->tohm);
+	edac_dbg(0, "TOHM: %u.%03u GB (0x%016Lx)\n", mb, kb, (u64)pvt->tohm);
 
 	/*
 	 * Step 2) Get SAD range and SAD Interleave list