aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@redhat.com>2009-06-22 21:48:29 -0400
committerMauro Carvalho Chehab <mchehab@redhat.com>2010-05-10 10:44:46 -0400
commit442305b152778f07504e9fdf64815d4841279bbe (patch)
treeded4b61a2b5bc7ba5d98e0db63b3e53049fab9d5 /drivers/edac
parent87d1d272ba25a1863e40ebb1df4bc0eed7a8fd11 (diff)
i7core_edac: Add a memory check routine, based on device 3 function 4
This function appears only in the Xeon 5500 datasheet. Yet, testing with a Xeon 3503 showed that it is also implemented on other Nehalem processors. At the first read, MC_TEST_ERR_RCV1 and MC_TEST_ERR_RCV0 can contain any value. Modify the CE error logic to update the error count only after the second read. An alternative approach would be to write to the rcv0 and rcv1 registers, but it seemed better to keep them untouched, since the BIOS might assume that they are reserved for its own use. Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/i7core_edac.c115
1 files changed, 108 insertions, 7 deletions
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 190596af601a..b5dbc2b83961 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -62,6 +62,18 @@
62#define MC_STATUS 0x4c 62#define MC_STATUS 0x4c
63#define MC_MAX_DOD 0x64 63#define MC_MAX_DOD 0x64
64 64
65/*
66 * OFFSETS for Device 3 Function 4, as indicated on Xeon 5500 datasheet:
67 * http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
68 */
69
70#define MC_TEST_ERR_RCV1 0x60 /* config-space offset read by check_mc_test_err() */
71 #define DIMM2_COR_ERR(r) ((r) & 0x7fff) /* bits 14:0 of RCV1: DIMM 2 corrected error count */
72
73#define MC_TEST_ERR_RCV0 0x64 /* config-space offset read by check_mc_test_err() */
74 #define DIMM1_COR_ERR(r) (((r) >> 16) & 0x7fff) /* bits 30:16 of RCV0: DIMM 1 corrected error count */
75 #define DIMM0_COR_ERR(r) ((r) & 0x7fff) /* bits 14:0 of RCV0: DIMM 0 corrected error count */
76
65 /* OFFSETS for Devices 4,5 and 6 Function 0 */ 77 /* OFFSETS for Devices 4,5 and 6 Function 0 */
66 78
67#define MC_CHANNEL_DIMM_INIT_PARAMS 0x58 79#define MC_CHANNEL_DIMM_INIT_PARAMS 0x58
@@ -136,8 +148,9 @@
136 */ 148 */
137 149
138#define NUM_CHANS 3 150#define NUM_CHANS 3
139#define NUM_MCR_FUNCS 4 151#define MAX_DIMMS 3 /* Max DIMMS per channel */
140#define NUM_CHAN_FUNCS 3 152#define MAX_MCR_FUNC 4
153#define MAX_CHAN_FUNC 3
141 154
142struct i7core_info { 155struct i7core_info {
143 u32 mc_control; 156 u32 mc_control;
@@ -159,8 +172,8 @@ struct i7core_inject {
159}; 172};
160 173
161struct i7core_channel { 174struct i7core_channel {
162 u32 ranks; 175 u32 ranks;
163 u32 dimms; 176 u32 dimms;
164}; 177};
165 178
166struct pci_id_descr { 179struct pci_id_descr {
@@ -171,11 +184,16 @@ struct pci_id_descr {
171}; 184};
172 185
173struct i7core_pvt { 186struct i7core_pvt {
174 struct pci_dev *pci_mcr[NUM_MCR_FUNCS]; 187 struct pci_dev *pci_mcr[MAX_MCR_FUNC + 1];
175 struct pci_dev *pci_ch[NUM_CHANS][NUM_CHAN_FUNCS]; 188 struct pci_dev *pci_ch[NUM_CHANS][MAX_CHAN_FUNC + 1];
176 struct i7core_info info; 189 struct i7core_info info;
177 struct i7core_inject inject; 190 struct i7core_inject inject;
178 struct i7core_channel channel[NUM_CHANS]; 191 struct i7core_channel channel[NUM_CHANS];
192
193 int ce_count_available;
194 unsigned long ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */
195 int last_ce_count[MAX_DIMMS];
196
179}; 197};
180 198
181/* Device name and register DID (Device ID) */ 199/* Device name and register DID (Device ID) */
@@ -749,6 +767,19 @@ static ssize_t i7core_inject_enable_show(struct mem_ctl_info *mci,
749 return sprintf(data, "%d\n", pvt->inject.enable); 767 return sprintf(data, "%d\n", pvt->inject.enable);
750} 768}
751 769
770static ssize_t i7core_ce_regs_show(struct mem_ctl_info *mci, char *data)
771{
772 struct i7core_pvt *pvt = mci->pvt_info;
773 /* .show handler of the "corrected_error_counts" attribute; returns sprintf()'s result. */
774 if (!pvt->ce_count_available) /* flag is set by check_mc_test_err() on its first run */
775 return sprintf(data, "unavailable\n");
776 /* One output line per DIMM with the accumulated corrected error count. */
777 return sprintf(data, "dimm0: %lu\ndimm1: %lu\ndimm2: %lu\n",
778 pvt->ce_count[0],
779 pvt->ce_count[1],
780 pvt->ce_count[2]);
781}
782
752/* 783/*
753 * Sysfs struct 784 * Sysfs struct
754 */ 785 */
@@ -789,6 +820,13 @@ static struct mcidev_sysfs_attribute i7core_inj_attrs[] = {
789 }, 820 },
790 .show = i7core_inject_enable_show, 821 .show = i7core_inject_enable_show,
791 .store = i7core_inject_enable_store, 822 .store = i7core_inject_enable_store,
823 }, {
824 .attr = {
825 .name = "corrected_error_counts",
826 .mode = (S_IRUGO | S_IWUSR)
827 },
828 .show = i7core_ce_regs_show,
829 .store = NULL,
792 }, 830 },
793}; 831};
794 832
@@ -879,13 +917,76 @@ static int i7core_get_devices(struct mem_ctl_info *mci, struct pci_dev *mcidev)
879 return 0; 917 return 0;
880} 918}
881 919
920/****************************************************************************
921 Error check routines
922 ****************************************************************************/
923
924/* This function is based on the device 3 function 4 registers as described on:
925 * Intel Xeon Processor 5500 Series Datasheet Volume 2
926 * http://www.intel.com/Assets/PDF/datasheet/321322.pdf
927 * also available at:
928 * http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
929 */
930static void check_mc_test_err(struct mem_ctl_info *mci)
931{
932 struct i7core_pvt *pvt = mci->pvt_info;
933 u32 rcv1, rcv0;
934 int new0, new1, new2;
935
936 if (!pvt->pci_mcr[4]) {
937 debugf0("%s MCR registers not found\n",__func__);
938 return;
939 }
940
941 /* Corrected error reads */
942 pci_read_config_dword(pvt->pci_mcr[4], MC_TEST_ERR_RCV1, &rcv1);
943 pci_read_config_dword(pvt->pci_mcr[4], MC_TEST_ERR_RCV0, &rcv0);
944
945 /* Store the new values */
946 new2 = DIMM2_COR_ERR(rcv1);
947 new1 = DIMM1_COR_ERR(rcv0);
948 new0 = DIMM0_COR_ERR(rcv0);
949
950 debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
951 (pvt->ce_count_available ? "UPDATE" : "READ"),
952 rcv1, rcv0, new0, new1, new2);
953
954 /* Updates CE counters if it is not the first time here */
955 if (pvt->ce_count_available) {
956 /* Updates CE counters */
957 int add0, add1, add2;
958
959 add2 = new2 - pvt->last_ce_count[2];
960 add1 = new1 - pvt->last_ce_count[1];
961 add0 = new0 - pvt->last_ce_count[0];
962
963 if (add2 < 0)
964 add2 += 0x7fff;
965 pvt->ce_count[2] += add2;
966
967 if (add1 < 0)
968 add1 += 0x7fff;
969 pvt->ce_count[1] += add1;
970
971 if (add0 < 0)
972 add0 += 0x7fff;
973 pvt->ce_count[0] += add0;
974 } else
975 pvt->ce_count_available = 1;
976
977 /* Store the new values */
978 pvt->last_ce_count[2] = new2;
979 pvt->last_ce_count[1] = new1;
980 pvt->last_ce_count[0] = new0;
981}
982
882/* 983/*
883 * i7core_check_error Retrieve and process errors reported by the 984 * i7core_check_error Retrieve and process errors reported by the
884 * hardware. Called by the Core module. 985 * hardware. Called by the Core module.
885 */ 986 */
886static void i7core_check_error(struct mem_ctl_info *mci) 987static void i7core_check_error(struct mem_ctl_info *mci)
887{ 988{
888 /* FIXME: need a real code here */ 989 check_mc_test_err(mci);
889} 990}
890 991
891/* 992/*