aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac
diff options
context:
space:
mode:
authorMauro Carvalho Chehab <mchehab@redhat.com>2009-07-09 21:06:41 -0400
committerMauro Carvalho Chehab <mchehab@redhat.com>2010-05-10 10:44:50 -0400
commitd5381642ab01b084787925acdf26b5524d434476 (patch)
treee8d73b1a3aa701fccb57dbf7f50e60faa75a0ecc /drivers/edac
parent963c5ba35984c87963480031d1d7e2e556256ad7 (diff)
i7core_edac: Add edac_mce glue
Adds glue code to allow i7core to work with mcelog. With the glue, i7core registers itself with edac_mce. When the MCE code detects an error, it calls all registered drivers (in this case, i7core) for EDAC error handling. TODO: It currently just prints the MCE error log using about the same format as the mce panic messages. The error message should be enhanced with mcelog userspace info and converted into the proper EDAC format, to feed the EDAC error counts. Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Diffstat (limited to 'drivers/edac')
-rw-r--r--drivers/edac/edac_mce.c11
-rw-r--r--drivers/edac/i7core_edac.c111
2 files changed, 116 insertions, 6 deletions
diff --git a/drivers/edac/edac_mce.c b/drivers/edac/edac_mce.c
index b1efa8e51921..9ccdc5b140e7 100644
--- a/drivers/edac/edac_mce.c
+++ b/drivers/edac/edac_mce.c
@@ -41,9 +41,7 @@ void edac_mce_unregister(struct edac_mce *edac_mce)
41} 41}
42EXPORT_SYMBOL(edac_mce_unregister); 42EXPORT_SYMBOL(edac_mce_unregister);
43 43
44 44int edac_mce_parse(struct mce *mce)
45
46int edac_mce_queue(struct mce *mce)
47{ 45{
48 struct edac_mce *edac_mce; 46 struct edac_mce *edac_mce;
49 47
@@ -55,4 +53,9 @@ int edac_mce_queue(struct mce *mce)
55 /* Nobody queued the error */ 53 /* Nobody queued the error */
56 return 0; 54 return 0;
57} 55}
58EXPORT_SYMBOL_GPL(edac_mce_queue); 56EXPORT_SYMBOL_GPL(edac_mce_parse);
57
58MODULE_LICENSE("GPL");
59MODULE_AUTHOR("Mauro Carvalho Chehab <mchehab@redhat.com>");
60MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com)");
61MODULE_DESCRIPTION("EDAC Driver for mcelog captured errors");
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 914914759690..3c7bb5f405f6 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -27,6 +27,8 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/edac.h> 28#include <linux/edac.h>
29#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/edac_mce.h>
31#include <linux/spinlock.h>
30 32
31#include "edac_core.h" 33#include "edac_core.h"
32 34
@@ -195,6 +197,11 @@ struct i7core_pvt {
195 unsigned long ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */ 197 unsigned long ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */
196 int last_ce_count[MAX_DIMMS]; 198 int last_ce_count[MAX_DIMMS];
197 199
200 /* mcelog glue */
201 struct edac_mce edac_mce;
202 struct mce mce_entry[MCE_LOG_LEN];
203 unsigned mce_count;
204 spinlock_t mce_lock;
198}; 205};
199 206
200/* Device name and register DID (Device ID) */ 207/* Device name and register DID (Device ID) */
@@ -900,7 +907,7 @@ static ssize_t i7core_inject_enable_store(struct mem_ctl_info *mci,
900 pci_read_config_dword(pvt->pci_ch[pvt->inject.channel][0], 907 pci_read_config_dword(pvt->pci_ch[pvt->inject.channel][0],
901 MC_CHANNEL_ADDR_MATCH + 4, &rdmask2); 908 MC_CHANNEL_ADDR_MATCH + 4, &rdmask2);
902 909
903 debugf0("Inject addr match write 0x%016llx, read: 0x%08x%08x\n", 910 debugf0("Inject addr match write 0x%016llx, read: 0x%08x 0x%08x\n",
904 mask, rdmask1, rdmask2); 911 mask, rdmask1, rdmask2);
905#endif 912#endif
906#endif 913#endif
@@ -1162,9 +1169,11 @@ static void check_mc_test_err(struct mem_ctl_info *mci)
1162 new1 = DIMM1_COR_ERR(rcv0); 1169 new1 = DIMM1_COR_ERR(rcv0);
1163 new0 = DIMM0_COR_ERR(rcv0); 1170 new0 = DIMM0_COR_ERR(rcv0);
1164 1171
1172#if 0
1165 debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n", 1173 debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
1166 (pvt->ce_count_available ? "UPDATE" : "READ"), 1174 (pvt->ce_count_available ? "UPDATE" : "READ"),
1167 rcv1, rcv0, new0, new1, new2); 1175 rcv1, rcv0, new0, new1, new2);
1176#endif
1168 1177
1169 /* Updates CE counters if it is not the first time here */ 1178 /* Updates CE counters if it is not the first time here */
1170 if (pvt->ce_count_available) { 1179 if (pvt->ce_count_available) {
@@ -1195,16 +1204,97 @@ static void check_mc_test_err(struct mem_ctl_info *mci)
1195 pvt->last_ce_count[0] = new0; 1204 pvt->last_ce_count[0] = new0;
1196} 1205}
1197 1206
1207static void i7core_mce_output_error(struct mem_ctl_info *mci,
1208 struct mce *m)
1209{
1210 debugf0("CPU %d: Machine Check Exception: %16Lx"
1211 "Bank %d: %016Lx\n",
1212 m->cpu, m->mcgstatus, m->bank, m->status);
1213 if (m->ip) {
1214 debugf0("RIP%s %02x:<%016Lx>\n",
1215 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
1216 m->cs, m->ip);
1217 }
1218 printk(KERN_EMERG "TSC %llx ", m->tsc);
1219 if (m->addr)
1220 printk("ADDR %llx ", m->addr);
1221 if (m->misc)
1222 printk("MISC %llx ", m->misc);
1223
1224#if 0
1225 snprintf(msg, sizeof(msg),
1226 "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s "
1227 "RAS=%d CAS=%d %s Err=0x%lx (%s))",
1228 type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas,
1229 type, allErrors, error_name[errnum]);
1230
1231 /* Call the helper to output message */
1232 edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
1233#endif
1234}
1235
1198/* 1236/*
1199 * i7core_check_error Retrieve and process errors reported by the 1237 * i7core_check_error Retrieve and process errors reported by the
1200 * hardware. Called by the Core module. 1238 * hardware. Called by the Core module.
1201 */ 1239 */
1202static void i7core_check_error(struct mem_ctl_info *mci) 1240static void i7core_check_error(struct mem_ctl_info *mci)
1203{ 1241{
1242 struct i7core_pvt *pvt = mci->pvt_info;
1243 int i;
1244 unsigned count = 0;
1245 struct mce *m = NULL;
1246 unsigned long flags;
1247
1248 debugf0(__FILE__ ": %s()\n", __func__);
1249
1250 /* Copy all mce errors into a temporary buffer */
1251 spin_lock_irqsave(&pvt->mce_lock, flags);
1252 if (pvt->mce_count) {
1253 m = kmalloc(sizeof(*m) * pvt->mce_count, GFP_ATOMIC);
1254 if (m) {
1255 count = pvt->mce_count;
1256 memcpy(m, &pvt->mce_entry, sizeof(*m) * count);
1257 }
1258 pvt->mce_count = 0;
1259 }
1260 spin_unlock_irqrestore(&pvt->mce_lock, flags);
1261
1262 /* process mcelog errors */
1263 for (i = 0; i < count; i++)
1264 i7core_mce_output_error(mci, &m[i]);
1265
1266 kfree(m);
1267
1268 /* check memory count errors */
1204 check_mc_test_err(mci); 1269 check_mc_test_err(mci);
1205} 1270}
1206 1271
1207/* 1272/*
1273 * i7core_mce_check_error Replicates mcelog routine to get errors
1274 * This routine simply queues mcelog errors, and
1275 * return. The error itself should be handled later
1276 * by i7core_check_error.
1277 */
1278static int i7core_mce_check_error(void *priv, struct mce *mce)
1279{
1280 struct i7core_pvt *pvt = priv;
1281 unsigned long flags;
1282
1283 debugf0(__FILE__ ": %s()\n", __func__);
1284
1285 spin_lock_irqsave(&pvt->mce_lock, flags);
1286 if (pvt->mce_count < MCE_LOG_LEN) {
1287 memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce));
1288 pvt->mce_count++;
1289 }
1290 spin_unlock_irqrestore(&pvt->mce_lock, flags);
1291
1292 /* Advise mcelog that the errors were handled */
1293// return 1;
1294 return 0; // Let's duplicate the log
1295}
1296
1297/*
1208 * i7core_probe Probe for ONE instance of device to see if it is 1298 * i7core_probe Probe for ONE instance of device to see if it is
1209 * present. 1299 * present.
1210 * return: 1300 * return:
@@ -1305,6 +1395,18 @@ static int __devinit i7core_probe(struct pci_dev *pdev,
1305 pvt->inject.page = -1; 1395 pvt->inject.page = -1;
1306 pvt->inject.col = -1; 1396 pvt->inject.col = -1;
1307 1397
1398 /* Registers on edac_mce in order to receive memory errors */
1399 pvt->edac_mce.priv = pvt;
1400 pvt->edac_mce.check_error = i7core_mce_check_error;
1401 spin_lock_init(&pvt->mce_lock);
1402
1403 rc = edac_mce_register(&pvt->edac_mce);
1404 if (unlikely (rc < 0)) {
1405 debugf0("MC: " __FILE__
1406 ": %s(): failed edac_mce_register()\n", __func__);
1407 goto fail1;
1408 }
1409
1308 i7core_printk(KERN_INFO, "Driver loaded.\n"); 1410 i7core_printk(KERN_INFO, "Driver loaded.\n");
1309 1411
1310 return 0; 1412 return 0;
@@ -1324,17 +1426,22 @@ fail0:
1324static void __devexit i7core_remove(struct pci_dev *pdev) 1426static void __devexit i7core_remove(struct pci_dev *pdev)
1325{ 1427{
1326 struct mem_ctl_info *mci; 1428 struct mem_ctl_info *mci;
1429 struct i7core_pvt *pvt;
1327 1430
1328 debugf0(__FILE__ ": %s()\n", __func__); 1431 debugf0(__FILE__ ": %s()\n", __func__);
1329 1432
1330 if (i7core_pci) 1433 if (i7core_pci)
1331 edac_pci_release_generic_ctl(i7core_pci); 1434 edac_pci_release_generic_ctl(i7core_pci);
1332 1435
1333 mci = edac_mc_del_mc(&pdev->dev);
1334 1436
1437 mci = edac_mc_del_mc(&pdev->dev);
1335 if (!mci) 1438 if (!mci)
1336 return; 1439 return;
1337 1440
1441 /* Unregisters from edac_mce in order to stop receiving memory errors */
1442 pvt = mci->pvt_info;
1443 edac_mce_unregister(&pvt->edac_mce);
1444
1338 /* retrieve references to resources, and free those resources */ 1445 /* retrieve references to resources, and free those resources */
1339 i7core_put_devices(); 1446 i7core_put_devices();
1340 1447