about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- drivers/edac/edac_mce.c 11
-rw-r--r-- drivers/edac/i7core_edac.c 111
2 files changed, 116 insertions, 6 deletions
diff --git a/drivers/edac/edac_mce.c b/drivers/edac/edac_mce.c
index b1efa8e51921..9ccdc5b140e7 100644
--- a/drivers/edac/edac_mce.c
+++ b/drivers/edac/edac_mce.c
@@ -41,9 +41,7 @@ void edac_mce_unregister(struct edac_mce *edac_mce)
41} 41}
42EXPORT_SYMBOL(edac_mce_unregister); 42EXPORT_SYMBOL(edac_mce_unregister);
43 43
44 44int edac_mce_parse(struct mce *mce)
45
46int edac_mce_queue(struct mce *mce)
47{ 45{
48 struct edac_mce *edac_mce; 46 struct edac_mce *edac_mce;
49 47
@@ -55,4 +53,9 @@ int edac_mce_queue(struct mce *mce)
55 /* Nobody queued the error */ 53 /* Nobody queued the error */
56 return 0; 54 return 0;
57} 55}
58EXPORT_SYMBOL_GPL(edac_mce_queue); 56EXPORT_SYMBOL_GPL(edac_mce_parse);
57
58MODULE_LICENSE("GPL");
59MODULE_AUTHOR("Mauro Carvalho Chehab <mchehab@redhat.com>");
60MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com)");
61MODULE_DESCRIPTION("EDAC Driver for mcelog captured errors");
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 914914759690..3c7bb5f405f6 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -27,6 +27,8 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/edac.h> 28#include <linux/edac.h>
29#include <linux/mmzone.h> 29#include <linux/mmzone.h>
30#include <linux/edac_mce.h>
31#include <linux/spinlock.h>
30 32
31#include "edac_core.h" 33#include "edac_core.h"
32 34
@@ -195,6 +197,11 @@ struct i7core_pvt {
195 unsigned long ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */ 197 unsigned long ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */
196 int last_ce_count[MAX_DIMMS]; 198 int last_ce_count[MAX_DIMMS];
197 199
200 /* mcelog glue */
201 struct edac_mce edac_mce;
202 struct mce mce_entry[MCE_LOG_LEN];
203 unsigned mce_count;
204 spinlock_t mce_lock;
198}; 205};
199 206
200/* Device name and register DID (Device ID) */ 207/* Device name and register DID (Device ID) */
@@ -900,7 +907,7 @@ static ssize_t i7core_inject_enable_store(struct mem_ctl_info *mci,
900 pci_read_config_dword(pvt->pci_ch[pvt->inject.channel][0], 907 pci_read_config_dword(pvt->pci_ch[pvt->inject.channel][0],
901 MC_CHANNEL_ADDR_MATCH + 4, &rdmask2); 908 MC_CHANNEL_ADDR_MATCH + 4, &rdmask2);
902 909
903 debugf0("Inject addr match write 0x%016llx, read: 0x%08x%08x\n", 910 debugf0("Inject addr match write 0x%016llx, read: 0x%08x 0x%08x\n",
904 mask, rdmask1, rdmask2); 911 mask, rdmask1, rdmask2);
905#endif 912#endif
906#endif 913#endif
@@ -1162,9 +1169,11 @@ static void check_mc_test_err(struct mem_ctl_info *mci)
1162 new1 = DIMM1_COR_ERR(rcv0); 1169 new1 = DIMM1_COR_ERR(rcv0);
1163 new0 = DIMM0_COR_ERR(rcv0); 1170 new0 = DIMM0_COR_ERR(rcv0);
1164 1171
1172#if 0
1165 debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n", 1173 debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
1166 (pvt->ce_count_available ? "UPDATE" : "READ"), 1174 (pvt->ce_count_available ? "UPDATE" : "READ"),
1167 rcv1, rcv0, new0, new1, new2); 1175 rcv1, rcv0, new0, new1, new2);
1176#endif
1168 1177
1169 /* Updates CE counters if it is not the first time here */ 1178 /* Updates CE counters if it is not the first time here */
1170 if (pvt->ce_count_available) { 1179 if (pvt->ce_count_available) {
@@ -1195,16 +1204,97 @@ static void check_mc_test_err(struct mem_ctl_info *mci)
1195 pvt->last_ce_count[0] = new0; 1204 pvt->last_ce_count[0] = new0;
1196} 1205}
1197 1206
1207static void i7core_mce_output_error(struct mem_ctl_info *mci,
1208 struct mce *m)
1209{
1210 debugf0("CPU %d: Machine Check Exception: %16Lx"
1211 "Bank %d: %016Lx\n",
1212 m->cpu, m->mcgstatus, m->bank, m->status);
1213 if (m->ip) {
1214 debugf0("RIP%s %02x:<%016Lx>\n",
1215 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
1216 m->cs, m->ip);
1217 }
1218 printk(KERN_EMERG "TSC %llx ", m->tsc);
1219 if (m->addr)
1220 printk("ADDR %llx ", m->addr);
1221 if (m->misc)
1222 printk("MISC %llx ", m->misc);
1223
1224#if 0
1225 snprintf(msg, sizeof(msg),
1226 "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s "
1227 "RAS=%d CAS=%d %s Err=0x%lx (%s))",
1228 type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas,
1229 type, allErrors, error_name[errnum]);
1230
1231 /* Call the helper to output message */
1232 edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
1233#endif
1234}
1235
1198/* 1236/*
1199 * i7core_check_error Retrieve and process errors reported by the 1237 * i7core_check_error Retrieve and process errors reported by the
1200 * hardware. Called by the Core module. 1238 * hardware. Called by the Core module.
1201 */ 1239 */
1202static void i7core_check_error(struct mem_ctl_info *mci) 1240static void i7core_check_error(struct mem_ctl_info *mci)
1203{ 1241{
1242 struct i7core_pvt *pvt = mci->pvt_info;
1243 int i;
1244 unsigned count = 0;
1245 struct mce *m = NULL;
1246 unsigned long flags;
1247
1248 debugf0(__FILE__ ": %s()\n", __func__);
1249
1250 /* Copy all mce errors into a temporary buffer */
1251 spin_lock_irqsave(&pvt->mce_lock, flags);
1252 if (pvt->mce_count) {
1253 m = kmalloc(sizeof(*m) * pvt->mce_count, GFP_ATOMIC);
1254 if (m) {
1255 count = pvt->mce_count;
1256 memcpy(m, &pvt->mce_entry, sizeof(*m) * count);
1257 }
1258 pvt->mce_count = 0;
1259 }
1260 spin_unlock_irqrestore(&pvt->mce_lock, flags);
1261
1262 /* process mcelog errors */
1263 for (i = 0; i < count; i++)
1264 i7core_mce_output_error(mci, &m[i]);
1265
1266 kfree(m);
1267
1268 /* check memory count errors */
1204 check_mc_test_err(mci); 1269 check_mc_test_err(mci);
1205} 1270}
1206 1271
1207/* 1272/*
1273 * i7core_mce_check_error Replicates mcelog routine to get errors
1274 * This routine simply queues mcelog errors, and
1275 * returns. The error itself should be handled later
1276 * by i7core_check_error.
1277 */
1278static int i7core_mce_check_error(void *priv, struct mce *mce)
1279{
1280 struct i7core_pvt *pvt = priv;
1281 unsigned long flags;
1282
1283 debugf0(__FILE__ ": %s()\n", __func__);
1284
1285 spin_lock_irqsave(&pvt->mce_lock, flags);
1286 if (pvt->mce_count < MCE_LOG_LEN) {
1287 memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce));
1288 pvt->mce_count++;
1289 }
1290 spin_unlock_irqrestore(&pvt->mce_lock, flags);
1291
1292 /* Advise mcelog that the error was handled */
1293// return 1;
1294 return 0; // Let's duplicate the log
1295}
1296
1297/*
1208 * i7core_probe Probe for ONE instance of device to see if it is 1298 * i7core_probe Probe for ONE instance of device to see if it is
1209 * present. 1299 * present.
1210 * return: 1300 * return:
@@ -1305,6 +1395,18 @@ static int __devinit i7core_probe(struct pci_dev *pdev,
1305 pvt->inject.page = -1; 1395 pvt->inject.page = -1;
1306 pvt->inject.col = -1; 1396 pvt->inject.col = -1;
1307 1397
1398 /* Registers on edac_mce in order to receive memory errors */
1399 pvt->edac_mce.priv = pvt;
1400 pvt->edac_mce.check_error = i7core_mce_check_error;
1401 spin_lock_init(&pvt->mce_lock);
1402
1403 rc = edac_mce_register(&pvt->edac_mce);
1404 if (unlikely (rc < 0)) {
1405 debugf0("MC: " __FILE__
1406 ": %s(): failed edac_mce_register()\n", __func__);
1407 goto fail1;
1408 }
1409
1308 i7core_printk(KERN_INFO, "Driver loaded.\n"); 1410 i7core_printk(KERN_INFO, "Driver loaded.\n");
1309 1411
1310 return 0; 1412 return 0;
@@ -1324,17 +1426,22 @@ fail0:
1324static void __devexit i7core_remove(struct pci_dev *pdev) 1426static void __devexit i7core_remove(struct pci_dev *pdev)
1325{ 1427{
1326 struct mem_ctl_info *mci; 1428 struct mem_ctl_info *mci;
1429 struct i7core_pvt *pvt;
1327 1430
1328 debugf0(__FILE__ ": %s()\n", __func__); 1431 debugf0(__FILE__ ": %s()\n", __func__);
1329 1432
1330 if (i7core_pci) 1433 if (i7core_pci)
1331 edac_pci_release_generic_ctl(i7core_pci); 1434 edac_pci_release_generic_ctl(i7core_pci);
1332 1435
1333 mci = edac_mc_del_mc(&pdev->dev);
1334 1436
1437 mci = edac_mc_del_mc(&pdev->dev);
1335 if (!mci) 1438 if (!mci)
1336 return; 1439 return;
1337 1440
1441 /* Unregisters from edac_mce in order to stop receiving memory errors */
1442 pvt = mci->pvt_info;
1443 edac_mce_unregister(&pvt->edac_mce);
1444
1338 /* retrieve references to resources, and free those resources */ 1445 /* retrieve references to resources, and free those resources */
1339 i7core_put_devices(); 1446 i7core_put_devices();
1340 1447