diff options
-rw-r--r-- | drivers/edac/edac_mce.c | 11 | ||||
-rw-r--r-- | drivers/edac/i7core_edac.c | 111 |
2 files changed, 116 insertions, 6 deletions
diff --git a/drivers/edac/edac_mce.c b/drivers/edac/edac_mce.c index b1efa8e51921..9ccdc5b140e7 100644 --- a/drivers/edac/edac_mce.c +++ b/drivers/edac/edac_mce.c | |||
@@ -41,9 +41,7 @@ void edac_mce_unregister(struct edac_mce *edac_mce) | |||
41 | } | 41 | } |
42 | EXPORT_SYMBOL(edac_mce_unregister); | 42 | EXPORT_SYMBOL(edac_mce_unregister); |
43 | 43 | ||
44 | 44 | int edac_mce_parse(struct mce *mce) | |
45 | |||
46 | int edac_mce_queue(struct mce *mce) | ||
47 | { | 45 | { |
48 | struct edac_mce *edac_mce; | 46 | struct edac_mce *edac_mce; |
49 | 47 | ||
@@ -55,4 +53,9 @@ int edac_mce_queue(struct mce *mce) | |||
55 | /* Nobody queued the error */ | 53 | /* Nobody queued the error */ |
56 | return 0; | 54 | return 0; |
57 | } | 55 | } |
58 | EXPORT_SYMBOL_GPL(edac_mce_queue); | 56 | EXPORT_SYMBOL_GPL(edac_mce_parse); |
57 | |||
58 | MODULE_LICENSE("GPL"); | ||
59 | MODULE_AUTHOR("Mauro Carvalho Chehab <mchehab@redhat.com>"); | ||
60 | MODULE_AUTHOR("Red Hat Inc. (http://www.redhat.com)"); | ||
61 | MODULE_DESCRIPTION("EDAC Driver for mcelog captured errors"); | ||
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index 914914759690..3c7bb5f405f6 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c | |||
@@ -27,6 +27,8 @@ | |||
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/edac.h> | 28 | #include <linux/edac.h> |
29 | #include <linux/mmzone.h> | 29 | #include <linux/mmzone.h> |
30 | #include <linux/edac_mce.h> | ||
31 | #include <linux/spinlock.h> | ||
30 | 32 | ||
31 | #include "edac_core.h" | 33 | #include "edac_core.h" |
32 | 34 | ||
@@ -195,6 +197,11 @@ struct i7core_pvt { | |||
195 | unsigned long ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */ | 197 | unsigned long ce_count[MAX_DIMMS]; /* ECC corrected errors counts per dimm */ |
196 | int last_ce_count[MAX_DIMMS]; | 198 | int last_ce_count[MAX_DIMMS]; |
197 | 199 | ||
200 | /* mcelog glue */ | ||
201 | struct edac_mce edac_mce; | ||
202 | struct mce mce_entry[MCE_LOG_LEN]; | ||
203 | unsigned mce_count; | ||
204 | spinlock_t mce_lock; | ||
198 | }; | 205 | }; |
199 | 206 | ||
200 | /* Device name and register DID (Device ID) */ | 207 | /* Device name and register DID (Device ID) */ |
@@ -900,7 +907,7 @@ static ssize_t i7core_inject_enable_store(struct mem_ctl_info *mci, | |||
900 | pci_read_config_dword(pvt->pci_ch[pvt->inject.channel][0], | 907 | pci_read_config_dword(pvt->pci_ch[pvt->inject.channel][0], |
901 | MC_CHANNEL_ADDR_MATCH + 4, &rdmask2); | 908 | MC_CHANNEL_ADDR_MATCH + 4, &rdmask2); |
902 | 909 | ||
903 | debugf0("Inject addr match write 0x%016llx, read: 0x%08x%08x\n", | 910 | debugf0("Inject addr match write 0x%016llx, read: 0x%08x 0x%08x\n", |
904 | mask, rdmask1, rdmask2); | 911 | mask, rdmask1, rdmask2); |
905 | #endif | 912 | #endif |
906 | #endif | 913 | #endif |
@@ -1162,9 +1169,11 @@ static void check_mc_test_err(struct mem_ctl_info *mci) | |||
1162 | new1 = DIMM1_COR_ERR(rcv0); | 1169 | new1 = DIMM1_COR_ERR(rcv0); |
1163 | new0 = DIMM0_COR_ERR(rcv0); | 1170 | new0 = DIMM0_COR_ERR(rcv0); |
1164 | 1171 | ||
1172 | #if 0 | ||
1165 | debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n", | 1173 | debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n", |
1166 | (pvt->ce_count_available ? "UPDATE" : "READ"), | 1174 | (pvt->ce_count_available ? "UPDATE" : "READ"), |
1167 | rcv1, rcv0, new0, new1, new2); | 1175 | rcv1, rcv0, new0, new1, new2); |
1176 | #endif | ||
1168 | 1177 | ||
1169 | /* Updates CE counters if it is not the first time here */ | 1178 | /* Updates CE counters if it is not the first time here */ |
1170 | if (pvt->ce_count_available) { | 1179 | if (pvt->ce_count_available) { |
@@ -1195,16 +1204,97 @@ static void check_mc_test_err(struct mem_ctl_info *mci) | |||
1195 | pvt->last_ce_count[0] = new0; | 1204 | pvt->last_ce_count[0] = new0; |
1196 | } | 1205 | } |
1197 | 1206 | ||
1207 | static void i7core_mce_output_error(struct mem_ctl_info *mci, | ||
1208 | struct mce *m) | ||
1209 | { | ||
1210 | debugf0("CPU %d: Machine Check Exception: %16Lx" | ||
1211 | "Bank %d: %016Lx\n", | ||
1212 | m->cpu, m->mcgstatus, m->bank, m->status); | ||
1213 | if (m->ip) { | ||
1214 | debugf0("RIP%s %02x:<%016Lx>\n", | ||
1215 | !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", | ||
1216 | m->cs, m->ip); | ||
1217 | } | ||
1218 | printk(KERN_EMERG "TSC %llx ", m->tsc); | ||
1219 | if (m->addr) | ||
1220 | printk("ADDR %llx ", m->addr); | ||
1221 | if (m->misc) | ||
1222 | printk("MISC %llx ", m->misc); | ||
1223 | |||
1224 | #if 0 | ||
1225 | snprintf(msg, sizeof(msg), | ||
1226 | "%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s " | ||
1227 | "RAS=%d CAS=%d %s Err=0x%lx (%s))", | ||
1228 | type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas, | ||
1229 | type, allErrors, error_name[errnum]); | ||
1230 | |||
1231 | /* Call the helper to output message */ | ||
1232 | edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg); | ||
1233 | #endif | ||
1234 | } | ||
1235 | |||
1198 | /* | 1236 | /* |
1199 | * i7core_check_error Retrieve and process errors reported by the | 1237 | * i7core_check_error Retrieve and process errors reported by the |
1200 | * hardware. Called by the Core module. | 1238 | * hardware. Called by the Core module. |
1201 | */ | 1239 | */ |
1202 | static void i7core_check_error(struct mem_ctl_info *mci) | 1240 | static void i7core_check_error(struct mem_ctl_info *mci) |
1203 | { | 1241 | { |
1242 | struct i7core_pvt *pvt = mci->pvt_info; | ||
1243 | int i; | ||
1244 | unsigned count = 0; | ||
1245 | struct mce *m = NULL; | ||
1246 | unsigned long flags; | ||
1247 | |||
1248 | debugf0(__FILE__ ": %s()\n", __func__); | ||
1249 | |||
1250 | /* Copy all mce errors into a temporary buffer */ | ||
1251 | spin_lock_irqsave(&pvt->mce_lock, flags); | ||
1252 | if (pvt->mce_count) { | ||
1253 | m = kmalloc(sizeof(*m) * pvt->mce_count, GFP_ATOMIC); | ||
1254 | if (m) { | ||
1255 | count = pvt->mce_count; | ||
1256 | memcpy(m, &pvt->mce_entry, sizeof(*m) * count); | ||
1257 | } | ||
1258 | pvt->mce_count = 0; | ||
1259 | } | ||
1260 | spin_unlock_irqrestore(&pvt->mce_lock, flags); | ||
1261 | |||
1262 | /* proccess mcelog errors */ | ||
1263 | for (i = 0; i < count; i++) | ||
1264 | i7core_mce_output_error(mci, &m[i]); | ||
1265 | |||
1266 | kfree(m); | ||
1267 | |||
1268 | /* check memory count errors */ | ||
1204 | check_mc_test_err(mci); | 1269 | check_mc_test_err(mci); |
1205 | } | 1270 | } |
1206 | 1271 | ||
1207 | /* | 1272 | /* |
1273 | * i7core_mce_check_error Replicates mcelog routine to get errors | ||
1274 | * This routine simply queues mcelog errors, and | ||
1275 | * return. The error itself should be handled later | ||
1276 | * by i7core_check_error. | ||
1277 | */ | ||
1278 | static int i7core_mce_check_error(void *priv, struct mce *mce) | ||
1279 | { | ||
1280 | struct i7core_pvt *pvt = priv; | ||
1281 | unsigned long flags; | ||
1282 | |||
1283 | debugf0(__FILE__ ": %s()\n", __func__); | ||
1284 | |||
1285 | spin_lock_irqsave(&pvt->mce_lock, flags); | ||
1286 | if (pvt->mce_count < MCE_LOG_LEN) { | ||
1287 | memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce)); | ||
1288 | pvt->mce_count++; | ||
1289 | } | ||
1290 | spin_unlock_irqrestore(&pvt->mce_lock, flags); | ||
1291 | |||
1292 | /* Advice mcelog that the error were handled */ | ||
1293 | // return 1; | ||
1294 | return 0; // Let's duplicate the log | ||
1295 | } | ||
1296 | |||
1297 | /* | ||
1208 | * i7core_probe Probe for ONE instance of device to see if it is | 1298 | * i7core_probe Probe for ONE instance of device to see if it is |
1209 | * present. | 1299 | * present. |
1210 | * return: | 1300 | * return: |
@@ -1305,6 +1395,18 @@ static int __devinit i7core_probe(struct pci_dev *pdev, | |||
1305 | pvt->inject.page = -1; | 1395 | pvt->inject.page = -1; |
1306 | pvt->inject.col = -1; | 1396 | pvt->inject.col = -1; |
1307 | 1397 | ||
1398 | /* Registers on edac_mce in order to receive memory errors */ | ||
1399 | pvt->edac_mce.priv = pvt; | ||
1400 | pvt->edac_mce.check_error = i7core_mce_check_error; | ||
1401 | spin_lock_init(&pvt->mce_lock); | ||
1402 | |||
1403 | rc = edac_mce_register(&pvt->edac_mce); | ||
1404 | if (unlikely (rc < 0)) { | ||
1405 | debugf0("MC: " __FILE__ | ||
1406 | ": %s(): failed edac_mce_register()\n", __func__); | ||
1407 | goto fail1; | ||
1408 | } | ||
1409 | |||
1308 | i7core_printk(KERN_INFO, "Driver loaded.\n"); | 1410 | i7core_printk(KERN_INFO, "Driver loaded.\n"); |
1309 | 1411 | ||
1310 | return 0; | 1412 | return 0; |
@@ -1324,17 +1426,22 @@ fail0: | |||
1324 | static void __devexit i7core_remove(struct pci_dev *pdev) | 1426 | static void __devexit i7core_remove(struct pci_dev *pdev) |
1325 | { | 1427 | { |
1326 | struct mem_ctl_info *mci; | 1428 | struct mem_ctl_info *mci; |
1429 | struct i7core_pvt *pvt; | ||
1327 | 1430 | ||
1328 | debugf0(__FILE__ ": %s()\n", __func__); | 1431 | debugf0(__FILE__ ": %s()\n", __func__); |
1329 | 1432 | ||
1330 | if (i7core_pci) | 1433 | if (i7core_pci) |
1331 | edac_pci_release_generic_ctl(i7core_pci); | 1434 | edac_pci_release_generic_ctl(i7core_pci); |
1332 | 1435 | ||
1333 | mci = edac_mc_del_mc(&pdev->dev); | ||
1334 | 1436 | ||
1437 | mci = edac_mc_del_mc(&pdev->dev); | ||
1335 | if (!mci) | 1438 | if (!mci) |
1336 | return; | 1439 | return; |
1337 | 1440 | ||
1441 | /* Unregister from edac_mce to stop receiving memory errors */ | ||
1442 | pvt = mci->pvt_info; | ||
1443 | edac_mce_unregister(&pvt->edac_mce); | ||
1444 | |||
1338 | /* retrieve references to resources, and free those resources */ | 1445 | /* retrieve references to resources, and free those resources */ |
1339 | i7core_put_devices(); | 1446 | i7core_put_devices(); |
1340 | 1447 | ||