aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/edac/ghes_edac.c
diff options
context:
space:
mode:
author: Mauro Carvalho Chehab <mchehab@redhat.com> 2013-02-19 17:24:12 -0500
committer: Mauro Carvalho Chehab <mchehab@redhat.com> 2013-02-25 17:42:16 -0500
commit: 689c9cd8128f13bf9843a3e133423f5e3e0ce4aa (patch)
tree: ad711935bc9105705f7a4b493db115198e7889f0 /drivers/edac/ghes_edac.c
parent: d2a6856614fd34e36352146307a5655efbdbc14d (diff)
ghes_edac: Make it compliant with UEFI spec 2.3.1
The UEFI spec defines the memory error types and the bits that validate each field of the memory error record, in Appendix N, items N.2.5 (Memory Error Section) and N.2.11 (Error Status). Make the error description compliant with it, only showing the valid fields. The EDAC error log is now properly reporting the error: [ 281.556854] mce: [Hardware Error]: Machine check events logged [ 281.557042] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0 [ 281.557044] {2}[Hardware Error]: APEI generic hardware error status [ 281.557046] {2}[Hardware Error]: severity: 2, corrected [ 281.557048] {2}[Hardware Error]: section: 0, severity: 2, corrected [ 281.557050] {2}[Hardware Error]: flags: 0x01 [ 281.557052] {2}[Hardware Error]: primary [ 281.557053] {2}[Hardware Error]: section_type: memory error [ 281.557055] {2}[Hardware Error]: error_status: 0x0000000000000400 [ 281.557056] {2}[Hardware Error]: node: 3 [ 281.557057] {2}[Hardware Error]: card: 0 [ 281.557058] {2}[Hardware Error]: module: 1 [ 281.557059] {2}[Hardware Error]: device: 0 [ 281.557061] {2}[Hardware Error]: error_type: 18, unknown [ 281.557067] EDAC DEBUG: ghes_edac_report_mem_error: error validation_bits: 0x000040b9 [ 281.557084] EDAC MC0: 1 CE reserved error (18) on unknown label (node:3 card:0 module:1 page:0x0 offset:0x0 grain:0 syndrome:0x0 - status(0x0000000000000400): Storage error in DRAM memory) Tested on a 4-CPU E5-4650 Sandy Bridge machine. Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Diffstat (limited to 'drivers/edac/ghes_edac.c')
-rw-r--r-- drivers/edac/ghes_edac.c 195
1 files changed, 180 insertions, 15 deletions
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index b4acc4f2074d..1bde45141073 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -22,6 +22,10 @@ struct ghes_edac_pvt {
22 struct list_head list; 22 struct list_head list;
23 struct ghes *ghes; 23 struct ghes *ghes;
24 struct mem_ctl_info *mci; 24 struct mem_ctl_info *mci;
25
26 /* Buffers for the error handling routine */
27 char other_detail[160];
28 char msg[80];
25}; 29};
26 30
27static LIST_HEAD(ghes_reglist); 31static LIST_HEAD(ghes_reglist);
@@ -186,6 +190,7 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
186 struct edac_raw_error_desc *e; 190 struct edac_raw_error_desc *e;
187 struct mem_ctl_info *mci; 191 struct mem_ctl_info *mci;
188 struct ghes_edac_pvt *pvt = NULL; 192 struct ghes_edac_pvt *pvt = NULL;
193 char *p;
189 194
190 list_for_each_entry(pvt, &ghes_reglist, list) { 195 list_for_each_entry(pvt, &ghes_reglist, list) {
191 if (ghes == pvt->ghes) 196 if (ghes == pvt->ghes)
@@ -201,15 +206,14 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
201 /* Cleans the error report buffer */ 206 /* Cleans the error report buffer */
202 memset(e, 0, sizeof (*e)); 207 memset(e, 0, sizeof (*e));
203 e->error_count = 1; 208 e->error_count = 1;
204 e->msg = "APEI"; 209 strcpy(e->label, "unknown label");
205 strcpy(e->label, "unknown"); 210 e->msg = pvt->msg;
206 e->other_detail = ""; 211 e->other_detail = pvt->other_detail;
207 212 e->top_layer = -1;
208 if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { 213 e->mid_layer = -1;
209 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT; 214 e->low_layer = -1;
210 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK; 215 *pvt->other_detail = '\0';
211 e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK); 216 *pvt->msg = '\0';
212 }
213 217
214 switch (sev) { 218 switch (sev) {
215 case GHES_SEV_CORRECTED: 219 case GHES_SEV_CORRECTED:
@@ -226,12 +230,173 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
226 type = HW_EVENT_ERR_INFO; 230 type = HW_EVENT_ERR_INFO;
227 } 231 }
228 232
229 sprintf(e->location, 233 edac_dbg(1, "error validation_bits: 0x%08llx\n",
230 "node:%d card:%d module:%d bank:%d device:%d row: %d column:%d bit_pos:%d", 234 (long long)mem_err->validation_bits);
231 mem_err->node, mem_err->card, mem_err->module, 235
232 mem_err->bank, mem_err->device, mem_err->row, mem_err->column, 236 /* Error type, mapped on e->msg */
233 mem_err->bit_pos); 237 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
234 edac_dbg(3, "error at location %s\n", e->location); 238 p = pvt->msg;
239 switch (mem_err->error_type) {
240 case 0:
241 p += sprintf(p, "Unknown");
242 break;
243 case 1:
244 p += sprintf(p, "No error");
245 break;
246 case 2:
247 p += sprintf(p, "Single-bit ECC");
248 break;
249 case 3:
250 p += sprintf(p, "Multi-bit ECC");
251 break;
252 case 4:
253 p += sprintf(p, "Single-symbol ChipKill ECC");
254 break;
255 case 5:
256 p += sprintf(p, "Multi-symbol ChipKill ECC");
257 break;
258 case 6:
259 p += sprintf(p, "Master abort");
260 break;
261 case 7:
262 p += sprintf(p, "Target abort");
263 break;
264 case 8:
265 p += sprintf(p, "Parity Error");
266 break;
267 case 9:
268 p += sprintf(p, "Watchdog timeout");
269 break;
270 case 10:
271 p += sprintf(p, "Invalid address");
272 break;
273 case 11:
274 p += sprintf(p, "Mirror Broken");
275 break;
276 case 12:
277 p += sprintf(p, "Memory Sparing");
278 break;
279 case 13:
280 p += sprintf(p, "Scrub corrected error");
281 break;
282 case 14:
283 p += sprintf(p, "Scrub uncorrected error");
284 break;
285 case 15:
286 p += sprintf(p, "Physical Memory Map-out event");
287 break;
288 default:
289 p += sprintf(p, "reserved error (%d)",
290 mem_err->error_type);
291 }
292 } else {
293 strcpy(pvt->msg, "unknown error");
294 }
295
296 /* Error address */
297 if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
298 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
299 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
300 }
301
302 /* Error grain */
303 if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) {
304 e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
305 }
306
307 /* Memory error location, mapped on e->location */
308 p = e->location;
309 if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
310 p += sprintf(p, "node:%d ", mem_err->node);
311 if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
312 p += sprintf(p, "card:%d ", mem_err->card);
313 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
314 p += sprintf(p, "module:%d ", mem_err->module);
315 if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
316 p += sprintf(p, "bank:%d ", mem_err->bank);
317 if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
318 p += sprintf(p, "row:%d ", mem_err->row);
319 if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
320 p += sprintf(p, "col:%d ", mem_err->column);
321 if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
322 p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
323 if (p > e->location)
324 *(p - 1) = '\0';
325
326 /* All other fields are mapped on e->other_detail */
327 p = pvt->other_detail;
328 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
329 u64 status = mem_err->error_status;
330
331 p += sprintf(p, "status(0x%016llx): ", (long long)status);
332 switch ((status >> 8) & 0xff) {
333 case 1:
334 p += sprintf(p, "Error detected internal to the component ");
335 break;
336 case 16:
337 p += sprintf(p, "Error detected in the bus ");
338 break;
339 case 4:
340 p += sprintf(p, "Storage error in DRAM memory ");
341 break;
342 case 5:
343 p += sprintf(p, "Storage error in TLB ");
344 break;
345 case 6:
346 p += sprintf(p, "Storage error in cache ");
347 break;
348 case 7:
349 p += sprintf(p, "Error in one or more functional units ");
350 break;
351 case 8:
352 p += sprintf(p, "component failed self test ");
353 break;
354 case 9:
355 p += sprintf(p, "Overflow or undervalue of internal queue ");
356 break;
357 case 17:
358 p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
359 break;
360 case 18:
361 p += sprintf(p, "Improper access error ");
362 break;
363 case 19:
364 p += sprintf(p, "Access to a memory address which is not mapped to any component ");
365 break;
366 case 20:
367 p += sprintf(p, "Loss of Lockstep ");
368 break;
369 case 21:
370 p += sprintf(p, "Response not associated with a request ");
371 break;
372 case 22:
373 p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
374 break;
375 case 23:
376 p += sprintf(p, "Detection of a PATH_ERROR ");
377 break;
378 case 25:
379 p += sprintf(p, "Bus operation timeout ");
380 break;
381 case 26:
382 p += sprintf(p, "A read was issued to data that has been poisoned ");
383 break;
384 default:
385 p += sprintf(p, "reserved ");
386 break;
387 }
388 }
389 if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
390 p += sprintf(p, "requestorID: 0x%016llx ",
391 (long long)mem_err->requestor_id);
392 if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
393 p += sprintf(p, "responderID: 0x%016llx ",
394 (long long)mem_err->responder_id);
395 if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
396 p += sprintf(p, "targetID: 0x%016llx ",
397 (long long)mem_err->responder_id);
398 if (p > pvt->other_detail)
399 *(p - 1) = '\0';
235 400
236 edac_raw_mc_handle_error(type, mci, e); 401 edac_raw_mc_handle_error(type, mci, e);
237} 402}