diff options
Diffstat (limited to 'include/linux/edac.h')
-rw-r--r-- | include/linux/edac.h | 182 |
1 files changed, 160 insertions, 22 deletions
diff --git a/include/linux/edac.h b/include/linux/edac.h index c621d762bb2c..91ba3bae42ee 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h | |||
@@ -71,6 +71,25 @@ enum dev_type { | |||
71 | #define DEV_FLAG_X64 BIT(DEV_X64) | 71 | #define DEV_FLAG_X64 BIT(DEV_X64) |
72 | 72 | ||
73 | /** | 73 | /** |
74 | * enum hw_event_mc_err_type - type of the detected error | ||
75 | * | ||
76 | * @HW_EVENT_ERR_CORRECTED: Corrected Error - Indicates that an ECC | ||
77 | * corrected error was detected | ||
78 | * @HW_EVENT_ERR_UNCORRECTED: Uncorrected Error - Indicates an error that | ||
79 | * can't be corrected by ECC, but it is not | ||
80 | * fatal (maybe it is on an unused memory area, | ||
81 | * or the memory controller could recover from | ||
82 | * it for example, by re-trying the operation). | ||
83 | * @HW_EVENT_ERR_FATAL: Fatal Error - Uncorrected error that could not | ||
84 | * be recovered. | ||
85 | */ | ||
86 | enum hw_event_mc_err_type { | ||
87 | HW_EVENT_ERR_CORRECTED, | ||
88 | HW_EVENT_ERR_UNCORRECTED, | ||
89 | HW_EVENT_ERR_FATAL, | ||
90 | }; | ||
91 | |||
92 | /** | ||
74 | * enum mem_type - memory types. For a more detailed reference, please see | 93 | * enum mem_type - memory types. For a more detailed reference, please see |
75 | * http://en.wikipedia.org/wiki/DRAM | 94 | * http://en.wikipedia.org/wiki/DRAM |
76 | * | 95 | * |
@@ -313,38 +332,141 @@ enum scrub_type { | |||
313 | */ | 332 | */ |
314 | 333 | ||
315 | /** | 334 | /** |
336 | * enum edac_mc_layer_type - memory controller hierarchy layer | ||
336 | * | ||
337 | * @EDAC_MC_LAYER_BRANCH: memory layer is named "branch" | ||
338 | * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel" | ||
339 | * @EDAC_MC_LAYER_SLOT: memory layer is named "slot" | ||
340 | * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select" | ||
341 | * | ||
342 | * This enum is used by the drivers to tell edac_mc_sysfs what name should | ||
343 | * be used when describing a memory stick location. | ||
344 | */ | ||
345 | enum edac_mc_layer_type { | ||
346 | EDAC_MC_LAYER_BRANCH, | ||
347 | EDAC_MC_LAYER_CHANNEL, | ||
348 | EDAC_MC_LAYER_SLOT, | ||
349 | EDAC_MC_LAYER_CHIP_SELECT, | ||
350 | }; | ||
351 | |||
352 | /** | ||
353 | * struct edac_mc_layer - describes the memory controller hierarchy | ||
354 | * @type: layer type | ||
355 | * @size: number of components per layer. For example, | ||
356 | * if the channel layer has two channels, size = 2 | ||
357 | * @is_virt_csrow: This layer is part of the "csrow" when old API | ||
358 | * compatibility mode is enabled. Otherwise, it is | ||
359 | * a channel | ||
360 | */ | ||
361 | struct edac_mc_layer { | ||
362 | enum edac_mc_layer_type type; | ||
363 | unsigned size; | ||
364 | bool is_virt_csrow; | ||
365 | }; | ||
366 | |||
367 | /* | ||
368 | * Maximum number of layers used by the memory controller to uniquely | ||
369 | * identify a single memory stick. | ||
370 | * NOTE: Changing this constant requires changing not only the definition | ||
371 | * below, but also the existing code in the core, as some of the code | ||
372 | * there is optimized for 3 layers. | ||
373 | */ | ||
374 | #define EDAC_MAX_LAYERS 3 | ||
375 | |||
376 | /** | ||
377 | * EDAC_DIMM_PTR - Macro responsible for finding a pointer inside a pointer array | ||
378 | * for the element given by [layer0,layer1,layer2] position | ||
379 | * | ||
380 | * @layers: a struct edac_mc_layer array, describing how many elements | ||
381 | * were allocated for each layer | ||
382 | * @var: name of the var where we want to get the pointer | ||
383 | * (like mci->dimms) | ||
384 | * @nlayers: Number of layers at the @layers array | ||
385 | * @layer0: layer0 position | ||
386 | * @layer1: layer1 position. Unused if n_layers < 2 | ||
387 | * @layer2: layer2 position. Unused if n_layers < 3 | ||
388 | * | ||
389 | * For 1 layer, this macro returns &var[layer0] | ||
390 | * For 2 layers, this macro is similar to allocating a bi-dimensional array | ||
391 | * and returning "&var[layer0][layer1]" | ||
392 | * For 3 layers, this macro is similar to allocating a tri-dimensional array | ||
393 | * and returning "&var[layer0][layer1][layer2]" | ||
394 | * | ||
395 | * A loop could be used here to make it more generic, but, as we only have | ||
396 | * 3 layers, this is a little faster. | ||
397 | * By design, layers can never be 0 or more than 3. If that ever happens, | ||
398 | * a NULL is returned, causing an OOPS during the memory allocation routine, | ||
399 | * which would point out to the developer that something is being done wrong. | ||
400 | */ | ||
401 | #define EDAC_DIMM_PTR(layers, var, nlayers, layer0, layer1, layer2) ({ \ | ||
402 | typeof(var) __p; \ | ||
403 | if ((nlayers) == 1) \ | ||
404 | __p = &var[layer0]; \ | ||
405 | else if ((nlayers) == 2) \ | ||
406 | __p = &var[(layer1) + ((layers[1]).size * (layer0))]; \ | ||
407 | else if ((nlayers) == 3) \ | ||
408 | __p = &var[(layer2) + ((layers[2]).size * ((layer1) + \ | ||
409 | ((layers[1]).size * (layer0))))]; \ | ||
410 | else \ | ||
411 | __p = NULL; \ | ||
412 | __p; \ | ||
413 | }) | ||
414 | |||
415 | |||
416 | /* FIXME: add the proper per-location error counts */ | ||
417 | struct dimm_info { | ||
418 | char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */ | ||
419 | |||
420 | /* Memory location data */ | ||
421 | unsigned location[EDAC_MAX_LAYERS]; | ||
422 | |||
423 | struct mem_ctl_info *mci; /* the parent */ | ||
424 | |||
425 | u32 grain; /* granularity of reported error in bytes */ | ||
426 | enum dev_type dtype; /* memory device type */ | ||
427 | enum mem_type mtype; /* memory dimm type */ | ||
428 | enum edac_type edac_mode; /* EDAC mode for this dimm */ | ||
429 | |||
430 | u32 nr_pages; /* number of pages on this dimm */ | ||
431 | |||
432 | unsigned csrow, cschannel; /* Points to the old API data */ | ||
433 | }; | ||
434 | |||
435 | /** | ||
316 | * struct rank_info - contains the information for one DIMM rank | 436 | * struct rank_info - contains the information for one DIMM rank |
317 | * | 437 | * |
318 | * @chan_idx: channel number where the rank is (typically, 0 or 1) | 438 | * @chan_idx: channel number where the rank is (typically, 0 or 1) |
319 | * @ce_count: number of correctable errors for this rank | 439 | * @ce_count: number of correctable errors for this rank |
320 | * @label: DIMM label. Different ranks for the same DIMM should be | ||
321 | * filled, on userspace, with the same label. | ||
322 | * FIXME: The core currently won't enforce it. | ||
323 | * @csrow: A pointer to the chip select row structure (the parent | 440 | * @csrow: A pointer to the chip select row structure (the parent |
324 | * structure). The location of the rank is given by | 441 | * structure). The location of the rank is given by |
325 | * the (csrow->csrow_idx, chan_idx) vector. | 442 | * the (csrow->csrow_idx, chan_idx) vector. |
443 | * @dimm: A pointer to the DIMM structure, where the DIMM label | ||
444 | * information is stored. | ||
445 | * | ||
446 | * FIXME: Currently, the EDAC core model will assume one DIMM per rank. | ||
447 | * This is a bad assumption, but it makes this patch easier. Later | ||
448 | * patches in this series will fix this issue. | ||
326 | */ | 449 | */ |
327 | struct rank_info { | 450 | struct rank_info { |
328 | int chan_idx; | 451 | int chan_idx; |
329 | u32 ce_count; | 452 | struct csrow_info *csrow; |
330 | char label[EDAC_MC_LABEL_LEN + 1]; | 453 | struct dimm_info *dimm; |
331 | struct csrow_info *csrow; /* the parent */ | 454 | |
455 | u32 ce_count; /* Correctable Errors for this rank */ | ||
332 | }; | 456 | }; |
333 | 457 | ||
334 | struct csrow_info { | 458 | struct csrow_info { |
335 | unsigned long first_page; /* first page number in dimm */ | 459 | /* Used only by edac_mc_find_csrow_by_page() */ |
336 | unsigned long last_page; /* last page number in dimm */ | 460 | unsigned long first_page; /* first page number in csrow */ |
461 | unsigned long last_page; /* last page number in csrow */ | ||
337 | unsigned long page_mask; /* used for interleaving - | 462 | unsigned long page_mask; /* used for interleaving - |
338 | * 0UL for non intlv | 463 | * 0UL for non intlv */ |
339 | */ | 464 | |
340 | u32 nr_pages; /* number of pages in csrow */ | 465 | int csrow_idx; /* the chip-select row */ |
341 | u32 grain; /* granularity of reported error in bytes */ | 466 | |
342 | int csrow_idx; /* the chip-select row */ | ||
343 | enum dev_type dtype; /* memory device type */ | ||
344 | u32 ue_count; /* Uncorrectable Errors for this csrow */ | 467 | u32 ue_count; /* Uncorrectable Errors for this csrow */ |
345 | u32 ce_count; /* Correctable Errors for this csrow */ | 468 | u32 ce_count; /* Correctable Errors for this csrow */ |
346 | enum mem_type mtype; /* memory csrow type */ | 469 | |
347 | enum edac_type edac_mode; /* EDAC mode for this csrow */ | ||
348 | struct mem_ctl_info *mci; /* the parent */ | 470 | struct mem_ctl_info *mci; /* the parent */ |
349 | 471 | ||
350 | struct kobject kobj; /* sysfs kobject for this csrow */ | 472 | struct kobject kobj; /* sysfs kobject for this csrow */ |
@@ -426,8 +548,20 @@ struct mem_ctl_info { | |||
426 | unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci, | 548 | unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci, |
427 | unsigned long page); | 549 | unsigned long page); |
428 | int mc_idx; | 550 | int mc_idx; |
429 | int nr_csrows; | ||
430 | struct csrow_info *csrows; | 551 | struct csrow_info *csrows; |
552 | unsigned nr_csrows, num_cschannel; | ||
553 | |||
554 | /* Memory Controller hierarchy */ | ||
555 | unsigned n_layers; | ||
556 | struct edac_mc_layer *layers; | ||
557 | bool mem_is_per_rank; | ||
558 | |||
559 | /* | ||
560 | * DIMM info. The entire csrow_info will eventually be removed. | ||
561 | */ | ||
562 | unsigned tot_dimms; | ||
563 | struct dimm_info *dimms; | ||
564 | |||
431 | /* | 565 | /* |
432 | * FIXME - what about controllers on other busses? - IDs must be | 566 | * FIXME - what about controllers on other busses? - IDs must be |
433 | * unique. dev pointer should be sufficiently unique, but | 567 | * unique. dev pointer should be sufficiently unique, but |
@@ -440,12 +574,16 @@ struct mem_ctl_info { | |||
440 | const char *dev_name; | 574 | const char *dev_name; |
441 | char proc_name[MC_PROC_NAME_MAX_LEN + 1]; | 575 | char proc_name[MC_PROC_NAME_MAX_LEN + 1]; |
442 | void *pvt_info; | 576 | void *pvt_info; |
443 | u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */ | ||
444 | u32 ce_noinfo_count; /* Correctable Errors w/o info */ | ||
445 | u32 ue_count; /* Total Uncorrectable Errors for this MC */ | ||
446 | u32 ce_count; /* Total Correctable Errors for this MC */ | ||
447 | unsigned long start_time; /* mci load start time (in jiffies) */ | 577 | unsigned long start_time; /* mci load start time (in jiffies) */ |
448 | 578 | ||
579 | /* | ||
580 | * Drivers shouldn't access these fields directly, as the core | ||
581 | * already handles them. | ||
582 | */ | ||
583 | u32 ce_noinfo_count, ue_noinfo_count; | ||
584 | u32 ue_mc, ce_mc; | ||
585 | u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; | ||
586 | |||
449 | struct completion complete; | 587 | struct completion complete; |
450 | 588 | ||
451 | /* edac sysfs device control */ | 589 | /* edac sysfs device control */ |
@@ -458,7 +596,7 @@ struct mem_ctl_info { | |||
458 | * by the low level driver. | 596 | * by the low level driver. |
459 | * | 597 | * |
460 | * Set by the low level driver to provide attributes at the | 598 | * Set by the low level driver to provide attributes at the |
461 | * controller level, same level as 'ue_count' and 'ce_count' above. | 599 | * controller level. |
462 | * An array of structures, NULL terminated | 600 | * An array of structures, NULL terminated |
463 | * | 601 | * |
464 | * If attributes are desired, then set to array of attributes | 602 | * If attributes are desired, then set to array of attributes |