aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-02-28 23:42:33 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2013-02-28 23:42:33 -0500
commitad6c2c2eb34f234d6253292b9b3c047614fbfe7e (patch)
tree8ceb00db9874c09f3002b5ca579f1f9146b30a28 /include
parent19cc90f58d4f2538b4cf5371681a057d2e5209f2 (diff)
parentb0769891ba7baa53f270dc70d71934748beb4c5b (diff)
Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac
Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab: "For: - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac); - error injection support for i5100, when EDAC debug is enabled; - fix edac when it is loaded builtin (early init for the subsystem); - a "Firmware First" EDAC driver, allowing ghes to report errors via EDAC (ghes-edac). With regards to ghes-edac, this fixes a longstanding BZ at Red Hat that happens with Nehalem and Sandy Bridge CPUs: when both GHES and i7core_edac or sb_edac are running, the error reports are unpredictable, as both BIOS and OS race to access the registers. With ghes-edac, the EDAC core will refuse to register any other concurrent memory error driver. This patchset moves the ghes struct definitions to a separate header file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to register/unregister and to report errors via ghes-edac. Those changes were acked by ghes driver maintainer (Huang)." * 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits) i5100_edac: convert to use simple_open() ghes_edac: fix to use list_for_each_entry_safe() when delete list items ghes_edac: Fix RAS tracing ghes_edac: Make it compliant with UEFI spec 2.3.1 ghes_edac: Improve driver's printk messages ghes_edac: Don't credit the same memory dimm twice ghes_edac: do a better job of filling EDAC DIMM info ghes_edac: add support for reporting errors via EDAC ghes_edac: Register at EDAC core the BIOS report ghes: add the needed hooks for EDAC error report ghes: move structures/enum to a header file edac: add support for error type "Info" edac: add support for raw error reports edac: reduce stack pressure by using a pre-allocated buffer edac: lock module owner to avoid error report conflicts edac: remove proc_name from mci structure edac: add a new memory layer type edac: initialize the core earlier edac: better report error conditions in debug mode i5100_edac: Remove two checkpatch warnings ...
Diffstat (limited to 'include')
-rw-r--r--include/acpi/ghes.h72
-rw-r--r--include/linux/edac.h79
-rw-r--r--include/linux/pci_ids.h1
-rw-r--r--include/ras/ras_event.h4
4 files changed, 150 insertions, 6 deletions
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
new file mode 100644
index 000000000000..720446cb243e
--- /dev/null
+++ b/include/acpi/ghes.h
@@ -0,0 +1,72 @@
1#include <acpi/apei.h>
2#include <acpi/hed.h>
3
4/*
5 * One struct ghes is created for each generic hardware error source.
6 * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
7 * handler.
8 *
9 * estatus: memory buffer for error status block, allocated during
10 * HEST parsing.
11 */
12#define GHES_TO_CLEAR 0x0001
13#define GHES_EXITING 0x0002
14
15struct ghes {
16 struct acpi_hest_generic *generic;
17 struct acpi_hest_generic_status *estatus;
18 u64 buffer_paddr;
19 unsigned long flags;
20 union {
21 struct list_head list;
22 struct timer_list timer;
23 unsigned int irq;
24 };
25};
26
27struct ghes_estatus_node {
28 struct llist_node llnode;
29 struct acpi_hest_generic *generic;
30 struct ghes *ghes;
31};
32
33struct ghes_estatus_cache {
34 u32 estatus_len;
35 atomic_t count;
36 struct acpi_hest_generic *generic;
37 unsigned long long time_in;
38 struct rcu_head rcu;
39};
40
41enum {
42 GHES_SEV_NO = 0x0,
43 GHES_SEV_CORRECTED = 0x1,
44 GHES_SEV_RECOVERABLE = 0x2,
45 GHES_SEV_PANIC = 0x3,
46};
47
48/* From drivers/edac/ghes_edac.c */
49
50#ifdef CONFIG_EDAC_GHES
51void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
52 struct cper_sec_mem_err *mem_err);
53
54int ghes_edac_register(struct ghes *ghes, struct device *dev);
55
56void ghes_edac_unregister(struct ghes *ghes);
57
58#else
59static inline void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
60 struct cper_sec_mem_err *mem_err)
61{
62}
63
64static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
65{
66 return 0;
67}
68
69static inline void ghes_edac_unregister(struct ghes *ghes)
70{
71}
72#endif
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 1b8c02b36f76..4fd4999ccb5b 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -14,7 +14,6 @@
14 14
15#include <linux/atomic.h> 15#include <linux/atomic.h>
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/kobject.h>
18#include <linux/completion.h> 17#include <linux/completion.h>
19#include <linux/workqueue.h> 18#include <linux/workqueue.h>
20#include <linux/debugfs.h> 19#include <linux/debugfs.h>
@@ -48,8 +47,17 @@ static inline void opstate_init(void)
48 return; 47 return;
49} 48}
50 49
50/* Max length of a DIMM label*/
51#define EDAC_MC_LABEL_LEN 31 51#define EDAC_MC_LABEL_LEN 31
52#define MC_PROC_NAME_MAX_LEN 7 52
53/* Maximum size of the location string */
54#define LOCATION_SIZE 80
55
56/* Defines the maximum number of labels that can be reported */
57#define EDAC_MAX_LABELS 8
58
59/* String used to join two or more labels */
60#define OTHER_LABEL " or "
53 61
54/** 62/**
55 * enum dev_type - describe the type of memory DRAM chips used at the stick 63 * enum dev_type - describe the type of memory DRAM chips used at the stick
@@ -101,8 +109,24 @@ enum hw_event_mc_err_type {
101 HW_EVENT_ERR_CORRECTED, 109 HW_EVENT_ERR_CORRECTED,
102 HW_EVENT_ERR_UNCORRECTED, 110 HW_EVENT_ERR_UNCORRECTED,
103 HW_EVENT_ERR_FATAL, 111 HW_EVENT_ERR_FATAL,
112 HW_EVENT_ERR_INFO,
104}; 113};
105 114
115static inline char *mc_event_error_type(const unsigned int err_type)
116{
117 switch (err_type) {
118 case HW_EVENT_ERR_CORRECTED:
119 return "Corrected";
120 case HW_EVENT_ERR_UNCORRECTED:
121 return "Uncorrected";
122 case HW_EVENT_ERR_FATAL:
123 return "Fatal";
124 default:
125 case HW_EVENT_ERR_INFO:
126 return "Info";
127 }
128}
129
106/** 130/**
107 * enum mem_type - memory types. For a more detailed reference, please see 131 * enum mem_type - memory types. For a more detailed reference, please see
108 * http://en.wikipedia.org/wiki/DRAM 132 * http://en.wikipedia.org/wiki/DRAM
@@ -376,6 +400,9 @@ enum scrub_type {
376 * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel" 400 * @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"
377 * @EDAC_MC_LAYER_SLOT: memory layer is named "slot" 401 * @EDAC_MC_LAYER_SLOT: memory layer is named "slot"
378 * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select" 402 * @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"
403 * @EDAC_MC_LAYER_ALL_MEM: memory layout is unknown. All memory is mapped
404 * as a single memory area. This is used when
405 * retrieving errors from a firmware driven driver.
379 * 406 *
380 * This enum is used by the drivers to tell edac_mc_sysfs what name should 407 * This enum is used by the drivers to tell edac_mc_sysfs what name should
381 * be used when describing a memory stick location. 408 * be used when describing a memory stick location.
@@ -385,6 +412,7 @@ enum edac_mc_layer_type {
385 EDAC_MC_LAYER_CHANNEL, 412 EDAC_MC_LAYER_CHANNEL,
386 EDAC_MC_LAYER_SLOT, 413 EDAC_MC_LAYER_SLOT,
387 EDAC_MC_LAYER_CHIP_SELECT, 414 EDAC_MC_LAYER_CHIP_SELECT,
415 EDAC_MC_LAYER_ALL_MEM,
388}; 416};
389 417
390/** 418/**
@@ -551,6 +579,46 @@ struct errcount_attribute_data {
551 int layer0, layer1, layer2; 579 int layer0, layer1, layer2;
552}; 580};
553 581
582/**
583 * edac_raw_error_desc - Raw error report structure
584 * @grain: minimum granularity for an error report, in bytes
585 * @error_count: number of errors of the same type
586 * @top_layer: top layer of the error (layer[0])
587 * @mid_layer: middle layer of the error (layer[1])
588 * @low_layer: low layer of the error (layer[2])
589 * @page_frame_number: page where the error happened
590 * @offset_in_page: page offset
591 * @syndrome: syndrome of the error (or 0 if unknown or if
592 * the syndrome is not applicable)
593 * @msg: error message
594 * @location: location of the error
595 * @label: label of the affected DIMM(s)
596 * @other_detail: other driver-specific detail about the error
597 * @enable_per_layer_report: if false, the error affects all layers
598 * (typically, a memory controller error)
599 */
600struct edac_raw_error_desc {
601 /*
602 * NOTE: everything before grain won't be cleaned by
603 * edac_raw_error_desc_clean()
604 */
605 char location[LOCATION_SIZE];
606 char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
607 long grain;
608
609 /* the vars below and grain will be cleaned on every new error report */
610 u16 error_count;
611 int top_layer;
612 int mid_layer;
613 int low_layer;
614 unsigned long page_frame_number;
615 unsigned long offset_in_page;
616 unsigned long syndrome;
617 const char *msg;
618 const char *other_detail;
619 bool enable_per_layer_report;
620};
621
554/* MEMORY controller information structure 622/* MEMORY controller information structure
555 */ 623 */
556struct mem_ctl_info { 624struct mem_ctl_info {
@@ -630,7 +698,6 @@ struct mem_ctl_info {
630 const char *mod_ver; 698 const char *mod_ver;
631 const char *ctl_name; 699 const char *ctl_name;
632 const char *dev_name; 700 const char *dev_name;
633 char proc_name[MC_PROC_NAME_MAX_LEN + 1];
634 void *pvt_info; 701 void *pvt_info;
635 unsigned long start_time; /* mci load start time (in jiffies) */ 702 unsigned long start_time; /* mci load start time (in jiffies) */
636 703
@@ -659,6 +726,12 @@ struct mem_ctl_info {
659 /* work struct for this MC */ 726 /* work struct for this MC */
660 struct delayed_work work; 727 struct delayed_work work;
661 728
729 /*
730 * Used to report an error - by being at the global struct
731 * makes the memory allocated by the EDAC core
732 */
733 struct edac_raw_error_desc error_desc;
734
662 /* the internal state of this controller instance */ 735 /* the internal state of this controller instance */
663 int op_state; 736 int op_state;
664 737
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 31717bd287fd..f11c1c2609d5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2802,6 +2802,7 @@
2802#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0 2802#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0
2803#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f 2803#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f
2804#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0 2804#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0
2805#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3
2805#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5 2806#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5
2806#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 2807#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6
2807#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030 2808#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 260470e72483..21cdb0b7b0fb 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -78,9 +78,7 @@ TRACE_EVENT(mc_event,
78 78
79 TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)", 79 TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
80 __entry->error_count, 80 __entry->error_count,
81 (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" : 81 mc_event_error_type(__entry->error_type),
82 ((__entry->error_type == HW_EVENT_ERR_FATAL) ?
83 "Fatal" : "Uncorrected"),
84 __entry->error_count > 1 ? "s" : "", 82 __entry->error_count > 1 ? "s" : "",
85 ((char *)__get_str(msg))[0] ? " " : "", 83 ((char *)__get_str(msg))[0] ? " " : "",
86 __get_str(msg), 84 __get_str(msg),