Merge branch 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac

Pull EDAC fixes and ghes-edac from Mauro Carvalho Chehab: "For: - Some fixes at edac drivers (i7core_edac, sb_edac, i3200_edac); - error injection support for i5100, when EDAC debug is enabled; - fix edac when it is loaded builtin (early init for the subsystem); - a "Firmware First" EDAC driver, allowing ghes to report errors via EDAC (ghes-edac). With regards to ghes-edac, this fixes a longstanding BZ at Red Hat that happens with Nehalem and Sandy Bridge CPUs: when both GHES and i7core_edac or sb_edac are running, the error reports are unpredictable, as both BIOS and OS race to access the registers. With ghes-edac, the EDAC core will refuse to register any other concurrent memory error driver. This patchset moves the ghes struct definitions to a separate header file (include/acpi/ghes.h) and adds 3 hooks at apei/ghes.c to register/unregister and to report errors via ghes-edac. Those changes were acked by ghes driver maintainer (Huang)." * 'linux_next' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac: (30 commits) i5100_edac: convert to use simple_open() ghes_edac: fix to use list_for_each_entry_safe() when delete list items ghes_edac: Fix RAS tracing ghes_edac: Make it compliant with UEFI spec 2.3.1 ghes_edac: Improve driver's printk messages ghes_edac: Don't credit the same memory dimm twice ghes_edac: do a better job of filling EDAC DIMM info ghes_edac: add support for reporting errors via EDAC ghes_edac: Register at EDAC core the BIOS report ghes: add the needed hooks for EDAC error report ghes: move structures/enum to a header file edac: add support for error type "Info" edac: add support for raw error reports edac: reduce stack pressure by using a pre-allocated buffer edac: lock module owner to avoid error report conflicts edac: remove proc_name from mci structure edac: add a new memory layer type edac: initialize the core earlier edac: better report error conditions in debug mode i5100_edac: Remove two checkpatch warnings ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2013-02-28 23:42:33 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-02-28 23:42:33 -0500
commit: ad6c2c2eb34f234d6253292b9b3c047614fbfe7e (patch)
tree: 8ceb00db9874c09f3002b5ca579f1f9146b30a28 /include
parent: 19cc90f58d4f2538b4cf5371681a057d2e5209f2 (diff)
parent: b0769891ba7baa53f270dc70d71934748beb4c5b (diff)
4 files changed, 150 insertions, 6 deletions
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
new file mode 100644
index 000000000000..720446cb243e
--- /dev/null
+++ b/include/acpi/ghes.h
@@ -0,0 +1,72 @@
+#include <acpi/apei.h>
+#include <acpi/hed.h>
+/*
+ * One struct ghes is created for each generic hardware error source.
+ * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
+ * handler.
+ *
+ * estatus: memory buffer for error status block, allocated during
+ * HEST parsing.
+ */
+#define GHES_TO_CLEAR           0x0001
+#define GHES_EXITING            0x0002
+struct ghes {
+        struct acpi_hest_generic *generic;
+        struct acpi_hest_generic_status *estatus;
+        u64 buffer_paddr;
+        unsigned long flags;
+        union {
+                struct list_head list;
+                struct timer_list timer;
+                unsigned int irq;
+        };
+};
+struct ghes_estatus_node {
+        struct llist_node llnode;
+        struct acpi_hest_generic *generic;
+        struct ghes *ghes;
+};
+struct ghes_estatus_cache {
+        u32 estatus_len;
+        atomic_t count;
+        struct acpi_hest_generic *generic;
+        unsigned long long time_in;
+        struct rcu_head rcu;
+};
+enum {
+        GHES_SEV_NO = 0x0,
+        GHES_SEV_CORRECTED = 0x1,
+        GHES_SEV_RECOVERABLE = 0x2,
+        GHES_SEV_PANIC = 0x3,
+};
+/* From drivers/edac/ghes_edac.c */
+#ifdef CONFIG_EDAC_GHES
+void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
+                                struct cper_sec_mem_err *mem_err);
+int ghes_edac_register(struct ghes *ghes, struct device *dev);
+void ghes_edac_unregister(struct ghes *ghes);
+#else
+static inline void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
+                                       struct cper_sec_mem_err *mem_err)
+{
+}
+static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
+{
+        return 0;
+}
+static inline void ghes_edac_unregister(struct ghes *ghes)
+{
+}
+#endif
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 1b8c02b36f76..4fd4999ccb5b 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -14,7 +14,6 @@
 #include <linux/atomic.h>
 #include <linux/device.h>
-#include <linux/kobject.h>
 #include <linux/completion.h>
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
@@ -48,8 +47,17 @@ static inline void opstate_init(void)
        return;
 }
+/* Max length of a DIMM label*/
 #define EDAC_MC_LABEL_LEN       31
-#define MC_PROC_NAME_MAX_LEN    7
+/* Maximum size of the location string */
+#define LOCATION_SIZE 80
+/* Defines the maximum number of labels that can be reported */
+#define EDAC_MAX_LABELS         8
+/* String used to join two or more labels */
+#define OTHER_LABEL " or "
 /**
 * enum dev_type - describe the type of memory DRAM chips used at the stick
@@ -101,8 +109,24 @@ enum hw_event_mc_err_type {
        HW_EVENT_ERR_CORRECTED,
        HW_EVENT_ERR_UNCORRECTED,
        HW_EVENT_ERR_FATAL,
+        HW_EVENT_ERR_INFO,
 };
+static inline char *mc_event_error_type(const unsigned int err_type)
+{
+        switch (err_type) {
+        case HW_EVENT_ERR_CORRECTED:
+                return "Corrected";
+        case HW_EVENT_ERR_UNCORRECTED:
+                return "Uncorrected";
+        case HW_EVENT_ERR_FATAL:
+                return "Fatal";
+        default:
+        case HW_EVENT_ERR_INFO:
+                return "Info";
+        }
+}
 /**
 * enum mem_type - memory types. For a more detailed reference, please see
 *                      http://en.wikipedia.org/wiki/DRAM
@@ -376,6 +400,9 @@ enum scrub_type {
 * @EDAC_MC_LAYER_CHANNEL:      memory layer is named "channel"
 * @EDAC_MC_LAYER_SLOT:         memory layer is named "slot"
 * @EDAC_MC_LAYER_CHIP_SELECT:  memory layer is named "chip select"
+ * @EDAC_MC_LAYER_ALL_MEM:      memory layout is unknown. All memory is mapped
+ *                              as a single memory area. This is used when
+ *                              retrieving errors from a firmware driven driver.
 *
 * This enum is used by the drivers to tell edac_mc_sysfs what name should
 * be used when describing a memory stick location.
@@ -385,6 +412,7 @@ enum edac_mc_layer_type {
        EDAC_MC_LAYER_CHANNEL,
        EDAC_MC_LAYER_SLOT,
        EDAC_MC_LAYER_CHIP_SELECT,
+        EDAC_MC_LAYER_ALL_MEM,
 };
 /**
@@ -551,6 +579,46 @@ struct errcount_attribute_data {
        int layer0, layer1, layer2;
 };
+/**
+ * edac_raw_error_desc - Raw error report structure
+ * @grain:                      minimum granularity for an error report, in bytes
+ * @error_count:                number of errors of the same type
+ * @top_layer:                  top layer of the error (layer[0])
+ * @mid_layer:                  middle layer of the error (layer[1])
+ * @low_layer:                  low layer of the error (layer[2])
+ * @page_frame_number:          page where the error happened
+ * @offset_in_page:             page offset
+ * @syndrome:                   syndrome of the error (or 0 if unknown or if
+ *                              the syndrome is not applicable)
+ * @msg:                        error message
+ * @location:                   location of the error
+ * @label:                      label of the affected DIMM(s)
+ * @other_detail:               other driver-specific detail about the error
+ * @enable_per_layer_report:    if false, the error affects all layers
+ *                              (typically, a memory controller error)
+ */
+struct edac_raw_error_desc {
+        /*
+         * NOTE: everything before grain won't be cleaned by
+         * edac_raw_error_desc_clean()
+         */
+        char location[LOCATION_SIZE];
+        char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
+        long grain;
+        /* the vars below and grain will be cleaned on every new error report */
+        u16 error_count;
+        int top_layer;
+        int mid_layer;
+        int low_layer;
+        unsigned long page_frame_number;
+        unsigned long offset_in_page;
+        unsigned long syndrome;
+        const char *msg;
+        const char *other_detail;
+        bool enable_per_layer_report;
+};
 /* MEMORY controller information structure
 */
 struct mem_ctl_info {
@@ -630,7 +698,6 @@ struct mem_ctl_info {
        const char *mod_ver;
        const char *ctl_name;
        const char *dev_name;
-        char proc_name[MC_PROC_NAME_MAX_LEN + 1];
        void *pvt_info;
        unsigned long start_time;       /* mci load start time (in jiffies) */
@@ -659,6 +726,12 @@ struct mem_ctl_info {
        /* work struct for this MC */
        struct delayed_work work;
+        /*
+         * Used to report an error - by being at the global struct
+         * makes the memory allocated by the EDAC core
+         */
+        struct edac_raw_error_desc error_desc;
        /* the internal state of this controller instance */
        int op_state;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 31717bd287fd..f11c1c2609d5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2802,6 +2802,7 @@
 #define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX       0x3ce0
 #define PCI_DEVICE_ID_INTEL_IOAT_SNB    0x402f
 #define PCI_DEVICE_ID_INTEL_5100_16     0x65f0
+#define PCI_DEVICE_ID_INTEL_5100_19     0x65f3
 #define PCI_DEVICE_ID_INTEL_5100_21     0x65f5
 #define PCI_DEVICE_ID_INTEL_5100_22     0x65f6
 #define PCI_DEVICE_ID_INTEL_5400_ERR    0x4030
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 260470e72483..21cdb0b7b0fb 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -78,9 +78,7 @@ TRACE_EVENT(mc_event,
        TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
                  __entry->error_count,
-                  (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" :
+                  mc_event_error_type(__entry->error_type),
-                        ((__entry->error_type == HW_EVENT_ERR_FATAL) ?
-                        "Fatal" : "Uncorrected"),
                  __entry->error_count > 1 ? "s" : "",
                  ((char *)__get_str(msg))[0] ? " " : "",
                  __get_str(msg),
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-28 23:42:33 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-28 23:42:33 -0500
commit	ad6c2c2eb34f234d6253292b9b3c047614fbfe7e (patch)
tree	8ceb00db9874c09f3002b5ca579f1f9146b30a28 /include
parent	19cc90f58d4f2538b4cf5371681a057d2e5209f2 (diff)
parent	b0769891ba7baa53f270dc70d71934748beb4c5b (diff)

diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
new file mode 100644
index 000000000000..720446cb243e
--- /dev/null
+++ b/include/acpi/ghes.h
@@ -0,0 +1,72 @@
		1	#include <acpi/apei.h>
		2	#include <acpi/hed.h>
		3
		4	/*
		5	* One struct ghes is created for each generic hardware error source.
		6	* It provides the context for APEI hardware error timer/IRQ/SCI/NMI
		7	* handler.
		8	*
		9	* estatus: memory buffer for error status block, allocated during
		10	* HEST parsing.
		11	*/
		12	#define GHES_TO_CLEAR 0x0001
		13	#define GHES_EXITING 0x0002
		14
		15	struct ghes {
		16	struct acpi_hest_generic *generic;
		17	struct acpi_hest_generic_status *estatus;
		18	u64 buffer_paddr;
		19	unsigned long flags;
		20	union {
		21	struct list_head list;
		22	struct timer_list timer;
		23	unsigned int irq;
		24	};
		25	};
		26
		27	struct ghes_estatus_node {
		28	struct llist_node llnode;
		29	struct acpi_hest_generic *generic;
		30	struct ghes *ghes;
		31	};
		32
		33	struct ghes_estatus_cache {
		34	u32 estatus_len;
		35	atomic_t count;
		36	struct acpi_hest_generic *generic;
		37	unsigned long long time_in;
		38	struct rcu_head rcu;
		39	};
		40
		41	enum {
		42	GHES_SEV_NO = 0x0,
		43	GHES_SEV_CORRECTED = 0x1,
		44	GHES_SEV_RECOVERABLE = 0x2,
		45	GHES_SEV_PANIC = 0x3,
		46	};
		47
		48	/* From drivers/edac/ghes_edac.c */
		49
		50	#ifdef CONFIG_EDAC_GHES
		51	void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
		52	struct cper_sec_mem_err *mem_err);
		53
		54	int ghes_edac_register(struct ghes *ghes, struct device *dev);
		55
		56	void ghes_edac_unregister(struct ghes *ghes);
		57
		58	#else
		59	static inline void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
		60	struct cper_sec_mem_err *mem_err)
		61	{
		62	}
		63
		64	static inline int ghes_edac_register(struct ghes *ghes, struct device *dev)
		65	{
		66	return 0;
		67	}
		68
		69	static inline void ghes_edac_unregister(struct ghes *ghes)
		70	{
		71	}
		72	#endif


diff --git a/include/linux/edac.h b/include/linux/edac.h
index 1b8c02b36f76..4fd4999ccb5b 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -14,7 +14,6 @@
14		14
15	#include <linux/atomic.h>	15	#include <linux/atomic.h>
16	#include <linux/device.h>	16	#include <linux/device.h>
17	#include <linux/kobject.h>
18	#include <linux/completion.h>	17	#include <linux/completion.h>
19	#include <linux/workqueue.h>	18	#include <linux/workqueue.h>
20	#include <linux/debugfs.h>	19	#include <linux/debugfs.h>
@@ -48,8 +47,17 @@ static inline void opstate_init(void)
48	return;	47	return;
49	}	48	}
50		49
		50	/* Max length of a DIMM label*/
51	#define EDAC_MC_LABEL_LEN 31	51	#define EDAC_MC_LABEL_LEN 31
52	#define MC_PROC_NAME_MAX_LEN 7	52
		53	/* Maximum size of the location string */
		54	#define LOCATION_SIZE 80
		55
		56	/* Defines the maximum number of labels that can be reported */
		57	#define EDAC_MAX_LABELS 8
		58
		59	/* String used to join two or more labels */
		60	#define OTHER_LABEL " or "
53		61
54	/**	62	/**
55	* enum dev_type - describe the type of memory DRAM chips used at the stick	63	* enum dev_type - describe the type of memory DRAM chips used at the stick
@@ -101,8 +109,24 @@ enum hw_event_mc_err_type {
101	HW_EVENT_ERR_CORRECTED,	109	HW_EVENT_ERR_CORRECTED,
102	HW_EVENT_ERR_UNCORRECTED,	110	HW_EVENT_ERR_UNCORRECTED,
103	HW_EVENT_ERR_FATAL,	111	HW_EVENT_ERR_FATAL,
		112	HW_EVENT_ERR_INFO,
104	};	113	};
105		114
		115	static inline char *mc_event_error_type(const unsigned int err_type)
		116	{
		117	switch (err_type) {
		118	case HW_EVENT_ERR_CORRECTED:
		119	return "Corrected";
		120	case HW_EVENT_ERR_UNCORRECTED:
		121	return "Uncorrected";
		122	case HW_EVENT_ERR_FATAL:
		123	return "Fatal";
		124	default:
		125	case HW_EVENT_ERR_INFO:
		126	return "Info";
		127	}
		128	}
		129
106	/**	130	/**
107	* enum mem_type - memory types. For a more detailed reference, please see	131	* enum mem_type - memory types. For a more detailed reference, please see
108	* http://en.wikipedia.org/wiki/DRAM	132	* http://en.wikipedia.org/wiki/DRAM
@@ -376,6 +400,9 @@ enum scrub_type {
376	* @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"	400	* @EDAC_MC_LAYER_CHANNEL: memory layer is named "channel"
377	* @EDAC_MC_LAYER_SLOT: memory layer is named "slot"	401	* @EDAC_MC_LAYER_SLOT: memory layer is named "slot"
378	* @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"	402	* @EDAC_MC_LAYER_CHIP_SELECT: memory layer is named "chip select"
		403	* @EDAC_MC_LAYER_ALL_MEM: memory layout is unknown. All memory is mapped
		404	* as a single memory area. This is used when
		405	* retrieving errors from a firmware driven driver.
379	*	406	*
380	* This enum is used by the drivers to tell edac_mc_sysfs what name should	407	* This enum is used by the drivers to tell edac_mc_sysfs what name should
381	* be used when describing a memory stick location.	408	* be used when describing a memory stick location.
@@ -385,6 +412,7 @@ enum edac_mc_layer_type {
385	EDAC_MC_LAYER_CHANNEL,	412	EDAC_MC_LAYER_CHANNEL,
386	EDAC_MC_LAYER_SLOT,	413	EDAC_MC_LAYER_SLOT,
387	EDAC_MC_LAYER_CHIP_SELECT,	414	EDAC_MC_LAYER_CHIP_SELECT,
		415	EDAC_MC_LAYER_ALL_MEM,
388	};	416	};
389		417
390	/**	418	/**
@@ -551,6 +579,46 @@ struct errcount_attribute_data {
551	int layer0, layer1, layer2;	579	int layer0, layer1, layer2;
552	};	580	};
553		581
		582	/**
		583	* edac_raw_error_desc - Raw error report structure
		584	* @grain: minimum granularity for an error report, in bytes
		585	* @error_count: number of errors of the same type
		586	* @top_layer: top layer of the error (layer[0])
		587	* @mid_layer: middle layer of the error (layer[1])
		588	* @low_layer: low layer of the error (layer[2])
		589	* @page_frame_number: page where the error happened
		590	* @offset_in_page: page offset
		591	* @syndrome: syndrome of the error (or 0 if unknown or if
		592	* the syndrome is not applicable)
		593	* @msg: error message
		594	* @location: location of the error
		595	* @label: label of the affected DIMM(s)
		596	* @other_detail: other driver-specific detail about the error
		597	* @enable_per_layer_report: if false, the error affects all layers
		598	* (typically, a memory controller error)
		599	*/
		600	struct edac_raw_error_desc {
		601	/*
		602	* NOTE: everything before grain won't be cleaned by
		603	* edac_raw_error_desc_clean()
		604	*/
		605	char location[LOCATION_SIZE];
		606	char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * EDAC_MAX_LABELS];
		607	long grain;
		608
		609	/* the vars below and grain will be cleaned on every new error report */
		610	u16 error_count;
		611	int top_layer;
		612	int mid_layer;
		613	int low_layer;
		614	unsigned long page_frame_number;
		615	unsigned long offset_in_page;
		616	unsigned long syndrome;
		617	const char *msg;
		618	const char *other_detail;
		619	bool enable_per_layer_report;
		620	};
		621
554	/* MEMORY controller information structure	622	/* MEMORY controller information structure
555	*/	623	*/
556	struct mem_ctl_info {	624	struct mem_ctl_info {
@@ -630,7 +698,6 @@ struct mem_ctl_info {
630	const char *mod_ver;	698	const char *mod_ver;
631	const char *ctl_name;	699	const char *ctl_name;
632	const char *dev_name;	700	const char *dev_name;
633	char proc_name[MC_PROC_NAME_MAX_LEN + 1];
634	void *pvt_info;	701	void *pvt_info;
635	unsigned long start_time; /* mci load start time (in jiffies) */	702	unsigned long start_time; /* mci load start time (in jiffies) */
636		703
@@ -659,6 +726,12 @@ struct mem_ctl_info {
659	/* work struct for this MC */	726	/* work struct for this MC */
660	struct delayed_work work;	727	struct delayed_work work;
661		728
		729	/*
		730	* Used to report an error - by being at the global struct
		731	* makes the memory allocated by the EDAC core
		732	*/
		733	struct edac_raw_error_desc error_desc;
		734
662	/* the internal state of this controller instance */	735	/* the internal state of this controller instance */
663	int op_state;	736	int op_state;
664		737


diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 31717bd287fd..f11c1c2609d5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2802,6 +2802,7 @@
2802	#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0	2802	#define PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX 0x3ce0
2803	#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f	2803	#define PCI_DEVICE_ID_INTEL_IOAT_SNB 0x402f
2804	#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0	2804	#define PCI_DEVICE_ID_INTEL_5100_16 0x65f0
		2805	#define PCI_DEVICE_ID_INTEL_5100_19 0x65f3
2805	#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5	2806	#define PCI_DEVICE_ID_INTEL_5100_21 0x65f5
2806	#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6	2807	#define PCI_DEVICE_ID_INTEL_5100_22 0x65f6
2807	#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030	2808	#define PCI_DEVICE_ID_INTEL_5400_ERR 0x4030


diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 260470e72483..21cdb0b7b0fb 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -78,9 +78,7 @@ TRACE_EVENT(mc_event,
78		78
79	TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",	79	TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
80	__entry->error_count,	80	__entry->error_count,
81	(__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" :	81	mc_event_error_type(__entry->error_type),
82	((__entry->error_type == HW_EVENT_ERR_FATAL) ?
83	"Fatal" : "Uncorrected"),
84	__entry->error_count > 1 ? "s" : "",	82	__entry->error_count > 1 ? "s" : "",
85	((char *)__get_str(msg))[0] ? " " : "",	83	((char *)__get_str(msg))[0] ? " " : "",
86	__get_str(msg),	84	__get_str(msg),