aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/acpi
diff options
context:
space:
mode:
authorChen, Gong <gong.chen@linux.intel.com>2013-10-21 17:29:25 -0400
committerTony Luck <tony.luck@intel.com>2013-10-23 13:09:07 -0400
commit4b3db708b114fc35ff1e0cd28a2bfb1490dbb5d3 (patch)
tree136ba42b08015d55861bfb103a6ec87f9dd5a54b /drivers/acpi
parent10ef6b0dffe404bcc54e94cb2ca1a5b18445a66b (diff)
ACPI, x86: Extended error log driver for x86 platform
This H/W error log driver (a.k.a. the eMCA driver) is implemented based on http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html After errors are captured, more detailed platform-specific information can be obtained via this new enhanced H/W error log driver. Most notably, we can track memory errors back to the DIMM slot silk screen label. Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'drivers/acpi')
-rw-r--r--drivers/acpi/Kconfig19
-rw-r--r--drivers/acpi/Makefile2
-rw-r--r--drivers/acpi/acpi_extlog.c327
-rw-r--r--drivers/acpi/bus.c3
4 files changed, 350 insertions, 1 deletions
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 6efe2ac6902f..252f0e818a49 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -372,4 +372,23 @@ config ACPI_BGRT
372 372
373source "drivers/acpi/apei/Kconfig" 373source "drivers/acpi/apei/Kconfig"
374 374
375config ACPI_EXTLOG
376 tristate "Extended Error Log support"
377 depends on X86_MCE && ACPI_APEI
378 default n
379 help
380 Certain usages such as Predictive Failure Analysis (PFA) require
381 more information about the error than what can be described in
382 processor machine check banks. Most server processors log
383 additional information about the error in processor uncore
384 registers. Since the addresses and layout of these registers vary
385 widely from one processor to another, system software cannot
386 readily make use of them. To complicate matters further, some of
387 the additional error information cannot be constructed without
388 detailed knowledge about platform topology.
389
390 Enhanced MCA Logging allows firmware to provide additional error
391 information to system software, synchronous with MCE or CMCI. This
392 driver adds support for that functionality.
393
375endif # ACPI 394endif # ACPI
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index cdaf68b58b00..bce34afadcd0 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -82,3 +82,5 @@ processor-$(CONFIG_CPU_FREQ) += processor_perflib.o
82obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o 82obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
83 83
84obj-$(CONFIG_ACPI_APEI) += apei/ 84obj-$(CONFIG_ACPI_APEI) += apei/
85
86obj-$(CONFIG_ACPI_EXTLOG) += acpi_extlog.o
diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
new file mode 100644
index 000000000000..a6869e110ce5
--- /dev/null
+++ b/drivers/acpi/acpi_extlog.c
@@ -0,0 +1,327 @@
1/*
2 * Extended Error Log driver
3 *
4 * Copyright (C) 2013 Intel Corp.
5 * Author: Chen, Gong <gong.chen@intel.com>
6 *
7 * This file is licensed under GPLv2.
8 */
9
10#include <linux/module.h>
11#include <linux/acpi.h>
12#include <acpi/acpi_bus.h>
13#include <linux/cper.h>
14#include <linux/ratelimit.h>
15#include <asm/cpu.h>
16#include <asm/mce.h>
17
18#include "apei/apei-internal.h"
19
/* Mask selecting the elog record address bits (51:0) of an L1 entry */
#define EXT_ELOG_ENTRY_MASK	GENMASK_ULL(51, 0)

/* Revision and function numbers handed to the _DSM method */
#define EXTLOG_DSM_REV		0x0
#define EXTLOG_FN_QUERY		0x0
#define EXTLOG_FN_ADDR		0x1

/* Bit set/cleared in the L1 header "flags" field by this driver */
#define FLAG_OS_OPTIN		BIT(0)
/* Bit tested in the EXTLOG_FN_QUERY result before asking for the address */
#define EXTLOG_QUERY_L1_EXIST	BIT(1)
/* Bit 63 of an L1 entry marks the entry as valid */
#define ELOG_ENTRY_VALID	(1ULL << 63)
/* Number of bytes copied out of the elog table for one record */
#define ELOG_ENTRY_LEN		0x1000

/* Format string used when an iomem region cannot be reserved */
#define EMCA_BUG \
	"Can not request iomem region <0x%016llx-0x%016llx> - eMCA disabled\n"
33
34struct extlog_l1_head {
35 u32 ver; /* Header Version */
36 u32 hdr_len; /* Header Length */
37 u64 total_len; /* entire L1 Directory length including this header */
38 u64 elog_base; /* MCA Error Log Directory base address */
39 u64 elog_len; /* MCA Error Log Directory length */
40 u32 flags; /* bit 0 - OS/VMM Opt-in */
41 u8 rev0[12];
42 u32 entries; /* Valid L1 Directory entries per logical processor */
43 u8 rev1[12];
44};
45
46static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295";
47
48/* L1 table related physical address */
49static u64 elog_base;
50static size_t elog_size;
51static u64 l1_dirbase;
52static size_t l1_size;
53
54/* L1 table related virtual address */
55static void __iomem *extlog_l1_addr;
56static void __iomem *elog_addr;
57
58static void *elog_buf;
59
60static u64 *l1_entry_base;
61static u32 l1_percpu_entry;
62
/* Index of the L1 entry describing @bank on logical CPU @cpu */
#define ELOG_IDX(cpu, bank) \
	(cpu_physical_id(cpu) * l1_percpu_entry + (bank))

/* Raw 64-bit contents of L1 entry @idx */
#define ELOG_ENTRY_DATA(idx) \
	(*(l1_entry_base + (idx)))

/*
 * Translate the physical address of an elog record into a pointer inside
 * the elog_addr mapping.  The argument is parenthesized so that callers
 * may pass an arbitrary expression.
 */
#define ELOG_ENTRY_ADDR(phyaddr) \
	((phyaddr) - elog_base + (u8 *)elog_addr)
71
72static struct acpi_generic_status *extlog_elog_entry_check(int cpu, int bank)
73{
74 int idx;
75 u64 data;
76 struct acpi_generic_status *estatus;
77
78 WARN_ON(cpu < 0);
79 idx = ELOG_IDX(cpu, bank);
80 data = ELOG_ENTRY_DATA(idx);
81 if ((data & ELOG_ENTRY_VALID) == 0)
82 return NULL;
83
84 data &= EXT_ELOG_ENTRY_MASK;
85 estatus = (struct acpi_generic_status *)ELOG_ENTRY_ADDR(data);
86
87 /* if no valid data in elog entry, just return */
88 if (estatus->block_status == 0)
89 return NULL;
90
91 return estatus;
92}
93
94static void __print_extlog_rcd(const char *pfx,
95 struct acpi_generic_status *estatus, int cpu)
96{
97 static atomic_t seqno;
98 unsigned int curr_seqno;
99 char pfx_seq[64];
100
101 if (!pfx) {
102 if (estatus->error_severity <= CPER_SEV_CORRECTED)
103 pfx = KERN_INFO;
104 else
105 pfx = KERN_ERR;
106 }
107 curr_seqno = atomic_inc_return(&seqno);
108 snprintf(pfx_seq, sizeof(pfx_seq), "%s{%u}", pfx, curr_seqno);
109 printk("%s""Hardware error detected on CPU%d\n", pfx_seq, cpu);
110 cper_estatus_print(pfx_seq, estatus);
111}
112
113static int print_extlog_rcd(const char *pfx,
114 struct acpi_generic_status *estatus, int cpu)
115{
116 /* Not more than 2 messages every 5 seconds */
117 static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2);
118 static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
119 struct ratelimit_state *ratelimit;
120
121 if (estatus->error_severity == CPER_SEV_CORRECTED ||
122 (estatus->error_severity == CPER_SEV_INFORMATIONAL))
123 ratelimit = &ratelimit_corrected;
124 else
125 ratelimit = &ratelimit_uncorrected;
126 if (__ratelimit(ratelimit)) {
127 __print_extlog_rcd(pfx, estatus, cpu);
128 return 0;
129 }
130
131 return 1;
132}
133
134static int extlog_print(struct notifier_block *nb, unsigned long val,
135 void *data)
136{
137 struct mce *mce = (struct mce *)data;
138 int bank = mce->bank;
139 int cpu = mce->extcpu;
140 struct acpi_generic_status *estatus;
141 int rc;
142
143 estatus = extlog_elog_entry_check(cpu, bank);
144 if (estatus == NULL)
145 return NOTIFY_DONE;
146
147 memcpy(elog_buf, (void *)estatus, ELOG_ENTRY_LEN);
148 /* clear record status to enable BIOS to update it again */
149 estatus->block_status = 0;
150
151 rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
152
153 return NOTIFY_DONE;
154}
155
156static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret)
157{
158 struct acpi_buffer buf = {ACPI_ALLOCATE_BUFFER, NULL};
159 struct acpi_object_list input;
160 union acpi_object params[4], *obj;
161 u8 uuid[16];
162 int i;
163
164 acpi_str_to_uuid(extlog_dsm_uuid, uuid);
165 input.count = 4;
166 input.pointer = params;
167 params[0].type = ACPI_TYPE_BUFFER;
168 params[0].buffer.length = 16;
169 params[0].buffer.pointer = uuid;
170 params[1].type = ACPI_TYPE_INTEGER;
171 params[1].integer.value = rev;
172 params[2].type = ACPI_TYPE_INTEGER;
173 params[2].integer.value = func;
174 params[3].type = ACPI_TYPE_PACKAGE;
175 params[3].package.count = 0;
176 params[3].package.elements = NULL;
177
178 if (ACPI_FAILURE(acpi_evaluate_object(handle, "_DSM", &input, &buf)))
179 return -1;
180
181 *ret = 0;
182 obj = (union acpi_object *)buf.pointer;
183 if (obj->type == ACPI_TYPE_INTEGER) {
184 *ret = obj->integer.value;
185 } else if (obj->type == ACPI_TYPE_BUFFER) {
186 if (obj->buffer.length <= 8) {
187 for (i = 0; i < obj->buffer.length; i++)
188 *ret |= (obj->buffer.pointer[i] << (i * 8));
189 }
190 }
191 kfree(buf.pointer);
192
193 return 0;
194}
195
196static bool extlog_get_l1addr(void)
197{
198 acpi_handle handle;
199 u64 ret;
200
201 if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
202 return false;
203
204 if (extlog_get_dsm(handle, EXTLOG_DSM_REV, EXTLOG_FN_QUERY, &ret) ||
205 !(ret & EXTLOG_QUERY_L1_EXIST))
206 return false;
207
208 if (extlog_get_dsm(handle, EXTLOG_DSM_REV, EXTLOG_FN_ADDR, &ret))
209 return false;
210
211 l1_dirbase = ret;
212 /* Spec says L1 directory must be 4K aligned, bail out if it isn't */
213 if (l1_dirbase & ((1 << 12) - 1)) {
214 pr_warn(FW_BUG "L1 Directory is invalid at physical %llx\n",
215 l1_dirbase);
216 return false;
217 }
218
219 return true;
220}
221static struct notifier_block extlog_mce_dec = {
222 .notifier_call = extlog_print,
223};
224
225static int __init extlog_init(void)
226{
227 struct extlog_l1_head *l1_head;
228 void __iomem *extlog_l1_hdr;
229 size_t l1_hdr_size;
230 struct resource *r;
231 u64 cap;
232 int rc;
233
234 rc = -ENODEV;
235
236 rdmsrl(MSR_IA32_MCG_CAP, cap);
237 if (!(cap & MCG_ELOG_P))
238 return rc;
239
240 if (!extlog_get_l1addr())
241 return rc;
242
243 rc = -EINVAL;
244 /* get L1 header to fetch necessary information */
245 l1_hdr_size = sizeof(struct extlog_l1_head);
246 r = request_mem_region(l1_dirbase, l1_hdr_size, "L1 DIR HDR");
247 if (!r) {
248 pr_warn(FW_BUG EMCA_BUG,
249 (unsigned long long)l1_dirbase,
250 (unsigned long long)l1_dirbase + l1_hdr_size);
251 goto err;
252 }
253
254 extlog_l1_hdr = acpi_os_map_memory(l1_dirbase, l1_hdr_size);
255 l1_head = (struct extlog_l1_head *)extlog_l1_hdr;
256 l1_size = l1_head->total_len;
257 l1_percpu_entry = l1_head->entries;
258 elog_base = l1_head->elog_base;
259 elog_size = l1_head->elog_len;
260 acpi_os_unmap_memory(extlog_l1_hdr, l1_hdr_size);
261 release_mem_region(l1_dirbase, l1_hdr_size);
262
263 /* remap L1 header again based on completed information */
264 r = request_mem_region(l1_dirbase, l1_size, "L1 Table");
265 if (!r) {
266 pr_warn(FW_BUG EMCA_BUG,
267 (unsigned long long)l1_dirbase,
268 (unsigned long long)l1_dirbase + l1_size);
269 goto err;
270 }
271 extlog_l1_addr = acpi_os_map_memory(l1_dirbase, l1_size);
272 l1_entry_base = (u64 *)((u8 *)extlog_l1_addr + l1_hdr_size);
273
274 /* remap elog table */
275 r = request_mem_region(elog_base, elog_size, "Elog Table");
276 if (!r) {
277 pr_warn(FW_BUG EMCA_BUG,
278 (unsigned long long)elog_base,
279 (unsigned long long)elog_base + elog_size);
280 goto err_release_l1_dir;
281 }
282 elog_addr = acpi_os_map_memory(elog_base, elog_size);
283
284 rc = -ENOMEM;
285 /* allocate buffer to save elog record */
286 elog_buf = kmalloc(ELOG_ENTRY_LEN, GFP_KERNEL);
287 if (elog_buf == NULL)
288 goto err_release_elog;
289
290 mce_register_decode_chain(&extlog_mce_dec);
291 /* enable OS to be involved to take over management from BIOS */
292 ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
293
294 return 0;
295
296err_release_elog:
297 if (elog_addr)
298 acpi_os_unmap_memory(elog_addr, elog_size);
299 release_mem_region(elog_base, elog_size);
300err_release_l1_dir:
301 if (extlog_l1_addr)
302 acpi_os_unmap_memory(extlog_l1_addr, l1_size);
303 release_mem_region(l1_dirbase, l1_size);
304err:
305 pr_warn(FW_BUG "Extended error log disabled because of problems parsing f/w tables\n");
306 return rc;
307}
308
309static void __exit extlog_exit(void)
310{
311 mce_unregister_decode_chain(&extlog_mce_dec);
312 ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
313 if (extlog_l1_addr)
314 acpi_os_unmap_memory(extlog_l1_addr, l1_size);
315 if (elog_addr)
316 acpi_os_unmap_memory(elog_addr, elog_size);
317 release_mem_region(elog_base, elog_size);
318 release_mem_region(l1_dirbase, l1_size);
319 kfree(elog_buf);
320}
321
322module_init(extlog_init);
323module_exit(extlog_exit);
324
325MODULE_AUTHOR("Chen, Gong <gong.chen@intel.com>");
326MODULE_DESCRIPTION("Extended MCA Error Log Driver");
327MODULE_LICENSE("GPL");
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index b587ec8257b2..e1bd9a181117 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -174,7 +174,7 @@ static void acpi_print_osc_error(acpi_handle handle,
174 printk("\n"); 174 printk("\n");
175} 175}
176 176
177static acpi_status acpi_str_to_uuid(char *str, u8 *uuid) 177acpi_status acpi_str_to_uuid(char *str, u8 *uuid)
178{ 178{
179 int i; 179 int i;
180 static int opc_map_to_uuid[16] = {6, 4, 2, 0, 11, 9, 16, 14, 19, 21, 180 static int opc_map_to_uuid[16] = {6, 4, 2, 0, 11, 9, 16, 14, 19, 21,
@@ -195,6 +195,7 @@ static acpi_status acpi_str_to_uuid(char *str, u8 *uuid)
195 } 195 }
196 return AE_OK; 196 return AE_OK;
197} 197}
198EXPORT_SYMBOL_GPL(acpi_str_to_uuid);
198 199
199acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context) 200acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context)
200{ 201{