aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorHuang Ying <ying.huang@intel.com>2010-05-18 02:35:20 -0400
committerLen Brown <len.brown@intel.com>2010-05-19 22:41:16 -0400
commitd334a49113a4a33109fd24e46073280ecd1bea0d (patch)
treee6fd9acf0d143559186876173a3345d940ba0870 /drivers
parent06d65deade9aabba58e0518df86dcd324e86b832 (diff)
ACPI, APEI, Generic Hardware Error Source memory error support
Generic Hardware Error Source (GHES) provides a way to report platform hardware errors (such as those from the chipset). It works in so-called "Firmware First" mode, that is, hardware errors are reported to firmware first and then reported to Linux by firmware. This way, non-standard hardware error registers or non-standard hardware links can be checked by firmware to produce more valuable hardware error information for Linux. Currently, only the SCI notification type and memory errors are supported. More notification types and hardware error types will be added later. These memory errors are reported to user space through /dev/mcelog by faking a corrected Machine Check, so that the faulty memory page can be offlined by /sbin/mcelog if the error count for one page exceeds the threshold. On some machines, Machine Check cannot report the physical address for some corrected memory errors, but GHES can. So this simplified GHES is implemented first. Signed-off-by: Huang Ying <ying.huang@intel.com> Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/apei/Kconfig14
-rw-r--r--drivers/acpi/apei/Makefile1
-rw-r--r--drivers/acpi/apei/ghes.c427
3 files changed, 442 insertions, 0 deletions
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 5f0a41c2bc62..f8c668f27b5a 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -7,6 +7,20 @@ config ACPI_APEI
7 especially. In addition it supports error serialization and 7 especially. In addition it supports error serialization and
8 error injection. 8 error injection.
9 9
10config ACPI_APEI_GHES
11 tristate "APEI Generic Hardware Error Source"
12 depends on ACPI_APEI && X86
13 select ACPI_HED
14 help
15 Generic Hardware Error Source provides a way to report
16 platform hardware errors (such as that from chipset). It
17 works in so called "Firmware First" mode, that is, hardware
18 errors are reported to firmware firstly, then reported to
19 Linux by firmware. This way, some non-standard hardware
20 error registers or non-standard hardware link can be checked
21 by firmware to produce more valuable hardware error
22 information for Linux.
23
10config ACPI_APEI_EINJ 24config ACPI_APEI_EINJ
11 tristate "APEI Error INJection (EINJ)" 25 tristate "APEI Error INJection (EINJ)"
12 depends on ACPI_APEI && DEBUG_FS 26 depends on ACPI_APEI && DEBUG_FS
diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
index fef963ec5362..41c61db4c51c 100644
--- a/drivers/acpi/apei/Makefile
+++ b/drivers/acpi/apei/Makefile
@@ -1,4 +1,5 @@
1obj-$(CONFIG_ACPI_APEI) += apei.o 1obj-$(CONFIG_ACPI_APEI) += apei.o
2obj-$(CONFIG_ACPI_APEI_GHES) += ghes.o
2obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o 3obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o
3 4
4apei-y := apei-base.o hest.o cper.o 5apei-y := apei-base.o hest.o cper.o
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
new file mode 100644
index 000000000000..fd0cc016a099
--- /dev/null
+++ b/drivers/acpi/apei/ghes.c
@@ -0,0 +1,427 @@
1/*
2 * APEI Generic Hardware Error Source support
3 *
4 * Generic Hardware Error Source provides a way to report platform
5 * hardware errors (such as that from chipset). It works in so called
6 * "Firmware First" mode, that is, hardware errors are reported to
7 * firmware firstly, then reported to Linux by firmware. This way,
8 * some non-standard hardware error registers or non-standard hardware
9 * link can be checked by firmware to produce more hardware error
10 * information for Linux.
11 *
12 * For more information about Generic Hardware Error Source, please
13 * refer to ACPI Specification version 4.0, section 17.3.2.6
14 *
15 * Now, only SCI notification type and memory errors are
16 * supported. More notification type and hardware error type will be
17 * added later.
18 *
19 * Copyright 2010 Intel Corp.
20 * Author: Huang Ying <ying.huang@intel.com>
21 *
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License version
24 * 2 as published by the Free Software Foundation;
25 *
26 * This program is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU General Public License for more details.
30 *
31 * You should have received a copy of the GNU General Public License
32 * along with this program; if not, write to the Free Software
33 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
34 */
35
36#include <linux/kernel.h>
37#include <linux/module.h>
38#include <linux/init.h>
39#include <linux/acpi.h>
40#include <linux/io.h>
41#include <linux/interrupt.h>
42#include <linux/cper.h>
43#include <linux/kdebug.h>
44#include <acpi/apei.h>
45#include <acpi/atomicio.h>
46#include <acpi/hed.h>
47#include <asm/mce.h>
48
49#include "apei-internal.h"
50
/* Prefix placed in front of every log message printed by this file. */
#define GHES_PFX "GHES: "

/*
 * Upper bound, in bytes, that ghes_new() applies to the size of the
 * error status block buffer it allocates.
 */
#define GHES_ESTATUS_MAX_SIZE 65536
54
/*
 * One struct ghes is created for each generic hardware error
 * source.
 *
 * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
 * handler. Handler for one generic hardware error source is only
 * triggered after the previous one is done. So the handler can use
 * struct ghes without locking.
 *
 * estatus: memory buffer for error status block, allocated during
 * HEST parsing.
 */

/*
 * Bit in ghes->flags: set by ghes_read_estatus() once a non-zero block
 * status has been read, and cleared by ghes_clear_estatus() after the
 * zeroed block status has been written back to buffer_paddr.
 */
#define GHES_TO_CLEAR 0x0001

struct ghes {
	/* HEST entry this context was created for (set in ghes_new()) */
	struct acpi_hest_generic *generic;
	/* Local copy of the error status block (kmalloc'ed in ghes_new()) */
	struct acpi_hest_generic_status *estatus;
	/* Link in a per-notification-method list such as ghes_sci */
	struct list_head list;
	/* Physical address the status block was last read from */
	u64 buffer_paddr;
	/* GHES_* flag bits, see GHES_TO_CLEAR */
	unsigned long flags;
};
76
/*
 * Error source lists, one list for each notification method. The
 * members in lists are struct ghes.
 *
 * The list members are only added in HEST parsing and deleted during
 * module_exit, that is, single-threaded. So no lock is needed for
 * that.
 *
 * But the mutual exclusion is needed between members adding/deleting
 * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is
 * used for that.
 *
 * ghes_sci collects the sources whose notification type is
 * ACPI_HEST_NOTIFY_SCI; it is walked by ghes_notify_sci().
 */
static LIST_HEAD(ghes_sci);
90
91static struct ghes *ghes_new(struct acpi_hest_generic *generic)
92{
93 struct ghes *ghes;
94 unsigned int error_block_length;
95 int rc;
96
97 ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
98 if (!ghes)
99 return ERR_PTR(-ENOMEM);
100 ghes->generic = generic;
101 INIT_LIST_HEAD(&ghes->list);
102 rc = acpi_pre_map_gar(&generic->error_status_address);
103 if (rc)
104 goto err_free;
105 error_block_length = generic->error_block_length;
106 if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
107 pr_warning(FW_WARN GHES_PFX
108 "Error status block length is too long: %u for "
109 "generic hardware error source: %d.\n",
110 error_block_length, generic->header.source_id);
111 error_block_length = GHES_ESTATUS_MAX_SIZE;
112 }
113 ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
114 if (!ghes->estatus) {
115 rc = -ENOMEM;
116 goto err_unmap;
117 }
118
119 return ghes;
120
121err_unmap:
122 acpi_post_unmap_gar(&generic->error_status_address);
123err_free:
124 kfree(ghes);
125 return ERR_PTR(rc);
126}
127
128static void ghes_fini(struct ghes *ghes)
129{
130 kfree(ghes->estatus);
131 acpi_post_unmap_gar(&ghes->generic->error_status_address);
132}
133
/*
 * Severity levels used inside this file.  ghes_severity() maps the
 * CPER_SER_* values found in an error status block onto these.
 */
enum {
	GHES_SER_NO = 0x0,
	GHES_SER_CORRECTED = 0x1,
	GHES_SER_RECOVERABLE = 0x2,
	GHES_SER_PANIC = 0x3,
};
140
141static inline int ghes_severity(int severity)
142{
143 switch (severity) {
144 case CPER_SER_INFORMATIONAL:
145 return GHES_SER_NO;
146 case CPER_SER_CORRECTED:
147 return GHES_SER_CORRECTED;
148 case CPER_SER_RECOVERABLE:
149 return GHES_SER_RECOVERABLE;
150 case CPER_SER_FATAL:
151 return GHES_SER_PANIC;
152 default:
153 /* Unkown, go panic */
154 return GHES_SER_PANIC;
155 }
156}
157
158/* SCI handler run in work queue, so ioremap can be used here */
159static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
160 int from_phys)
161{
162 void *vaddr;
163
164 vaddr = ioremap_cache(paddr, len);
165 if (!vaddr)
166 return -ENOMEM;
167 if (from_phys)
168 memcpy(buffer, vaddr, len);
169 else
170 memcpy(vaddr, buffer, len);
171 iounmap(vaddr);
172
173 return 0;
174}
175
176static int ghes_read_estatus(struct ghes *ghes, int silent)
177{
178 struct acpi_hest_generic *g = ghes->generic;
179 u64 buf_paddr;
180 u32 len;
181 int rc;
182
183 rc = acpi_atomic_read(&buf_paddr, &g->error_status_address);
184 if (rc) {
185 if (!silent && printk_ratelimit())
186 pr_warning(FW_WARN GHES_PFX
187"Failed to read error status block address for hardware error source: %d.\n",
188 g->header.source_id);
189 return -EIO;
190 }
191 if (!buf_paddr)
192 return -ENOENT;
193
194 rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
195 sizeof(*ghes->estatus), 1);
196 if (rc)
197 return rc;
198 if (!ghes->estatus->block_status)
199 return -ENOENT;
200
201 ghes->buffer_paddr = buf_paddr;
202 ghes->flags |= GHES_TO_CLEAR;
203
204 rc = -EIO;
205 len = apei_estatus_len(ghes->estatus);
206 if (len < sizeof(*ghes->estatus))
207 goto err_read_block;
208 if (len > ghes->generic->error_block_length)
209 goto err_read_block;
210 if (apei_estatus_check_header(ghes->estatus))
211 goto err_read_block;
212 rc = ghes_copy_tofrom_phys(ghes->estatus + 1,
213 buf_paddr + sizeof(*ghes->estatus),
214 len - sizeof(*ghes->estatus), 1);
215 if (rc)
216 return rc;
217 if (apei_estatus_check(ghes->estatus))
218 goto err_read_block;
219 rc = 0;
220
221err_read_block:
222 if (rc && !silent)
223 pr_warning(FW_WARN GHES_PFX
224 "Failed to read error status block!\n");
225 return rc;
226}
227
228static void ghes_clear_estatus(struct ghes *ghes)
229{
230 ghes->estatus->block_status = 0;
231 if (!(ghes->flags & GHES_TO_CLEAR))
232 return;
233 ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
234 sizeof(ghes->estatus->block_status), 0);
235 ghes->flags &= ~GHES_TO_CLEAR;
236}
237
238static void ghes_do_proc(struct ghes *ghes)
239{
240 int ser, processed = 0;
241 struct acpi_hest_generic_data *gdata;
242
243 ser = ghes_severity(ghes->estatus->error_severity);
244 apei_estatus_for_each_section(ghes->estatus, gdata) {
245#ifdef CONFIG_X86_MCE
246 if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
247 CPER_SEC_PLATFORM_MEM)) {
248 apei_mce_report_mem_error(
249 ser == GHES_SER_CORRECTED,
250 (struct cper_sec_mem_err *)(gdata+1));
251 processed = 1;
252 }
253#endif
254 }
255
256 if (!processed && printk_ratelimit())
257 pr_warning(GHES_PFX
258 "Unknown error record from generic hardware error source: %d\n",
259 ghes->generic->header.source_id);
260}
261
/*
 * Handle one notification for @ghes: read the error status block,
 * process it if the read succeeded, and always acknowledge the block
 * afterwards via ghes_clear_estatus().
 *
 * Returns 0 when an error status block was read and processed, or the
 * error from ghes_read_estatus() otherwise (for instance -ENOENT when
 * this source has nothing to report), so that callers can tell whether
 * the source actually delivered a record.
 */
static int ghes_proc(struct ghes *ghes)
{
	int rc;

	rc = ghes_read_estatus(ghes, 0);
	if (!rc)
		ghes_do_proc(ghes);

	/* Clear even on failure so a bad block is not left pending. */
	ghes_clear_estatus(ghes);
	return rc;
}
275
276static int ghes_notify_sci(struct notifier_block *this,
277 unsigned long event, void *data)
278{
279 struct ghes *ghes;
280 int ret = NOTIFY_DONE;
281
282 rcu_read_lock();
283 list_for_each_entry_rcu(ghes, &ghes_sci, list) {
284 if (!ghes_proc(ghes))
285 ret = NOTIFY_OK;
286 }
287 rcu_read_unlock();
288
289 return ret;
290}
291
/*
 * Notifier block handed to register_acpi_hed_notifier() when the first
 * SCI-notified source is added, and unregistered in ghes_cleanup().
 */
static struct notifier_block ghes_notifier_sci = {
	.notifier_call = ghes_notify_sci,
};
295
296static int hest_ghes_parse(struct acpi_hest_header *hest_hdr, void *data)
297{
298 struct acpi_hest_generic *generic;
299 struct ghes *ghes = NULL;
300 int rc = 0;
301
302 if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
303 return 0;
304
305 generic = (struct acpi_hest_generic *)hest_hdr;
306 if (!generic->enabled)
307 return 0;
308
309 if (generic->error_block_length <
310 sizeof(struct acpi_hest_generic_status)) {
311 pr_warning(FW_BUG GHES_PFX
312"Invalid error block length: %u for generic hardware error source: %d\n",
313 generic->error_block_length,
314 generic->header.source_id);
315 goto err;
316 }
317 if (generic->records_to_preallocate == 0) {
318 pr_warning(FW_BUG GHES_PFX
319"Invalid records to preallocate: %u for generic hardware error source: %d\n",
320 generic->records_to_preallocate,
321 generic->header.source_id);
322 goto err;
323 }
324 ghes = ghes_new(generic);
325 if (IS_ERR(ghes)) {
326 rc = PTR_ERR(ghes);
327 ghes = NULL;
328 goto err;
329 }
330 switch (generic->notify.type) {
331 case ACPI_HEST_NOTIFY_POLLED:
332 pr_warning(GHES_PFX
333"Generic hardware error source: %d notified via POLL is not supported!\n",
334 generic->header.source_id);
335 break;
336 case ACPI_HEST_NOTIFY_EXTERNAL:
337 case ACPI_HEST_NOTIFY_LOCAL:
338 pr_warning(GHES_PFX
339"Generic hardware error source: %d notified via IRQ is not supported!\n",
340 generic->header.source_id);
341 break;
342 case ACPI_HEST_NOTIFY_SCI:
343 if (list_empty(&ghes_sci))
344 register_acpi_hed_notifier(&ghes_notifier_sci);
345 list_add_rcu(&ghes->list, &ghes_sci);
346 break;
347 case ACPI_HEST_NOTIFY_NMI:
348 pr_warning(GHES_PFX
349"Generic hardware error source: %d notified via NMI is not supported!\n",
350 generic->header.source_id);
351 break;
352 default:
353 pr_warning(FW_WARN GHES_PFX
354 "Unknown notification type: %u for generic hardware error source: %d\n",
355 generic->notify.type, generic->header.source_id);
356 break;
357 }
358
359 return 0;
360err:
361 if (ghes)
362 ghes_fini(ghes);
363 return rc;
364}
365
366static void ghes_cleanup(void)
367{
368 struct ghes *ghes, *nghes;
369
370 if (!list_empty(&ghes_sci))
371 unregister_acpi_hed_notifier(&ghes_notifier_sci);
372
373 synchronize_rcu();
374
375 list_for_each_entry_safe(ghes, nghes, &ghes_sci, list) {
376 list_del(&ghes->list);
377 ghes_fini(ghes);
378 kfree(ghes);
379 }
380}
381
382static int __init ghes_init(void)
383{
384 int rc;
385
386 if (acpi_disabled)
387 return -ENODEV;
388
389 if (hest_disable) {
390 pr_info(GHES_PFX "HEST is not enabled!\n");
391 return -EINVAL;
392 }
393
394 rc = apei_hest_parse(hest_ghes_parse, NULL);
395 if (rc) {
396 pr_err(GHES_PFX
397 "Error during parsing HEST generic hardware error sources.\n");
398 goto err_cleanup;
399 }
400
401 if (list_empty(&ghes_sci)) {
402 pr_info(GHES_PFX
403 "No functional generic hardware error sources.\n");
404 rc = -ENODEV;
405 goto err_cleanup;
406 }
407
408 pr_info(GHES_PFX
409 "Generic Hardware Error Source support is initialized.\n");
410
411 return 0;
412err_cleanup:
413 ghes_cleanup();
414 return rc;
415}
416
/* Module exit point: release every error source via ghes_cleanup(). */
static void __exit ghes_exit(void)
{
	ghes_cleanup();
}

/* Hook the init/exit functions into module load and unload. */
module_init(ghes_init);
module_exit(ghes_exit);

MODULE_AUTHOR("Huang Ying");
MODULE_DESCRIPTION("APEI Generic Hardware Error Source support");
MODULE_LICENSE("GPL");