author    Linus Torvalds <torvalds@linux-foundation.org>  2011-08-04 03:53:27 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-08-04 03:53:27 -0400
commit    c0c770e610cc4cdcd66c7e939bdf89cc3e72f79d (patch)
tree      7cf6807258fef2a85a2ff212f4f4eb6d9dc336c6
parent    a9e4e6e14c322e08d1c615afc8f504fb415f9613 (diff)
parent    d0e323b47057f4492b8fa22345f38d80a469bf8d (diff)
Merge branch 'apei-release' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6
* 'apei-release' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6:
  ACPI, APEI, EINJ Param support is disabled by default
  APEI GHES: 32-bit buildfix
  ACPI: APEI build fix
  ACPI, APEI, GHES: Add hardware memory error recovery support
  HWPoison: add memory_failure_queue()
  ACPI, APEI, GHES, Error records content based throttle
  ACPI, APEI, GHES, printk support for recoverable error via NMI
  lib, Make gen_pool memory allocator lockless
  lib, Add lock-less NULL terminated single list
  Add Kconfig option ARCH_HAVE_NMI_SAFE_CMPXCHG
  ACPI, APEI, Add WHEA _OSC support
  ACPI, APEI, Add APEI bit support in generic _OSC call
  ACPI, APEI, GHES, Support disable GHES at boot time
  ACPI, APEI, GHES, Prevent GHES to be built as module
  ACPI, APEI, Use apei_exec_run_optional in APEI EINJ and ERST
  ACPI, APEI, Add apei_exec_run_optional
  ACPI, APEI, GHES, Do not ratelimit fatal error printk before panic
  ACPI, APEI, ERST, Fix erst-dbg long record reading issue
  ACPI, APEI, ERST, Prevent erst_dbg from loading if ERST is disabled
-rw-r--r--  Documentation/acpi/apei/einj.txt   |  11
-rw-r--r--  arch/Kconfig                       |   3
-rw-r--r--  arch/alpha/Kconfig                 |   1
-rw-r--r--  arch/avr32/Kconfig                 |   1
-rw-r--r--  arch/frv/Kconfig                   |   1
-rw-r--r--  arch/ia64/Kconfig                  |   1
-rw-r--r--  arch/m68k/Kconfig                  |   1
-rw-r--r--  arch/parisc/Kconfig                |   1
-rw-r--r--  arch/powerpc/Kconfig               |   1
-rw-r--r--  arch/s390/Kconfig                  |   1
-rw-r--r--  arch/sh/Kconfig                    |   1
-rw-r--r--  arch/sparc/Kconfig                 |   1
-rw-r--r--  arch/tile/Kconfig                  |   1
-rw-r--r--  arch/x86/Kconfig                   |   1
-rw-r--r--  drivers/acpi/apei/Kconfig          |  11
-rw-r--r--  drivers/acpi/apei/apei-base.c      |  35
-rw-r--r--  drivers/acpi/apei/apei-internal.h  |  15
-rw-r--r--  drivers/acpi/apei/einj.c           |  43
-rw-r--r--  drivers/acpi/apei/erst-dbg.c       |   6
-rw-r--r--  drivers/acpi/apei/erst.c           |  12
-rw-r--r--  drivers/acpi/apei/ghes.c           | 431
-rw-r--r--  drivers/acpi/apei/hest.c           |  17
-rw-r--r--  drivers/acpi/bus.c                 |  14
-rw-r--r--  include/acpi/apei.h                |   5
-rw-r--r--  include/linux/acpi.h               |   2
-rw-r--r--  include/linux/bitmap.h             |   1
-rw-r--r--  include/linux/genalloc.h           |  34
-rw-r--r--  include/linux/llist.h              | 126
-rw-r--r--  include/linux/mm.h                 |   1
-rw-r--r--  lib/Kconfig                        |   3
-rw-r--r--  lib/Makefile                       |   2
-rw-r--r--  lib/bitmap.c                       |   2
-rw-r--r--  lib/genalloc.c                     | 300
-rw-r--r--  lib/llist.c                        | 129
-rw-r--r--  mm/memory-failure.c                |  92
35 files changed, 1172 insertions, 135 deletions
diff --git a/Documentation/acpi/apei/einj.txt b/Documentation/acpi/apei/einj.txt
index dfab71848dc8..5cc699ba5453 100644
--- a/Documentation/acpi/apei/einj.txt
+++ b/Documentation/acpi/apei/einj.txt
@@ -48,12 +48,19 @@ directory apei/einj. The following files are provided.
48- param1 48- param1
49 This file is used to set the first error parameter value. Effect of 49 This file is used to set the first error parameter value. Effect of
50 parameter depends on error_type specified. For memory error, this is 50 parameter depends on error_type specified. For memory error, this is
51 physical memory address. 51 physical memory address. Only available if param_extension module
52 parameter is specified.
52 53
53- param2 54- param2
54 This file is used to set the second error parameter value. Effect of 55 This file is used to set the second error parameter value. Effect of
55 parameter depends on error_type specified. For memory error, this is 56 parameter depends on error_type specified. For memory error, this is
56 physical memory address mask. 57 physical memory address mask. Only available if param_extension
58 module parameter is specified.
59
60Injecting parameter support is a BIOS version specific extension, that
61is, it only works on some BIOS version. If you want to use it, please
62make sure your BIOS version has the proper support and specify
63"param_extension=y" in module parameter.
57 64
58For more information about EINJ, please refer to ACPI specification 65For more information about EINJ, please refer to ACPI specification
59version 4.0, section 17.5. 66version 4.0, section 17.5.
diff --git a/arch/Kconfig b/arch/Kconfig
index 26b0e2397a57..4b0669cbb3b0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -178,4 +178,7 @@ config HAVE_ARCH_MUTEX_CPU_RELAX
178config HAVE_RCU_TABLE_FREE 178config HAVE_RCU_TABLE_FREE
179 bool 179 bool
180 180
181config ARCH_HAVE_NMI_SAFE_CMPXCHG
182 bool
183
181source "kernel/gcov/Kconfig" 184source "kernel/gcov/Kconfig"
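ARCH_HAVE_NMI_SAFE_CMPXCHG tells generic code that cmpxchg() may be used from
NMI context; on architectures that emulate cmpxchg by disabling interrupts or
taking spinlocks, an NMI arriving mid-emulation could deadlock. Below is a
minimal illustrative sketch, not part of this series, of the kind of lock-free
update the option enables from an NMI handler (the function and variable names
are made up for the example):

/* Illustrative only -- remember the highest severity seen, safely
 * callable from an NMI handler, but only on architectures that
 * select ARCH_HAVE_NMI_SAFE_CMPXCHG.
 */
#include <linux/atomic.h>

static atomic_t worst_severity = ATOMIC_INIT(0);

static void note_severity_nmi(int sev)
{
	int old, cur;

	cur = atomic_read(&worst_severity);
	for (;;) {
		if (sev <= cur)
			return;
		old = atomic_cmpxchg(&worst_severity, cur, sev);
		if (old == cur)
			return;		/* our update won */
		cur = old;		/* lost the race, retry */
	}
}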
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index ca2da8da6e9c..60cde53d266c 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -14,6 +14,7 @@ config ALPHA
14 select AUTO_IRQ_AFFINITY if SMP 14 select AUTO_IRQ_AFFINITY if SMP
15 select GENERIC_IRQ_SHOW 15 select GENERIC_IRQ_SHOW
16 select ARCH_WANT_OPTIONAL_GPIOLIB 16 select ARCH_WANT_OPTIONAL_GPIOLIB
17 select ARCH_HAVE_NMI_SAFE_CMPXCHG
17 help 18 help
18 The Alpha is a 64-bit general-purpose processor designed and 19 The Alpha is a 64-bit general-purpose processor designed and
19 marketed by the Digital Equipment Corporation of blessed memory, 20 marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index e9d689b7c833..197e96f70405 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -10,6 +10,7 @@ config AVR32
10 select GENERIC_IRQ_PROBE 10 select GENERIC_IRQ_PROBE
11 select HARDIRQS_SW_RESEND 11 select HARDIRQS_SW_RESEND
12 select GENERIC_IRQ_SHOW 12 select GENERIC_IRQ_SHOW
13 select ARCH_HAVE_NMI_SAFE_CMPXCHG
13 help 14 help
14 AVR32 is a high-performance 32-bit RISC microprocessor core, 15 AVR32 is a high-performance 32-bit RISC microprocessor core,
15 designed for cost-sensitive embedded applications, with particular 16 designed for cost-sensitive embedded applications, with particular
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index cb884e489425..bad27a6ff407 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -7,6 +7,7 @@ config FRV
7 select HAVE_PERF_EVENTS 7 select HAVE_PERF_EVENTS
8 select HAVE_GENERIC_HARDIRQS 8 select HAVE_GENERIC_HARDIRQS
9 select GENERIC_IRQ_SHOW 9 select GENERIC_IRQ_SHOW
10 select ARCH_HAVE_NMI_SAFE_CMPXCHG
10 11
11config ZONE_DMA 12config ZONE_DMA
12 bool 13 bool
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 64c7ab7e7a81..124854714958 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -28,6 +28,7 @@ config IA64
28 select IRQ_PER_CPU 28 select IRQ_PER_CPU
29 select GENERIC_IRQ_SHOW 29 select GENERIC_IRQ_SHOW
30 select ARCH_WANT_OPTIONAL_GPIOLIB 30 select ARCH_WANT_OPTIONAL_GPIOLIB
31 select ARCH_HAVE_NMI_SAFE_CMPXCHG
31 default y 32 default y
32 help 33 help
33 The Itanium Processor Family is Intel's 64-bit successor to 34 The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 284cd3771eaa..9e8ee9d2b8ca 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -6,6 +6,7 @@ config M68K
6 select GENERIC_ATOMIC64 if MMU 6 select GENERIC_ATOMIC64 if MMU
7 select HAVE_GENERIC_HARDIRQS if !MMU 7 select HAVE_GENERIC_HARDIRQS if !MMU
8 select GENERIC_IRQ_SHOW if !MMU 8 select GENERIC_IRQ_SHOW if !MMU
9 select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
9 10
10config RWSEM_GENERIC_SPINLOCK 11config RWSEM_GENERIC_SPINLOCK
11 bool 12 bool
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 65adc86a230e..e077b0bf56ca 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -15,6 +15,7 @@ config PARISC
15 select HAVE_GENERIC_HARDIRQS 15 select HAVE_GENERIC_HARDIRQS
16 select GENERIC_IRQ_PROBE 16 select GENERIC_IRQ_PROBE
17 select IRQ_PER_CPU 17 select IRQ_PER_CPU
18 select ARCH_HAVE_NMI_SAFE_CMPXCHG
18 19
19 help 20 help
20 The PA-RISC microprocessor is designed by Hewlett-Packard and used 21 The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 374c475e56a3..6926b61acfea 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -136,6 +136,7 @@ config PPC
136 select HAVE_SYSCALL_TRACEPOINTS 136 select HAVE_SYSCALL_TRACEPOINTS
137 select HAVE_BPF_JIT if (PPC64 && NET) 137 select HAVE_BPF_JIT if (PPC64 && NET)
138 select HAVE_ARCH_JUMP_LABEL 138 select HAVE_ARCH_JUMP_LABEL
139 select ARCH_HAVE_NMI_SAFE_CMPXCHG
139 140
140config EARLY_PRINTK 141config EARLY_PRINTK
141 bool 142 bool
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c03fef7a9c22..0f98bbddade5 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -81,6 +81,7 @@ config S390
81 select INIT_ALL_POSSIBLE 81 select INIT_ALL_POSSIBLE
82 select HAVE_IRQ_WORK 82 select HAVE_IRQ_WORK
83 select HAVE_PERF_EVENTS 83 select HAVE_PERF_EVENTS
84 select ARCH_HAVE_NMI_SAFE_CMPXCHG
84 select HAVE_KERNEL_GZIP 85 select HAVE_KERNEL_GZIP
85 select HAVE_KERNEL_BZIP2 86 select HAVE_KERNEL_BZIP2
86 select HAVE_KERNEL_LZMA 87 select HAVE_KERNEL_LZMA
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 748ff1920068..ff9177c8f643 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -11,6 +11,7 @@ config SUPERH
11 select HAVE_DMA_ATTRS 11 select HAVE_DMA_ATTRS
12 select HAVE_IRQ_WORK 12 select HAVE_IRQ_WORK
13 select HAVE_PERF_EVENTS 13 select HAVE_PERF_EVENTS
14 select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A)
14 select PERF_USE_VMALLOC 15 select PERF_USE_VMALLOC
15 select HAVE_KERNEL_GZIP 16 select HAVE_KERNEL_GZIP
16 select HAVE_KERNEL_BZIP2 17 select HAVE_KERNEL_BZIP2
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 1074dddcb104..42c67beadcae 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -54,6 +54,7 @@ config SPARC64
54 select HAVE_PERF_EVENTS 54 select HAVE_PERF_EVENTS
55 select PERF_USE_VMALLOC 55 select PERF_USE_VMALLOC
56 select IRQ_PREFLOW_FASTEOI 56 select IRQ_PREFLOW_FASTEOI
57 select ARCH_HAVE_NMI_SAFE_CMPXCHG
57 58
58config ARCH_DEFCONFIG 59config ARCH_DEFCONFIG
59 string 60 string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 0249b8b4db54..b30f71ac0d06 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -12,6 +12,7 @@ config TILE
12 select GENERIC_PENDING_IRQ if SMP 12 select GENERIC_PENDING_IRQ if SMP
13 select GENERIC_IRQ_SHOW 13 select GENERIC_IRQ_SHOW
14 select SYS_HYPERVISOR 14 select SYS_HYPERVISOR
15 select ARCH_HAVE_NMI_SAFE_CMPXCHG if !M386
15 16
16# FIXME: investigate whether we need/want these options. 17# FIXME: investigate whether we need/want these options.
17# select HAVE_IOREMAP_PROT 18# select HAVE_IOREMAP_PROT
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7cf916fc1ce7..6a47bb22657f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -72,6 +72,7 @@ config X86
72 select USE_GENERIC_SMP_HELPERS if SMP 72 select USE_GENERIC_SMP_HELPERS if SMP
73 select HAVE_BPF_JIT if (X86_64 && NET) 73 select HAVE_BPF_JIT if (X86_64 && NET)
74 select CLKEVT_I8253 74 select CLKEVT_I8253
75 select ARCH_HAVE_NMI_SAFE_CMPXCHG
75 76
76config INSTRUCTION_DECODER 77config INSTRUCTION_DECODER
77 def_bool (KPROBES || PERF_EVENTS) 78 def_bool (KPROBES || PERF_EVENTS)
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index f739a70b1c70..c34aa51af4ee 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -10,9 +10,11 @@ config ACPI_APEI
10 error injection. 10 error injection.
11 11
12config ACPI_APEI_GHES 12config ACPI_APEI_GHES
13 tristate "APEI Generic Hardware Error Source" 13 bool "APEI Generic Hardware Error Source"
14 depends on ACPI_APEI && X86 14 depends on ACPI_APEI && X86
15 select ACPI_HED 15 select ACPI_HED
16 select LLIST
17 select GENERIC_ALLOCATOR
16 help 18 help
17 Generic Hardware Error Source provides a way to report 19 Generic Hardware Error Source provides a way to report
18 platform hardware errors (such as that from chipset). It 20 platform hardware errors (such as that from chipset). It
@@ -30,6 +32,13 @@ config ACPI_APEI_PCIEAER
30 PCIe AER errors may be reported via APEI firmware first mode. 32 PCIe AER errors may be reported via APEI firmware first mode.
31 Turn on this option to enable the corresponding support. 33 Turn on this option to enable the corresponding support.
32 34
35config ACPI_APEI_MEMORY_FAILURE
36 bool "APEI memory error recovering support"
37 depends on ACPI_APEI && MEMORY_FAILURE
38 help
39 Memory errors may be reported via APEI firmware first mode.
40 Turn on this option to enable the memory recovering support.
41
33config ACPI_APEI_EINJ 42config ACPI_APEI_EINJ
34 tristate "APEI Error INJection (EINJ)" 43 tristate "APEI Error INJection (EINJ)"
35 depends on ACPI_APEI && DEBUG_FS 44 depends on ACPI_APEI && DEBUG_FS
diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c
index 4a904a4bf05f..8041248fce9b 100644
--- a/drivers/acpi/apei/apei-base.c
+++ b/drivers/acpi/apei/apei-base.c
@@ -157,9 +157,10 @@ EXPORT_SYMBOL_GPL(apei_exec_noop);
157 * Interpret the specified action. Go through whole action table, 157 * Interpret the specified action. Go through whole action table,
158 * execute all instructions belong to the action. 158 * execute all instructions belong to the action.
159 */ 159 */
160int apei_exec_run(struct apei_exec_context *ctx, u8 action) 160int __apei_exec_run(struct apei_exec_context *ctx, u8 action,
161 bool optional)
161{ 162{
162 int rc; 163 int rc = -ENOENT;
163 u32 i, ip; 164 u32 i, ip;
164 struct acpi_whea_header *entry; 165 struct acpi_whea_header *entry;
165 apei_exec_ins_func_t run; 166 apei_exec_ins_func_t run;
@@ -198,9 +199,9 @@ rewind:
198 goto rewind; 199 goto rewind;
199 } 200 }
200 201
201 return 0; 202 return !optional && rc < 0 ? rc : 0;
202} 203}
203EXPORT_SYMBOL_GPL(apei_exec_run); 204EXPORT_SYMBOL_GPL(__apei_exec_run);
204 205
205typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx, 206typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx,
206 struct acpi_whea_header *entry, 207 struct acpi_whea_header *entry,
@@ -603,3 +604,29 @@ struct dentry *apei_get_debugfs_dir(void)
603 return dapei; 604 return dapei;
604} 605}
605EXPORT_SYMBOL_GPL(apei_get_debugfs_dir); 606EXPORT_SYMBOL_GPL(apei_get_debugfs_dir);
607
608int apei_osc_setup(void)
609{
610 static u8 whea_uuid_str[] = "ed855e0c-6c90-47bf-a62a-26de0fc5ad5c";
611 acpi_handle handle;
612 u32 capbuf[3];
613 struct acpi_osc_context context = {
614 .uuid_str = whea_uuid_str,
615 .rev = 1,
616 .cap.length = sizeof(capbuf),
617 .cap.pointer = capbuf,
618 };
619
620 capbuf[OSC_QUERY_TYPE] = OSC_QUERY_ENABLE;
621 capbuf[OSC_SUPPORT_TYPE] = 0;
622 capbuf[OSC_CONTROL_TYPE] = 0;
623
624 if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))
625 || ACPI_FAILURE(acpi_run_osc(handle, &context)))
626 return -EIO;
627 else {
628 kfree(context.ret.pointer);
629 return 0;
630 }
631}
632EXPORT_SYMBOL_GPL(apei_osc_setup);
diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h
index ef0581f2094d..f57050e7a5e7 100644
--- a/drivers/acpi/apei/apei-internal.h
+++ b/drivers/acpi/apei/apei-internal.h
@@ -50,7 +50,18 @@ static inline u64 apei_exec_ctx_get_output(struct apei_exec_context *ctx)
50 return ctx->value; 50 return ctx->value;
51} 51}
52 52
53int apei_exec_run(struct apei_exec_context *ctx, u8 action); 53int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool optional);
54
55static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action)
56{
57 return __apei_exec_run(ctx, action, 0);
58}
59
60/* It is optional whether the firmware provides the action */
61static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action)
62{
63 return __apei_exec_run(ctx, action, 1);
64}
54 65
55/* Common instruction implementation */ 66/* Common instruction implementation */
56 67
@@ -113,4 +124,6 @@ void apei_estatus_print(const char *pfx,
113 const struct acpi_hest_generic_status *estatus); 124 const struct acpi_hest_generic_status *estatus);
114int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus); 125int apei_estatus_check_header(const struct acpi_hest_generic_status *estatus);
115int apei_estatus_check(const struct acpi_hest_generic_status *estatus); 126int apei_estatus_check(const struct acpi_hest_generic_status *estatus);
127
128int apei_osc_setup(void);
116#endif 129#endif
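The new apei_exec_run_optional() wrapper exists because many BIOSes omit the
BEGIN/END style actions from their APEI tables, so only genuinely mandatory
actions should fail hard. A hedged sketch of how a table driver mixes the two
wrappers follows; the ACPI_ERST_* action names come from the existing ERST
driver, while the helper itself is purely illustrative:

/* Illustrative helper -- not part of this series. BEGIN/END style
 * actions go through apei_exec_run_optional() so a table that omits
 * them is not treated as broken; the operation itself stays mandatory.
 */
static int example_erst_sequence(struct apei_exec_context *ctx)
{
	int rc;

	rc = apei_exec_run_optional(ctx, ACPI_ERST_BEGIN_WRITE);
	if (rc)
		return rc;
	rc = apei_exec_run(ctx, ACPI_ERST_EXECUTE_OPERATION);	/* must exist */
	if (rc)
		return rc;
	return apei_exec_run_optional(ctx, ACPI_ERST_END);
}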
diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index f74b2ea11f21..589b96c38704 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -46,7 +46,8 @@
46 * Some BIOSes allow parameters to the SET_ERROR_TYPE entries in the 46 * Some BIOSes allow parameters to the SET_ERROR_TYPE entries in the
47 * EINJ table through an unpublished extension. Use with caution as 47 * EINJ table through an unpublished extension. Use with caution as
48 * most will ignore the parameter and make their own choice of address 48 * most will ignore the parameter and make their own choice of address
49 * for error injection. 49 * for error injection. This extension is used only if
50 * param_extension module parameter is specified.
50 */ 51 */
51struct einj_parameter { 52struct einj_parameter {
52 u64 type; 53 u64 type;
@@ -65,6 +66,9 @@ struct einj_parameter {
65 ((struct acpi_whea_header *)((char *)(tab) + \ 66 ((struct acpi_whea_header *)((char *)(tab) + \
66 sizeof(struct acpi_table_einj))) 67 sizeof(struct acpi_table_einj)))
67 68
69static bool param_extension;
70module_param(param_extension, bool, 0);
71
68static struct acpi_table_einj *einj_tab; 72static struct acpi_table_einj *einj_tab;
69 73
70static struct apei_resources einj_resources; 74static struct apei_resources einj_resources;
@@ -285,7 +289,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
285 289
286 einj_exec_ctx_init(&ctx); 290 einj_exec_ctx_init(&ctx);
287 291
288 rc = apei_exec_run(&ctx, ACPI_EINJ_BEGIN_OPERATION); 292 rc = apei_exec_run_optional(&ctx, ACPI_EINJ_BEGIN_OPERATION);
289 if (rc) 293 if (rc)
290 return rc; 294 return rc;
291 apei_exec_ctx_set_input(&ctx, type); 295 apei_exec_ctx_set_input(&ctx, type);
@@ -323,7 +327,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
323 rc = __einj_error_trigger(trigger_paddr); 327 rc = __einj_error_trigger(trigger_paddr);
324 if (rc) 328 if (rc)
325 return rc; 329 return rc;
326 rc = apei_exec_run(&ctx, ACPI_EINJ_END_OPERATION); 330 rc = apei_exec_run_optional(&ctx, ACPI_EINJ_END_OPERATION);
327 331
328 return rc; 332 return rc;
329} 333}
@@ -489,14 +493,6 @@ static int __init einj_init(void)
489 einj_debug_dir, NULL, &error_type_fops); 493 einj_debug_dir, NULL, &error_type_fops);
490 if (!fentry) 494 if (!fentry)
491 goto err_cleanup; 495 goto err_cleanup;
492 fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
493 einj_debug_dir, &error_param1);
494 if (!fentry)
495 goto err_cleanup;
496 fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR,
497 einj_debug_dir, &error_param2);
498 if (!fentry)
499 goto err_cleanup;
500 fentry = debugfs_create_file("error_inject", S_IWUSR, 496 fentry = debugfs_create_file("error_inject", S_IWUSR,
501 einj_debug_dir, NULL, &error_inject_fops); 497 einj_debug_dir, NULL, &error_inject_fops);
502 if (!fentry) 498 if (!fentry)
@@ -513,12 +509,23 @@ static int __init einj_init(void)
513 rc = apei_exec_pre_map_gars(&ctx); 509 rc = apei_exec_pre_map_gars(&ctx);
514 if (rc) 510 if (rc)
515 goto err_release; 511 goto err_release;
516 param_paddr = einj_get_parameter_address(); 512 if (param_extension) {
517 if (param_paddr) { 513 param_paddr = einj_get_parameter_address();
518 einj_param = ioremap(param_paddr, sizeof(*einj_param)); 514 if (param_paddr) {
519 rc = -ENOMEM; 515 einj_param = ioremap(param_paddr, sizeof(*einj_param));
520 if (!einj_param) 516 rc = -ENOMEM;
521 goto err_unmap; 517 if (!einj_param)
518 goto err_unmap;
519 fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
520 einj_debug_dir, &error_param1);
521 if (!fentry)
522 goto err_unmap;
523 fentry = debugfs_create_x64("param2", S_IRUSR | S_IWUSR,
524 einj_debug_dir, &error_param2);
525 if (!fentry)
526 goto err_unmap;
527 } else
528 pr_warn(EINJ_PFX "Parameter extension is not supported.\n");
522 } 529 }
523 530
524 pr_info(EINJ_PFX "Error INJection is initialized.\n"); 531 pr_info(EINJ_PFX "Error INJection is initialized.\n");
@@ -526,6 +533,8 @@ static int __init einj_init(void)
526 return 0; 533 return 0;
527 534
528err_unmap: 535err_unmap:
536 if (einj_param)
537 iounmap(einj_param);
529 apei_exec_post_unmap_gars(&ctx); 538 apei_exec_post_unmap_gars(&ctx);
530err_release: 539err_release:
531 apei_resources_release(&einj_resources); 540 apei_resources_release(&einj_resources);
diff --git a/drivers/acpi/apei/erst-dbg.c b/drivers/acpi/apei/erst-dbg.c
index a4cfb64c86a1..903549df809b 100644
--- a/drivers/acpi/apei/erst-dbg.c
+++ b/drivers/acpi/apei/erst-dbg.c
@@ -33,7 +33,7 @@
33 33
34#define ERST_DBG_PFX "ERST DBG: " 34#define ERST_DBG_PFX "ERST DBG: "
35 35
36#define ERST_DBG_RECORD_LEN_MAX 4096 36#define ERST_DBG_RECORD_LEN_MAX 0x4000
37 37
38static void *erst_dbg_buf; 38static void *erst_dbg_buf;
39static unsigned int erst_dbg_buf_len; 39static unsigned int erst_dbg_buf_len;
@@ -213,6 +213,10 @@ static struct miscdevice erst_dbg_dev = {
213 213
214static __init int erst_dbg_init(void) 214static __init int erst_dbg_init(void)
215{ 215{
216 if (erst_disable) {
217 pr_info(ERST_DBG_PFX "ERST support is disabled.\n");
218 return -ENODEV;
219 }
216 return misc_register(&erst_dbg_dev); 220 return misc_register(&erst_dbg_dev);
217} 221}
218 222
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6053f4780df9..2ca59dc69f7f 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -642,7 +642,7 @@ static int __erst_write_to_storage(u64 offset)
642 int rc; 642 int rc;
643 643
644 erst_exec_ctx_init(&ctx); 644 erst_exec_ctx_init(&ctx);
645 rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_WRITE); 645 rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_WRITE);
646 if (rc) 646 if (rc)
647 return rc; 647 return rc;
648 apei_exec_ctx_set_input(&ctx, offset); 648 apei_exec_ctx_set_input(&ctx, offset);
@@ -666,7 +666,7 @@ static int __erst_write_to_storage(u64 offset)
666 if (rc) 666 if (rc)
667 return rc; 667 return rc;
668 val = apei_exec_ctx_get_output(&ctx); 668 val = apei_exec_ctx_get_output(&ctx);
669 rc = apei_exec_run(&ctx, ACPI_ERST_END); 669 rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
670 if (rc) 670 if (rc)
671 return rc; 671 return rc;
672 672
@@ -681,7 +681,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset)
681 int rc; 681 int rc;
682 682
683 erst_exec_ctx_init(&ctx); 683 erst_exec_ctx_init(&ctx);
684 rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_READ); 684 rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_READ);
685 if (rc) 685 if (rc)
686 return rc; 686 return rc;
687 apei_exec_ctx_set_input(&ctx, offset); 687 apei_exec_ctx_set_input(&ctx, offset);
@@ -709,7 +709,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset)
709 if (rc) 709 if (rc)
710 return rc; 710 return rc;
711 val = apei_exec_ctx_get_output(&ctx); 711 val = apei_exec_ctx_get_output(&ctx);
712 rc = apei_exec_run(&ctx, ACPI_ERST_END); 712 rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
713 if (rc) 713 if (rc)
714 return rc; 714 return rc;
715 715
@@ -724,7 +724,7 @@ static int __erst_clear_from_storage(u64 record_id)
724 int rc; 724 int rc;
725 725
726 erst_exec_ctx_init(&ctx); 726 erst_exec_ctx_init(&ctx);
727 rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_CLEAR); 727 rc = apei_exec_run_optional(&ctx, ACPI_ERST_BEGIN_CLEAR);
728 if (rc) 728 if (rc)
729 return rc; 729 return rc;
730 apei_exec_ctx_set_input(&ctx, record_id); 730 apei_exec_ctx_set_input(&ctx, record_id);
@@ -748,7 +748,7 @@ static int __erst_clear_from_storage(u64 record_id)
748 if (rc) 748 if (rc)
749 return rc; 749 return rc;
750 val = apei_exec_ctx_get_output(&ctx); 750 val = apei_exec_ctx_get_output(&ctx);
751 rc = apei_exec_run(&ctx, ACPI_ERST_END); 751 rc = apei_exec_run_optional(&ctx, ACPI_ERST_END);
752 if (rc) 752 if (rc)
753 return rc; 753 return rc;
754 754
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f703b2881153..0784f99a4665 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -12,7 +12,7 @@
12 * For more information about Generic Hardware Error Source, please 12 * For more information about Generic Hardware Error Source, please
13 * refer to ACPI Specification version 4.0, section 17.3.2.6 13 * refer to ACPI Specification version 4.0, section 17.3.2.6
14 * 14 *
15 * Copyright 2010 Intel Corp. 15 * Copyright 2010,2011 Intel Corp.
16 * Author: Huang Ying <ying.huang@intel.com> 16 * Author: Huang Ying <ying.huang@intel.com>
17 * 17 *
18 * This program is free software; you can redistribute it and/or 18 * This program is free software; you can redistribute it and/or
@@ -42,6 +42,9 @@
42#include <linux/mutex.h> 42#include <linux/mutex.h>
43#include <linux/ratelimit.h> 43#include <linux/ratelimit.h>
44#include <linux/vmalloc.h> 44#include <linux/vmalloc.h>
45#include <linux/irq_work.h>
46#include <linux/llist.h>
47#include <linux/genalloc.h>
45#include <acpi/apei.h> 48#include <acpi/apei.h>
46#include <acpi/atomicio.h> 49#include <acpi/atomicio.h>
47#include <acpi/hed.h> 50#include <acpi/hed.h>
@@ -53,6 +56,30 @@
53#define GHES_PFX "GHES: " 56#define GHES_PFX "GHES: "
54 57
55#define GHES_ESTATUS_MAX_SIZE 65536 58#define GHES_ESTATUS_MAX_SIZE 65536
59#define GHES_ESOURCE_PREALLOC_MAX_SIZE 65536
60
61#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3
62
63/* This is just an estimation for memory pool allocation */
64#define GHES_ESTATUS_CACHE_AVG_SIZE 512
65
66#define GHES_ESTATUS_CACHES_SIZE 4
67
68#define GHES_ESTATUS_IN_CACHE_MAX_NSEC 10000000000ULL
69/* Prevent too many caches are allocated because of RCU */
70#define GHES_ESTATUS_CACHE_ALLOCED_MAX (GHES_ESTATUS_CACHES_SIZE * 3 / 2)
71
72#define GHES_ESTATUS_CACHE_LEN(estatus_len) \
73 (sizeof(struct ghes_estatus_cache) + (estatus_len))
74#define GHES_ESTATUS_FROM_CACHE(estatus_cache) \
75 ((struct acpi_hest_generic_status *) \
76 ((struct ghes_estatus_cache *)(estatus_cache) + 1))
77
78#define GHES_ESTATUS_NODE_LEN(estatus_len) \
79 (sizeof(struct ghes_estatus_node) + (estatus_len))
80#define GHES_ESTATUS_FROM_NODE(estatus_node) \
81 ((struct acpi_hest_generic_status *) \
82 ((struct ghes_estatus_node *)(estatus_node) + 1))
56 83
57/* 84/*
58 * One struct ghes is created for each generic hardware error source. 85 * One struct ghes is created for each generic hardware error source.
@@ -77,6 +104,22 @@ struct ghes {
77 }; 104 };
78}; 105};
79 106
107struct ghes_estatus_node {
108 struct llist_node llnode;
109 struct acpi_hest_generic *generic;
110};
111
112struct ghes_estatus_cache {
113 u32 estatus_len;
114 atomic_t count;
115 struct acpi_hest_generic *generic;
116 unsigned long long time_in;
117 struct rcu_head rcu;
118};
119
120int ghes_disable;
121module_param_named(disable, ghes_disable, bool, 0);
122
80static int ghes_panic_timeout __read_mostly = 30; 123static int ghes_panic_timeout __read_mostly = 30;
81 124
82/* 125/*
@@ -121,6 +164,22 @@ static struct vm_struct *ghes_ioremap_area;
121static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); 164static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
122static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); 165static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
123 166
167/*
168 * printk is not safe in NMI context. So in NMI handler, we allocate
169 * required memory from lock-less memory allocator
170 * (ghes_estatus_pool), save estatus into it, put them into lock-less
171 * list (ghes_estatus_llist), then delay printk into IRQ context via
172 * irq_work (ghes_proc_irq_work). ghes_estatus_size_request record
173 * required pool size by all NMI error source.
174 */
175static struct gen_pool *ghes_estatus_pool;
176static unsigned long ghes_estatus_pool_size_request;
177static struct llist_head ghes_estatus_llist;
178static struct irq_work ghes_proc_irq_work;
179
180struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
181static atomic_t ghes_estatus_cache_alloced;
182
124static int ghes_ioremap_init(void) 183static int ghes_ioremap_init(void)
125{ 184{
126 ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, 185 ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
@@ -180,6 +239,55 @@ static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
180 __flush_tlb_one(vaddr); 239 __flush_tlb_one(vaddr);
181} 240}
182 241
242static int ghes_estatus_pool_init(void)
243{
244 ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1);
245 if (!ghes_estatus_pool)
246 return -ENOMEM;
247 return 0;
248}
249
250static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool,
251 struct gen_pool_chunk *chunk,
252 void *data)
253{
254 free_page(chunk->start_addr);
255}
256
257static void ghes_estatus_pool_exit(void)
258{
259 gen_pool_for_each_chunk(ghes_estatus_pool,
260 ghes_estatus_pool_free_chunk_page, NULL);
261 gen_pool_destroy(ghes_estatus_pool);
262}
263
264static int ghes_estatus_pool_expand(unsigned long len)
265{
266 unsigned long i, pages, size, addr;
267 int ret;
268
269 ghes_estatus_pool_size_request += PAGE_ALIGN(len);
270 size = gen_pool_size(ghes_estatus_pool);
271 if (size >= ghes_estatus_pool_size_request)
272 return 0;
273 pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE;
274 for (i = 0; i < pages; i++) {
275 addr = __get_free_page(GFP_KERNEL);
276 if (!addr)
277 return -ENOMEM;
278 ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1);
279 if (ret)
280 return ret;
281 }
282
283 return 0;
284}
285
286static void ghes_estatus_pool_shrink(unsigned long len)
287{
288 ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
289}
290
183static struct ghes *ghes_new(struct acpi_hest_generic *generic) 291static struct ghes *ghes_new(struct acpi_hest_generic *generic)
184{ 292{
185 struct ghes *ghes; 293 struct ghes *ghes;
@@ -341,43 +449,196 @@ static void ghes_clear_estatus(struct ghes *ghes)
341 ghes->flags &= ~GHES_TO_CLEAR; 449 ghes->flags &= ~GHES_TO_CLEAR;
342} 450}
343 451
344static void ghes_do_proc(struct ghes *ghes) 452static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
345{ 453{
346 int sev, processed = 0; 454 int sev, sec_sev;
347 struct acpi_hest_generic_data *gdata; 455 struct acpi_hest_generic_data *gdata;
348 456
349 sev = ghes_severity(ghes->estatus->error_severity); 457 sev = ghes_severity(estatus->error_severity);
350 apei_estatus_for_each_section(ghes->estatus, gdata) { 458 apei_estatus_for_each_section(estatus, gdata) {
351#ifdef CONFIG_X86_MCE 459 sec_sev = ghes_severity(gdata->error_severity);
352 if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, 460 if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
353 CPER_SEC_PLATFORM_MEM)) { 461 CPER_SEC_PLATFORM_MEM)) {
354 apei_mce_report_mem_error( 462 struct cper_sec_mem_err *mem_err;
355 sev == GHES_SEV_CORRECTED, 463 mem_err = (struct cper_sec_mem_err *)(gdata+1);
356 (struct cper_sec_mem_err *)(gdata+1)); 464#ifdef CONFIG_X86_MCE
357 processed = 1; 465 apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
358 } 466 mem_err);
359#endif 467#endif
468#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
469 if (sev == GHES_SEV_RECOVERABLE &&
470 sec_sev == GHES_SEV_RECOVERABLE &&
471 mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
472 unsigned long pfn;
473 pfn = mem_err->physical_addr >> PAGE_SHIFT;
474 memory_failure_queue(pfn, 0, 0);
475 }
476#endif
477 }
360 } 478 }
361} 479}
362 480
363static void ghes_print_estatus(const char *pfx, struct ghes *ghes) 481static void __ghes_print_estatus(const char *pfx,
482 const struct acpi_hest_generic *generic,
483 const struct acpi_hest_generic_status *estatus)
364{ 484{
365 /* Not more than 2 messages every 5 seconds */
366 static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2);
367
368 if (pfx == NULL) { 485 if (pfx == NULL) {
369 if (ghes_severity(ghes->estatus->error_severity) <= 486 if (ghes_severity(estatus->error_severity) <=
370 GHES_SEV_CORRECTED) 487 GHES_SEV_CORRECTED)
371 pfx = KERN_WARNING HW_ERR; 488 pfx = KERN_WARNING HW_ERR;
372 else 489 else
373 pfx = KERN_ERR HW_ERR; 490 pfx = KERN_ERR HW_ERR;
374 } 491 }
375 if (__ratelimit(&ratelimit)) { 492 printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
376 printk( 493 pfx, generic->header.source_id);
377 "%s""Hardware error from APEI Generic Hardware Error Source: %d\n", 494 apei_estatus_print(pfx, estatus);
378 pfx, ghes->generic->header.source_id); 495}
379 apei_estatus_print(pfx, ghes->estatus); 496
497static int ghes_print_estatus(const char *pfx,
498 const struct acpi_hest_generic *generic,
499 const struct acpi_hest_generic_status *estatus)
500{
501 /* Not more than 2 messages every 5 seconds */
502 static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2);
503 static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
504 struct ratelimit_state *ratelimit;
505
506 if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
507 ratelimit = &ratelimit_corrected;
508 else
509 ratelimit = &ratelimit_uncorrected;
510 if (__ratelimit(ratelimit)) {
511 __ghes_print_estatus(pfx, generic, estatus);
512 return 1;
380 } 513 }
514 return 0;
515}
516
517/*
518 * GHES error status reporting throttle, to report more kinds of
519 * errors, instead of just most frequently occurred errors.
520 */
521static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus)
522{
523 u32 len;
524 int i, cached = 0;
525 unsigned long long now;
526 struct ghes_estatus_cache *cache;
527 struct acpi_hest_generic_status *cache_estatus;
528
529 len = apei_estatus_len(estatus);
530 rcu_read_lock();
531 for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
532 cache = rcu_dereference(ghes_estatus_caches[i]);
533 if (cache == NULL)
534 continue;
535 if (len != cache->estatus_len)
536 continue;
537 cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
538 if (memcmp(estatus, cache_estatus, len))
539 continue;
540 atomic_inc(&cache->count);
541 now = sched_clock();
542 if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC)
543 cached = 1;
544 break;
545 }
546 rcu_read_unlock();
547 return cached;
548}
549
550static struct ghes_estatus_cache *ghes_estatus_cache_alloc(
551 struct acpi_hest_generic *generic,
552 struct acpi_hest_generic_status *estatus)
553{
554 int alloced;
555 u32 len, cache_len;
556 struct ghes_estatus_cache *cache;
557 struct acpi_hest_generic_status *cache_estatus;
558
559 alloced = atomic_add_return(1, &ghes_estatus_cache_alloced);
560 if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) {
561 atomic_dec(&ghes_estatus_cache_alloced);
562 return NULL;
563 }
564 len = apei_estatus_len(estatus);
565 cache_len = GHES_ESTATUS_CACHE_LEN(len);
566 cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len);
567 if (!cache) {
568 atomic_dec(&ghes_estatus_cache_alloced);
569 return NULL;
570 }
571 cache_estatus = GHES_ESTATUS_FROM_CACHE(cache);
572 memcpy(cache_estatus, estatus, len);
573 cache->estatus_len = len;
574 atomic_set(&cache->count, 0);
575 cache->generic = generic;
576 cache->time_in = sched_clock();
577 return cache;
578}
579
580static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache)
581{
582 u32 len;
583
584 len = apei_estatus_len(GHES_ESTATUS_FROM_CACHE(cache));
585 len = GHES_ESTATUS_CACHE_LEN(len);
586 gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len);
587 atomic_dec(&ghes_estatus_cache_alloced);
588}
589
590static void ghes_estatus_cache_rcu_free(struct rcu_head *head)
591{
592 struct ghes_estatus_cache *cache;
593
594 cache = container_of(head, struct ghes_estatus_cache, rcu);
595 ghes_estatus_cache_free(cache);
596}
597
598static void ghes_estatus_cache_add(
599 struct acpi_hest_generic *generic,
600 struct acpi_hest_generic_status *estatus)
601{
602 int i, slot = -1, count;
603 unsigned long long now, duration, period, max_period = 0;
604 struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache;
605
606 new_cache = ghes_estatus_cache_alloc(generic, estatus);
607 if (new_cache == NULL)
608 return;
609 rcu_read_lock();
610 now = sched_clock();
611 for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) {
612 cache = rcu_dereference(ghes_estatus_caches[i]);
613 if (cache == NULL) {
614 slot = i;
615 slot_cache = NULL;
616 break;
617 }
618 duration = now - cache->time_in;
619 if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) {
620 slot = i;
621 slot_cache = cache;
622 break;
623 }
624 count = atomic_read(&cache->count);
625 period = duration;
626 do_div(period, (count + 1));
627 if (period > max_period) {
628 max_period = period;
629 slot = i;
630 slot_cache = cache;
631 }
632 }
633 /* new_cache must be put into array after its contents are written */
634 smp_wmb();
635 if (slot != -1 && cmpxchg(ghes_estatus_caches + slot,
636 slot_cache, new_cache) == slot_cache) {
637 if (slot_cache)
638 call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free);
639 } else
640 ghes_estatus_cache_free(new_cache);
641 rcu_read_unlock();
381} 642}
382 643
383static int ghes_proc(struct ghes *ghes) 644static int ghes_proc(struct ghes *ghes)
@@ -387,9 +648,11 @@ static int ghes_proc(struct ghes *ghes)
387 rc = ghes_read_estatus(ghes, 0); 648 rc = ghes_read_estatus(ghes, 0);
388 if (rc) 649 if (rc)
389 goto out; 650 goto out;
390 ghes_print_estatus(NULL, ghes); 651 if (!ghes_estatus_cached(ghes->estatus)) {
391 ghes_do_proc(ghes); 652 if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus))
392 653 ghes_estatus_cache_add(ghes->generic, ghes->estatus);
654 }
655 ghes_do_proc(ghes->estatus);
393out: 656out:
394 ghes_clear_estatus(ghes); 657 ghes_clear_estatus(ghes);
395 return 0; 658 return 0;
@@ -447,6 +710,45 @@ static int ghes_notify_sci(struct notifier_block *this,
447 return ret; 710 return ret;
448} 711}
449 712
713static void ghes_proc_in_irq(struct irq_work *irq_work)
714{
715 struct llist_node *llnode, *next, *tail = NULL;
716 struct ghes_estatus_node *estatus_node;
717 struct acpi_hest_generic *generic;
718 struct acpi_hest_generic_status *estatus;
719 u32 len, node_len;
720
721 /*
722 * Because the time order of estatus in list is reversed,
723 * revert it back to proper order.
724 */
725 llnode = llist_del_all(&ghes_estatus_llist);
726 while (llnode) {
727 next = llnode->next;
728 llnode->next = tail;
729 tail = llnode;
730 llnode = next;
731 }
732 llnode = tail;
733 while (llnode) {
734 next = llnode->next;
735 estatus_node = llist_entry(llnode, struct ghes_estatus_node,
736 llnode);
737 estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
738 len = apei_estatus_len(estatus);
739 node_len = GHES_ESTATUS_NODE_LEN(len);
740 ghes_do_proc(estatus);
741 if (!ghes_estatus_cached(estatus)) {
742 generic = estatus_node->generic;
743 if (ghes_print_estatus(NULL, generic, estatus))
744 ghes_estatus_cache_add(generic, estatus);
745 }
746 gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
747 node_len);
748 llnode = next;
749 }
750}
751
450static int ghes_notify_nmi(struct notifier_block *this, 752static int ghes_notify_nmi(struct notifier_block *this,
451 unsigned long cmd, void *data) 753 unsigned long cmd, void *data)
452{ 754{
@@ -476,7 +778,8 @@ static int ghes_notify_nmi(struct notifier_block *this,
476 778
477 if (sev_global >= GHES_SEV_PANIC) { 779 if (sev_global >= GHES_SEV_PANIC) {
478 oops_begin(); 780 oops_begin();
479 ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); 781 __ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global->generic,
782 ghes_global->estatus);
480 /* reboot to log the error! */ 783 /* reboot to log the error! */
481 if (panic_timeout == 0) 784 if (panic_timeout == 0)
482 panic_timeout = ghes_panic_timeout; 785 panic_timeout = ghes_panic_timeout;
@@ -484,12 +787,34 @@ static int ghes_notify_nmi(struct notifier_block *this,
484 } 787 }
485 788
486 list_for_each_entry_rcu(ghes, &ghes_nmi, list) { 789 list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
790#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
791 u32 len, node_len;
792 struct ghes_estatus_node *estatus_node;
793 struct acpi_hest_generic_status *estatus;
794#endif
487 if (!(ghes->flags & GHES_TO_CLEAR)) 795 if (!(ghes->flags & GHES_TO_CLEAR))
488 continue; 796 continue;
489 /* Do not print estatus because printk is not NMI safe */ 797#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
490 ghes_do_proc(ghes); 798 if (ghes_estatus_cached(ghes->estatus))
799 goto next;
800 /* Save estatus for further processing in IRQ context */
801 len = apei_estatus_len(ghes->estatus);
802 node_len = GHES_ESTATUS_NODE_LEN(len);
803 estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
804 node_len);
805 if (estatus_node) {
806 estatus_node->generic = ghes->generic;
807 estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
808 memcpy(estatus, ghes->estatus, len);
809 llist_add(&estatus_node->llnode, &ghes_estatus_llist);
810 }
811next:
812#endif
491 ghes_clear_estatus(ghes); 813 ghes_clear_estatus(ghes);
492 } 814 }
815#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
816 irq_work_queue(&ghes_proc_irq_work);
817#endif
493 818
494out: 819out:
495 raw_spin_unlock(&ghes_nmi_lock); 820 raw_spin_unlock(&ghes_nmi_lock);
@@ -504,10 +829,26 @@ static struct notifier_block ghes_notifier_nmi = {
504 .notifier_call = ghes_notify_nmi, 829 .notifier_call = ghes_notify_nmi,
505}; 830};
506 831
832static unsigned long ghes_esource_prealloc_size(
833 const struct acpi_hest_generic *generic)
834{
835 unsigned long block_length, prealloc_records, prealloc_size;
836
837 block_length = min_t(unsigned long, generic->error_block_length,
838 GHES_ESTATUS_MAX_SIZE);
839 prealloc_records = max_t(unsigned long,
840 generic->records_to_preallocate, 1);
841 prealloc_size = min_t(unsigned long, block_length * prealloc_records,
842 GHES_ESOURCE_PREALLOC_MAX_SIZE);
843
844 return prealloc_size;
845}
846
507static int __devinit ghes_probe(struct platform_device *ghes_dev) 847static int __devinit ghes_probe(struct platform_device *ghes_dev)
508{ 848{
509 struct acpi_hest_generic *generic; 849 struct acpi_hest_generic *generic;
510 struct ghes *ghes = NULL; 850 struct ghes *ghes = NULL;
851 unsigned long len;
511 int rc = -EINVAL; 852 int rc = -EINVAL;
512 853
513 generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data; 854 generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
@@ -573,6 +914,8 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev)
573 mutex_unlock(&ghes_list_mutex); 914 mutex_unlock(&ghes_list_mutex);
574 break; 915 break;
575 case ACPI_HEST_NOTIFY_NMI: 916 case ACPI_HEST_NOTIFY_NMI:
917 len = ghes_esource_prealloc_size(generic);
918 ghes_estatus_pool_expand(len);
576 mutex_lock(&ghes_list_mutex); 919 mutex_lock(&ghes_list_mutex);
577 if (list_empty(&ghes_nmi)) 920 if (list_empty(&ghes_nmi))
578 register_die_notifier(&ghes_notifier_nmi); 921 register_die_notifier(&ghes_notifier_nmi);
@@ -597,6 +940,7 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)
597{ 940{
598 struct ghes *ghes; 941 struct ghes *ghes;
599 struct acpi_hest_generic *generic; 942 struct acpi_hest_generic *generic;
943 unsigned long len;
600 944
601 ghes = platform_get_drvdata(ghes_dev); 945 ghes = platform_get_drvdata(ghes_dev);
602 generic = ghes->generic; 946 generic = ghes->generic;
@@ -627,6 +971,8 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)
627 * freed after NMI handler finishes. 971 * freed after NMI handler finishes.
628 */ 972 */
629 synchronize_rcu(); 973 synchronize_rcu();
974 len = ghes_esource_prealloc_size(generic);
975 ghes_estatus_pool_shrink(len);
630 break; 976 break;
631 default: 977 default:
632 BUG(); 978 BUG();
@@ -662,15 +1008,43 @@ static int __init ghes_init(void)
662 return -EINVAL; 1008 return -EINVAL;
663 } 1009 }
664 1010
1011 if (ghes_disable) {
1012 pr_info(GHES_PFX "GHES is not enabled!\n");
1013 return -EINVAL;
1014 }
1015
1016 init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
1017
665 rc = ghes_ioremap_init(); 1018 rc = ghes_ioremap_init();
666 if (rc) 1019 if (rc)
667 goto err; 1020 goto err;
668 1021
669 rc = platform_driver_register(&ghes_platform_driver); 1022 rc = ghes_estatus_pool_init();
670 if (rc) 1023 if (rc)
671 goto err_ioremap_exit; 1024 goto err_ioremap_exit;
672 1025
1026 rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
1027 GHES_ESTATUS_CACHE_ALLOCED_MAX);
1028 if (rc)
1029 goto err_pool_exit;
1030
1031 rc = platform_driver_register(&ghes_platform_driver);
1032 if (rc)
1033 goto err_pool_exit;
1034
1035 rc = apei_osc_setup();
1036 if (rc == 0 && osc_sb_apei_support_acked)
1037 pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n");
1038 else if (rc == 0 && !osc_sb_apei_support_acked)
1039 pr_info(GHES_PFX "APEI firmware first mode is enabled by WHEA _OSC.\n");
1040 else if (rc && osc_sb_apei_support_acked)
1041 pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit.\n");
1042 else
1043 pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");
1044
673 return 0; 1045 return 0;
1046err_pool_exit:
1047 ghes_estatus_pool_exit();
674err_ioremap_exit: 1048err_ioremap_exit:
675 ghes_ioremap_exit(); 1049 ghes_ioremap_exit();
676err: 1050err:
@@ -680,6 +1054,7 @@ err:
680static void __exit ghes_exit(void) 1054static void __exit ghes_exit(void)
681{ 1055{
682 platform_driver_unregister(&ghes_platform_driver); 1056 platform_driver_unregister(&ghes_platform_driver);
1057 ghes_estatus_pool_exit();
683 ghes_ioremap_exit(); 1058 ghes_ioremap_exit();
684} 1059}
685 1060
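The "content based throttle" above caches recently reported error records for
GHES_ESTATUS_IN_CACHE_MAX_NSEC and, when a new record needs a slot, evicts the
entry with the largest average period between occurrences, so frequently
repeating errors stop being reprinted while rarer ones still surface. A
simplified, standalone C sketch of that slot-selection heuristic follows; the
struct and function names are illustrative, and the kernel code uses do_div()
for the 64-bit division:

/* Simplified model of the selection in ghes_estatus_cache_add():
 * prefer an empty slot, then an expired one, otherwise evict the
 * entry that fires least often (largest average period).
 */
struct cache_slot {
	unsigned long long time_in;	/* sched_clock() at insertion, 0 = empty */
	int count;			/* hits since insertion */
};

static int pick_victim_slot(struct cache_slot *slots, int nr,
			    unsigned long long now,
			    unsigned long long max_nsec)
{
	unsigned long long period, max_period = 0;
	int i, victim = -1;

	for (i = 0; i < nr; i++) {
		if (!slots[i].time_in)
			return i;			/* empty slot */
		if (now - slots[i].time_in >= max_nsec)
			return i;			/* expired entry */
		period = (now - slots[i].time_in) / (slots[i].count + 1);
		if (period > max_period) {		/* least frequent so far */
			max_period = period;
			victim = i;
		}
	}
	return victim;
}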
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 181bc2f7bb74..05fee06f4d6e 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -231,16 +231,17 @@ void __init acpi_hest_init(void)
231 goto err; 231 goto err;
232 } 232 }
233 233
234 rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count); 234 if (!ghes_disable) {
235 if (rc) 235 rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
236 goto err; 236 if (rc)
237 237 goto err;
238 rc = hest_ghes_dev_register(ghes_count); 238 rc = hest_ghes_dev_register(ghes_count);
239 if (!rc) { 239 if (rc)
240 pr_info(HEST_PFX "Table parsing has been initialized.\n"); 240 goto err;
241 return;
242 } 241 }
243 242
243 pr_info(HEST_PFX "Table parsing has been initialized.\n");
244 return;
244err: 245err:
245 hest_disable = 1; 246 hest_disable = 1;
246} 247}
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index d1e06c182cdb..437ddbf0c49a 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -39,6 +39,7 @@
39#include <linux/pci.h> 39#include <linux/pci.h>
40#include <acpi/acpi_bus.h> 40#include <acpi/acpi_bus.h>
41#include <acpi/acpi_drivers.h> 41#include <acpi/acpi_drivers.h>
42#include <acpi/apei.h>
42#include <linux/dmi.h> 43#include <linux/dmi.h>
43#include <linux/suspend.h> 44#include <linux/suspend.h>
44 45
@@ -519,6 +520,7 @@ out_kfree:
519} 520}
520EXPORT_SYMBOL(acpi_run_osc); 521EXPORT_SYMBOL(acpi_run_osc);
521 522
523bool osc_sb_apei_support_acked;
522static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; 524static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48";
523static void acpi_bus_osc_support(void) 525static void acpi_bus_osc_support(void)
524{ 526{
@@ -541,11 +543,19 @@ static void acpi_bus_osc_support(void)
541#if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE) 543#if defined(CONFIG_ACPI_PROCESSOR) || defined(CONFIG_ACPI_PROCESSOR_MODULE)
542 capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT; 544 capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_PPC_OST_SUPPORT;
543#endif 545#endif
546
547 if (!ghes_disable)
548 capbuf[OSC_SUPPORT_TYPE] |= OSC_SB_APEI_SUPPORT;
544 if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) 549 if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
545 return; 550 return;
546 if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) 551 if (ACPI_SUCCESS(acpi_run_osc(handle, &context))) {
552 u32 *capbuf_ret = context.ret.pointer;
553 if (context.ret.length > OSC_SUPPORT_TYPE)
554 osc_sb_apei_support_acked =
555 capbuf_ret[OSC_SUPPORT_TYPE] & OSC_SB_APEI_SUPPORT;
547 kfree(context.ret.pointer); 556 kfree(context.ret.pointer);
548 /* do we need to check the returned cap? Sounds no */ 557 }
558 /* do we need to check other returned cap? Sounds no */
549} 559}
550 560
551/* -------------------------------------------------------------------------- 561/* --------------------------------------------------------------------------
diff --git a/include/acpi/apei.h b/include/acpi/apei.h
index e67b523a50e1..51a527d24a8a 100644
--- a/include/acpi/apei.h
+++ b/include/acpi/apei.h
@@ -18,6 +18,11 @@
18 18
19extern int hest_disable; 19extern int hest_disable;
20extern int erst_disable; 20extern int erst_disable;
21#ifdef CONFIG_ACPI_APEI_GHES
22extern int ghes_disable;
23#else
24#define ghes_disable 1
25#endif
21 26
22#ifdef CONFIG_ACPI_APEI 27#ifdef CONFIG_ACPI_APEI
23void __init acpi_hest_init(void); 28void __init acpi_hest_init(void);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 2312e850aab8..6001b4da39dd 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -279,6 +279,8 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
279#define OSC_SB_CPUHP_OST_SUPPORT 8 279#define OSC_SB_CPUHP_OST_SUPPORT 8
280#define OSC_SB_APEI_SUPPORT 16 280#define OSC_SB_APEI_SUPPORT 16
281 281
282extern bool osc_sb_apei_support_acked;
283
282/* PCI defined _OSC bits */ 284/* PCI defined _OSC bits */
283/* _OSC DW1 Definition (OS Support Fields) */ 285/* _OSC DW1 Definition (OS Support Fields) */
284#define OSC_EXT_PCI_CONFIG_SUPPORT 1 286#define OSC_EXT_PCI_CONFIG_SUPPORT 1
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 3bac44cce142..7ad634501e48 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -146,6 +146,7 @@ extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
146extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); 146extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
147extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); 147extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
148 148
149#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
149#define BITMAP_LAST_WORD_MASK(nbits) \ 150#define BITMAP_LAST_WORD_MASK(nbits) \
150( \ 151( \
151 ((nbits) % BITS_PER_LONG) ? \ 152 ((nbits) % BITS_PER_LONG) ? \
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5bbebda78b02..5e98eeb2af3b 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -1,8 +1,26 @@
1/* 1/*
2 * Basic general purpose allocator for managing special purpose memory 2 * Basic general purpose allocator for managing special purpose
3 * not managed by the regular kmalloc/kfree interface. 3 * memory, for example, memory that is not managed by the regular
4 * Uses for this includes on-device special memory, uncached memory 4 * kmalloc/kfree interface. Uses for this includes on-device special
5 * etc. 5 * memory, uncached memory etc.
6 *
7 * It is safe to use the allocator in NMI handlers and other special
8 * unblockable contexts that could otherwise deadlock on locks. This
9 * is implemented by using atomic operations and retries on any
10 * conflicts. The disadvantage is that there may be livelocks in
11 * extreme cases. For better scalability, one allocator can be used
12 * for each CPU.
13 *
14 * The lockless operation only works if there is enough memory
15 * available. If new memory is added to the pool a lock has to be
16 * still taken. So any user relying on locklessness has to ensure
17 * that sufficient memory is preallocated.
18 *
19 * The basic atomic operation of this allocator is cmpxchg on long.
20 * On architectures that don't have NMI-safe cmpxchg implementation,
21 * the allocator can NOT be used in NMI handler. So code uses the
22 * allocator in NMI handler should depend on
23 * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
6 * 24 *
7 * This source code is licensed under the GNU General Public License, 25 * This source code is licensed under the GNU General Public License,
8 * Version 2. See the file COPYING for more details. 26 * Version 2. See the file COPYING for more details.
@@ -15,7 +33,7 @@
15 * General purpose special memory pool descriptor. 33 * General purpose special memory pool descriptor.
16 */ 34 */
17struct gen_pool { 35struct gen_pool {
18 rwlock_t lock; 36 spinlock_t lock;
19 struct list_head chunks; /* list of chunks in this pool */ 37 struct list_head chunks; /* list of chunks in this pool */
20 int min_alloc_order; /* minimum allocation order */ 38 int min_alloc_order; /* minimum allocation order */
21}; 39};
@@ -24,8 +42,8 @@ struct gen_pool {
24 * General purpose special memory pool chunk descriptor. 42 * General purpose special memory pool chunk descriptor.
25 */ 43 */
26struct gen_pool_chunk { 44struct gen_pool_chunk {
27 spinlock_t lock;
28 struct list_head next_chunk; /* next chunk in pool */ 45 struct list_head next_chunk; /* next chunk in pool */
46 atomic_t avail;
29 phys_addr_t phys_addr; /* physical starting address of memory chunk */ 47 phys_addr_t phys_addr; /* physical starting address of memory chunk */
30 unsigned long start_addr; /* starting address of memory chunk */ 48 unsigned long start_addr; /* starting address of memory chunk */
31 unsigned long end_addr; /* ending address of memory chunk */ 49 unsigned long end_addr; /* ending address of memory chunk */
@@ -56,4 +74,8 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr,
56extern void gen_pool_destroy(struct gen_pool *); 74extern void gen_pool_destroy(struct gen_pool *);
57extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); 75extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
58extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); 76extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
77extern void gen_pool_for_each_chunk(struct gen_pool *,
78 void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *);
79extern size_t gen_pool_avail(struct gen_pool *);
80extern size_t gen_pool_size(struct gen_pool *);
59#endif /* __GENALLOC_H__ */ 81#endif /* __GENALLOC_H__ */
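With the pool made lockless, the intended usage pattern, and the one ghes.c
follows above, is to populate chunks from process context ahead of time and
then allocate and free from atomic or NMI context. A minimal sketch assuming a
small NMI-time record pool; names are illustrative and error handling is
trimmed:

/* gen_pool_add() may still take the pool lock, so all chunks are
 * added up front; gen_pool_alloc()/gen_pool_free() are then lock-free
 * and usable from NMI on ARCH_HAVE_NMI_SAFE_CMPXCHG architectures.
 */
#include <linux/genalloc.h>
#include <linux/gfp.h>

static struct gen_pool *nmi_pool;

static int __init nmi_pool_init(void)
{
	unsigned long page;
	int i;

	nmi_pool = gen_pool_create(3 /* min alloc: 8 bytes */, -1);
	if (!nmi_pool)
		return -ENOMEM;
	for (i = 0; i < 4; i++) {		/* preallocate 4 pages */
		page = __get_free_page(GFP_KERNEL);
		if (!page || gen_pool_add(nmi_pool, page, PAGE_SIZE, -1))
			return -ENOMEM;
	}
	return 0;
}

/* NMI context: no locks taken; fails if the preallocated chunks are
 * exhausted, so callers must size the pool in advance.
 */
static void *nmi_alloc_record(size_t len)
{
	return (void *)gen_pool_alloc(nmi_pool, len);
}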
diff --git a/include/linux/llist.h b/include/linux/llist.h
new file mode 100644
index 000000000000..aa0c8b5b3cd0
--- /dev/null
+++ b/include/linux/llist.h
@@ -0,0 +1,126 @@
1#ifndef LLIST_H
2#define LLIST_H
3/*
4 * Lock-less NULL terminated single linked list
5 *
6 * If there are multiple producers and multiple consumers, llist_add
7 * can be used in producers and llist_del_all can be used in
8 * consumers. They can work simultaneously without lock. But
9 * llist_del_first can not be used here. Because llist_del_first
10 * depends on list->first->next does not changed if list->first is not
11 * changed during its operation, but llist_del_first, llist_add,
12 * llist_add (or llist_del_all, llist_add, llist_add) sequence in
13 * another consumer may violate that.
14 *
15 * If there are multiple producers and one consumer, llist_add can be
16 * used in producers and llist_del_all or llist_del_first can be used
17 * in the consumer.
18 *
19 * This can be summarized as follow:
20 *
21 * | add | del_first | del_all
22 * add | - | - | -
23 * del_first | | L | L
24 * del_all | | | -
25 *
26 * Where "-" stands for no lock is needed, while "L" stands for lock
27 * is needed.
28 *
29 * The list entries deleted via llist_del_all can be traversed with
30 * traversing function such as llist_for_each etc. But the list
31 * entries can not be traversed safely before deleted from the list.
32 * The order of deleted entries is from the newest to the oldest added
33 * one. If you want to traverse from the oldest to the newest, you
34 * must reverse the order by yourself before traversing.
35 *
36 * The basic atomic operation of this list is cmpxchg on long. On
37 * architectures that don't have NMI-safe cmpxchg implementation, the
38 * list can NOT be used in NMI handler. So code uses the list in NMI
39 * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
40 */
41
42struct llist_head {
43 struct llist_node *first;
44};
45
46struct llist_node {
47 struct llist_node *next;
48};
49
50#define LLIST_HEAD_INIT(name) { NULL }
51#define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name)
52
53/**
54 * init_llist_head - initialize lock-less list head
55 * @head: the head for your lock-less list
56 */
57static inline void init_llist_head(struct llist_head *list)
58{
59 list->first = NULL;
60}
61
62/**
63 * llist_entry - get the struct of this entry
64 * @ptr: the &struct llist_node pointer.
65 * @type: the type of the struct this is embedded in.
66 * @member: the name of the llist_node within the struct.
67 */
68#define llist_entry(ptr, type, member) \
69 container_of(ptr, type, member)
70
71/**
72 * llist_for_each - iterate over some deleted entries of a lock-less list
73 * @pos: the &struct llist_node to use as a loop cursor
74 * @node: the first entry of deleted list entries
75 *
76 * In general, some entries of the lock-less list can be traversed
77 * safely only after being deleted from the list, so start with an entry
78 * instead of list head.
79 *
80 * If being used on entries deleted from lock-less list directly, the
81 * traverse order is from the newest to the oldest added entry. If
82 * you want to traverse from the oldest to the newest, you must
83 * reverse the order by yourself before traversing.
84 */
85#define llist_for_each(pos, node) \
86 for ((pos) = (node); pos; (pos) = (pos)->next)
87
88/**
89 * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type
90 * @pos: the type * to use as a loop cursor.
91 * @node: the first entry of deleted list entries.
92 * @member: the name of the llist_node within the struct.
93 *
94 * In general, some entries of the lock-less list can be traversed
95 * safely only after being removed from the list, so start with an entry
96 * instead of list head.
97 *
98 * If being used on entries deleted from lock-less list directly, the
99 * traverse order is from the newest to the oldest added entry. If
100 * you want to traverse from the oldest to the newest, you must
101 * reverse the order by yourself before traversing.
102 */
103#define llist_for_each_entry(pos, node, member) \
104 for ((pos) = llist_entry((node), typeof(*(pos)), member); \
105 &(pos)->member != NULL; \
106 (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))
107
108/**
109 * llist_empty - tests whether a lock-less list is empty
110 * @head: the list to test
111 *
112 * Not guaranteed to be accurate or up to date. Just a quick way to
113 * test whether the list is empty without deleting something from the
114 * list.
115 */
116static inline int llist_empty(const struct llist_head *head)
117{
118 return ACCESS_ONCE(head->first) == NULL;
119}
120
121void llist_add(struct llist_node *new, struct llist_head *head);
122void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
123 struct llist_head *head);
124struct llist_node *llist_del_first(struct llist_head *head);
125struct llist_node *llist_del_all(struct llist_head *head);
126#endif /* LLIST_H */
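A minimal sketch of the multiple-producer/single-consumer pattern described in the header comment above: producers push entries with llist_add() (safe from IRQ, and from NMI when ARCH_HAVE_NMI_SAFE_CMPXCHG is set), and one consumer detaches the whole list with llist_del_all() before walking it. All names here (my_event, my_event_list, my_event_work_func) are hypothetical:

#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>

struct my_event {
	int code;
	struct llist_node llnode;	/* embedded lock-less list node */
};

static LLIST_HEAD(my_event_list);

/* producer: may run in IRQ context, takes no lock */
static void my_event_report(struct my_event *ev, int code)
{
	ev->code = code;
	llist_add(&ev->llnode, &my_event_list);
}

/* single consumer: detach the whole list, then walk the detached entries */
static void my_event_work_func(struct work_struct *work)
{
	struct llist_node *node = llist_del_all(&my_event_list);
	struct my_event *ev;

	llist_for_each_entry(ev, node, llnode)
		pr_info("event code %d\n", ev->code);
}

Note that llist_del_all() hands the entries back newest first; a consumer that needs FIFO order has to reverse the chain itself before walking it.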
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3172a1c0f08e..f2690cf49827 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1600,6 +1600,7 @@ enum mf_flags {
1600}; 1600};
1601extern void memory_failure(unsigned long pfn, int trapno); 1601extern void memory_failure(unsigned long pfn, int trapno);
1602extern int __memory_failure(unsigned long pfn, int trapno, int flags); 1602extern int __memory_failure(unsigned long pfn, int trapno, int flags);
1603extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
1603extern int unpoison_memory(unsigned long pfn); 1604extern int unpoison_memory(unsigned long pfn);
1604extern int sysctl_memory_failure_early_kill; 1605extern int sysctl_memory_failure_early_kill;
1605extern int sysctl_memory_failure_recovery; 1606extern int sysctl_memory_failure_recovery;
diff --git a/lib/Kconfig b/lib/Kconfig
index 32f3e5ae2be5..6c695ff9caba 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -276,4 +276,7 @@ config CORDIC
276 so its calculations are in fixed point. Modules can select this 276 so its calculations are in fixed point. Modules can select this
277 when they require this function. Module will be called cordic. 277 when they require this function. Module will be called cordic.
278 278
279config LLIST
280 bool
281
279endmenu 282endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 892f4e282ea1..6457af4a7caf 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -115,6 +115,8 @@ obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
115 115
116obj-$(CONFIG_CORDIC) += cordic.o 116obj-$(CONFIG_CORDIC) += cordic.o
117 117
118obj-$(CONFIG_LLIST) += llist.o
119
118hostprogs-y := gen_crc32table 120hostprogs-y := gen_crc32table
119clean-files := crc32table.h 121clean-files := crc32table.h
120 122
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 37ef4b048795..2f4412e4d071 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -271,8 +271,6 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)
271} 271}
272EXPORT_SYMBOL(__bitmap_weight); 272EXPORT_SYMBOL(__bitmap_weight);
273 273
274#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
275
276void bitmap_set(unsigned long *map, int start, int nr) 274void bitmap_set(unsigned long *map, int start, int nr)
277{ 275{
278 unsigned long *p = map + BIT_WORD(start); 276 unsigned long *p = map + BIT_WORD(start);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 577ddf805975..f352cc42f4f8 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -1,8 +1,26 @@
1/* 1/*
2 * Basic general purpose allocator for managing special purpose memory 2 * Basic general purpose allocator for managing special purpose
3 * not managed by the regular kmalloc/kfree interface. 3 * memory, for example, memory that is not managed by the regular
4 * Uses for this includes on-device special memory, uncached memory 4 * kmalloc/kfree interface. Uses for this includes on-device special
5 * etc. 5 * memory, uncached memory etc.
6 *
7 * It is safe to use the allocator in NMI handlers and other special
8 * unblockable contexts that could otherwise deadlock on locks. This
9 * is implemented by using atomic operations and retries on any
10 * conflicts. The disadvantage is that there may be livelocks in
11 * extreme cases. For better scalability, one allocator can be used
12 * for each CPU.
13 *
14 * The lockless operation only works if there is enough memory
15 * available. If new memory is added to the pool, a lock still has
16 * to be taken. So any user relying on locklessness has to ensure
17 * that sufficient memory is preallocated.
18 *
19 * The basic atomic operation of this allocator is cmpxchg on long.
20 * On architectures that don't have an NMI-safe cmpxchg
21 * implementation, the allocator can NOT be used in an NMI handler.
22 * So code that uses the allocator in an NMI handler should depend on
23 * CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
6 * 24 *
7 * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org> 25 * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
8 * 26 *
@@ -13,8 +31,109 @@
13#include <linux/slab.h> 31#include <linux/slab.h>
14#include <linux/module.h> 32#include <linux/module.h>
15#include <linux/bitmap.h> 33#include <linux/bitmap.h>
34#include <linux/rculist.h>
35#include <linux/interrupt.h>
16#include <linux/genalloc.h> 36#include <linux/genalloc.h>
17 37
38static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
39{
40 unsigned long val, nval;
41
42 nval = *addr;
43 do {
44 val = nval;
45 if (val & mask_to_set)
46 return -EBUSY;
47 cpu_relax();
48 } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val);
49
50 return 0;
51}
52
53static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear)
54{
55 unsigned long val, nval;
56
57 nval = *addr;
58 do {
59 val = nval;
60 if ((val & mask_to_clear) != mask_to_clear)
61 return -EBUSY;
62 cpu_relax();
63 } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val);
64
65 return 0;
66}
67
68/*
69 * bitmap_set_ll - set the specified number of bits at the specified position
70 * @map: pointer to a bitmap
71 * @start: a bit position in @map
72 * @nr: number of bits to set
73 *
74 * Set @nr bits starting from @start in @map lock-lessly. Several
75 * users can set/clear the same bitmap simultaneously without a lock.
76 * If two users set the same bit, one of them gets back the number of
77 * bits still to be set, otherwise 0 is returned.
78 */
79static int bitmap_set_ll(unsigned long *map, int start, int nr)
80{
81 unsigned long *p = map + BIT_WORD(start);
82 const int size = start + nr;
83 int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
84 unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
85
86 while (nr - bits_to_set >= 0) {
87 if (set_bits_ll(p, mask_to_set))
88 return nr;
89 nr -= bits_to_set;
90 bits_to_set = BITS_PER_LONG;
91 mask_to_set = ~0UL;
92 p++;
93 }
94 if (nr) {
95 mask_to_set &= BITMAP_LAST_WORD_MASK(size);
96 if (set_bits_ll(p, mask_to_set))
97 return nr;
98 }
99
100 return 0;
101}
102
103/*
104 * bitmap_clear_ll - clear the specified number of bits at the specified position
105 * @map: pointer to a bitmap
106 * @start: a bit position in @map
107 * @nr: number of bits to clear
108 *
109 * Clear @nr bits starting from @start in @map lock-lessly. Several
110 * users can set/clear the same bitmap simultaneously without a lock.
111 * If two users clear the same bit, one of them gets back the number
112 * of bits still to be cleared, otherwise 0 is returned.
113 */
114static int bitmap_clear_ll(unsigned long *map, int start, int nr)
115{
116 unsigned long *p = map + BIT_WORD(start);
117 const int size = start + nr;
118 int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
119 unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
120
121 while (nr - bits_to_clear >= 0) {
122 if (clear_bits_ll(p, mask_to_clear))
123 return nr;
124 nr -= bits_to_clear;
125 bits_to_clear = BITS_PER_LONG;
126 mask_to_clear = ~0UL;
127 p++;
128 }
129 if (nr) {
130 mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
131 if (clear_bits_ll(p, mask_to_clear))
132 return nr;
133 }
134
135 return 0;
136}
18 137
19/** 138/**
20 * gen_pool_create - create a new special memory pool 139 * gen_pool_create - create a new special memory pool
@@ -30,7 +149,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
30 149
31 pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid); 150 pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
32 if (pool != NULL) { 151 if (pool != NULL) {
33 rwlock_init(&pool->lock); 152 spin_lock_init(&pool->lock);
34 INIT_LIST_HEAD(&pool->chunks); 153 INIT_LIST_HEAD(&pool->chunks);
35 pool->min_alloc_order = min_alloc_order; 154 pool->min_alloc_order = min_alloc_order;
36 } 155 }
@@ -63,14 +182,14 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
63 if (unlikely(chunk == NULL)) 182 if (unlikely(chunk == NULL))
64 return -ENOMEM; 183 return -ENOMEM;
65 184
66 spin_lock_init(&chunk->lock);
67 chunk->phys_addr = phys; 185 chunk->phys_addr = phys;
68 chunk->start_addr = virt; 186 chunk->start_addr = virt;
69 chunk->end_addr = virt + size; 187 chunk->end_addr = virt + size;
188 atomic_set(&chunk->avail, size);
70 189
71 write_lock(&pool->lock); 190 spin_lock(&pool->lock);
72 list_add(&chunk->next_chunk, &pool->chunks); 191 list_add_rcu(&chunk->next_chunk, &pool->chunks);
73 write_unlock(&pool->lock); 192 spin_unlock(&pool->lock);
74 193
75 return 0; 194 return 0;
76} 195}
@@ -85,19 +204,19 @@ EXPORT_SYMBOL(gen_pool_add_virt);
85 */ 204 */
86phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) 205phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)
87{ 206{
88 struct list_head *_chunk;
89 struct gen_pool_chunk *chunk; 207 struct gen_pool_chunk *chunk;
208 phys_addr_t paddr = -1;
90 209
91 read_lock(&pool->lock); 210 rcu_read_lock();
92 list_for_each(_chunk, &pool->chunks) { 211 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
93 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 212 if (addr >= chunk->start_addr && addr < chunk->end_addr) {
94 213 paddr = chunk->phys_addr + (addr - chunk->start_addr);
95 if (addr >= chunk->start_addr && addr < chunk->end_addr) 214 break;
96 return chunk->phys_addr + addr - chunk->start_addr; 215 }
97 } 216 }
98 read_unlock(&pool->lock); 217 rcu_read_unlock();
99 218
100 return -1; 219 return paddr;
101} 220}
102EXPORT_SYMBOL(gen_pool_virt_to_phys); 221EXPORT_SYMBOL(gen_pool_virt_to_phys);
103 222
@@ -115,7 +234,6 @@ void gen_pool_destroy(struct gen_pool *pool)
115 int order = pool->min_alloc_order; 234 int order = pool->min_alloc_order;
116 int bit, end_bit; 235 int bit, end_bit;
117 236
118
119 list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { 237 list_for_each_safe(_chunk, _next_chunk, &pool->chunks) {
120 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 238 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
121 list_del(&chunk->next_chunk); 239 list_del(&chunk->next_chunk);
@@ -137,44 +255,50 @@ EXPORT_SYMBOL(gen_pool_destroy);
137 * @size: number of bytes to allocate from the pool 255 * @size: number of bytes to allocate from the pool
138 * 256 *
139 * Allocate the requested number of bytes from the specified pool. 257 * Allocate the requested number of bytes from the specified pool.
140 * Uses a first-fit algorithm. 258 * Uses a first-fit algorithm. Can not be used in NMI handler on
259 * architectures without NMI-safe cmpxchg implementation.
141 */ 260 */
142unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) 261unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
143{ 262{
144 struct list_head *_chunk;
145 struct gen_pool_chunk *chunk; 263 struct gen_pool_chunk *chunk;
146 unsigned long addr, flags; 264 unsigned long addr = 0;
147 int order = pool->min_alloc_order; 265 int order = pool->min_alloc_order;
148 int nbits, start_bit, end_bit; 266 int nbits, start_bit = 0, end_bit, remain;
267
268#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
269 BUG_ON(in_nmi());
270#endif
149 271
150 if (size == 0) 272 if (size == 0)
151 return 0; 273 return 0;
152 274
153 nbits = (size + (1UL << order) - 1) >> order; 275 nbits = (size + (1UL << order) - 1) >> order;
154 276 rcu_read_lock();
155 read_lock(&pool->lock); 277 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
156 list_for_each(_chunk, &pool->chunks) { 278 if (size > atomic_read(&chunk->avail))
157 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); 279 continue;
158 280
159 end_bit = (chunk->end_addr - chunk->start_addr) >> order; 281 end_bit = (chunk->end_addr - chunk->start_addr) >> order;
160 282retry:
161 spin_lock_irqsave(&chunk->lock, flags); 283 start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit,
162 start_bit = bitmap_find_next_zero_area(chunk->bits, end_bit, 0, 284 start_bit, nbits, 0);
163 nbits, 0); 285 if (start_bit >= end_bit)
164 if (start_bit >= end_bit) {
165 spin_unlock_irqrestore(&chunk->lock, flags);
166 continue; 286 continue;
287 remain = bitmap_set_ll(chunk->bits, start_bit, nbits);
288 if (remain) {
289 remain = bitmap_clear_ll(chunk->bits, start_bit,
290 nbits - remain);
291 BUG_ON(remain);
292 goto retry;
167 } 293 }
168 294
169 addr = chunk->start_addr + ((unsigned long)start_bit << order); 295 addr = chunk->start_addr + ((unsigned long)start_bit << order);
170 296 size = nbits << order;
171 bitmap_set(chunk->bits, start_bit, nbits); 297 atomic_sub(size, &chunk->avail);
172 spin_unlock_irqrestore(&chunk->lock, flags); 298 break;
173 read_unlock(&pool->lock);
174 return addr;
175 } 299 }
176 read_unlock(&pool->lock); 300 rcu_read_unlock();
177 return 0; 301 return addr;
178} 302}
179EXPORT_SYMBOL(gen_pool_alloc); 303EXPORT_SYMBOL(gen_pool_alloc);
180 304
@@ -184,33 +308,95 @@ EXPORT_SYMBOL(gen_pool_alloc);
184 * @addr: starting address of memory to free back to pool 308 * @addr: starting address of memory to free back to pool
185 * @size: size in bytes of memory to free 309 * @size: size in bytes of memory to free
186 * 310 *
187 * Free previously allocated special memory back to the specified pool. 311 * Free previously allocated special memory back to the specified
312 * pool. Can not be used in NMI handler on architectures without
313 * NMI-safe cmpxchg implementation.
188 */ 314 */
189void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) 315void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
190{ 316{
191 struct list_head *_chunk;
192 struct gen_pool_chunk *chunk; 317 struct gen_pool_chunk *chunk;
193 unsigned long flags;
194 int order = pool->min_alloc_order; 318 int order = pool->min_alloc_order;
195 int bit, nbits; 319 int start_bit, nbits, remain;
196 320
197 nbits = (size + (1UL << order) - 1) >> order; 321#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
198 322 BUG_ON(in_nmi());
199 read_lock(&pool->lock); 323#endif
200 list_for_each(_chunk, &pool->chunks) {
201 chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
202 324
325 nbits = (size + (1UL << order) - 1) >> order;
326 rcu_read_lock();
327 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
203 if (addr >= chunk->start_addr && addr < chunk->end_addr) { 328 if (addr >= chunk->start_addr && addr < chunk->end_addr) {
204 BUG_ON(addr + size > chunk->end_addr); 329 BUG_ON(addr + size > chunk->end_addr);
205 spin_lock_irqsave(&chunk->lock, flags); 330 start_bit = (addr - chunk->start_addr) >> order;
206 bit = (addr - chunk->start_addr) >> order; 331 remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
207 while (nbits--) 332 BUG_ON(remain);
208 __clear_bit(bit++, chunk->bits); 333 size = nbits << order;
209 spin_unlock_irqrestore(&chunk->lock, flags); 334 atomic_add(size, &chunk->avail);
210 break; 335 rcu_read_unlock();
336 return;
211 } 337 }
212 } 338 }
213 BUG_ON(nbits > 0); 339 rcu_read_unlock();
214 read_unlock(&pool->lock); 340 BUG();
215} 341}
216EXPORT_SYMBOL(gen_pool_free); 342EXPORT_SYMBOL(gen_pool_free);
343
344/**
345 * gen_pool_for_each_chunk - call func for every chunk of generic memory pool
346 * @pool: the generic memory pool
347 * @func: func to call
348 * @data: additional data used by @func
349 *
350 * Call @func for every chunk of the generic memory pool. @func is
351 * called with rcu_read_lock() held, so it must not sleep.
352 */
353void gen_pool_for_each_chunk(struct gen_pool *pool,
354 void (*func)(struct gen_pool *pool, struct gen_pool_chunk *chunk, void *data),
355 void *data)
356{
357 struct gen_pool_chunk *chunk;
358
359 rcu_read_lock();
360 list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk)
361 func(pool, chunk, data);
362 rcu_read_unlock();
363}
364EXPORT_SYMBOL(gen_pool_for_each_chunk);
365
366/**
367 * gen_pool_avail - get available free space of the pool
368 * @pool: pool to get available free space
369 *
370 * Return available free space of the specified pool.
371 */
372size_t gen_pool_avail(struct gen_pool *pool)
373{
374 struct gen_pool_chunk *chunk;
375 size_t avail = 0;
376
377 rcu_read_lock();
378 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
379 avail += atomic_read(&chunk->avail);
380 rcu_read_unlock();
381 return avail;
382}
383EXPORT_SYMBOL_GPL(gen_pool_avail);
384
385/**
386 * gen_pool_size - get size in bytes of memory managed by the pool
387 * @pool: pool to get size
388 *
389 * Return size in bytes of memory managed by the pool.
390 */
391size_t gen_pool_size(struct gen_pool *pool)
392{
393 struct gen_pool_chunk *chunk;
394 size_t size = 0;
395
396 rcu_read_lock();
397 list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
398 size += chunk->end_addr - chunk->start_addr;
399 rcu_read_unlock();
400 return size;
401}
402EXPORT_SYMBOL_GPL(gen_pool_size);
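The allocation and free paths above are now lockless, but adding a chunk still takes pool->lock, so callers that rely on NMI/IRQ-safe allocation must populate the pool up front, as the new header comment notes. A minimal sketch of that preallocate-then-use pattern; the names (my_pool, my_pool_init, my_record_event) and the sizes are hypothetical, and NMI use additionally requires ARCH_HAVE_NMI_SAFE_CMPXCHG:

#include <linux/genalloc.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

#define MY_POOL_SIZE	(16 * 4096)
#define MY_RECORD_SIZE	64

static struct gen_pool *my_pool;

/* process context: create the pool and preallocate its backing memory */
static int my_pool_init(void)
{
	void *buf = vmalloc(MY_POOL_SIZE);

	if (!buf)
		return -ENOMEM;
	my_pool = gen_pool_create(6, -1);	/* 2^6 = 64-byte granules */
	if (!my_pool) {
		vfree(buf);
		return -ENOMEM;
	}
	return gen_pool_add(my_pool, (unsigned long)buf, MY_POOL_SIZE, -1);
}

/* IRQ/NMI context: lockless allocate and free from the preallocated chunk */
static void my_record_event(void)
{
	unsigned long rec = gen_pool_alloc(my_pool, MY_RECORD_SIZE);

	if (!rec)
		return;		/* nothing left in the preallocated pool */
	/* ... fill in the record ... */
	gen_pool_free(my_pool, rec, MY_RECORD_SIZE);
}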
diff --git a/lib/llist.c b/lib/llist.c
new file mode 100644
index 000000000000..da445724fa1f
--- /dev/null
+++ b/lib/llist.c
@@ -0,0 +1,129 @@
1/*
2 * Lock-less NULL terminated single linked list
3 *
4 * The basic atomic operation of this list is cmpxchg on long. On
5 * architectures that don't have NMI-safe cmpxchg implementation, the
6 * list can NOT be used in NMI handler. So code uses the list in NMI
7 * handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
8 *
9 * Copyright 2010,2011 Intel Corp.
10 * Author: Huang Ying <ying.huang@intel.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version
14 * 2 as published by the Free Software Foundation;
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25#include <linux/kernel.h>
26#include <linux/module.h>
27#include <linux/interrupt.h>
28#include <linux/llist.h>
29
30#include <asm/system.h>
31
32/**
33 * llist_add - add a new entry
34 * @new: new entry to be added
35 * @head: the head for your lock-less list
36 */
37void llist_add(struct llist_node *new, struct llist_head *head)
38{
39 struct llist_node *entry, *old_entry;
40
41#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
42 BUG_ON(in_nmi());
43#endif
44
45 entry = head->first;
46 do {
47 old_entry = entry;
48 new->next = entry;
49 cpu_relax();
50 } while ((entry = cmpxchg(&head->first, old_entry, new)) != old_entry);
51}
52EXPORT_SYMBOL_GPL(llist_add);
53
54/**
55 * llist_add_batch - add several linked entries in batch
56 * @new_first: first entry in batch to be added
57 * @new_last: last entry in batch to be added
58 * @head: the head for your lock-less list
59 */
60void llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
61 struct llist_head *head)
62{
63 struct llist_node *entry, *old_entry;
64
65#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
66 BUG_ON(in_nmi());
67#endif
68
69 entry = head->first;
70 do {
71 old_entry = entry;
72 new_last->next = entry;
73 cpu_relax();
74 } while ((entry = cmpxchg(&head->first, old_entry, new_first)) != old_entry);
75}
76EXPORT_SYMBOL_GPL(llist_add_batch);
77
78/**
79 * llist_del_first - delete the first entry of lock-less list
80 * @head: the head for your lock-less list
81 *
82 * If the list is empty, return NULL; otherwise, return the deleted
83 * first entry, which is the most recently added one.
84 *
85 * Only one llist_del_first user may run concurrently with multiple
86 * llist_add users without a lock, because otherwise an
87 * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
88 * llist_add) sequence in another user may change @head->first->next
89 * while keeping @head->first. If multiple consumers are needed, please
90 * use llist_del_all or a lock between consumers.
91 */
92struct llist_node *llist_del_first(struct llist_head *head)
93{
94 struct llist_node *entry, *old_entry, *next;
95
96#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
97 BUG_ON(in_nmi());
98#endif
99
100 entry = head->first;
101 do {
102 if (entry == NULL)
103 return NULL;
104 old_entry = entry;
105 next = entry->next;
106 cpu_relax();
107 } while ((entry = cmpxchg(&head->first, old_entry, next)) != old_entry);
108
109 return entry;
110}
111EXPORT_SYMBOL_GPL(llist_del_first);
112
113/**
114 * llist_del_all - delete all entries from lock-less list
115 * @head: the head of lock-less list to delete all entries
116 *
117 * If the list is empty, return NULL; otherwise, delete all entries and
118 * return the pointer to the first entry. The order of entries
119 * deleted is from the newest to the oldest added one.
120 */
121struct llist_node *llist_del_all(struct llist_head *head)
122{
123#ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
124 BUG_ON(in_nmi());
125#endif
126
127 return xchg(&head->first, NULL);
128}
129EXPORT_SYMBOL_GPL(llist_del_all);
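llist_add_batch() above publishes a pre-linked chain with a single cmpxchg retry loop instead of one per entry. A small sketch, with hypothetical struct and function names, of building a private chain and splicing it onto a shared list in one shot:

#include <linux/llist.h>

struct my_item {
	int val;
	struct llist_node llnode;
};

static LLIST_HEAD(my_shared_list);

/* link items[0..n-1] privately, then publish the whole batch at once */
static void my_publish_batch(struct my_item *items, int n)
{
	int i;

	if (n <= 0)
		return;
	for (i = 0; i < n - 1; i++)
		items[i].llnode.next = &items[i + 1].llnode;

	/* llist_add_batch() sets the last entry's ->next itself */
	llist_add_batch(&items[0].llnode, &items[n - 1].llnode,
			&my_shared_list);
}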
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059c..2b43ba051ac9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -53,6 +53,7 @@
53#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 54#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h> 55#include <linux/mm_inline.h>
56#include <linux/kfifo.h>
56#include "internal.h" 57#include "internal.h"
57 58
58int sysctl_memory_failure_early_kill __read_mostly = 0; 59int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno)
1178 __memory_failure(pfn, trapno, 0); 1179 __memory_failure(pfn, trapno, 0);
1179} 1180}
1180 1181
1182#define MEMORY_FAILURE_FIFO_ORDER 4
1183#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1184
1185struct memory_failure_entry {
1186 unsigned long pfn;
1187 int trapno;
1188 int flags;
1189};
1190
1191struct memory_failure_cpu {
1192 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1193 MEMORY_FAILURE_FIFO_SIZE);
1194 spinlock_t lock;
1195 struct work_struct work;
1196};
1197
1198static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1199
1200/**
1201 * memory_failure_queue - Schedule handling memory failure of a page.
1202 * @pfn: Page Number of the corrupted page
1203 * @trapno: Trap number reported in the signal to user space.
1204 * @flags: Flags for memory failure handling
1205 *
1206 * This function is called by the low level hardware error handler
1207 * when it detects hardware memory corruption of a page. It schedules
1208 * the recovery of the error page, including dropping pages, killing
1209 * processes, etc.
1210 *
1211 * The function is primarily of use for corruptions that
1212 * happen outside the current execution context (e.g. when
1213 * detected by a background scrubber).
1214 *
1215 * Can run in IRQ context.
1216 */
1217void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1218{
1219 struct memory_failure_cpu *mf_cpu;
1220 unsigned long proc_flags;
1221 struct memory_failure_entry entry = {
1222 .pfn = pfn,
1223 .trapno = trapno,
1224 .flags = flags,
1225 };
1226
1227 mf_cpu = &get_cpu_var(memory_failure_cpu);
1228 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1229 if (kfifo_put(&mf_cpu->fifo, &entry))
1230 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1231 else
1232 pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
1233 pfn);
1234 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1235 put_cpu_var(memory_failure_cpu);
1236}
1237EXPORT_SYMBOL_GPL(memory_failure_queue);
1238
1239static void memory_failure_work_func(struct work_struct *work)
1240{
1241 struct memory_failure_cpu *mf_cpu;
1242 struct memory_failure_entry entry = { 0, };
1243 unsigned long proc_flags;
1244 int gotten;
1245
1246 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1247 for (;;) {
1248 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1249 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1250 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1251 if (!gotten)
1252 break;
1253 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1254 }
1255}
1256
1257static int __init memory_failure_init(void)
1258{
1259 struct memory_failure_cpu *mf_cpu;
1260 int cpu;
1261
1262 for_each_possible_cpu(cpu) {
1263 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1264 spin_lock_init(&mf_cpu->lock);
1265 INIT_KFIFO(mf_cpu->fifo);
1266 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1267 }
1268
1269 return 0;
1270}
1271core_initcall(memory_failure_init);
1272
1181/** 1273/**
1182 * unpoison_memory - Unpoison a previously poisoned page 1274 * unpoison_memory - Unpoison a previously poisoned page
1183 * @pfn: Page number of the to be unpoisoned page 1275 * @pfn: Page number of the to be unpoisoned page