-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt |   6
-rw-r--r--  arch/x86/Kconfig                                |  10
-rw-r--r--  arch/x86/include/asm/mce.h                      |  12
-rw-r--r--  arch/x86/include/asm/reboot.h                   |   1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile             |   2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/dev-mcelog.c         | 397
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h       |   8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c                | 561
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c          |   3
-rw-r--r--  arch/x86/kernel/reboot.c                        |   5
-rw-r--r--  arch/x86/ras/Kconfig                            |  14
-rw-r--r--  drivers/acpi/apei/ghes.c                        |   5
-rw-r--r--  drivers/ras/Makefile                            |   3
-rw-r--r--  drivers/ras/cec.c                               | 532
-rw-r--r--  drivers/ras/debugfs.c                           |   2
-rw-r--r--  drivers/ras/debugfs.h                           |   8
-rw-r--r--  drivers/ras/ras.c                               |  11
-rw-r--r--  include/linux/ras.h                             |  13
18 files changed, 1154 insertions(+), 439 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 51d4dddde0b3..ba294509cd7e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3177,6 +3177,12 @@
 	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
 			See Documentation/blockdev/ramdisk.txt.
 
+	ras=option[,option,...]	[KNL] RAS-specific options
+
+		cec_disable	[X86]
+			Disable the Correctable Errors Collector,
+			see CONFIG_RAS_CEC help text.
+
 	rcu_nocbs=	[KNL]
 			The argument is a cpu list, as described above.
 
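
For reference, disabling the collector at boot amounts to appending the new option to the kernel command line, e.g. in a hypothetical bootloader entry (paths and root device are placeholders):

    linux /boot/vmlinuz root=/dev/sda1 ro ras=cec_disable
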
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d50fdff77ee..2a00902e657a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1042,6 +1042,14 @@ config X86_MCE
 	  The action the kernel takes depends on the severity of the problem,
 	  ranging from warning messages to halting the machine.
 
+config X86_MCELOG_LEGACY
+	bool "Support for deprecated /dev/mcelog character device"
+	depends on X86_MCE
+	---help---
+	  Enable support for /dev/mcelog which is needed by the old mcelog
+	  userspace logging daemon. Consider switching to the new generation
+	  rasdaemon solution.
+
 config X86_MCE_INTEL
 	def_bool y
 	prompt "Intel MCE features"
@@ -1071,7 +1079,7 @@ config X86_MCE_THRESHOLD
 	def_bool y
 
 config X86_MCE_INJECT
-	depends on X86_MCE && X86_LOCAL_APIC
+	depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
 	tristate "Machine check injector support"
 	---help---
 	  Provide support for injecting machine checks for testing purposes.
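
A .config fragment exercising the new symbol might look as follows (illustrative only; note that X86_MCE_INJECT now additionally requires the legacy device):

    CONFIG_X86_MCE=y
    CONFIG_X86_MCELOG_LEGACY=y
    CONFIG_X86_MCE_INJECT=m
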
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index e63873683d4a..4fd5195deed0 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -128,7 +128,7 @@
  * debugging tools. Each entry is only valid when its finished flag
  * is set.
  */
-struct mce_log {
+struct mce_log_buffer {
 	char signature[12]; /* "MACHINECHECK" */
 	unsigned len;	    /* = MCE_LOG_LEN */
 	unsigned next;
@@ -191,10 +191,12 @@ extern struct mca_config mca_cfg;
 extern struct mca_msr_regs msr_ops;
 
 enum mce_notifier_prios {
-	MCE_PRIO_SRAO		= INT_MAX,
-	MCE_PRIO_EXTLOG		= INT_MAX - 1,
-	MCE_PRIO_NFIT		= INT_MAX - 2,
-	MCE_PRIO_EDAC		= INT_MAX - 3,
+	MCE_PRIO_FIRST		= INT_MAX,
+	MCE_PRIO_SRAO		= INT_MAX - 1,
+	MCE_PRIO_EXTLOG		= INT_MAX - 2,
+	MCE_PRIO_NFIT		= INT_MAX - 3,
+	MCE_PRIO_EDAC		= INT_MAX - 4,
+	MCE_PRIO_MCELOG		= 1,
 	MCE_PRIO_LOWEST		= 0,
 };
 
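
The renumbered priorities above determine the order in which consumers on the MCE decode chain see each record: MCE_PRIO_FIRST runs before everything else, MCE_PRIO_MCELOG just above the default printing notifier. A rough sketch of a consumer hooking in (the callback and its name are hypothetical; the registration API and priority are the ones used by this patch):

    #include <linux/notifier.h>
    #include <asm/mce.h>

    static int my_mce_decoder(struct notifier_block *nb, unsigned long val,
                              void *data)
    {
            struct mce *m = data;

            if (!m)
                    return NOTIFY_DONE;

            /* decode/log m->status, m->addr, m->misc here */
            return NOTIFY_DONE;
    }

    static struct notifier_block my_nb = {
            .notifier_call  = my_mce_decoder,
            .priority       = MCE_PRIO_EDAC, /* after FIRST, SRAO, EXTLOG, NFIT */
    };

    /* in module/init code: */
    mce_register_decode_chain(&my_nb);
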
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 2cb1cc253d51..fc62ba8dce93 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -15,6 +15,7 @@ struct machine_ops {
 };
 
 extern struct machine_ops machine_ops;
+extern int crashing_cpu;
 
 void native_machine_crash_shutdown(struct pt_regs *regs);
 void native_machine_shutdown(void);
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index a3311c886194..43051f0777d4 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
 
 obj-$(CONFIG_ACPI_APEI)		+= mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY)	+= dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 000000000000..9c632cb88546
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,397 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+#define mce_log_get_idx_check(p) \
+({ \
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+			 !lockdep_is_held(&mce_chrdev_read_mutex), \
+			 "suspicious mce_log_get_idx_check() usage"); \
+	smp_load_acquire(&(p)); \
+})
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+	.signature	= MCE_LOG_SIGNATURE,
+	.len		= MCE_LOG_LEN,
+	.recordlen	= sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char mce_helper[128];
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	unsigned int next, entry;
+
+	wmb();
+	for (;;) {
+		entry = mce_log_get_idx_check(mcelog.next);
+		for (;;) {
+
+			/*
+			 * When the buffer fills up discard new entries.
+			 * Assume that the earlier errors are the more
+			 * interesting ones:
+			 */
+			if (entry >= MCE_LOG_LEN) {
+				set_bit(MCE_OVERFLOW,
+					(unsigned long *)&mcelog.flags);
+				return NOTIFY_OK;
+			}
+			/* Old left over entry. Skip: */
+			if (mcelog.entry[entry].finished) {
+				entry++;
+				continue;
+			}
+			break;
+		}
+		smp_rmb();
+		next = entry + 1;
+		if (cmpxchg(&mcelog.next, entry, next) == entry)
+			break;
+	}
+	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+	wmb();
+	mcelog.entry[entry].finished = 1;
+	wmb();
+
+	/* wake processes polling /dev/mcelog */
+	wake_up_interruptible(&mce_chrdev_wait);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+	.notifier_call	= dev_mce_log,
+	.priority	= MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+	if (mce_helper[0])
+		schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+	strcpy(buf, mce_helper);
+	strcat(buf, "\n");
+	return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+				const char *buf, size_t siz)
+{
+	char *p;
+
+	strncpy(mce_helper, buf, sizeof(mce_helper));
+	mce_helper[sizeof(mce_helper)-1] = 0;
+	p = strchr(mce_helper, '\n');
+
+	if (p)
+		*p = 0;
+
+	return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count;	/* #times opened */
+static int mce_chrdev_open_exclu;	/* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+	spin_lock(&mce_chrdev_state_lock);
+
+	if (mce_chrdev_open_exclu ||
+	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+		spin_unlock(&mce_chrdev_state_lock);
+
+		return -EBUSY;
+	}
+
+	if (file->f_flags & O_EXCL)
+		mce_chrdev_open_exclu = 1;
+	mce_chrdev_open_count++;
+
+	spin_unlock(&mce_chrdev_state_lock);
+
+	return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&mce_chrdev_state_lock);
+
+	mce_chrdev_open_count--;
+	mce_chrdev_open_exclu = 0;
+
+	spin_unlock(&mce_chrdev_state_lock);
+
+	return 0;
+}
+
+static void collect_tscs(void *data)
+{
+	unsigned long *cpu_tsc = (unsigned long *)data;
+
+	cpu_tsc[smp_processor_id()] = rdtsc();
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+	int rc;
+	u64 record_id;
+	struct mce m;
+
+	if (usize < sizeof(struct mce))
+		return -EINVAL;
+
+	rc = apei_read_mce(&m, &record_id);
+	/* Error or no more MCE record */
+	if (rc <= 0) {
+		mce_apei_read_done = 1;
+		/*
+		 * When ERST is disabled, mce_chrdev_read() should return
+		 * "no record" instead of "no device."
+		 */
+		if (rc == -ENODEV)
+			return 0;
+		return rc;
+	}
+	rc = -EFAULT;
+	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+		return rc;
+	/*
+	 * In fact, we should have cleared the record after that has
+	 * been flushed to the disk or sent to network in
+	 * /sbin/mcelog, but we have no interface to support that now,
+	 * so just clear it to avoid duplication.
+	 */
+	rc = apei_clear_mce(record_id);
+	if (rc) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	*ubuf += sizeof(struct mce);
+
+	return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	char __user *buf = ubuf;
+	unsigned long *cpu_tsc;
+	unsigned prev, next;
+	int i, err;
+
+	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+	if (!cpu_tsc)
+		return -ENOMEM;
+
+	mutex_lock(&mce_chrdev_read_mutex);
+
+	if (!mce_apei_read_done) {
+		err = __mce_read_apei(&buf, usize);
+		if (err || buf != ubuf)
+			goto out;
+	}
+
+	next = mce_log_get_idx_check(mcelog.next);
+
+	/* Only supports full reads right now */
+	err = -EINVAL;
+	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+		goto out;
+
+	err = 0;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+			struct mce *m = &mcelog.entry[i];
+
+			while (!m->finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(m, 0, sizeof(*m));
+					goto timeout;
+				}
+				cpu_relax();
+			}
+			smp_rmb();
+			err |= copy_to_user(buf, m, sizeof(*m));
+			buf += sizeof(*m);
+timeout:
+			;
+		}
+
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
+
+	synchronize_sched();
+
+	/*
+	 * Collect entries that were still getting written before the
+	 * synchronize.
+	 */
+	on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+	for (i = next; i < MCE_LOG_LEN; i++) {
+		struct mce *m = &mcelog.entry[i];
+
+		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+			err |= copy_to_user(buf, m, sizeof(*m));
+			smp_rmb();
+			buf += sizeof(*m);
+			memset(m, 0, sizeof(*m));
+		}
+	}
+
+	if (err)
+		err = -EFAULT;
+
+out:
+	mutex_unlock(&mce_chrdev_read_mutex);
+	kfree(cpu_tsc);
+
+	return err ? err : buf - ubuf;
+}
+
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &mce_chrdev_wait, wait);
+	if (READ_ONCE(mcelog.next))
+		return POLLIN | POLLRDNORM;
+	if (!mce_apei_read_done && apei_check_mce())
+		return POLLIN | POLLRDNORM;
+	return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+				unsigned long arg)
+{
+	int __user *p = (int __user *)arg;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case MCE_GET_RECORD_LEN:
+		return put_user(sizeof(struct mce), p);
+	case MCE_GET_LOG_LEN:
+		return put_user(MCE_LOG_LEN, p);
+	case MCE_GETCLEAR_FLAGS: {
+		unsigned flags;
+
+		do {
+			flags = mcelog.flags;
+		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+		return put_user(flags, p);
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+			    size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+			     const char __user *ubuf,
+			     size_t usize, loff_t *off))
+{
+	mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	if (mce_write)
+		return mce_write(filp, ubuf, usize, off);
+	else
+		return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+	.open			= mce_chrdev_open,
+	.release		= mce_chrdev_release,
+	.read			= mce_chrdev_read,
+	.write			= mce_chrdev_write,
+	.poll			= mce_chrdev_poll,
+	.unlocked_ioctl		= mce_chrdev_ioctl,
+	.llseek			= no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+	MISC_MCELOG_MINOR,
+	"mcelog",
+	&mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+	int err;
+
+	/* register character device /dev/mcelog */
+	err = misc_register(&mce_chrdev_device);
+	if (err) {
+		pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+		return err;
+	}
+	mce_register_decode_chain(&dev_mcelog_nb);
+	return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
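
From userspace, the device keeps its old semantics: the ioctls report the log geometry, and read() only supports draining the whole buffer at once. A minimal, hypothetical reader sketch (not part of the patch):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <asm/mce.h>        /* MCE_GET_RECORD_LEN, MCE_GET_LOG_LEN */

    int main(void)
    {
            int fd, rec_len, log_len;
            char *buf;
            ssize_t n;

            fd = open("/dev/mcelog", O_RDONLY);
            if (fd < 0)
                    return 1;

            if (ioctl(fd, MCE_GET_RECORD_LEN, &rec_len) < 0 ||
                ioctl(fd, MCE_GET_LOG_LEN, &log_len) < 0)
                    return 1;

            /* mce_chrdev_read() rejects anything smaller than the full log */
            buf = malloc((size_t)rec_len * log_len);
            if (!buf)
                    return 1;

            n = read(fd, buf, (size_t)rec_len * log_len);
            if (n >= 0)
                    printf("drained %zd bytes (%zd records)\n", n, n / rec_len);

            free(buf);
            close(fd);
            return 0;
    }
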
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 19592ba1a320..654ad0668d72 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
 		m1->addr != m2->addr ||
 		m1->misc != m2->misc;
 }
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void)	{ }
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index af44ebeb593f..5abd4bf73d6e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -35,6 +35,7 @@
 #include <linux/poll.h>
 #include <linux/nmi.h>
 #include <linux/cpu.h>
+#include <linux/ras.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -49,20 +50,11 @@
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
+#include <asm/reboot.h>
 
 #include "mce-internal.h"
 
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-static int mce_chrdev_open_count;	/* #times opened */
-
-#define mce_log_get_idx_check(p) \
-({ \
-	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
-			 !lockdep_is_held(&mce_chrdev_read_mutex), \
-			 "suspicious mce_log_get_idx_check() usage"); \
-	smp_load_acquire(&(p)); \
-})
+static DEFINE_MUTEX(mce_log_mutex);
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
@@ -87,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
 	.monarch_timeout	= -1
 };
 
-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
 static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
+static unsigned long mce_need_notify;
+static int cpu_missing;
 
 /*
  * MCA banks polled by the period polling timer for corrected events.
@@ -145,80 +131,36 @@ void mce_setup(struct mce *m)
 DEFINE_PER_CPU(struct mce, injectm);
 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
-	.signature	= MCE_LOG_SIGNATURE,
-	.len		= MCE_LOG_LEN,
-	.recordlen	= sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
+void mce_log(struct mce *m)
 {
-	unsigned next, entry;
-
-	/* Emit the trace record: */
-	trace_mce_record(mce);
-
-	if (!mce_gen_pool_add(mce))
+	if (!mce_gen_pool_add(m))
 		irq_work_queue(&mce_irq_work);
-
-	wmb();
-	for (;;) {
-		entry = mce_log_get_idx_check(mcelog.next);
-		for (;;) {
-
-			/*
-			 * When the buffer fills up discard new entries.
-			 * Assume that the earlier errors are the more
-			 * interesting ones:
-			 */
-			if (entry >= MCE_LOG_LEN) {
-				set_bit(MCE_OVERFLOW,
-					(unsigned long *)&mcelog.flags);
-				return;
-			}
-			/* Old left over entry. Skip: */
-			if (mcelog.entry[entry].finished) {
-				entry++;
-				continue;
-			}
-			break;
-		}
-		smp_rmb();
-		next = entry + 1;
-		if (cmpxchg(&mcelog.next, entry, next) == entry)
-			break;
-	}
-	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
-	wmb();
-	mcelog.entry[entry].finished = 1;
-	wmb();
-
-	set_bit(0, &mce_need_notify);
 }
 
 void mce_inject_log(struct mce *m)
 {
-	mutex_lock(&mce_chrdev_read_mutex);
+	mutex_lock(&mce_log_mutex);
 	mce_log(m);
-	mutex_unlock(&mce_chrdev_read_mutex);
+	mutex_unlock(&mce_log_mutex);
 }
 EXPORT_SYMBOL_GPL(mce_inject_log);
 
 static struct notifier_block mce_srao_nb;
 
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS	3
 static atomic_t num_notifiers;
 
 void mce_register_decode_chain(struct notifier_block *nb)
 {
-	atomic_inc(&num_notifiers);
+	if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+		return;
 
-	WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
+	atomic_inc(&num_notifiers);
 
 	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 }
@@ -510,7 +452,6 @@ static void mce_schedule_work(void)
 
 static void mce_irq_work_cb(struct irq_work *entry)
 {
-	mce_notify_irq();
 	mce_schedule_work();
 }
 
@@ -539,20 +480,97 @@ static void mce_report_event(struct pt_regs *regs)
  */
 static int mce_usable_address(struct mce *m)
 {
-	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
+	if (!(m->status & MCI_STATUS_ADDRV))
 		return 0;
 
 	/* Checks after this one are Intel-specific: */
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 		return 1;
 
+	if (!(m->status & MCI_STATUS_MISCV))
+		return 0;
+
 	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 		return 0;
+
 	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 		return 0;
+
 	return 1;
 }
 
+static bool memory_error(struct mce *m)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (c->x86_vendor == X86_VENDOR_AMD) {
+		/* ErrCodeExt[20:16] */
+		u8 xec = (m->status >> 16) & 0x1f;
+
+		return (xec == 0x0 || xec == 0x8);
+	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
+		/*
+		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+		 *
+		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+		 * indicating a memory error. Bit 8 is used for indicating a
+		 * cache hierarchy error. The combination of bit 2 and bit 3
+		 * is used for indicating a `generic' cache hierarchy error
+		 * But we can't just blindly check the above bits, because if
+		 * bit 11 is set, then it is a bus/interconnect error - and
+		 * either way the above bits just gives more detail on what
+		 * bus/interconnect error happened. Note that bit 12 can be
+		 * ignored, as it's the "filter" bit.
+		 */
+		return (m->status & 0xef80) == BIT(7) ||
+		       (m->status & 0xef00) == BIT(8) ||
+		       (m->status & 0xeffc) == 0xc;
+	}
+
+	return false;
+}
+
+static bool cec_add_mce(struct mce *m)
+{
+	if (!m)
+		return false;
+
+	/* We eat only correctable DRAM errors with usable addresses. */
+	if (memory_error(m) &&
+	    !(m->status & MCI_STATUS_UC) &&
+	    mce_usable_address(m))
+		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+			return true;
+
+	return false;
+}
+
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+			      void *data)
+{
+	struct mce *m = (struct mce *)data;
+
+	if (!m)
+		return NOTIFY_DONE;
+
+	if (cec_add_mce(m))
+		return NOTIFY_STOP;
+
+	/* Emit the trace record: */
+	trace_mce_record(m);
+
+	set_bit(0, &mce_need_notify);
+
+	mce_notify_irq();
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+	.notifier_call	= mce_first_notifier,
+	.priority	= MCE_PRIO_FIRST,
+};
+
 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 				void *data)
 {
@@ -582,15 +600,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 	if (!m)
 		return NOTIFY_DONE;
 
-	/*
-	 * Run the default notifier if we have only the SRAO
-	 * notifier and us registered.
-	 */
-	if (atomic_read(&num_notifiers) > 2)
-		return NOTIFY_DONE;
-
-	/* Don't print when mcelog is running */
-	if (mce_chrdev_open_count > 0)
+	if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
 		return NOTIFY_DONE;
 
 	__print_mce(m);
@@ -643,37 +653,6 @@ static void mce_read_aux(struct mce *m, int i)
 	}
 }
 
-static bool memory_error(struct mce *m)
-{
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	if (c->x86_vendor == X86_VENDOR_AMD) {
-		/* ErrCodeExt[20:16] */
-		u8 xec = (m->status >> 16) & 0x1f;
-
-		return (xec == 0x0 || xec == 0x8);
-	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
-		/*
-		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
-		 *
-		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
-		 * indicating a memory error. Bit 8 is used for indicating a
-		 * cache hierarchy error. The combination of bit 2 and bit 3
-		 * is used for indicating a `generic' cache hierarchy error
-		 * But we can't just blindly check the above bits, because if
-		 * bit 11 is set, then it is a bus/interconnect error - and
-		 * either way the above bits just gives more detail on what
-		 * bus/interconnect error happened. Note that bit 12 can be
-		 * ignored, as it's the "filter" bit.
-		 */
-		return (m->status & 0xef80) == BIT(7) ||
-		       (m->status & 0xef00) == BIT(8) ||
-		       (m->status & 0xeffc) == 0xc;
-	}
-
-	return false;
-}
-
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
@@ -1122,9 +1101,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * on Intel.
 	 */
 	int lmce = 1;
+	int cpu = smp_processor_id();
 
-	/* If this CPU is offline, just bail out. */
-	if (cpu_is_offline(smp_processor_id())) {
+	/*
+	 * Cases where we avoid rendezvous handler timeout:
+	 * 1) If this CPU is offline.
+	 *
+	 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+	 *    skip those CPUs which remain looping in the 1st kernel - see
+	 *    crash_nmi_callback().
+	 *
+	 * Note: there still is a small window between kexec-ing and the new,
+	 * kdump kernel establishing a new #MC handler where a broadcasted MCE
+	 * might not get handled properly.
+	 */
+	if (cpu_is_offline(cpu) ||
+	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
 		u64 mcgstatus;
 
 		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -1394,13 +1386,6 @@ static void mce_timer_delete_all(void)
 		del_timer_sync(&per_cpu(mce_timer, cpu));
 }
 
-static void mce_do_trigger(struct work_struct *work)
-{
-	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
 /*
  * Notify the user(s) about new machine check events.
  * Can be called from interrupt context, but not from machine check/NMI
@@ -1412,11 +1397,7 @@ int mce_notify_irq(void)
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
 	if (test_and_clear_bit(0, &mce_need_notify)) {
-		/* wake processes polling /dev/mcelog */
-		wake_up_interruptible(&mce_chrdev_wait);
-
-		if (mce_helper[0])
-			schedule_work(&mce_trigger_work);
+		mce_work_trigger();
 
 		if (__ratelimit(&ratelimit))
 			pr_info(HW_ERR "Machine check events logged\n");
@@ -1683,30 +1664,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 	return 0;
 }
 
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
 {
-	switch (c->x86_vendor) {
-	case X86_VENDOR_INTEL:
-		mce_intel_feature_init(c);
-		mce_adjust_timer = cmci_intel_adjust_timer;
-		break;
-
-	case X86_VENDOR_AMD: {
+	if (c->x86_vendor == X86_VENDOR_AMD) {
 		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
 		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
 		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
 
-		/*
-		 * Install proper ops for Scalable MCA enabled processors
-		 */
 		if (mce_flags.smca) {
 			msr_ops.ctl	= smca_ctl_reg;
 			msr_ops.status	= smca_status_reg;
 			msr_ops.addr	= smca_addr_reg;
 			msr_ops.misc	= smca_misc_reg;
 		}
-		mce_amd_feature_init(c);
+	}
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		mce_intel_feature_init(c);
+		mce_adjust_timer = cmci_intel_adjust_timer;
+		break;
 
+	case X86_VENDOR_AMD: {
+		mce_amd_feature_init(c);
 		break;
 	}
 
@@ -1793,6 +1779,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 
 	machine_check_vector = do_machine_check;
 
+	__mcheck_cpu_init_early(c);
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_clear_banks();
@@ -1818,251 +1805,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
 
 }
 
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_exclu;	/* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
-	spin_lock(&mce_chrdev_state_lock);
-
-	if (mce_chrdev_open_exclu ||
-	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
-		spin_unlock(&mce_chrdev_state_lock);
-
-		return -EBUSY;
-	}
-
-	if (file->f_flags & O_EXCL)
-		mce_chrdev_open_exclu = 1;
-	mce_chrdev_open_count++;
-
-	spin_unlock(&mce_chrdev_state_lock);
-
-	return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
-	spin_lock(&mce_chrdev_state_lock);
-
-	mce_chrdev_open_count--;
-	mce_chrdev_open_exclu = 0;
-
-	spin_unlock(&mce_chrdev_state_lock);
-
-	return 0;
-}
-
-static void collect_tscs(void *data)
-{
-	unsigned long *cpu_tsc = (unsigned long *)data;
-
-	cpu_tsc[smp_processor_id()] = rdtsc();
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
-	int rc;
-	u64 record_id;
-	struct mce m;
-
-	if (usize < sizeof(struct mce))
-		return -EINVAL;
-
-	rc = apei_read_mce(&m, &record_id);
-	/* Error or no more MCE record */
-	if (rc <= 0) {
-		mce_apei_read_done = 1;
-		/*
-		 * When ERST is disabled, mce_chrdev_read() should return
-		 * "no record" instead of "no device."
-		 */
-		if (rc == -ENODEV)
-			return 0;
-		return rc;
-	}
-	rc = -EFAULT;
-	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
-		return rc;
-	/*
-	 * In fact, we should have cleared the record after that has
-	 * been flushed to the disk or sent to network in
-	 * /sbin/mcelog, but we have no interface to support that now,
-	 * so just clear it to avoid duplication.
-	 */
-	rc = apei_clear_mce(record_id);
-	if (rc) {
-		mce_apei_read_done = 1;
-		return rc;
-	}
-	*ubuf += sizeof(struct mce);
-
-	return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
-				size_t usize, loff_t *off)
-{
-	char __user *buf = ubuf;
-	unsigned long *cpu_tsc;
-	unsigned prev, next;
-	int i, err;
-
-	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
-	if (!cpu_tsc)
-		return -ENOMEM;
-
-	mutex_lock(&mce_chrdev_read_mutex);
-
-	if (!mce_apei_read_done) {
-		err = __mce_read_apei(&buf, usize);
-		if (err || buf != ubuf)
-			goto out;
-	}
-
-	next = mce_log_get_idx_check(mcelog.next);
-
-	/* Only supports full reads right now */
-	err = -EINVAL;
-	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
-		goto out;
-
-	err = 0;
-	prev = 0;
-	do {
-		for (i = prev; i < next; i++) {
-			unsigned long start = jiffies;
-			struct mce *m = &mcelog.entry[i];
-
-			while (!m->finished) {
-				if (time_after_eq(jiffies, start + 2)) {
-					memset(m, 0, sizeof(*m));
-					goto timeout;
-				}
-				cpu_relax();
-			}
-			smp_rmb();
-			err |= copy_to_user(buf, m, sizeof(*m));
-			buf += sizeof(*m);
-timeout:
-			;
-		}
-
-		memset(mcelog.entry + prev, 0,
-		       (next - prev) * sizeof(struct mce));
-		prev = next;
-		next = cmpxchg(&mcelog.next, prev, 0);
-	} while (next != prev);
-
-	synchronize_sched();
-
-	/*
-	 * Collect entries that were still getting written before the
-	 * synchronize.
-	 */
-	on_each_cpu(collect_tscs, cpu_tsc, 1);
-
-	for (i = next; i < MCE_LOG_LEN; i++) {
-		struct mce *m = &mcelog.entry[i];
-
-		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
-			err |= copy_to_user(buf, m, sizeof(*m));
-			smp_rmb();
-			buf += sizeof(*m);
-			memset(m, 0, sizeof(*m));
-		}
-	}
-
-	if (err)
-		err = -EFAULT;
-
-out:
-	mutex_unlock(&mce_chrdev_read_mutex);
-	kfree(cpu_tsc);
-
-	return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
-	poll_wait(file, &mce_chrdev_wait, wait);
-	if (READ_ONCE(mcelog.next))
-		return POLLIN | POLLRDNORM;
-	if (!mce_apei_read_done && apei_check_mce())
-		return POLLIN | POLLRDNORM;
-	return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
-				unsigned long arg)
-{
-	int __user *p = (int __user *)arg;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	switch (cmd) {
-	case MCE_GET_RECORD_LEN:
-		return put_user(sizeof(struct mce), p);
-	case MCE_GET_LOG_LEN:
-		return put_user(MCE_LOG_LEN, p);
-	case MCE_GETCLEAR_FLAGS: {
-		unsigned flags;
-
-		do {
-			flags = mcelog.flags;
-		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
-		return put_user(flags, p);
-	}
-	default:
-		return -ENOTTY;
-	}
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
-			    size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
-			     const char __user *ubuf,
-			     size_t usize, loff_t *off))
-{
-	mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
-				size_t usize, loff_t *off)
-{
-	if (mce_write)
-		return mce_write(filp, ubuf, usize, off);
-	else
-		return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
-	.open			= mce_chrdev_open,
-	.release		= mce_chrdev_release,
-	.read			= mce_chrdev_read,
-	.write			= mce_chrdev_write,
-	.poll			= mce_chrdev_poll,
-	.unlocked_ioctl		= mce_chrdev_ioctl,
-	.llseek			= no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
-	MISC_MCELOG_MINOR,
-	"mcelog",
-	&mce_chrdev_ops,
-};
-
 static void __mce_disable_bank(void *arg)
 {
 	int bank = *((int *)arg);
@@ -2136,6 +1878,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
+	mce_register_decode_chain(&first_nb);
 	mce_register_decode_chain(&mce_srao_nb);
 	mce_register_decode_chain(&mce_default_nb);
 	mcheck_vendor_init_severity();
@@ -2280,29 +2023,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
 	return size;
 }
 
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
-	strcpy(buf, mce_helper);
-	strcat(buf, "\n");
-	return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
-				const char *buf, size_t siz)
-{
-	char *p;
-
-	strncpy(mce_helper, buf, sizeof(mce_helper));
-	mce_helper[sizeof(mce_helper)-1] = 0;
-	p = strchr(mce_helper, '\n');
-
-	if (p)
-		*p = 0;
-
-	return strlen(mce_helper) + !!p;
-}
-
 static ssize_t set_ignore_ce(struct device *s,
 			     struct device_attribute *attr,
 			     const char *buf, size_t size)
@@ -2359,7 +2079,6 @@ static ssize_t store_int_with_restart(struct device *s,
 	return ret;
 }
 
-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2382,7 +2101,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
 static struct device_attribute *mce_device_attrs[] = {
 	&dev_attr_tolerant.attr,
 	&dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
 	&dev_attr_trigger,
+#endif
 	&dev_attr_monarch_timeout.attr,
 	&dev_attr_dont_log_ce.attr,
 	&dev_attr_ignore_ce.attr,
@@ -2556,7 +2277,6 @@ static __init void mce_init_banks(void)
 
 static __init int mcheck_init_device(void)
 {
-	enum cpuhp_state hp_online;
 	int err;
 
 	if (!mce_available(&boot_cpu_data)) {
@@ -2584,21 +2304,11 @@ static __init int mcheck_init_device(void)
 				  mce_cpu_online, mce_cpu_pre_down);
 	if (err < 0)
 		goto err_out_online;
-	hp_online = err;
 
 	register_syscore_ops(&mce_syscore_ops);
 
-	/* register character device /dev/mcelog */
-	err = misc_register(&mce_chrdev_device);
-	if (err)
-		goto err_register;
-
 	return 0;
 
-err_register:
-	unregister_syscore_ops(&mce_syscore_ops);
-	cpuhp_remove_state(hp_online);
-
 err_out_online:
 	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
 
@@ -2606,7 +2316,7 @@ err_out_mem:
 	free_cpumask_var(mce_device_initialized);
 
 err_out:
-	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+	pr_err("Unable to init MCE device (rc: %d)\n", err);
 
 	return err;
 }
@@ -2685,6 +2395,7 @@ static int __init mcheck_late_init(void)
 		static_branch_inc(&mcsafe_key);
 
 	mcheck_debugfs_init();
+	cec_init();
 
 	/*
 	 * Flush out everything that has been logged during early boot, now that
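
The Intel branch of memory_error() above is a pure bit test on the MCACOD field, so it can be sanity-checked in isolation. A standalone sketch with made-up status values (chosen for illustration, not taken from the patch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT(n) (1ULL << (n))

    /* mirrors the Intel part of memory_error() in mce.c above */
    static bool intel_memory_error(uint64_t status)
    {
            return (status & 0xef80) == BIT(7) ||   /* memory error */
                   (status & 0xef00) == BIT(8) ||   /* cache hierarchy error */
                   (status & 0xeffc) == 0xc;        /* generic cache error */
    }

    int main(void)
    {
            printf("%d\n", intel_memory_error(0x009f)); /* 1: MCACOD bit 7 set */
            printf("%d\n", intel_memory_error(0x0115)); /* 1: MCACOD bit 8 set */
            printf("%d\n", intel_memory_error(0x0800)); /* 0: bus/interconnect */
            return 0;
    }
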
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 190b3e6cef4d..e84db79ef272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -481,6 +481,9 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
 	case INTEL_FAM6_BROADWELL_XEON_D:
 	case INTEL_FAM6_BROADWELL_X:
 	case INTEL_FAM6_SKYLAKE_X:
+	case INTEL_FAM6_XEON_PHI_KNL:
+	case INTEL_FAM6_XEON_PHI_KNM:
+
 		if (rdmsrl_safe(MSR_PPIN_CTL, &val))
 			return;
 
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 067f9813fd2c..2544700a2a87 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
 #endif
 
 
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
 #if defined(CONFIG_SMP)
 
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
 static nmi_shootdown_cb shootdown_callback;
 
 static atomic_t waiting_for_crash_ipi;
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 0bc60a308730..2a2d89d39af6 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
 	  aspects of the MCE handling code.
 
 	  WARNING: Do not even assume this interface is staying stable!
+
+config RAS_CEC
+	bool "Correctable Errors Collector"
+	depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
+	---help---
+	  This is a small cache which collects correctable memory errors per 4K
+	  page PFN and counts their repeated occurrence. Once the counter for a
+	  PFN overflows, we try to soft-offline that page as we take it to mean
+	  that it has reached a relatively high error count and would probably
+	  be best if we don't use it anymore.
+
+	  Bear in mind that this is absolutely useless if your platform doesn't
+	  have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
+
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 79b3c9c5a3bc..d0855c09f32f 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -1005,9 +1005,8 @@ static int ghes_probe(struct platform_device *ghes_dev)
 
 	switch (generic->notify.type) {
 	case ACPI_HEST_NOTIFY_POLLED:
-		ghes->timer.function = ghes_poll_func;
-		ghes->timer.data = (unsigned long)ghes;
-		init_timer_deferrable(&ghes->timer);
+		setup_deferrable_timer(&ghes->timer, ghes_poll_func,
+				       (unsigned long)ghes);
 		ghes_add_timer(ghes);
 		break;
 	case ACPI_HEST_NOTIFY_EXTERNAL:
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile
index d7f73341ced3..7b26dd3aa5d0 100644
--- a/drivers/ras/Makefile
+++ b/drivers/ras/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_RAS)	+= ras.o debugfs.o
+obj-$(CONFIG_RAS_CEC)	+= cec.o
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
new file mode 100644
index 000000000000..6aab46d91d33
--- /dev/null
+++ b/drivers/ras/cec.c
@@ -0,0 +1,532 @@
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+
+#include <asm/mce.h>
+
+#include "debugfs.h"
+
+/*
+ * RAS Correctable Errors Collector
+ *
+ * This is a simple gadget which collects correctable errors and counts their
+ * occurrence per physical page address.
+ *
+ * We've opted for possibly the simplest data structure to collect those - an
+ * array of the size of a memory page. It stores 512 u64's with the following
+ * structure:
+ *
+ * [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0]
+ *
+ * The generation in the two highest order bits is two bits which are set to 11b
+ * on every insertion. During the course of each entry's existence, the
+ * generation field gets decremented during spring cleaning to 10b, then 01b and
+ * then 00b.
+ *
+ * This way we're employing the natural numeric ordering to make sure that newly
+ * inserted/touched elements have higher 12-bit counts (which we've manufactured)
+ * and thus iterating over the array initially won't kick out those elements
+ * which were inserted last.
+ *
+ * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of
+ * elements entered into the array, during which, we're decaying all elements.
+ * If, after decay, an element gets inserted again, its generation is set to 11b
+ * to make sure it has higher numerical count than other, older elements and
+ * thus emulate an LRU-like behavior when deleting elements to free up space
+ * in the page.
+ *
+ * When an element reaches its max count of count_threshold, we try to poison
+ * it by assuming that errors triggered count_threshold times in a single page
+ * are excessive and that page shouldn't be used anymore. count_threshold is
+ * initialized to COUNT_MASK which is the maximum.
+ *
+ * That error event entry causes cec_add_elem() to return !0 value and thus
+ * signal to its callers to log the error.
+ *
+ * To the question why we've chosen a page and moving elements around with
+ * memmove(), it is because it is a very simple structure to handle and max data
+ * movement is 4K which on highly optimized modern CPUs is almost unnoticeable.
+ * We wanted to avoid the pointer traversal of more complex structures like a
+ * linked list or some sort of a balancing search tree.
+ *
+ * Deleting an element takes O(n) but since it is only a single page, it should
+ * be fast enough and it shouldn't happen all too often depending on error
+ * patterns.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "RAS: " fmt
+
+/*
+ * We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long
+ * elements have stayed in the array without having been accessed again.
+ */
+#define DECAY_BITS		2
+#define DECAY_MASK		((1ULL << DECAY_BITS) - 1)
+#define MAX_ELEMS		(PAGE_SIZE / sizeof(u64))
+
+/*
+ * Threshold amount of inserted elements after which we start spring
+ * cleaning.
+ */
+#define CLEAN_ELEMS		(MAX_ELEMS >> DECAY_BITS)
+
+/* Bits which count the number of errors happened in this 4K page. */
+#define COUNT_BITS		(PAGE_SHIFT - DECAY_BITS)
+#define COUNT_MASK		((1ULL << COUNT_BITS) - 1)
+#define FULL_COUNT_MASK		(PAGE_SIZE - 1)
+
+/*
+ * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
+ */
+
+#define PFN(e)			((e) >> PAGE_SHIFT)
+#define DECAY(e)		(((e) >> COUNT_BITS) & DECAY_MASK)
+#define COUNT(e)		((unsigned int)(e) & COUNT_MASK)
+#define FULL_COUNT(e)		((e) & (PAGE_SIZE - 1))
+
+static struct ce_array {
+	u64 *array;			/* container page */
+	unsigned int n;			/* number of elements in the array */
+
+	unsigned int decay_count;	/*
+					 * number of element insertions/increments
+					 * since the last spring cleaning.
+					 */
+
+	u64 pfns_poisoned;		/*
+					 * number of PFNs which got poisoned.
+					 */
+
+	u64 ces_entered;		/*
+					 * The number of correctable errors
+					 * entered into the collector.
+					 */
+
+	u64 decays_done;		/*
+					 * Times we did spring cleaning.
+					 */
+
+	union {
+		struct {
+			__u32	disabled : 1,	/* cmdline disabled */
+			__resv   : 31;
+		};
+		__u32 flags;
+	};
+} ce_arr;
+
+static DEFINE_MUTEX(ce_mutex);
+static u64 dfs_pfn;
+
+/* Amount of errors after which we offline */
+static unsigned int count_threshold = COUNT_MASK;
+
+/*
+ * The timer "decays" element count each timer_interval which is 24hrs by
+ * default.
+ */
+
+#define CEC_TIMER_DEFAULT_INTERVAL	24 * 60 * 60	/* 24 hrs */
+#define CEC_TIMER_MIN_INTERVAL		 1 * 60 * 60	/* 1h */
+#define CEC_TIMER_MAX_INTERVAL	   30 * 24 * 60 * 60	/* one month */
+static struct timer_list cec_timer;
+static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;
+
+/*
+ * Decrement decay value. We're using DECAY_BITS bits to denote decay of an
+ * element in the array. On insertion and any access, it gets reset to max.
+ */
+static void do_spring_cleaning(struct ce_array *ca)
+{
+	int i;
+
+	for (i = 0; i < ca->n; i++) {
+		u8 decay = DECAY(ca->array[i]);
+
+		if (!decay)
+			continue;
+
+		decay--;
+
+		ca->array[i] &= ~(DECAY_MASK << COUNT_BITS);
+		ca->array[i] |= (decay << COUNT_BITS);
+	}
+	ca->decay_count = 0;
+	ca->decays_done++;
+}
+
+/*
+ * @interval in seconds
+ */
+static void cec_mod_timer(struct timer_list *t, unsigned long interval)
+{
+	unsigned long iv;
+
+	iv = interval * HZ + jiffies;
+
+	mod_timer(t, round_jiffies(iv));
+}
+
+static void cec_timer_fn(unsigned long data)
+{
+	struct ce_array *ca = (struct ce_array *)data;
+
+	do_spring_cleaning(ca);
+
+	cec_mod_timer(&cec_timer, timer_interval);
+}
+
+/*
+ * @to: index of the smallest element which is >= then @pfn.
+ *
+ * Return the index of the pfn if found, otherwise negative value.
+ */
+static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+	u64 this_pfn;
+	int min = 0, max = ca->n;
+
+	while (min < max) {
+		int tmp = (max + min) >> 1;
+
+		this_pfn = PFN(ca->array[tmp]);
+
+		if (this_pfn < pfn)
+			min = tmp + 1;
+		else if (this_pfn > pfn)
+			max = tmp;
+		else {
+			min = tmp;
+			break;
+		}
+	}
+
+	if (to)
+		*to = min;
+
+	this_pfn = PFN(ca->array[min]);
+
+	if (this_pfn == pfn)
+		return min;
+
+	return -ENOKEY;
+}
+
+static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+	WARN_ON(!to);
+
+	if (!ca->n) {
+		*to = 0;
+		return -ENOKEY;
+	}
+	return __find_elem(ca, pfn, to);
+}
+
+static void del_elem(struct ce_array *ca, int idx)
+{
+	/* Save us a function call when deleting the last element. */
+	if (ca->n - (idx + 1))
+		memmove((void *)&ca->array[idx],
+			(void *)&ca->array[idx + 1],
+			(ca->n - (idx + 1)) * sizeof(u64));
+
+	ca->n--;
+}
+
+static u64 del_lru_elem_unlocked(struct ce_array *ca)
+{
+	unsigned int min = FULL_COUNT_MASK;
+	int i, min_idx = 0;
+
+	for (i = 0; i < ca->n; i++) {
+		unsigned int this = FULL_COUNT(ca->array[i]);
+
+		if (min > this) {
+			min = this;
+			min_idx = i;
+		}
+	}
+
+	del_elem(ca, min_idx);
+
+	return PFN(ca->array[min_idx]);
+}
+
+/*
+ * We return the 0th pfn in the error case under the assumption that it cannot
+ * be poisoned and excessive CEs in there are a serious deal anyway.
+ */
+static u64 __maybe_unused del_lru_elem(void)
+{
+	struct ce_array *ca = &ce_arr;
+	u64 pfn;
+
+	if (!ca->n)
+		return 0;
+
+	mutex_lock(&ce_mutex);
+	pfn = del_lru_elem_unlocked(ca);
+	mutex_unlock(&ce_mutex);
+
+	return pfn;
+}
+
+
+int cec_add_elem(u64 pfn)
+{
+	struct ce_array *ca = &ce_arr;
+	unsigned int to;
+	int count, ret = 0;
+
+	/*
+	 * We can be called very early on the identify_cpu() path where we are
+	 * not initialized yet. We ignore the error for simplicity.
+	 */
+	if (!ce_arr.array || ce_arr.disabled)
+		return -ENODEV;
+
+	ca->ces_entered++;
+
+	mutex_lock(&ce_mutex);
+
+	if (ca->n == MAX_ELEMS)
+		WARN_ON(!del_lru_elem_unlocked(ca));
+
+	ret = find_elem(ca, pfn, &to);
+	if (ret < 0) {
+		/*
+		 * Shift range [to-end] to make room for one more element.
+		 */
+		memmove((void *)&ca->array[to + 1],
+			(void *)&ca->array[to],
+			(ca->n - to) * sizeof(u64));
+
+		ca->array[to] = (pfn << PAGE_SHIFT) |
+				(DECAY_MASK << COUNT_BITS) | 1;
+
+		ca->n++;
+
+		ret = 0;
+
+		goto decay;
+	}
+
+	count = COUNT(ca->array[to]);
+
+	if (count < count_threshold) {
+		ca->array[to] |= (DECAY_MASK << COUNT_BITS);
+		ca->array[to]++;
+
+		ret = 0;
+	} else {
+		u64 pfn = ca->array[to] >> PAGE_SHIFT;
+
+		if (!pfn_valid(pfn)) {
+			pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
+		} else {
+			/* We have reached max count for this page, soft-offline it. */
+			pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
+			memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
+			ca->pfns_poisoned++;
+		}
+
+		del_elem(ca, to);
+
+		/*
+		 * Return a >0 value to denote that we've reached the offlining
+		 * threshold.
+		 */
+		ret = 1;
+
+		goto unlock;
+	}
+
+decay:
+	ca->decay_count++;
+
+	if (ca->decay_count >= CLEAN_ELEMS)
+		do_spring_cleaning(ca);
+
+unlock:
+	mutex_unlock(&ce_mutex);
+
+	return ret;
+}
+
+static int u64_get(void *data, u64 *val)
+{
+	*val = *(u64 *)data;
+
+	return 0;
+}
+
+static int pfn_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+
+	return cec_add_elem(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
+
+static int decay_interval_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+
+	if (val < CEC_TIMER_MIN_INTERVAL)
+		return -EINVAL;
+
+	if (val > CEC_TIMER_MAX_INTERVAL)
+		return -EINVAL;
+
+	timer_interval = val;
+
+	cec_mod_timer(&cec_timer, timer_interval);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
+
+static int count_threshold_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+
+	if (val > COUNT_MASK)
+		val = COUNT_MASK;
+
+	count_threshold = val;
+
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
+
+static int array_dump(struct seq_file *m, void *v)
+{
+	struct ce_array *ca = &ce_arr;
+	u64 prev = 0;
+	int i;
+
+	mutex_lock(&ce_mutex);
+
+	seq_printf(m, "{ n: %d\n", ca->n);
+	for (i = 0; i < ca->n; i++) {
+		u64 this = PFN(ca->array[i]);
+
+		seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
+
+		WARN_ON(prev > this);
+
+		prev = this;
+	}
+
+	seq_printf(m, "}\n");
+
+	seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
+		   ca->ces_entered, ca->pfns_poisoned);
+
+	seq_printf(m, "Flags: 0x%x\n", ca->flags);
+
+	seq_printf(m, "Timer interval: %lld seconds\n", timer_interval);
+	seq_printf(m, "Decays: %lld\n", ca->decays_done);
+
+	seq_printf(m, "Action threshold: %d\n", count_threshold);
+
+	mutex_unlock(&ce_mutex);
+
+	return 0;
+}
+
+static int array_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, array_dump, NULL);
+}
+
+static const struct file_operations array_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = array_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
+static int __init create_debugfs_nodes(void)
+{
+	struct dentry *d, *pfn, *decay, *count, *array;
+
+	d = debugfs_create_dir("cec", ras_debugfs_dir);
+	if (!d) {
+		pr_warn("Error creating cec debugfs node!\n");
+		return -1;
+	}
+
+	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
+	if (!pfn) {
+		pr_warn("Error creating pfn debugfs node!\n");
+		goto err;
+	}
+
+	array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
+	if (!array) {
+		pr_warn("Error creating array debugfs node!\n");
+		goto err;
+	}
+
+	decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
+				    &timer_interval, &decay_interval_ops);
+	if (!decay) {
+		pr_warn("Error creating decay_interval debugfs node!\n");
+		goto err;
+	}
+
+	count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
+				    &count_threshold, &count_threshold_ops);
+	if (!count) {
+		pr_warn("Error creating count_threshold debugfs node!\n");
+		goto err;
+	}
+
+
+	return 0;
+
+err:
+	debugfs_remove_recursive(d);
+
+	return 1;
+}
+
+void __init cec_init(void)
+{
+	if (ce_arr.disabled)
+		return;
+
+	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ce_arr.array) {
+		pr_err("Error allocating CE array page!\n");
+		return;
+	}
+
+	if (create_debugfs_nodes())
+		return;
+
+	setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr);
+	cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);
+
+	pr_info("Correctable Errors collector initialized.\n");
+}
+
+int __init parse_cec_param(char *str)
+{
+	if (!str)
+		return 0;
+
+	if (*str == '=')
+		str++;
+
+	if (!strncmp(str, "cec_disable", 7))
+		ce_arr.disabled = 1;
+	else
+		return 0;
+
+	return 1;
+}
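
The element encoding cec.c describes — [63..12 PFN | two decay bits | ten count bits] — can be exercised outside the kernel. A standalone sketch reusing the patch's macros (PAGE_SHIFT hard-coded to 12 here to match the 4K-page assumption):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT      12
    #define DECAY_BITS      2
    #define COUNT_BITS      (PAGE_SHIFT - DECAY_BITS)
    #define DECAY_MASK      ((1ULL << DECAY_BITS) - 1)
    #define COUNT_MASK      ((1ULL << COUNT_BITS) - 1)

    #define PFN(e)          ((e) >> PAGE_SHIFT)
    #define DECAY(e)        (((e) >> COUNT_BITS) & DECAY_MASK)
    #define COUNT(e)        ((unsigned int)(e) & COUNT_MASK)

    int main(void)
    {
            /* a fresh insertion for pfn 0x1234: decay maxed (11b), count 1,
             * exactly what cec_add_elem() stores for a new element */
            uint64_t e = (0x1234ULL << PAGE_SHIFT) | (DECAY_MASK << COUNT_BITS) | 1;

            /* prints: pfn=0x1234 decay=3 count=1 */
            printf("pfn=%#llx decay=%llu count=%u\n",
                   (unsigned long long)PFN(e),
                   (unsigned long long)DECAY(e), COUNT(e));
            return 0;
    }
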
diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c
index 0322acf67ea5..501603057dff 100644
--- a/drivers/ras/debugfs.c
+++ b/drivers/ras/debugfs.c
@@ -1,6 +1,6 @@
 #include <linux/debugfs.h>
 
-static struct dentry *ras_debugfs_dir;
+struct dentry *ras_debugfs_dir;
 
 static atomic_t trace_count = ATOMIC_INIT(0);
 
diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h
new file mode 100644
index 000000000000..db72e4513191
--- /dev/null
+++ b/drivers/ras/debugfs.h
@@ -0,0 +1,8 @@
+#ifndef __RAS_DEBUGFS_H__
+#define __RAS_DEBUGFS_H__
+
+#include <linux/debugfs.h>
+
+extern struct dentry *ras_debugfs_dir;
+
+#endif /* __RAS_DEBUGFS_H__ */
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd362b7b6..94f8038864b4 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+
+
+int __init parse_ras_param(char *str)
+{
+#ifdef CONFIG_RAS_CEC
+	parse_cec_param(str);
+#endif
+
+	return 1;
+}
+__setup("ras", parse_ras_param);
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 2aceeafd6fe5..ffb147185e8d 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -1,14 +1,25 @@
 #ifndef __RAS_H__
 #define __RAS_H__
 
+#include <asm/errno.h>
+
 #ifdef CONFIG_DEBUG_FS
 int ras_userspace_consumers(void);
 void ras_debugfs_init(void);
 int ras_add_daemon_trace(void);
 #else
 static inline int ras_userspace_consumers(void) { return 0; }
-static inline void ras_debugfs_init(void) { return; }
+static inline void ras_debugfs_init(void) { }
 static inline int ras_add_daemon_trace(void) { return 0; }
 #endif
 
+#ifdef CONFIG_RAS_CEC
+void __init cec_init(void);
+int __init parse_cec_param(char *str);
+int cec_add_elem(u64 pfn);
+#else
+static inline void __init cec_init(void)	{ }
+static inline int cec_add_elem(u64 pfn)		{ return -ENODEV; }
 #endif
+
+#endif /* __RAS_H__ */